crawl.py

#!/usr/bin/env python3
# encoding: utf-8
#
# ** header v3.0
# This file is a part of the CaosDB Project.
#
# Copyright (C) 2020,2021 IndiScale GmbH <info@indiscale.com>
# Copyright (C) 2020,2021 Henrik tom Wörden <h.tomwoerden@indiscale.com>
# Copyright (C) 2018 Research Group Biomedical Physics
# Max-Planck-Institute for Dynamics and Self-Organization Göttingen
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
#
# ** end header
#
"""
This script invokes a CaosDB Crawler.
"""

import argparse
import logging
import os
import sys
from argparse import RawTextHelpFormatter

import caosdb as db
from caosadvancedtools.cfood import fileguide
from caosadvancedtools.crawler import FileCrawler
from caosadvancedtools.guard import UPDATE

from example_cfood import ExampleCFood


def get_parser():

    parser = argparse.ArgumentParser(description=__doc__,
                                     formatter_class=RawTextHelpFormatter)
    parser.add_argument("path",
                        help="the subtree of files below the given (local) "
                        " path will be considered.")

    return parser


def local_access(path):
    return path


if __name__ == "__main__":
    parser = get_parser()
    args = parser.parse_args()

    fileguide.access = local_access

    # logging config
    logger = logging.getLogger("caosadvancedtools")
    logger.addHandler(logging.StreamHandler(sys.stdout))
    logger.setLevel(logging.DEBUG)

    # create file objects manually since the files do not exist on the server
    # and cannot be queried using 'FileCrawler.query_files`
    to_be_crawled = []

    for root, dirs, files in os.walk(args.path):
        to_be_crawled.extend(
            [db.File(path=os.path.join(root, f)) for f in files])

    c = FileCrawler(files=to_be_crawled, use_cache=False,
                    abort_on_exception=False,
                    interactive=False,
                    cfood_types=[
                        ExampleCFood,
                    ],
                    )
    c.crawl(security_level=UPDATE, path=args.path)