diff --git a/src/caoscrawler/crawl.py b/src/caoscrawler/crawl.py
index 540dcb51db648e5a900c9a1765a18f31ea44b0f7..4167ebf443916d0c28b42443b335082f0b52d13a 100644
--- a/src/caoscrawler/crawl.py
+++ b/src/caoscrawler/crawl.py
@@ -4,8 +4,8 @@
 # ** header v3.0
 # This file is a part of the CaosDB Project.
 #
-# Copyright (C) 2021 Henrik tom Wörden
-#               2021-2023 Alexander Schlemmer (alexander.schlemmer@ds.mpg.de)
+# Copyright (C) 2021 Henrik tom Wörden <h.tomwoerden@indiscale.com>
+#               2021-2023 Alexander Schlemmer <alexander.schlemmer@ds.mpg.de>
 #
 # This program is free software: you can redistribute it and/or modify
 # it under the terms of the GNU Affero General Public License as
@@ -67,6 +67,8 @@
 from .stores import GeneralStore, RecordStore
 from .structure_elements import StructureElement, Directory, NoneElement
 from .version import check_cfood_version
+from .scanner import scan_directory
+
 logger = logging.getLogger(__name__)

 SPECIAL_PROPERTIES_STRICT = ("description", "name", "id", "path")
@@ -175,27 +177,13 @@ class Crawler(object):
     """

     def __init__(self,
-                 generalStore: Optional[GeneralStore] = None,
-                 debug: bool = False,
-                 identifiableAdapter: IdentifiableAdapter = None,
-                 securityMode: SecurityMode = SecurityMode.UPDATE
-                 ):
+                 identifiableAdapter: Optional[IdentifiableAdapter] = None,
+                 securityMode: SecurityMode = SecurityMode.UPDATE):
         """
         Create a new crawler and initialize an empty RecordStore and GeneralStore.

         Parameters
         ----------
-        recordStore : GeneralStore
-             An initial GeneralStore which might store e.g. environment variables.
-        debug : bool
-             Create a debugging information tree when set to True.
-             The debugging information tree is a variable stored in
-             self.debug_tree. It is a dictionary mapping directory entries
-             to a tuple of general stores and record stores which are valid for
-             the directory scope.
-             Furthermore, it is stored in a second tree named self.debug_copied whether the
-             objects in debug_tree had been copied from a higher level in the hierarchy
-             of the structureelements.
         identifiableAdapter : IdentifiableAdapter
              TODO describe
         securityMode : int
@@ -208,30 +196,30 @@ class Crawler(object):
         # different caches.
         self.remote_existing_cache = IdentifiedCache()
         self.remote_missing_cache = IdentifiedCache()
-        self.recordStore = RecordStore()
         self.securityMode = securityMode

-        self.generalStore = generalStore
-        if generalStore is None:
-            self.generalStore = GeneralStore()
-
         self.identifiableAdapter: IdentifiableAdapter = LocalStorageIdentifiableAdapter()
         if identifiableAdapter is not None:
             self.identifiableAdapter = identifiableAdapter

-        # If a directory is crawled this may hold the path to that directory
-        self.crawled_directory: Optional[str] = None
-        self.debug = debug
-
-    def synchronize(self, commit_changes: bool = True, unique_names=True):
+    def crawl_directory(self,
+                        crawled_directory: str,
+                        crawler_definition_path: str,
+                        restricted_path: Optional[list[str]] = None):
         """
-        Carry out the actual synchronization.
+        The new main function to run the crawler on a directory.
         """

-        # After the crawling, the actual synchronization with the database, based on the
-        # update list is carried out:
+        self.crawled_directory = crawled_directory
+        self.run_id = uuid.uuid1()
+
+        # TODO: This is not ideal yet, the data is just returned and needs to be
+        # separately supplied to the synchronize function.
+
+        return scan_directory(crawled_directory,
+                              crawler_definition_path,
+                              restricted_path)

-        return self._synchronize(self.crawled_data, commit_changes, unique_names=unique_names)

     def _has_reference_value_without_id(self, ident: Identifiable) -> bool:
         """
@@ -704,7 +692,8 @@ class Crawler(object):
         return db.Entity(id=id).retrieve()

     @staticmethod
-    def execute_inserts_in_list(to_be_inserted, securityMode, run_id: uuid.UUID = None,
+    def execute_inserts_in_list(to_be_inserted, securityMode,
+                                run_id: Optional[uuid.UUID] = None,
                                 unique_names=True):
         for record in to_be_inserted:
             for prop in record.properties:
@@ -732,7 +721,8 @@ class Crawler(object):
                     _resolve_datatype(prop, entity)

     @staticmethod
-    def execute_updates_in_list(to_be_updated, securityMode, run_id: uuid.UUID = None,
+    def execute_updates_in_list(to_be_updated, securityMode,
+                                run_id: Optional[uuid.UUID] = None,
                                 unique_names=True):
         Crawler.set_ids_and_datatype_of_parents_and_properties(to_be_updated)
         logger.debug("UPDATE")
@@ -744,8 +734,10 @@ class Crawler(object):
             update_cache = UpdateCache()
             update_cache.insert(to_be_updated, run_id)

-    def _synchronize(self, crawled_data: list[db.Record], commit_changes: bool = True,
-                     unique_names=True):
+    def synchronize(self,
+                    crawled_data: list[db.Record],
+                    commit_changes: bool = True,
+                    unique_names=True):
         """
         This function applies several stages:
         1) Retrieve identifiables for all records in crawled_data.
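For review purposes, a minimal sketch of how the split API reads from the caller's side under the calling convention implied by this diff. The paths, the bare `Crawler()` construction, and the exact shape of the data returned by `crawl_directory` are assumptions for illustration, not part of the patch:

```python
# Usage sketch (assumptions): crawl_directory() now returns the scanned data
# instead of storing it on the crawler, and synchronize() receives that data
# explicitly, as noted in the TODO comment in the diff above.
from caoscrawler.crawl import Crawler

crawler = Crawler()  # identifiableAdapter and securityMode keep their defaults

# Scan a directory against a cfood definition; both paths are hypothetical.
crawled_data = crawler.crawl_directory("/path/to/data", "/path/to/cfood.yml")

# Synchronization with the server is now a separate, explicit step that
# takes the scanned records as its first argument.
crawler.synchronize(crawled_data, commit_changes=True, unique_names=True)
```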