From 90620c94e089b9e3c189bcb06cf79f9723c624be Mon Sep 17 00:00:00 2001
From: Alexander Schlemmer <alexander@mail-schlemmer.de>
Date: Wed, 8 Mar 2023 14:35:07 +0100
Subject: [PATCH] MAIN: refactored scan_structure_elements and scan_directory
 functions

---
 src/caoscrawler/scanner.py | 45 +++++++++++++++++---------------------
 1 file changed, 20 insertions(+), 25 deletions(-)

diff --git a/src/caoscrawler/scanner.py b/src/caoscrawler/scanner.py
index 328c8933..cc0a50b5 100644
--- a/src/caoscrawler/scanner.py
+++ b/src/caoscrawler/scanner.py
@@ -164,6 +164,8 @@ def create_converter_registry(definition: dict):
       - module is the name of the module to be imported which must be installed
       - class is the converter class to load and associate with this converter entry
 
+    Formerly known as "load_converters".
+
     all other info for the converter needs to be included in the converter plugin
     directory:
     schema.yml file
@@ -337,10 +339,12 @@ def _crawl(self,
 # --------------------------------------------------------------------------------
 
 
-def crawl_directory(dirname: str, crawler_definition_path: str,
+def scan_directory(dirname: str, crawler_definition_path: str,
                     restricted_path: Optional[list[str]] = None):
     """ Crawl a single directory.
 
+    Formerly known as "crawl_directory".
+
     Convenience function that starts the crawler (calls start_crawling)
     with a single directory as the StructureElement.
 
@@ -351,7 +355,7 @@ def crawl_directory(dirname: str, crawler_definition_path: str,
 
     crawler_definition = load_definition(crawler_definition_path)
     # Load and register converter packages:
-    converter_registry = load_converters(crawler_definition)
+    converter_registry = create_converter_registry(crawler_definition)
 
     if not dirname:
         raise ValueError(
@@ -366,21 +370,22 @@ def crawl_directory(dirname: str, crawler_definition_path: str,
         # dirname had a trailing '/'
         dir_structure_name = os.path.basename(dirname[:-1])
 
-    start_crawling(Directory(dir_structure_name,
-                             dirname),
-                   crawler_definition,
-                   converter_registry,
-                   restricted_path=restricted_path
-                   )
+    return scan_structure_elements(Directory(dir_structure_name,
+                                             dirname),
+                                   crawler_definition,
+                                   converter_registry,
+                                   restricted_path=restricted_path)
 
 
-def start_crawling(items: Union[list[StructureElement], StructureElement],
-                   crawler_definition: dict,
-                   converter_registry: dict,
-                   restricted_path: Optional[list[str]] = None):
+def scan_structure_elements(items: Union[list[StructureElement], StructureElement],
+                            crawler_definition: dict,
+                            converter_registry: dict,
+                            restricted_path: Optional[list[str]] = None):
     """
    Start point of the crawler recursion.
 
+    Formerly known as "start_crawling".
+
     Parameters
     ----------
     items: list
@@ -400,27 +405,17 @@ def start_crawling(items: Union[list[StructureElement], StructureElement],
     """
     # This function builds the tree of converters out of the crawler definition.
-
-    if self.generalStore is None:
-        raise RuntimeError("Should not happen.")
-
     if not isinstance(items, list):
         items = [items]
 
     self.run_id = uuid.uuid1()
-    local_converters = Crawler.initialize_converters(crawler_definition, converter_registry)
+    converters = initialize_converters(crawler_definition, converter_registry)
-
-    # This recursive crawling procedure generates the update list:
-    self.crawled_data: list[db.Record] = []
-    self._crawl(
+    return _crawl(
         items=items,
-        local_converters=local_converters,
+        local_converters=converters,
         generalStore=self.generalStore,
         recordStore=self.recordStore,
         structure_elements_path=[],
         converters_path=[],
         restricted_path=restricted_path)
-    if self.debug:
-        self.debug_converters = local_converters
-
-    return self.crawled_data
-- 
GitLab
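
Reviewer note: below is a minimal usage sketch of the renamed entry points. It is not part of the patch and rests on assumptions: that scan_directory, scan_structure_elements, load_definition and create_converter_registry are importable from caoscrawler.scanner and return the crawled records once the refactoring is complete, and that Directory lives in caoscrawler.structure_elements. The file and directory names ("cfood.yml", "data/") are placeholders.

# Sketch only (not from the patch): how callers would migrate from the old
# names (load_converters, crawl_directory, start_crawling) to the new ones.
from caoscrawler.scanner import (create_converter_registry, load_definition,
                                 scan_directory, scan_structure_elements)
from caoscrawler.structure_elements import Directory

# Convenience path: scan a single directory using a crawler definition file.
# "data/" and "cfood.yml" are placeholder paths for this example.
records = scan_directory("data/", "cfood.yml")

# Equivalent lower-level path: build the definition and converter registry
# explicitly, then start the recursion from a Directory structure element.
definition = load_definition("cfood.yml")
registry = create_converter_registry(definition)
records = scan_structure_elements(Directory("data", "data/"),
                                  definition, registry)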