From 2fdacd93d2045db537f019db2127384eb1f8b766 Mon Sep 17 00:00:00 2001 From: Alexander Schlemmer <alexander@mail-schlemmer.de> Date: Wed, 8 Mar 2023 14:14:13 +0100 Subject: [PATCH] MAINT: moved crawling functions which are actually part of the scanner to scanner module --- src/caoscrawler/crawl.py | 112 ----------- src/caoscrawler/scanner.py | 379 +++++++++++++++++++++++-------------- 2 files changed, 232 insertions(+), 259 deletions(-) diff --git a/src/caoscrawler/crawl.py b/src/caoscrawler/crawl.py index 50163ff3..0b8f921d 100644 --- a/src/caoscrawler/crawl.py +++ b/src/caoscrawler/crawl.py @@ -233,118 +233,6 @@ class Crawler(object): - - def crawl_directory(self, dirname: str, crawler_definition_path: str, - restricted_path: Optional[list[str]] = None): - """ Crawl a single directory. - - Convenience function that starts the crawler (calls start_crawling) - with a single directory as the StructureElement. - - restricted_path: optional, list of strings - Traverse the data tree only along the given path. When the end of the given path - is reached, traverse the full tree as normal. - """ - - crawler_definition = self.load_definition(crawler_definition_path) - # Load and register converter packages: - converter_registry = self.load_converters(crawler_definition) - - if not dirname: - raise ValueError( - "You have to provide a non-empty path for crawling.") - dir_structure_name = os.path.basename(dirname) - self.crawled_directory = dirname - if not dir_structure_name and dirname.endswith('/'): - if dirname == '/': - # Crawling the entire file system - dir_structure_name = "root" - else: - # dirname had a trailing '/' - dir_structure_name = os.path.basename(dirname[:-1]) - - self.start_crawling(Directory(dir_structure_name, - dirname), - crawler_definition, - converter_registry, - restricted_path=restricted_path - ) - - @staticmethod - def initialize_converters(crawler_definition: dict, converter_registry: dict): - """ - takes the cfood as dict (`crawler_definition`) and creates the converter objects that - are defined on the highest level. Child Converters will in turn be created during the - initialization of the Converters. - """ - converters = [] - - for key, value in crawler_definition.items(): - # Definitions and Converters are reserved keywords - # on the top level of the yaml file. - # TODO: there should also be a top level keyword for the actual - # CFood to avoid confusion between top level keywords - # and the CFood. - if key == "Definitions": - continue - elif key == "Converters": - continue - converters.append(Converter.converter_factory( - value, key, converter_registry)) - - return converters - - def start_crawling(self, items: Union[list[StructureElement], StructureElement], - crawler_definition: dict, - converter_registry: dict, - restricted_path: Optional[list[str]] = None): - """ - Start point of the crawler recursion. - - Parameters - ---------- - items: list - A list of structure elements (or a single StructureElement) that is used for - generating the initial items for the crawler. This could e.g. be a Directory. - crawler_definition : dict - A dictionary representing the crawler definition, possibly from a yaml - file. - restricted_path: optional, list of strings - Traverse the data tree only along the given path. When the end of the given path - is reached, traverse the full tree as normal. - - Returns - ------- - crawled_data : list - the final list with the target state of Records. 
- """ - - # This function builds the tree of converters out of the crawler definition. - - if self.generalStore is None: - raise RuntimeError("Should not happen.") - - if not isinstance(items, list): - items = [items] - - self.run_id = uuid.uuid1() - local_converters = Crawler.initialize_converters(crawler_definition, converter_registry) - - # This recursive crawling procedure generates the update list: - self.crawled_data: list[db.Record] = [] - self._crawl( - items=items, - local_converters=local_converters, - generalStore=self.generalStore, - recordStore=self.recordStore, - structure_elements_path=[], - converters_path=[], - restricted_path=restricted_path) - if self.debug: - self.debug_converters = local_converters - - return self.crawled_data - def synchronize(self, commit_changes: bool = True, unique_names=True): """ Carry out the actual synchronization. diff --git a/src/caoscrawler/scanner.py b/src/caoscrawler/scanner.py index 740ba464..be0f0ab6 100644 --- a/src/caoscrawler/scanner.py +++ b/src/caoscrawler/scanner.py @@ -70,154 +70,239 @@ from .debug.debug_tree import (DebugTreeStructureElement, logger = logging.getLogger(__name__) - - - - - - - - - - - - - - - - - - - def load_definition(self, crawler_definition_path: str): - """ - Load a cfood from a crawler definition defined by - crawler definition path and validate it using cfood-schema.yml. - """ - - # Load the cfood from a yaml file: - with open(crawler_definition_path, "r") as f: - crawler_definitions = list(yaml.safe_load_all(f)) - - crawler_definition = self._load_definition_from_yaml_dict( - crawler_definitions) - - return self._resolve_validator_paths(crawler_definition, crawler_definition_path) - - def _load_definition_from_yaml_dict(self, crawler_definitions: list[dict]): - """Load crawler definitions from a list of (yaml) dicts `crawler_definitions` which - contains either one or two documents. - - Doesn't resolve the validator paths in the cfood definition, so for - internal and testing use only. - - """ - if len(crawler_definitions) == 1: - # Simple case, just one document: - crawler_definition = crawler_definitions[0] - metadata = {} - elif len(crawler_definitions) == 2: - metadata = crawler_definitions[0]["metadata"] if "metadata" in crawler_definitions[0] else { - } - crawler_definition = crawler_definitions[1] - else: - raise RuntimeError( - "Crawler definition must not contain more than two documents.") - - check_cfood_version(metadata) - - # TODO: at this point this function can already load the cfood schema extensions - # from the crawler definition and add them to the yaml schema that will be - # tested in the next lines of code: - - # Load the cfood schema: - with open(str(files('caoscrawler').joinpath('cfood-schema.yml')), "r") as f: - schema = yaml.safe_load(f) - - # Add custom converters to converter enum in schema: - if "Converters" in crawler_definition: - for key in crawler_definition["Converters"]: +def load_definition(self, crawler_definition_path: str): + """ + Load a cfood from a crawler definition defined by + crawler definition path and validate it using cfood-schema.yml. 
+ """ + + # Load the cfood from a yaml file: + with open(crawler_definition_path, "r") as f: + crawler_definitions = list(yaml.safe_load_all(f)) + + crawler_definition = self._load_definition_from_yaml_dict( + crawler_definitions) + + return self._resolve_validator_paths(crawler_definition, crawler_definition_path) + +def _load_definition_from_yaml_dict(self, crawler_definitions: list[dict]): + """Load crawler definitions from a list of (yaml) dicts `crawler_definitions` which + contains either one or two documents. + + Doesn't resolve the validator paths in the cfood definition, so for + internal and testing use only. + + """ + if len(crawler_definitions) == 1: + # Simple case, just one document: + crawler_definition = crawler_definitions[0] + metadata = {} + elif len(crawler_definitions) == 2: + metadata = crawler_definitions[0]["metadata"] if "metadata" in crawler_definitions[0] else { + } + crawler_definition = crawler_definitions[1] + else: + raise RuntimeError( + "Crawler definition must not contain more than two documents.") + + check_cfood_version(metadata) + + # TODO: at this point this function can already load the cfood schema extensions + # from the crawler definition and add them to the yaml schema that will be + # tested in the next lines of code: + + # Load the cfood schema: + with open(str(files('caoscrawler').joinpath('cfood-schema.yml')), "r") as f: + schema = yaml.safe_load(f) + + # Add custom converters to converter enum in schema: + if "Converters" in crawler_definition: + for key in crawler_definition["Converters"]: + schema["cfood"]["$defs"]["converter"]["properties"]["type"]["enum"].append( + key) + if len(crawler_definitions) == 2: + if "Converters" in metadata: + for key in metadata["Converters"]: schema["cfood"]["$defs"]["converter"]["properties"]["type"]["enum"].append( key) - if len(crawler_definitions) == 2: - if "Converters" in metadata: - for key in metadata["Converters"]: - schema["cfood"]["$defs"]["converter"]["properties"]["type"]["enum"].append( - key) - - # Validate the cfood schema: - validate(instance=crawler_definition, schema=schema["cfood"]) - - return crawler_definition - - def _resolve_validator_paths(self, definition: dict, definition_path: str): - """Resolve path to validation files with respect to the file in which - the crawler was defined. - - """ - - for key, value in definition.items(): - - if key == "validate" and isinstance(value, str): - # Validator is given by a path - if not value.startswith('/'): - # Not an absolute path - definition[key] = os.path.join(os.path.dirname(definition_path), value) - if not os.path.isfile(definition[key]): - # TODO(henrik) capture this in `crawler_main` similar to - # `ConverterValidationError`. 
- raise FileNotFoundError( - f"Couldn't find validation file {definition[key]}") - elif isinstance(value, dict): - # Recursively resolve all validators - definition[key] = self._resolve_validator_paths(value, definition_path) - - return definition - - def load_converters(self, definition: dict): - """ - Currently the converter registry is a dictionary containing for each converter: - - key is the short code, abbreviation for the converter class name - - module is the name of the module to be imported which must be installed - - class is the converter class to load and associate with this converter entry - - all other info for the converter needs to be included in the converter plugin - directory: - schema.yml file - README.md documentation - - TODO: this function does not make use of self, so it could become static. - """ - - # Defaults for the converter registry: - with open(str(files('caoscrawler').joinpath('default_converters.yml')), "r") as f: - converter_registry: dict[str, dict[str, str]] = yaml.safe_load(f) - - # More converters from definition file: - if "Converters" in definition: - for key, entry in definition["Converters"].items(): - if key in ["Dict", "DictTextElement", "DictIntegerElement", "DictBooleanElement", - "DictDictElement", "DictListElement", "DictFloatElement"]: - warnings.warn(DeprecationWarning(f"{key} is deprecated. Please use the new" - " variant; without 'Dict' prefix or " - "'DictElement' in case of 'Dict'")) - - converter_registry[key] = { - "converter": entry["converter"], - "package": entry["package"] - } - - # Load modules and associate classes: - for key, value in converter_registry.items(): - module = importlib.import_module(value["package"]) - value["class"] = getattr(module, value["converter"]) - return converter_registry - - - - - - - - + # Validate the cfood schema: + validate(instance=crawler_definition, schema=schema["cfood"]) + + return crawler_definition + +def _resolve_validator_paths(self, definition: dict, definition_path: str): + """Resolve path to validation files with respect to the file in which + the crawler was defined. + + """ + + for key, value in definition.items(): + + if key == "validate" and isinstance(value, str): + # Validator is given by a path + if not value.startswith('/'): + # Not an absolute path + definition[key] = os.path.join(os.path.dirname(definition_path), value) + if not os.path.isfile(definition[key]): + # TODO(henrik) capture this in `crawler_main` similar to + # `ConverterValidationError`. + raise FileNotFoundError( + f"Couldn't find validation file {definition[key]}") + elif isinstance(value, dict): + # Recursively resolve all validators + definition[key] = self._resolve_validator_paths(value, definition_path) + + return definition + +def load_converters(self, definition: dict): + """ + Currently the converter registry is a dictionary containing for each converter: + - key is the short code, abbreviation for the converter class name + - module is the name of the module to be imported which must be installed + - class is the converter class to load and associate with this converter entry + + all other info for the converter needs to be included in the converter plugin + directory: + schema.yml file + README.md documentation + + TODO: this function does not make use of self, so it could become static. 
+ """ + + # Defaults for the converter registry: + with open(str(files('caoscrawler').joinpath('default_converters.yml')), "r") as f: + converter_registry: dict[str, dict[str, str]] = yaml.safe_load(f) + + # More converters from definition file: + if "Converters" in definition: + for key, entry in definition["Converters"].items(): + if key in ["Dict", "DictTextElement", "DictIntegerElement", "DictBooleanElement", + "DictDictElement", "DictListElement", "DictFloatElement"]: + warnings.warn(DeprecationWarning(f"{key} is deprecated. Please use the new" + " variant; without 'Dict' prefix or " + "'DictElement' in case of 'Dict'")) + + converter_registry[key] = { + "converter": entry["converter"], + "package": entry["package"] + } + # Load modules and associate classes: + for key, value in converter_registry.items(): + module = importlib.import_module(value["package"]) + value["class"] = getattr(module, value["converter"]) + return converter_registry + + +def crawl_directory(self, dirname: str, crawler_definition_path: str, + restricted_path: Optional[list[str]] = None): + """ Crawl a single directory. + + Convenience function that starts the crawler (calls start_crawling) + with a single directory as the StructureElement. + + restricted_path: optional, list of strings + Traverse the data tree only along the given path. When the end of the given path + is reached, traverse the full tree as normal. + """ + + crawler_definition = self.load_definition(crawler_definition_path) + # Load and register converter packages: + converter_registry = self.load_converters(crawler_definition) + + if not dirname: + raise ValueError( + "You have to provide a non-empty path for crawling.") + dir_structure_name = os.path.basename(dirname) + self.crawled_directory = dirname + if not dir_structure_name and dirname.endswith('/'): + if dirname == '/': + # Crawling the entire file system + dir_structure_name = "root" + else: + # dirname had a trailing '/' + dir_structure_name = os.path.basename(dirname[:-1]) + + self.start_crawling(Directory(dir_structure_name, + dirname), + crawler_definition, + converter_registry, + restricted_path=restricted_path + ) + + +def initialize_converters(crawler_definition: dict, converter_registry: dict): + """ + takes the cfood as dict (`crawler_definition`) and creates the converter objects that + are defined on the highest level. Child Converters will in turn be created during the + initialization of the Converters. + """ + converters = [] + + for key, value in crawler_definition.items(): + # Definitions and Converters are reserved keywords + # on the top level of the yaml file. + # TODO: there should also be a top level keyword for the actual + # CFood to avoid confusion between top level keywords + # and the CFood. + if key == "Definitions": + continue + elif key == "Converters": + continue + converters.append(Converter.converter_factory( + value, key, converter_registry)) + + return converters + + +def start_crawling(self, items: Union[list[StructureElement], StructureElement], + crawler_definition: dict, + converter_registry: dict, + restricted_path: Optional[list[str]] = None): + """ + Start point of the crawler recursion. + + Parameters + ---------- + items: list + A list of structure elements (or a single StructureElement) that is used for + generating the initial items for the crawler. This could e.g. be a Directory. + crawler_definition : dict + A dictionary representing the crawler definition, possibly from a yaml + file. 
+ restricted_path: optional, list of strings + Traverse the data tree only along the given path. When the end of the given path + is reached, traverse the full tree as normal. + + Returns + ------- + crawled_data : list + the final list with the target state of Records. + """ + + # This function builds the tree of converters out of the crawler definition. + + if self.generalStore is None: + raise RuntimeError("Should not happen.") + + if not isinstance(items, list): + items = [items] + + self.run_id = uuid.uuid1() + local_converters = Crawler.initialize_converters(crawler_definition, converter_registry) + + # This recursive crawling procedure generates the update list: + self.crawled_data: list[db.Record] = [] + self._crawl( + items=items, + local_converters=local_converters, + generalStore=self.generalStore, + recordStore=self.recordStore, + structure_elements_path=[], + converters_path=[], + restricted_path=restricted_path) + if self.debug: + self.debug_converters = local_converters + + return self.crawled_data -- GitLab
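
Editor's note, not part of the patch: the moved `_load_definition_from_yaml_dict` accepts a cfood file with either one or two YAML documents (optional metadata first, crawler definition second). A minimal, self-contained sketch of that branch logic follows; the YAML content, converter names, and the `split_documents` helper are illustrative placeholders, not taken from the code base.

import yaml

# A two-document cfood as handled by _load_definition_from_yaml_dict: the
# first document may carry "metadata" (including extra "Converters"), the
# second is the crawler definition itself. All names below are illustrative.
TWO_DOCUMENT_CFOOD = """\
---
metadata:
  Converters:
    CustomTable:
      converter: CustomTableConverter
      package: my_package.converters
---
DataDir:
  type: Directory
  match: ^data$
  subtree: {}
"""


def split_documents(yaml_text: str):
    """Return (metadata, crawler_definition) the way the moved function does."""
    documents = list(yaml.safe_load_all(yaml_text))
    if len(documents) == 1:
        # Simple case: the whole file is the crawler definition.
        return {}, documents[0]
    if len(documents) == 2:
        # Optional metadata first, definition second.
        metadata = documents[0].get("metadata", {})
        return metadata, documents[1]
    raise RuntimeError("Crawler definition must not contain more than two documents.")


metadata, definition = split_documents(TWO_DOCUMENT_CFOOD)
print(list(metadata))    # ['Converters']
print(list(definition))  # ['DataDir']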
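
Editor's note, not part of the patch: `load_converters` merges custom entries from the definition's "Converters" section into the defaults from default_converters.yml and then resolves each entry to a class via importlib. A self-contained sketch of that resolution step; the "Fake" registry key is a placeholder and collections.OrderedDict merely stands in for a real converter class.

import importlib

# Sketch of the registry resolution performed at the end of load_converters:
# each entry names a package and a class, which are imported and attached
# under the "class" key.
converter_registry = {
    "Fake": {                        # illustrative registry entry
        "converter": "OrderedDict",  # class name inside the package
        "package": "collections",    # importable module providing it
    },
}

for key, value in converter_registry.items():
    module = importlib.import_module(value["package"])
    value["class"] = getattr(module, value["converter"])

print(converter_registry["Fake"]["class"])  # <class 'collections.OrderedDict'>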
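
Editor's note, not part of the patch: `crawl_directory` derives the name of the top-level Directory structure element from the given path, with special handling for a trailing '/' and for crawling the file system root. A small sketch of just that naming logic, with placeholder paths; the helper name is illustrative.

import os


def directory_structure_name(dirname: str) -> str:
    """Name given to the top-level Directory element, as in crawl_directory."""
    name = os.path.basename(dirname)
    if not name and dirname.endswith('/'):
        if dirname == '/':
            # Crawling the entire file system.
            name = "root"
        else:
            # dirname had a trailing '/'.
            name = os.path.basename(dirname[:-1])
    return name


print(directory_structure_name("/data/experiments"))   # experiments
print(directory_structure_name("/data/experiments/"))  # experiments
print(directory_structure_name("/"))                   # root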