From f547fa391a9a63b79764e29b82c4203ff7206289 Mon Sep 17 00:00:00 2001 From: Alexander Schlemmer <alexander@mail-schlemmer.de> Date: Wed, 8 Mar 2023 14:20:39 +0100 Subject: [PATCH] MAINT: made utility and converter registry functions top level functions without references to self --- src/caoscrawler/scanner.py | 66 +++++++++++++++++++++----------------- 1 file changed, 36 insertions(+), 30 deletions(-) diff --git a/src/caoscrawler/scanner.py b/src/caoscrawler/scanner.py index 66519aee..f2c05ab1 100644 --- a/src/caoscrawler/scanner.py +++ b/src/caoscrawler/scanner.py @@ -70,7 +70,7 @@ from .debug.debug_tree import (DebugTreeStructureElement, logger = logging.getLogger(__name__) -def load_definition(self, crawler_definition_path: str): +def load_definition(crawler_definition_path: str): """ Load a cfood from a crawler definition defined by crawler definition path and validate it using cfood-schema.yml. @@ -80,12 +80,12 @@ def load_definition(self, crawler_definition_path: str): with open(crawler_definition_path, "r") as f: crawler_definitions = list(yaml.safe_load_all(f)) - crawler_definition = self._load_definition_from_yaml_dict( + crawler_definition = _load_definition_from_yaml_dict( crawler_definitions) - return self._resolve_validator_paths(crawler_definition, crawler_definition_path) + return _resolve_validator_paths(crawler_definition, crawler_definition_path) -def _load_definition_from_yaml_dict(self, crawler_definitions: list[dict]): +def _load_definition_from_yaml_dict(crawler_definitions: list[dict]): """Load crawler definitions from a list of (yaml) dicts `crawler_definitions` which contains either one or two documents. @@ -131,7 +131,8 @@ def _load_definition_from_yaml_dict(self, crawler_definitions: list[dict]): return crawler_definition -def _resolve_validator_paths(self, definition: dict, definition_path: str): + +def _resolve_validator_paths(definition: dict, definition_path: str): """Resolve path to validation files with respect to the file in which the crawler was defined. @@ -151,7 +152,7 @@ def _resolve_validator_paths(self, definition: dict, definition_path: str): f"Couldn't find validation file {definition[key]}") elif isinstance(value, dict): # Recursively resolve all validators - definition[key] = self._resolve_validator_paths(value, definition_path) + definition[key] = _resolve_validator_paths(value, definition_path) return definition @@ -194,6 +195,35 @@ def create_converter_registry(definition: dict): return converter_registry +def initialize_converters(crawler_definition: dict, converter_registry: dict): + """ + takes the cfood as dict (`crawler_definition`) and creates the converter objects that + are defined on the highest level. Child Converters will in turn be created during the + initialization of the Converters. + """ + converters = [] + + for key, value in crawler_definition.items(): + # Definitions and Converters are reserved keywords + # on the top level of the yaml file. + # TODO: there should also be a top level keyword for the actual + # CFood to avoid confusion between top level keywords + # and the CFood. + if key == "Definitions": + continue + elif key == "Converters": + continue + converters.append(Converter.converter_factory( + value, key, converter_registry)) + + return converters + + +# -------------------------------------------------------------------------------- +# Main scanning functions: +# -------------------------------------------------------------------------------- + + def crawl_directory(self, dirname: str, crawler_definition_path: str, restricted_path: Optional[list[str]] = None): """ Crawl a single directory. @@ -229,31 +259,7 @@ def crawl_directory(self, dirname: str, crawler_definition_path: str, converter_registry, restricted_path=restricted_path ) - -def initialize_converters(crawler_definition: dict, converter_registry: dict): - """ - takes the cfood as dict (`crawler_definition`) and creates the converter objects that - are defined on the highest level. Child Converters will in turn be created during the - initialization of the Converters. - """ - converters = [] - - for key, value in crawler_definition.items(): - # Definitions and Converters are reserved keywords - # on the top level of the yaml file. - # TODO: there should also be a top level keyword for the actual - # CFood to avoid confusion between top level keywords - # and the CFood. - if key == "Definitions": - continue - elif key == "Converters": - continue - converters.append(Converter.converter_factory( - value, key, converter_registry)) - - return converters - def start_crawling(self, items: Union[list[StructureElement], StructureElement], crawler_definition: dict, -- GitLab