Commit 2fdacd93 authored by Alexander Schlemmer

MAINT: moved crawling functions which are actually part of the scanner to scanner module

parent dfe93653
2 merge requests: !108 Release 0.5.0, !104 Create a new scanner module and move functions from crawl module there
Pipeline #34405 failed
@@ -233,118 +233,6 @@ class Crawler(object):
def crawl_directory(self, dirname: str, crawler_definition_path: str,
restricted_path: Optional[list[str]] = None):
""" Crawl a single directory.
Convenience function that starts the crawler (calls start_crawling)
with a single directory as the StructureElement.
restricted_path: optional, list of strings
Traverse the data tree only along the given path. When the end of the given path
is reached, traverse the full tree as normal.
"""
crawler_definition = self.load_definition(crawler_definition_path)
# Load and register converter packages:
converter_registry = self.load_converters(crawler_definition)
if not dirname:
raise ValueError(
"You have to provide a non-empty path for crawling.")
dir_structure_name = os.path.basename(dirname)
self.crawled_directory = dirname
if not dir_structure_name and dirname.endswith('/'):
if dirname == '/':
# Crawling the entire file system
dir_structure_name = "root"
else:
# dirname had a trailing '/'
dir_structure_name = os.path.basename(dirname[:-1])
self.start_crawling(Directory(dir_structure_name,
dirname),
crawler_definition,
converter_registry,
restricted_path=restricted_path
)
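# Usage sketch (editor's illustration, not part of this commit): given an
# already initialized Crawler instance `crawler` and an existing cfood file,
# a restricted crawl of a single directory could look like this; the path and
# the restricted_path entries are purely hypothetical.
#
#     crawler.crawl_directory("/data/experiments", "cfood.yml",
#                             restricted_path=["2023", "run_01"])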
@staticmethod
def initialize_converters(crawler_definition: dict, converter_registry: dict):
"""
Takes the cfood as a dict (`crawler_definition`) and creates the converter objects that
are defined on the highest level. Child converters will in turn be created during the
initialization of their parent converters.
"""
converters = []
for key, value in crawler_definition.items():
# Definitions and Converters are reserved keywords
# on the top level of the yaml file.
# TODO: there should also be a top level keyword for the actual
# CFood to avoid confusion between top level keywords
# and the CFood.
if key == "Definitions":
continue
elif key == "Converters":
continue
converters.append(Converter.converter_factory(
value, key, converter_registry))
return converters
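# Illustrative sketch (editor's addition; the "DataDir" entry and its keys are
# hypothetical): only top-level keys other than the reserved "Definitions" and
# "Converters" are turned into converter objects.
#
#     crawler_definition = {
#         "Definitions": {...},   # reserved, skipped
#         "Converters": {...},    # reserved, skipped
#         "DataDir": {"type": "Directory", "match": "data"},
#     }
#     converters = Crawler.initialize_converters(crawler_definition,
#                                                converter_registry)
#     # `converters` now holds a single converter built from the "DataDir" entry.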
def start_crawling(self, items: Union[list[StructureElement], StructureElement],
crawler_definition: dict,
converter_registry: dict,
restricted_path: Optional[list[str]] = None):
"""
Start point of the crawler recursion.
Parameters
----------
items: list
A list of structure elements (or a single StructureElement) that is used for
generating the initial items for the crawler. This could e.g. be a Directory.
crawler_definition : dict
A dictionary representing the crawler definition, possibly from a yaml
file.
restricted_path: optional, list of strings
Traverse the data tree only along the given path. When the end of the given path
is reached, traverse the full tree as normal.
Returns
-------
crawled_data : list
the final list with the target state of Records.
"""
# This function builds the tree of converters out of the crawler definition.
if self.generalStore is None:
raise RuntimeError("Should not happen.")
if not isinstance(items, list):
items = [items]
self.run_id = uuid.uuid1()
local_converters = Crawler.initialize_converters(crawler_definition, converter_registry)
# This recursive crawling procedure generates the update list:
self.crawled_data: list[db.Record] = []
self._crawl(
items=items,
local_converters=local_converters,
generalStore=self.generalStore,
recordStore=self.recordStore,
structure_elements_path=[],
converters_path=[],
restricted_path=restricted_path)
if self.debug:
self.debug_converters = local_converters
return self.crawled_data
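# Usage sketch (editor's illustration): the typical call chain on an
# initialized Crawler instance `crawler`; file and directory names are
# hypothetical.
#
#     crawler_definition = crawler.load_definition("cfood.yml")
#     converter_registry = crawler.load_converters(crawler_definition)
#     records = crawler.start_crawling(Directory("data", "/path/to/data"),
#                                      crawler_definition,
#                                      converter_registry)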
def synchronize(self, commit_changes: bool = True, unique_names=True):
"""
Carry out the actual synchronization.
......
@@ -70,154 +70,239 @@ from .debug.debug_tree import (DebugTreeStructureElement,
logger = logging.getLogger(__name__)
def load_definition(self, crawler_definition_path: str):
"""
Load a cfood from the crawler definition file at `crawler_definition_path`
and validate it using cfood-schema.yml.
"""
# Load the cfood from a yaml file:
with open(crawler_definition_path, "r") as f:
crawler_definitions = list(yaml.safe_load_all(f))
crawler_definition = self._load_definition_from_yaml_dict(
crawler_definitions)
return self._resolve_validator_paths(crawler_definition, crawler_definition_path)
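# Brief sketch (editor's note; the instance `crawler` and the file name are
# hypothetical):
#
#     definition = crawler.load_definition("cfood.yml")
#
# parses all YAML documents in the file via the helper below, validates them
# against cfood-schema.yml and returns the definition with its "validate"
# paths resolved.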
def _load_definition_from_yaml_dict(self, crawler_definitions: list[dict]):
"""Load crawler definitions from a list of (yaml) dicts `crawler_definitions` which
contains either one or two documents.
Doesn't resolve the validator paths in the cfood definition, so for
internal and testing use only.
"""
if len(crawler_definitions) == 1:
# Simple case, just one document:
crawler_definition = crawler_definitions[0]
metadata = {}
elif len(crawler_definitions) == 2:
metadata = crawler_definitions[0]["metadata"] if "metadata" in crawler_definitions[0] else {
}
crawler_definition = crawler_definitions[1]
else:
raise RuntimeError(
"Crawler definition must not contain more than two documents.")
check_cfood_version(metadata)
# TODO: at this point this function can already load the cfood schema extensions
# from the crawler definition and add them to the yaml schema that will be
# tested in the next lines of code:
# Load the cfood schema:
with open(str(files('caoscrawler').joinpath('cfood-schema.yml')), "r") as f:
schema = yaml.safe_load(f)
# Add custom converters to converter enum in schema:
if "Converters" in crawler_definition:
for key in crawler_definition["Converters"]:
schema["cfood"]["$defs"]["converter"]["properties"]["type"]["enum"].append(
key)
if len(crawler_definitions) == 2:
if "Converters" in metadata:
for key in metadata["Converters"]:
schema["cfood"]["$defs"]["converter"]["properties"]["type"]["enum"].append(
key)
# Validate the cfood schema:
validate(instance=crawler_definition, schema=schema["cfood"])
return crawler_definition
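# Illustrative inputs (editor's addition; dict contents and the instance
# `crawler` are hypothetical): the parsed YAML documents arrive as a list of
# one or two dicts.
#
#     # single document: the crawler definition only, metadata defaults to {}
#     crawler._load_definition_from_yaml_dict([{"DataDir": {...}}])
#
#     # two documents: metadata first, then the crawler definition
#     crawler._load_definition_from_yaml_dict([
#         {"metadata": {"Converters": {"MyConverter": {...}}}},
#         {"DataDir": {...}},
#     ])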
def _resolve_validator_paths(self, definition: dict, definition_path: str):
"""Resolve path to validation files with respect to the file in which
the crawler was defined.
"""
for key, value in definition.items():
if key == "validate" and isinstance(value, str):
# Validator is given by a path
if not value.startswith('/'):
# Not an absolute path
definition[key] = os.path.join(os.path.dirname(definition_path), value)
if not os.path.isfile(definition[key]):
# TODO(henrik) capture this in `crawler_main` similar to
# `ConverterValidationError`.
raise FileNotFoundError(
f"Couldn't find validation file {definition[key]}")
elif isinstance(value, dict):
# Recursively resolve all validators
definition[key] = self._resolve_validator_paths(value, definition_path)
return definition
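# Illustrative effect (editor's addition; file names are hypothetical): for a
# cfood located at /projects/cfood.yml, a relative entry
#
#     {"validate": "schemas/dataset.schema.json"}
#
# is rewritten to
#
#     {"validate": "/projects/schemas/dataset.schema.json"}
#
# and a FileNotFoundError is raised if the resolved file does not exist.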
def load_converters(self, definition: dict):
"""
Currently the converter registry is a dictionary containing for each converter:
- key is the short code, abbreviation for the converter class name
- module is the name of the module to be imported which must be installed
- class is the converter class to load and associate with this converter entry
All other information for the converter needs to be included in the converter plugin
directory:
- schema.yml file
- README.md documentation
TODO: this function does not make use of self, so it could become static.
"""
# Defaults for the converter registry:
with open(str(files('caoscrawler').joinpath('default_converters.yml')), "r") as f:
converter_registry: dict[str, dict[str, str]] = yaml.safe_load(f)
# More converters from definition file:
if "Converters" in definition:
for key, entry in definition["Converters"].items():
if key in ["Dict", "DictTextElement", "DictIntegerElement", "DictBooleanElement",
"DictDictElement", "DictListElement", "DictFloatElement"]:
warnings.warn(DeprecationWarning(f"{key} is deprecated. Please use the new"
" variant; without 'Dict' prefix or "
"'DictElement' in case of 'Dict'"))
converter_registry[key] = {
"converter": entry["converter"],
"package": entry["package"]
}
# Load modules and associate classes:
for key, value in converter_registry.items():
module = importlib.import_module(value["package"])
value["class"] = getattr(module, value["converter"])
return converter_registry
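# Illustrative sketch (editor's addition; the converter name, package and the
# instance `crawler` are hypothetical): a "Converters" section in the
# definition adds entries on top of default_converters.yml, and the returned
# registry also carries the imported class object.
#
#     definition = {
#         "Converters": {
#             "MyTableConverter": {
#                 "converter": "MyTableConverter",
#                 "package": "mypackage.converters",
#             }
#         }
#     }
#     registry = crawler.load_converters(definition)
#     # registry["MyTableConverter"]["class"] is mypackage.converters.MyTableConverter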
def crawl_directory(self, dirname: str, crawler_definition_path: str,
restricted_path: Optional[list[str]] = None):
""" Crawl a single directory.
Convenience function that starts the crawler (calls start_crawling)
with a single directory as the StructureElement.
restricted_path: optional, list of strings
Traverse the data tree only along the given path. When the end of the given path
is reached, traverse the full tree as normal.
"""
crawler_definition = self.load_definition(crawler_definition_path)
# Load and register converter packages:
converter_registry = self.load_converters(crawler_definition)
if not dirname:
raise ValueError(
"You have to provide a non-empty path for crawling.")
dir_structure_name = os.path.basename(dirname)
self.crawled_directory = dirname
if not dir_structure_name and dirname.endswith('/'):
if dirname == '/':
# Crawling the entire file system
dir_structure_name = "root"
else:
# dirname had a trailing '/'
dir_structure_name = os.path.basename(dirname[:-1])
self.start_crawling(Directory(dir_structure_name,
dirname),
crawler_definition,
converter_registry,
restricted_path=restricted_path
)
def initialize_converters(crawler_definition: dict, converter_registry: dict):
"""
Takes the cfood as a dict (`crawler_definition`) and creates the converter objects that
are defined on the highest level. Child converters will in turn be created during the
initialization of their parent converters.
"""
converters = []
for key, value in crawler_definition.items():
# Definitions and Converters are reserved keywords
# on the top level of the yaml file.
# TODO: there should also be a top level keyword for the actual
# CFood to avoid confusion between top level keywords
# and the CFood.
if key == "Definitions":
continue
elif key == "Converters":
continue
converters.append(Converter.converter_factory(
value, key, converter_registry))
return converters
def start_crawling(self, items: Union[list[StructureElement], StructureElement],
crawler_definition: dict,
converter_registry: dict,
restricted_path: Optional[list[str]] = None):
"""
Start point of the crawler recursion.
Parameters
----------
items: list
A list of structure elements (or a single StructureElement) that is used for
generating the initial items for the crawler. This could e.g. be a Directory.
crawler_definition : dict
A dictionary representing the crawler definition, possibly from a yaml
file.
restricted_path: optional, list of strings
Traverse the data tree only along the given path. When the end of the given path
is reached, traverse the full tree as normal.
Returns
-------
crawled_data : list
the final list with the target state of Records.
"""
# This function builds the tree of converters out of the crawler definition.
if self.generalStore is None:
raise RuntimeError("Should not happen.")
if not isinstance(items, list):
items = [items]
self.run_id = uuid.uuid1()
local_converters = Crawler.initialize_converters(crawler_definition, converter_registry)
# This recursive crawling procedure generates the update list:
self.crawled_data: list[db.Record] = []
self._crawl(
items=items,
local_converters=local_converters,
generalStore=self.generalStore,
recordStore=self.recordStore,
structure_elements_path=[],
converters_path=[],
restricted_path=restricted_path)
if self.debug:
self.debug_converters = local_converters
return self.crawled_data