From dfe9365347b7fdd790c34689e601ce5fe73e8af6 Mon Sep 17 00:00:00 2001
From: Alexander Schlemmer <alexander@mail-schlemmer.de>
Date: Wed, 8 Mar 2023 14:11:54 +0100
Subject: [PATCH] ENH: created scanner module and moved some functions there

---
 src/caoscrawler/crawl.py   | 123 +-------------------
 src/caoscrawler/scanner.py | 224 +++++++++++++++++++++++++++++++++++++
 2 files changed, 225 insertions(+), 122 deletions(-)
 create mode 100644 src/caoscrawler/scanner.py

diff --git a/src/caoscrawler/crawl.py b/src/caoscrawler/crawl.py
index 5269aeb8..50163ff3 100644
--- a/src/caoscrawler/crawl.py
+++ b/src/caoscrawler/crawl.py
@@ -231,129 +231,8 @@ class Crawler(object):
             self.debug_metadata["provenance"] = defaultdict(lambda: dict())
             self.debug_metadata["usage"] = defaultdict(lambda: set())
 
-    def load_definition(self, crawler_definition_path: str):
-        """
-        Load a cfood from a crawler definition defined by
-        crawler definition path and validate it using cfood-schema.yml.
-        """
-
-        # Load the cfood from a yaml file:
-        with open(crawler_definition_path, "r") as f:
-            crawler_definitions = list(yaml.safe_load_all(f))
-
-        crawler_definition = self._load_definition_from_yaml_dict(
-            crawler_definitions)
-
-        return self._resolve_validator_paths(crawler_definition, crawler_definition_path)
-
-    def _load_definition_from_yaml_dict(self, crawler_definitions: list[dict]):
-        """Load crawler definitions from a list of (yaml) dicts `crawler_definitions` which
-        contains either one or two documents.
-
-        Doesn't resolve the validator paths in the cfood definition, so for
-        internal and testing use only.
-
-        """
-        if len(crawler_definitions) == 1:
-            # Simple case, just one document:
-            crawler_definition = crawler_definitions[0]
-            metadata = {}
-        elif len(crawler_definitions) == 2:
-            metadata = crawler_definitions[0]["metadata"] if "metadata" in crawler_definitions[0] else {
-            }
-            crawler_definition = crawler_definitions[1]
-        else:
-            raise RuntimeError(
-                "Crawler definition must not contain more than two documents.")
-
-        check_cfood_version(metadata)
-
-        # TODO: at this point this function can already load the cfood schema extensions
-        #       from the crawler definition and add them to the yaml schema that will be
-        #       tested in the next lines of code:
-
-        # Load the cfood schema:
-        with open(str(files('caoscrawler').joinpath('cfood-schema.yml')), "r") as f:
-            schema = yaml.safe_load(f)
-
-        # Add custom converters to converter enum in schema:
-        if "Converters" in crawler_definition:
-            for key in crawler_definition["Converters"]:
-                schema["cfood"]["$defs"]["converter"]["properties"]["type"]["enum"].append(
-                    key)
-        if len(crawler_definitions) == 2:
-            if "Converters" in metadata:
-                for key in metadata["Converters"]:
-                    schema["cfood"]["$defs"]["converter"]["properties"]["type"]["enum"].append(
-                        key)
-
-        # Validate the cfood schema:
-        validate(instance=crawler_definition, schema=schema["cfood"])
-
-        return crawler_definition
-
-    def _resolve_validator_paths(self, definition: dict, definition_path: str):
-        """Resolve path to validation files with respect to the file in which
-        the crawler was defined.
-
-        """
-
-        for key, value in definition.items():
-
-            if key == "validate" and isinstance(value, str):
-                # Validator is given by a path
-                if not value.startswith('/'):
-                    # Not an absolute path
-                    definition[key] = os.path.join(os.path.dirname(definition_path), value)
-                    if not os.path.isfile(definition[key]):
-                        # TODO(henrik) capture this in `crawler_main` similar to
-                        # `ConverterValidationError`.
-                        raise FileNotFoundError(
-                            f"Couldn't find validation file {definition[key]}")
-            elif isinstance(value, dict):
-                # Recursively resolve all validators
-                definition[key] = self._resolve_validator_paths(value, definition_path)
-
-        return definition
-
-    def load_converters(self, definition: dict):
-        """
-        Currently the converter registry is a dictionary containing for each converter:
-        - key is the short code, abbreviation for the converter class name
-        - module is the name of the module to be imported which must be installed
-        - class is the converter class to load and associate with this converter entry
-
-        all other info for the converter needs to be included in the converter plugin
-        directory:
-        schema.yml file
-        README.md documentation
-
-        TODO: this function does not make use of self, so it could become static.
-        """
 
-        # Defaults for the converter registry:
-        with open(str(files('caoscrawler').joinpath('default_converters.yml')), "r") as f:
-            converter_registry: dict[str, dict[str, str]] = yaml.safe_load(f)
-
-        # More converters from definition file:
-        if "Converters" in definition:
-            for key, entry in definition["Converters"].items():
-                if key in ["Dict", "DictTextElement", "DictIntegerElement", "DictBooleanElement",
-                           "DictDictElement", "DictListElement", "DictFloatElement"]:
-                    warnings.warn(DeprecationWarning(f"{key} is deprecated. Please use the new"
-                                                     " variant; without 'Dict' prefix or "
-                                                     "'DictElement' in case of 'Dict'"))
-
-                converter_registry[key] = {
-                    "converter": entry["converter"],
-                    "package": entry["package"]
-                }
-
-        # Load modules and associate classes:
-        for key, value in converter_registry.items():
-            module = importlib.import_module(value["package"])
-            value["class"] = getattr(module, value["converter"])
-        return converter_registry
+
 
     def crawl_directory(self, dirname: str, crawler_definition_path: str,
                         restricted_path: Optional[list[str]] = None):
diff --git a/src/caoscrawler/scanner.py b/src/caoscrawler/scanner.py
new file mode 100644
index 00000000..740ba464
--- /dev/null
+++ b/src/caoscrawler/scanner.py
@@ -0,0 +1,224 @@
+#!/usr/bin/env python3
+# encoding: utf-8
+#
+# ** header v3.0
+# This file is a part of the CaosDB Project.
+#
+# Copyright (C) 2023 Alexander Schlemmer <alexander.schlemmer@ds.mpg.de>
+#
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <https://www.gnu.org/licenses/>.
+#
+# ** end header
+#
+
+"""
+This is the scanner, the original "_crawl" function from crawl.py.
+This is just the functionality, that extracts data from the file system.
+"""
+
+from __future__ import annotations
+
+import argparse
+import importlib
+import logging
+import os
+import sys
+import warnings
+import yaml
+
+from argparse import RawTextHelpFormatter
+from collections import defaultdict
+from copy import deepcopy
+from enum import Enum
+from importlib_resources import files
+from jsonschema import validate
+from typing import Any, Optional, Type, Union
+
+import caosdb as db
+
+from caosadvancedtools.cache import UpdateCache, Cache
+from caosadvancedtools.crawler import Crawler as OldCrawler
+from caosdb.apiutils import (compare_entities, EntityMergeConflictError,
+                             merge_entities)
+from caosdb.common.datatype import is_reference
+
+from .converters import Converter, DirectoryConverter, ConverterValidationError
+
+from .macros import defmacro_constructor, macro_constructor
+from .stores import Store, GeneralStore, RecordStore
+from .structure_elements import StructureElement, Directory, NoneElement
+from .version import check_cfood_version
+
+from caosdb.high_level_api import convert_to_python_object
+
+from .debug.debug_tree import (DebugTreeStructureElement,
+                               DebugTreeConverter,
+                               DebugTreeVariable)
+
+logger = logging.getLogger(__name__)
+
+
+def load_definition(crawler_definition_path: str):
+    """
+    Load a cfood from the crawler definition file at `crawler_definition_path`
+    and validate it using cfood-schema.yml.
+    """
+
+    # Load the cfood from a yaml file:
+    with open(crawler_definition_path, "r") as f:
+        crawler_definitions = list(yaml.safe_load_all(f))
+
+    crawler_definition = _load_definition_from_yaml_dict(crawler_definitions)
+
+    return _resolve_validator_paths(crawler_definition, crawler_definition_path)
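+
+# A minimal usage sketch (the cfood path is hypothetical):
+#
+#     definition = load_definition("/path/to/example.cfood.yml")
+#
+# The returned dict has been validated against cfood-schema.yml and any
+# relative "validate" paths in it have been resolved to absolute ones.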
+
+def _load_definition_from_yaml_dict(crawler_definitions: list[dict]):
+    """Load crawler definitions from a list of (yaml) dicts `crawler_definitions` which
+    contains either one or two documents.
+
+    Doesn't resolve the validator paths in the cfood definition, so it is for
+    internal and testing use only.
+
+    """
+    if len(crawler_definitions) == 1:
+        # Simple case, just one document:
+        crawler_definition = crawler_definitions[0]
+        metadata = {}
+    elif len(crawler_definitions) == 2:
+        metadata = crawler_definitions[0].get("metadata", {})
+        crawler_definition = crawler_definitions[1]
+    else:
+        raise RuntimeError(
+            "Crawler definition must not contain more than two documents.")
+
+    check_cfood_version(metadata)
+
+    # TODO: at this point this function could already load the cfood schema extensions
+    #       from the crawler definition and add them to the yaml schema that will be
+    #       tested in the next lines of code:
+
+    # Load the cfood schema:
+    with open(str(files('caoscrawler').joinpath('cfood-schema.yml')), "r") as f:
+        schema = yaml.safe_load(f)
+
+    # Add custom converters to the converter enum in the schema:
+    if "Converters" in crawler_definition:
+        for key in crawler_definition["Converters"]:
+            schema["cfood"]["$defs"]["converter"]["properties"]["type"]["enum"].append(
+                key)
+    if len(crawler_definitions) == 2:
+        if "Converters" in metadata:
+            for key in metadata["Converters"]:
+                schema["cfood"]["$defs"]["converter"]["properties"]["type"]["enum"].append(
+                    key)
+
+    # Validate the crawler definition against the cfood schema:
+    validate(instance=crawler_definition, schema=schema["cfood"])
+
+    return crawler_definition
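+
+# For illustration, a minimal two-document cfood as handled above (the
+# converter tree is hypothetical; the first document carries the metadata):
+#
+#     ---
+#     metadata:
+#       crawler-version: 0.3.1
+#     ---
+#     DataDir:
+#       type: Directory
+#       match: data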
+
+def _resolve_validator_paths(definition: dict, definition_path: str):
+    """Resolve paths to validation files relative to the file in which
+    the crawler was defined.
+
+    """
+
+    for key, value in definition.items():
+
+        if key == "validate" and isinstance(value, str):
+            # Validator is given by a path
+            if not value.startswith('/'):
+                # Not an absolute path
+                definition[key] = os.path.join(os.path.dirname(definition_path), value)
+                if not os.path.isfile(definition[key]):
+                    # TODO(henrik) capture this in `crawler_main` similar to
+                    # `ConverterValidationError`.
+                    raise FileNotFoundError(
+                        f"Couldn't find validation file {definition[key]}")
+        elif isinstance(value, dict):
+            # Recursively resolve all validators
+            definition[key] = _resolve_validator_paths(value, definition_path)
+
+    return definition
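+
+# Example of the resolution above (hypothetical values): a definition file
+# /cfoods/my.cfood.yml containing "validate: schemas/record.schema.json" is
+# rewritten to point at "/cfoods/schemas/record.schema.json".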
+
+def load_converters(definition: dict):
+    """
+    Currently the converter registry is a dictionary containing for each converter:
+    - key is the short code, an abbreviation of the converter class name
+    - module is the name of the module to be imported, which must be installed
+    - class is the converter class to load and associate with this converter entry
+
+    All other info for the converter needs to be included in the converter plugin
+    directory:
+    - schema.yml file
+    - README.md documentation
+    """
+
+    # Defaults for the converter registry:
+    with open(str(files('caoscrawler').joinpath('default_converters.yml')), "r") as f:
+        converter_registry: dict[str, dict[str, str]] = yaml.safe_load(f)
+
+    # More converters from the definition file:
+    if "Converters" in definition:
+        for key, entry in definition["Converters"].items():
+            if key in ["Dict", "DictTextElement", "DictIntegerElement", "DictBooleanElement",
+                       "DictDictElement", "DictListElement", "DictFloatElement"]:
+                warnings.warn(DeprecationWarning(f"{key} is deprecated. Please use the new"
+                                                 " variant, without the 'Dict' prefix, or "
+                                                 "'DictElement' in case of 'Dict'."))
+
+            converter_registry[key] = {
+                "converter": entry["converter"],
+                "package": entry["package"]
+            }
+
+    # Load modules and associate classes:
+    for key, value in converter_registry.items():
+        module = importlib.import_module(value["package"])
+        value["class"] = getattr(module, value["converter"])
+    return converter_registry
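+
+# Sketch of the registry produced above (hypothetical converter entry): a
+# definition containing
+#
+#     Converters:
+#       MyElement:
+#         converter: MyElementConverter
+#         package: my_package.converters
+#
+# yields converter_registry["MyElement"] with keys "converter", "package" and
+# "class", where "class" is the imported MyElementConverter class object.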
-- 
GitLab