From dfe9365347b7fdd790c34689e601ce5fe73e8af6 Mon Sep 17 00:00:00 2001
From: Alexander Schlemmer <alexander@mail-schlemmer.de>
Date: Wed, 8 Mar 2023 14:11:54 +0100
Subject: [PATCH] ENH: created scanner module and moved some functions there

---
 src/caoscrawler/crawl.py   | 123 +------------------
 src/caoscrawler/scanner.py | 223 +++++++++++++++++++++++++++++++++++++
 2 files changed, 224 insertions(+), 122 deletions(-)
 create mode 100644 src/caoscrawler/scanner.py

diff --git a/src/caoscrawler/crawl.py b/src/caoscrawler/crawl.py
index 5269aeb8..50163ff3 100644
--- a/src/caoscrawler/crawl.py
+++ b/src/caoscrawler/crawl.py
@@ -231,129 +231,8 @@ class Crawler(object):
         self.debug_metadata["provenance"] = defaultdict(lambda: dict())
         self.debug_metadata["usage"] = defaultdict(lambda: set())
 
-    def load_definition(self, crawler_definition_path: str):
-        """
-        Load a cfood from a crawler definition defined by
-        crawler definition path and validate it using cfood-schema.yml.
-        """
-
-        # Load the cfood from a yaml file:
-        with open(crawler_definition_path, "r") as f:
-            crawler_definitions = list(yaml.safe_load_all(f))
-
-        crawler_definition = self._load_definition_from_yaml_dict(
-            crawler_definitions)
-
-        return self._resolve_validator_paths(crawler_definition, crawler_definition_path)
-
-    def _load_definition_from_yaml_dict(self, crawler_definitions: list[dict]):
-        """Load crawler definitions from a list of (yaml) dicts `crawler_definitions` which
-        contains either one or two documents.
-
-        Doesn't resolve the validator paths in the cfood definition, so for
-        internal and testing use only.
-
-        """
-        if len(crawler_definitions) == 1:
-            # Simple case, just one document:
-            crawler_definition = crawler_definitions[0]
-            metadata = {}
-        elif len(crawler_definitions) == 2:
-            metadata = crawler_definitions[0]["metadata"] if "metadata" in crawler_definitions[0] else {
-            }
-            crawler_definition = crawler_definitions[1]
-        else:
-            raise RuntimeError(
-                "Crawler definition must not contain more than two documents.")
-
-        check_cfood_version(metadata)
-
-        # TODO: at this point this function can already load the cfood schema extensions
-        # from the crawler definition and add them to the yaml schema that will be
-        # tested in the next lines of code:
-
-        # Load the cfood schema:
-        with open(str(files('caoscrawler').joinpath('cfood-schema.yml')), "r") as f:
-            schema = yaml.safe_load(f)
-
-        # Add custom converters to converter enum in schema:
-        if "Converters" in crawler_definition:
-            for key in crawler_definition["Converters"]:
-                schema["cfood"]["$defs"]["converter"]["properties"]["type"]["enum"].append(
-                    key)
-        if len(crawler_definitions) == 2:
-            if "Converters" in metadata:
-                for key in metadata["Converters"]:
-                    schema["cfood"]["$defs"]["converter"]["properties"]["type"]["enum"].append(
-                        key)
-
-        # Validate the cfood schema:
-        validate(instance=crawler_definition, schema=schema["cfood"])
-
-        return crawler_definition
-
-    def _resolve_validator_paths(self, definition: dict, definition_path: str):
-        """Resolve path to validation files with respect to the file in which
-        the crawler was defined.
-
-        """
-
-        for key, value in definition.items():
-
-            if key == "validate" and isinstance(value, str):
-                # Validator is given by a path
-                if not value.startswith('/'):
-                    # Not an absolute path
-                    definition[key] = os.path.join(os.path.dirname(definition_path), value)
-                if not os.path.isfile(definition[key]):
-                    # TODO(henrik) capture this in `crawler_main` similar to
-                    # `ConverterValidationError`.
-                    raise FileNotFoundError(
-                        f"Couldn't find validation file {definition[key]}")
-            elif isinstance(value, dict):
-                # Recursively resolve all validators
-                definition[key] = self._resolve_validator_paths(value, definition_path)
-
-        return definition
-
-    def load_converters(self, definition: dict):
-        """
-        Currently the converter registry is a dictionary containing for each converter:
-        - key is the short code, abbreviation for the converter class name
-        - module is the name of the module to be imported which must be installed
-        - class is the converter class to load and associate with this converter entry
-
-        all other info for the converter needs to be included in the converter plugin
-        directory:
-        schema.yml file
-        README.md documentation
-
-        TODO: this function does not make use of self, so it could become static.
-        """
-        # Defaults for the converter registry:
-        with open(str(files('caoscrawler').joinpath('default_converters.yml')), "r") as f:
-            converter_registry: dict[str, dict[str, str]] = yaml.safe_load(f)
-
-        # More converters from definition file:
-        if "Converters" in definition:
-            for key, entry in definition["Converters"].items():
-                if key in ["Dict", "DictTextElement", "DictIntegerElement", "DictBooleanElement",
-                           "DictDictElement", "DictListElement", "DictFloatElement"]:
-                    warnings.warn(DeprecationWarning(f"{key} is deprecated. Please use the new"
-                                                     " variant; without 'Dict' prefix or "
-                                                     "'DictElement' in case of 'Dict'"))
-
-                converter_registry[key] = {
-                    "converter": entry["converter"],
-                    "package": entry["package"]
-                }
-
-        # Load modules and associate classes:
-        for key, value in converter_registry.items():
-            module = importlib.import_module(value["package"])
-            value["class"] = getattr(module, value["converter"])
-        return converter_registry
+
     def crawl_directory(self, dirname: str, crawler_definition_path: str, restricted_path: Optional[list[str]] = None):
diff --git a/src/caoscrawler/scanner.py b/src/caoscrawler/scanner.py
new file mode 100644
index 00000000..740ba464
--- /dev/null
+++ b/src/caoscrawler/scanner.py
@@ -0,0 +1,223 @@
+#!/usr/bin/env python3
+# encoding: utf-8
+#
+# ** header v3.0
+# This file is a part of the CaosDB Project.
+#
+# Copyright (C) 2023 Alexander Schlemmer <alexander.schlemmer@ds.mpg.de>
+#
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <https://www.gnu.org/licenses/>.
+#
+# ** end header
+#
+
+"""
+This is the scanner: originally the "_crawl" function from crawl.py.
+It contains only the functionality that extracts data from the file system.
+""" + +from __future__ import annotations + +import argparse +import importlib +import logging +import os +import sys +import warnings +import yaml + +from argparse import RawTextHelpFormatter +from collections import defaultdict +from copy import deepcopy +from enum import Enum +from importlib_resources import files +from jsonschema import validate +from typing import Any, Optional, Type, Union + +import caosdb as db + +from caosadvancedtools.cache import UpdateCache, Cache +from caosadvancedtools.crawler import Crawler as OldCrawler +from caosdb.apiutils import (compare_entities, EntityMergeConflictError, + merge_entities) +from caosdb.common.datatype import is_reference + +from .converters import Converter, DirectoryConverter, ConverterValidationError + +from .macros import defmacro_constructor, macro_constructor +from .stores import Store, GeneralStore, RecordStore +from .structure_elements import StructureElement, Directory, NoneElement +from .version import check_cfood_version + +from caosdb.high_level_api import convert_to_python_object + +from .debug.debug_tree import (DebugTreeStructureElement, + DebugTreeConverter, + DebugTreeVariable) + +logger = logging.getLogger(__name__) + + + + + + + + + + + + + + + + + + + + + def load_definition(self, crawler_definition_path: str): + """ + Load a cfood from a crawler definition defined by + crawler definition path and validate it using cfood-schema.yml. + """ + + # Load the cfood from a yaml file: + with open(crawler_definition_path, "r") as f: + crawler_definitions = list(yaml.safe_load_all(f)) + + crawler_definition = self._load_definition_from_yaml_dict( + crawler_definitions) + + return self._resolve_validator_paths(crawler_definition, crawler_definition_path) + + def _load_definition_from_yaml_dict(self, crawler_definitions: list[dict]): + """Load crawler definitions from a list of (yaml) dicts `crawler_definitions` which + contains either one or two documents. + + Doesn't resolve the validator paths in the cfood definition, so for + internal and testing use only. 
+
+        """
+        if len(crawler_definitions) == 1:
+            # Simple case, just one document:
+            crawler_definition = crawler_definitions[0]
+            metadata = {}
+        elif len(crawler_definitions) == 2:
+            metadata = crawler_definitions[0]["metadata"] if "metadata" in crawler_definitions[0] else {
+            }
+            crawler_definition = crawler_definitions[1]
+        else:
+            raise RuntimeError(
+                "Crawler definition must not contain more than two documents.")
+
+        check_cfood_version(metadata)
+
+        # TODO: at this point this function can already load the cfood schema extensions
+        # from the crawler definition and add them to the yaml schema that will be
+        # tested in the next lines of code:
+
+        # Load the cfood schema:
+        with open(str(files('caoscrawler').joinpath('cfood-schema.yml')), "r") as f:
+            schema = yaml.safe_load(f)
+
+        # Add custom converters to converter enum in schema:
+        if "Converters" in crawler_definition:
+            for key in crawler_definition["Converters"]:
+                schema["cfood"]["$defs"]["converter"]["properties"]["type"]["enum"].append(
+                    key)
+        if len(crawler_definitions) == 2:
+            if "Converters" in metadata:
+                for key in metadata["Converters"]:
+                    schema["cfood"]["$defs"]["converter"]["properties"]["type"]["enum"].append(
+                        key)
+
+        # Validate the cfood schema:
+        validate(instance=crawler_definition, schema=schema["cfood"])
+
+        return crawler_definition
+
+    def _resolve_validator_paths(self, definition: dict, definition_path: str):
+        """Resolve path to validation files with respect to the file in which
+        the crawler was defined.
+
+        """
+
+        for key, value in definition.items():
+
+            if key == "validate" and isinstance(value, str):
+                # Validator is given by a path
+                if not value.startswith('/'):
+                    # Not an absolute path
+                    definition[key] = os.path.join(os.path.dirname(definition_path), value)
+                if not os.path.isfile(definition[key]):
+                    # TODO(henrik) capture this in `crawler_main` similar to
+                    # `ConverterValidationError`.
+                    raise FileNotFoundError(
+                        f"Couldn't find validation file {definition[key]}")
+            elif isinstance(value, dict):
+                # Recursively resolve all validators
+                definition[key] = self._resolve_validator_paths(value, definition_path)
+
+        return definition
+
+    def load_converters(self, definition: dict):
+        """
+        Currently the converter registry is a dictionary containing for each converter:
+        - key is the short code, abbreviation for the converter class name
+        - module is the name of the module to be imported which must be installed
+        - class is the converter class to load and associate with this converter entry
+
+        all other info for the converter needs to be included in the converter plugin
+        directory:
+        schema.yml file
+        README.md documentation
+
+        TODO: this function does not make use of self, so it could become static.
+        """
+
+        # Defaults for the converter registry:
+        with open(str(files('caoscrawler').joinpath('default_converters.yml')), "r") as f:
+            converter_registry: dict[str, dict[str, str]] = yaml.safe_load(f)
+
+        # More converters from definition file:
+        if "Converters" in definition:
+            for key, entry in definition["Converters"].items():
+                if key in ["Dict", "DictTextElement", "DictIntegerElement", "DictBooleanElement",
+                           "DictDictElement", "DictListElement", "DictFloatElement"]:
+                    warnings.warn(DeprecationWarning(f"{key} is deprecated. Please use the new"
+                                                     " variant; without 'Dict' prefix or "
+                                                     "'DictElement' in case of 'Dict'"))
+
+                converter_registry[key] = {
+                    "converter": entry["converter"],
+                    "package": entry["package"]
+                }
+
+        # Load modules and associate classes:
+        for key, value in converter_registry.items():
+            module = importlib.import_module(value["package"])
+            value["class"] = getattr(module, value["converter"])
+        return converter_registry
+
+
+
+
+
+
+
+
+
-- 
GitLab
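
Two short sketches follow; they illustrate the conventions the moved functions
implement and are not part of the patch itself.

First, the one-or-two-document cfood convention handled by
_load_definition_from_yaml_dict(): a cfood YAML file holds either a single
document (the definition itself) or two documents (metadata first, definition
second). This minimal, self-contained sketch reproduces that branching outside
the crawler; the cfood content and the "crawler-version" key are invented for
illustration.

import yaml

TWO_DOC_CFOOD = """
metadata:
  crawler-version: 0.3.1
---
DataDir:
  type: Directory
  match: ^data$
"""

# yaml.safe_load_all() yields one dict per YAML document:
crawler_definitions = list(yaml.safe_load_all(TWO_DOC_CFOOD))

if len(crawler_definitions) == 1:
    # Simple case: the whole file is the definition, no metadata.
    metadata = {}
    crawler_definition = crawler_definitions[0]
elif len(crawler_definitions) == 2:
    # Metadata document first, crawler definition second.
    metadata = crawler_definitions[0].get("metadata", {})
    crawler_definition = crawler_definitions[1]
else:
    raise RuntimeError(
        "Crawler definition must not contain more than two documents.")

print(metadata)                  # {'crawler-version': '0.3.1'}
print(list(crawler_definition))  # ['DataDir']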
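
Second, the registry shape documented in load_converters(): each entry maps a
short code to a "package" (importable module) and a "converter" (class name
inside that module), and loading attaches the resolved class object under
"class". This sketch performs the same resolution step on a single
hand-written entry; that a "Directory" entry with these values appears in the
shipped default_converters.yml is an assumption, though the DirectoryConverter
import in scanner.py above makes it a natural example.

import importlib

# One registry entry written by hand instead of read from default_converters.yml:
converter_registry = {
    "Directory": {
        "converter": "DirectoryConverter",    # class name inside the package
        "package": "caoscrawler.converters",  # module that must be importable
    },
}

# Same resolution loop as in the patch: import the module, attach the class.
for key, value in converter_registry.items():
    module = importlib.import_module(value["package"])
    value["class"] = getattr(module, value["converter"])

print(converter_registry["Directory"]["class"])  # the DirectoryConverter class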