diff --git a/src/caoscrawler/crawl.py b/src/caoscrawler/crawl.py index 00363f9700914adaaa0b2b1c40074db2887983e9..b3537a9e88507f7c438ef0a72ee646f322d6c9a7 100644 --- a/src/caoscrawler/crawl.py +++ b/src/caoscrawler/crawl.py @@ -66,6 +66,8 @@ from .stores import GeneralStore, RecordStore from .structure_elements import StructureElement, Directory, NoneElement from .version import check_cfood_version +from .scanner import scan_directory + logger = logging.getLogger(__name__) SPECIAL_PROPERTIES_STRICT = ("description", "name", "id", "path") @@ -174,27 +176,13 @@ class Crawler(object): """ def __init__(self, - generalStore: Optional[GeneralStore] = None, - debug: bool = False, - identifiableAdapter: IdentifiableAdapter = None, - securityMode: SecurityMode = SecurityMode.UPDATE - ): + identifiableAdapter: Optional[IdentifiableAdapter] = None, + securityMode: SecurityMode = SecurityMode.UPDATE): """ Create a new crawler and initialize an empty RecordStore and GeneralStore. Parameters ---------- - recordStore : GeneralStore - An initial GeneralStore which might store e.g. environment variables. - debug : bool - Create a debugging information tree when set to True. - The debugging information tree is a variable stored in - self.debug_tree. It is a dictionary mapping directory entries - to a tuple of general stores and record stores which are valid for - the directory scope. - Furthermore, it is stored in a second tree named self.debug_copied whether the - objects in debug_tree had been copied from a higher level in the hierarchy - of the structureelements. identifiableAdapter : IdentifiableAdapter TODO describe securityMode : int @@ -207,279 +195,34 @@ class Crawler(object): # different caches. self.remote_existing_cache = IdentifiedCache() self.remote_missing_cache = IdentifiedCache() - self.recordStore = RecordStore() self.securityMode = securityMode - self.generalStore = generalStore - if generalStore is None: - self.generalStore = GeneralStore() - self.identifiableAdapter: IdentifiableAdapter = LocalStorageIdentifiableAdapter() if identifiableAdapter is not None: self.identifiableAdapter = identifiableAdapter - # If a directory is crawled this may hold the path to that directory - self.crawled_directory: Optional[str] = None - self.debug = debug - if self.debug: - # order in the tuple: - # 0: generalStore - # 1: recordStore - self.debug_tree: dict[str, tuple] = dict() - self.debug_metadata: dict[str, dict] = dict() - self.debug_metadata["copied"] = dict() - self.debug_metadata["provenance"] = defaultdict(lambda: dict()) - self.debug_metadata["usage"] = defaultdict(lambda: set()) - - def load_definition(self, crawler_definition_path: str): - """ - Load a cfood from a crawler definition defined by - crawler definition path and validate it using cfood-schema.yml. - """ - - # Load the cfood from a yaml file: - with open(crawler_definition_path, "r") as f: - crawler_definitions = list(yaml.safe_load_all(f)) - - crawler_definition = self._load_definition_from_yaml_dict( - crawler_definitions) - - return self._resolve_validator_paths(crawler_definition, crawler_definition_path) - - def _load_definition_from_yaml_dict(self, crawler_definitions: list[dict]): - """Load crawler definitions from a list of (yaml) dicts `crawler_definitions` which - contains either one or two documents. - - Doesn't resolve the validator paths in the cfood definition, so for - internal and testing use only. 
- - """ - if len(crawler_definitions) == 1: - # Simple case, just one document: - crawler_definition = crawler_definitions[0] - metadata = {} - elif len(crawler_definitions) == 2: - metadata = crawler_definitions[0]["metadata"] if "metadata" in crawler_definitions[0] else { - } - crawler_definition = crawler_definitions[1] - else: - raise RuntimeError( - "Crawler definition must not contain more than two documents.") - - check_cfood_version(metadata) - - # TODO: at this point this function can already load the cfood schema extensions - # from the crawler definition and add them to the yaml schema that will be - # tested in the next lines of code: - - # Load the cfood schema: - with open(str(files('caoscrawler').joinpath('cfood-schema.yml')), "r") as f: - schema = yaml.safe_load(f) - - # Add custom converters to converter enum in schema: - if "Converters" in crawler_definition: - for key in crawler_definition["Converters"]: - schema["cfood"]["$defs"]["converter"]["properties"]["type"]["enum"].append( - key) - if len(crawler_definitions) == 2: - if "Converters" in metadata: - for key in metadata["Converters"]: - schema["cfood"]["$defs"]["converter"]["properties"]["type"]["enum"].append( - key) - - # Validate the cfood schema: - validate(instance=crawler_definition, schema=schema["cfood"]) - - return crawler_definition - - def _resolve_validator_paths(self, definition: dict, definition_path: str): - """Resolve path to validation files with respect to the file in which - the crawler was defined. - - """ - - for key, value in definition.items(): - - if key == "validate" and isinstance(value, str): - # Validator is given by a path - if not value.startswith('/'): - # Not an absolute path - definition[key] = os.path.join( - os.path.dirname(definition_path), value) - if not os.path.isfile(definition[key]): - # TODO(henrik) capture this in `crawler_main` similar to - # `ConverterValidationError`. - raise FileNotFoundError( - f"Couldn't find validation file {definition[key]}") - elif isinstance(value, dict): - # Recursively resolve all validators - definition[key] = self._resolve_validator_paths( - value, definition_path) - - return definition - - def load_converters(self, definition: dict): - """ - Currently the converter registry is a dictionary containing for each converter: - - key is the short code, abbreviation for the converter class name - - module is the name of the module to be imported which must be installed - - class is the converter class to load and associate with this converter entry - - all other info for the converter needs to be included in the converter plugin - directory: - schema.yml file - README.md documentation - TODO: this function does not make use of self, so it could become static. - """ - - # Defaults for the converter registry: - with open(str(files('caoscrawler').joinpath('default_converters.yml')), "r") as f: - converter_registry: dict[str, dict[str, str]] = yaml.safe_load(f) - - # More converters from definition file: - if "Converters" in definition: - for key, entry in definition["Converters"].items(): - if key in ["Dict", "DictTextElement", "DictIntegerElement", "DictBooleanElement", - "DictDictElement", "DictListElement", "DictFloatElement"]: - warnings.warn(DeprecationWarning(f"{key} is deprecated. 
Please use the new" - " variant; without 'Dict' prefix or " - "'DictElement' in case of 'Dict'")) - - converter_registry[key] = { - "converter": entry["converter"], - "package": entry["package"] - } - - # Load modules and associate classes: - for key, value in converter_registry.items(): - module = importlib.import_module(value["package"]) - value["class"] = getattr(module, value["converter"]) - return converter_registry - - def crawl_directory(self, dirname: str, crawler_definition_path: str, + def crawl_directory(self, + crawled_directory: str, + crawler_definition_path: str, restricted_path: Optional[list[str]] = None): - """ Crawl a single directory. - - Convenience function that starts the crawler (calls start_crawling) - with a single directory as the StructureElement. - - restricted_path: optional, list of strings - Traverse the data tree only along the given path. When the end of the given path - is reached, traverse the full tree as normal. """ - - crawler_definition = self.load_definition(crawler_definition_path) - # Load and register converter packages: - converter_registry = self.load_converters(crawler_definition) - - if not dirname: - raise ValueError( - "You have to provide a non-empty path for crawling.") - dir_structure_name = os.path.basename(dirname) - self.crawled_directory = dirname - if not dir_structure_name and dirname.endswith('/'): - if dirname == '/': - # Crawling the entire file system - dir_structure_name = "root" - else: - # dirname had a trailing '/' - dir_structure_name = os.path.basename(dirname[:-1]) - - self.start_crawling(Directory(dir_structure_name, - dirname), - crawler_definition, - converter_registry, - restricted_path=restricted_path - ) - - @staticmethod - def initialize_converters(crawler_definition: dict, converter_registry: dict): - """ - takes the cfood as dict (`crawler_definition`) and creates the converter objects that - are defined on the highest level. Child Converters will in turn be created during the - initialization of the Converters. - """ - converters = [] - - for key, value in crawler_definition.items(): - # Definitions and Converters are reserved keywords - # on the top level of the yaml file. - # TODO: there should also be a top level keyword for the actual - # CFood to avoid confusion between top level keywords - # and the CFood. - if key == "Definitions": - continue - elif key == "Converters": - continue - converters.append(Converter.converter_factory( - value, key, converter_registry)) - - return converters - - def start_crawling(self, items: Union[list[StructureElement], StructureElement], - crawler_definition: dict, - converter_registry: dict, - restricted_path: Optional[list[str]] = None): - """ - Start point of the crawler recursion. - - Parameters - ---------- - items: list - A list of structure elements (or a single StructureElement) that is used for - generating the initial items for the crawler. This could e.g. be a Directory. - crawler_definition : dict - A dictionary representing the crawler definition, possibly from a yaml - file. - restricted_path: optional, list of strings - Traverse the data tree only along the given path. When the end of the given path - is reached, traverse the full tree as normal. - - Returns - ------- - crawled_data : list - the final list with the target state of Records. + The new main function to run the crawler on a directory. """ - # This function builds the tree of converters out of the crawler definition. 
- - if self.generalStore is None: - raise RuntimeError("Should not happen.") - - if not isinstance(items, list): - items = [items] - + self.crawled_directory = crawled_directory self.run_id = uuid.uuid1() - local_converters = Crawler.initialize_converters(crawler_definition, converter_registry) - - # This recursive crawling procedure generates the update list: - self.crawled_data: list[db.Record] = [] - self._crawl( - items=items, - local_converters=local_converters, - generalStore=self.generalStore, - recordStore=self.recordStore, - structure_elements_path=[], - converters_path=[], - restricted_path=restricted_path) - if self.debug: - self.debug_converters = local_converters - - return self.crawled_data - - def synchronize(self, commit_changes: bool = True, unique_names=True): - """ - Carry out the actual synchronization. - """ - # After the crawling, the actual synchronization with the database, based on the - # update list is carried out: + # TODO: This is not ideal yet, the data is just returned and needs to be + # separately supplied to the synchronize function. - return self._synchronize(self.crawled_data, commit_changes, unique_names=unique_names) + return scan_directory(crawled_directory, + crawler_definition_path, + restricted_path) def _has_reference_value_without_id(self, ident: Identifiable) -> bool: """ - Returns True if there is at least one value in the properties attribute of ``ident`` which: + Returns True if there is at least one value in the properties + attribute of ``ident`` which: a) is a reference property AND b) where the value is set to a @@ -947,7 +690,8 @@ class Crawler(object): return db.Entity(id=id).retrieve() @staticmethod - def execute_inserts_in_list(to_be_inserted, securityMode, run_id: uuid.UUID = None, + def execute_inserts_in_list(to_be_inserted, securityMode, + run_id: Optional[uuid.UUID] = None, unique_names=True): for record in to_be_inserted: for prop in record.properties: @@ -975,7 +719,8 @@ class Crawler(object): _resolve_datatype(prop, entity) @staticmethod - def execute_updates_in_list(to_be_updated, securityMode, run_id: uuid.UUID = None, + def execute_updates_in_list(to_be_updated, securityMode, + run_id: Optional[uuid.UUID] = None, unique_names=True): Crawler.set_ids_and_datatype_of_parents_and_properties(to_be_updated) logger.debug("UPDATE") @@ -987,7 +732,9 @@ class Crawler(object): update_cache = UpdateCache() update_cache.insert(to_be_updated, run_id) - def _synchronize(self, crawled_data: list[db.Record], commit_changes: bool = True, + def synchronize(self, + crawled_data: list[db.Record], + commit_changes: bool = True, unique_names=True): """ This function applies several stages: @@ -1068,163 +815,11 @@ ____________________\n""".format(i + 1, len(pending_changes)) + str(el[3])) + " by invoking the crawler" " with the run id: {rid}\n".format(rid=run_id)) - @staticmethod - def debug_build_usage_tree(converter: Converter): - res: dict[str, dict[str, Any]] = { - converter.name: { - "usage": ", ".join(converter.metadata["usage"]), - "subtree": {} - } - } - - for subconv in converter.converters: - d = Crawler.debug_build_usage_tree(subconv) - k = list(d.keys()) - if len(k) != 1: - raise RuntimeError( - "Unkonwn error during building of usage tree.") - res[converter.name]["subtree"][k[0]] = d[k[0]] - return res - - def save_debug_data(self, filename: str): - paths: dict[str, Union[dict, list]] = dict() - - def flatten_debug_info(key): - mod_info = self.debug_metadata[key] - paths[key] = dict() - for record_name in mod_info: - if key == 
"provenance": - paths[key][record_name] = dict() - for prop_name in mod_info[record_name]: - paths[key][record_name][prop_name] = { - "structure_elements_path": "/".join( - mod_info[record_name][prop_name][0]), - "converters_path": "/".join( - mod_info[record_name][prop_name][1])} - elif key == "usage": - paths[key][record_name] = ", ".join(mod_info[record_name]) - for key in ("provenance", "usage"): - flatten_debug_info(key) - - paths["converters_usage"] = [self.debug_build_usage_tree( - cv) for cv in self.debug_converters] - - with open(filename, "w") as f: - f.write(yaml.dump(paths, sort_keys=False)) - - def _crawl(self, - items: list[StructureElement], - local_converters: list[Converter], - generalStore: GeneralStore, - recordStore: RecordStore, - structure_elements_path: list[str], - converters_path: list[str], - restricted_path: Optional[list[str]] = None): - """ - Crawl a list of StructureElements and apply any matching converters. - - items: structure_elements (e.g. files and folders on one level on the hierarchy) - local_converters: locally defined converters for - treating structure elements. A locally defined converter could be - one that is only valid for a specific subtree of the originally - cralwed StructureElement structure. - generalStore and recordStore: This recursion of the crawl function should only operate on - copies of the global stores of the Crawler object. - restricted_path: optional, list of strings, traverse the data tree only along the given - path. For example, when a directory contains files a, b and c and b is - given in restricted_path, a and c will be ignroed by the crawler. - When the end of the given path is reached, traverse the full tree as - normal. The first element of the list provided by restricted_path should - be the name of the StructureElement at this level, i.e. denoting the - respective element in the items argument. 
- """ - # This path_found variable stores wether the path given by restricted_path was found in the - # data tree - path_found = False - if restricted_path is not None and len(restricted_path) == 0: - restricted_path = None - - for element in items: - for converter in local_converters: - - # type is something like "matches files", replace isinstance with "type_matches" - # match function tests regexp for example - if (converter.typecheck(element) and ( - restricted_path is None or element.name == restricted_path[0]) - and converter.match(element) is not None): - path_found = True - generalStore_copy = generalStore.create_scoped_copy() - recordStore_copy = recordStore.create_scoped_copy() - - # Create an entry for this matched structure element: - generalStore_copy[converter.name] = ( - os.path.join(*(structure_elements_path + [element.get_name()]))) - - # extracts values from structure element and stores them in the - # variable store - converter.create_values(generalStore_copy, element) - - keys_modified = converter.create_records( - generalStore_copy, recordStore_copy, element) - - children = converter.create_children(generalStore_copy, element) - - if self.debug: - # add provenance information for each variable - self.debug_tree[str(element)] = ( - generalStore_copy.get_storage(), recordStore_copy.get_storage()) - self.debug_metadata["copied"][str(element)] = ( - generalStore_copy.get_dict_copied(), - recordStore_copy.get_dict_copied()) - self.debug_metadata["usage"][str(element)].add( - "/".join(converters_path + [converter.name])) - mod_info = self.debug_metadata["provenance"] - for record_name, prop_name in keys_modified: - # TODO: check - internal_id = recordStore_copy.get_internal_id( - record_name) - record_identifier = record_name + \ - "_" + str(internal_id) - converter.metadata["usage"].add(record_identifier) - mod_info[record_identifier][prop_name] = ( - structure_elements_path + [element.get_name()], - converters_path + [converter.name]) - - self._crawl(children, converter.converters, - generalStore_copy, recordStore_copy, - structure_elements_path + [element.get_name()], - converters_path + [converter.name], - restricted_path[1:] if restricted_path is not None else None) - - if restricted_path and not path_found: - raise RuntimeError("A 'restricted_path' argument was given that is not contained in " - "the data tree") - # if the crawler is running out of scope, copy all records in - # the recordStore, that were created in this scope - # to the general update container. - scoped_records = recordStore.get_records_current_scope() - for record in scoped_records: - self.crawled_data.append(record) - - # TODO: the scoped variables should be cleaned up as soon if the variables - # are no longer in the current scope. This can be implemented as follows, - # but this breaks the test "test_record_structure_generation", because - # some debug info is also deleted. This implementation can be used as soon - # as the remaining problems with the debug_tree are fixed. 
- # Delete the variables that are no longer needed: - # scoped_names = recordStore.get_names_current_scope() - # for name in scoped_names: - # del recordStore[name] - # del generalStore[name] - - return self.crawled_data - def crawler_main(crawled_directory_path: str, cfood_file_name: str, - identifiables_definition_file: str = None, - debug: bool = False, - provenance_file: str = None, + identifiables_definition_file: Optional[str] = None, + provenance_file: Optional[str] = None, dry_run: bool = False, prefix: str = "", securityMode: SecurityMode = SecurityMode.UPDATE, @@ -1262,14 +857,17 @@ def crawler_main(crawled_directory_path: str, return_value : int 0 if successful """ - crawler = Crawler(debug=debug, securityMode=securityMode) + crawler = Crawler(securityMode=securityMode) try: - crawler.crawl_directory(crawled_directory_path, cfood_file_name, restricted_path) + crawled_data, debug_tree = crawler.crawl_directory(crawled_directory_path, + cfood_file_name, + restricted_path) except ConverterValidationError as err: print(err) return 1 - if provenance_file is not None and debug: - crawler.save_debug_data(provenance_file) + if provenance_file is not None: + with open(provenance_file, "w") as f: + yaml.dump(debug_tree) if identifiables_definition_file is not None: @@ -1278,7 +876,8 @@ def crawler_main(crawled_directory_path: str, crawler.identifiableAdapter = ident if dry_run: - ins, upd = crawler.synchronize(commit_changes=False) + ins, upd = crawler.synchronize(crawled_data, + commit_changes=False) inserts = [str(i) for i in ins] updates = [str(i) for i in upd] with open("dry.yml", "w") as f: @@ -1287,7 +886,7 @@ def crawler_main(crawled_directory_path: str, "update": updates})) else: rtsfinder = dict() - for elem in crawler.crawled_data: + for elem in crawled_data: if isinstance(elem, db.File): # correct the file path: # elem.file = os.path.join(args.path, elem.file) @@ -1320,7 +919,9 @@ def crawler_main(crawled_directory_path: str, raise RuntimeError("Missing RecordTypes: {}". format(", ".join(notfound))) - crawler.synchronize(commit_changes=True, unique_names=unique_names) + crawler.synchronize(crawled_data, + commit_changes=True, + unique_names=unique_names) return 0 @@ -1342,7 +943,7 @@ def parse_args(): help="Path name of the provenance yaml file. " "This file will only be generated if this option is set.") parser.add_argument("--debug", required=False, action="store_true", - help="Path name of the cfood yaml file to be used.") + help="Generate debug output.") parser.add_argument("crawled_directory_path", help="The subtree of files below the given path will " "be considered. 
Use '/' for everything.") @@ -1357,7 +958,7 @@ def parse_args(): help="Create two files dry.yml to show" "what would actually be committed without doing the synchronization.") - # TODO: load identifiables is a dirty implementation currently + # TODO: load identifiables currently is a very simple implementation parser.add_argument("-i", "--load-identifiables", help="Load identifiables from the given yaml file.") parser.add_argument("-u", "--unique-names", @@ -1403,7 +1004,6 @@ def main(): crawled_directory_path=args.crawled_directory_path, cfood_file_name=args.cfood_file_name, identifiables_definition_file=args.load_identifiables, - debug=args.debug, provenance_file=args.provenance, dry_run=args.dry_run, prefix=args.prefix, diff --git a/src/caoscrawler/debug/__init__.py b/src/caoscrawler/debug/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/src/caoscrawler/debug/debug_tree.py b/src/caoscrawler/debug/debug_tree.py new file mode 100644 index 0000000000000000000000000000000000000000..825fac2cebafaad9992bbf465b0cd1a899262e2a --- /dev/null +++ b/src/caoscrawler/debug/debug_tree.py @@ -0,0 +1,79 @@ +#!/usr/bin/env python3 +# encoding: utf-8 +# +# ** header v3.0 +# This file is a part of the CaosDB Project. +# +# Copyright (C) 2023 Alexander Schlemmer +# +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <https://www.gnu.org/licenses/>. +# +# ** end header +# + +""" +A tree structure that is used during the scanning stage of the crawler +to store information used for debugging. 
+""" + +from __future__ import annotations + +import argparse +import importlib +import logging +import os +import sys +import uuid +import warnings +import yaml + +from argparse import RawTextHelpFormatter +from collections import defaultdict +from copy import deepcopy +from enum import Enum +from importlib_resources import files +from jsonschema import validate +from typing import Any, Optional, Type, Union + +from ..structure_elements import StructureElement +from ..converters import Converter + + +from dataclasses import dataclass + + + +@dataclass +class DebugTreeStructureElement: + path_segment: str # a name + element: StructureElement + matching_converters: list[DebugTreeConverter] + nonmatching_converters: list[DebugTreeConverter] + +@dataclass +class DebugTreeVariable: + key: str + value: Any + copied: bool + internal_id: int + + +@dataclass +class DebugTreeConverter: + path_segment: str # a name + converter: Converter + current_variables: list[DebugTreeVariable] + current_records: list[DebugTreeVariable] # Here, value is always a CaosDB Entity + children: list[DebugTreeStructureElement] diff --git a/src/caoscrawler/scanner.py b/src/caoscrawler/scanner.py new file mode 100644 index 0000000000000000000000000000000000000000..bc2924dc41f6835c07b8a21914dfeb86919cdc96 --- /dev/null +++ b/src/caoscrawler/scanner.py @@ -0,0 +1,445 @@ +#!/usr/bin/env python3 +# encoding: utf-8 +# +# ** header v3.0 +# This file is a part of the CaosDB Project. +# +# Copyright (C) 2023 Alexander Schlemmer <alexander.schlemmer@ds.mpg.de> +# +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <https://www.gnu.org/licenses/>. +# +# ** end header +# + +""" +This is the scanner, the original "_crawl" function from crawl.py. +This is just the functionality, that extracts data from the file system. 
+""" + +from __future__ import annotations + +import argparse +import importlib +import logging +import os +import sys +import warnings +import yaml + +from argparse import RawTextHelpFormatter +from collections import defaultdict +from copy import deepcopy +from enum import Enum +from importlib_resources import files +from jsonschema import validate +from typing import Any, Optional, Type, Union + +import caosdb as db + +from caosadvancedtools.cache import UpdateCache, Cache +from caosadvancedtools.crawler import Crawler as OldCrawler +from caosdb.apiutils import (compare_entities, EntityMergeConflictError, + merge_entities) +from caosdb.common.datatype import is_reference + +from .converters import Converter, DirectoryConverter, ConverterValidationError + +from .macros import defmacro_constructor, macro_constructor +from .stores import Store, GeneralStore, RecordStore +from .structure_elements import StructureElement, Directory, NoneElement +from .version import check_cfood_version + +from .debug.debug_tree import (DebugTreeStructureElement, + DebugTreeConverter, + DebugTreeVariable) + +logger = logging.getLogger(__name__) + + +def scanner(items: list[StructureElement], + converters: list[Converter], + general_store: Optional[GeneralStore] = None, + record_store: Optional[RecordStore] = None, + structure_elements_path: Optional[list[str]] = None, + restricted_path: Optional[list[str]] = None, + crawled_data: Optional[list[db.Record]] = None): + """ + Crawl a list of StructureElements and apply any matching converters. + + items: structure_elements (e.g. files and folders on one level on the hierarchy) + local_converters: locally defined converters for + treating structure elements. A locally defined converter could be + one that is only valid for a specific subtree of the originally + cralwed StructureElement structure. + general_store and record_store: This recursion of the crawl function should only operate on + copies of the global stores of the Crawler object. + restricted_path: optional, list of strings, traverse the data tree only along the given + path. For example, when a directory contains files a, b and c and b is + given in restricted_path, a and c will be ignroed by the crawler. + When the end of the given path is reached, traverse the full tree as + normal. The first element of the list provided by restricted_path should + be the name of the StructureElement at this level, i.e. denoting the + respective element in the items argument. 
+ """ + + # The path_found variable stores wether the path given by restricted_path was + # found in the data tree + path_found = False + if restricted_path is not None and len(restricted_path) == 0: + restricted_path = None + + # This list stores the debug tree which is returned at the end: + tree_elements: list[DebugTreeStructureElement] = list() + + if crawled_data is None: + crawled_data = [] + + if general_store is None: + general_store = GeneralStore() + + if record_store is None: + record_store = RecordStore() + + if structure_elements_path is None: + structure_elements_path = [] + + for element in items: + # Create a tree element for the current structure element: + tree_SE = DebugTreeStructureElement( + element.name, element, [], []) + tree_elements.append(tree_SE) + + for converter in converters: + general_store_copy = general_store.create_scoped_copy() + record_store_copy = record_store.create_scoped_copy() + + # Create a tree element for this converter and add it to the tree_SE later: + tree_C = DebugTreeConverter(converter.name, converter, [], [], []) + + + # type is something like "matches files", replace isinstance with "type_matches" + # match function tests regexp for example + if (converter.typecheck(element) and ( + restricted_path is None or element.name == restricted_path[0]) + and converter.match(element) is not None): + path_found = True + tree_SE.matching_converters.append(tree_C) + + # Create an entry for this matched structure element: + general_store_copy[converter.name] = ( + os.path.join(*(structure_elements_path + [element.get_name()]))) + + # extracts values from structure element and stores them in the + # variable store + converter.create_values(general_store_copy, element) + + keys_modified = converter.create_records( + general_store_copy, record_store_copy, element) + + children = converter.create_children(general_store_copy, element) + + tree_C.current_variables = store_to_tree(general_store_copy) + tree_C.current_records = store_to_tree(record_store_copy) + + _, tree_elements_children = scanner(children, converter.converters, + general_store_copy, record_store_copy, + structure_elements_path + [element.get_name()], + restricted_path[1:] if restricted_path is not None else None, + crawled_data) + tree_C.children.extend(tree_elements_children) + else: + tree_SE.nonmatching_converters.append(tree_C) + + if restricted_path and not path_found: + raise RuntimeError("A 'restricted_path' argument was given that is not contained in " + "the data tree") + # if the crawler is running out of scope, copy all records in + # the record_store, that were created in this scope + # to the general update container. + scoped_records = record_store.get_records_current_scope() + for record in scoped_records: + crawled_data.append(record) + + # TODO: the scoped variables should be cleaned up as soon if the variables + # are no longer in the current scope. This can be implemented as follows, + # but this breaks the test "test_record_structure_generation", because + # some debug info is also deleted. This implementation can be used as soon + # as the remaining problems with the debug_tree are fixed. 
+ # Delete the variables that are no longer needed: + # scoped_names = record_store.get_names_current_scope() + # for name in scoped_names: + # del record_store[name] + # del general_store[name] + + return crawled_data, tree_elements + + +def load_definition(crawler_definition_path: str): + """ + Load a cfood from a crawler definition defined by + crawler definition path and validate it using cfood-schema.yml. + """ + + # Load the cfood from a yaml file: + with open(crawler_definition_path, "r") as f: + crawler_definitions = list(yaml.safe_load_all(f)) + + crawler_definition = _load_definition_from_yaml_dict(crawler_definitions) + return _resolve_validator_paths(crawler_definition, crawler_definition_path) + +def _load_definition_from_yaml_dict(crawler_definitions: list[dict]): + """ + Load crawler definitions from a list of (yaml) dicts `crawler_definitions` which + contains either one or two documents. + + Doesn't resolve the validator paths in the cfood definition, so for + internal and testing use only. + """ + if len(crawler_definitions) == 1: + # Simple case, just one document: + crawler_definition = crawler_definitions[0] + metadata = {} + elif len(crawler_definitions) == 2: + metadata = (crawler_definitions[0]["metadata"] if "metadata" + in crawler_definitions[0] else {}) + crawler_definition = crawler_definitions[1] + else: + raise RuntimeError( + "Crawler definition must not contain more than two documents.") + + check_cfood_version(metadata) + + # TODO: at this point this function can already load the cfood schema extensions + # from the crawler definition and add them to the yaml schema that will be + # tested in the next lines of code: + + # Load the cfood schema: + with open(str(files('caoscrawler').joinpath('cfood-schema.yml')), "r") as f: + schema = yaml.safe_load(f) + + # Add custom converters to converter enum in schema: + if "Converters" in crawler_definition: + for key in crawler_definition["Converters"]: + schema["cfood"]["$defs"]["converter"]["properties"]["type"]["enum"].append( + key) + if len(crawler_definitions) == 2: + if "Converters" in metadata: + for key in metadata["Converters"]: + schema["cfood"]["$defs"]["converter"]["properties"]["type"]["enum"].append( + key) + + # Validate the cfood schema: + validate(instance=crawler_definition, schema=schema["cfood"]) + + return crawler_definition + +def _resolve_validator_paths(definition: dict, definition_path: str): + """ + Resolve path to validation files with respect to the file in which + the crawler was defined. + """ + + for key, value in definition.items(): + + if key == "validate" and isinstance(value, str): + # Validator is given by a path + if not value.startswith('/'): + # Not an absolute path + definition[key] = os.path.join( + os.path.dirname(definition_path), value) + if not os.path.isfile(definition[key]): + # TODO(henrik) capture this in `crawler_main` similar to + # `ConverterValidationError`. 
+ raise FileNotFoundError( + f"Couldn't find validation file {definition[key]}") + elif isinstance(value, dict): + # Recursively resolve all validators + definition[key] = _resolve_validator_paths( + value, definition_path) + + return definition + + +def create_converter_registry(definition: dict): + """ + Currently the converter registry is a dictionary containing for each converter: + - key is the short code, abbreviation for the converter class name + - module is the name of the module to be imported which must be installed + - class is the converter class to load and associate with this converter entry + + all other info for the converter needs to be included in the converter plugin + directory: + schema.yml file + README.md documentation + + Returns + ------- + The converter registry which is a dictonary containing + dictionaries defining the individual converters. + """ + + # Defaults for the converter registry: + with open(str(files('caoscrawler').joinpath('default_converters.yml')), "r") as f: + converter_registry: dict[str, dict[str, str]] = yaml.safe_load(f) + + # More converters from definition file: + if "Converters" in definition: + for key, entry in definition["Converters"].items(): + if key in ["Dict", "DictTextElement", "DictIntegerElement", "DictBooleanElement", + "DictDictElement", "DictListElement", "DictFloatElement"]: + warnings.warn(DeprecationWarning(f"{key} is deprecated. Please use the new" + " variant; without 'Dict' prefix or " + "'DictElement' in case of 'Dict'")) + + converter_registry[key] = { + "converter": entry["converter"], + "package": entry["package"] + } + + # Load modules and associate classes: + for key, value in converter_registry.items(): + module = importlib.import_module(value["package"]) + value["class"] = getattr(module, value["converter"]) + return converter_registry + +def initialize_converters(crawler_definition: dict, converter_registry: dict): + """ + takes the cfood as dict (`crawler_definition`) and creates the converter objects that + are defined on the highest level. Child Converters will in turn be created during the + initialization of the Converters. + + Returns + ------- + A list of converters. + """ + converters = [] + + for key, value in crawler_definition.items(): + # Definitions and Converters are reserved keywords + # on the top level of the yaml file. + # TODO: there should also be a top level keyword for the actual + # CFood to avoid confusion between top level keywords + # and the CFood. + if key == "Definitions": + continue + elif key == "Converters": + continue + converters.append(Converter.converter_factory( + value, key, converter_registry)) + + return converters + +def scan_structure_elements( + items: Union[list[StructureElement], StructureElement], + crawler_definition: dict, + converter_registry: dict, + restricted_path: Optional[list[str]] = None): + """ + Start point of the crawler recursion. + + Parameters + ---------- + items: list + A list of structure elements (or a single StructureElement) that is used for + generating the initial items for the crawler. This could e.g. be a Directory. + crawler_definition : dict + A dictionary representing the crawler definition, possibly from a yaml + file. + converter_registry : dict + TODO: documentation missing + restricted_path: optional, list of strings + Traverse the data tree only along the given path. When the end of the given path + is reached, traverse the full tree as normal. 
+ + Returns + ------- + The result of invoking `scanner`: + - A list of resulting objects of type db.Entity + - The debug tree + """ + + # This function builds the tree of converters out of the crawler definition. + if not isinstance(items, list): + items = [items] + + + + return scanner(items, + initialize_converters(crawler_definition, converter_registry), + restricted_path=restricted_path) + + +def scan_directory(dirname: str, + crawler_definition_path: str, + restricted_path: Optional[list[str]] = None): + """ + Scan a single directory. + + Convenience function that starts the crawler (calls start_crawling) + with a single directory as the StructureElement. + + restricted_path: optional, list of strings + Traverse the data tree only along the given path. When the end of the given path + is reached, traverse the full tree as normal. + + Returns + ------- + The result of invoking `scanner`: + - A list of resulting objects of type db.Entity + - The debug tree + """ + + crawler_definition = load_definition(crawler_definition_path) + # Load and register converter packages: + converter_registry = create_converter_registry(crawler_definition) + + if not dirname: + raise ValueError( + "You have to provide a non-empty path for crawling.") + dir_structure_name = os.path.basename(dirname) + + if not dir_structure_name and dirname.endswith('/'): + if dirname == '/': + # Crawling the entire file system + dir_structure_name = "root" + else: + # dirname had a trailing '/' + dir_structure_name = os.path.basename(dirname[:-1]) + + return scan_structure_elements( + Directory(dir_structure_name, dirname), + crawler_definition, + converter_registry, + restricted_path) + + +# ------------------------- +# Utilities for debugging +# ------------------------- + +def store_to_tree(store: Store): + """ + Converts a (Record-/General-)Store to a simple structure + that can be used for debugging. + """ + res: list[DebugTreeVariable] = list() + copied = store.get_dict_copied() + for name, value in store.get_storage().items(): + res.append( + DebugTreeVariable( + name, value, + copied[name], + store.get_internal_id(name))) + return res diff --git a/unittests/debug_tree_test.py b/unittests/debug_tree_test.py new file mode 100644 index 0000000000000000000000000000000000000000..5b7bb9dceedec4b8fd6a4aadb05d37f65844ab37 --- /dev/null +++ b/unittests/debug_tree_test.py @@ -0,0 +1,65 @@ +#!/usr/bin/env python3 +# encoding: utf-8 +# +# This file is a part of the CaosDB Project. +# +# Copyright (C) 2021-2023 Alexander Schlemmer <alexander.schlemmer@ds.mpg.de> +# Copyright (C) 2023 Henrik tom Wörden <h.tomwoerden@indiscale.com> +# Copyright (C) 2023 IndiScale GmbH <info@indiscale.com> +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <https://www.gnu.org/licenses/>. +# + +""" +Tests for the new debug tree feature of the crawler. 
+""" + +from caoscrawler.stores import GeneralStore, RecordStore +import os + +from caoscrawler.structure_elements import (File, Directory, + DictTextElement, DictListElement, DictElement) + +from caoscrawler.converters import SimpleFileConverter + + +from functools import partial +from copy import deepcopy +from unittest.mock import patch + + +from unittest.mock import MagicMock, Mock +from os.path import join, dirname, basename +import yaml +import caosdb as db +from caosdb.apiutils import compare_entities + +import pytest +from pytest import raises + +from caoscrawler.scanner import scanner + + +def test_scanner(): + d = File("2023-02-07_ProjectName.txt", "2023-02-07_ProjectName.txt") + c = SimpleFileConverter({ + "match": "^(?P<date>.*?)_(?P<identifier>.*?)$" + }, "ProjectFile", {}) + + crawled_data, debug_tree = scanner([d], [c]) + print(yaml.dump(debug_tree)) + assert False + +def test_scanner_directories(): + d = Directory("2023-02-07_ProjectName", "/")