diff --git a/src/caoscrawler/crawl.py b/src/caoscrawler/crawl.py
index 8d14dc2a2d617a685a75a33e2ad726b2c4b44666..540dcb51db648e5a900c9a1765a18f31ea44b0f7 100644
--- a/src/caoscrawler/crawl.py
+++ b/src/caoscrawler/crawl.py
@@ -221,16 +221,6 @@ class Crawler(object):
         # If a directory is crawled this may hold the path to that directory
         self.crawled_directory: Optional[str] = None
         self.debug = debug
-        if self.debug:
-            # order in the tuple:
-            # 0: generalStore
-            # 1: recordStore
-            self.debug_tree: dict[str, tuple] = dict()
-            self.debug_metadata: dict[str, dict] = dict()
-            self.debug_metadata["copied"] = dict()
-            self.debug_metadata["provenance"] = defaultdict(lambda: dict())
-            self.debug_metadata["usage"] = defaultdict(lambda: set())
-
 
     def synchronize(self, commit_changes: bool = True, unique_names=True):
diff --git a/src/caoscrawler/debug_tree.py b/src/caoscrawler/debug_tree.py
new file mode 100644
index 0000000000000000000000000000000000000000..58bda2cde2c93915935a87c54120178374b64881
--- /dev/null
+++ b/src/caoscrawler/debug_tree.py
@@ -0,0 +1,78 @@
+#!/usr/bin/env python3
+# encoding: utf-8
+#
+# ** header v3.0
+# This file is a part of the CaosDB Project.
+#
+# Copyright (C) 2023 Alexander Schlemmer <alexander.schlemmer@ds.mpg.de>
+#
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <https://www.gnu.org/licenses/>.
+#
+# ** end header
+#
+
+"""
+A structure containing debug tree information.
+""" + +from __future__ import annotations + +import argparse +import importlib +import logging +import os +import sys +import warnings +import yaml + +from argparse import RawTextHelpFormatter +from collections import defaultdict +from copy import deepcopy +from enum import Enum +from importlib_resources import files +from jsonschema import validate +from typing import Any, Optional, Type, Union + +import caosdb as db + +from caosadvancedtools.cache import UpdateCache, Cache +from caosadvancedtools.crawler import Crawler as OldCrawler +from caosdb.apiutils import (compare_entities, EntityMergeConflictError, + merge_entities) +from caosdb.common.datatype import is_reference + +from .converters import Converter, DirectoryConverter, ConverterValidationError + +from .macros import defmacro_constructor, macro_constructor +from .stores import Store, GeneralStore, RecordStore +from .structure_elements import StructureElement, Directory, NoneElement +from .version import check_cfood_version + +from caosdb.high_level_api import convert_to_python_object + + +class DebugTree(object): + + def __init__(self): + # order in the tuple: + # 0: general_store + # 1: record_store + self.debug_tree: dict[str, tuple] = dict() + self.debug_metadata: dict[str, dict] = dict() + self.debug_metadata["copied"] = dict() + self.debug_metadata["provenance"] = defaultdict(lambda: dict()) + self.debug_metadata["usage"] = defaultdict(lambda: set()) + + # TODO: turn the tuple into two individual elements diff --git a/src/caoscrawler/scanner.py b/src/caoscrawler/scanner.py index 6689579949dccd94208bdf70e5350d6937acacda..dda27c8a797915be2a30769fa4c5f45637ae44d9 100644 --- a/src/caoscrawler/scanner.py +++ b/src/caoscrawler/scanner.py @@ -63,9 +63,7 @@ from .version import check_cfood_version from caosdb.high_level_api import convert_to_python_object -from .debug.debug_tree import (DebugTreeStructureElement, - DebugTreeConverter, - DebugTreeVariable) +from .debug_tree import DebugTree logger = logging.getLogger(__name__) @@ -224,15 +222,15 @@ def initialize_converters(crawler_definition: dict, converter_registry: dict): # Main scanner function: # -------------------------------------------------------------------------------- -def scanner(self, - items: list[StructureElement], +def scanner(items: list[StructureElement], converters: list[Converter], general_store: Optional[GeneralStore] = None, record_store: Optional[RecordStore] = None, structure_elements_path: Optional[list[str]] = None, converters_path: Optional[list[str]] = None, restricted_path: Optional[list[str]] = None, - crawled_data: Optional[list[db.Record]] = None): + crawled_data: Optional[list[db.Record]] = None, + debug_tree: Optional[DebugTree] = None): """ Crawl a list of StructureElements and apply any matching converters. 
diff --git a/src/caoscrawler/scanner.py b/src/caoscrawler/scanner.py
index 6689579949dccd94208bdf70e5350d6937acacda..dda27c8a797915be2a30769fa4c5f45637ae44d9 100644
--- a/src/caoscrawler/scanner.py
+++ b/src/caoscrawler/scanner.py
@@ -63,9 +63,7 @@
 from .version import check_cfood_version
 
 from caosdb.high_level_api import convert_to_python_object
 
-from .debug.debug_tree import (DebugTreeStructureElement,
-                               DebugTreeConverter,
-                               DebugTreeVariable)
+from .debug_tree import DebugTree
 
 logger = logging.getLogger(__name__)
@@ -224,15 +222,15 @@ def initialize_converters(crawler_definition: dict, converter_registry: dict):
 # Main scanner function:
 # --------------------------------------------------------------------------------
 
 
-def scanner(self,
-            items: list[StructureElement],
+def scanner(items: list[StructureElement],
             converters: list[Converter],
             general_store: Optional[GeneralStore] = None,
             record_store: Optional[RecordStore] = None,
             structure_elements_path: Optional[list[str]] = None,
             converters_path: Optional[list[str]] = None,
             restricted_path: Optional[list[str]] = None,
-            crawled_data: Optional[list[db.Record]] = None):
+            crawled_data: Optional[list[db.Record]] = None,
+            debug_tree: Optional[DebugTree] = None):
     """
     Crawl a list of StructureElements and apply any matching converters.
@@ -271,6 +269,15 @@
     if structure_elements_path is None:
         structure_elements_path = []
 
+    if converters_path is None:
+        converters_path = []
+
+    if crawled_data is None:
+        crawled_data = []
+
+    if debug_tree is None:
+        debug_tree = DebugTree()
+
     for element in items:
 
         for converter in converters:
@@ -296,32 +303,32 @@
 
             children = converter.create_children(general_store_copy, element)
 
-            if self.debug:
-                # add provenance information for each variable
-                self.debug_tree[str(element)] = (
-                    general_store_copy.get_storage(), record_store_copy.get_storage())
-                self.debug_metadata["copied"][str(element)] = (
-                    general_store_copy.get_dict_copied(),
-                    record_store_copy.get_dict_copied())
-                self.debug_metadata["usage"][str(element)].add(
-                    "/".join(converters_path + [converter.name]))
-                mod_info = self.debug_metadata["provenance"]
-                for record_name, prop_name in keys_modified:
-                    # TODO: check
-                    internal_id = record_store_copy.get_internal_id(
-                        record_name)
-                    record_identifier = record_name + \
-                        "_" + str(internal_id)
-                    converter.metadata["usage"].add(record_identifier)
-                    mod_info[record_identifier][prop_name] = (
-                        structure_elements_path + [element.get_name()],
-                        converters_path + [converter.name])
-
-            self.scanner(children, converter.converters,
-                         general_store_copy, record_store_copy,
-                         structure_elements_path + [element.get_name()],
-                         converters_path + [converter.name],
-                         restricted_path[1:] if restricted_path is not None else None)
+            # add provenance information for each variable
+            debug_tree.debug_tree[str(element)] = (
+                general_store_copy.get_storage(), record_store_copy.get_storage())
+            debug_tree.debug_metadata["copied"][str(element)] = (
+                general_store_copy.get_dict_copied(),
+                record_store_copy.get_dict_copied())
+            debug_tree.debug_metadata["usage"][str(element)].add(
+                "/".join(converters_path + [converter.name]))
+            mod_info = debug_tree.debug_metadata["provenance"]
+            for record_name, prop_name in keys_modified:
+                # TODO: check
+                internal_id = record_store_copy.get_internal_id(
+                    record_name)
+                record_identifier = record_name + \
+                    "_" + str(internal_id)
+                converter.metadata["usage"].add(record_identifier)
+                mod_info[record_identifier][prop_name] = (
+                    structure_elements_path + [element.get_name()],
+                    converters_path + [converter.name])
+
+            scanner(children, converter.converters,
+                    general_store_copy, record_store_copy,
+                    structure_elements_path + [element.get_name()],
+                    converters_path + [converter.name],
+                    restricted_path[1:] if restricted_path is not None else None,
+                    crawled_data, debug_tree)
 
         if restricted_path and not path_found:
             raise RuntimeError("A 'restricted_path' argument was given that is not contained in "
@@ -331,7 +338,7 @@
     # to the general update container.
     scoped_records = record_store.get_records_current_scope()
     for record in scoped_records:
-        self.crawled_data.append(record)
+        crawled_data.append(record)
 
     # TODO: the scoped variables should be cleaned up as soon if the variables
     #       are no longer in the current scope. This can be implemented as follows,
@@ -344,7 +351,7 @@
 
     # del record_store[name]
     # del general_store[name]
 
-    return self.crawled_data
+    return crawled_data, debug_tree
 
 
@@ -376,6 +383,8 @@ def scan_directory(dirname: str, crawler_definition_path: str,
         raise ValueError(
             "You have to provide a non-empty path for crawling.")
     dir_structure_name = os.path.basename(dirname)
+
+    # TODO: needs to be covered somewhere else
     crawled_directory = dirname
     if not dir_structure_name and dirname.endswith('/'):
         if dirname == '/':
@@ -423,7 +432,8 @@ def scan_structure_elements(items: Union[list[StructureElement], StructureElemen
     if not isinstance(items, list):
         items = [items]
 
-    self.run_id = uuid.uuid1()
+    # TODO: needs to be covered somewhere else
+    # self.run_id = uuid.uuid1()
     converters = initialize_converters(crawler_definition, converter_registry)
 
     return scanner(
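
Taken together, a caller would now drive the scanner roughly as follows. This is a sketch under stated assumptions, not part of the patch: the directory path is hypothetical, the empty converter list stands in for a real initialize_converters(...) result, and it relies on scanner() initializing the optional stores itself, as the unchanged parts of the function suggest.

    from caoscrawler.debug_tree import DebugTree
    from caoscrawler.scanner import scanner
    from caoscrawler.structure_elements import Directory

    # Hypothetical root element; converters would normally come from
    # initialize_converters(crawler_definition, converter_registry).
    root = Directory("data", "/path/to/data")
    converters = []

    # Debug information now travels through an explicit DebugTree object
    # instead of living as state on the Crawler instance.
    debug_tree = DebugTree()
    crawled_data, debug_tree = scanner([root], converters, debug_tree=debug_tree)

    # scanner() returns the collected records plus the debug tree, so the
    # provenance data survives the call without any crawler-side mutation.
    print(len(crawled_data), "records,", len(debug_tree.debug_tree), "debug entries")
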