Skip to content
Snippets Groups Projects
Commit 50a18727 authored by Alexander Schlemmer's avatar Alexander Schlemmer
Browse files

MAINT: moved debug tree from crawl.py to scanner.py and created a new class in module debug_tree

parent 47ea54d8
Branches
Tags
2 merge requests!108Release 0.5.0,!104Create a new scanner module and move functions from crawl module there
Pipeline #34410 failed
...@@ -221,16 +221,6 @@ class Crawler(object): ...@@ -221,16 +221,6 @@ class Crawler(object):
# If a directory is crawled this may hold the path to that directory # If a directory is crawled this may hold the path to that directory
self.crawled_directory: Optional[str] = None self.crawled_directory: Optional[str] = None
self.debug = debug self.debug = debug
if self.debug:
# order in the tuple:
# 0: generalStore
# 1: recordStore
self.debug_tree: dict[str, tuple] = dict()
self.debug_metadata: dict[str, dict] = dict()
self.debug_metadata["copied"] = dict()
self.debug_metadata["provenance"] = defaultdict(lambda: dict())
self.debug_metadata["usage"] = defaultdict(lambda: set())
def synchronize(self, commit_changes: bool = True, unique_names=True): def synchronize(self, commit_changes: bool = True, unique_names=True):
......
#!/usr/bin/env python3
# encoding: utf-8
#
# ** header v3.0
# This file is a part of the CaosDB Project.
#
# Copyright (C) 2023 Alexander Schlemmer <alexander.schlemmer@ds.mpg.de>
#
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
#
# ** end header
#
"""
A structure containing debug tree information.
"""
from __future__ import annotations
import argparse
import importlib
import logging
import os
import sys
import warnings
import yaml
from argparse import RawTextHelpFormatter
from collections import defaultdict
from copy import deepcopy
from enum import Enum
from importlib_resources import files
from jsonschema import validate
from typing import Any, Optional, Type, Union
import caosdb as db
from caosadvancedtools.cache import UpdateCache, Cache
from caosadvancedtools.crawler import Crawler as OldCrawler
from caosdb.apiutils import (compare_entities, EntityMergeConflictError,
merge_entities)
from caosdb.common.datatype import is_reference
from .converters import Converter, DirectoryConverter, ConverterValidationError
from .macros import defmacro_constructor, macro_constructor
from .stores import Store, GeneralStore, RecordStore
from .structure_elements import StructureElement, Directory, NoneElement
from .version import check_cfood_version
from caosdb.high_level_api import convert_to_python_object
class DebugTree(object):
    """Container for debug information collected during a crawler run.

    Attributes
    ----------
    debug_tree : dict[str, tuple]
        Maps the string representation of a structure element to a tuple
        holding, in order:
        0: the general store snapshot
        1: the record store snapshot
        TODO: turn the tuple into two individual elements.
    debug_metadata : dict[str, dict]
        Bookkeeping collected alongside the tree:
        - "copied": per-element copy information of the two stores
        - "provenance": per-record-identifier dict of modified properties
        - "usage": per-element set of converter paths that processed it
    """

    def __init__(self):
        self.debug_tree: dict[str, tuple] = dict()
        self.debug_metadata: dict[str, dict] = dict()
        self.debug_metadata["copied"] = dict()
        # defaultdict(dict) / defaultdict(set) instead of
        # defaultdict(lambda: ...): identical behavior, more idiomatic,
        # and keeps instances picklable (lambdas cannot be pickled).
        self.debug_metadata["provenance"] = defaultdict(dict)
        self.debug_metadata["usage"] = defaultdict(set)
...@@ -63,9 +63,7 @@ from .version import check_cfood_version ...@@ -63,9 +63,7 @@ from .version import check_cfood_version
from caosdb.high_level_api import convert_to_python_object from caosdb.high_level_api import convert_to_python_object
from .debug.debug_tree import (DebugTreeStructureElement, from .debug_tree import DebugTree
DebugTreeConverter,
DebugTreeVariable)
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
...@@ -224,15 +222,15 @@ def initialize_converters(crawler_definition: dict, converter_registry: dict): ...@@ -224,15 +222,15 @@ def initialize_converters(crawler_definition: dict, converter_registry: dict):
# Main scanner function: # Main scanner function:
# -------------------------------------------------------------------------------- # --------------------------------------------------------------------------------
def scanner(self, def scanner(items: list[StructureElement],
items: list[StructureElement],
converters: list[Converter], converters: list[Converter],
general_store: Optional[GeneralStore] = None, general_store: Optional[GeneralStore] = None,
record_store: Optional[RecordStore] = None, record_store: Optional[RecordStore] = None,
structure_elements_path: Optional[list[str]] = None, structure_elements_path: Optional[list[str]] = None,
converters_path: Optional[list[str]] = None, converters_path: Optional[list[str]] = None,
restricted_path: Optional[list[str]] = None, restricted_path: Optional[list[str]] = None,
crawled_data: Optional[list[db.Record]] = None): crawled_data: Optional[list[db.Record]] = None,
debug_tree: Optional[DebugTree] = None):
""" """
Crawl a list of StructureElements and apply any matching converters. Crawl a list of StructureElements and apply any matching converters.
...@@ -271,6 +269,12 @@ def scanner(self, ...@@ -271,6 +269,12 @@ def scanner(self,
if structure_elements_path is None: if structure_elements_path is None:
structure_elements_path = [] structure_elements_path = []
if converters_path is None:
converters_path = []
if debug_tree is None:
debug_tree = DebugTree()
for element in items: for element in items:
for converter in converters: for converter in converters:
...@@ -296,32 +300,32 @@ def scanner(self, ...@@ -296,32 +300,32 @@ def scanner(self,
children = converter.create_children(general_store_copy, element) children = converter.create_children(general_store_copy, element)
if self.debug: # add provenance information for each variable
# add provenance information for each variable self.debug_tree.debug_tree[str(element)] = (
self.debug_tree[str(element)] = ( general_store_copy.get_storage(), record_store_copy.get_storage())
general_store_copy.get_storage(), record_store_copy.get_storage()) self.debug_tree.debug_metadata["copied"][str(element)] = (
self.debug_metadata["copied"][str(element)] = ( general_store_copy.get_dict_copied(),
general_store_copy.get_dict_copied(), record_store_copy.get_dict_copied())
record_store_copy.get_dict_copied()) self.debug_tree.debug_metadata["usage"][str(element)].add(
self.debug_metadata["usage"][str(element)].add( "/".join(converters_path + [converter.name]))
"/".join(converters_path + [converter.name])) mod_info = self.debug_tree.debug_metadata["provenance"]
mod_info = self.debug_metadata["provenance"] for record_name, prop_name in keys_modified:
for record_name, prop_name in keys_modified: # TODO: check
# TODO: check internal_id = record_store_copy.get_internal_id(
internal_id = record_store_copy.get_internal_id( record_name)
record_name) record_identifier = record_name + \
record_identifier = record_name + \ "_" + str(internal_id)
"_" + str(internal_id) converter.metadata["usage"].add(record_identifier)
converter.metadata["usage"].add(record_identifier) mod_info[record_identifier][prop_name] = (
mod_info[record_identifier][prop_name] = ( structure_elements_path + [element.get_name()],
structure_elements_path + [element.get_name()], converters_path + [converter.name])
converters_path + [converter.name])
scanner(children, converter.converters,
self.scanner(children, converter.converters, general_store_copy, record_store_copy,
general_store_copy, record_store_copy, structure_elements_path + [element.get_name()],
structure_elements_path + [element.get_name()], converters_path + [converter.name],
converters_path + [converter.name], restricted_path[1:] if restricted_path is not None else None,
restricted_path[1:] if restricted_path is not None else None) crawled_data, debug_tree)
if restricted_path and not path_found: if restricted_path and not path_found:
raise RuntimeError("A 'restricted_path' argument was given that is not contained in " raise RuntimeError("A 'restricted_path' argument was given that is not contained in "
...@@ -331,7 +335,7 @@ def scanner(self, ...@@ -331,7 +335,7 @@ def scanner(self,
# to the general update container. # to the general update container.
scoped_records = record_store.get_records_current_scope() scoped_records = record_store.get_records_current_scope()
for record in scoped_records: for record in scoped_records:
self.crawled_data.append(record) crawled_data.append(record)
# TODO: the scoped variables should be cleaned up as soon if the variables # TODO: the scoped variables should be cleaned up as soon if the variables
# are no longer in the current scope. This can be implemented as follows, # are no longer in the current scope. This can be implemented as follows,
...@@ -344,7 +348,7 @@ def scanner(self, ...@@ -344,7 +348,7 @@ def scanner(self,
# del record_store[name] # del record_store[name]
# del general_store[name] # del general_store[name]
return self.crawled_data return crawled_data, debug_tree
...@@ -376,6 +380,8 @@ def scan_directory(dirname: str, crawler_definition_path: str, ...@@ -376,6 +380,8 @@ def scan_directory(dirname: str, crawler_definition_path: str,
raise ValueError( raise ValueError(
"You have to provide a non-empty path for crawling.") "You have to provide a non-empty path for crawling.")
dir_structure_name = os.path.basename(dirname) dir_structure_name = os.path.basename(dirname)
# TODO: needs to be covered somewhere else
crawled_directory = dirname crawled_directory = dirname
if not dir_structure_name and dirname.endswith('/'): if not dir_structure_name and dirname.endswith('/'):
if dirname == '/': if dirname == '/':
...@@ -423,7 +429,8 @@ def scan_structure_elements(items: Union[list[StructureElement], StructureElemen ...@@ -423,7 +429,8 @@ def scan_structure_elements(items: Union[list[StructureElement], StructureElemen
if not isinstance(items, list): if not isinstance(items, list):
items = [items] items = [items]
self.run_id = uuid.uuid1() # TODO: needs to be covered somewhere else
# self.run_id = uuid.uuid1()
converters = initialize_converters(crawler_definition, converter_registry) converters = initialize_converters(crawler_definition, converter_registry)
return scanner( return scanner(
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment