diff --git a/CHANGELOG.md b/CHANGELOG.md index 29013891d7f53dd4c4d8164e79eecfa169fcb289..a27165062f8b7c93bd5a53f54cc6b26003699562 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] ## ### Added ### +* 'transform' sections can be added to a CFood to apply functions to values stored in variables. ### Changed ### - If the `parents` key is used in a cfood at a lower level for a Record that @@ -23,9 +24,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Removed ### ### Fixed ### -- Empty Records can now be created (https://gitlab.com/caosdb/caosdb-crawler/-/issues/27) - +* Empty Records can now be created (https://gitlab.com/caosdb/caosdb-crawler/-/issues/27) * [#58](https://gitlab.com/caosdb/caosdb-crawler/-/issues/58) Documentation builds API docs in pipeline now. +* [#117](https://gitlab.indiscale.com/caosdb/src/caosdb-crawler/-/issues/117) `replace_variables` no longer unnecessarily changes the type. Values stored in variables in a CFood can now have other types.
### Security ### diff --git a/src/caoscrawler/converters.py b/src/caoscrawler/converters.py index 930a08c167e17efaed39b4b9045589cde68b2e8c..708a3c40c2b94062f5ab8fede47a884b8103fb16 100644 --- a/src/caoscrawler/converters.py +++ b/src/caoscrawler/converters.py @@ -32,8 +32,9 @@ import os import re import warnings from abc import ABCMeta, abstractmethod +from inspect import signature from string import Template -from typing import List, Optional, Tuple, Union +from typing import Any, List, Optional, Tuple, Union import caosdb as db import pandas as pd @@ -52,7 +53,7 @@ from .utils import has_parent # by the converters: SPECIAL_PROPERTIES = ("description", "name", "id", "path", "file", "checksum", "size") - +SINGLE_VAR_RE = re.compile(r"^\$(\{)?(?P<varname>[0-9a-zA-Z_]+)(\})?$") logger = logging.getLogger(__name__) @@ -128,35 +129,34 @@ def create_path_value(func): return inner -def replace_variables(propvalue, values: GeneralStore): +def replace_variables(propvalue: Any, values: GeneralStore): """ This function replaces variables in property values (and possibly other locations, where the crawler can replace cfood-internal variables). - This function checks whether the value that is to be replaced is of type db.Entity. - In this case the entity is returned (note that this is of course only possible, if the - occurrence of the variable is directly at the beginning of the value and e.g. no string - concatenation is attempted. - - In any other case the variable substitution is carried out and a new string with the - replaced variables is returned. + If `propvalue` is a single variable name preceded by a '$' (e.g. '$var' or '${var}'), then + the corresponding value stored in `values` is returned. + In any other case the variable substitution is carried out as defined by string templates + and a new string with the replaced variables is returned. """ + # We only replace string variable names.
If it is not a string the value stays unchanged + if not isinstance(propvalue, str): + return propvalue + # Check if the replacement is a single variable containing a record: - match = re.match(r"^\$(\{)?(?P<varname>[0-9a-zA-Z_]+)(\})?$", propvalue) + match = SINGLE_VAR_RE.match(propvalue) if match is not None: varname = match.group("varname") if varname in values: - if values[varname] is None: - return None - if isinstance(values[varname], db.Entity): - return values[varname] + return values[varname] propvalue_template = CrawlerTemplate(propvalue) return propvalue_template.safe_substitute(**values.get_storage()) def handle_value(value: Union[dict, str, list], values: GeneralStore): - """Determine whether the given value needs to set a property, be added to an existing value (create a list) or + """Determine whether the given value needs to set a property, + be added to an existing value (create a list) or add as an additional property (multiproperty). Variable names (starting with a "$") are replaced by the corresponding value stored in the @@ -204,7 +204,6 @@ out: tuple # being able to directly set list values. Semantics is, however, a bit # different from the two cases above. collection_mode = "single" - propvalue = value # variables replacement: propvalue = list() @@ -319,6 +318,16 @@ class Converter(object, metaclass=ABCMeta): """Converters treat StructureElements contained in the hierarchical sturcture.""" def __init__(self, definition: dict, name: str, converter_registry: dict): + """ + + Parameters + ---------- + definition: dict, Please refer to ``src/doc/converters.rst`` to learn about the structure + that the definition dict must have. + converter_registry: dict, A dictionary that contains converter names as keys and dicts as + values. Those value dicts have the keys 'converter' and 'package'. 
+ """ + self.definition = definition self.name = name @@ -328,6 +337,23 @@ class Converter(object, metaclass=ABCMeta): } self.converters = [] + if "transform" in self.definition: + if not isinstance(self.definition["transform"], dict): + raise RuntimeError("The value corresponding to the 'transform' key in the " + "converter definition must be a dict") + for transformer_key, transformer in self.definition["transform"].items(): + if "in" not in transformer: + raise RuntimeError("In-variable not defined!") + if "out" not in transformer: + raise RuntimeError("Out-variable not defined!") + if "functions" not in transformer: + raise RuntimeError("No functions given for transformer!") + if not isinstance(transformer["functions"], list): + raise RuntimeError("The value corresponding to the 'functions' key in the " + "transform section must be a list") + + if not isinstance(transformer["in"], str): + raise RuntimeError("You should provide the variable name as string") if "subtree" in definition: for converter_name in definition['subtree']: @@ -363,8 +389,14 @@ class Converter(object, metaclass=ABCMeta): Extract information from the structure element and store them as values in the general store. - values: The GeneralStore to store values in. - element: The StructureElement to extract values from. + Parameters: + ------------ + + values: GeneralStore + The GeneralStore to store values in. + + element: StructureElement + The StructureElement to extract values from. """ m = self.match(element) if m is None: @@ -372,6 +404,61 @@ class Converter(object, metaclass=ABCMeta): raise RuntimeError("Condition does not match.") values.update(m) + def apply_transformers(self, values: GeneralStore, transformer_functions: dict): + """ + Check if transformers are defined using the "transform" keyword. + Then apply the transformers to the variables defined in GeneralStore "values". + + Parameters: + ------------ + + values: GeneralStore + The GeneralStore to store values in. 
+ + transformer_functions: dict + A dictionary of registered functions that can be used within this transformer block. + The keys of the dict are the function keys and the values the callable functions of the + form: + + def func(in_value: Any, in_parameters: dict) -> Any: + pass + """ + + if not "transform" in self.definition: + return + for transformer_key, transformer in self.definition["transform"].items(): + in_value = replace_variables(transformer["in"], values) + + for tr_func_el in transformer["functions"]: + if not isinstance(tr_func_el, dict): + raise RuntimeError("Elements of the list of the functions key " + "must be dictonaries!") + if len(tr_func_el) != 1: + raise RuntimeError("List element dictionaries must have exactly" + " one element with they key being the name" + " of the function!") + tr_func_key = list(tr_func_el.keys())[0] + tr_func_params = tr_func_el[tr_func_key] + if tr_func_key not in transformer_functions: + raise RuntimeError("Unknown transformer function: {}".format(tr_func_key)) + + # Retrieve the function from the dictionary: + tr_func = transformer_functions[tr_func_key] + # Call the function: + sig = signature(tr_func) + if len(sig.parameters) == 1 and len(tr_func_params) == 0: + out_value = tr_func(in_value) + else: + out_value = tr_func(in_value, tr_func_params) + # The next in_value is the current out_value: + in_value = out_value + # If everything succeeded, store the final value in the general store: + match = SINGLE_VAR_RE.match(transformer["out"]) + if match is None: + raise RuntimeError("'out' of the transformer definition must specify a single" + f" variable name. 
It was {transformer['out']}") + values[match.group('varname')] = out_value + @abstractmethod def create_children(self, values: GeneralStore, element: StructureElement): @@ -597,6 +684,15 @@ class MarkdownFileConverter(SimpleFileConverter): "Error during the validation (yaml header cannot be read) of the markdown file " "located at the following node in the data structure:\n" f"{path}") + except yaml_header_tools.ParseErrorsInHeader as err: + if generalStore is not None and self.name in generalStore: + path = generalStore[self.name] + else: + path = "<path not set>" + raise ConverterValidationError( + "Error during the validation (yaml header cannot be read) of the markdown file " + "located at the following node in the data structure:\n" + "{}\nError:\n{}".format(path, err)) children: List[StructureElement] = [] for name, entry in header.items(): @@ -605,8 +701,12 @@ class MarkdownFileConverter(SimpleFileConverter): elif type(entry) == str: children.append(TextElement(name, entry)) else: + if generalStore is not None and self.name in generalStore: + path = generalStore[self.name] + else: + path = "<path not set>" raise RuntimeError( - "Header entry {} has incompatible type.".format(name)) + "Header entry {} has incompatible type.\nFilename: {}".format(name, path)) return children diff --git a/src/caoscrawler/crawl.py b/src/caoscrawler/crawl.py index 82e573cb8749417b522b15bfe4984acec3fdf8a9..7b05120b3f609368528cd83ed0c5ab3b0a0ad9f1 100644 --- a/src/caoscrawler/crawl.py +++ b/src/caoscrawler/crawl.py @@ -32,7 +32,6 @@ the acuired data with CaosDB. 
from __future__ import annotations import argparse -import importlib import logging import os import sys @@ -40,28 +39,24 @@ import traceback import uuid import warnings from argparse import RawTextHelpFormatter -from collections import defaultdict from copy import deepcopy from datetime import datetime from enum import Enum -from typing import Any, Optional, Type, Union +from typing import Any, Optional, Union import caosdb as db import yaml -from caosadvancedtools.cache import Cache, UpdateCache +from caosadvancedtools.cache import UpdateCache from caosadvancedtools.crawler import Crawler as OldCrawler from caosadvancedtools.serverside.helper import send_mail from caosadvancedtools.utils import create_entity_link from caosdb.apiutils import (EntityMergeConflictError, compare_entities, merge_entities) from caosdb.cached import cache_clear, cached_get_entity_by -from caosdb.common.datatype import is_reference from caosdb.exceptions import EmptyUniqueQueryError -from importlib_resources import files -from jsonschema import validate from .config import get_config_setting -from .converters import Converter, ConverterValidationError, DirectoryConverter +from .converters import Converter, ConverterValidationError from .debug_tree import DebugTree from .identifiable import Identifiable from .identifiable_adapters import (CaosDBIdentifiableAdapter, @@ -72,9 +67,8 @@ from .logging import configure_server_side_logging from .macros import defmacro_constructor, macro_constructor from .scanner import (create_converter_registry, initialize_converters, load_definition, scan_directory, scan_structure_elements) -from .stores import GeneralStore, RecordStore -from .structure_elements import Directory, NoneElement, StructureElement -from .version import check_cfood_version +from .stores import GeneralStore +from .structure_elements import StructureElement logger = logging.getLogger(__name__) diff --git a/src/caoscrawler/default_transformers.yml 
b/src/caoscrawler/default_transformers.yml new file mode 100644 index 0000000000000000000000000000000000000000..74a76ae7f0a4051682a949e5ba7ed79e84c71578 --- /dev/null +++ b/src/caoscrawler/default_transformers.yml @@ -0,0 +1,8 @@ + + +submatch: + package: caoscrawler.transformer_functions + function: submatch +split: + package: caoscrawler.transformer_functions + function: split diff --git a/src/caoscrawler/scanner.py b/src/caoscrawler/scanner.py index 2f4f4228ccba232fb94654ae2b67756de8bf121c..3f8b85f4c49b33d4e065a840293c871381c31396 100644 --- a/src/caoscrawler/scanner.py +++ b/src/caoscrawler/scanner.py @@ -36,21 +36,20 @@ import importlib import logging import os import warnings -import yaml +from collections.abc import Callable +from typing import Any, Optional, Type, Union +import caosdb as db +import yaml from importlib_resources import files from jsonschema import validate -from typing import Any, Optional, Type, Union -import caosdb as db from .converters import Converter - +from .debug_tree import DebugTree from .stores import GeneralStore, RecordStore -from .structure_elements import StructureElement, Directory +from .structure_elements import Directory, StructureElement from .version import check_cfood_version -from .debug_tree import DebugTree - logger = logging.getLogger(__name__) @@ -110,6 +109,7 @@ def _load_definition_from_yaml_dict(crawler_definitions: list[dict]): for key in metadata["Converters"]: schema["cfood"]["$defs"]["converter"]["properties"]["type"]["enum"].append( key) + # TODO: We need a similar thing for "Transformers". 
# Validate the cfood schema: validate(instance=crawler_definition, schema=schema["cfood"]) @@ -182,6 +182,42 @@ def create_converter_registry(definition: dict): return converter_registry +def create_transformer_registry(definition: dict[str, dict[str, str]]): + """ + Currently the transformer registry is a dictionary containing for each transformer: + - key is the short code, abbreviation for the converter class name + - module is the name of the module to be imported which must be installed + - class is the transformer function to load and associate with this converter entry + + all other info for the converter needs to be included in the converter plugin + directory: + schema.yml file + README.md documentation + + Please refer to the docstring of function "scanner" for more information about the + detailed structure of the transformer functions. + """ + + # Defaults for the transformer registry: + with open(str(files('caoscrawler').joinpath('default_transformers.yml')), "r") as f: + transformer_def: dict[str, dict[str, str]] = yaml.safe_load(f) + + registry: dict[str, Callable[[Any, dict], Any]] = {} + # More transformers from definition file: + if "Transformers" in definition: + for key, entry in definition["Transformers"].items(): + transformer_def[key] = { + "function": entry["function"], + "package": entry["package"] + } + + # Load modules and associate classes: + for key, value in transformer_def.items(): + module = importlib.import_module(value["package"]) + registry[key] = getattr(module, value["function"]) + return registry + + def initialize_converters(crawler_definition: dict, converter_registry: dict): """ takes the cfood as dict (`crawler_definition`) and creates the converter objects that @@ -200,6 +236,8 @@ def initialize_converters(crawler_definition: dict, converter_registry: dict): continue elif key == "Converters": continue + elif key == "Transformers": + continue converters.append(Converter.converter_factory( value, key, converter_registry)) @@ 
-218,7 +256,8 @@ def scanner(items: list[StructureElement], converters_path: Optional[list[str]] = None, restricted_path: Optional[list[str]] = None, crawled_data: Optional[list[db.Record]] = None, - debug_tree: Optional[DebugTree] = None): + debug_tree: Optional[DebugTree] = None, + registered_transformer_functions: Optional[dict] = None): """Crawl a list of StructureElements and apply any matching converters. Formerly known as "_crawl". @@ -241,6 +280,19 @@ def scanner(items: list[StructureElement], be the name of the StructureElement at this level, i.e. denoting the respective element in the items argument. + registered_transformer_functions: dict + A dictionary of transformer functions that can be used in the "transform" block + of a converter and that allows to apply simple transformations to variables extracted + either by the current converter or to other variables found in the current variable store. + + Each function is a dictionary: + - The key is the name of the function to be looked up in the dictionary + of registered transformer functions. + - The value is the function which needs to be of the form: + + def func(in_value: Any, in_parameters: dict) -> Any: + pass + """ # This path_found variable stores wether the path given by restricted_path was found in the # data tree @@ -281,9 +333,13 @@ def scanner(items: list[StructureElement], general_store_copy[converter.name] = element_path # extracts values from structure element and stores them in the - # variable store + # variable store. 
converter.create_values(general_store_copy, element) + # Apply transformers if there are any: + converter.apply_transformers(general_store_copy, + registered_transformer_functions) + keys_modified = converter.create_records( general_store_copy, record_store_copy, element) @@ -315,7 +371,8 @@ def scanner(items: list[StructureElement], structure_elements_path + [element.get_name()], converters_path + [converter.name], restricted_path[1:] if restricted_path is not None else None, - crawled_data, debug_tree) + crawled_data, debug_tree, + registered_transformer_functions) if restricted_path and not path_found: raise RuntimeError("A 'restricted_path' argument was given that is not contained in " @@ -363,6 +420,9 @@ def scan_directory(dirname: str, crawler_definition_path: str, # Load and register converter packages: converter_registry = create_converter_registry(crawler_definition) + # Load and register transformer functions: + registered_transformer_functions = create_transformer_registry(crawler_definition) + if not dirname: raise ValueError( "You have to provide a non-empty path for crawling.") @@ -383,7 +443,8 @@ def scan_directory(dirname: str, crawler_definition_path: str, crawler_definition, converter_registry, restricted_path=restricted_path, - debug_tree=debug_tree + debug_tree=debug_tree, + registered_transformer_functions=registered_transformer_functions ) @@ -391,7 +452,8 @@ def scan_structure_elements(items: Union[list[StructureElement], StructureElemen crawler_definition: dict, converter_registry: dict, restricted_path: Optional[list[str]] = None, - debug_tree: Optional[DebugTree] = None): + debug_tree: Optional[DebugTree] = None, + registered_transformer_functions: Optional[dict] = None): """ Start point of the crawler recursion. 
@@ -428,5 +490,6 @@ def scan_structure_elements(items: Union[list[StructureElement], StructureElemen items=items, converters=converters, restricted_path=restricted_path, - debug_tree=debug_tree + debug_tree=debug_tree, + registered_transformer_functions=registered_transformer_functions ) diff --git a/src/caoscrawler/transformer_functions.py b/src/caoscrawler/transformer_functions.py new file mode 100644 index 0000000000000000000000000000000000000000..8901c2f53bcfbfb28333887a98576e67cff81386 --- /dev/null +++ b/src/caoscrawler/transformer_functions.py @@ -0,0 +1,52 @@ +#!/usr/bin/env python3 +# encoding: utf-8 +# +# This file is a part of the LinkAhead Project. +# +# Copyright (C) 2024 Indiscale GmbH <info@indiscale.com> +# Copyright (C) 2024 Henrik tom Wörden <h.tomwoerden@indiscale.com> +# Copyright (C) 2023 Alexander Schlemmer <alexander.schlemmer@ds.mpg.de> +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <https://www.gnu.org/licenses/>. + +""" +Definition of default transformer functions. +""" +import re +from typing import Any + + +def submatch(in_value: Any, in_parameters: dict): + """ + Substitute the variable if it matches the regexp stored in "match". + + Returns the "in" value if it does NOT match the reg exp of 'match'. + Otherwise (if it matches) the value of 'then' stored in the second argument is returned.
+ """ + if "match" not in in_parameters or "then" not in in_parameters: + raise RuntimeError("Mandatory parameters missing.") + if re.match(in_parameters["match"], in_value) is not None: + return in_parameters["then"] + return in_value + + +def split(in_value: Any, in_parameters: dict): + """calls the string 'split' function on the first argument and uses the value of the key + 'marker' stored in the second argument + """ + if "marker" not in in_parameters: + raise RuntimeError("Mandatory parameter missing.") + if not isinstance(in_value, str): + raise RuntimeError("must be string") + return in_value.split(in_parameters['marker']) diff --git a/src/doc/cfood.rst b/src/doc/cfood.rst index 6564ee677f0b363a52c44dd5ceabe5378c255105..c12e251d49e164a737b20e92e56e7b3e10149d4f 100644 --- a/src/doc/cfood.rst +++ b/src/doc/cfood.rst @@ -4,6 +4,7 @@ CFood-Definition The crawler specification is called CFood-definition. It is stored inside a yaml file, or - more precisely - inside of one single or two yaml documents inside a yaml file. The specification consists of three separate parts: + #. Metadata and macro definitions #. Custom converter registrations #. The converter tree specification @@ -178,6 +179,23 @@ in a vairable with the same name (as it is the case for other Records). SomeRecord: ParameterFile: $fileEntity # creates a reference to the file + +Transform Functions +------------------- +You can use transform functions to alter variable values that the crawler consumes (e.g. a string +that was matched with a reg exp). See :doc:`Converter Documentation<converters.rst>`. + +You can define your own transform functions by adding them the same way you add custom converters: + +..
code-block:: yaml + + Transformers: + transform_foo: + package: some.package + function: some_foo + + + Automatically generated keys ++++++++++++++++++++++++++++ diff --git a/src/doc/converters.rst b/src/doc/converters.rst index 98849609f0cab2afba037a82fe4ae6802caa5956..60da52d3ed110f050a3d7aae866cc7d8b6b8dc31 100644 --- a/src/doc/converters.rst +++ b/src/doc/converters.rst @@ -56,6 +56,54 @@ to generate records (see :py:meth:`~caoscrawler.converters.Converter.create_reco Subtree contains a list of Converter defnitions that look like the one described here. +Transform Functions ++++++++++++++++++++ +Often the situation arises, that you cannot use a value as it is found. Maybe a value should be +increased by an offset or a string should be split into a list of pieces. In order to allow such +simple conversions, transform functions can be named in the converter definition that are then +applied to the respective variables when the converter is executed. + +.. code-block:: yaml + + <NodeName>: + type: <ConverterName> + match: ".*" + transform: + <TransformNodeName>: + in: $<in_var_name> + out: $<out_var_name> + functions: + - <func_name>: # name of the function to be applied + <func_arg1>: <func_arg1_value> # key value pairs that are passed as parameters + <func_arg2>: <func_arg2_value> + # ... + +An example that splits the variable ``a`` and puts the generated list in ``b`` is the following: + +.. code-block:: yaml + + Experiment: + type: Dict + match: ".*" + transform: + param_split: + in: $a + out: $b + functions: + - split: # split is a function that is defined by default + marker: "|" # its only parameter is the marker that is used to split the string + records: + Report: + tags: $b + +This splits the string in '$a' and stores the resulting list in '$b'. This is here used to add a +list valued property to the Report Record. + + +There are a number of transform functions that are defined by default (see +``src/caoscrawler/default_transformers.yml``). 
You can define custom transform functions by adding +them to the cfood definition (see :doc:`CFood Documentation<cfood.rst>`). + Standard Converters +++++++++++++++++++ diff --git a/src/doc/macros.rst b/src/doc/macros.rst index 5d8a411607af223c5b8d65b1553e710553d998f0..560827e6fc4ff8b0238f16ca8d76b2c682bce505 100644 --- a/src/doc/macros.rst +++ b/src/doc/macros.rst @@ -231,3 +231,43 @@ positions. Consider: However, this should not be a real limitation, as the crawler is designed in a way, that the order of the nodes in the same level should not matter. + + +Using macros within macro definitions +===================================== + +It is possible to use other macros in macro definitions. Again, examples can be found in +the macro unit tests (see e.g. :func:`unittests.test_macros.test_macros_in_macros`): + +.. _example_macros_in_macros: +.. code-block:: yaml + + --- + metadata: + crawler-version: 0.3.1 + macros: + - !defmacro + name: one_macro + params: + a: 25 + definition: + macro_sub_$a: + b: $a + another_param: 3 + - !defmacro + name: test_macrodef + params: {} + definition: + macro_top: !macro + one_macro: + - a: 17 + - {} + - a: 98 + not_macro: + a: 26 + --- + extroot: !macro + test_macrodef: + +TODO: +to be continued diff --git a/unittests/test_converters.py b/unittests/test_converters.py index ab5710feaaf14babc3fed65f10598250e53ffd9b..42b078eb31c4bdfb0bdeea3d2a0b4fe1c4404ca7 100644 --- a/unittests/test_converters.py +++ b/unittests/test_converters.py @@ -39,9 +39,10 @@ from caoscrawler.converters import (Converter, ConverterValidationError, DictIntegerElementConverter, DirectoryConverter, FloatElementConverter, IntegerElementConverter, JSONFileConverter, + ListElementConverter, MarkdownFileConverter, YAMLFileConverter, _AbstractScalarValueElementConverter, - handle_value) + handle_value, replace_variables) from caoscrawler.crawl import Crawler from caoscrawler.scanner import (_load_definition_from_yaml_dict, create_converter_registry, load_definition) @@ 
-50,6 +51,7 @@ from caoscrawler.structure_elements import (BooleanElement, DictElement, Directory, File, FloatElement, IntegerElement, ListElement, TextElement) +from caoscrawler.transformer_functions import split UNITTESTDIR = Path(__file__).parent @@ -166,7 +168,7 @@ def test_markdown_converter(converter_registry): test_readme2 = File( "README.md", - UNITTESTDIR/"test_directories" / "examples_article" / + UNITTESTDIR / "test_directories" / "examples_article" / "ExperimentalData" / "2020_SpeedOfLight" / "2020-01-01_TimeOfFlight" / "README.md" ) @@ -244,7 +246,7 @@ def test_json_converter(converter_registry): invalid_json = File( "invalidjson.json", - UNITTESTDIR/"test_directories" / "examples_json" / "invalidjson.json" + UNITTESTDIR / "test_directories" / "examples_json" / "invalidjson.json" ) # Doesn't validate because of missing required 'name' property with pytest.raises(ConverterValidationError) as err: @@ -253,7 +255,7 @@ def test_json_converter(converter_registry): broken_json = File( "brokenjson.json", - UNITTESTDIR/"test_directories" / "examples_json" / "brokenjson.json" + UNITTESTDIR / "test_directories" / "examples_json" / "brokenjson.json" ) with pytest.raises(json.decoder.JSONDecodeError) as err: jsonconverter.create_children(None, broken_json) @@ -318,7 +320,7 @@ def test_yaml_converter(converter_registry): invalid_yaml = File( "invalidyaml.yml", - UNITTESTDIR/"test_directories" / "test_yamls" / "invalidyaml.yml" + UNITTESTDIR / "test_directories" / "test_yamls" / "invalidyaml.yml" ) # Doesn't validate because of missing required 'name' property @@ -328,7 +330,7 @@ def test_yaml_converter(converter_registry): broken_yaml = File( "brokenyaml.yml", - UNITTESTDIR/"test_directories" / "test_yamls" / "brokenyaml.yml" + UNITTESTDIR / "test_directories" / "test_yamls" / "brokenyaml.yml" ) with pytest.raises(yaml.parser.ParserError) as err: yamlconverter.create_children(None, broken_yaml) @@ -339,6 +341,12 @@ def test_variable_replacement(): values["a"] = 4 
values["b"] = "68" + # basic values stay unchanged + assert replace_variables(5, values) is 5 + assert replace_variables(True, values) is True + assert replace_variables("$a", values) is 4 + assert replace_variables("${b}", values) == "68" + assert handle_value("b", values) == ("b", "single") assert handle_value("+b", values) == ("b", "list") assert handle_value("*b", values) == ("b", "multiproperty") @@ -360,7 +368,36 @@ def test_variable_replacement(): "collection_mode": "multiproperty"}, values) == ("68", "multiproperty") assert handle_value(["a", "b"], values) == (["a", "b"], "single") - assert handle_value(["$a", "$b"], values) == (["4", "68"], "single") + assert handle_value(["$a", "$b"], values) == ([4, "68"], "single") + + +def test_apply_transformers(converter_registry): + cfood_def = {"type": 'ListElement', "debug_match": True, "match_name": ".*", + 'transform': {'test': {'in': '$a', 'out': '$b', 'functions': [{ + 'split': {'marker': '|'}}]}}} + values = GeneralStore() + values["a"] = "a|b|c" + + # transformer_functions = create_transformer_registry(crawler_definition) + transformer_functions = {"split": split} + + conv = ListElementConverter(definition=cfood_def, name='test', + converter_registry=converter_registry) + + assert values['a'] is "a|b|c" + conv.apply_transformers(values, transformer_functions) + assert values['a'] is "a|b|c" + assert values['b'] == ["a", "b", "c"] + + # Check replacing of existing variable + cfood_def = {"type": 'ListElement', "debug_match": True, "match_name": ".*", + 'transform': {'test': {'in': '$a', 'out': '$a', 'functions': [{ + 'split': {'marker': '|'}}]}}} + conv = ListElementConverter(definition=cfood_def, name='test', + converter_registry=converter_registry) + + conv.apply_transformers(values, transformer_functions) + assert values['a'] == ["a", "b", "c"] def test_filter_children_of_directory(converter_registry, capsys): diff --git a/unittests/test_directories/test_transformers/Day_Mon/README.md 
b/unittests/test_directories/test_transformers/Day_Mon/README.md new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/unittests/test_directories/test_transformers/Day_Tue/README.md b/unittests/test_directories/test_transformers/Day_Tue/README.md new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/unittests/test_directories/test_transformers/Day_Unk/README.md b/unittests/test_directories/test_transformers/Day_Unk/README.md new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/unittests/test_directories/test_transformers/cfood.yml b/unittests/test_directories/test_transformers/cfood.yml new file mode 100644 index 0000000000000000000000000000000000000000..ec79eaf00203b5df1a436dfe50fbf17fa7b764db --- /dev/null +++ b/unittests/test_directories/test_transformers/cfood.yml @@ -0,0 +1,49 @@ + +# See: https://gitlab.indiscale.com/caosdb/src/caosdb-crawler/-/issues/107 +# +Transformers: + ceil: + function: quote + package: shlex + +RootDir: + type: Directory + match: ^.*$ + subtree: + DateDir: + type: Directory + match: ^Day_(?P<day_short>.*)$ # Example: Day_Mon + transform: + MakeDayLong: + in: $day_short + out: $day_long + functions: + - submatch: # name of the function + match: Mon # match is one specific argument + then: Monday # then another one + - submatch: # next function + match: Tue + then: Tuesday + TestSplit: + in: $day_short + out: $day_split + functions: + - split: + marker: o + records: + DayFolder: + Day: $day_long + DayShort: $day_short # just for checking, whether this variable remains + DaySplit: $day_split # just for checking, whether this variable remains + Testfi: + type: File + match: ^(?P<no>(\d+ )*)$ + transform: + up: + in: $no + out: $no + functions: + - ceil: {} + records: + Number: + num: $no diff --git a/unittests/test_scalars_cfood.py 
b/unittests/test_scalars_cfood.py index 57a8083c2aace830115c410e0425a8af4da17a7b..ba604fe4f5b695506bf8df9dab79fc23232c546a 100644 --- a/unittests/test_scalars_cfood.py +++ b/unittests/test_scalars_cfood.py @@ -37,11 +37,11 @@ def test_handle_value(): def test_record_structure_generation(): dbt = DebugTree() - scan_directory(UNITTESTDIR/"test_directories" / "examples_article", - UNITTESTDIR/"cfoods_scalar.yml", + scan_directory(UNITTESTDIR / "test_directories" / "examples_article", + UNITTESTDIR / "cfoods_scalar.yml", debug_tree=dbt) subd = dbt.debug_tree[dircheckstr( - UNITTESTDIR/"test_directories" / "examples_article", "DataAnalysis")] + UNITTESTDIR / "test_directories" / "examples_article", "DataAnalysis")] assert len(subd) == 2 # variables store on Data Analysis node of debug tree if "Data" in subd[0]: diff --git a/unittests/test_scanner.py b/unittests/test_scanner.py index 21b7da10ef860378a4779f34f8f1b26c6a54f359..c0ce736fc4bed18f371f1626b6bc451ee103db49 100644 --- a/unittests/test_scanner.py +++ b/unittests/test_scanner.py @@ -1,8 +1,10 @@ +#!/usr/bin/env python3 # encoding: utf-8 # -# This file is a part of the CaosDB Project. +# This file is a part of the LinkAhead Project. # -# Copyright (C) 2021 Henrik tom Wörden <h.tomwoerden@indiscale.com> +# Copyright (C) 2023,2024 Indiscale GmbH <info@indiscale.com> +# Copyright (C) 2023,2024 Henrik tom Wörden <h.tomwoerden@indiscale.com> # 2021-2023 Research Group Biomedical Physics, # Max-Planck-Institute for Dynamics and Self-Organization Göttingen # Alexander Schlemmer <alexander.schlemmer@ds.mpg.de> @@ -20,37 +22,24 @@ # You should have received a copy of the GNU Affero General Public License # along with this program. If not, see <https://www.gnu.org/licenses/>. # +""" +Unit test functions for the scanner. 
+""" -import json -import logging -import os -import warnings -from copy import deepcopy from functools import partial -from os.path import basename, dirname, join from pathlib import Path from tempfile import NamedTemporaryFile from unittest.mock import MagicMock, Mock, patch import caosdb as db -import caosdb.common.models as dbmodels import pytest import yaml -from caoscrawler.crawl import (Crawler, SecurityMode, _treat_deprecated_prefix, - crawler_main, split_restricted_path) +from caoscrawler.crawl import Crawler from caoscrawler.debug_tree import DebugTree -from caoscrawler.identifiable import Identifiable -from caoscrawler.identifiable_adapters import (CaosDBIdentifiableAdapter, - IdentifiableAdapter, - LocalStorageIdentifiableAdapter) from caoscrawler.scanner import (create_converter_registry, load_definition, scan_directory, scan_structure_elements) -from caoscrawler.stores import GeneralStore, RecordStore from caoscrawler.structure_elements import (DictElement, DictListElement, DictTextElement, File) -from caosdb.apiutils import compare_entities -from caosdb.cached import cache_clear -from caosdb.exceptions import EmptyUniqueQueryError from pytest import raises from utils import dircheckstr as dircheck_base @@ -210,7 +199,6 @@ def test_record_generation(): # Try each record to check for i, check_prop in enumerate(check_props): matches = True - # breakpoint() # Verify that all props are in the record and have the right value for pr in check_prop: if rec.get_property(pr) is None: diff --git a/unittests/test_transformers.py b/unittests/test_transformers.py new file mode 100644 index 0000000000000000000000000000000000000000..ac530c9147fd7a4c86fa2ef668b6366a722935b0 --- /dev/null +++ b/unittests/test_transformers.py @@ -0,0 +1,81 @@ +#!/usr/bin/env python3 +# encoding: utf-8 +# +# This file is a part of the CaosDB Project. 
def test_simple_transformer():
    """
    Scan the ``test_transformers`` example directory with its ``cfood.yml`` and
    check the properties of the returned records.

    For ``DayFolder`` records the expected ``Day`` and ``DaySplit`` values are
    checked for each known ``DayShort`` value; for ``Number`` records the
    expected ``num`` value is checked. Any other record fails the test.
    """

    example_dir = UNITTESTDIR / "test_directories" / "test_transformers"
    records = scan_directory(example_dir, example_dir / "cfood.yml")

    for r in records:
        parent_name = r.parents[0].name
        if parent_name == "DayFolder":
            assert r.get_property("Day") is not None
            assert r.get_property("DayShort") is not None
            day_short = r.get_property("DayShort").value
            # The variable must have been substituted, not copied literally.
            assert day_short != "$day_short"
            if day_short == "Unk":
                # This unknown folder should not lead to a replacement
                assert r.get_property("Day").value == "Unk"
                assert r.get_property("DaySplit").value == ["Unk"]
            elif day_short == "Mon":
                assert r.get_property("Day").value == "Monday"
                assert r.get_property("DaySplit").value == ["M", "n"]
            elif day_short == "Tue":
                assert r.get_property("Day").value == "Tuesday"
                assert r.get_property("DaySplit").value == ["Tue"]
            else:
                # Unexpected occurrence of a short form, something is wrong
                # with the test directories. pytest.fail (unlike a bare
                # `assert False`) is not stripped under -O and reports why.
                pytest.fail(f"Unexpected DayShort value: {day_short!r}")
        elif parent_name == "Number":
            assert r.get_property("num") is not None
            assert r.get_property("num").value == "'12345 5 '"
        else:
            # Unknown record type, something is wrong with the test directories.
            pytest.fail(f"Unexpected record parent: {parent_name!r}")