diff --git a/CHANGELOG.md b/CHANGELOG.md index 29013891d7f53dd4c4d8164e79eecfa169fcb289..583b6aedaaabd51a153760a846cdc046d33865fa 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -23,9 +23,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Removed ### ### Fixed ### -- Empty Records can now be created (https://gitlab.com/caosdb/caosdb-crawler/-/issues/27) - +* Empty Records can now be created (https://gitlab.com/caosdb/caosdb-crawler/-/issues/27) * [#58](https://gitlab.com/caosdb/caosdb-crawler/-/issues/58) Documentation builds API docs in pipeline now. +* [#117](https://gitlab.indiscale.com/caosdb/src/caosdb-crawler/-/issues/117) `replace_variable` does no longer unnecessarily change the type. Values stored in variables in a CFood can have now other types. ### Security ### diff --git a/src/caoscrawler/converters.py b/src/caoscrawler/converters.py index ca6b8e5d8a4d9b5244e7a8fbd547ce9bd12b5826..aecbda9c18dd506d62abfd6397ef9bad9e0823ae 100644 --- a/src/caoscrawler/converters.py +++ b/src/caoscrawler/converters.py @@ -33,7 +33,7 @@ import re import warnings from abc import ABCMeta, abstractmethod from string import Template -from typing import List, Optional, Tuple, Union +from typing import Any, List, Optional, Tuple, Union import caosdb as db import pandas as pd @@ -52,7 +52,7 @@ from .utils import has_parent # by the converters: SPECIAL_PROPERTIES = ("description", "name", "id", "path", "file", "checksum", "size") - +SINGLE_VAR_RE = re.compile(r"^\$(\{)?(?P<varname>[0-9a-zA-Z_]+)(\})?$") logger = logging.getLogger(__name__) @@ -128,37 +128,34 @@ def create_path_value(func): return inner -def replace_variables(propvalue, values: GeneralStore): +def replace_variables(propvalue: Any, values: GeneralStore): """ This function replaces variables in property values (and possibly other locations, where the crawler can replace cfood-internal variables). - This function checks whether the value that is to be replaced is of type db.Entity. - In this case the entity is returned (note that this is of course only possible, if the - occurrence of the variable is directly at the beginning of the value and e.g. no string - concatenation is attempted. - - In any other case the variable substitution is carried out and a new string with the - replaced variables is returned. + If `propvalue` is a single variable name preceeded with a '$' (e.g. '$var' or '${var}'), then + the corresponding value stored in `values` is returned. + In any other case the variable substitution is carried out as defined by string templates + and a new string with the replaced variables is returned. """ + # We only replace string variable names. If it is not a string the value stays unchanged + if not isinstance(propvalue, str): + return propvalue + # Check if the replacement is a single variable containing a record: - match = re.match(r"^\$(\{)?(?P<varname>[0-9a-zA-Z_]+)(\})?$", propvalue) + match = SINGLE_VAR_RE.match(propvalue) if match is not None: varname = match.group("varname") if varname in values: - if values[varname] is None: - return None - if isinstance(values[varname], db.Entity): - return values[varname] + return values[varname] - if propvalue[1:] in values: - return values[propvalue[1:]] propvalue_template = CrawlerTemplate(propvalue) return propvalue_template.safe_substitute(**values.get_storage()) def handle_value(value: Union[dict, str, list], values: GeneralStore): - """Determine whether the given value needs to set a property, be added to an existing value (create a list) or + """Determine whether the given value needs to set a property, + be added to an existing value (create a list) or add as an additional property (multiproperty). Variable names (starting with a "$") are replaced by the corresponding value stored in the @@ -408,16 +405,11 @@ class Converter(object, metaclass=ABCMeta): raise RuntimeError("No functions given for transformer!") if not isinstance(transformer["functions"], list): raise RuntimeError("The value corresponding to the 'functions' key in the " - "transform section must be a dict") + "transform section must be a list") - # Currently overwriting the input variable would be possible. - # Shall we prevent that? - # Forbid overwriting existing variables at all? - in_expr = transformer["in"] - out_var = transformer["out"] - - in_expr_template = CrawlerTemplate(in_expr) - in_value = in_expr_template.safe_substitute(**values.get_storage()) + if not isinstance(transformer["in"], str): + raise RuntimeError("You should provide the variable name as string") + in_value = replace_variables(transformer["in"], values) for tr_func_el in transformer["functions"]: if not isinstance(tr_func_el, dict): @@ -450,9 +442,13 @@ class Converter(object, metaclass=ABCMeta): out_value = tr_func(in_value, tr_func_params) # The next in_value is the current out_value: in_value = out_value - # If everything succeeded, store the final value in the general store: - values[out_var] = out_value + match = SINGLE_VAR_RE.match(transformer["out"]) + if match is None: + raise RuntimeError("'out' of the transformer definition must specify a single" + " variable name") + print(f"set {match.group('varname')} to {out_value}") + values[match.group('varname')] = out_value @abstractmethod def create_children(self, values: GeneralStore, diff --git a/unittests/test_converters.py b/unittests/test_converters.py index f4a41c564b18c21b9ece0ca5551db6d614a03e3f..3ee20dd8b0ded7ed66c24be6facfa12a6a7c7ef1 100644 --- a/unittests/test_converters.py +++ b/unittests/test_converters.py @@ -39,9 +39,10 @@ from caoscrawler.converters import (Converter, ConverterValidationError, DictIntegerElementConverter, DirectoryConverter, FloatElementConverter, IntegerElementConverter, JSONFileConverter, + ListElementConverter, MarkdownFileConverter, YAMLFileConverter, _AbstractScalarValueElementConverter, - handle_value) + handle_value, replace_variables) from caoscrawler.crawl import Crawler from caoscrawler.scanner import (_load_definition_from_yaml_dict, create_converter_registry, load_definition) @@ -50,6 +51,7 @@ from caoscrawler.structure_elements import (BooleanElement, DictElement, Directory, File, FloatElement, IntegerElement, ListElement, TextElement) +from caoscrawler.transformer_functions import split UNITTESTDIR = Path(__file__).parent @@ -339,6 +341,12 @@ def test_variable_replacement(): values["a"] = 4 values["b"] = "68" + # basic values stay unchanged + assert replace_variables(5, values) is 5 + assert replace_variables(True, values) is True + assert replace_variables("$a", values) is 4 + assert replace_variables("${b}", values) == "68" + assert handle_value("b", values) == ("b", "single") assert handle_value("+b", values) == ("b", "list") assert handle_value("*b", values) == ("b", "multiproperty") @@ -363,6 +371,38 @@ def test_variable_replacement(): assert handle_value(["$a", "$b"], values) == ([4, "68"], "single") +def test_apply_transformers(converter_registry): + cfood_def = {"type": 'ListElement', "debug_match": True, "match_name": ".*", + 'transform': {'test': {'in': '$a', 'out': '$b', 'functions': [{ + 'split': {'marker': '|'}}]}}} + values = GeneralStore() + values["a"] = "a|b|c" + + # transformer_functions = create_transformer_registry(crawler_definition) + transformer_functions = { + "split": { + "function": split, + "package": "caoscrawler.transformer_functions"}} + + conv = ListElementConverter(definition=cfood_def, name='test', + converter_registry=converter_registry) + + assert values['a'] is "a|b|c" + conv.apply_transformers(values, transformer_functions) + assert values['a'] is "a|b|c" + assert values['b'] == ["a", "b", "c"] + + # Check replacing of existing variable + cfood_def = {"type": 'ListElement', "debug_match": True, "match_name": ".*", + 'transform': {'test': {'in': '$a', 'out': '$a', 'functions': [{ + 'split': {'marker': '|'}}]}}} + conv = ListElementConverter(definition=cfood_def, name='test', + converter_registry=converter_registry) + + conv.apply_transformers(values, transformer_functions) + assert values['a'] == ["a", "b", "c"] + + def test_filter_children_of_directory(converter_registry, capsys): """Verify that children (i.e., files) in a directory are filtered or sorted correctly. """ test_dir = Directory("examples_filter_children", UNITTESTDIR /