@staticmethod
def replace_name_with_referenced_entity_id(prop: db.Property) -> None:
    """Replace entity names used as reference values by the IDs of those entities.

    The given property is changed in place.

    - If the Property has a List datatype, each element is treated separately
      and looked up using the *element* datatype.
    - If the (element) datatype is not a reference, or is one of the generic
      reference types FILE or REFERENCE, the value stays unchanged.
    - Values (or list elements) that are not strings stay unchanged.
    - Every string is looked up with a query using the datatype and the
      string as name; if an Entity is identified, the string is replaced by
      its ID.

    Parameters
    ----------
    prop : db.Property
        The property whose value shall be resolved.

    Raises
    ------
    db.EmptyUniqueQueryError, db.QueryNotUniqueError
        If the query for a name does not identify exactly one Entity. The
        error is logged and re-raised; ``prop.value`` has not been modified
        in that case (for lists, the new value is only assigned after all
        elements were resolved).
    """
    def _is_specific_reference(datatype) -> bool:
        # FILE and REFERENCE are generic: they do not name a RecordType that
        # could be used to narrow down the query.
        return (is_reference(datatype)
                and datatype != db.FILE and datatype != db.REFERENCE)

    def _lookup_id(datatype, name: str):
        # NOTE(review): ``name`` is inserted into the query unescaped; a name
        # containing a single quote presumably breaks the query - confirm.
        try:
            # cached_get_entity_by raises an error if the result is not unique
            return cached_get_entity_by(
                query=f"FIND Entity {datatype} with name='{name}'").id
        except (db.EmptyUniqueQueryError, db.QueryNotUniqueError):
            logger.error(
                "The Property %s with datatype=%s has the value %s and there is no "
                "appropriate Entity with such a name.",
                prop.name, prop.datatype, name)
            raise

    element_type = get_list_datatype(prop.datatype)

    if element_type is None:  # not a list
        # Check the value first so that non-string values never reach the
        # datatype inspection (same evaluation order as before).
        if isinstance(prop.value, str) and _is_specific_reference(prop.datatype):
            prop.value = _lookup_id(prop.datatype, prop.value)
        return

    # A list property without a value has nothing to resolve.
    if prop.value is None or not _is_specific_reference(element_type):
        return

    # Query with the element datatype and the single element, not with the
    # LIST datatype and the complete list value.
    prop.value = [
        _lookup_id(element_type, element) if isinstance(element, str) else element
        for element in prop.value
    ]
def replace(in_value: Any, in_parameters: dict):
    """Apply ``str.replace`` to ``in_value``.

    The substring to search for is taken from the key 'remove' and its
    substitute from the key 'insert' of ``in_parameters``.

    Raises a RuntimeError if one of the two keys is missing or if
    ``in_value`` is not a string. The parameters are checked first.
    """
    if not all(key in in_parameters for key in ("remove", "insert")):
        raise RuntimeError("Mandatory parameter missing.")
    if isinstance(in_value, str):
        old, new = in_parameters['remove'], in_parameters['insert']
        return in_value.replace(old, new)
    raise RuntimeError("must be string")
def mock_get_entity_by_query(query=None):
    """Stand-in for ``cached_get_entity_by``: any query yields the same Record."""
    if query is None:
        return None
    return db.Record(id=1111, name='rec_name').add_parent('RT')


@patch("caoscrawler.crawl.cached_get_entity_by",
       new=Mock(side_effect=mock_get_entity_by_query))
def test_replace_name_with_referenced_entity():
    """Names are replaced by IDs only for string values of non-generic references."""
    test_text = 'lkajsdf'
    test_int = 134343
    test_id = 1111
    test_name = 'rec_name'

    # (datatype, value) pairs that must be left alone, in this order:
    # non-ref datatype, generic-ref datatype, file-ref datatype, non-str value
    untouched_cases = (
        (db.TEXT, test_text),
        (db.REFERENCE, test_text),
        (db.FILE, test_text),
        ("RT", test_int),
    )

    # do not touch scalar Properties of those kinds
    for datatype, value in untouched_cases:
        prop = db.Property(name='a', datatype=datatype, value=value)
        Crawler.replace_name_with_referenced_entity_id(prop)
        assert prop.value is value

    # no LinkAhead access until here
    assert caoscrawler.crawl.cached_get_entity_by.call_count == 0

    # change Properties with custom dt and str value
    prop = db.Property(name='a', datatype="RT", value=test_name)
    Crawler.replace_name_with_referenced_entity_id(prop)
    assert isinstance(prop.value, int)
    assert prop.value == test_id
    assert caoscrawler.crawl.cached_get_entity_by.call_count == 1

    # do not touch LIST Properties of the same kinds
    for element_type, value in untouched_cases:
        prop = db.Property(name='a', datatype=db.LIST(element_type), value=[value])
        Crawler.replace_name_with_referenced_entity_id(prop)
        assert prop.value[0] is value

    # change Properties with custom dt and str value (LIST); the Record in
    # the middle is not a string and has to survive untouched
    prop = db.Property(name='a', datatype=db.LIST("RT"),
                       value=[test_name, db.Record(name='hi'), test_name])
    Crawler.replace_name_with_referenced_entity_id(prop)
    first, second, third = prop.value[0], prop.value[1], prop.value[2]
    assert isinstance(first, int)
    assert first == test_id
    assert isinstance(second, db.Entity)
    assert second.name == "hi"
    assert isinstance(third, int)
    assert third == test_id
    assert caoscrawler.crawl.cached_get_entity_by.call_count == 3
@pytest.fixture
def converter_registry():
    """Provide a converter registry with the converter classes already loaded.

    Each entry holds the converter name, its package and, under "class", the
    class object imported from that package.
    """
    package = "caoscrawler.converters"
    converter_names = {
        "Directory": "DirectoryConverter",
        "MarkdownFile": "MarkdownFileConverter",
        "Date": "DateElementConverter",
        "DictElement": "DictElementConverter",
        "TextElement": "TextElementConverter",
        "ListElement": "ListElementConverter",
        "JSONFile": "JSONFileConverter",
    }

    registry = {}
    for key, converter in converter_names.items():
        module = importlib.import_module(package)
        registry[key] = {
            "converter": converter,
            "package": package,
            "class": getattr(module, converter),
        }
    return registry


def _assert_replace_applied(converter_registry, transformer_functions):
    """Run the 'replace' transformer on ``$a`` and check the result in ``$b``."""
    cfood_def = {"type": 'ListElement', "match_name": ".*",
                 'transform': {'test': {'in': '$a', 'out': '$b', 'functions': [{
                     'replace': {'insert': ':', "remove": "_"}}]}}}
    values = GeneralStore()
    values["a"] = "16_45"

    conv = ListElementConverter(definition=cfood_def, name='test',
                                converter_registry=converter_registry)

    conv.apply_transformers(values, transformer_functions)
    assert values['b'] == "16:45"


def test_apply_replace(converter_registry):
    """The replace function works when handed over directly."""
    _assert_replace_applied(converter_registry, {"replace": replace})


def test_apply_replace_from_def(converter_registry):
    """The replace function is available in the registry created from an empty definition."""
    _assert_replace_applied(converter_registry, create_transformer_registry({}))