def replace(in_value: Any, in_parameters: dict):
    """Substitute every occurrence of one substring in a string value.

    Thin wrapper around ``str.replace``: the substring stored under the key
    'remove' in ``in_parameters`` is replaced by the string stored under the
    key 'insert'.

    Parameters
    ----------
    in_value : Any
        The value to operate on; it has to be a string.
    in_parameters : dict
        Has to contain both of the keys 'remove' and 'insert'.

    Returns
    -------
    The result of ``in_value.replace(remove, insert)``.

    Raises
    ------
    RuntimeError
        If one of the mandatory keys is missing or if ``in_value`` is not a
        string.
    """
    # The parameters are validated before the value, so a missing key is
    # reported even when the value has the wrong type as well.
    has_all_keys = all(key in in_parameters for key in ("remove", "insert"))
    if not has_all_keys:
        raise RuntimeError("Mandatory parameter missing.")
    if isinstance(in_value, str):
        return in_value.replace(in_parameters['remove'], in_parameters['insert'])
    raise RuntimeError("must be string")
b/unittests/test_converters.py index 42b078eb31c4bdfb0bdeea3d2a0b4fe1c4404ca7..52ece13dc2269a3e3b16e6378166e91b084f4a7c 100644 --- a/unittests/test_converters.py +++ b/unittests/test_converters.py @@ -45,13 +45,14 @@ from caoscrawler.converters import (Converter, ConverterValidationError, handle_value, replace_variables) from caoscrawler.crawl import Crawler from caoscrawler.scanner import (_load_definition_from_yaml_dict, - create_converter_registry, load_definition) + create_converter_registry, + create_transformer_registry, load_definition) from caoscrawler.stores import GeneralStore from caoscrawler.structure_elements import (BooleanElement, DictElement, Directory, File, FloatElement, IntegerElement, ListElement, TextElement) -from caoscrawler.transformer_functions import split +from caoscrawler.transformer_functions import replace, split UNITTESTDIR = Path(__file__).parent diff --git a/unittests/test_transformers.py b/unittests/test_transformers.py index ac530c9147fd7a4c86fa2ef668b6366a722935b0..02d932d13cc3fad52048b08e2b9fe56f11db2ae7 100644 --- a/unittests/test_transformers.py +++ b/unittests/test_transformers.py @@ -28,6 +28,7 @@ Currently, this is under development. 
@pytest.fixture
def converter_registry():
    """Build a minimal converter registry for the transformer tests.

    Each entry maps a converter type name to a dict with the keys
    'converter' (class name), 'package' (module path) and 'class' (the class
    object looked up in that module).
    """
    package = "caoscrawler.converters"
    class_names = {
        "Directory": "DirectoryConverter",
        "MarkdownFile": "MarkdownFileConverter",
        "Date": "DateElementConverter",
        "DictElement": "DictElementConverter",
        "TextElement": "TextElementConverter",
        "ListElement": "ListElementConverter",
        "JSONFile": "JSONFileConverter",
    }
    # All converters used here live in the same package, so it is imported once.
    module = importlib.import_module(package)
    return {
        type_name: {
            "converter": class_name,
            "package": package,
            "class": getattr(module, class_name),
        }
        for type_name, class_name in class_names.items()
    }


def test_apply_replace(converter_registry):
    """Apply 'replace' through a converter, using a hand-built function registry."""
    transform_def = {
        'test': {
            'in': '$a',
            'out': '$b',
            'functions': [{'replace': {'insert': ':', "remove": "_"}}],
        },
    }
    cfood_def = {"type": 'ListElement', "match_name": ".*", 'transform': transform_def}
    store = GeneralStore()
    store["a"] = "16_45"

    # Only the function under test is registered here.
    registry = {"replace": replace}

    converter = ListElementConverter(definition=cfood_def, name='test',
                                     converter_registry=converter_registry)
    converter.apply_transformers(store, registry)

    assert store['b'] == "16:45"


def test_apply_replace_from_def(converter_registry):
    """Apply 'replace' using the registry returned by create_transformer_registry({})."""
    transform_def = {
        'test': {
            'in': '$a',
            'out': '$b',
            'functions': [{'replace': {'insert': ':', "remove": "_"}}],
        },
    }
    cfood_def = {"type": 'ListElement', "match_name": ".*", 'transform': transform_def}
    store = GeneralStore()
    store["a"] = "16_45"

    # An empty definition is passed, so 'replace' has to be provided by the
    # registry itself for this test to pass.
    registry = create_transformer_registry({})

    converter = ListElementConverter(definition=cfood_def, name='test',
                                     converter_registry=converter_registry)
    converter.apply_transformers(store, registry)

    assert store['b'] == "16:45"