diff --git a/src/caoscrawler/converters.py b/src/caoscrawler/converters.py index 5dba0726bba5351cb75ab363985daa639a6e56ac..ca6b8e5d8a4d9b5244e7a8fbd547ce9bd12b5826 100644 --- a/src/caoscrawler/converters.py +++ b/src/caoscrawler/converters.py @@ -151,6 +151,8 @@ def replace_variables(propvalue, values: GeneralStore): if isinstance(values[varname], db.Entity): return values[varname] + if propvalue[1:] in values: + return values[propvalue[1:]] propvalue_template = CrawlerTemplate(propvalue) return propvalue_template.safe_substitute(**values.get_storage()) @@ -204,7 +206,6 @@ out: tuple # being able to directly set list values. Semantics is, however, a bit # different from the two cases above. collection_mode = "single" - propvalue = value # variables replacement: propvalue = list() @@ -405,7 +406,7 @@ class Converter(object, metaclass=ABCMeta): raise RuntimeError("Out-variable not defined!") if "functions" not in transformer: raise RuntimeError("No functions given for transformer!") - if not isinstance(self.definition["functions"], list): + if not isinstance(transformer["functions"], list): raise RuntimeError("The value corresponding to the 'functions' key in the " "transform section must be a dict") @@ -427,7 +428,7 @@ class Converter(object, metaclass=ABCMeta): " one element with they key being the name" " of the function!") tr_func_key = list(tr_func_el.keys())[0] - tr_func_value = tr_func_el[tr_func_key] + tr_func_params = tr_func_el[tr_func_key] # These functions are a list of functions that need to be registered # in the dictionary of registered transformer_functions. 
# Each function is a dictionary: @@ -446,7 +447,7 @@ class Converter(object, metaclass=ABCMeta): # Retrieve the function from the dictionary: tr_func = transformer_functions[tr_func_key]["function"] # Call the function: - out_value = tr_func(in_value, tr_func_value) + out_value = tr_func(in_value, tr_func_params) # The next in_value is the current out_value: in_value = out_value diff --git a/src/caoscrawler/default_transformers.yml b/src/caoscrawler/default_transformers.yml index 5c039815df4f022a31a90b8a6c9b4dd961e3b04d..a4967440c1c8a020019dbed1ac6b18cb637269f3 100644 --- a/src/caoscrawler/default_transformers.yml +++ b/src/caoscrawler/default_transformers.yml @@ -3,3 +3,6 @@ ifelse: package: caoscrawler.transformer_functions function: ifelse +split: + package: caoscrawler.transformer_functions + function: split diff --git a/src/caoscrawler/transformer_functions.py b/src/caoscrawler/transformer_functions.py index b68c9f64d9f2da8b1d51d35598c579bfee7511ab..5746cb0a91957c7363c935d260dcc7477aa1481b 100644 --- a/src/caoscrawler/transformer_functions.py +++ b/src/caoscrawler/transformer_functions.py @@ -1,15 +1,43 @@ #!/usr/bin/env python3 -# Defnition of default transformer functions. -# A. Schlemmer, 08/2023 +# encoding: utf-8 +# +# This file is a part of the LinkAhead Project. +# +# Copyright (C) 2024 Indiscale GmbH <info@indiscale.com> +# Copyright (C) 2024 Henrik tom Wörden <h.tomwoerden@indiscale.com> +# Copyright (C) 2023 Alexander Schlemmer <alexander.schlemmer@ds.mpg.de> +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <https://www.gnu.org/licenses/>. +""" +Definition of default transformer functions. +""" +import re from typing import Any -import re def ifelse(in_value: Any, in_parameters: dict): if "match" not in in_parameters or "then" not in in_parameters: raise RuntimeError("Mandatory parameters missing.") - if re.match(in_parameters["match"], in_value) is not None: return in_parameters["then"] return in_value + + +def split(in_value: Any, in_parameters: dict): + if "marker" not in in_parameters: + raise RuntimeError("Mandatory parameter missing.") + if not isinstance(in_value, str): + raise RuntimeError("must be string") + return in_value.split(in_parameters['marker']) diff --git a/unittests/test_directories/test_transformers/cfood.yml b/unittests/test_directories/test_transformers/cfood.yml index 2bea82596bdfe5fa634cedadd73eae6df242a8a0..84066a4c953bacd757398207ff9bf601fa0a6564 100644 --- a/unittests/test_directories/test_transformers/cfood.yml +++ b/unittests/test_directories/test_transformers/cfood.yml @@ -19,7 +19,14 @@ RootDir: - ifelse: # next function match: Tue then: Tuesday + TestSplit: + in: $day_short + out: day_split # no dollar sign here, because this is a variable name and no expression + functions: + - split: + marker: o records: DayFolder: Day: $day_long DayShort: $day_short # just for checking, whether this variable remains + DaySplit: $day_split # just for checking, whether this variable remains diff --git a/unittests/test_transformers.py b/unittests/test_transformers.py index c7cfe030f9b424db7de946cd3c0f4f4c74d4c822..47b31d0b0173786d58b2e7169342dea9fb11a03e 100644 --- a/unittests/test_transformers.py +++ b/unittests/test_transformers.py @@ -42,13 +42,8 @@ from caoscrawler.structure_elements import (DictElement, DictListElement, DictTextElement, File) from pytest import raises 
-from utils import dircheckstr as dircheck_base - UNITTESTDIR = Path(__file__).parent -dircheckstr = partial(dircheck_base, UNITTESTDIR / "test_directories" / "examples_article") - - def test_simple_transformer(): """ @@ -59,7 +54,7 @@ def test_simple_transformer(): records = scan_directory(UNITTESTDIR / "test_directories" / "test_transformers", UNITTESTDIR / "test_directories" / "test_transformers" / "cfood.yml") - + for r in records: assert r.get_property("Day") is not None assert r.get_property("DayShort") is not None @@ -67,10 +62,12 @@ def test_simple_transformer(): if r.get_property("DayShort").value == "Unk": # This unkown folder should not lead to a replacement assert r.get_property("Day").value == "Unk" + assert r.get_property("DaySplit").value == ["Unk"] elif r.get_property("DayShort").value == "Mon": assert r.get_property("Day").value == "Monday" + assert r.get_property("DaySplit").value == ["M", "n"] elif r.get_property("DayShort").value == "Tue": assert r.get_property("Day").value == "Tuesday" + assert r.get_property("DaySplit").value == ["Tue"] else: - assert r.get_property("Day").value != "$day_long" - + raise RuntimeError("There is no other short version!")