From e15a44bbacacc5c68d5f441f1dc1f7b55f729fc3 Mon Sep 17 00:00:00 2001 From: Alexander Schlemmer <alexander@mail-schlemmer.de> Date: Fri, 11 Aug 2023 15:22:19 +0200 Subject: [PATCH] ENH: draft of transformer function application --- src/caoscrawler/converters.py | 74 ++++++++++++++++++- src/caoscrawler/scanner.py | 6 +- .../test_transformers/cfood.yml | 2 +- unittests/test_scanner.py | 1 - unittests/test_transformers.py | 11 ++- 5 files changed, 85 insertions(+), 9 deletions(-) diff --git a/src/caoscrawler/converters.py b/src/caoscrawler/converters.py index c8f4d229..e66581ce 100644 --- a/src/caoscrawler/converters.py +++ b/src/caoscrawler/converters.py @@ -356,8 +356,16 @@ class Converter(object, metaclass=ABCMeta): Extract information from the structure element and store them as values in the general store. - values: The GeneralStore to store values in. - element: The StructureElement to extract values from. + This function calls apply transformers after all values are created. + + Parameters: + ------------ + + values: GeneralStore + The GeneralStore to store values in. + + element: StructureElement + The StructureElement to extract values from. """ m = self.match(element) if m is None: @@ -365,6 +373,68 @@ class Converter(object, metaclass=ABCMeta): raise RuntimeError("Condition does not match.") values.update(m) + self.apply_transformers(values) + + def apply_transformers(self, values: GeneralStore, + transformer_funcions: dict): + """ + Check if transformers are defined using the "transform" keyword. + Then apply the transformers to the variables defined in GeneralStore "values". + + Parameters: + ------------ + + values: GeneralStore + The GeneralStore to store values in. + + transformer_functions: dict + A dictionary of registered functions that can be used within this transformer block. 
+ """ + + if "transform" in self.definition: + for transformer in self.definition["transform"]: + if "in" not in transformer: + raise RuntimeError("In-variable not defined!") + if "out" not in transformer: + raise RuntimeError("Out-variable not defined!") + if "functions" not in transformer: + raise RuntimeError("No functions given for transformer!") + + # Currently overwriting the input variable would be possible. + # Shall we prevent that? + # Forbid overwriting existing variables at all? + in_expr = transformer["in"] + out_var = transformer["out"] + + in_expr_template = CrawlerTemplate(in_expr) + in_value = in_expr_template.safe_substitute(**values.get_storage()) + + for tr_func_key, tr_func_value in transformer["functions"].items(): + # These functions are a list of functions that need to be registered + # in the dictionary of registered transformer_functions. + # Each function is a dictionary: + # - The key is the name of the function to be looked up in the dictionary + # of registered transformer functions. + # - The value is a dictionary with key, value-assignments which will be + # passed to the transformer function. 
+ # The transformer function needs to be of the form: + # + # def func(in_value: Any, in_parameters: dict) -> Any: + # pass + # + if tr_func_key not in transformer_funcions: + raise RuntimeError("Unknown transformer function: {}".format(tr_func_key)) + + # Retrieve the function from the dictionary: + tr_func = transformer_funcions[tr_func_key] + # Call the function: + out_value = tr_func(in_value, tr_func_value) + # The next in_value is the current out_value: + in_value = out_value + + # If everything succeeded, store the final value in the general store: + values[out_var] = out_value + @abstractmethod def create_children(self, values: GeneralStore, element: StructureElement): diff --git a/src/caoscrawler/scanner.py b/src/caoscrawler/scanner.py index c5e078c5..2f95fafb 100644 --- a/src/caoscrawler/scanner.py +++ b/src/caoscrawler/scanner.py @@ -110,6 +110,9 @@ def _load_definition_from_yaml_dict(crawler_definitions: list[dict]): for key in metadata["Converters"]: schema["cfood"]["$defs"]["converter"]["properties"]["type"]["enum"].append( key) + # New section for custom transformers: + if "Transformers" in metadata: + raise NotImplementedError() # Validate the cfood schema: validate(instance=crawler_definition, schema=schema["cfood"]) @@ -280,7 +283,8 @@ def scanner(items: list[StructureElement], os.path.join(*(structure_elements_path + [element.get_name()]))) # extracts values from structure element and stores them in the - # variable store + # variable store. + # Additionally also apply transformers if there are any. 
converter.create_values(general_store_copy, element) keys_modified = converter.create_records( diff --git a/unittests/test_directories/test_transformers/cfood.yml b/unittests/test_directories/test_transformers/cfood.yml index 643af4b3..2bea8259 100644 --- a/unittests/test_directories/test_transformers/cfood.yml +++ b/unittests/test_directories/test_transformers/cfood.yml @@ -11,7 +11,7 @@ RootDir: transform: MakeDayLong: in: $day_short - out: $day_long + out: day_long # no dollar sign here, because this is a variable name and not an expression functions: - ifelse: # name of the function match: Mon # match is one specific argument diff --git a/unittests/test_scanner.py b/unittests/test_scanner.py index 1233370d..9c271efd 100644 --- a/unittests/test_scanner.py +++ b/unittests/test_scanner.py @@ -200,7 +200,6 @@ def test_record_generation(): # Try each record to check for i, check_prop in enumerate(check_props): matches = True - # breakpoint() # Verify that all props are in the record and have the right value for pr in check_prop: if rec.get_property(pr) is None: diff --git a/unittests/test_transformers.py b/unittests/test_transformers.py index 81df2253..744b321b 100644 --- a/unittests/test_transformers.py +++ b/unittests/test_transformers.py @@ -59,12 +59,15 @@ def test_simple_transformer(): records = scan_directory(UNITTESTDIR / "test_directories" / "test_transformers", UNITTESTDIR / "test_directories" / "test_transformers" / "cfood.yml") - + breakpoint() for r in records: assert r.get_property("Day") is not None assert r.get_property("DayShort") is not None - assert r.get_property("Day").value != "$day_long" assert r.get_property("DayShort").value != "$day_short" + if r.get_property("DayShort").value == "Unk": + # This unknown folder should not lead to a replacement + assert r.get_property("Day").value == "$day_long" + else: + assert r.get_property("Day").value != "$day_long" - # breakpoint() - assert False + -- GitLab