From e15a44bbacacc5c68d5f441f1dc1f7b55f729fc3 Mon Sep 17 00:00:00 2001
From: Alexander Schlemmer <alexander@mail-schlemmer.de>
Date: Fri, 11 Aug 2023 15:22:19 +0200
Subject: [PATCH] ENH: draft of transformer function application

---
 src/caoscrawler/converters.py                 | 74 ++++++++++++++++++-
 src/caoscrawler/scanner.py                    |  6 +-
 .../test_transformers/cfood.yml               |  2 +-
 unittests/test_scanner.py                     |  1 -
 unittests/test_transformers.py                | 11 ++-
 5 files changed, 85 insertions(+), 9 deletions(-)

diff --git a/src/caoscrawler/converters.py b/src/caoscrawler/converters.py
index c8f4d229..e66581ce 100644
--- a/src/caoscrawler/converters.py
+++ b/src/caoscrawler/converters.py
@@ -356,8 +356,16 @@ class Converter(object, metaclass=ABCMeta):
         Extract information from the structure element and store them as values in the
         general store.
 
-        values: The GeneralStore to store values in.
-        element: The StructureElement to extract values from.
+        This function calls apply_transformers after all values are created.
+
+        Parameters:
+        ------------
+        
+        values: GeneralStore
+            The GeneralStore to store values in.
+        
+        element: StructureElement
+            The StructureElement to extract values from.
         """
         m = self.match(element)
         if m is None:
@@ -365,6 +373,68 @@ class Converter(object, metaclass=ABCMeta):
             raise RuntimeError("Condition does not match.")
         values.update(m)
 
+        self.apply_transformers(values)
+
+    def apply_transformers(self, values: GeneralStore,
+                           transformer_funcions: dict):
+        """
+        Check if transformers are defined using the "transform" keyword.
+        Then apply the transformers to the variables defined in the GeneralStore "values".
+
+        Parameters:
+        ------------
+        
+        values: GeneralStore
+            The GeneralStore to store values in.
+
+        transformer_functions: dict
+            A dictionary of registered functions that can be used within this transformer block.
+        """
+
+        if "transform" in self.definition:
+            for transformer in self.definition["transform"]:
+                if "in" not in transformer:
+                    raise RuntimeError("In-variable not defined!")
+                if "out" not in transformer:
+                    raise RuntimeError("Out-variable not defined!")
+                if "functions" not in transformer:
+                    raise RuntimeError("No functions given for transformer!")
+                
+                # NOTE: Currently it is possible to overwrite the input variable.
+                # Should we prevent that?
+                # Or should overwriting existing variables be forbidden altogether?
+                in_expr = transformer["in"]
+                out_var = transformer["out"]
+
+                in_expr_template = CrawlerTemplate(in_expr)
+                in_value = in_expr_template.safe_substitute(**values.get_storage())
+
+                for tr_func_key, tr_func_value in transformer["functions"].items():
+                    # These functions are a list of functions that need to be registered
+                    # in the dictionary of registered transformer_functions.
+                    # Each function is a dictionary:
+                    # - The key is the name of the function to be looked up in the dictionary
+                    #   of registered transformer functions.
+                    # - The value is a dictionary with key, value-assignments which will be
+                    #   passed to the transformer function.
+                    # The transformer function needs to be of the form:
+                    #
+                    # def func(in_value: Any, in_parameters: dict) -> Any:
+                    #     pass
+                    #
+                    if tr_func_key not in transformer_funcions:
+                        raise RuntimeError("Unknown transformer function: {}".format(tr_func_key))
+
+                    # Retrieve the function from the dictionary:
+                    tr_func = transformer_funcions[tr_func_key]
+                    # Call the function:
+                    out_value = tr_func(in_value, tr_func_value)
+                    # The next in_value is the current out_value:
+                    in_value = out_value
+
+                # If everything succeeded, store the final value in the general store:
+                values[out_var] = out_value
+
     @abstractmethod
     def create_children(self, values: GeneralStore,
                         element: StructureElement):
diff --git a/src/caoscrawler/scanner.py b/src/caoscrawler/scanner.py
index c5e078c5..2f95fafb 100644
--- a/src/caoscrawler/scanner.py
+++ b/src/caoscrawler/scanner.py
@@ -110,6 +110,9 @@ def _load_definition_from_yaml_dict(crawler_definitions: list[dict]):
             for key in metadata["Converters"]:
                 schema["cfood"]["$defs"]["converter"]["properties"]["type"]["enum"].append(
                     key)
+        # New section for custom transformers:
+        if "Transformers" in metadata:
+            raise NotImplementedError()
 
     # Validate the cfood schema:
     validate(instance=crawler_definition, schema=schema["cfood"])
@@ -280,7 +283,8 @@ def scanner(items: list[StructureElement],
                     os.path.join(*(structure_elements_path + [element.get_name()])))
 
                 # extracts values from structure element and stores them in the
-                # variable store
+                # variable store.
+                # Additionally, apply transformers if there are any.
                 converter.create_values(general_store_copy, element)
 
                 keys_modified = converter.create_records(
diff --git a/unittests/test_directories/test_transformers/cfood.yml b/unittests/test_directories/test_transformers/cfood.yml
index 643af4b3..2bea8259 100644
--- a/unittests/test_directories/test_transformers/cfood.yml
+++ b/unittests/test_directories/test_transformers/cfood.yml
@@ -11,7 +11,7 @@ RootDir:
       transform:
         MakeDayLong:
           in: $day_short
-          out: $day_long
+          out: day_long  # no dollar sign here, because this is a variable name, not an expression
           functions:
           - ifelse:  # name of the function
               match: Mon  # match is one specific argument
diff --git a/unittests/test_scanner.py b/unittests/test_scanner.py
index 1233370d..9c271efd 100644
--- a/unittests/test_scanner.py
+++ b/unittests/test_scanner.py
@@ -200,7 +200,6 @@ def test_record_generation():
             # Try each record to check
             for i, check_prop in enumerate(check_props):
                 matches = True
-                # breakpoint()
                 # Verify that all props are in the record and have the right value
                 for pr in check_prop:
                     if rec.get_property(pr) is None:
diff --git a/unittests/test_transformers.py b/unittests/test_transformers.py
index 81df2253..744b321b 100644
--- a/unittests/test_transformers.py
+++ b/unittests/test_transformers.py
@@ -59,12 +59,15 @@ def test_simple_transformer():
     records = scan_directory(UNITTESTDIR / "test_directories" / "test_transformers",
                              UNITTESTDIR / "test_directories" / "test_transformers" /
                              "cfood.yml")
-
+    breakpoint()
     for r in records:
         assert r.get_property("Day") is not None
         assert r.get_property("DayShort") is not None
-        assert r.get_property("Day").value != "$day_long"
         assert r.get_property("DayShort").value != "$day_short"
+        if r.get_property("DayShort").value == "Unk":
+            # This unknown folder should not lead to a replacement
+            assert r.get_property("Day").value == "$day_long"
+        else:
+            assert r.get_property("Day").value != "$day_long"
 
-    # breakpoint()
-    assert False
+    
-- 
GitLab