From 6ba50c9b6dd48d33c50a127f860285d3f5b3449c Mon Sep 17 00:00:00 2001
From: Alexander Schlemmer <a.schlemmer@indiscale.com>
Date: Thu, 14 Nov 2024 09:17:22 +0100
Subject: [PATCH] REFACT(converters): match_properties is a generic method of
 converters now

---
 src/caoscrawler/converters/converters.py    | 59 +++++++++++++++++++++
 src/caoscrawler/converters/rocrate.py       | 22 +-------
 src/caoscrawler/converters/xml_converter.py | 28 +---------
 src/caoscrawler/crawl.py                    |  8 +--
 unittests/test_scanner.py                   |  6 +--
 5 files changed, 66 insertions(+), 57 deletions(-)

diff --git a/src/caoscrawler/converters/converters.py b/src/caoscrawler/converters/converters.py
index 64a557ce..40ddde92 100644
--- a/src/caoscrawler/converters/converters.py
+++ b/src/caoscrawler/converters/converters.py
@@ -456,6 +456,85 @@ class Converter(object, metaclass=ABCMeta):
             raise RuntimeError("Condition does not match.")
         values.update(m)
 
+    def match_properties(self, properties: dict, vardict: dict, label: str = "match_properties"):
+        """
+        This method can be used to generically match 'match_properties' from the cfood definition
+        with the behavior described as follows:
+
+        'match_properties' is a dictionary of key-regexp and value-regexp pairs. Each key regexp
+        has to match exactly one property name and the corresponding value regexp has to match
+        the value of that property.
+
+        What a property means in the context of the respective converter can be different, examples:
+        XMLTag: attributes of the node
+        ROCrate: properties of the ROCrateEntity
+        DictElement: properties of the dict
+
+        label can be used to customize the name of the property in the definition.
+
+        This method is not called by default, but can be called from child classes. It only
+        reports the result: callers have to check the return value themselves.
+
+        Arguments:
+        ----------
+
+        properties: dict
+          The dictionary containing the properties to be matched.
+
+        vardict: dict
+          This dictionary will be used to store the variables created during the matching. It is
+          updated in place and may already contain variables of earlier entries when the match
+          fails on a later one.
+
+        label: str
+          Default "match_properties". Can be used to change the name of the property in the
+          definition. E.g. the xml converter uses "match_attrib" which makes more sense in the
+          context of xml trees.
+
+        Returns:
+        --------
+
+        bool
+          True if all entries of the definition match (or if the definition has no entry named
+          label), False otherwise.
+
+        Raises:
+        -------
+
+        RuntimeError
+          If more than one property name matches the same key regexp.
+        """
+        if label not in self.definition:
+            # Nothing to match, so the properties cannot contradict the definition.
+            return True
+
+        for prop_def_key, prop_def_value in self.definition[label].items():
+            match_counter = 0
+            matched_m_prop = None
+            matched_m_prop_value = None
+            for prop_key, prop_value in properties.items():
+                m_prop = re.match(prop_def_key, prop_key)
+                if m_prop is not None:
+                    match_counter += 1
+                    matched_m_prop = m_prop
+                    m_prop_value = re.match(prop_def_value, prop_value)
+                    if m_prop_value is None:
+                        return False
+                    matched_m_prop_value = m_prop_value
+            # TODO: How to deal with multiple matches?
+            #       There are multiple options:
+            #       - Allow multiple attribute-key matches: Leads to possible overwrites of variables
+            #       - Require unique attribute-key and attribute-value matches: Very complex
+            #       - Only allow one single attribute-key to match and run attribute-value match separately.
+            #       Currently the latter option is implemented.
+            if match_counter == 0:
+                return False
+            elif match_counter > 1:
+                raise RuntimeError("Multiple properties match the same {} entry.".format(label))
+            vardict.update(matched_m_prop.groupdict())
+            vardict.update(matched_m_prop_value.groupdict())
+        return True
+
     def apply_transformers(self, values: GeneralStore, transformer_functions: dict):
         """
         Check if transformers are defined using the "transform" keyword.
diff --git a/src/caoscrawler/converters/rocrate.py b/src/caoscrawler/converters/rocrate.py
index 286061ef..e940ba83 100644
--- a/src/caoscrawler/converters/rocrate.py
+++ b/src/caoscrawler/converters/rocrate.py
@@ -176,27 +176,10 @@ class ROCrateEntityConverter(Converter):
                 return None
             vardict.update(m_type.groupdict())
 
-        if "match_properties" in self.definition:
-            # This matcher works analogously to the attributes matcher in the XMLConverter
-            for prop_def_key, prop_def_value in self.definition["match_properties"].items():
-                match_counter = 0
-                matched_m_prop = None
-                matched_m_prop_value = None
-                for prop_key, prop_value in element.entity.properties().items():
-                    m_prop = re.match(prop_def_key, prop_key)
-                    if m_prop is not None:
-                        match_counter += 1
-                        matched_m_prop = m_prop
-                        m_prop_value = re.match(prop_def_value, prop_value)
-                        if m_prop_value is None:
-                            return None
-                        matched_m_prop_value = m_prop_value
-                if match_counter == 0:
-                    return None
-                elif match_counter > 1:
-                    raise RuntimeError("Multiple properties match the same match_prop entry.")
-                vardict.update(matched_m_prop.groupdict())
-                vardict.update(matched_m_prop_value.groupdict())
+        # A property mismatch has to abort the whole match. Compare with False explicitly,
+        # so that a None result is not mistaken for a mismatch.
+        if self.match_properties(element.entity.properties(), vardict) is False:
+            return None
 
         return vardict
 
diff --git a/src/caoscrawler/converters/xml_converter.py b/src/caoscrawler/converters/xml_converter.py
index bd3f6cf0..76d5afff 100644
--- a/src/caoscrawler/converters/xml_converter.py
+++ b/src/caoscrawler/converters/xml_converter.py
@@ -163,33 +163,10 @@ class XMLTagConverter(Converter):
                 return None
             vardict.update(m_text.groupdict())
 
-        if "match_attrib" in self.definition:
-            for attrib_def_key, attrib_def_value in self.definition["match_attrib"].items():
-                match_counter = 0
-                matched_m_attrib = None
-                matched_m_attrib_value = None
-                for attr_key, attr_value in element.tag.attrib.items():
-                    m_attrib = re.match(attrib_def_key, attr_key)
-                    if m_attrib is not None:
-                        match_counter += 1
-                        matched_m_attrib = m_attrib
-                        m_attrib_value = re.match(attrib_def_value, attr_value)
-                        if m_attrib_value is None:
-                            return None
-                        matched_m_attrib_value = m_attrib_value
-                # TODO: How to deal with multiple matches?
-                #       There are multiple options:
-                #       - Allow multiple attribute-key matches: Leads to possible overwrites of variables
-                #       - Require unique attribute-key and attribute-value matches: Very complex
-                #       - Only allow one single attribute-key to match and run attribute-value match separately.
-                #       Currently the latter option is implemented.
-                # TODO: The ROCrateEntityConverter implements a very similar behavior.
-                if match_counter == 0:
-                    return None
-                elif match_counter > 1:
-                    raise RuntimeError("Multiple attributes match the same match_attrib entry.")
-                vardict.update(matched_m_attrib.groupdict())
-                vardict.update(matched_m_attrib_value.groupdict())
+        # An attribute mismatch has to abort the whole match. Compare with False explicitly,
+        # so that a None result is not mistaken for a mismatch.
+        if self.match_properties(element.tag.attrib, vardict, "match_attrib") is False:
+            return None
 
         return vardict
 
diff --git a/src/caoscrawler/crawl.py b/src/caoscrawler/crawl.py
index 9e4e2a80..a79e4434 100644
--- a/src/caoscrawler/crawl.py
+++ b/src/caoscrawler/crawl.py
@@ -39,7 +39,6 @@ import sys
 import traceback
 import uuid
 import warnings
-
 from argparse import RawTextHelpFormatter
 from copy import deepcopy
 from datetime import datetime
@@ -52,13 +51,10 @@ from caosadvancedtools.cache import UpdateCache
 from caosadvancedtools.crawler import Crawler as OldCrawler
 from caosadvancedtools.serverside.helper import send_mail
 from caosadvancedtools.utils import create_entity_link
-from linkahead.apiutils import (compare_entities,
-                                merge_entities)
+from linkahead.apiutils import compare_entities, merge_entities
 from linkahead.cached import cache_clear, cached_get_entity_by
 from linkahead.common.datatype import get_list_datatype, is_reference
-from linkahead.exceptions import (
-    TransactionError,
-)
+from linkahead.exceptions import TransactionError
 from linkahead.utils.escape import escape_squoted_text
 
 from .config import get_config_setting
diff --git a/unittests/test_scanner.py b/unittests/test_scanner.py
index 680e4abe..4fa752b0 100644
--- a/unittests/test_scanner.py
+++ b/unittests/test_scanner.py
@@ -36,9 +36,9 @@ import pytest
 import yaml
 from caoscrawler.crawl import Crawler
 from caoscrawler.debug_tree import DebugTree
-from caoscrawler.scanner import (create_converter_registry, load_definition,
-                                 scan_directory, scan_structure_elements,
-                                 _load_definition_from_yaml_dict)
+from caoscrawler.scanner import (_load_definition_from_yaml_dict,
+                                 create_converter_registry, load_definition,
+                                 scan_directory, scan_structure_elements)
 from caoscrawler.structure_elements import (DictElement, DictListElement,
                                             DictTextElement, File)
 from pytest import raises
-- 
GitLab