From ec0d5c3612fda3a2a43a3e15441572ad6ef7d245 Mon Sep 17 00:00:00 2001
From: Alexander Schlemmer <a.schlemmer@indiscale.com>
Date: Wed, 26 Jun 2024 15:42:45 +0200
Subject: [PATCH] ENH: implemented basic matching of structure elements of type
 XMLTagElement

---
 src/caoscrawler/xml_converter.py | 55 +++++++++++++++++++++++++++-----
 unittests/test_xml_converter.py  |  6 ++--
 2 files changed, 51 insertions(+), 10 deletions(-)

diff --git a/src/caoscrawler/xml_converter.py b/src/caoscrawler/xml_converter.py
index 99df9cd7..fe10c738 100644
--- a/src/caoscrawler/xml_converter.py
+++ b/src/caoscrawler/xml_converter.py
@@ -28,15 +28,11 @@ import logging
 import os
 import re
 import warnings
-from abc import ABCMeta, abstractmethod
 from inspect import signature
 from string import Template
 from typing import Any, Callable, Optional, Union
 
 import linkahead as db
-import pandas as pd
-import yaml
-import yaml_header_tools
 from jsonschema import ValidationError, validate
 
 from .stores import GeneralStore, RecordStore
@@ -89,9 +85,52 @@ class XMLTagConverter(Converter):
         # See https://gitlab.indiscale.com/caosdb/src/caosdb-crawler/-/issues/145
         # for a suggestion for the design of the matching algorithm.
         if not isinstance(element, XMLTagElement):
-            raise ArgumentError("Element must be an instance of XMLTagElement.")
-
-        return None
+            raise TypeError("Element must be an instance of XMLTagElement.")
+
+        # Store the result of all individual regexp variable results:
+        vardict = {}
+
+        if "match_tag" in self.definition:
+            m_tag = re.match(self.definition["match_tag"], element.tag.tag)
+            if m_tag is None:
+                return None
+            vardict.update(m_tag.groupdict())
+
+        if "match_text" in self.definition:
+            # The text of an element without content may be None; match it as "".
+            m_text = re.match(self.definition["match_text"], element.tag.text or "")
+            if m_text is None:
+                return None
+            vardict.update(m_text.groupdict())
+
+        if "match_attrib" in self.definition:
+            for attrib_def_key, attrib_def_value in self.definition["match_attrib"].items():
+                match_counter = 0
+                matched_m_attrib = None
+                matched_m_attrib_value = None
+                for attr_key, attr_value in element.tag.attrib.items():
+                    m_attrib = re.match(attrib_def_key, attr_key)
+                    if m_attrib is not None:
+                        match_counter += 1
+                        matched_m_attrib = m_attrib
+                        m_attrib_value = re.match(attrib_def_value, attr_value)
+                        if m_attrib_value is None:
+                            # The key matched but the value did not: no match.
+                            return None
+                        matched_m_attrib_value = m_attrib_value
+                # TODO: How to deal with multiple matches? There are multiple options:
+                #       - Allow multiple attribute-key matches: Leads to possible overwrites of variables
+                #       - Require unique attribute-key and attribute-value matches: Very complex
+                #       - Only allow one single attribute-key to match and run attribute-value match separately.
+                #       Currently the latter option is implemented.
+                if match_counter == 0:
+                    return None
+                elif match_counter > 1:
+                    raise RuntimeError("Multiple attributes match the same match_attrib entry.")
+                vardict.update(matched_m_attrib.groupdict())
+                vardict.update(matched_m_attrib_value.groupdict())
+
+        return vardict
 
 
 class XMLTextNodeConverter(Converter):
@@ -109,6 +148,6 @@ class XMLTextNodeConverter(Converter):
         # See https://gitlab.indiscale.com/caosdb/src/caosdb-crawler/-/issues/145
         # for a suggestion for the design of the matching algorithm.
         if not isinstance(element, XMLTextNode):
-            raise ArgumentError("Element must be an instance of XMLTextNode.")
+            raise TypeError("Element must be an instance of XMLTextNode.")
 
         return None
diff --git a/unittests/test_xml_converter.py b/unittests/test_xml_converter.py
index 690ca129..b0dcce94 100644
--- a/unittests/test_xml_converter.py
+++ b/unittests/test_xml_converter.py
@@ -96,8 +96,8 @@ def test_simple_xml(converter_registry):
 type: XMLTag
 match_tag: a
 match_attrib:  # default is the empty dictionary
-"(?P<ref>(href|url))": "text(?P<number>[0-9])"  # either the "href" or the "url" attribute must be set
-alt: (.+)  # this attribute must be present and contain at least one character
+    "(?P<ref>(href|url))": "test(?P<number>[0-9])"  # either the "href" or the "url" attribute must be set
+    alt: (.+)  # this attribute must be present and contain at least one character
 match_text: .*  # allow any text, also empty (this is the default)
 
 # _*_ marks the default:
@@ -119,3 +119,5 @@ subtree:
     m = converter.match(tag)
 
     assert m is not None
+    assert m["ref"] == "href"
+    assert m["number"] == "1"
-- 
GitLab