ENH: implemented basic matching of structure elements of type XMLTagElement

ec0d5c36 · Alexander Schlemmer · 366f4b0a · ec0d5c36 · ec0d5c36
Commit ec0d5c36 authored 1 year ago by Alexander Schlemmer
--- a/src/caoscrawler/xml_converter.py
+++ b/src/caoscrawler/xml_converter.py
@@ -28,15 +28,11 @@ import logging
 import os
 import re
 import warnings
-from abc import ABCMeta, abstractmethod
 from inspect import signature
 from string import Template
 from typing import Any, Callable, Optional, Union
 import linkahead as db
-import pandas as pd
-import yaml
-import yaml_header_tools
 from jsonschema import ValidationError, validate
 from .stores import GeneralStore, RecordStore
@@ -89,9 +85,52 @@ class XMLTagConverter(Converter):
        # See https://gitlab.indiscale.com/caosdb/src/caosdb-crawler/-/issues/145
        # for a suggestion for the design of the matching algorithm.
        if not isinstance(element, XMLTagElement):
-            raise ArgumentError("Element must be an instance of XMLTagElement.")
+            raise TypeError("Element must be an instance of XMLTagElement.")
+        # Collect the named groups captured by all individual regexp matches:
+        vardict = {}
+        if "match_tag" in self.definition:
+            m_tag = re.match(self.definition["match_tag"], element.tag.tag)
+            if m_tag is None:
+                return None
+            vardict.update(m_tag.groupdict())
+        if "match_text" in self.definition:
+            m_text = re.match(self.definition["match_text"], element.tag.text)
+            if m_text is None:
                return None
+            vardict.update(m_text.groupdict())
+        if "match_attrib" in self.definition:
+            for attrib_def_key, attrib_def_value in self.definition["match_attrib"].items():
+                match_counter = 0
+                matched_m_attrib = None
+                matched_m_attrib_value = None
+                for attr_key, attr_value in element.tag.attrib.items():
+                    m_attrib = re.match(attrib_def_key, attr_key)
+                    if m_attrib is not None:
+                        match_counter += 1
+                        matched_m_attrib = m_attrib
+                        m_attrib_value = re.match(attrib_def_value, attr_value)
+                        if m_attrib_value is None:
+                            breakpoint()
+                            return None
+                        matched_m_attrib_value = m_attrib_value
+                # TODO: How to deal with multiple matches?
+                #       There are multiple options:
+                #       - Allow multiple attribute-key matches: Leads to possible overwrites of variables
+                #       - Require unique attribute-key and attribute-value matches: Very complex
+                #       - Only allow one single attribute-key to match and run attribute-value match separately.
+                #       Currently the last option is implemented.
+                if match_counter == 0:
+                    return None
+                elif match_counter > 1:
+                    raise RuntimeError("Multiple attributes match the same match_attrib entry.")
+                vardict.update(matched_m_attrib.groupdict())
+                vardict.update(matched_m_attrib_value.groupdict())
+        return vardict
 class XMLTextNodeConverter(Converter):
@@ -109,6 +148,6 @@ class XMLTextNodeConverter(Converter):
        # See https://gitlab.indiscale.com/caosdb/src/caosdb-crawler/-/issues/145
        # for a suggestion for the design of the matching algorithm.
        if not isinstance(element, XMLTextNode):
-            raise ArgumentError("Element must be an instance of XMLTextNode.")
+            raise TypeError("Element must be an instance of XMLTextNode.")
        return None
--- a/unittests/test_xml_converter.py
+++ b/unittests/test_xml_converter.py
@@ -96,7 +96,7 @@ def test_simple_xml(converter_registry):
 type: XMLTag
 match_tag: a
 match_attrib:  # default is the empty dictionary
-"(?P<ref>(href|url))": "text(?P<number>[0-9])"  # either the "href" or the "url" attribute must be set
+    "(?P<ref>(href|url))": "test(?P<number>[0-9])"  # either the "href" or the "url" attribute must be set
    alt: (.+)  # this attribute must be present and contain at least one character
 match_text: .*  # allow any text, also empty (this is the default)
@@ -119,3 +119,5 @@ subtree:
    m = converter.match(tag)
    assert m is not None
+    assert m["ref"] == "href"
+    assert m["number"] == "1"