diff --git a/src/caoscrawler/xml_converter.py b/src/caoscrawler/xml_converter.py index 99df9cd78e1993c13384cd70cabdb17c38443c23..fe10c738932fcb15be5bacff728555fc65f72303 100644 --- a/src/caoscrawler/xml_converter.py +++ b/src/caoscrawler/xml_converter.py @@ -28,15 +28,11 @@ import logging import os import re import warnings -from abc import ABCMeta, abstractmethod from inspect import signature from string import Template from typing import Any, Callable, Optional, Union import linkahead as db -import pandas as pd -import yaml -import yaml_header_tools from jsonschema import ValidationError, validate from .stores import GeneralStore, RecordStore @@ -89,9 +85,52 @@ class XMLTagConverter(Converter): # See https://gitlab.indiscale.com/caosdb/src/caosdb-crawler/-/issues/145 # for a suggestion for the design of the matching algorithm. if not isinstance(element, XMLTagElement): - raise ArgumentError("Element must be an instance of XMLTagElement.") - - return None + raise TypeError("Element must be an instance of XMLTagElement.") + + # Store the result of all individual regexp variable results: + vardict = {} + + if "match_tag" in self.definition: + m_tag = re.match(self.definition["match_tag"], element.tag.tag) + if m_tag is None: + return None + vardict.update(m_tag.groupdict()) + + if "match_text" in self.definition: + m_text = re.match(self.definition["match_text"], element.tag.text) + if m_text is None: + return None + vardict.update(m_text.groupdict()) + + if "match_attrib" in self.definition: + for attrib_def_key, attrib_def_value in self.definition["match_attrib"].items(): + match_counter = 0 + matched_m_attrib = None + matched_m_attrib_value = None + for attr_key, attr_value in element.tag.attrib.items(): + m_attrib = re.match(attrib_def_key, attr_key) + if m_attrib is not None: + match_counter += 1 + matched_m_attrib = m_attrib + m_attrib_value = re.match(attrib_def_value, attr_value) + if m_attrib_value is None: + breakpoint() + return None + matched_m_attrib_value = m_attrib_value + # TODO: How to deal with multiple matches? + # There are multiple options: + # - Allow multiple attribute-key matches: Leads to possible overwrites of variables + # - Require unique attribute-key and attribute-value matches: Very complex + # - Only allow one single attribute-key to match and run attribute-value match separately. + # Currently the latter option is implemented. + if match_counter == 0: + return None + elif match_counter > 1: + raise RuntimeError("Multiple attributes match the same match_attrib entry.") + vardict.update(matched_m_attrib.groupdict()) + vardict.update(matched_m_attrib_value.groupdict()) + + return vardict class XMLTextNodeConverter(Converter): @@ -109,6 +148,6 @@ class XMLTextNodeConverter(Converter): # See https://gitlab.indiscale.com/caosdb/src/caosdb-crawler/-/issues/145 # for a suggestion for the design of the matching algorithm. if not isinstance(element, XMLTextNode): - raise ArgumentError("Element must be an instance of XMLTextNode.") + raise TypeError("Element must be an instance of XMLTextNode.") return None diff --git a/unittests/test_xml_converter.py b/unittests/test_xml_converter.py index 690ca129bd4878ab270ca95c63b93479b300254a..b0dcce940cc1c04ee351fe0b5709114cc74e1050 100644 --- a/unittests/test_xml_converter.py +++ b/unittests/test_xml_converter.py @@ -96,8 +96,8 @@ def test_simple_xml(converter_registry): type: XMLTag match_tag: a match_attrib: # default is the empty dictionary -"(?P<ref>(href|url))": "text(?P<number>[0-9])" # either the "href" or the "url" attribute must be set -alt: (.+) # this attribute must be present and contain at least one character + "(?P<ref>(href|url))": "test(?P<number>[0-9])" # either the "href" or the "url" attribute must be set + alt: (.+) # this attribute must be present and contain at least one character match_text: .* # allow any text, also empty (this is the default) # _*_ marks the default: @@ -119,3 +119,5 @@ subtree: m = converter.match(tag) assert m is not None + assert m["ref"] == "href" + assert m["number"] == "1"