Skip to content
Snippets Groups Projects
Commit ec0d5c36 authored by Alexander Schlemmer
Browse files

ENH: implemented basic matching of structure elements of type XMLTagElement

parent 366f4b0a
No related branches found
No related tags found
2 merge requests: !181 (Release 0.9.0), !174 (XML Converter)
Pipeline #52486 passed with warnings
This commit is part of merge request !174. Comments created here will be created in the context of that merge request.
...@@ -28,15 +28,11 @@ import logging ...@@ -28,15 +28,11 @@ import logging
import os import os
import re import re
import warnings import warnings
from abc import ABCMeta, abstractmethod
from inspect import signature from inspect import signature
from string import Template from string import Template
from typing import Any, Callable, Optional, Union from typing import Any, Callable, Optional, Union
import linkahead as db import linkahead as db
import pandas as pd
import yaml
import yaml_header_tools
from jsonschema import ValidationError, validate from jsonschema import ValidationError, validate
from .stores import GeneralStore, RecordStore from .stores import GeneralStore, RecordStore
...@@ -89,9 +85,52 @@ class XMLTagConverter(Converter): ...@@ -89,9 +85,52 @@ class XMLTagConverter(Converter):
# See https://gitlab.indiscale.com/caosdb/src/caosdb-crawler/-/issues/145 # See https://gitlab.indiscale.com/caosdb/src/caosdb-crawler/-/issues/145
# for a suggestion for the design of the matching algorithm. # for a suggestion for the design of the matching algorithm.
if not isinstance(element, XMLTagElement): if not isinstance(element, XMLTagElement):
raise ArgumentError("Element must be an instance of XMLTagElement.") raise TypeError("Element must be an instance of XMLTagElement.")
# Store the result of all individual regexp variable results:
vardict = {}
if "match_tag" in self.definition:
m_tag = re.match(self.definition["match_tag"], element.tag.tag)
if m_tag is None:
return None
vardict.update(m_tag.groupdict())
if "match_text" in self.definition:
m_text = re.match(self.definition["match_text"], element.tag.text)
if m_text is None:
return None return None
vardict.update(m_text.groupdict())
if "match_attrib" in self.definition:
for attrib_def_key, attrib_def_value in self.definition["match_attrib"].items():
match_counter = 0
matched_m_attrib = None
matched_m_attrib_value = None
for attr_key, attr_value in element.tag.attrib.items():
m_attrib = re.match(attrib_def_key, attr_key)
if m_attrib is not None:
match_counter += 1
matched_m_attrib = m_attrib
m_attrib_value = re.match(attrib_def_value, attr_value)
if m_attrib_value is None:
breakpoint()
return None
matched_m_attrib_value = m_attrib_value
# TODO: How to deal with multiple matches?
# There are multiple options:
# - Allow multiple attribute-key matches: Leads to possible overwrites of variables
# - Require unique attribute-key and attribute-value matches: Very complex
# - Only allow one single attribute-key to match and run attribute-value match separately.
# Currently the latter option is implemented.
if match_counter == 0:
return None
elif match_counter > 1:
raise RuntimeError("Multiple attributes match the same match_attrib entry.")
vardict.update(matched_m_attrib.groupdict())
vardict.update(matched_m_attrib_value.groupdict())
return vardict
class XMLTextNodeConverter(Converter): class XMLTextNodeConverter(Converter):
...@@ -109,6 +148,6 @@ class XMLTextNodeConverter(Converter): ...@@ -109,6 +148,6 @@ class XMLTextNodeConverter(Converter):
# See https://gitlab.indiscale.com/caosdb/src/caosdb-crawler/-/issues/145 # See https://gitlab.indiscale.com/caosdb/src/caosdb-crawler/-/issues/145
# for a suggestion for the design of the matching algorithm. # for a suggestion for the design of the matching algorithm.
if not isinstance(element, XMLTextNode): if not isinstance(element, XMLTextNode):
raise ArgumentError("Element must be an instance of XMLTextNode.") raise TypeError("Element must be an instance of XMLTextNode.")
return None return None
...@@ -96,7 +96,7 @@ def test_simple_xml(converter_registry): ...@@ -96,7 +96,7 @@ def test_simple_xml(converter_registry):
type: XMLTag type: XMLTag
match_tag: a match_tag: a
match_attrib: # default is the empty dictionary match_attrib: # default is the empty dictionary
"(?P<ref>(href|url))": "text(?P<number>[0-9])" # either the "href" or the "url" attribute must be set "(?P<ref>(href|url))": "test(?P<number>[0-9])" # either the "href" or the "url" attribute must be set
alt: (.+) # this attribute must be present and contain at least one character alt: (.+) # this attribute must be present and contain at least one character
match_text: .* # allow any text, also empty (this is the default) match_text: .* # allow any text, also empty (this is the default)
...@@ -119,3 +119,5 @@ subtree: ...@@ -119,3 +119,5 @@ subtree:
m = converter.match(tag) m = converter.match(tag)
assert m is not None assert m is not None
assert m["ref"] == "href"
assert m["number"] == "1"
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment