Skip to content
Snippets Groups Projects

XML Converter

Merged Alexander Schlemmer requested to merge f-xml-converter into dev
2 files
+ 51
10
Compare changes
  • Side-by-side
  • Inline
Files
2
@@ -28,15 +28,11 @@ import logging
import os
import re
import warnings
from abc import ABCMeta, abstractmethod
from inspect import signature
from string import Template
from typing import Any, Callable, Optional, Union
import linkahead as db
import pandas as pd
import yaml
import yaml_header_tools
from jsonschema import ValidationError, validate
from .stores import GeneralStore, RecordStore
@@ -89,9 +85,52 @@ class XMLTagConverter(Converter):
# See https://gitlab.indiscale.com/caosdb/src/caosdb-crawler/-/issues/145
# for a suggestion for the design of the matching algorithm.
if not isinstance(element, XMLTagElement):
raise ArgumentError("Element must be an instance of XMLTagElement.")
return None
raise TypeError("Element must be an instance of XMLTagElement.")
# Collect the named groups captured by each of the individual regexp matches:
vardict = {}
if "match_tag" in self.definition:
m_tag = re.match(self.definition["match_tag"], element.tag.tag)
if m_tag is None:
return None
vardict.update(m_tag.groupdict())
if "match_text" in self.definition:
m_text = re.match(self.definition["match_text"], element.tag.text)
if m_text is None:
return None
vardict.update(m_text.groupdict())
if "match_attrib" in self.definition:
for attrib_def_key, attrib_def_value in self.definition["match_attrib"].items():
match_counter = 0
matched_m_attrib = None
matched_m_attrib_value = None
for attr_key, attr_value in element.tag.attrib.items():
m_attrib = re.match(attrib_def_key, attr_key)
if m_attrib is not None:
match_counter += 1
matched_m_attrib = m_attrib
m_attrib_value = re.match(attrib_def_value, attr_value)
if m_attrib_value is None:
breakpoint()
return None
matched_m_attrib_value = m_attrib_value
# TODO: How to deal with multiple matches?
# There are multiple options:
# - Allow multiple attribute-key matches: Leads to possible overwrites of variables
# - Require unique attribute-key and attribute-value matches: Very complex
# - Only allow one single attribute-key to match and run attribute-value match separately.
# Currently the last option is implemented.
if match_counter == 0:
return None
elif match_counter > 1:
raise RuntimeError("Multiple attributes match the same match_attrib entry.")
vardict.update(matched_m_attrib.groupdict())
vardict.update(matched_m_attrib_value.groupdict())
return vardict
class XMLTextNodeConverter(Converter):
@@ -109,6 +148,6 @@ class XMLTextNodeConverter(Converter):
# See https://gitlab.indiscale.com/caosdb/src/caosdb-crawler/-/issues/145
# for a suggestion for the design of the matching algorithm.
if not isinstance(element, XMLTextNode):
raise ArgumentError("Element must be an instance of XMLTextNode.")
raise TypeError("Element must be an instance of XMLTextNode.")
return None
Loading