Skip to content
Snippets Groups Projects

XML Converter

Merged Alexander Schlemmer requested to merge f-xml-converter into dev
2 files
+ 31
2
Compare changes
  • Side-by-side
  • Inline
Files
2
+ 226
0
# encoding: utf-8
#
# This file is a part of the LinkAhead Project.
#
# Copyright (C) 2024 Indiscale GmbH <info@indiscale.com>
# Copyright (C) 2024 Alexander Schlemmer
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
"""Converters take structure elements and create Records and new structure elements from them."""
from __future__ import annotations
import datetime
import json
import logging
import os
import re
import warnings
from inspect import signature
from string import Template
from typing import Any, Callable, Optional, Union
import linkahead as db
from jsonschema import ValidationError, validate
from .stores import GeneralStore, RecordStore
from .structure_elements import (BooleanElement, DictElement, Directory, File,
FloatElement, IntegerElement, JSONFile,
ListElement, NoneElement, StructureElement,
TextElement, XMLTagElement, XMLTextNode, XMLAttributeNode)
from .utils import has_parent
import lxml.etree
from .converters import SimpleFileConverter, ConverterValidationError, Converter
class XMLFileConverter(SimpleFileConverter):
    """Convert XML files.

    See https://gitlab.indiscale.com/caosdb/src/caosdb-crawler/-/issues/145
    for the current suggestion for the specification.
    """

    def create_children(self, generalStore: GeneralStore, element: StructureElement):
        """Parse the XML file behind ``element`` and return its root tag.

        Parameters
        ----------
        generalStore : GeneralStore
            Not used by this method.
        element : StructureElement
            Must be a ``File``; the file at ``element.path`` is parsed with lxml.

        Returns
        -------
        list
            A list with exactly one ``XMLTagElement`` wrapping the root node
            of the parsed document.

        Raises
        ------
        ValueError
            If ``element`` is not a ``File``.
        NotImplementedError
            If the converter definition contains a truthy ``validate`` entry,
            because XML validation is not implemented yet.
        """
        # TODO: See comment on types and inheritance
        if not isinstance(element, File):
            raise ValueError("create_children was called with wrong type of StructureElement")

        # Open in binary mode so that the XML parser determines the encoding from
        # the document itself, instead of Python decoding the file with the
        # locale's default text encoding first.
        with open(element.path, 'rb') as xml_file:
            xml = lxml.etree.parse(xml_file)

        if self.definition.get("validate", False):
            # Placeholder for the future validation step: the except clause shows
            # how validation errors are meant to be reported once implemented.
            try:
                raise NotImplementedError("XML validation not implemented yet.")
            except ConverterValidationError as err:
                raise ConverterValidationError(
                    "Error during the validation of the XML file:\n"
                    f"{element.path}\n" + err.message) from err

        return [XMLTagElement(xml.getroot())]
class XMLTagConverter(Converter):
    """Converter for XML tags that are wrapped in ``XMLTagElement`` objects."""

    def create_children(self, generalStore: GeneralStore, element: StructureElement):
        """Children that are generated by this function are the
        result of the xpath query given in the yaml property
        ``xpath``. Its default (when not given) is ``child::*``, so the
        direct children of the current xml node. The xpath expression
        must be designed in a way that it returns xml tags (and no
        attributes or texts). That means, that the axis ``attribute::``
        and the function ``text()`` must not be used.

        The following yaml properties can be used to generate other
        types of nodes (text nodes and attribute nodes) as subtree
        structure elements:

        ::

            # _*_ marks the default:
            attribs_as_children: true  # true / _false_
            text_as_children: true  # true / _false_
            tags_as_children: true  # _true_ / false

        The default is to generate the tags matched by the xpath expression only.

        - When text_as_children is set to true, text nodes will be generated that
          contain the text contained in the matched tags.
        - When attribs_as_children is set to true, attribute nodes will be generated
          from the attributes of the matched tags.

        Notes
        -----
        The default is to take the namespace map from the current node and use it in
        xpath queries. Because default namespaces cannot be handled by xpath, it is
        possible to remap the default namespace using the key ``default_namespace``.
        The key ``nsmap`` can be used to define additional nsmap entries.

        Raises
        ------
        TypeError
            If ``element`` is not an ``XMLTagElement``.
        RuntimeError
            If the xpath expression does not evaluate to a list of xml tags
            (e.g. it yields strings, a number or a boolean), or if a result
            item is neither a string nor an lxml element.
        """
        if not isinstance(element, XMLTagElement):
            raise TypeError("Element must be an instance of XMLTagElement.")

        # Work on a private copy of the element's namespace map:
        nsmap = dict(element.tag.nsmap)

        # The default name of the default namespace is "default".
        # You can overwrite it using the attribute "default_namespace" in the
        # converter definition:
        default_namespace = self.definition.get("default_namespace", "default")
        if None in nsmap:
            nsmap[default_namespace] = nsmap.pop(None)

        # Set additional nsmap entries from the converter definition:
        if "nsmap" in self.definition:
            for key, value in self.definition["nsmap"].items():
                nsmap[key] = value

        xpath = self.definition.get("xpath", "child::*")
        children = element.tag.xpath(xpath, namespaces=nsmap)

        # xpath expressions like ``count(*)``, ``boolean(*)`` or ``string(.)``
        # evaluate to a plain number, boolean or string instead of a node list.
        # Reject them explicitly instead of failing while iterating.
        if not isinstance(children, list):
            raise RuntimeError(
                "Only standard xml nodes are supported as results of xpath queries.")

        # These settings do not change per child, so read them once:
        tags_as_children = self.definition.get("tags_as_children", True)
        attribs_as_children = self.definition.get("attribs_as_children", False)
        text_as_children = self.definition.get("text_as_children", False)

        el_lst = []
        for el in children:
            if isinstance(el, str):
                raise RuntimeError(
                    "Only standard xml nodes are supported as results of xpath queries.")
            if not isinstance(el, lxml.etree._Element):
                raise RuntimeError("Unsupported child type.")

            if tags_as_children:
                el_lst.append(XMLTagElement(el))
            if attribs_as_children:
                for attrib in el.attrib:
                    el_lst.append(XMLAttributeNode(el, attrib))
            if text_as_children:
                el_lst.append(XMLTextNode(el))
        return el_lst

    def typecheck(self, element: StructureElement):
        """
        Check whether the current structure element can be converted using
        this converter.
        """
        return isinstance(element, XMLTagElement)

    def match(self, element: StructureElement) -> Optional[dict]:
        """Match the tag name, text and attributes of ``element``.

        The converter definition may contain the keys ``match_tag``,
        ``match_text`` and ``match_attrib``. Each one that is present must
        match (using ``re.match``) for the element to be accepted:

        - ``match_tag`` is matched against the tag name.
        - ``match_text`` is matched against the tag's text (an empty string
          if the tag has no text), with ``re.DOTALL`` enabled.
        - ``match_attrib`` maps a regular expression for the attribute name to
          a regular expression for the attribute value. Each entry must match
          exactly one attribute name, and the value of that attribute must
          match as well.

        Returns
        -------
        dict or None
            The named groups of all regular expressions that matched, or
            ``None`` if any of the configured matches fails.

        Raises
        ------
        TypeError
            If ``element`` is not an ``XMLTagElement``.
        RuntimeError
            If more than one attribute matches (name and value) the same
            ``match_attrib`` entry.
        """
        # See https://gitlab.indiscale.com/caosdb/src/caosdb-crawler/-/issues/145
        # for a suggestion for the design of the matching algorithm.
        if not isinstance(element, XMLTagElement):
            raise TypeError("Element must be an instance of XMLTagElement.")

        # Store the result of all individual regexp variable results:
        vardict = {}

        if "match_tag" in self.definition:
            m_tag = re.match(self.definition["match_tag"], element.tag.tag)
            if m_tag is None:
                return None
            vardict.update(m_tag.groupdict())

        if "match_text" in self.definition:
            tagtext = element.tag.text
            if tagtext is None:
                tagtext = ""
            m_text = re.match(self.definition["match_text"], tagtext, re.DOTALL)
            if m_text is None:
                return None
            vardict.update(m_text.groupdict())

        if "match_attrib" in self.definition:
            for attrib_def_key, attrib_def_value in self.definition["match_attrib"].items():
                match_counter = 0
                matched_m_attrib = None
                matched_m_attrib_value = None
                for attr_key, attr_value in element.tag.attrib.items():
                    m_attrib = re.match(attrib_def_key, attr_key)
                    if m_attrib is not None:
                        match_counter += 1
                        matched_m_attrib = m_attrib
                        m_attrib_value = re.match(attrib_def_value, attr_value)
                        # A matching attribute name with a non-matching value
                        # rejects the whole element immediately.
                        if m_attrib_value is None:
                            return None
                        matched_m_attrib_value = m_attrib_value
                # TODO: How to deal with multiple matches?
                #       There are multiple options:
                #       - Allow multiple attribute-key matches: Leads to possible
                #         overwrites of variables
                #       - Require unique attribute-key and attribute-value matches:
                #         Very complex
                #       - Only allow one single attribute-key to match and run
                #         attribute-value match separately.
                #       Currently the latter option is implemented.
                if match_counter == 0:
                    return None
                elif match_counter > 1:
                    raise RuntimeError("Multiple attributes match the same match_attrib entry.")
                vardict.update(matched_m_attrib.groupdict())
                vardict.update(matched_m_attrib_value.groupdict())

        return vardict
class XMLTextNodeConverter(Converter):
    """Converter for ``XMLTextNode`` structure elements.

    Only ``typecheck`` is functional so far; ``create_children`` and ``match``
    raise ``NotImplementedError``.
    """

    def create_children(self, generalStore: GeneralStore, element: StructureElement):
        """Not implemented yet; always raises ``NotImplementedError``."""
        raise NotImplementedError()

    def typecheck(self, element: StructureElement):
        """
        Check whether the current structure element can be converted using
        this converter.
        """
        return isinstance(element, XMLTextNode)

    def match(self, element: StructureElement) -> Optional[dict]:
        """Not implemented yet.

        Raises
        ------
        TypeError
            If ``element`` is not an ``XMLTextNode``.
        NotImplementedError
            Always, for a valid ``XMLTextNode``.
        """
        # See https://gitlab.indiscale.com/caosdb/src/caosdb-crawler/-/issues/145
        # for a suggestion for the design of the matching algorithm.
        if not isinstance(element, XMLTextNode):
            raise TypeError("Element must be an instance of XMLTextNode.")
        raise NotImplementedError()
Loading