Alexander Schlemmer · Alexander Schlemmer
--- a/src/caoscrawler/xml_converter.py 0 → 100644

+ 226

− 0
+++ b/src/caoscrawler/xml_converter.py 0 → 100644

+ 226

− 0
+# encoding: utf-8
+#
+# This file is a part of the LinkAhead Project.
+#
+# Copyright (C) 2024 Indiscale GmbH <info@indiscale.com>
+# Copyright (C) 2024 Alexander Schlemmer
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <https://www.gnu.org/licenses/>.
+
+"""Converters take structure elements and create Records and new structure elements from them."""
+
+from __future__ import annotations
+
+import datetime
+import json
+import logging
+import os
+import re
+import warnings
+from inspect import signature
+from string import Template
+from typing import Any, Callable, Optional, Union
+
+import linkahead as db
+from jsonschema import ValidationError, validate
+
+from .stores import GeneralStore, RecordStore
+from .structure_elements import (BooleanElement, DictElement, Directory, File,
+                                 FloatElement, IntegerElement, JSONFile,
+                                 ListElement, NoneElement, StructureElement,
+                                 TextElement, XMLTagElement, XMLTextNode, XMLAttributeNode)
+from .utils import has_parent
+
+import lxml.etree
+from .converters import SimpleFileConverter, ConverterValidationError, Converter
+
+
+class XMLFileConverter(SimpleFileConverter):
+
+    """Convert XML files. See
+    https://gitlab.indiscale.com/caosdb/src/caosdb-crawler/-/issues/145
+    for the current suggestion for the specification.
+
+    """
+
+    def create_children(self, generalStore: GeneralStore, element: StructureElement):
+        # TODO: See comment on types and inheritance
+        if not isinstance(element, File):
+            raise ValueError("create_children was called with wrong type of StructureElement")
+        with open(element.path, 'r') as xml_file:
+            xml = lxml.etree.parse(xml_file)
+        if "validate" in self.definition and self.definition["validate"]:
+            try:
+                raise NotImplementedError("XML validation not implemented yet.")
+            except ConverterValidationError as err:
+                raise ConverterValidationError(
+                    "Error during the validation of the XML file:\n"
+                    f"{element.path}\n" + err.message)
+
+        return [XMLTagElement(xml.getroot())]
+
+
+class XMLTagConverter(Converter):
+    def create_children(self, generalStore: GeneralStore, element: StructureElement):
+        """Children that are generated by this function are the
+        result of the xpath query given in the yaml property
+        ``xpath``. Its default (when not given) is ``child::*``, so the
+        direct children of the current xml node. The xpath expression
+        must be designed in a way that it returns xml tags (and no
+        attributes or texts). That means, that the axis ``attribute::``
+        and the function ``text()`` must not be used.
+
+        The following yaml properties can be used to generate other
+        types of nodes (text nodes and attribute nodes) as subtree
+        structure elements:
+
+        ::
+
+            # _*_ marks the default:
+            attribs_as_children: true  # true / _false_
+            text_as_children: true  # true / _false_
+            tags_as_children: true  # _true_ / false
+
+        The default is to generate the tags matched by the xpath expression only.
+
+        - When text_as_children is set to true, text nodes will be generated that contain the text
+          contained in the matched tags.
+        - When attribs_as_children is set to true, attribute nodes will be generated from the attributes
+          of the matched tags.
+
+        Notes
+        -----
+        The default is to take the namespace map from the current node and use it in xpath queries.
+        Because default namespaces cannot be handled by xpath, it is possible to remap the default namespace
+        using the key ``default_namespace``.
+        The key ``nsmap`` can be used to define additional nsmap entries.
+
+        """
+        if not isinstance(element, XMLTagElement):
+            raise TypeError("Element must be an instance of XMLTagElement.")
+
+        # Get the namespace map from the element:
+        nsmap = element.tag.nsmap
+        # The default name of the default namespace is "default".
+        # You can overwrite it using the attribute "default_namespace" in the converter definition:
+        default_namespace = self.definition.get("default_namespace", "default")
+        if None in nsmap:
+            nsmap[default_namespace] = nsmap[None]
+            del nsmap[None]
+
+        # Set additional nsmap entries from the converter definition:
+        if "nsmap" in self.definition:
+            for key, value in self.definition["nsmap"].items():
+                nsmap[key] = value
+
+        xpath = self.definition.get("xpath", "child::*")
+        children = element.tag.xpath(xpath, namespaces=nsmap)
+        el_lst = []
+        for el in children:
+            if isinstance(el, str):
+                raise RuntimeError(
+                    "Only standard xml nodes are supported as results of xpath queries.")
+            elif isinstance(el, lxml.etree._Element):
+                if self.definition.get("tags_as_children", True):
+                    el_lst.append(XMLTagElement(el))
+                if self.definition.get("attribs_as_children", False):
+                    for attrib in el.attrib:
+                        el_lst.append(XMLAttributeNode(el, attrib))
+                if self.definition.get("text_as_children", False):
+                    el_lst.append(XMLTextNode(el))
+            else:
+                raise RuntimeError("Unsupported child type.")
+        return el_lst
+
+    def typecheck(self, element: StructureElement):
+        """
+        Check whether the current structure element can be converted using
+        this converter.
+        """
+        return isinstance(element, XMLTagElement)
+
+    def match(self, element: StructureElement) -> Optional[dict]:
+        # See https://gitlab.indiscale.com/caosdb/src/caosdb-crawler/-/issues/145
+        # for a suggestion for the design of the matching algorithm.
+        if not isinstance(element, XMLTagElement):
+            raise TypeError("Element must be an instance of XMLTagElement.")
+
+        # Store the result of all individual regexp variable results:
+        vardict = {}
+
+        if "match_tag" in self.definition:
+            m_tag = re.match(self.definition["match_tag"], element.tag.tag)
+            if m_tag is None:
+                return None
+            vardict.update(m_tag.groupdict())
+
+        if "match_text" in self.definition:
+            tagtext = element.tag.text
+            if element.tag.text is None:
+                tagtext = ""
+            m_text = re.match(self.definition["match_text"], tagtext, re.DOTALL)
+            if m_text is None:
+                return None
+            vardict.update(m_text.groupdict())
+
+        if "match_attrib" in self.definition:
+            for attrib_def_key, attrib_def_value in self.definition["match_attrib"].items():
+                match_counter = 0
+                matched_m_attrib = None
+                matched_m_attrib_value = None
+                for attr_key, attr_value in element.tag.attrib.items():
+                    m_attrib = re.match(attrib_def_key, attr_key)
+                    if m_attrib is not None:
+                        match_counter += 1
+                        matched_m_attrib = m_attrib
+                        m_attrib_value = re.match(attrib_def_value, attr_value)
+                        if m_attrib_value is None:
+                            return None
+                        matched_m_attrib_value = m_attrib_value
+                # TODO: How to deal with multiple matches?
+                #       There are multiple options:
+                #       - Allow multiple attribute-key matches: Leads to possible overwrites of variables
+                #       - Require unique attribute-key and attribute-value matches: Very complex
+                #       - Only allow one single attribute-key to match and run attribute-value match separately.
+                #       Currently the latter option is implemented.
+                if match_counter == 0:
+                    return None
+                elif match_counter > 1:
+                    raise RuntimeError("Multiple attributes match the same match_attrib entry.")
+                vardict.update(matched_m_attrib.groupdict())
+                vardict.update(matched_m_attrib_value.groupdict())
+
+        return vardict
+
+
+class XMLTextNodeConverter(Converter):
+    def create_children(self, generalStore: GeneralStore, element: StructureElement):
+        raise NotImplementedError()
+
+    def typecheck(self, element: StructureElement):
+        """
+        Check whether the current structure element can be converted using
+        this converter.
+        """
+        return isinstance(element, XMLTextNode)
+
+    def match(self, element: StructureElement) -> Optional[dict]:
+        # See https://gitlab.indiscale.com/caosdb/src/caosdb-crawler/-/issues/145
+        # for a suggestion for the design of the matching algorithm.
+        if not isinstance(element, XMLTextNode):
+            raise TypeError("Element must be an instance of XMLTextNode.")
+
+        raise NotImplementedError()
+
+        return None