class XMLTagElement(StructureElement):
    """
    Structure element that wraps one element (tag) of an XML tree.
    """

    def __init__(self, name: str, element: lxml.etree.Element):
        """
        Create the structure element ``name`` for the given XML element.

        The lxml element is kept unchanged in the ``tag`` attribute.
        """
        super().__init__(name)
        self.tag = element


class XMLTextNode(StructureElement):
    """
    Structure element that wraps a text node of an XML tree.
    """

    def __init__(self, name: str, value: str):
        """
        Create the structure element ``name`` holding the text ``value``.

        Although it looks similar to TextElement, the meaning differs:
        - A TextElement carries a meaningful name (e.g. the key of a
          key-value pair) which can be matched using the match_name entry.
        - An XMLTextNode only carries text; its name merely identifies the
          structure element. It can only be matched using the match entry
          of the XMLTextNodeConverter.
        """
        super().__init__(name)
        self.value = value
class XMLFileConverter(SimpleFileConverter):
    """
    Convert XML files.

    See https://gitlab.indiscale.com/caosdb/src/caosdb-crawler/-/issues/145
    for the current suggestion for the specification.
    """

    def create_children(self, generalStore: GeneralStore, element: StructureElement):
        """
        Parse the XML file behind ``element``.

        Creating child structure elements is not implemented yet, so this
        method never returns normally.

        Raises
        ------
        ValueError
            If ``element`` is not a ``File``.
        NotImplementedError
            Always, once the file has been parsed: either because validation
            was requested in the definition or because child creation itself
            is still missing.

        Errors raised while opening or parsing the file propagate unchanged.
        """
        # TODO: See comment on types and inheritance
        if not isinstance(element, File):
            raise ValueError("create_children was called with wrong type of StructureElement")
        # Binary mode: lxml then decodes the bytes itself according to the
        # encoding declared in the XML prolog instead of the locale default.
        with open(element.path, 'rb') as xml_file:
            # The parsed tree is not used yet; parsing still surfaces
            # malformed files early.
            lxml.etree.parse(xml_file)
        if self.definition.get("validate"):
            try:
                raise NotImplementedError("XML validation not implemented yet.")
            except ConverterValidationError as err:
                # Scaffolding for the future validation step: nothing in the
                # try body raises ConverterValidationError at the moment.
                raise ConverterValidationError(
                    "Error during the validation of the XML file:\n"
                    f"{element.path}\n" + err.message) from err
        raise NotImplementedError("Not implemented yet")
        # return [structure_element]


class XMLTagConverter(Converter):
    """
    Converter for ``XMLTagElement`` structure elements.

    Matching and child creation are not implemented yet.
    """

    def create_children(self, generalStore: GeneralStore, element: StructureElement):
        """Not implemented yet; always raises NotImplementedError."""
        raise NotImplementedError()

    def typecheck(self, element: StructureElement) -> bool:
        """
        Check whether the current structure element can be converted using
        this converter.
        """
        return isinstance(element, XMLTagElement)

    def match(self, element: StructureElement) -> Optional[dict]:
        """
        Placeholder for the tag matching algorithm.

        Returns None (no match) for every ``XMLTagElement``.

        Raises
        ------
        TypeError
            If ``element`` is not an ``XMLTagElement``.
        """
        # See https://gitlab.indiscale.com/caosdb/src/caosdb-crawler/-/issues/145
        # for a suggestion for the design of the matching algorithm.
        if not isinstance(element, XMLTagElement):
            raise TypeError("Element must be an instance of XMLTagElement.")

        return None


class XMLTextNodeConverter(Converter):
    """
    Converter for ``XMLTextNode`` structure elements.

    Matching and child creation are not implemented yet.
    """

    def create_children(self, generalStore: GeneralStore, element: StructureElement):
        """Not implemented yet; always raises NotImplementedError."""
        raise NotImplementedError()

    def typecheck(self, element: StructureElement) -> bool:
        """
        Check whether the current structure element can be converted using
        this converter.
        """
        return isinstance(element, XMLTextNode)

    def match(self, element: StructureElement) -> Optional[dict]:
        """
        Placeholder for the text node matching algorithm.

        Returns None (no match) for every ``XMLTextNode``.

        Raises
        ------
        TypeError
            If ``element`` is not an ``XMLTextNode``.
        """
        # See https://gitlab.indiscale.com/caosdb/src/caosdb-crawler/-/issues/145
        # for a suggestion for the design of the matching algorithm.
        if not isinstance(element, XMLTextNode):
            raise TypeError("Element must be an instance of XMLTextNode.")

        return None
@pytest.fixture
def converter_registry():
    """
    Build a minimal converter registry for the XML converter tests.

    Each entry names a converter class and its package; the class object is
    resolved by import and stored under the additional key ``"class"``.
    """
    registry: dict[str, dict[str, str]] = {
        "Directory": {
            "converter": "DirectoryConverter",
            "package": "caoscrawler.converters"},
        "TextElement": {
            "converter": "TextElementConverter",
            "package": "caoscrawler.converters"},
        "XMLTag": {
            "converter": "XMLTagConverter",
            "package": "caoscrawler.xml_converter"},

        "XMLTextNode": {
            "converter": "XMLTextNodeConverter",
            "package": "caoscrawler.xml_converter"},
    }

    # Only the entries are needed here, not the registry keys.
    for entry in registry.values():
        module = importlib.import_module(entry["package"])
        entry["class"] = getattr(module, entry["converter"])
    return registry


def test_simple_xml(converter_registry):
    """
    Match a simple XML tag against an XMLTag cfood definition.

    NOTE(review): XMLTagConverter.match is currently a stub that returns
    None, so the final assertion is expected to fail until the matching
    algorithm is implemented.
    """
    # Import the submodule explicitly: a bare ``import lxml`` does not
    # guarantee that ``lxml.etree`` has been loaded.
    import lxml.etree

    xml_text = """
    <a href="test1" alt="no link">
    test <img src="test2"/>
    </a>
    """

    tag = XMLTagElement("tag", lxml.etree.fromstring(xml_text))

    # The href pattern must start with "test" to be able to match the
    # sample attribute value "test1".
    xml_cfood = yaml.safe_load("""
type: XMLTag
match_tag: a
match_attrib:  # default is the empty dictionary
    "(?P<ref>(href|url))": "test(?P<number>[0-9])"  # either the "href" or the "url" attribute must be set
    alt: (.+)  # this attribute must be present and contain at least one character
match_text: .*  # allow any text, also empty (this is the default)

# _*_ marks the default:
attribs_as_children: true  # true / _false_
text_as_children: true  # true / _false_
tags_as_children: true  # _true_ / false

subtree:
    text:  # this would be created by the text_as_children-flag
        type: XMLTextNode
        match: test
    alt:  # this would be created by the attribs_as_children-flag
        type: TextElement
        match_name: alt
        match_value: ^(?P<text>.*)$
""")

    converter = XMLTagConverter(xml_cfood, "TestXMLTagConverter", converter_registry)
    m = converter.match(tag)

    assert m is not None