Skip to content
Snippets Groups Projects
Select Git revision
  • 3a9ca4f38f17e74ebbb02fb35b75b3377981fe88
  • main default protected
  • dev
  • f-spss-value-label-name
  • f-unmod
  • f-checkidentical
  • f-simple-breakpoint
  • f-new-debug-tree
  • f-existing-file-id
  • f-no-ident
  • f-collect-problems
  • f-refactor-debug-tree
  • v0.13.0
  • v0.12.0
  • v0.11.0
  • v0.10.1
  • v0.10.0
  • v0.9.1
  • v0.9.0
  • v0.8.0
  • v0.7.1
  • v0.7.0
  • v0.6.0
  • v0.5.0
  • v0.4.0
  • v0.3.0
  • v0.2.0
  • v0.1.0
28 results

test_xml_converter.py

Blame
  • Code owners
    Assign users and groups as approvers for specific file changes. Learn more.
    test_xml_converter.py 8.68 KiB
    #!/usr/bin/env python3
    # encoding: utf-8
    #
    # This file is a part of the LinkAhead Project.
    #
    # Copyright (C) 2024 Indiscale GmbH <info@indiscale.com>
    # Copyright (C) 2024 Alexander Schlemmer <a.schlemmer@indiscale.com>
    #
    # This program is free software: you can redistribute it and/or modify
    # it under the terms of the GNU Affero General Public License as
    # published by the Free Software Foundation, either version 3 of the
    # License, or (at your option) any later version.
    #
    # This program is distributed in the hope that it will be useful,
    # but WITHOUT ANY WARRANTY; without even the implied warranty of
    # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    # GNU Affero General Public License for more details.
    #
    # You should have received a copy of the GNU Affero General Public License
    # along with this program. If not, see <https://www.gnu.org/licenses/>.
    #
    
    """
    test the converters module
    """
    import datetime
    import importlib
    import json
    import logging
    import os
    import sys
    from itertools import product
    from pathlib import Path
    
    import pytest
    import yaml
    from caoscrawler.converters import (Converter, ConverterValidationError,
                                        DateElementConverter, DictElementConverter,
                                        DictIntegerElementConverter,
                                        DirectoryConverter, FloatElementConverter,
                                        IntegerElementConverter, JSONFileConverter,
                                        ListElementConverter,
                                        MarkdownFileConverter, YAMLFileConverter,
                                        _AbstractScalarValueElementConverter,
                                        handle_value, replace_variables)
    from caoscrawler.crawl import Crawler
    from caoscrawler.scanner import (_load_definition_from_yaml_dict,
                                     create_converter_registry,
                                     create_transformer_registry, load_definition)
    from caoscrawler.stores import GeneralStore
    from caoscrawler.structure_elements import (BooleanElement, DictElement,
                                                Directory, File, FloatElement,
                                                IntegerElement, ListElement,
                                                TextElement, XMLTagElement)
    from caoscrawler.xml_converter import XMLTagConverter
    
    from lxml.etree import fromstring
    
    UNITTESTDIR = Path(__file__).parent
    
    
    @pytest.fixture
    def converter_registry():
        """Return the converter registry used by the XML converter tests.

        Every entry maps a converter type name to a dictionary holding the
        converter's class name ("converter"), the package that provides it
        ("package") and the class object resolved via import ("class").
        """
        # (registry key, converter class name, providing package)
        specs = (
            ("Directory", "DirectoryConverter", "caoscrawler.converters"),
            ("TextElement", "TextElementConverter", "caoscrawler.converters"),
            ("XMLTag", "XMLTagConverter", "caoscrawler.xml_converter"),
            ("XMLTextNode", "XMLTextNodeConverter", "caoscrawler.xml_converter"),
        )

        registry: dict[str, dict[str, str]] = {
            name: {"converter": class_name, "package": package}
            for name, class_name, package in specs
        }

        # Resolve each class name to the actual class object of its package.
        for entry in registry.values():
            package_module = importlib.import_module(entry["package"])
            entry["class"] = getattr(package_module, entry["converter"])
        return registry
    
    
    @pytest.fixture
    def basic_xmltag_converter(converter_registry):
        """Return an XMLTagConverter for ``a`` tags with an ``img`` subtree entry.

        The definition requires either an ``href`` or a ``url`` attribute, an
        ``alt`` attribute and a text node; see the inline YAML comments.
        """
        definition = yaml.safe_load("""
    type: XMLTag
    match_tag: a
    match_attrib:  # default is the empty dictionary
        "(?P<ref>(href|url))": "test(?P<number>[0-9])"  # either the "href" or the "url" attribute must be set
        alt: (.+)  # this attribute must be present and contain at least one character
    match_text: \\s*(?P<node_text>.+)\\s*
    
    subtree:
        img:
            type: XMLTag
            match_name: img
            match_attrib:
                src: test2
    """)
        return XMLTagConverter(definition, "TestXMLTagConverter", converter_registry)
    
    @pytest.fixture
    def basic_xpath_xmltag_converter(converter_registry):
        """Return an XMLTagConverter for ``a`` tags that sets an ``xpath``.

        Uses the same matching rules as the basic converter fixture, plus the
        xpath ``child::*/*`` and an additional ``testnode`` subtree entry.
        """
        definition = yaml.safe_load("""
    type: XMLTag
    match_tag: a
    match_attrib:  # default is the empty dictionary
        "(?P<ref>(href|url))": "test(?P<number>[0-9])"  # either the "href" or the "url" attribute must be set
        alt: (.+)  # this attribute must be present and contain at least one character
    match_text: \\s*(?P<node_text>.+)\\s*
    xpath: child::*/*
    
    subtree:
        img:
            type: XMLTag
            match_name: img
            match_attrib:
                src: test2
        testnode:
            type: XMLTag
            match_name: testnode
    """)
        return XMLTagConverter(definition, "TestXMLTagConverter", converter_registry)
    
    
    
    def test_simple_xml(basic_xmltag_converter):
        """
        Check that a matching ``a`` tag yields the expected named groups.
        """
        element = XMLTagElement(fromstring("""
        <a href="test1" alt="no link">
        test <img src="test2"/>
        </a>
        """))
        assert element.name == "."

        match = basic_xmltag_converter.match(element)
        assert match is not None

        # Named groups captured from the attribute name, attribute value and text.
        expected_groups = {
            "ref": "href",
            "number": "1",
            "node_text": "test ",
        }
        for group, value in expected_groups.items():
            assert match[group] == value
    
    
    def test_not_matching(basic_xmltag_converter):
        """
        Check that the converter rejects ``a`` tags violating one of its rules.
        """
        # Each document breaks exactly one rule; the comment names it.
        rejected_documents = [
            # alt-attribute was missing
            """
            <a href="test1">
            test <img src="test2"/>
            </a>
            """,
            # href attribute did not match
            """
            <a href="test" alt="no link">
            test <img src="test2"/>
            </a>
            """,
            # href and url must not be present simultaneously
            """
            <a href="test1" url="http" alt="no link">
            test <img src="test2"/>
            </a>
            """,
            # text node is empty
            """
            <a href="test1" alt="no link"><img src="test2"/></a>
            """,
            # text node is empty
            """
            <a href="test1" alt="no link"/>
            """,
        ]

        for document in rejected_documents:
            element = XMLTagElement(fromstring(document))
            assert basic_xmltag_converter.match(element) is None

        # TODO: adapt converter -> empty (==None) text node is equivalent to empty string text node
        # TODO: adapt tests
        # TODO: how to match "  ajskdlfjaldsf ajsdklfjadkl " without the whitespaces in regexp correctly?
    
    
    def test_nested_simple_xml(basic_xmltag_converter, basic_xpath_xmltag_converter):
        """
        Check child creation for a converter without and one with an ``xpath``.
        """
        def only_child(converter, xml_text):
            # Match the root tag, create its children and return the single one.
            element = XMLTagElement(fromstring(xml_text))
            assert converter.match(element) is not None

            created = converter.create_children(GeneralStore(), element)

            assert len(created) == 1
            assert isinstance(created[0], XMLTagElement)
            return created[0]

        img_child = only_child(basic_xmltag_converter, """
        <a href="test1" alt="no link">
        test <img src="test2"/>
        </a>
        """)
        assert img_child.name == "img"

        xpath_child = only_child(basic_xpath_xmltag_converter, """
        <a href="test1" alt="no link">
        test <img src="test2">
            <testnode/> </img>
        </a>
        """)
        assert xpath_child.name == "img/testnode"
    
    
    def test_namespace_xml(converter_registry):
        """
        Check xpath-based child creation on a document whose nodes have namespaces.
        """

        document = """
        <root xmlns="default-namespace" xmlns:test="alternative-namespace">
            <node1>
                Bla
            </node1>
            <node1>
            text
                <node2 xmlns="sub-namespace">
                    <node3>
                        ok
                    </node3>
                </node2>
                <test:node2>
                    sep
                </test:node2>
            </node1>
        </root>
    """

        # First converter: the xpath selects text nodes directly, which
        # create_children is expected to refuse with a RuntimeError.
        text_xpath_converter = XMLTagConverter(yaml.safe_load("""
    type: XMLTag
    match_tag: "{default-namespace}root"
    xpath: "default:node1/text()"
    default_namespace: default
    """), "TestXMLTagConverter", converter_registry)

        root = XMLTagElement(fromstring(document))
        assert text_xpath_converter.match(root) is not None

        with pytest.raises(RuntimeError, match="Only standard xml nodes.*"):
            text_xpath_converter.create_children(GeneralStore(), root)

        # Second converter: the xpath selects the node1 tags and only their
        # text is requested as children.
        text_children_converter = XMLTagConverter(yaml.safe_load("""
    type: XMLTag
    match_tag: "{default-namespace}root"
    xpath: "default:node1"
    default_namespace: default
    attribs_as_children: false
    text_as_children: true
    tags_as_children: false
    """), "TestXMLTagConverter", converter_registry)
        text_nodes = text_children_converter.create_children(GeneralStore(), root)
        assert len(text_nodes) == 2

        # (expected child name, expected stripped text) in document order
        expected = [
            ("{default-namespace}node1[1]/text()", "Bla"),
            ("{default-namespace}node1[2]/text()", "text"),
        ]
        for node, (name, text) in zip(text_nodes, expected):
            assert node.name == name
            assert node.value.strip() == text