Skip to content
Snippets Groups Projects

XML Converter

Merged Alexander Schlemmer requested to merge f-xml-converter into dev
1 file
+ 37
2
Compare changes
  • Side-by-side
  • Inline
+ 320
0
#!/usr/bin/env python3
# encoding: utf-8
#
# This file is a part of the LinkAhead Project.
#
# Copyright (C) 2024 Indiscale GmbH <info@indiscale.com>
# Copyright (C) 2024 Alexander Schlemmer <a.schlemmer@indiscale.com>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
#
"""
Test the XML converter module (caoscrawler.xml_converter).
"""
import datetime
import importlib
import json
import logging
import os
import sys
from itertools import product
from pathlib import Path
import pytest
import yaml
from caoscrawler.converters import (Converter, ConverterValidationError,
DateElementConverter, DictElementConverter,
DictIntegerElementConverter,
DirectoryConverter, FloatElementConverter,
IntegerElementConverter, JSONFileConverter,
ListElementConverter,
MarkdownFileConverter, YAMLFileConverter,
_AbstractScalarValueElementConverter,
handle_value, replace_variables)
from caoscrawler.crawl import Crawler
from caoscrawler.scanner import (_load_definition_from_yaml_dict,
create_converter_registry,
create_transformer_registry, load_definition)
from caoscrawler.stores import GeneralStore
from caoscrawler.structure_elements import (BooleanElement, DictElement,
Directory, File, FloatElement,
IntegerElement, ListElement,
TextElement, XMLTagElement)
from caoscrawler.xml_converter import XMLTagConverter
from lxml.etree import fromstring
# Directory that contains this test module.
UNITTESTDIR = Path(__file__).parent
@pytest.fixture
def converter_registry():
    """Return a converter registry holding the converters used by these tests.

    Each entry maps a converter type name to a dictionary with the keys
    ``converter`` (class name), ``package`` (module path) and ``class`` (the
    class object looked up in that module).
    """
    # (registry key, converter class name, module providing the class)
    specs = (
        ("Directory", "DirectoryConverter", "caoscrawler.converters"),
        ("TextElement", "TextElementConverter", "caoscrawler.converters"),
        ("XMLTag", "XMLTagConverter", "caoscrawler.xml_converter"),
        ("XMLTextNode", "XMLTextNodeConverter", "caoscrawler.xml_converter"),
    )
    registry = {}
    for type_name, class_name, package in specs:
        entry = {"converter": class_name, "package": package}
        # Resolve the class object from its module, as named in the entry.
        entry["class"] = getattr(importlib.import_module(package), class_name)
        registry[type_name] = entry
    return registry
@pytest.fixture
def basic_xmltag_converter(converter_registry):
    """Return an XMLTagConverter for ``<a>`` tags with one ``img`` sub-converter."""
    definition = yaml.safe_load("""
type: XMLTag
match_tag: a
match_attrib: # default is the empty dictionary
    "(?P<ref>(href|url))": "test(?P<number>[0-9])" # either the "href" or the "url" attribute must be set
    alt: (.+) # this attribute must be present and contain at least one character
match_text: \\s*(?P<node_text>.+)\\s*

subtree:
    img:
        type: XMLTag
        match_name: img
        match_attrib:
            src: test2
""")
    return XMLTagConverter(definition, "TestXMLTagConverter", converter_registry)
@pytest.fixture
def basic_xpath_xmltag_converter(converter_registry):
    """Return an XMLTagConverter for ``<a>`` tags that sets an ``xpath`` option.

    The definition uses ``child::*/*`` as xpath and declares the
    sub-converters ``img`` and ``testnode``.
    """
    definition = yaml.safe_load("""
type: XMLTag
match_tag: a
match_attrib: # default is the empty dictionary
    "(?P<ref>(href|url))": "test(?P<number>[0-9])" # either the "href" or the "url" attribute must be set
    alt: (.+) # this attribute must be present and contain at least one character
match_text: \\s*(?P<node_text>.+)\\s*
xpath: child::*/*

subtree:
    img:
        type: XMLTag
        match_name: img
        match_attrib:
            src: test2
    testnode:
        type: XMLTag
        match_name: testnode
""")
    return XMLTagConverter(definition, "TestXMLTagConverter", converter_registry)
def test_simple_xml(basic_xmltag_converter):
    """
    Test for basic xml conversion functionality.
    """
    element = XMLTagElement(fromstring("""
    <a href="test1" alt="no link">
    test <img src="test2"/>
    </a>
    """))
    assert element.name == "."

    match = basic_xmltag_converter.match(element)
    assert match is not None

    # Named groups captured from the attribute name, attribute value and text.
    expected_groups = {
        "ref": "href",
        "number": "1",
        "node_text": "test ",
    }
    for group, expected in expected_groups.items():
        assert match[group] == expected
def test_not_matching(basic_xmltag_converter):
    """Check that the basic converter does not match non-conforming elements."""
    rejected_documents = (
        # alt-attribute was missing
        """
    <a href="test1">
    test <img src="test2"/>
    </a>
    """,
        # href attribute did not match
        """
    <a href="test" alt="no link">
    test <img src="test2"/>
    </a>
    """,
        # href and url must not be present simultaneously
        """
    <a href="test1" url="http" alt="no link">
    test <img src="test2"/>
    </a>
    """,
        # text node is empty
        """
    <a href="test1" alt="no link"><img src="test2"/></a>
    """,
        # text node is empty
        """
    <a href="test1" alt="no link"/>
    """,
    )
    for document in rejected_documents:
        element = XMLTagElement(fromstring(document))
        assert basic_xmltag_converter.match(element) is None

    # TODO: adapt converter -> empty (==None) text node is equivalent to empty string text node
    # TODO: adapt tests
    # TODO: how to match "  ajskdlfjaldsf ajsdklfjadkl  " without the whitespaces in regexp correctly?
def test_nested_simple_xml(basic_xmltag_converter, basic_xpath_xmltag_converter):
    """
    Test for xml conversion including children.
    """
    # Converter without xpath: the single child is the <img> element.
    flat_element = XMLTagElement(fromstring("""
    <a href="test1" alt="no link">
    test <img src="test2"/>
    </a>
    """))
    assert basic_xmltag_converter.match(flat_element) is not None

    flat_children = basic_xmltag_converter.create_children(GeneralStore(), flat_element)
    assert len(flat_children) == 1
    only_child = flat_children[0]
    assert isinstance(only_child, XMLTagElement)
    assert only_child.name == "img"

    # Converter with xpath "child::*/*": the single child is <testnode>,
    # named by its path below the matched element.
    nested_element = XMLTagElement(fromstring("""
    <a href="test1" alt="no link">
    test <img src="test2">
    <testnode/> </img>
    </a>
    """))
    assert basic_xpath_xmltag_converter.match(nested_element) is not None

    nested_children = basic_xpath_xmltag_converter.create_children(
        GeneralStore(), nested_element)
    assert len(nested_children) == 1
    only_child = nested_children[0]
    assert isinstance(only_child, XMLTagElement)
    assert only_child.name == "img/testnode"
def test_namespace_xml(converter_registry):
    """
    Test for xml conversion including children.

    Nodes have namespaces.
    """

    def make_converter(definition):
        """Build an XMLTagConverter from a YAML definition string."""
        return XMLTagConverter(yaml.safe_load(definition),
                               "TestXMLTagConverter", converter_registry)

    document = """
    <root xmlns="default-namespace" xmlns:test="alternative-namespace">
        <node1 active="true">
            Bla
        </node1>
        <node1 active="true" size="45">
        text
            <node2 xmlns="sub-namespace">
                <node3>
                    ok
                </node3>
            </node2>
            <test:node2>
                sep
            </test:node2>
        </node1>
    </root>
    """
    root = XMLTagElement(fromstring(document))

    # Test unsupported xpath (containing text()):
    text_xpath_converter = make_converter("""
type: XMLTag
match_tag: "{default-namespace}root"
xpath: "default:node1/text()"
default_namespace: default
""")
    assert text_xpath_converter.match(root) is not None
    with pytest.raises(RuntimeError, match="Only standard xml nodes.*"):
        text_xpath_converter.create_children(GeneralStore(), root)

    # Test complex xml using namespaces and text nodes:
    text_converter = make_converter("""
type: XMLTag
match_tag: "{default-namespace}root"
xpath: "default:node1"
default_namespace: default
attribs_as_children: false
text_as_children: true
tags_as_children: false
""")
    text_children = text_converter.create_children(GeneralStore(), root)
    assert [(child.name, child.value.strip()) for child in text_children] == [
        ("{default-namespace}node1[1]/text()", "Bla"),
        ("{default-namespace}node1[2]/text()", "text"),
    ]

    # Check child generation of attributes:
    attrib_converter = make_converter("""
type: XMLTag
match_tag: "{default-namespace}root"
xpath: "default:node1"
default_namespace: default
attribs_as_children: true
text_as_children: false
tags_as_children: false
""")
    attrib_children = attrib_converter.create_children(GeneralStore(), root)
    assert [(child.name, child.value.strip()) for child in attrib_children] == [
        ("{default-namespace}node1[1]@active", "true"),
        ("{default-namespace}node1[2]@active", "true"),
        ("{default-namespace}node1[2]@size", "45"),
    ]

    # Test setting nsmap entries:
    nsmap_converter = make_converter("""
type: XMLTag
match_tag: "{default-namespace}root"
xpath: "//s:node2"
default_namespace: default
nsmap:
  s: sub-namespace
""")
    nsmap_children = nsmap_converter.create_children(GeneralStore(), root)
    assert [child.name for child in nsmap_children] == [
        "{default-namespace}node1[2]/{sub-namespace}node2",
    ]
Loading