Skip to content
Snippets Groups Projects

XML Converter

Merged Alexander Schlemmer requested to merge f-xml-converter into dev
All threads resolved!
1 file
+69 −4
Compare changes
  • Side-by-side
  • Inline
@@ -54,7 +54,7 @@ from caoscrawler.structure_elements import (BooleanElement, DictElement,
TextElement, XMLTagElement)
from caoscrawler.xml_converter import XMLTagConverter
import lxml
from lxml.etree import fromstring
UNITTESTDIR = Path(__file__).parent
@@ -91,7 +91,7 @@ match_tag: a
match_attrib: # default is the empty dictionary
"(?P<ref>(href|url))": "test(?P<number>[0-9])" # either the "href" or the "url" attribute must be set
alt: (.+) # this attribute must be present and contain at least one character
match_text: .* # allow any text, also empty (this is the default)
match_text: \\s*(?P<node_text>.+)\\s*
# _*_ marks the default:
attribs_as_children: true # true / _false_
@@ -106,6 +106,12 @@ subtree:
type: TextElement
match_name: alt
match_value: ^(?P<text>.*)$
img:
type: XMLTag
match_name: img
match_attrib:
src: test2
""")
return xml_cfood
@@ -127,13 +133,72 @@ def test_simple_xml(basic_xmltag_converter):
</a>
"""
tag = XMLTagElement("tag", lxml.etree.fromstring(xml_text))
tag = XMLTagElement("tag", fromstring(xml_text))
m = basic_xmltag_converter.match(tag)
assert m is not None
assert m["ref"] == "href"
assert m["number"] == "1"
assert m["node_text"] == "test "
def test_not_matching(basic_xmltag_converter):
    """
    Feed several XML snippets to the converter for which ``match`` is
    expected to return ``None``.
    """
    # Each entry is preceded by the reason why no match is expected.
    non_matching_documents = (
        # alt-attribute was missing
        """
<a href="test1">
test <img src="test2"/>
</a>
""",
        # href attribute did not match
        """
<a href="test" alt="no link">
test <img src="test2"/>
</a>
""",
        # href and url must not be present simultaneously
        """
<a href="test1" url="http" alt="no link">
test <img src="test2"/>
</a>
""",
        # text node is empty
        """
<a href="test1" alt="no link"><img src="test2"/></a>
""",
        # text node is empty
        """
<a href="test1" alt="no link"/>
""",
    )

    for xml_text in non_matching_documents:
        element = XMLTagElement("tag", fromstring(xml_text))
        assert basic_xmltag_converter.match(element) is None

    # TODO: adapt converter -> an empty (== None) text node should be treated
    #       as equivalent to an empty-string text node
    # TODO: adapt tests accordingly
    # TODO: how to correctly match " ajskdlfjaldsf ajsdklfjadkl " in a regexp
    #       without capturing the surrounding whitespace?
def test_nested_simple_xml(basic_xmltag_converter):
"""
Test for xml conversion including children.
"""
xml_text = """
<a href="test1" alt="no link">
test <img src="test2"/>
</a>
"""
tag = XMLTagElement("tag", fromstring(xml_text))
m = basic_xmltag_converter.match(tag)
assert m is not None
general_store = GeneralStore()
children = basic_xmltag_converter.create_children(general_store, tag)
Loading