From a6b41c7a9b48726b244a034f6d8f3d86a6c1cf04 Mon Sep 17 00:00:00 2001 From: Alexander Schlemmer <a.schlemmer@indiscale.com> Date: Tue, 20 Aug 2024 17:55:30 +0200 Subject: [PATCH] TST: added test for xml converters that do not match and a stub for nested xml --- unittests/test_xml_converter.py | 73 +++++++++++++++++++++++++++++++-- 1 file changed, 69 insertions(+), 4 deletions(-) diff --git a/unittests/test_xml_converter.py b/unittests/test_xml_converter.py index a5e9898a..c302e7ef 100644 --- a/unittests/test_xml_converter.py +++ b/unittests/test_xml_converter.py @@ -54,7 +54,7 @@ from caoscrawler.structure_elements import (BooleanElement, DictElement, TextElement, XMLTagElement) from caoscrawler.xml_converter import XMLTagConverter -import lxml +from lxml.etree import fromstring UNITTESTDIR = Path(__file__).parent @@ -91,7 +91,7 @@ match_tag: a match_attrib: # default is the empty dictionary "(?P<ref>(href|url))": "test(?P<number>[0-9])" # either the "href" or the "url" attribute must be set alt: (.+) # this attribute must be present and contain at least one character -match_text: .* # allow any text, also empty (this is the default) +match_text: \\s*(?P<node_text>.+)\\s* # _*_ marks the default: attribs_as_children: true # true / _false_ @@ -106,6 +106,12 @@ subtree: type: TextElement match_name: alt match_value: ^(?P<text>.*)$ + + img: + type: XMLTag + match_name: img + match_attrib: + src: test2 """) return xml_cfood @@ -127,13 +133,72 @@ def test_simple_xml(basic_xmltag_converter): </a> """ - tag = XMLTagElement("tag", lxml.etree.fromstring(xml_text)) + tag = XMLTagElement("tag", fromstring(xml_text)) m = basic_xmltag_converter.match(tag) assert m is not None assert m["ref"] == "href" assert m["number"] == "1" + assert m["node_text"] == "test " def test_not_matching(basic_xmltag_converter): - pass + m = basic_xmltag_converter.match(XMLTagElement("tag", fromstring(""" + <a href="test1"> + test <img src="test2"/> + </a> + """))) + + assert m is None # alt-attribute was missing + + m = basic_xmltag_converter.match(XMLTagElement("tag", fromstring(""" + <a href="test" alt="no link"> + test <img src="test2"/> + </a> + """))) + + assert m is None # href attribute did not match + + m = basic_xmltag_converter.match(XMLTagElement("tag", fromstring(""" + <a href="test1" url="http" alt="no link"> + test <img src="test2"/> + </a> + """))) + + assert m is None # href and url must not be present simultaneously + + m = basic_xmltag_converter.match(XMLTagElement("tag", fromstring(""" + <a href="test1" alt="no link"><img src="test2"/></a> + """))) + + assert m is None # text node is empty + + m = basic_xmltag_converter.match(XMLTagElement("tag", fromstring(""" + <a href="test1" alt="no link"/> + """))) + + assert m is None # text node is empty + + # TODO: adapt converter -> empty (==None) text node is equivalent to empty string text node + # TODO: adapt tests + # TODO: how to match " ajskdlfjaldsf ajsdklfjadkl " without the whitespaces in regexp correctly? + + + +def test_nested_simple_xml(basic_xmltag_converter): + """ + Test for xml conversion including children. + """ + xml_text = """ + <a href="test1" alt="no link"> + test <img src="test2"/> + </a> + """ + + tag = XMLTagElement("tag", fromstring(xml_text)) + m = basic_xmltag_converter.match(tag) + assert m is not None + + general_store = GeneralStore() + + children = basic_xmltag_converter.create_children(general_store, tag) -- GitLab