From a6b41c7a9b48726b244a034f6d8f3d86a6c1cf04 Mon Sep 17 00:00:00 2001
From: Alexander Schlemmer <a.schlemmer@indiscale.com>
Date: Tue, 20 Aug 2024 17:55:30 +0200
Subject: [PATCH] TST: added test for xml converters that do not match and a
 stub for nested xml

---
 unittests/test_xml_converter.py | 73 +++++++++++++++++++++++++++++++--
 1 file changed, 69 insertions(+), 4 deletions(-)

diff --git a/unittests/test_xml_converter.py b/unittests/test_xml_converter.py
index a5e9898a..c302e7ef 100644
--- a/unittests/test_xml_converter.py
+++ b/unittests/test_xml_converter.py
@@ -54,7 +54,7 @@ from caoscrawler.structure_elements import (BooleanElement, DictElement,
                                             TextElement, XMLTagElement)
 from caoscrawler.xml_converter import XMLTagConverter
 
-import lxml
+from lxml.etree import fromstring
 
 UNITTESTDIR = Path(__file__).parent
 
@@ -91,7 +91,7 @@ match_tag: a
 match_attrib:  # default is the empty dictionary
     "(?P<ref>(href|url))": "test(?P<number>[0-9])"  # either the "href" or the "url" attribute must be set
     alt: (.+)  # this attribute must be present and contain at least one character
-match_text: .*  # allow any text, also empty (this is the default)
+match_text: \\s*(?P<node_text>.+)\\s*
 
 # _*_ marks the default:
 attribs_as_children: true  # true / _false_
@@ -106,6 +106,12 @@ subtree:
         type: TextElement
         match_name: alt
         match_value: ^(?P<text>.*)$
+
+    img:
+        type: XMLTag
+        match_name: img
+        match_attrib:
+            src: test2
 """)
     return xml_cfood
 
@@ -127,13 +133,72 @@ def test_simple_xml(basic_xmltag_converter):
     </a>
     """
 
-    tag = XMLTagElement("tag", lxml.etree.fromstring(xml_text))
+    tag = XMLTagElement("tag", fromstring(xml_text))
     m = basic_xmltag_converter.match(tag)
 
     assert m is not None
     assert m["ref"] == "href"
     assert m["number"] == "1"
+    assert m["node_text"] == "test "
 
 
 def test_not_matching(basic_xmltag_converter):
-    pass
+    m = basic_xmltag_converter.match(XMLTagElement("tag", fromstring("""
+        <a href="test1">
+        test <img src="test2"/>
+        </a>
+        """)))
+
+    assert m is None  # the required "alt" attribute is missing
+
+    m = basic_xmltag_converter.match(XMLTagElement("tag", fromstring("""
+        <a href="test" alt="no link">
+        test <img src="test2"/>
+        </a>
+        """)))
+
+    assert m is None  # href value "test" lacks the digit the pattern requires
+
+    m = basic_xmltag_converter.match(XMLTagElement("tag", fromstring("""
+        <a href="test1" url="http" alt="no link">
+        test <img src="test2"/>
+        </a>
+        """)))
+
+    assert m is None  # href and url must not be present simultaneously
+
+    m = basic_xmltag_converter.match(XMLTagElement("tag", fromstring("""
+        <a href="test1" alt="no link"><img src="test2"/></a>
+        """)))
+
+    assert m is None  # no text before the <img> child, so the text node is empty
+
+    m = basic_xmltag_converter.match(XMLTagElement("tag", fromstring("""
+        <a href="test1" alt="no link"/>
+        """)))
+
+    assert m is None  # self-closing tag, so the text node is empty
+
+    # TODO: adapt converter -> treat a missing (None) text node like an empty-string text node
+    # TODO: adapt the tests above once the converter has been changed
+    # TODO: how can the regexp match "  some text  " while capturing it without outer whitespace?
+
+
+
+def test_nested_simple_xml(basic_xmltag_converter):
+    """
+    Stub: match an <a> tag that contains an <img> child and create its children.
+    """
+    xml_text = """
+    <a href="test1" alt="no link">
+    test <img src="test2"/>
+    </a>
+    """
+
+    tag = XMLTagElement("tag", fromstring(xml_text))
+    m = basic_xmltag_converter.match(tag)
+    assert m is not None
+
+    general_store = GeneralStore()
+    # TODO: the created children are not asserted on yet
+    children = basic_xmltag_converter.create_children(general_store, tag)
-- 
GitLab