diff --git a/src/caoscrawler/xml_converter.py b/src/caoscrawler/xml_converter.py index 64e52ba97403b7491d10a07dd424083c8a13c3d1..06cf34a2825112feac5880f79a9c7b2b2229e4b7 100644 --- a/src/caoscrawler/xml_converter.py +++ b/src/caoscrawler/xml_converter.py @@ -39,7 +39,7 @@ from .stores import GeneralStore, RecordStore from .structure_elements import (BooleanElement, DictElement, Directory, File, FloatElement, IntegerElement, JSONFile, ListElement, NoneElement, StructureElement, - TextElement, XMLTagElement, XMLTextNode) + TextElement, XMLTagElement, XMLTextNode, XMLAttributeNode) from .utils import has_parent import lxml.etree @@ -93,6 +93,13 @@ class XMLTagConverter(Converter): - When attribs_as_children is set to true, attribute nodes will be generated from the attributes of the matched tags. + Namespaces + ---------- + The default is to take the namespace map from the current node and use it in xpath queries. + Because default namespaces cannot be handled by xpath, it is possible to remap the default namespace + using the key "default_namespace". + The key "nsmap" can be used to define additional nsmap entries. + """ if not isinstance(element, XMLTagElement): raise TypeError("Element must be an instance of XMLTagElement.") @@ -106,6 +113,11 @@ class XMLTagConverter(Converter): nsmap[default_namespace] = nsmap[None] del nsmap[None] + # Set additional nsmap entries from the converter definition: + if "nsmap" in self.definition: + for key, value in self.definition["nsmap"].items(): + nsmap[key] = value + xpath = self.definition.get("xpath", "child::*") children = element.tag.xpath(xpath, namespaces=nsmap) el_lst = [] @@ -113,7 +125,13 @@ class XMLTagConverter(Converter): if isinstance(el, str): raise RuntimeError("Only standard xml nodes are supported as results of xpath queries.") elif isinstance(el, lxml.etree._Element): - el_lst.append(XMLTagElement(el)) + if self.definition.get("tags_as_children", True): + el_lst.append(XMLTagElement(el)) + if self.definition.get("attribs_as_children", False): + for attrib in el.attrib: + el_lst.append(XMLAttributeNode(el, attrib)) + if self.definition.get("text_as_children", False): + el_lst.append(XMLTextNode(el)) else: raise RuntimeError("Unsupported child type.") return el_lst diff --git a/unittests/test_xml_converter.py b/unittests/test_xml_converter.py index fbcc39d729cd52c1275403848c22a9de56a643a5..16912aa7be177a72ce76f2afa2daf7ae27e5cd5d 100644 --- a/unittests/test_xml_converter.py +++ b/unittests/test_xml_converter.py @@ -249,35 +249,45 @@ def test_namespace_xml(converter_registry): ok </node3> </node2> - 2 <test:node2> sep </test:node2> - 3 </node1> </root> """ - xpaths = ( - "default:node1/text()", - ) - - for xpath in xpaths: - converter = XMLTagConverter(yaml.safe_load(""" -type: XMLTag -match_tag: \\{{default-namespace\\}}root -xpath: "{}" + conv_tail = """ default_namespace: default subtree: Text: type: XMLTextNode match: (?P<result>.*) -""".format(xpath)), "TestXMLTagConverter", converter_registry) +""" - tag = XMLTagElement(fromstring(xml_text)) - m = converter.match(tag) - assert m is not None + converter = XMLTagConverter(yaml.safe_load(""" +type: XMLTag +match_tag: "{default-namespace}root" +xpath: "default:node1/text()" +""" + conv_tail), "TestXMLTagConverter", converter_registry) - general_store = GeneralStore() - children = converter.create_children(general_store, tag) - assert len(children) == 4 + tag = XMLTagElement(fromstring(xml_text)) + m = converter.match(tag) + assert m is not None + + with pytest.raises(RuntimeError, match="Only standard xml nodes.*"): + converter.create_children(GeneralStore(), tag) + + converter = XMLTagConverter(yaml.safe_load(""" +type: XMLTag +match_tag: "{default-namespace}root" +xpath: "default:node1" +attribs_as_children: false +text_as_children: true +tags_as_children: false +""" + conv_tail), "TestXMLTagConverter", converter_registry) + children = converter.create_children(GeneralStore(), tag) + assert len(children) == 2 + assert children[0].name == "{default-namespace}node1[1]/text()" + assert children[0].value.strip() == "Bla" + assert children[1].name == "{default-namespace}node1[2]/text()" + assert children[1].value.strip() == "text"