diff --git a/src/caoscrawler/structure_elements.py b/src/caoscrawler/structure_elements.py index 27dfe21f29edc0f9ac9d4345281a76a5c4b9ea85..67cd1056b382c92485deada2058526a03b6d8535 100644 --- a/src/caoscrawler/structure_elements.py +++ b/src/caoscrawler/structure_elements.py @@ -196,7 +196,7 @@ class XMLTextNode(StructureElement): - XMLTextNodes just have a text and the name is just for identifying the structure element. They can only be matched using the match entry in the XMLTextNodeConverter. """ - super().__init__(element.getroottree().getelementpath(element)) + super().__init__(element.getroottree().getelementpath(element) + "/text()") self.tag = element self.value = element.text @@ -207,17 +207,14 @@ class XMLAttributeNode(StructureElement): """ def __init__(self, element: lxml.etree.Element, - key: str, - tree: lxml.etree.ElementTree): + key: str): """ Initializes this XML attribute node. element: The xml tree element containing the attribute. key: The key which identifies the attribute in the list of attributes. - tree: The tree containing the element which is used to set the node path. """ - super().__init__(element.getroottree().getelementpath(element)) + super().__init__(element.getroottree().getelementpath(element) + "@" + key) self.value = element.attrib[key] self.key = key - # TODO: use getpath instead of getlementpath? self.tag = element diff --git a/src/caoscrawler/xml_converter.py b/src/caoscrawler/xml_converter.py index b01d92e7dca19ed7a82e024be17cbd572f7c1c13..64e52ba97403b7491d10a07dd424083c8a13c3d1 100644 --- a/src/caoscrawler/xml_converter.py +++ b/src/caoscrawler/xml_converter.py @@ -72,6 +72,28 @@ class XMLFileConverter(SimpleFileConverter): class XMLTagConverter(Converter): def create_children(self, generalStore: GeneralStore, element: StructureElement): + """ + Children that are generated by this function are the result of the xpath query given in + the yaml property "xpath". 
Its default (when not given) is "child::*", i.e. the direct children + of the current xml node. + The xpath expression must be designed in a way that it returns xml tags (and no attributes or text). + This means that the axis "attribute::" and the function "text()" must not be used. + + The following yaml properties can be used to generate other types of nodes (text nodes and attribute nodes) + as subtree structure elements: + + # _*_ marks the default: + attribs_as_children: true # true / _false_ + text_as_children: true # true / _false_ + tags_as_children: true # _true_ / false + + The default is to generate the tags matched by the xpath expression only. + - When text_as_children is set to true, text nodes will be generated that contain the text + contained in the matched tags. + - When attribs_as_children is set to true, attribute nodes will be generated from the attributes + of the matched tags. + + """ if not isinstance(element, XMLTagElement): raise TypeError("Element must be an instance of XMLTagElement.") @@ -89,7 +111,7 @@ class XMLTagConverter(Converter): el_lst = [] for el in children: if isinstance(el, str): - el_lst.append(XMLTextNode(el)) + raise RuntimeError("Only standard xml nodes are supported as results of xpath queries.") elif isinstance(el, lxml.etree._Element): el_lst.append(XMLTagElement(el)) else: @@ -174,4 +196,6 @@ class XMLTextNodeConverter(Converter): if not isinstance(element, XMLTextNode): raise TypeError("Element must be an instance of XMLTextNode.") + raise NotImplementedError() + return None diff --git a/unittests/test_xml_converter.py b/unittests/test_xml_converter.py index 6bfb1cccad1d7835f83f37b5263d579b84641c20..fbcc39d729cd52c1275403848c22a9de56a643a5 100644 --- a/unittests/test_xml_converter.py +++ b/unittests/test_xml_converter.py @@ -230,3 +230,54 @@ def test_nested_simple_xml(basic_xmltag_converter, basic_xpath_xmltag_converter) assert isinstance(children[0], XMLTagElement) assert children[0].name == "img/testnode" + +def 
test_namespace_xml(converter_registry): + """ + Test that an xpath query yielding text results is rejected by XMLTagConverter.create_children. + Nodes have namespaces; "text()" returns strings instead of xml tags, so a RuntimeError is expected. + """ + + xml_text = """ + <root xmlns="default-namespace" xmlns:test="alternative-namespace"> + <node1> + Bla + </node1> + <node1> + text + <node2 xmlns="sub-namespace"> + <node3> + ok + </node3> + </node2> + 2 + <test:node2> + sep + </test:node2> + 3 + </node1> + </root> +""" + + xpaths = ( + "default:node1/text()", + ) + + for xpath in xpaths: + converter = XMLTagConverter(yaml.safe_load(""" +type: XMLTag +match_tag: \\{{default-namespace\\}}root +xpath: "{}" +default_namespace: default +subtree: + Text: + type: XMLTextNode + match: (?P<result>.*) +""".format(xpath)), "TestXMLTagConverter", converter_registry) + + tag = XMLTagElement(fromstring(xml_text)) + m = converter.match(tag) + assert m is not None + + general_store = GeneralStore() + with pytest.raises(RuntimeError, match="Only standard xml nodes are supported"):  # text() yields str results, not xml tags + converter.create_children(general_store, tag)