diff --git a/src/caoscrawler/xml_converter.py b/src/caoscrawler/xml_converter.py
index 908cd9ae072ba20279312474401b58d09a0e8478..e4b8cf4c61ff7258ed0b1f161453fba8d96db4a1 100644
--- a/src/caoscrawler/xml_converter.py
+++ b/src/caoscrawler/xml_converter.py
@@ -72,7 +72,40 @@ class XMLFileConverter(SimpleFileConverter):
 
 class XMLTagConverter(Converter):
     def create_children(self, generalStore: GeneralStore, element: StructureElement):
-        raise NotImplementedError()
+        """Create the child elements of an XML tag.
+
+        The nodes selected by the "xpath" option of the definition are used
+        if that option is present, otherwise the direct children of
+        ``element``. Each node is wrapped in an XMLTagElement.
+
+        Raises a TypeError if ``element`` is not an XMLTagElement.
+        """
+        if not isinstance(element, XMLTagElement):
+            raise TypeError("Element must be an instance of XMLTagElement.")
+
+        # The options text_as_children, attribs_as_children and
+        # tags_as_children are currently not implemented.
+
+        el_lst = []
+        if "xpath" in self.definition:
+            # in this case it does not make sense to use the other "children"-options:
+            # for opt in ("text_", "attribs_", "tags_"):
+            #     if opt + "_as_children" in self.definition:
+            #         raise RuntimeError("The option {} cannot be used when using"
+            #                            " xpath".format(opt))
+            # if xpath is present, use the xpath to generate children
+            children = element.xpath(self.definition["xpath"])
+        else:
+            # use the direct children of the node instead
+            children = list(element)
+        # if "text_as_children" in self.definition and self.definition["text_as_children"] == True:
+        #     el_lst.append(XMLTextNode(element, element.tree))
+        # if "attribs_as_children" in self.definition and self.definition["attribs_as_children"] == True:
+        #     # el_lst.append(XMLTextNode(element, element.tree))
+        for el in children:
+            el_lst.append(XMLTagElement(el, element.tree))
+
+        return el_lst
 
     def typecheck(self, element: StructureElement):
         """
diff --git a/unittests/test_xml_converter.py b/unittests/test_xml_converter.py
index c302e7efb191adcbb6789117db48e127eca94891..b4288cacbb6ca30171214501c04ddd44971116ee 100644
--- a/unittests/test_xml_converter.py
+++ b/unittests/test_xml_converter.py
@@ -133,7 +133,9 @@ def test_simple_xml(basic_xmltag_converter):
     </a>
     """
 
-    tag = XMLTagElement("tag", fromstring(xml_text))
+    xml = fromstring(xml_text)
+
+    tag = XMLTagElement(xml.getroot(), xml)
     m = basic_xmltag_converter.match(tag)
 
     assert m is not None
@@ -202,3 +204,7 @@ def test_nested_simple_xml(basic_xmltag_converter):
 
     general_store = GeneralStore()
     children = basic_xmltag_converter.create_children(general_store, tag)
+
+    assert len(children) == 1
+    assert isinstance(children[0], XMLTagElement)
+    assert children[0].name == "/a/img"