From 4fb3afd3a37f8b3bc1b4268ce22a4f7d8d51422c Mon Sep 17 00:00:00 2001
From: Alexander Schlemmer <a.schlemmer@indiscale.com>
Date: Fri, 4 Oct 2024 15:17:13 +0200
Subject: [PATCH] ENH: implement xml attribute node converter and xml text node converter

---
NOTE(review): the copy of this patch under review had its line breaks and
indentation collapsed. The layout below is rebuilt so that every hunk agrees
with its @@ header and with the diffstat (142 insertions, 4 deletions); the
payload itself is unchanged. Leading whitespace on context lines is inferred,
so confirm against commit 4fb3afd3 or apply with
`git apply --ignore-whitespace`.

Review comments for a follow-up (deliberately not applied here, so that the
blob ids on the index lines stay valid):
 * unittests/test_xml_converter.py, test_text_nodes: the match_text pattern
   contains \s and \w inside a non-raw string literal. These are invalid
   escape sequences in Python and trigger a warning when the module is
   compiled; the literal should be a raw string.
 * unittests/test_xml_converter.py, test_text_nodes: the XMLTextNodeConverter
   instance is bound to the name attrib_converter.
 * xml_converter.py: both new match() bodies index self.definition[...]
   directly, so a definition without match_text / match_name / match_value
   raises KeyError instead of simply not matching. TODO confirm that these
   keys are meant to be mandatory.
 * xml_converter.py: XMLTextNodeConverter.match passes re.DOTALL while
   XMLAttributeNodeConverter.match passes no flags; presumably intentional,
   verify.

 src/caoscrawler/converters/xml_converter.py | 50 +++++++++++-
 src/caoscrawler/crawl.py                    |  3 +
 src/caoscrawler/default_converters.yml      |  4 +
 unittests/test_xml_converter.py             | 89 ++++++++++++++++++++-
 4 files changed, 142 insertions(+), 4 deletions(-)

diff --git a/src/caoscrawler/converters/xml_converter.py b/src/caoscrawler/converters/xml_converter.py
index d1d8b887..0f25c0c0 100644
--- a/src/caoscrawler/converters/xml_converter.py
+++ b/src/caoscrawler/converters/xml_converter.py
@@ -195,7 +195,10 @@ class XMLTagConverter(Converter):
 
 class XMLTextNodeConverter(Converter):
     def create_children(self, generalStore: GeneralStore, element: StructureElement):
-        raise NotImplementedError()
+        """
+        This converter does not create children.
+        """
+        return []
 
     def typecheck(self, element: StructureElement):
         """
@@ -210,6 +213,47 @@ class XMLTextNodeConverter(Converter):
         if not isinstance(element, XMLTextNode):
             raise TypeError("Element must be an instance of XMLTextNode.")
 
-        raise NotImplementedError()
+        vardict = {}
+
+        m_text = re.match(self.definition["match_text"], element.value,
+                          re.DOTALL)
+        if m_text is None:
+            return None
+        vardict.update(m_text.groupdict())
+
+        return vardict
+
+
+class XMLAttributeNodeConverter(Converter):
+    def create_children(self, generalStore: GeneralStore, element: StructureElement):
+        """
+        This converter does not create children.
+        """
+        return []
+
+    def typecheck(self, element: StructureElement):
+        """
+        Check whether the current structure element can be converted using
+        this converter.
+        """
+        return isinstance(element, XMLAttributeNode)
+
+    def match(self, element: StructureElement) -> Optional[dict]:
+        # See https://gitlab.indiscale.com/caosdb/src/caosdb-crawler/-/issues/145
+        # for a suggestion for the design of the matching algorithm.
+        if not isinstance(element, XMLAttributeNode):
+            raise TypeError("Element must be an instance of XMLAttributeNode.")
+
+        vardict = {}
 
-        return None
+        m_name = re.match(self.definition["match_name"], element.key)
+        if m_name is None:
+            return None
+        vardict.update(m_name.groupdict())
+
+        m_value = re.match(self.definition["match_value"], element.value)
+        if m_value is None:
+            return None
+        vardict.update(m_value.groupdict())
+
+        return vardict
diff --git a/src/caoscrawler/crawl.py b/src/caoscrawler/crawl.py
index a449a779..89b5ba00 100644
--- a/src/caoscrawler/crawl.py
+++ b/src/caoscrawler/crawl.py
@@ -598,6 +598,9 @@ one with the entities that need to be updated and the other with entities to be
                                 unique_names=True):
         Crawler.set_ids_and_datatype_of_parents_and_properties(to_be_updated)
         logger.debug("UPDATE")
+        # Here, it's probably much more reasonable to show a diff of the update:
+        # from linkahead.apiutils import compare_entities
+        # [compare_entities(c, db.Record(id=c.id).retrieve()) for c in to_be_updated]
         logger.debug(to_be_updated)
         if len(to_be_updated) > 0:
             if securityMode.value > SecurityMode.INSERT.value:
diff --git a/src/caoscrawler/default_converters.yml b/src/caoscrawler/default_converters.yml
index a78c1579..656b0ba0 100644
--- a/src/caoscrawler/default_converters.yml
+++ b/src/caoscrawler/default_converters.yml
@@ -111,3 +111,7 @@ XMLTag:
 XMLTextNode:
   converter: XMLTextNodeConverter
   package: caoscrawler.converters
+
+XMLAttributeNode:
+  converter: XMLAttributeNodeConverter
+  package: caoscrawler.converters
diff --git a/unittests/test_xml_converter.py b/unittests/test_xml_converter.py
index fb4c7746..9fc9749c 100644
--- a/unittests/test_xml_converter.py
+++ b/unittests/test_xml_converter.py
@@ -32,7 +32,9 @@ import yaml
 from lxml.etree import fromstring
 from pathlib import Path
 
-from caoscrawler.converters import XMLTagConverter
+from caoscrawler.converters import (XMLTagConverter,
+                                    XMLAttributeNodeConverter,
+                                    XMLTextNodeConverter)
 from caoscrawler.scanner import load_definition
 from caoscrawler.stores import GeneralStore
 from caoscrawler.structure_elements import XMLTagElement
@@ -51,6 +53,9 @@ def converter_registry():
         "XMLTextNode": {
             "converter": "XMLTextNodeConverter",
             "package": "caoscrawler.converters"},
+        "XMLAttributeNode": {
+            "converter": "XMLAttributeNodeConverter",
+            "package": "caoscrawler.converters"},
     }
 
     for key, value in converter_registry.items():
@@ -294,3 +299,85 @@ nsmap:
     children = converter.create_children(GeneralStore(), tag)
     assert len(children) == 1
     assert children[0].name == "{default-namespace}node1[2]/{sub-namespace}node2"
+
+
+def test_attrib_nodes(converter_registry):
+    """
+    Test attribute node converters.
+    """
+
+    xml_text = """
+    <node1 active="true" size="45">
+        Bla
+    </node1>
+"""
+
+    converter = XMLTagConverter(yaml.safe_load("""
+type: XMLTag
+match_tag: "node1"
+xpath: .
+tags_as_children: false
+attribs_as_children: true
+"""), "TestXMLTagConverter", converter_registry)
+
+    tag = XMLTagElement(fromstring(xml_text))
+    m = converter.match(tag)
+    assert m is not None
+    children = converter.create_children(GeneralStore(), tag)
+    assert len(children) == 2
+
+    attrib_converter = XMLAttributeNodeConverter(yaml.safe_load("""
+type: XMLAttributeNode
+match_name: active
+match_value: (?P<val>.*)
+"""), "TestXMLAttributeNodeConverter", converter_registry)
+    m = attrib_converter.match(children[1])
+    assert m is None
+    m = attrib_converter.match(children[0])
+    assert m is not None
+    assert m["val"] == "true"
+
+    attrib_converter = XMLAttributeNodeConverter(yaml.safe_load("""
+type: XMLAttributeNode
+match_name: size
+match_value: (?P<val>.*)
+"""), "TestXMLAttributeNodeConverter", converter_registry)
+    m = attrib_converter.match(children[0])
+    assert m is None
+    m = attrib_converter.match(children[1])
+    assert m is not None
+    assert m["val"] == "45"
+
+
+def test_text_nodes(converter_registry):
+    """
+    Test text node converters.
+    """
+
+    xml_text = """
+    <node1 active="true" size="45">
+        Bla
+    </node1>
+"""
+
+    converter = XMLTagConverter(yaml.safe_load("""
+type: XMLTag
+match_tag: "node1"
+xpath: .
+tags_as_children: false
+text_as_children: true
+"""), "TestXMLTagConverter", converter_registry)
+
+    tag = XMLTagElement(fromstring(xml_text))
+    m = converter.match(tag)
+    assert m is not None
+    children = converter.create_children(GeneralStore(), tag)
+    assert len(children) == 1
+
+    attrib_converter = XMLTextNodeConverter(yaml.safe_load("""
+type: XMLTextNode
+match_text: \s*(?P<val>\w*)\s*
+"""), "TestXMLTextNodeConverter", converter_registry)
+    m = attrib_converter.match(children[0])
+    assert m is not None
+    assert m["val"] == "Bla"
-- 
GitLab