diff --git a/src/caoscrawler/converters/xml_converter.py b/src/caoscrawler/converters/xml_converter.py index d1d8b8871f9dad9762f35ee79e1a9106c259f4a9..0f25c0c0947421f0561c42318ac0abddabb447fc 100644 --- a/src/caoscrawler/converters/xml_converter.py +++ b/src/caoscrawler/converters/xml_converter.py @@ -195,7 +195,10 @@ class XMLTagConverter(Converter): class XMLTextNodeConverter(Converter): def create_children(self, generalStore: GeneralStore, element: StructureElement): - raise NotImplementedError() + """ + This converter does not create children. + """ + return [] def typecheck(self, element: StructureElement): """ @@ -210,6 +213,47 @@ class XMLTextNodeConverter(Converter): if not isinstance(element, XMLTextNode): raise TypeError("Element must be an instance of XMLTextNode.") - raise NotImplementedError() + vardict = {} + + m_text = re.match(self.definition["match_text"], element.value, + re.DOTALL) + if m_text is None: + return None + vardict.update(m_text.groupdict()) + + return vardict + + +class XMLAttributeNodeConverter(Converter): + def create_children(self, generalStore: GeneralStore, element: StructureElement): + """ + This converter does not create children. + """ + return [] + + def typecheck(self, element: StructureElement): + """ + Check whether the current structure element can be converted using + this converter. + """ + return isinstance(element, XMLAttributeNode) + + def match(self, element: StructureElement) -> Optional[dict]: + # See https://gitlab.indiscale.com/caosdb/src/caosdb-crawler/-/issues/145 + # for a suggestion for the design of the matching algorithm. + if not isinstance(element, XMLAttributeNode): + raise TypeError("Element must be an instance of XMLAttributeNode.") + + vardict = {} - return None + m_name = re.match(self.definition["match_name"], element.key) + if m_name is None: + return None + vardict.update(m_name.groupdict()) + + m_value = re.match(self.definition["match_value"], element.value) + if m_value is None: + return None + vardict.update(m_value.groupdict()) + + return vardict diff --git a/src/caoscrawler/crawl.py b/src/caoscrawler/crawl.py index a449a779ed7979719bbe0ac780adecf0e4fec8f6..89b5ba001ed446c7f5a0f261898deecd3e7a5e00 100644 --- a/src/caoscrawler/crawl.py +++ b/src/caoscrawler/crawl.py @@ -598,6 +598,9 @@ one with the entities that need to be updated and the other with entities to be unique_names=True): Crawler.set_ids_and_datatype_of_parents_and_properties(to_be_updated) logger.debug("UPDATE") + # Here, it's probably much more reasonable to show a diff of the update: + # from linkahead.apiutils import compare_entities + # [compare_entities(c, db.Record(id=c.id).retrieve()) for c in to_be_updated] logger.debug(to_be_updated) if len(to_be_updated) > 0: if securityMode.value > SecurityMode.INSERT.value: diff --git a/src/caoscrawler/default_converters.yml b/src/caoscrawler/default_converters.yml index a78c1579fc05c2ede424c076e7590d25550ea2f3..656b0ba0f1f76007266cc8b2e75f5bd7046f1206 100644 --- a/src/caoscrawler/default_converters.yml +++ b/src/caoscrawler/default_converters.yml @@ -111,3 +111,7 @@ XMLTag: XMLTextNode: converter: XMLTextNodeConverter package: caoscrawler.converters + +XMLAttributeNode: + converter: XMLAttributeNodeConverter + package: caoscrawler.converters diff --git a/unittests/test_xml_converter.py b/unittests/test_xml_converter.py index fb4c7746fa2d0b6c3d4ec95fc1de3139493a703f..9fc9749ccececd41d460fe297edfea72cc30a5ef 100644 --- a/unittests/test_xml_converter.py +++ b/unittests/test_xml_converter.py @@ -32,7 +32,9 @@ import yaml from lxml.etree import fromstring from pathlib import Path -from caoscrawler.converters import XMLTagConverter +from caoscrawler.converters import (XMLTagConverter, + XMLAttributeNodeConverter, + XMLTextNodeConverter) from caoscrawler.scanner import load_definition from caoscrawler.stores import GeneralStore from caoscrawler.structure_elements import XMLTagElement @@ -51,6 +53,9 @@ def converter_registry(): "XMLTextNode": { "converter": "XMLTextNodeConverter", "package": "caoscrawler.converters"}, + "XMLAttributeNode": { + "converter": "XMLAttributeNodeConverter", + "package": "caoscrawler.converters"}, } for key, value in converter_registry.items(): @@ -294,3 +299,85 @@ nsmap: children = converter.create_children(GeneralStore(), tag) assert len(children) == 1 assert children[0].name == "{default-namespace}node1[2]/{sub-namespace}node2" + + +def test_attrib_nodes(converter_registry): + """ + Test attribute node converters. + """ + + xml_text = """ + <node1 active="true" size="45"> + Bla + </node1> +""" + + converter = XMLTagConverter(yaml.safe_load(""" +type: XMLTag +match_tag: "node1" +xpath: . +tags_as_children: false +attribs_as_children: true +"""), "TestXMLTagConverter", converter_registry) + + tag = XMLTagElement(fromstring(xml_text)) + m = converter.match(tag) + assert m is not None + children = converter.create_children(GeneralStore(), tag) + assert len(children) == 2 + + attrib_converter = XMLAttributeNodeConverter(yaml.safe_load(""" +type: XMLAttributeNode +match_name: active +match_value: (?P<val>.*) +"""), "TestXMLAttributeNodeConverter", converter_registry) + m = attrib_converter.match(children[1]) + assert m is None + m = attrib_converter.match(children[0]) + assert m is not None + assert m["val"] == "true" + + attrib_converter = XMLAttributeNodeConverter(yaml.safe_load(""" +type: XMLAttributeNode +match_name: size +match_value: (?P<val>.*) +"""), "TestXMLAttributeNodeConverter", converter_registry) + m = attrib_converter.match(children[0]) + assert m is None + m = attrib_converter.match(children[1]) + assert m is not None + assert m["val"] == "45" + + +def test_text_nodes(converter_registry): + """ + Test text node converters. + """ + + xml_text = """ + <node1 active="true" size="45"> + Bla + </node1> +""" + + converter = XMLTagConverter(yaml.safe_load(""" +type: XMLTag +match_tag: "node1" +xpath: . +tags_as_children: false +text_as_children: true +"""), "TestXMLTagConverter", converter_registry) + + tag = XMLTagElement(fromstring(xml_text)) + m = converter.match(tag) + assert m is not None + children = converter.create_children(GeneralStore(), tag) + assert len(children) == 1 + + attrib_converter = XMLTextNodeConverter(yaml.safe_load(""" +type: XMLTextNode +match_text: \s*(?P<val>\w*)\s* +"""), "TestXMLTextNodeConverter", converter_registry) + m = attrib_converter.match(children[0]) + assert m is not None + assert m["val"] == "Bla"