Add XMLTextNodeConverter and XMLAttributeNodeConverter.

NOTE(review): Line breaks and indentation of this patch were reconstructed
from a copy in which all whitespace had been collapsed. Blank lines were
placed so that every hunk matches the line counts of its @@ header. The
indentation of unchanged context lines (cfood-schema.yml, crawl.py,
test_converters.py) is inferred, not known -- confirm it against the target
files before applying. Changes relative to the collapsed copy: a docstring
for XMLAttributeNodeConverter.match (hunk count adjusted accordingly), a raw
string for the regex YAML in test_text_nodes, and the local variable there
renamed to text_converter. The blob ids on the "index" lines are those of
the original patch.

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 1c17b1795dc997a749e04f81dab41ea0dc5aed2a..3488990fda0a75b7169ba82e08c59e1418d55b19 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -9,6 +9,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ### Added ###
 
+- XMLTextNodeConverter for converting text nodes created by XMLTagConverter
+- XMLAttributeNodeConverter for converting attribute nodes created by XMLTagConverter
+
 ### Changed ###
 
 ### Deprecated ###
diff --git a/src/caoscrawler/cfood-schema.yml b/src/caoscrawler/cfood-schema.yml
index acc3911f21d320146d0c35abc9d781541ee151ac..451cb1c0a4fcb44cd31c5db412a411417936333a 100644
--- a/src/caoscrawler/cfood-schema.yml
+++ b/src/caoscrawler/cfood-schema.yml
@@ -76,6 +76,7 @@ cfood:
             - XMLFile
             - XMLTag
             - XMLTextNode
+            - XMLAttributeNode
             - PropertiesFromDictElement
           description: Type of this converter node.
         match:
diff --git a/src/caoscrawler/converters/xml_converter.py b/src/caoscrawler/converters/xml_converter.py
index d1d8b8871f9dad9762f35ee79e1a9106c259f4a9..0f25c0c0947421f0561c42318ac0abddabb447fc 100644
--- a/src/caoscrawler/converters/xml_converter.py
+++ b/src/caoscrawler/converters/xml_converter.py
@@ -195,7 +195,10 @@ class XMLTagConverter(Converter):
 
 class XMLTextNodeConverter(Converter):
     def create_children(self, generalStore: GeneralStore, element: StructureElement):
-        raise NotImplementedError()
+        """
+        This converter does not create children.
+        """
+        return []
 
     def typecheck(self, element: StructureElement):
         """
@@ -210,6 +213,56 @@ class XMLTextNodeConverter(Converter):
         if not isinstance(element, XMLTextNode):
             raise TypeError("Element must be an instance of XMLTextNode.")
 
-        raise NotImplementedError()
+        vardict = {}
+
+        m_text = re.match(self.definition["match_text"], element.value,
+                          re.DOTALL)
+        if m_text is None:
+            return None
+        vardict.update(m_text.groupdict())
+
+        return vardict
+
+
+class XMLAttributeNodeConverter(Converter):
+    def create_children(self, generalStore: GeneralStore, element: StructureElement):
+        """
+        This converter does not create children.
+        """
+        return []
+
+    def typecheck(self, element: StructureElement):
+        """
+        Check whether the current structure element can be converted using
+        this converter.
+        """
+        return isinstance(element, XMLAttributeNode)
+
+    def match(self, element: StructureElement) -> Optional[dict]:
+        """
+        Match the attribute name against ``match_name`` and the attribute
+        value against ``match_value`` (both via ``re.match``, i.e. anchored
+        at the start of the string only).
+
+        Return the named groups of both patterns in one dict, or None if
+        either pattern does not match. Raise a TypeError if element is not
+        an XMLAttributeNode.
+        """
+        # See https://gitlab.indiscale.com/caosdb/src/caosdb-crawler/-/issues/145
+        # for a suggestion for the design of the matching algorithm.
+        if not isinstance(element, XMLAttributeNode):
+            raise TypeError("Element must be an instance of XMLAttributeNode.")
+
+        vardict = {}
 
-        return None
+        m_name = re.match(self.definition["match_name"], element.key)
+        if m_name is None:
+            return None
+        vardict.update(m_name.groupdict())
+
+        m_value = re.match(self.definition["match_value"], element.value)
+        if m_value is None:
+            return None
+        vardict.update(m_value.groupdict())
+
+        return vardict
diff --git a/src/caoscrawler/crawl.py b/src/caoscrawler/crawl.py
index a449a779ed7979719bbe0ac780adecf0e4fec8f6..89b5ba001ed446c7f5a0f261898deecd3e7a5e00 100644
--- a/src/caoscrawler/crawl.py
+++ b/src/caoscrawler/crawl.py
@@ -598,6 +598,9 @@ one with the entities that need to be updated and the other with entities to be
                                 unique_names=True):
         Crawler.set_ids_and_datatype_of_parents_and_properties(to_be_updated)
         logger.debug("UPDATE")
+        # Here, it's probably much more reasonable to show a diff of the update:
+        # from linkahead.apiutils import compare_entities
+        # [compare_entities(c, db.Record(id=c.id).retrieve()) for c in to_be_updated]
         logger.debug(to_be_updated)
         if len(to_be_updated) > 0:
             if securityMode.value > SecurityMode.INSERT.value:
diff --git a/src/caoscrawler/default_converters.yml b/src/caoscrawler/default_converters.yml
index a78c1579fc05c2ede424c076e7590d25550ea2f3..656b0ba0f1f76007266cc8b2e75f5bd7046f1206 100644
--- a/src/caoscrawler/default_converters.yml
+++ b/src/caoscrawler/default_converters.yml
@@ -111,3 +111,7 @@ XMLTag:
 XMLTextNode:
   converter: XMLTextNodeConverter
   package: caoscrawler.converters
+
+XMLAttributeNode:
+  converter: XMLAttributeNodeConverter
+  package: caoscrawler.converters
diff --git a/unittests/test_converters.py b/unittests/test_converters.py
index 0522c4e6fd31239b9b3ae1f803ef5799ad2c5423..f1b74d89ffd367849b85433e0e456ae8a78134a8 100644
--- a/unittests/test_converters.py
+++ b/unittests/test_converters.py
@@ -643,7 +643,7 @@ def test_load_converters():
     # converter classes can be loaded from their respective packages.
 
     # Please adapt, if defaults change!
-    assert len(converter_registry) == 28
+    assert len(converter_registry) == 29
 
     # All of them are contained in caoscrawler.converters
     # except for the xml converters:
diff --git a/unittests/test_xml_converter.py b/unittests/test_xml_converter.py
index fb4c7746fa2d0b6c3d4ec95fc1de3139493a703f..9fc9749ccececd41d460fe297edfea72cc30a5ef 100644
--- a/unittests/test_xml_converter.py
+++ b/unittests/test_xml_converter.py
@@ -32,7 +32,9 @@ import yaml
 from lxml.etree import fromstring
 from pathlib import Path
 
-from caoscrawler.converters import XMLTagConverter
+from caoscrawler.converters import (XMLTagConverter,
+                                    XMLAttributeNodeConverter,
+                                    XMLTextNodeConverter)
 from caoscrawler.scanner import load_definition
 from caoscrawler.stores import GeneralStore
 from caoscrawler.structure_elements import XMLTagElement
@@ -51,6 +53,9 @@ def converter_registry():
         "XMLTextNode": {
             "converter": "XMLTextNodeConverter",
             "package": "caoscrawler.converters"},
+        "XMLAttributeNode": {
+            "converter": "XMLAttributeNodeConverter",
+            "package": "caoscrawler.converters"},
     }
 
     for key, value in converter_registry.items():
@@ -294,3 +299,85 @@ nsmap:
     children = converter.create_children(GeneralStore(), tag)
     assert len(children) == 1
     assert children[0].name == "{default-namespace}node1[2]/{sub-namespace}node2"
+
+
+def test_attrib_nodes(converter_registry):
+    """
+    Test attribute node converters.
+    """
+
+    xml_text = """
+    <node1 active="true" size="45">
+        Bla
+    </node1>
+"""
+
+    converter = XMLTagConverter(yaml.safe_load("""
+type: XMLTag
+match_tag: "node1"
+xpath: .
+tags_as_children: false
+attribs_as_children: true
+"""), "TestXMLTagConverter", converter_registry)
+
+    tag = XMLTagElement(fromstring(xml_text))
+    m = converter.match(tag)
+    assert m is not None
+    children = converter.create_children(GeneralStore(), tag)
+    assert len(children) == 2
+
+    attrib_converter = XMLAttributeNodeConverter(yaml.safe_load("""
+type: XMLAttributeNode
+match_name: active
+match_value: (?P<val>.*)
+"""), "TestXMLAttributeNodeConverter", converter_registry)
+    m = attrib_converter.match(children[1])
+    assert m is None
+    m = attrib_converter.match(children[0])
+    assert m is not None
+    assert m["val"] == "true"
+
+    attrib_converter = XMLAttributeNodeConverter(yaml.safe_load("""
+type: XMLAttributeNode
+match_name: size
+match_value: (?P<val>.*)
+"""), "TestXMLAttributeNodeConverter", converter_registry)
+    m = attrib_converter.match(children[0])
+    assert m is None
+    m = attrib_converter.match(children[1])
+    assert m is not None
+    assert m["val"] == "45"
+
+
+def test_text_nodes(converter_registry):
+    """
+    Test text node converters.
+    """
+
+    xml_text = """
+    <node1 active="true" size="45">
+        Bla
+    </node1>
+"""
+
+    converter = XMLTagConverter(yaml.safe_load("""
+type: XMLTag
+match_tag: "node1"
+xpath: .
+tags_as_children: false
+text_as_children: true
+"""), "TestXMLTagConverter", converter_registry)
+
+    tag = XMLTagElement(fromstring(xml_text))
+    m = converter.match(tag)
+    assert m is not None
+    children = converter.create_children(GeneralStore(), tag)
+    assert len(children) == 1
+
+    text_converter = XMLTextNodeConverter(yaml.safe_load(r"""
+type: XMLTextNode
+match_text: \s*(?P<val>\w*)\s*
+"""), "TestXMLTextNodeConverter", converter_registry)
+    m = text_converter.match(children[0])
+    assert m is not None
+    assert m["val"] == "Bla"