def match_properties(self, properties: dict, vardict: dict,
                     label: str = "match_properties") -> bool:
    """Match the ``match_properties`` section of the cfood definition against ``properties``.

    ``match_properties`` is a dictionary of key-regexp / value-regexp pairs. Each key-regexp
    is matched against the property names and the corresponding value-regexp is matched
    against the value of the property whose name matched.

    What a property means depends on the converter, for example:

    * XMLTag: attributes of the node
    * ROCrate: properties of the ROCrateEntity
    * DictElement: properties of the dict

    This method is not called by default, but can be called from child classes.
    Typically it would be used like this from methods overwriting `match`::

        if not self.match_properties(<properties>, vardict):
            return None

    where <properties> is a dictionary taken from the structure element that contains the
    properties in the context of this converter.

    Property names and values are converted with ``str()`` before matching, and
    ``re.match`` is used, so every regexp is anchored at the start of the string only.

    Parameters
    ----------

    properties: dict
        The dictionary containing the properties to be matched.

    vardict: dict
        This dictionary will be used to store the variables created during the matching.
        It is updated in place, entry by entry: if a later entry of the definition fails
        to match, the named groups of earlier entries have already been stored.

    label: str
        Default "match_properties". Can be used to change the name
        of the property in the definition. E.g. the xml converter
        uses "match_attrib" which makes more sense in the context
        of xml trees.

    Returns
    -------

    : bool
        True when the definition has no ``label`` section or when every entry of
        that section matches exactly one property (name and value). False when an
        entry matches no property name or when the value of a matching property
        does not match the value-regexp.

    Raises
    ------

    RuntimeError
        If more than one property name matches the same key-regexp.

    """
    if label not in self.definition:
        # Nothing was requested in the cfood, so there is nothing that could fail.
        return True

    # This matcher works analogously to the attributes matcher in the XMLConverter
    for prop_def_key, prop_def_value in self.definition[label].items():
        match_counter = 0
        matched_m_prop = None
        matched_m_prop_value = None
        for prop_key, prop_value in properties.items():
            # TODO: automatic conversion to str ok?
            m_prop = re.match(prop_def_key, str(prop_key))
            if m_prop is None:
                continue
            match_counter += 1
            matched_m_prop = m_prop
            # TODO: automatic conversion to str ok?
            m_prop_value = re.match(prop_def_value, str(prop_value))
            if m_prop_value is None:
                return False
            matched_m_prop_value = m_prop_value
        # TODO: How to deal with multiple matches?
        #       There are multiple options:
        #       - Allow multiple attribute-key matches: Leads to possible overwrites of variables
        #       - Require unique attribute-key and attribute-value matches: Very complex
        #       - Only allow one single attribute-key to match and run attribute-value match
        #         separately.
        #       Currently the latter option is implemented.
        if match_counter == 0:
            return False
        if match_counter > 1:
            raise RuntimeError("Multiple properties match the same {} entry.".format(label))
        vardict.update(matched_m_prop.groupdict())
        vardict.update(matched_m_prop_value.groupdict())
    return True
SimpleFileConverter, convert_basic_element) +from .converters import Converter, SimpleFileConverter, convert_basic_element class ROCrateConverter(SimpleFileConverter): @@ -169,33 +167,24 @@ class ROCrateEntityConverter(Converter): # Store the result of all individual regexp variable results: vardict = {} + # TODO: I accidentally used "match_type" instead + # of "match_entity_type". This was completely + # unnoticed. So add it to schema and adapt tests. + if "match_entity_type" in self.definition: - m_type = re.match(self.definition["match_entity_type"], element.type) + entity_type = element.entity.type + if isinstance(entity_type, list): + # TODO: this seems to be a bug in kadi4mat RO-Crates + # ./ has type ['Dataset'] + # instead of type 'Dataset' + entity_type = entity_type[0] + m_type = re.match(self.definition["match_entity_type"], entity_type) if m_type is None: return None vardict.update(m_type.groupdict()) - if "match_properties" in self.definition: - # This matcher works analogously to the attributes matcher in the XMLConverter - for prop_def_key, prop_def_value in self.definition["match_properties"].items(): - match_counter = 0 - matched_m_prop = None - matched_m_prop_value = None - for prop_key, prop_value in element.entity.properties().items(): - m_prop = re.match(prop_def_key, prop_key) - if m_prop is not None: - match_counter += 1 - matched_m_prop = m_prop - m_prop_value = re.match(prop_def_value, prop_value) - if m_prop_value is None: - return None - matched_m_prop_value = m_prop_value - if match_counter == 0: - return None - elif match_counter > 1: - raise RuntimeError("Multiple properties match the same match_prop entry.") - vardict.update(matched_m_prop.groupdict()) - vardict.update(matched_m_prop_value.groupdict()) + if not self.match_properties(element.entity.properties(), vardict): + return None return vardict diff --git a/src/caoscrawler/converters/xml_converter.py b/src/caoscrawler/converters/xml_converter.py index 
b9f7487ee633d0ba25a3b81b78b9a3561274edc9..60d7b49431fb011a06b7105a16471b0b3c7b2268 100644 --- a/src/caoscrawler/converters/xml_converter.py +++ b/src/caoscrawler/converters/xml_converter.py @@ -25,10 +25,9 @@ from __future__ import annotations import re from typing import Optional -import linkahead as db import lxml.etree -from ..stores import GeneralStore, RecordStore +from ..stores import GeneralStore from ..structure_elements import (File, StructureElement, XMLAttributeNode, XMLTagElement, XMLTextNode) from .converters import (Converter, ConverterValidationError, @@ -163,33 +162,8 @@ class XMLTagConverter(Converter): return None vardict.update(m_text.groupdict()) - if "match_attrib" in self.definition: - for attrib_def_key, attrib_def_value in self.definition["match_attrib"].items(): - match_counter = 0 - matched_m_attrib = None - matched_m_attrib_value = None - for attr_key, attr_value in element.tag.attrib.items(): - m_attrib = re.match(attrib_def_key, attr_key) - if m_attrib is not None: - match_counter += 1 - matched_m_attrib = m_attrib - m_attrib_value = re.match(attrib_def_value, attr_value) - if m_attrib_value is None: - return None - matched_m_attrib_value = m_attrib_value - # TODO: How to deal with multiple matches? - # There are multiple options: - # - Allow multiple attribute-key matches: Leads to possible overwrites of variables - # - Require unique attribute-key and attribute-value matches: Very complex - # - Only allow one single attribute-key to match and run attribute-value match separately. - # Currently the latter option is implemented. - # TODO: The ROCrateEntityConverter implements a very similar behavior. 
- if match_counter == 0: - return None - elif match_counter > 1: - raise RuntimeError("Multiple attributes match the same match_attrib entry.") - vardict.update(matched_m_attrib.groupdict()) - vardict.update(matched_m_attrib_value.groupdict()) + if not self.match_properties(element.tag.attrib, vardict, "match_attrib"): + return None return vardict diff --git a/src/doc/cfood.rst b/src/doc/cfood.rst index a42d593035bd37d0712986c958fb8ad7ad287968..0c7726d2017b955ecd7472d57dc259ff9a7bab53 100644 --- a/src/doc/cfood.rst +++ b/src/doc/cfood.rst @@ -207,9 +207,9 @@ following. ValueWithUnitElt: type: TextElement match_name: ^my_prop$ - match_value: "^(?P<number>\\d+\\.?\\d*)\s+(?P<unit>.+)" # Extract value and unit from a string which - # has a number followed by at least one whitespace - # character followed by a unit. + match_value: "^(?P<number>\\d+\\.?\\d*)\\s+(?P<unit>.+)" # Extract value and unit from a string which + # has a number followed by at least one whitespace + # character followed by a unit. records: MyRecord: MyProp: diff --git a/src/doc/converters/standard_converters.rst b/src/doc/converters/standard_converters.rst index b988172dc5a4e0efd77631013085d776cc2527f6..0520f56b619dcb9329e56e857453a6c1ca2f1d97 100644 --- a/src/doc/converters/standard_converters.rst +++ b/src/doc/converters/standard_converters.rst @@ -41,6 +41,41 @@ The following StructureElement types are typically created by the DictElement co Note that you may use ``TextElement`` for anything that exists in a text format that can be interpreted by the server, such as date and datetime strings in ISO-8601 format. +match_properties +---------------- + +`match_properties` is a dictionary of key-regexps and value-regexp pairs and can be used to +match direct properties of a `DictElement`. Each key matches +a property name and the corresponding value matches its property value. + +Example: +........ + +.. 
def test_dict_match_properties(converter_registry):
    """Check ``match_properties`` on a DictElement converter.

    Three scenarios are covered with the same definition:

    1. every entry matches exactly one property -> one record is created,
    2. one entry has no matching property -> no record is created,
    3. one entry matches two property names -> a RuntimeError is raised.
    """
    def_dict = {
        "RootElt": {
            # Root dictionary
            "type": "DictElement",
            "match_properties": {
                "prop_a": "(?P<a>.*)$",
                "prop_[^ac]": "(?P<b>.*)$",
                "prop_c": "(?P<c>.*)$",
            },
            "records": {
                # Define top-level, use below in subtrees
                "MyRec": {
                    "prop_a": "$a",
                    "prop_b": "$b",
                    "$a": "$c"
                }
            }}}

    # 1. All three entries match exactly one property each.
    root_dict_element = DictElement("RootDict", {
        "prop_a": "value",
        "prop_b": "25",
        "prop_c": 24
    })
    records = scan_structure_elements(root_dict_element, def_dict, converter_registry)
    assert len(records) == 1
    record = records[0]
    assert record.get_property("prop_a").value == "value"
    assert record.get_property("prop_b").value == "25"
    assert record.get_property("value").value == "24"  # Note the type change here

    # 2. "prop_c" is missing, so the converter must not match at all.
    root_dict_element = DictElement("RootDict", {
        "prop_a": "value",
        "prop_b": "25",
        # Property missing
    })
    records = scan_structure_elements(root_dict_element, def_dict, converter_registry)
    assert len(records) == 0

    # 3. Both "prop_b" and "prop_d" match the "prop_[^ac]" entry.
    root_dict_element = DictElement("RootDict", {
        "prop_a": "value",
        "prop_b": "25",
        "prop_d": 24  # duplicate matches
    })
    # Only the call that is expected to raise lives inside the context manager, so an
    # unrelated RuntimeError from the setup above cannot make this test pass by accident.
    with pytest.raises(RuntimeError,
                       match="Multiple properties match the same match_properties entry."):
        scan_structure_elements(root_dict_element, def_dict, converter_registry)