def match_properties(self, properties: dict, vardict: dict, label: str = "match_properties"):
    """
    Generically match the 'match_properties' entry of the cfood definition against
    a dictionary of properties.

    'match_properties' is a dictionary of key-regexp and value-regexp pairs. Each key-regexp
    has to match exactly one property name and the corresponding value-regexp has to match
    the value of that property.

    What a property means in the context of the respective converter can be different, examples:
    XMLTag: attributes of the node
    ROCrate: properties of the ROCrateEntity
    DictElement: properties of the dict

    This method is not called by default, but can be called from child classes. Callers
    must check the return value: a falsy result means "no match", and the calling
    ``match`` implementation is expected to return None in that case.

    Parameters
    ----------

    properties: dict
        The dictionary containing the properties to be matched. Names and values are
        passed to ``re.match`` unchanged, so they have to be strings.

    vardict: dict
        This dictionary will be used to store the variables (named groups) created during
        the matching. It is only modified when all entries matched, i.e. when True is
        returned.

    label: str
        Default "match_properties". Can be used to change the name of the property in the
        definition. E.g. the xml converter uses "match_attrib" which makes more sense in the
        context of xml trees.

    Returns
    -------

    bool
        True if ``label`` is not part of the definition or if every entry matched,
        False otherwise.

    Raises
    ------

    RuntimeError
        If more than one property name matches the same key-regexp.
    """
    if label not in self.definition:
        # Nothing to match, so there is nothing that could fail.
        return True

    # Collect the variables locally first; a failing entry must not leave
    # variables of earlier (successful) entries behind in vardict.
    collected = {}
    for prop_def_key, prop_def_value in self.definition[label].items():
        match_counter = 0
        matched_m_prop = None
        matched_m_prop_value = None
        for prop_key, prop_value in properties.items():
            m_prop = re.match(prop_def_key, prop_key)
            if m_prop is None:
                continue
            match_counter += 1
            matched_m_prop = m_prop
            m_prop_value = re.match(prop_def_value, prop_value)
            if m_prop_value is None:
                return False
            matched_m_prop_value = m_prop_value
        # TODO: How to deal with multiple matches?
        #       There are multiple options:
        #       - Allow multiple attribute-key matches: Leads to possible overwrites of variables
        #       - Require unique attribute-key and attribute-value matches: Very complex
        #       - Only allow one single attribute-key to match and run attribute-value match
        #         separately.
        #       Currently the latter option is implemented.
        if match_counter == 0:
            return False
        if match_counter > 1:
            raise RuntimeError("Multiple properties match the same {} entry.".format(label))
        collected.update(matched_m_prop.groupdict())
        collected.update(matched_m_prop_value.groupdict())

    vardict.update(collected)
    return True
diff --git a/src/caoscrawler/converters/rocrate.py b/src/caoscrawler/converters/rocrate.py index 286061ef..e940ba83 100644 --- a/src/caoscrawler/converters/rocrate.py +++ b/src/caoscrawler/converters/rocrate.py @@ -176,27 +176,7 @@ class ROCrateEntityConverter(Converter): return None vardict.update(m_type.groupdict()) - if "match_properties" in self.definition: - # This matcher works analogously to the attributes matcher in the XMLConverter - for prop_def_key, prop_def_value in self.definition["match_properties"].items(): - match_counter = 0 - matched_m_prop = None - matched_m_prop_value = None - for prop_key, prop_value in element.entity.properties().items(): - m_prop = re.match(prop_def_key, prop_key) - if m_prop is not None: - match_counter += 1 - matched_m_prop = m_prop - m_prop_value = re.match(prop_def_value, prop_value) - if m_prop_value is None: - return None - matched_m_prop_value = m_prop_value - if match_counter == 0: - return None - elif match_counter > 1: - raise RuntimeError("Multiple properties match the same match_prop entry.") - vardict.update(matched_m_prop.groupdict()) - vardict.update(matched_m_prop_value.groupdict()) + self.match_properties(element.entity.properties(), vardict) return vardict diff --git a/src/caoscrawler/converters/xml_converter.py b/src/caoscrawler/converters/xml_converter.py index bd3f6cf0..76d5afff 100644 --- a/src/caoscrawler/converters/xml_converter.py +++ b/src/caoscrawler/converters/xml_converter.py @@ -163,33 +163,7 @@ class XMLTagConverter(Converter): return None vardict.update(m_text.groupdict()) - if "match_attrib" in self.definition: - for attrib_def_key, attrib_def_value in self.definition["match_attrib"].items(): - match_counter = 0 - matched_m_attrib = None - matched_m_attrib_value = None - for attr_key, attr_value in element.tag.attrib.items(): - m_attrib = re.match(attrib_def_key, attr_key) - if m_attrib is not None: - match_counter += 1 - matched_m_attrib = m_attrib - m_attrib_value = 
re.match(attrib_def_value, attr_value) - if m_attrib_value is None: - return None - matched_m_attrib_value = m_attrib_value - # TODO: How to deal with multiple matches? - # There are multiple options: - # - Allow multiple attribute-key matches: Leads to possible overwrites of variables - # - Require unique attribute-key and attribute-value matches: Very complex - # - Only allow one single attribute-key to match and run attribute-value match separately. - # Currently the latter option is implemented. - # TODO: The ROCrateEntityConverter implements a very similar behavior. - if match_counter == 0: - return None - elif match_counter > 1: - raise RuntimeError("Multiple attributes match the same match_attrib entry.") - vardict.update(matched_m_attrib.groupdict()) - vardict.update(matched_m_attrib_value.groupdict()) + self.match_properties(element.tag.attrib, vardict, "match_attrib") return vardict diff --git a/src/caoscrawler/crawl.py b/src/caoscrawler/crawl.py index 9e4e2a80..a79e4434 100644 --- a/src/caoscrawler/crawl.py +++ b/src/caoscrawler/crawl.py @@ -39,7 +39,6 @@ import sys import traceback import uuid import warnings - from argparse import RawTextHelpFormatter from copy import deepcopy from datetime import datetime @@ -52,13 +51,10 @@ from caosadvancedtools.cache import UpdateCache from caosadvancedtools.crawler import Crawler as OldCrawler from caosadvancedtools.serverside.helper import send_mail from caosadvancedtools.utils import create_entity_link -from linkahead.apiutils import (compare_entities, - merge_entities) +from linkahead.apiutils import compare_entities, merge_entities from linkahead.cached import cache_clear, cached_get_entity_by from linkahead.common.datatype import get_list_datatype, is_reference -from linkahead.exceptions import ( - TransactionError, -) +from linkahead.exceptions import TransactionError from linkahead.utils.escape import escape_squoted_text from .config import get_config_setting diff --git a/unittests/test_scanner.py 
b/unittests/test_scanner.py index 680e4abe..4fa752b0 100644 --- a/unittests/test_scanner.py +++ b/unittests/test_scanner.py @@ -36,9 +36,9 @@ import pytest import yaml from caoscrawler.crawl import Crawler from caoscrawler.debug_tree import DebugTree -from caoscrawler.scanner import (create_converter_registry, load_definition, - scan_directory, scan_structure_elements, - _load_definition_from_yaml_dict) +from caoscrawler.scanner import (_load_definition_from_yaml_dict, + create_converter_registry, load_definition, + scan_directory, scan_structure_elements) from caoscrawler.structure_elements import (DictElement, DictListElement, DictTextElement, File) from pytest import raises -- GitLab