diff --git a/CHANGELOG.md b/CHANGELOG.md index aafd19ba54a01f88c3e5dcf0f54aa27aa7e9c461..16c12bb88bfd202eeb5225d96ddf589420e9fcdf 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,8 +9,16 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Added ### -* Validation module for checking a list of generated records against a list of json schemas +- Validation module for checking a list of generated records against a list of json schemas that can be generated from a yaml data model file. +- DictElementConverters can now make use of `match_properties` which + works analogously to `match_properties` in ROCrateEntityConverter and + `match_attrib` in XMLTagConverter. +- `match_properties` is a method of class Converter and can for + example be used by CustomConverters. +- ZipFileConverter that opens zip files and exposes their contents as + File and Directory structure elements. +- `linkahead-crawler` script as alias for `caosdb-crawler`. ### Changed ### @@ -20,6 +28,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Fixed ### +- `spss_to_datamodel` script works again. + ### Security ### ### Documentation ### @@ -49,9 +59,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Units for properties. 
They can be specified by giving the property as a dict in the form ```yaml MyRecord: - my_prop: - value: 5 - unit: m + my_prop: + value: 5 + unit: m ``` - Support for Python 3.13 - ROCrateConverter, ELNFileConverter and ROCrateEntityConverter for crawling ROCrate and .eln files diff --git a/setup.cfg b/setup.cfg index d00202b9a6c466674da6510d7e9a3d9a014b927c..d05f2acb1e8d5afafa5a1003c6da2dff0980c126 100644 --- a/setup.cfg +++ b/setup.cfg @@ -39,8 +39,9 @@ per-file-ignores = __init__.py:F401 [options.entry_points] console_scripts = + linkahead-crawler = caoscrawler.crawl:main caosdb-crawler = caoscrawler.crawl:main - spss_to_datamodel = caoscrawler.conv_impl.spss:spss_to_datamodel_main + spss_to_datamodel = caoscrawler.converters.spss:spss_to_datamodel_main csv_to_datamodel = caoscrawler.scripts.generators:csv_to_datamodel_main [options.extras_require] diff --git a/src/caoscrawler/converters/__init__.py b/src/caoscrawler/converters/__init__.py index 670d4e966c72c6bcf45d0d46c1db715fb79d8ab5..edb7b3633cea2657dc3b9638379a3e57c37c87e4 100644 --- a/src/caoscrawler/converters/__init__.py +++ b/src/caoscrawler/converters/__init__.py @@ -18,11 +18,12 @@ # You should have received a copy of the GNU Affero General Public License # along with this program. If not, see <https://www.gnu.org/licenses/>. -"""Submdule containing all default and optional converters.""" +"""Submodule containing all default and optional converters.""" from .. 
def match_properties(self, properties: dict, vardict: dict,
                     label: str = "match_properties") -> bool:
    """Generically match the 'match_properties' section of the cfood definition.

    'match_properties' is a dictionary of key-regexp and value-regexp pairs. Each key
    matches a property name and the corresponding value matches its property value.

    What a property means in the context of the respective converter can be different,
    examples:

    * XMLTag: attributes of the node
    * ROCrate: properties of the ROCrateEntity
    * DictElement: properties of the dict

    This method is not called by default, but can be called from child classes.

    Typically it would be used like this from methods overwriting `match`::

        if not self.match_properties(<properties>, vardict):
            return None

    <properties> is a dictionary taken from the structure element that contains the
    properties in the context of this converter.

    For every entry of the definition exactly one property key must match the key-regexp,
    and the value of that property must match the value-regexp. Keys and values are
    converted with ``str()`` before matching.

    Parameters
    ----------

    properties: dict
        The dictionary containing the properties to be matched.

    vardict: dict
        This dictionary will be used to store the variables created during the matching.
        It is updated in place, one definition entry after the other. When a later entry
        fails to match, the named groups of the earlier entries have already been added.

    label: str
        Default "match_properties". Can be used to change the name of the property in the
        definition. E.g. the XML tag converter uses "match_attrib" which makes more sense
        in the context of xml trees.

    Returns
    -------

    : bool
        True when all entries match (or when ``label`` is not part of the definition at
        all) and False otherwise.

    Raises
    ------

    RuntimeError
        If more than one property key matches the same key-regexp.

    """
    if label not in self.definition:
        # Nothing to check: a missing section never prevents a match.
        return True

    for prop_def_key, prop_def_value in self.definition[label].items():
        match_counter = 0
        matched_m_prop = None
        matched_m_prop_value = None
        for prop_key, prop_value in properties.items():
            # TODO: automatic conversion to str ok?
            m_prop = re.match(prop_def_key, str(prop_key))
            if m_prop is None:
                continue
            match_counter += 1
            matched_m_prop = m_prop
            # TODO: automatic conversion to str ok?
            m_prop_value = re.match(prop_def_value, str(prop_value))
            if m_prop_value is None:
                # The key matched but its value did not: the whole match fails.
                return False
            matched_m_prop_value = m_prop_value
        # TODO: How to deal with multiple matches?
        #       There are multiple options:
        #       - Allow multiple key matches: Leads to possible overwrites of variables
        #       - Require unique key and value matches: Very complex
        #       - Only allow one single key to match and run the value match separately.
        #       Currently the latter option is implemented.
        if match_counter == 0:
            return False
        if match_counter > 1:
            raise RuntimeError("Multiple properties match the same {} entry.".format(label))
        vardict.update(matched_m_prop.groupdict())
        vardict.update(matched_m_prop_value.groupdict())
    return True
@@ -876,7 +960,12 @@ class DictElementConverter(Converter): # TODO: See comment on types and inheritance if not isinstance(element, DictElement): raise RuntimeError("Element must be a DictElement.") - return match_name_and_value(self.definition, element.name, element.value) + vardict = match_name_and_value(self.definition, element.name, element.value) + + if not self.match_properties(element.value, vardict): + return None + + return vardict class PropertiesFromDictConverter(DictElementConverter): diff --git a/src/caoscrawler/converters/rocrate.py b/src/caoscrawler/converters/rocrate.py index b84462acba2fdd7e60094e38edc38605c80deb11..8a45af753312a2bf29c1ddb9e6bcb15458c3ebde 100644 --- a/src/caoscrawler/converters/rocrate.py +++ b/src/caoscrawler/converters/rocrate.py @@ -32,15 +32,13 @@ import tempfile from typing import Optional from zipfile import ZipFile -import linkahead as db import rocrate from rocrate.rocrate import ROCrate -from ..stores import GeneralStore, RecordStore +from ..stores import GeneralStore from ..structure_elements import (Directory, File, ROCrateEntity, StructureElement) -from .converters import (Converter, ConverterValidationError, - SimpleFileConverter, convert_basic_element) +from .converters import Converter, SimpleFileConverter, convert_basic_element class ROCrateConverter(SimpleFileConverter): @@ -169,33 +167,24 @@ class ROCrateEntityConverter(Converter): # Store the result of all individual regexp variable results: vardict = {} + # TODO: I accidentally used "match_type" instead + # of "match_entity_type". This was completely + # unnoticed. So add it to schema and adapt tests. 
+ if "match_entity_type" in self.definition: - m_type = re.match(self.definition["match_entity_type"], element.type) + entity_type = element.entity.type + if isinstance(entity_type, list): + # TODO: this seems to be a bug in kadi4mat RO-Crates + # ./ has type ['Dataset'] + # instead of type 'Dataset' + entity_type = entity_type[0] + m_type = re.match(self.definition["match_entity_type"], entity_type) if m_type is None: return None vardict.update(m_type.groupdict()) - if "match_properties" in self.definition: - # This matcher works analogously to the attributes matcher in the XMLConverter - for prop_def_key, prop_def_value in self.definition["match_properties"].items(): - match_counter = 0 - matched_m_prop = None - matched_m_prop_value = None - for prop_key, prop_value in element.entity.properties().items(): - m_prop = re.match(prop_def_key, prop_key) - if m_prop is not None: - match_counter += 1 - matched_m_prop = m_prop - m_prop_value = re.match(prop_def_value, prop_value) - if m_prop_value is None: - return None - matched_m_prop_value = m_prop_value - if match_counter == 0: - return None - elif match_counter > 1: - raise RuntimeError("Multiple properties match the same match_prop entry.") - vardict.update(matched_m_prop.groupdict()) - vardict.update(matched_m_prop_value.groupdict()) + if not self.match_properties(element.entity.properties(), vardict): + return None return vardict diff --git a/src/caoscrawler/converters/xml_converter.py b/src/caoscrawler/converters/xml_converter.py index b9f7487ee633d0ba25a3b81b78b9a3561274edc9..60d7b49431fb011a06b7105a16471b0b3c7b2268 100644 --- a/src/caoscrawler/converters/xml_converter.py +++ b/src/caoscrawler/converters/xml_converter.py @@ -25,10 +25,9 @@ from __future__ import annotations import re from typing import Optional -import linkahead as db import lxml.etree -from ..stores import GeneralStore, RecordStore +from ..stores import GeneralStore from ..structure_elements import (File, StructureElement, XMLAttributeNode, 
XMLTagElement, XMLTextNode) from .converters import (Converter, ConverterValidationError, @@ -163,33 +162,8 @@ class XMLTagConverter(Converter): return None vardict.update(m_text.groupdict()) - if "match_attrib" in self.definition: - for attrib_def_key, attrib_def_value in self.definition["match_attrib"].items(): - match_counter = 0 - matched_m_attrib = None - matched_m_attrib_value = None - for attr_key, attr_value in element.tag.attrib.items(): - m_attrib = re.match(attrib_def_key, attr_key) - if m_attrib is not None: - match_counter += 1 - matched_m_attrib = m_attrib - m_attrib_value = re.match(attrib_def_value, attr_value) - if m_attrib_value is None: - return None - matched_m_attrib_value = m_attrib_value - # TODO: How to deal with multiple matches? - # There are multiple options: - # - Allow multiple attribute-key matches: Leads to possible overwrites of variables - # - Require unique attribute-key and attribute-value matches: Very complex - # - Only allow one single attribute-key to match and run attribute-value match separately. - # Currently the latter option is implemented. - # TODO: The ROCrateEntityConverter implements a very similar behavior. - if match_counter == 0: - return None - elif match_counter > 1: - raise RuntimeError("Multiple attributes match the same match_attrib entry.") - vardict.update(matched_m_attrib.groupdict()) - vardict.update(matched_m_attrib_value.groupdict()) + if not self.match_properties(element.tag.attrib, vardict, "match_attrib"): + return None return vardict diff --git a/src/caoscrawler/converters/zipfile_converter.py b/src/caoscrawler/converters/zipfile_converter.py new file mode 100644 index 0000000000000000000000000000000000000000..7073e66a266168e17eb9b6143e7dc6292b5149dc --- /dev/null +++ b/src/caoscrawler/converters/zipfile_converter.py @@ -0,0 +1,82 @@ +# encoding: utf-8 +# +# This file is a part of the LinkAhead Project. 
class ZipFileConverter(SimpleFileConverter):
    """Convert zip files.

    The archive is extracted into a temporary directory and its top-level entries are
    exposed as File and Directory structure elements. The temporary directory is kept
    until `cleanup` is called or another archive is opened with the same converter.
    """

    def setup(self):
        """Initialize the handle of the temporary extraction directory."""
        self._tempdir = None

    def cleanup(self):
        """Remove the temporary extraction directory, if one was created.

        Calling this without a prior `create_children` (or several times) is a no-op.
        """
        # `setup` may not have been called, so do not rely on the attribute existing.
        tempdir = getattr(self, "_tempdir", None)
        if tempdir is not None:
            tempdir.cleanup()
            self._tempdir = None

    def create_children(self, generalStore: GeneralStore, element: StructureElement):
        """Extract the zip file and return its top-level entries as structure elements.

        Parameters
        ----------
        generalStore: GeneralStore
            Not used by this converter.

        element: StructureElement
            Must be a File (structure element) whose ``path`` points to a zip archive.

        Returns
        -------
        list
            One Directory for each top-level directory and one File for each other
            top-level entry of the extracted archive, ordered by name.

        Raises
        ------
        ValueError
            If ``element`` is not a File.
        """
        if not isinstance(element, File):
            raise ValueError("create_children was called with wrong type of StructureElement")

        # Release the directory of a previously opened archive explicitly instead of
        # leaving it to the garbage collector.
        self.cleanup()
        self._tempdir = tempfile.TemporaryDirectory()
        unzd_path = self._tempdir.name
        with ZipFile(element.path) as zipf:
            zipf.extractall(unzd_path)

        entity_ls = []
        # os.listdir returns entries in arbitrary order; sort for reproducible results.
        for el in sorted(os.listdir(unzd_path)):
            path = join(unzd_path, el)
            if isdir(path):
                entity_ls.append(Directory(el, path))
            else:
                entity_ls.append(File(el, path))

        return entity_ls
Each key matches +a property name and the corresponding value matches its property value. + +Example: +........ + +.. code-block:: json + + { + "@type": "PropertyValue", + "additionalType": "str", + "propertyID": "testextra", + "value": "hi" + } + +When applied to a dict loaded from the above JSON, a `DictElementConverter` with the following definition: + +.. code-block:: yaml + + Example: + type: DictElement + match_properties: + additionalType: (?P<addt>.*)$ + property(.*): (?P<propid>.*)$ + +will match and create two variables: + +- `addt = "str"` +- `propid = "testextra"` + + Scalar Value Converters ======================= `BooleanElementConverter`, `FloatElementConverter`, `TextElementConverter`, and @@ -331,3 +366,31 @@ XMLTextNodeConverter In the future, this converter can be used to match XMLTextNodes that are generated by the XMLTagConverter. + + +ZipFileConverter +================ + +This converter opens zip files, unzips them into a temporary directory and +exposes its contents as File and Directory structure elements. + +Usage Example: +-------------- + +.. code-block:: yaml + + ExampleZipFile: + type: ZipFile + match: example\.zip$ + subtree: + DirInsideZip: + type: Directory + match: experiments$ + FileInsideZip: + type: File + match: description\.odt$ + +This converter will match and open files called ``example.zip``. If +the file contains a directory called ``experiments`` it will be +processed further by the respective converter in the subtree. The same +is true for a file called ``description.odt``. 
def test_dict_match_properties(converter_registry):
    """Check `match_properties` of the DictElement converter.

    Three cases are covered: all entries match, one entry has no matching property, and
    one entry is matched by more than one property.
    """
    definition = {
        "RootElt": {
            # Root dictionary
            "type": "DictElement",
            "match_properties": {
                "prop_a": "(?P<a>.*)$",
                "prop_[^ac]": "(?P<b>.*)$",
                "prop_c": "(?P<c>.*)$",
            },
            "records": {
                # Define top-level, use below in subtrees
                "MyRec": {
                    "prop_a": "$a",
                    "prop_b": "$b",
                    "$a": "$c"
                }
            }}}

    # Every entry of match_properties is matched by exactly one property.
    complete = DictElement("RootDict", {
        "prop_a": "value",
        "prop_b": "25",
        "prop_c": 24
    })
    found = scan_structure_elements(complete, definition, converter_registry)
    assert len(found) == 1
    rec = found[0]
    assert rec.get_property("prop_a").value == "value"
    assert rec.get_property("prop_b").value == "25"
    assert rec.get_property("value").value == "24"  # Note the type change here

    # Nothing matches the "prop_c" entry, so no record is created.
    incomplete = DictElement("RootDict", {
        "prop_a": "value",
        "prop_b": "25",
        # Property missing
    })
    found = scan_structure_elements(incomplete, definition, converter_registry)
    assert len(found) == 0

    # Both "prop_b" and "prop_d" match the "prop_[^ac]" entry.
    ambiguous = DictElement("RootDict", {
        "prop_a": "value",
        "prop_b": "25",
        "prop_d": 24  # duplicate matches
    })
    with pytest.raises(RuntimeError, match="Multiple properties match the same match_properties entry."):
        scan_structure_elements(ambiguous, definition, converter_registry)
# Console scripts that are expected to be installed together with the package.
SCRIPTS = [
    "linkahead-crawler",
    "caosdb-crawler",
    "spss_to_datamodel",
    "csv_to_datamodel",
]


def test_script_loading():
    """Run the scripts with "-h".

    ``check=True`` turns a non-zero exit status into a CalledProcessError, which fails
    the test.
    """
    help_commands = [[script_name, "-h"] for script_name in SCRIPTS]
    for command in help_commands:
        run(command, check=True)
@pytest.fixture
def converter_registry():
    """Return a converter registry that only contains the ZipFile converter."""
    converter_registry: dict[str, dict[str, str]] = {
        "ZipFile": {
            "converter": "ZipFileConverter",
            "package": "caoscrawler.converters"},
    }

    # Resolve the converter class of each entry from its package and class name.
    for value in converter_registry.values():
        module = importlib.import_module(value["package"])
        value["class"] = getattr(module, value["converter"])
    return converter_registry


def test_zipfile_converter(converter_registry):
    """Unzip ``PASTA.eln`` with the ZipFileConverter and inspect the extracted tree."""
    zipfile = File("PASTA.eln", os.path.join(UNITTESTDIR, "eln_files", "PASTA.eln"))
    zip_conv = ZipFileConverter(yaml.safe_load("""
type: ZipFile
match: .*$
"""), "TestZipFileConverter", converter_registry)

    match = zip_conv.match(zipfile)
    assert match is not None

    children = zip_conv.create_children(GeneralStore(), zipfile)
    try:
        assert len(children) == 1
        assert children[0].name == "PASTA"

        dir_conv = DirectoryConverter(yaml.safe_load("""
type: Directory
match: ^PASTA$
"""), "TestDirectory", converter_registry)
        match = dir_conv.match(children[0])
        assert match is not None
        children = dir_conv.create_children(GeneralStore(), children[0])
        assert len(children) == 5
        for i in range(2):
            assert isinstance(children[i], Directory)
        for i in range(2, 5):
            assert isinstance(children[i], File)
    finally:
        # Remove the temporary directory the archive was extracted into, even when an
        # assertion above fails.
        zip_conv.cleanup()