From 2fa1fc0bb5c28a19829728c1f2b01f86ace1c6ab Mon Sep 17 00:00:00 2001 From: Alexander Schlemmer <a.schlemmer@indiscale.com> Date: Thu, 24 Oct 2024 12:33:51 +0200 Subject: [PATCH 001/131] TST: two tests for recursive definitions of records --- unittests/test_scanner.py | 84 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 84 insertions(+) diff --git a/unittests/test_scanner.py b/unittests/test_scanner.py index 680e4abe..b93d5f98 100644 --- a/unittests/test_scanner.py +++ b/unittests/test_scanner.py @@ -405,3 +405,87 @@ def test_units(): assert rec.get_property("may_be_overwritten") is not None assert rec.get_property("may_be_overwritten").value == "400" assert rec.get_property("may_be_overwritten").unit == "°C" + + +def test_recursive_definition(): + """ + + """ + + recursive_yaml = """ +Converter: + type: DictElement + records: + Block: + Experiment: $Experiment + Experiment: + Block: $Block + """ + + crawler_definition = _load_definition_from_yaml_dict( + [yaml.load(recursive_yaml, Loader=yaml.SafeLoader)]) + converter_registry = create_converter_registry(crawler_definition) + + data = { + "value_with_unit": "1.1 m", + "array_with_units": [ + "1.1 cm", + "2.2 cm" + ] + } + records = scan_structure_elements(DictElement(name="", value=data), crawler_definition, + converter_registry) + + assert len(records) == 2 + assert len(records[0].parents) == 1 + assert records[0].parents[0].name == "Block" + assert len(records[1].parents) == 1 + assert records[1].parents[0].name == "Experiment" + + assert records[0].get_property("Experiment").value == records[1] + assert records[1].get_property("Block").value == records[0] + + +def test_recursive_definition_2(): + """ + This is basically a test for: + https://gitlab.indiscale.com/caosdb/src/caosdb-crawler/-/issues/16 + """ + + recursive_yaml = """ +FirstConverter: + type: DictElement + records: + Experiment: + subtree: + Converter: + type: DictElement + records: + Block: + Experiment: $Experiment + Experiment: + Block: 
$Block + """ + + crawler_definition = _load_definition_from_yaml_dict( + [yaml.load(recursive_yaml, Loader=yaml.SafeLoader)]) + converter_registry = create_converter_registry(crawler_definition) + + data = {"data": { + "value_with_unit": "1.1 m", + "array_with_units": [ + "1.1 cm", + "2.2 cm" + ] + }} + records = scan_structure_elements(DictElement(name="", value=data), crawler_definition, + converter_registry) + + assert len(records) == 2 + assert len(records[0].parents) == 1 + assert records[0].parents[0].name == "Block" + assert len(records[1].parents) == 1 + assert records[1].parents[0].name == "Experiment" + + assert records[0].get_property("Experiment").value == records[1] + assert records[1].get_property("Block").value == records[0] -- GitLab From 5cca4956592622f60613cb2c22811d3c64bfe7e9 Mon Sep 17 00:00:00 2001 From: Alexander Schlemmer <a.schlemmer@indiscale.com> Date: Thu, 24 Oct 2024 13:09:12 +0200 Subject: [PATCH 002/131] TST: added tests for recursively defined records --- integrationtests/test_issues.py | 57 ++++++++++++++++++++++++++++++++- unittests/test_scanner.py | 9 ++++-- 2 files changed, 63 insertions(+), 3 deletions(-) diff --git a/integrationtests/test_issues.py b/integrationtests/test_issues.py index 76392f3a..38d00a5e 100644 --- a/integrationtests/test_issues.py +++ b/integrationtests/test_issues.py @@ -22,12 +22,15 @@ from caoscrawler.crawl import Crawler from caoscrawler.identifiable import Identifiable from caoscrawler.identifiable_adapters import CaosDBIdentifiableAdapter from caoscrawler.scanner import (create_converter_registry, - scan_structure_elements) + scan_structure_elements, + _load_definition_from_yaml_dict) from caoscrawler.structure_elements import DictElement from linkahead.cached import cache_clear from linkahead.utils.register_tests import clear_database, set_test_key from pytest import fixture, mark, raises +import yaml + set_test_key("10b128cf8a1372f30aa3697466bb55e76974e0c16a599bb44ace88f19c8f61e2") @@ -328,3 +331,55 
@@ def test_indiscale_87(clear_database): print(db.apiutils.compare_entities(rec, retrieved)) assert db.apiutils.empty_diff(rec, retrieved) print("---") + + +def test_issue_16(clear_database): + """ + This is another a test for: + https://gitlab.indiscale.com/caosdb/src/caosdb-crawler/-/issues/16 + + In addition to the two unit tests for recursive definition in `test_scanner.py` this system test + tests whether recursively defined records can be synchronized correctly using the crawler. + """ + recursive_yaml = """ +FirstConverter: + type: DictElement + records: + Experiment: + subtree: + Converter: + type: DictElement + records: + Block: + Experiment: $Experiment + Experiment: + Block: $Block + """ + + crawler_definition = _load_definition_from_yaml_dict( + [yaml.load(recursive_yaml, Loader=yaml.SafeLoader)]) + converter_registry = create_converter_registry(crawler_definition) + + data = {"data": { + "value_with_unit": "1.1 m", + "array_with_units": [ + "1.1 cm", + "2.2 cm" + ] + }} + records = scan_structure_elements(DictElement(name="", value=data), crawler_definition, + converter_registry) + + rt_exp = db.RecordType(name="Experiment").insert() + rt_block = db.RecordType(name="Block").insert() + + ident = CaosDBIdentifiableAdapter() + ident.load_from_yaml_object(yaml.safe_load(""" +Experiment: +- Block +Block: +- Experiment +""")) + + crawler = Crawler(identifiableAdapter=ident) + crawler.synchronize(crawled_data=records) diff --git a/unittests/test_scanner.py b/unittests/test_scanner.py index b93d5f98..d2003be1 100644 --- a/unittests/test_scanner.py +++ b/unittests/test_scanner.py @@ -409,7 +409,8 @@ def test_units(): def test_recursive_definition(): """ - + This is basically a test for: + https://gitlab.indiscale.com/caosdb/src/caosdb-crawler/-/issues/16 """ recursive_yaml = """ @@ -448,8 +449,12 @@ Converter: def test_recursive_definition_2(): """ - This is basically a test for: + This is another a test for: 
https://gitlab.indiscale.com/caosdb/src/caosdb-crawler/-/issues/16 + + It defines Experiment on a different level, therefore allowing the recursive definition. + This is, however, no workaround for test_recursive_definition as a bidirectional link on the + same level is still not achieved. """ recursive_yaml = """ -- GitLab From 6bd1fc1b154130681cf647e22c04fc7ffebacaba Mon Sep 17 00:00:00 2001 From: Florian Spreckelsen <f.spreckelsen@indiscale.com> Date: Wed, 13 Nov 2024 11:31:14 +0100 Subject: [PATCH 003/131] Revert "PIPELINE: Fix missing rocrate dependency" This reverts commit f5947ee963fac47840f01e945c517d2bb0c07d66. --- .docker/Dockerfile | 3 --- .gitlab-ci.yml | 4 ---- tox.ini | 2 -- 3 files changed, 9 deletions(-) diff --git a/.docker/Dockerfile b/.docker/Dockerfile index 066923e6..1468a17f 100644 --- a/.docker/Dockerfile +++ b/.docker/Dockerfile @@ -34,9 +34,6 @@ RUN rm -r /git/.git # Install pycaosdb.ini for the tests RUN mv /git/.docker/tester_pycaosdb.ini /git/integrationtests/pycaosdb.ini -# TODO Remove once https://github.com/ResearchObject/ro-crate-py/issues/203 has been resolved. -RUN pip3 install --break-system-packages git+https://github.com/salexan2001/ro-crate-py.git@f-automatic-dummy-ids - RUN cd /git/ && pip3 install --break-system-packages .[h5-crawler,spss,rocrate] WORKDIR /git/integrationtests diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 58476b39..e4322356 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -131,10 +131,6 @@ unittest_py3.9: # TODO: Use f-branch logic here - pip install git+https://gitlab.indiscale.com/caosdb/src/caosdb-pylib.git@dev - pip install git+https://gitlab.indiscale.com/caosdb/src/caosdb-advanced-user-tools.git@dev - # TODO: Remove once - # https://github.com/ResearchObject/ro-crate-py/issues/203 has - # been resolved. 
- - pip install git+https://github.com/salexan2001/ro-crate-py.git@f-automatic-dummy-ids - pip install .[h5-crawler,spss,rocrate] # actual test - caosdb-crawler --help diff --git a/tox.ini b/tox.ini index 1b695d26..e003e26e 100644 --- a/tox.ini +++ b/tox.ini @@ -9,8 +9,6 @@ deps = .[h5-crawler,spss,rocrate] # TODO: Make this f-branch sensitive git+https://gitlab.indiscale.com/caosdb/src/caosdb-pylib.git@dev git+https://gitlab.indiscale.com/caosdb/src/caosdb-advanced-user-tools.git@dev - # TODO Remove once https://github.com/ResearchObject/ro-crate-py/issues/203 has been resolved. - git+https://github.com/salexan2001/ro-crate-py.git@f-automatic-dummy-ids commands = caosdb-crawler --help py.test --cov=caoscrawler -vv {posargs} -- GitLab From 082ca19e1016c6dca5711f307aabd12f747951bb Mon Sep 17 00:00:00 2001 From: Florian Spreckelsen <f.spreckelsen@indiscale.com> Date: Wed, 13 Nov 2024 11:32:28 +0100 Subject: [PATCH 004/131] DEP: Re-enable optional rocrate dependency with @salexan's fork --- setup.cfg | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/setup.cfg b/setup.cfg index 5eadc00b..d00202b9 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,6 +1,6 @@ [metadata] name = caoscrawler -version = 0.10.1 +version = 0.10.2 author = Alexander Schlemmer author_email = alexander.schlemmer@ds.mpg.de description = A new crawler for LinkAhead @@ -49,3 +49,5 @@ h5-crawler = numpy spss = pandas[spss] +rocrate = + rocrate @ git+https://github.com/salexan2001/ro-crate-py.git@f-automatic-dummy-ids -- GitLab From bd74a0954fe582a88c49dda6a63d1f49b75214ec Mon Sep 17 00:00:00 2001 From: Florian Spreckelsen <f.spreckelsen@indiscale.com> Date: Wed, 13 Nov 2024 11:33:21 +0100 Subject: [PATCH 005/131] REL: Begin next release cycle --- CHANGELOG.md | 16 ++++++++++++++++ src/doc/conf.py | 4 ++-- 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index a0903761..f106e8f0 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,22 @@ All 
notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## [Unreleased] ## + +### Added ### + +### Changed ### + +### Deprecated ### + +### Removed ### + +### Fixed ### + +### Security ### + +### Documentation ### + ## [0.10.1] - 2024-11-13 ## ### Fixed ### diff --git a/src/doc/conf.py b/src/doc/conf.py index f9cab5b7..01ca66bf 100644 --- a/src/doc/conf.py +++ b/src/doc/conf.py @@ -33,10 +33,10 @@ copyright = '2024, IndiScale' author = 'Alexander Schlemmer' # The short X.Y version -version = '0.10.1' +version = '0.10.2' # The full version, including alpha/beta/rc tags # release = '0.5.2-rc2' -release = '0.10.1' +release = '0.10.2-dev' # -- General configuration --------------------------------------------------- -- GitLab From 6ba50c9b6dd48d33c50a127f860285d3f5b3449c Mon Sep 17 00:00:00 2001 From: Alexander Schlemmer <a.schlemmer@indiscale.com> Date: Thu, 14 Nov 2024 09:17:22 +0100 Subject: [PATCH 006/131] REFACT(converters): match_properties is a generic method of converters now --- src/caoscrawler/converters/converters.py | 59 +++++++++++++++++++++ src/caoscrawler/converters/rocrate.py | 22 +------- src/caoscrawler/converters/xml_converter.py | 28 +--------- src/caoscrawler/crawl.py | 8 +-- unittests/test_scanner.py | 6 +-- 5 files changed, 66 insertions(+), 57 deletions(-) diff --git a/src/caoscrawler/converters/converters.py b/src/caoscrawler/converters/converters.py index 64a557ce..40ddde92 100644 --- a/src/caoscrawler/converters/converters.py +++ b/src/caoscrawler/converters/converters.py @@ -456,6 +456,65 @@ class Converter(object, metaclass=ABCMeta): raise RuntimeError("Condition does not match.") values.update(m) + def match_properties(self, properties: dict, vardict: dict, label: str = "match_properties"): + """ + This method can be used to generically match 'match_properties' 
from the cfood definition + with the behavior described as follows: + + 'match_properties' is a dictionary of key-regexps and value-regexp pairs. Each key matches + a property name and the corresponding value matches its property value. + + What a property means in the context of the respective converter can be different, examples: + XMLTag: attributes of the node + ROCrate: properties of the ROCrateEntity + DictElement: properties of the dict + + label can be used to customize the name of the property in the definition. + + This method is not called by default, but can be called from child classes. + + Arguments: + ---------- + + properties: dict + The dictionary containing the properties to be matched. + + vardict: dict + This dictionary will be used to store the variables created during the matching. + + label: str + Default "match_properties". Can be used to change the name of the property in the definition. E.g. the + xml converter uses "match_attrib" which makes more sense in the context of xml trees. + """ + if label in self.definition: + # This matcher works analogously to the attributes matcher in the XMLConverter + for prop_def_key, prop_def_value in self.definition[label].items(): + match_counter = 0 + matched_m_prop = None + matched_m_prop_value = None + for prop_key, prop_value in properties.items(): + m_prop = re.match(prop_def_key, prop_key) + if m_prop is not None: + match_counter += 1 + matched_m_prop = m_prop + m_prop_value = re.match(prop_def_value, prop_value) + if m_prop_value is None: + return None + matched_m_prop_value = m_prop_value + # TODO: How to deal with multiple matches? + # There are multiple options: + # - Allow multiple attribute-key matches: Leads to possible overwrites of variables + # - Require unique attribute-key and attribute-value matches: Very complex + # - Only allow one single attribute-key to match and run attribute-value match separately. + # Currently the latter option is implemented. 
+ # TODO: The ROCrateEntityConverter implements a very similar behavior. + if match_counter == 0: + return None + elif match_counter > 1: + raise RuntimeError("Multiple properties match the same {} entry.".format(label)) + vardict.update(matched_m_prop.groupdict()) + vardict.update(matched_m_prop_value.groupdict()) + def apply_transformers(self, values: GeneralStore, transformer_functions: dict): """ Check if transformers are defined using the "transform" keyword. diff --git a/src/caoscrawler/converters/rocrate.py b/src/caoscrawler/converters/rocrate.py index 286061ef..e940ba83 100644 --- a/src/caoscrawler/converters/rocrate.py +++ b/src/caoscrawler/converters/rocrate.py @@ -176,27 +176,7 @@ class ROCrateEntityConverter(Converter): return None vardict.update(m_type.groupdict()) - if "match_properties" in self.definition: - # This matcher works analogously to the attributes matcher in the XMLConverter - for prop_def_key, prop_def_value in self.definition["match_properties"].items(): - match_counter = 0 - matched_m_prop = None - matched_m_prop_value = None - for prop_key, prop_value in element.entity.properties().items(): - m_prop = re.match(prop_def_key, prop_key) - if m_prop is not None: - match_counter += 1 - matched_m_prop = m_prop - m_prop_value = re.match(prop_def_value, prop_value) - if m_prop_value is None: - return None - matched_m_prop_value = m_prop_value - if match_counter == 0: - return None - elif match_counter > 1: - raise RuntimeError("Multiple properties match the same match_prop entry.") - vardict.update(matched_m_prop.groupdict()) - vardict.update(matched_m_prop_value.groupdict()) + self.match_properties(element.entity.properties(), vardict) return vardict diff --git a/src/caoscrawler/converters/xml_converter.py b/src/caoscrawler/converters/xml_converter.py index bd3f6cf0..76d5afff 100644 --- a/src/caoscrawler/converters/xml_converter.py +++ b/src/caoscrawler/converters/xml_converter.py @@ -163,33 +163,7 @@ class XMLTagConverter(Converter): return 
None vardict.update(m_text.groupdict()) - if "match_attrib" in self.definition: - for attrib_def_key, attrib_def_value in self.definition["match_attrib"].items(): - match_counter = 0 - matched_m_attrib = None - matched_m_attrib_value = None - for attr_key, attr_value in element.tag.attrib.items(): - m_attrib = re.match(attrib_def_key, attr_key) - if m_attrib is not None: - match_counter += 1 - matched_m_attrib = m_attrib - m_attrib_value = re.match(attrib_def_value, attr_value) - if m_attrib_value is None: - return None - matched_m_attrib_value = m_attrib_value - # TODO: How to deal with multiple matches? - # There are multiple options: - # - Allow multiple attribute-key matches: Leads to possible overwrites of variables - # - Require unique attribute-key and attribute-value matches: Very complex - # - Only allow one single attribute-key to match and run attribute-value match separately. - # Currently the latter option is implemented. - # TODO: The ROCrateEntityConverter implements a very similar behavior. 
- if match_counter == 0: - return None - elif match_counter > 1: - raise RuntimeError("Multiple attributes match the same match_attrib entry.") - vardict.update(matched_m_attrib.groupdict()) - vardict.update(matched_m_attrib_value.groupdict()) + self.match_properties(element.tag.attrib, vardict, "match_attrib") return vardict diff --git a/src/caoscrawler/crawl.py b/src/caoscrawler/crawl.py index 9e4e2a80..a79e4434 100644 --- a/src/caoscrawler/crawl.py +++ b/src/caoscrawler/crawl.py @@ -39,7 +39,6 @@ import sys import traceback import uuid import warnings - from argparse import RawTextHelpFormatter from copy import deepcopy from datetime import datetime @@ -52,13 +51,10 @@ from caosadvancedtools.cache import UpdateCache from caosadvancedtools.crawler import Crawler as OldCrawler from caosadvancedtools.serverside.helper import send_mail from caosadvancedtools.utils import create_entity_link -from linkahead.apiutils import (compare_entities, - merge_entities) +from linkahead.apiutils import compare_entities, merge_entities from linkahead.cached import cache_clear, cached_get_entity_by from linkahead.common.datatype import get_list_datatype, is_reference -from linkahead.exceptions import ( - TransactionError, -) +from linkahead.exceptions import TransactionError from linkahead.utils.escape import escape_squoted_text from .config import get_config_setting diff --git a/unittests/test_scanner.py b/unittests/test_scanner.py index 680e4abe..4fa752b0 100644 --- a/unittests/test_scanner.py +++ b/unittests/test_scanner.py @@ -36,9 +36,9 @@ import pytest import yaml from caoscrawler.crawl import Crawler from caoscrawler.debug_tree import DebugTree -from caoscrawler.scanner import (create_converter_registry, load_definition, - scan_directory, scan_structure_elements, - _load_definition_from_yaml_dict) +from caoscrawler.scanner import (_load_definition_from_yaml_dict, + create_converter_registry, load_definition, + scan_directory, scan_structure_elements) from 
caoscrawler.structure_elements import (DictElement, DictListElement, DictTextElement, File) from pytest import raises -- GitLab From 70bc59162e16bf5d48f5416905da4f418da82b82 Mon Sep 17 00:00:00 2001 From: Alexander Schlemmer <a.schlemmer@indiscale.com> Date: Thu, 14 Nov 2024 09:18:12 +0100 Subject: [PATCH 007/131] MAINT(converters): cleaned import statements --- src/caoscrawler/converters/rocrate.py | 20 ++++++++++---------- src/caoscrawler/converters/xml_converter.py | 10 +++++----- 2 files changed, 15 insertions(+), 15 deletions(-) diff --git a/src/caoscrawler/converters/rocrate.py b/src/caoscrawler/converters/rocrate.py index e940ba83..3bd0f0df 100644 --- a/src/caoscrawler/converters/rocrate.py +++ b/src/caoscrawler/converters/rocrate.py @@ -26,22 +26,22 @@ This converter converts ro-crate files which may also be .eln-files. from __future__ import annotations +import os +import re +import tempfile from typing import Optional +from zipfile import ZipFile + +import linkahead as db import rocrate from rocrate.rocrate import ROCrate -import linkahead as db - -from .converters import SimpleFileConverter, ConverterValidationError, Converter, convert_basic_element from ..stores import GeneralStore, RecordStore -from ..structure_elements import (File, Directory, StructureElement, ROCrateEntity) - -from zipfile import ZipFile - -import tempfile -import os -import re +from ..structure_elements import (Directory, File, ROCrateEntity, + StructureElement) +from .converters import (Converter, ConverterValidationError, + SimpleFileConverter, convert_basic_element) class ROCrateConverter(SimpleFileConverter): diff --git a/src/caoscrawler/converters/xml_converter.py b/src/caoscrawler/converters/xml_converter.py index 76d5afff..a0ccb2fb 100644 --- a/src/caoscrawler/converters/xml_converter.py +++ b/src/caoscrawler/converters/xml_converter.py @@ -22,17 +22,17 @@ from __future__ import annotations -import lxml.etree import re - from typing import Optional import linkahead as db 
+import lxml.etree -from .converters import SimpleFileConverter, ConverterValidationError, Converter from ..stores import GeneralStore, RecordStore -from ..structure_elements import (File, StructureElement, - XMLTagElement, XMLTextNode, XMLAttributeNode) +from ..structure_elements import (File, StructureElement, XMLAttributeNode, + XMLTagElement, XMLTextNode) +from .converters import (Converter, ConverterValidationError, + SimpleFileConverter) class XMLFileConverter(SimpleFileConverter): -- GitLab From a6e775de6c9960070e99931e09d7eef2032d4242 Mon Sep 17 00:00:00 2001 From: Alexander Schlemmer <a.schlemmer@indiscale.com> Date: Thu, 14 Nov 2024 09:19:17 +0100 Subject: [PATCH 008/131] FIX(converters): workaround for bug in some .eln files --- src/caoscrawler/converters/rocrate.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/src/caoscrawler/converters/rocrate.py b/src/caoscrawler/converters/rocrate.py index 3bd0f0df..36b331f2 100644 --- a/src/caoscrawler/converters/rocrate.py +++ b/src/caoscrawler/converters/rocrate.py @@ -170,8 +170,18 @@ class ROCrateEntityConverter(Converter): # Store the result of all individual regexp variable results: vardict = {} + # TODO: I accidentally used "match_type" instead + # of "match_entity_type". This was completely + # unnoticed. So add it to schema and adapt tests. 
+ if "match_entity_type" in self.definition: - m_type = re.match(self.definition["match_entity_type"], element.type) + entity_type = element.entity.type + if isinstance(entity_type, list): + # TODO: this seems to be a bug in kadi4mat RO-Crates + # ./ has type ['Dataset'] + # instead of type 'Dataset' + entity_type = entity_type[0] + m_type = re.match(self.definition["match_entity_type"], entity_type) if m_type is None: return None vardict.update(m_type.groupdict()) -- GitLab From 51d26269c48f420f6fc33f4796bcfb21a3239149 Mon Sep 17 00:00:00 2001 From: Alexander Schlemmer <a.schlemmer@indiscale.com> Date: Thu, 14 Nov 2024 09:47:47 +0100 Subject: [PATCH 009/131] MAINT(converters): cleaned import statements --- src/caoscrawler/converters/rocrate.py | 7 ++----- src/caoscrawler/converters/xml_converter.py | 3 +-- 2 files changed, 3 insertions(+), 7 deletions(-) diff --git a/src/caoscrawler/converters/rocrate.py b/src/caoscrawler/converters/rocrate.py index 36b331f2..7fd8378e 100644 --- a/src/caoscrawler/converters/rocrate.py +++ b/src/caoscrawler/converters/rocrate.py @@ -32,16 +32,13 @@ import tempfile from typing import Optional from zipfile import ZipFile -import linkahead as db - import rocrate from rocrate.rocrate import ROCrate -from ..stores import GeneralStore, RecordStore +from ..stores import GeneralStore from ..structure_elements import (Directory, File, ROCrateEntity, StructureElement) -from .converters import (Converter, ConverterValidationError, - SimpleFileConverter, convert_basic_element) +from .converters import Converter, SimpleFileConverter, convert_basic_element class ROCrateConverter(SimpleFileConverter): diff --git a/src/caoscrawler/converters/xml_converter.py b/src/caoscrawler/converters/xml_converter.py index a0ccb2fb..472e4051 100644 --- a/src/caoscrawler/converters/xml_converter.py +++ b/src/caoscrawler/converters/xml_converter.py @@ -25,10 +25,9 @@ from __future__ import annotations import re from typing import Optional -import linkahead as db 
import lxml.etree -from ..stores import GeneralStore, RecordStore +from ..stores import GeneralStore from ..structure_elements import (File, StructureElement, XMLAttributeNode, XMLTagElement, XMLTextNode) from .converters import (Converter, ConverterValidationError, -- GitLab From ef95bfbdb6c651e334fe9043d79b8088c061d993 Mon Sep 17 00:00:00 2001 From: Alexander Schlemmer <a.schlemmer@indiscale.com> Date: Thu, 14 Nov 2024 10:01:18 +0100 Subject: [PATCH 010/131] FIX(converters): added return values to match_properties method --- src/caoscrawler/converters/converters.py | 10 ++++++++-- src/caoscrawler/converters/rocrate.py | 3 ++- src/caoscrawler/converters/xml_converter.py | 3 ++- unittests/test_rocrate_converter.py | 9 ++++----- 4 files changed, 16 insertions(+), 9 deletions(-) diff --git a/src/caoscrawler/converters/converters.py b/src/caoscrawler/converters/converters.py index 40ddde92..1bc70a15 100644 --- a/src/caoscrawler/converters/converters.py +++ b/src/caoscrawler/converters/converters.py @@ -485,6 +485,11 @@ class Converter(object, metaclass=ABCMeta): label: str Default "match_properties". Can be used to change the name of the property in the definition. E.g. the xml converter uses "match_attrib" which makes more sense in the context of xml trees. + + Returns: + -------- + + Returns True when properties match and False otherwise. The vardict dictionary is updated in place. """ if label in self.definition: # This matcher works analogously to the attributes matcher in the XMLConverter @@ -499,7 +504,7 @@ class Converter(object, metaclass=ABCMeta): matched_m_prop = m_prop m_prop_value = re.match(prop_def_value, prop_value) if m_prop_value is None: - return None + return False matched_m_prop_value = m_prop_value # TODO: How to deal with multiple matches? # There are multiple options: @@ -509,11 +514,12 @@ class Converter(object, metaclass=ABCMeta): # Currently the latter option is implemented. 
# TODO: The ROCrateEntityConverter implements a very similar behavior. if match_counter == 0: - return None + return False elif match_counter > 1: raise RuntimeError("Multiple properties match the same {} entry.".format(label)) vardict.update(matched_m_prop.groupdict()) vardict.update(matched_m_prop_value.groupdict()) + return True def apply_transformers(self, values: GeneralStore, transformer_functions: dict): """ diff --git a/src/caoscrawler/converters/rocrate.py b/src/caoscrawler/converters/rocrate.py index 7fd8378e..8a45af75 100644 --- a/src/caoscrawler/converters/rocrate.py +++ b/src/caoscrawler/converters/rocrate.py @@ -183,7 +183,8 @@ class ROCrateEntityConverter(Converter): return None vardict.update(m_type.groupdict()) - self.match_properties(element.entity.properties(), vardict) + if not self.match_properties(element.entity.properties(), vardict): + return None return vardict diff --git a/src/caoscrawler/converters/xml_converter.py b/src/caoscrawler/converters/xml_converter.py index 472e4051..60d7b494 100644 --- a/src/caoscrawler/converters/xml_converter.py +++ b/src/caoscrawler/converters/xml_converter.py @@ -162,7 +162,8 @@ class XMLTagConverter(Converter): return None vardict.update(m_text.groupdict()) - self.match_properties(element.tag.attrib, vardict, "match_attrib") + if not self.match_properties(element.tag.attrib, vardict, "match_attrib"): + return None return vardict diff --git a/unittests/test_rocrate_converter.py b/unittests/test_rocrate_converter.py index ef59a37c..06ce187e 100644 --- a/unittests/test_rocrate_converter.py +++ b/unittests/test_rocrate_converter.py @@ -32,17 +32,16 @@ import linkahead as db import pytest import rocrate import yaml -from linkahead.high_level_api import convert_to_python_object -from lxml.etree import fromstring -from rocrate.model.entity import Entity -from rocrate.rocrate import ROCrate - from caoscrawler import scanner from caoscrawler.converters import ELNFileConverter, ROCrateEntityConverter from 
caoscrawler.scanner import load_definition from caoscrawler.stores import GeneralStore from caoscrawler.structure_elements import (DictElement, File, ROCrateEntity, TextElement) +from linkahead.high_level_api import convert_to_python_object +from lxml.etree import fromstring +from rocrate.model.entity import Entity +from rocrate.rocrate import ROCrate UNITTESTDIR = Path(__file__).parent -- GitLab From d82811f03610c6165e8a7c2d4ba1392ebda21b1b Mon Sep 17 00:00:00 2001 From: Alexander Schlemmer <a.schlemmer@indiscale.com> Date: Thu, 14 Nov 2024 10:04:30 +0100 Subject: [PATCH 011/131] DOC(converters): updated docstring of match_properties method --- src/caoscrawler/converters/converters.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/src/caoscrawler/converters/converters.py b/src/caoscrawler/converters/converters.py index 1bc70a15..0879dba3 100644 --- a/src/caoscrawler/converters/converters.py +++ b/src/caoscrawler/converters/converters.py @@ -469,10 +469,19 @@ class Converter(object, metaclass=ABCMeta): ROCrate: properties of the ROCrateEntity DictElement: properties of the dict - label can be used to customize the name of the property in the definition. + label can be used to customize the name of the dictionary in the definition. This method is not called by default, but can be called from child classes. + Typically it would be used like this from methods overwriting `match`: + > if not self.match_properties(<properties>, vardict): + > return None + + vardict will be updated in place when there are matches. + <properties> is a dictionary taken from the structure element that contains the properties in the + context of this converter. 
+ + Arguments: ---------- -- GitLab From 02340e8b8fa6648728395a1d6f491f715a9adf69 Mon Sep 17 00:00:00 2001 From: Alexander Schlemmer <a.schlemmer@indiscale.com> Date: Thu, 14 Nov 2024 11:12:21 +0100 Subject: [PATCH 012/131] ENH(converters): DictElementConverter can now match_properties --- src/caoscrawler/converters/converters.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/caoscrawler/converters/converters.py b/src/caoscrawler/converters/converters.py index 0879dba3..0320ae10 100644 --- a/src/caoscrawler/converters/converters.py +++ b/src/caoscrawler/converters/converters.py @@ -950,7 +950,12 @@ class DictElementConverter(Converter): # TODO: See comment on types and inheritance if not isinstance(element, DictElement): raise RuntimeError("Element must be a DictElement.") - return match_name_and_value(self.definition, element.name, element.value) + vardict = match_name_and_value(self.definition, element.name, element.value) + + if not self.match_properties(element.value.items(), vardict): + return None + + return vardict class PropertiesFromDictConverter(DictElementConverter): -- GitLab From 7a4935c0cd7fd4b249667f3137c24878e43df62d Mon Sep 17 00:00:00 2001 From: Alexander Schlemmer <a.schlemmer@indiscale.com> Date: Thu, 14 Nov 2024 11:37:08 +0100 Subject: [PATCH 013/131] FIX: correct dict in match_properties of DictElementConverter --- src/caoscrawler/converters/converters.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/caoscrawler/converters/converters.py b/src/caoscrawler/converters/converters.py index 0320ae10..6e3c8ffe 100644 --- a/src/caoscrawler/converters/converters.py +++ b/src/caoscrawler/converters/converters.py @@ -507,11 +507,14 @@ class Converter(object, metaclass=ABCMeta): matched_m_prop = None matched_m_prop_value = None for prop_key, prop_value in properties.items(): - m_prop = re.match(prop_def_key, prop_key) + print("{} = {}".format(prop_key, prop_value)) + # TODO: automatic 
conversion to str ok? + m_prop = re.match(prop_def_key, str(prop_key)) if m_prop is not None: match_counter += 1 matched_m_prop = m_prop - m_prop_value = re.match(prop_def_value, prop_value) + # TODO: automatic conversion to str ok? + m_prop_value = re.match(prop_def_value, str(prop_value)) if m_prop_value is None: return False matched_m_prop_value = m_prop_value @@ -952,7 +955,7 @@ class DictElementConverter(Converter): raise RuntimeError("Element must be a DictElement.") vardict = match_name_and_value(self.definition, element.name, element.value) - if not self.match_properties(element.value.items(), vardict): + if not self.match_properties(element.value, vardict): return None return vardict -- GitLab From 08f8d412a103e98706cc873e7c66a2962c934934 Mon Sep 17 00:00:00 2001 From: Alexander Schlemmer <a.schlemmer@indiscale.com> Date: Thu, 14 Nov 2024 12:32:10 +0100 Subject: [PATCH 014/131] TST(converters): unit test for new match_properties feature in DictElementConverter --- unittests/test_converters.py | 51 +++++++++++++++++++++++++++++++++++- 1 file changed, 50 insertions(+), 1 deletion(-) diff --git a/unittests/test_converters.py b/unittests/test_converters.py index 6c7db6ed..12285e46 100644 --- a/unittests/test_converters.py +++ b/unittests/test_converters.py @@ -35,7 +35,6 @@ from pathlib import Path import linkahead as db import pytest import yaml - from caoscrawler.converters import (Converter, ConverterValidationError, DateElementConverter, DictElementConverter, DictIntegerElementConverter, @@ -1021,3 +1020,53 @@ def test_properties_from_dict_nested(converter_registry): # The "old" DictConverter should have added the additional property: assert myrec.get_property("additional_from_other") is not None assert myrec.get_property("additional_from_other").value == "other" + + +def test_dict_match_properties(converter_registry): + + root_dict_element = DictElement("RootDict", { + "prop_a": "value", + "prop_b": "25", + "prop_c": 24 + }) + + def_dict = { + "RootElt": 
{ + # Root dictionary + "type": "DictElement", + "match_properties": { + "prop_a": "(?P<a>.*)$", + "prop_[^ac]": "(?P<b>.*)$", + "prop_c": "(?P<c>.*)$", + }, + "records": { + # Define top-level, use below in subtrees + "MyRec": { + "prop_a": "$a", + "prop_b": "$b", + "$a": "$c" + } + }}} + records = scan_structure_elements(root_dict_element, def_dict, converter_registry) + assert len(records) == 1 + record = records[0] + assert record.get_property("prop_a").value == "value" + assert record.get_property("prop_b").value == "25" + assert record.get_property("value").value == "24" # Note the type change here + + root_dict_element = DictElement("RootDict", { + "prop_a": "value", + "prop_b": "25", + # Property missing + }) + + records = scan_structure_elements(root_dict_element, def_dict, converter_registry) + assert len(records) == 0 + + with pytest.raises(RuntimeError, match="Multiple properties match the same match_properties entry."): + root_dict_element = DictElement("RootDict", { + "prop_a": "value", + "prop_b": "25", + "prop_d": 24 # duplicate matches + }) + records = scan_structure_elements(root_dict_element, def_dict, converter_registry) -- GitLab From fe4d858ace740a08353298a9e5f64313afd234d4 Mon Sep 17 00:00:00 2001 From: Alexander Schlemmer <a.schlemmer@indiscale.com> Date: Thu, 14 Nov 2024 12:46:55 +0100 Subject: [PATCH 015/131] DOC(converters): added changelog entry and documentation for the new match_properties feature of the DictElementConverter --- CHANGELOG.md | 3 ++ src/doc/converters/standard_converters.rst | 35 ++++++++++++++++++++++ 2 files changed, 38 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index f106e8f0..845f4e39 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Added ### +- DictElementConverters can now make use of `match_properties` which works analogous to `match_properties` in ROCrateEntityConverter and `match_attrib` in XMLConverter. 
+- `match_properties` is a method of class Converter and can for example be used by CustomConverters. + ### Changed ### ### Deprecated ### diff --git a/src/doc/converters/standard_converters.rst b/src/doc/converters/standard_converters.rst index 586b84b4..8d4ab045 100644 --- a/src/doc/converters/standard_converters.rst +++ b/src/doc/converters/standard_converters.rst @@ -41,6 +41,41 @@ The following StructureElement types are typically created by the DictElement co Note that you may use ``TextElement`` for anything that exists in a text format that can be interpreted by the server, such as date and datetime strings in ISO-8601 format. +match_properties +---------------- + +`match_properties` is a dictionary of key-regexps and value-regexp pairs and can be used to +match direct properties of a `DictElement`. Each key matches +a property name and the corresponding value matches its property value. + +Example: +........ + +.. code-block:: json + + { + "@type": "PropertyValue", + "additionalType": "str", + "propertyID": "testextra", + "value": "hi" + } + +When applied to a dict loaded from the above json, a `DictElementConverter` with the following definition: + +.. 
code-block:: yaml + + Example: + type: DictElement + match_properties: + additionalType: (?P<addt>.*)$ + property(.*): (?P<propid>.*)$ + +will match and create two variables: + +- `addt = "str"` +- `propid = "testextra"` + + Scalar Value Converters ======================= `BooleanElementConverter`, `FloatElementConverter`, `TextElementConverter`, and -- GitLab From f411d63ba8165092918d41b735bdee6fbaad110e Mon Sep 17 00:00:00 2001 From: Alexander Schlemmer <a.schlemmer@indiscale.com> Date: Fri, 15 Nov 2024 11:27:07 +0100 Subject: [PATCH 016/131] ENH(converters): implemented zipfile converter --- src/caoscrawler/converters/__init__.py | 1 + src/caoscrawler/converters/zipfile.py | 81 ++++++++++++++++++++++++++ 2 files changed, 82 insertions(+) create mode 100644 src/caoscrawler/converters/zipfile.py diff --git a/src/caoscrawler/converters/__init__.py b/src/caoscrawler/converters/__init__.py index 670d4e96..51d85605 100644 --- a/src/caoscrawler/converters/__init__.py +++ b/src/caoscrawler/converters/__init__.py @@ -23,6 +23,7 @@ from .. import utils from .converters import * from .xml_converter import * +from .zipfile import * try: from .spss import SPSSConverter diff --git a/src/caoscrawler/converters/zipfile.py b/src/caoscrawler/converters/zipfile.py new file mode 100644 index 00000000..49bfcc36 --- /dev/null +++ b/src/caoscrawler/converters/zipfile.py @@ -0,0 +1,81 @@ +# encoding: utf-8 +# +# This file is a part of the LinkAhead Project. +# +# Copyright (C) 2024 Indiscale GmbH <info@indiscale.com> +# Copyright (C) 2024 Alexander Schlemmer +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. 
+# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <https://www.gnu.org/licenses/>. + +"""Converters take structure elements and create Records and new structure elements from them. + +This converter opens zip files, unzips them into a temporary directory and +exposes its contents as File structure elements. + +""" + +from __future__ import annotations + +import os +import tempfile +from os.path import isdir, join +from zipfile import ZipFile + +from ..stores import GeneralStore +from ..structure_elements import Directory, File, StructureElement +from .converters import SimpleFileConverter + + +class ZipFileConverter(SimpleFileConverter): + + """Convert zipfiles. + """ + + def setup(self): + self._tempdir = None + + def cleanup(self): + self._tempdir.cleanup() + + def create_children(self, generalStore: GeneralStore, element: StructureElement): + """ + Loads an ROCrate from an rocrate file or directory. + + Arguments: + ---------- + element must be a File or Directory (structure element). + + Returns: + -------- + A list with an ROCrateElement representing the contents of the .eln-file or None + in case of errors. 
+ """ + + if isinstance(element, File): + self._tempdir = tempfile.TemporaryDirectory() + unzd_path = self._tempdir.name + with ZipFile(element.path) as zipf: + zipf.extractall(unzd_path) + + entity_ls = [] + for el in os.listdir(unzd_path): + if isdir(join(unzd_path, el)): + entity_ls.append(Directory()) + else: + entity_ls.append(File()) + + return entity_ls + else: + raise ValueError("create_children was called with wrong type of StructureElement") + return None -- GitLab From b2ea2ee267dd01accdcec1115d180e81e6035023 Mon Sep 17 00:00:00 2001 From: Alexander Schlemmer <a.schlemmer@indiscale.com> Date: Fri, 15 Nov 2024 11:34:34 +0100 Subject: [PATCH 017/131] FIX(converters): name and path added to structure elements --- src/caoscrawler/converters/zipfile.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/caoscrawler/converters/zipfile.py b/src/caoscrawler/converters/zipfile.py index 49bfcc36..7073e66a 100644 --- a/src/caoscrawler/converters/zipfile.py +++ b/src/caoscrawler/converters/zipfile.py @@ -70,10 +70,11 @@ class ZipFileConverter(SimpleFileConverter): entity_ls = [] for el in os.listdir(unzd_path): - if isdir(join(unzd_path, el)): - entity_ls.append(Directory()) + path = join(unzd_path, el) + if isdir(path): + entity_ls.append(Directory(el, path)) else: - entity_ls.append(File()) + entity_ls.append(File(el, path)) return entity_ls else: -- GitLab From b93710647990d8e44bad730e680e141828c740ae Mon Sep 17 00:00:00 2001 From: Alexander Schlemmer <a.schlemmer@indiscale.com> Date: Fri, 15 Nov 2024 11:45:51 +0100 Subject: [PATCH 018/131] TST(converters): test for zipfile converter --- unittests/test_zipfile_converter.py | 79 +++++++++++++++++++++++++++++ 1 file changed, 79 insertions(+) create mode 100644 unittests/test_zipfile_converter.py diff --git a/unittests/test_zipfile_converter.py b/unittests/test_zipfile_converter.py new file mode 100644 index 00000000..68020e49 --- /dev/null +++ b/unittests/test_zipfile_converter.py @@ -0,0 
+1,79 @@ +#!/usr/bin/env python3 +# encoding: utf-8 +# +# This file is a part of the LinkAhead Project. +# +# Copyright (C) 2024 Indiscale GmbH <info@indiscale.com> +# Copyright (C) 2024 Alexander Schlemmer <a.schlemmer@indiscale.com> +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <https://www.gnu.org/licenses/>. +# + +""" +test the XML converters +""" +import importlib +import os +from pathlib import Path + +import pytest +import yaml +from caoscrawler.converters import DirectoryConverter, ZipFileConverter +from caoscrawler.stores import GeneralStore +from caoscrawler.structure_elements import Directory, File + +UNITTESTDIR = Path(__file__).parent + + +@pytest.fixture +def converter_registry(): + converter_registry: dict[str, dict[str, str]] = { + "ZipFile": { + "converter": "ZipFileConverter", + "package": "caoscrawler.converters"}, + } + + for key, value in converter_registry.items(): + module = importlib.import_module(value["package"]) + value["class"] = getattr(module, value["converter"]) + return converter_registry + + +def test_zipfile_converter(converter_registry): + zipfile = File("PASTA.eln", os.path.join(UNITTESTDIR, "eln_files", "PASTA.eln")) + zip_conv = ZipFileConverter(yaml.safe_load(""" +type: ZipFile +match: .*$ +"""), "TestZipFileConverter", converter_registry) + + match = zip_conv.match(zipfile) + assert match is not None + + children = 
zip_conv.create_children(GeneralStore(), zipfile) + assert len(children) == 1 + assert children[0].name == "PASTA" + + dir_conv = DirectoryConverter(yaml.safe_load(""" +type: Directory +match: ^PASTA$ +"""), "TestDirectory", converter_registry) + match = dir_conv.match(children[0]) + assert match is not None + children = dir_conv.create_children(GeneralStore(), children[0]) + assert len(children) == 5 + print(children) + for i in range(2): + assert isinstance(children[i], Directory) + for i in range(2, 5): + assert isinstance(children[i], File) -- GitLab From e72336a7476ee4a7cec858f04d675e1d57734aa4 Mon Sep 17 00:00:00 2001 From: Alexander Schlemmer <a.schlemmer@indiscale.com> Date: Fri, 15 Nov 2024 11:51:59 +0100 Subject: [PATCH 019/131] DOC(converters): doc for zipfile converter --- src/doc/converters/standard_converters.rst | 27 ++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/src/doc/converters/standard_converters.rst b/src/doc/converters/standard_converters.rst index 586b84b4..b988172d 100644 --- a/src/doc/converters/standard_converters.rst +++ b/src/doc/converters/standard_converters.rst @@ -331,3 +331,30 @@ XMLTextNodeConverter In the future, this converter can be used to match XMLTextNodes that are generated by the XMLTagConverter. + + +ZipFileConverter +---------------- + +This converter opens zip files, unzips them into a temporary directory and +exposes its contents as File structure elements. + +Usage Example: +============== + +.. code-block:: yaml + + ExampleZipFile: + type: ZipFile + match: example\.zip$ + subtree: + DirInsideZip: + type: Directory + match: experiments$ + FileInsideZip: + type: File + match: description.odt$ + +This converter will match and open files called `example.zip`. +If the file contains a directory called `experiments` it will be processed further by the respective +converter in the subtree. The same is true for a file called `description.odt`. 
-- GitLab From 9f34c4dc0c3687ea93145e9746e8c74b572a1ae6 Mon Sep 17 00:00:00 2001 From: Alexander Schlemmer <a.schlemmer@indiscale.com> Date: Fri, 15 Nov 2024 11:52:22 +0100 Subject: [PATCH 020/131] MAINT: cleanup of imports --- unittests/test_rocrate_converter.py | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/unittests/test_rocrate_converter.py b/unittests/test_rocrate_converter.py index ef59a37c..dc7cef9f 100644 --- a/unittests/test_rocrate_converter.py +++ b/unittests/test_rocrate_converter.py @@ -27,22 +27,16 @@ import importlib import os from pathlib import Path -import jsonschema import linkahead as db import pytest import rocrate import yaml -from linkahead.high_level_api import convert_to_python_object -from lxml.etree import fromstring -from rocrate.model.entity import Entity -from rocrate.rocrate import ROCrate - from caoscrawler import scanner from caoscrawler.converters import ELNFileConverter, ROCrateEntityConverter -from caoscrawler.scanner import load_definition from caoscrawler.stores import GeneralStore from caoscrawler.structure_elements import (DictElement, File, ROCrateEntity, TextElement) +from rocrate.model.entity import Entity UNITTESTDIR = Path(__file__).parent -- GitLab From 71e73242c54214982ef1723ddfb6322683094a2c Mon Sep 17 00:00:00 2001 From: Alexander Schlemmer <a.schlemmer@indiscale.com> Date: Fri, 15 Nov 2024 11:53:11 +0100 Subject: [PATCH 021/131] DOC(converters): updated changelog --- CHANGELOG.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index f106e8f0..57a8ef48 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Added ### +- ZipFileConverter that opens zip files and exposes their contents as File and Directory structure elements. 
+ ### Changed ### ### Deprecated ### -- GitLab From 21bfe7d3614a14cf8779c3d94e9429fc1f03b40e Mon Sep 17 00:00:00 2001 From: Alexander Schlemmer <a.schlemmer@indiscale.com> Date: Mon, 18 Nov 2024 10:42:15 +0100 Subject: [PATCH 022/131] ENH(core): new validator module that is supposed to check created records using a json schema --- src/caoscrawler/validator.py | 68 ++++++++++++++++++++++++++++++++++++ 1 file changed, 68 insertions(+) create mode 100644 src/caoscrawler/validator.py diff --git a/src/caoscrawler/validator.py b/src/caoscrawler/validator.py new file mode 100644 index 00000000..7e7a2eb3 --- /dev/null +++ b/src/caoscrawler/validator.py @@ -0,0 +1,68 @@ +#!/usr/bin/env python3 +# encoding: utf-8 +# +# This file is a part of the LinkAhead Project. +# +# Copyright (C) 2024 Indiscale GmbH <info@indiscale.com> +# Copyright (C) 2024 Alexander Schlemmer +# +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <https://www.gnu.org/licenses/>. +# +# ** end header +# + +""" +This module contains functions to validate the output of a scanner run with a +json schema. 
+""" + + +import json + +import jsonschema +import linkahead as db +# from caosadvancedtools.models.parser import parse_model_from_string +from caosadvancedtools.json_schema_exporter import recordtype_to_json_schema +from caosadvancedtools.models.parser import parse_model_from_yaml +from linkahead.high_level_api import convert_to_python_object + +from caoscrawler import scanner + + +def load_json_schema_from_datamodel_yaml(filename: str) -> list: + """ + Load a data model yaml file (using caosadvancedtools) and convert + all record types into a json schema using the json_schema_exporter module. + + Arguments + --------- + filename: str + The filename of the yaml file to load. + + Returns + ------- + A list of json schema objects. + """ + + model = parse_model_from_yaml(filename) + + # TODO: fix needed (https://gitlab.indiscale.com/caosdb/customers/f-fit/management/-/issues/58) + + rt_schemas = [] + for el in model: + if isinstance(el, db.RecordType): + rt_schemas.append(recordtype_to_json_schema(el)) + + return rt_schemas -- GitLab From b5414af6d1ce30d80f1503b3fcc54091ec393df4 Mon Sep 17 00:00:00 2001 From: Florian Spreckelsen <f.spreckelsen@indiscale.com> Date: Mon, 18 Nov 2024 15:24:17 +0100 Subject: [PATCH 023/131] DOC: Fix docstring format --- CHANGELOG.md | 7 ++-- src/caoscrawler/converters/converters.py | 45 ++++++++++++++---------- src/doc/cfood.rst | 6 ++-- 3 files changed, 34 insertions(+), 24 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 845f4e39..71fdba54 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,8 +9,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Added ### -- DictElementConverters can now make use of `match_properties` which works analogous to `match_properties` in ROCrateEntityConverter and `match_attrib` in XMLConverter. -- `match_properties` is a method of class Converter and can for example be used by CustomConverters. 
+- DictElementConverters can now make use of `match_properties` which + works analogous to `match_properties` in ROCrateEntityConverter and + `match_attrib` in XMLConverter. +- `match_properties` is a method of class Converter and can for + example be used by CustomConverters. ### Changed ### diff --git a/src/caoscrawler/converters/converters.py b/src/caoscrawler/converters/converters.py index 6e3c8ffe..3a3c7e29 100644 --- a/src/caoscrawler/converters/converters.py +++ b/src/caoscrawler/converters/converters.py @@ -457,48 +457,55 @@ class Converter(object, metaclass=ABCMeta): values.update(m) def match_properties(self, properties: dict, vardict: dict, label: str = "match_properties"): - """ - This method can be used to generically match 'match_properties' from the cfood definition + """This method can be used to generically match 'match_properties' from the cfood definition with the behavior described as follows: 'match_properties' is a dictionary of key-regexps and value-regexp pairs. Each key matches a property name and the corresponding value matches its property value. What a property means in the context of the respective converter can be different, examples: - XMLTag: attributes of the node - ROCrate: properties of the ROCrateEntity - DictElement: properties of the dict + + * XMLTag: attributes of the node + * ROCrate: properties of the ROCrateEntity + * DictElement: properties of the dict label can be used to customize the name of the dictionary in the definition. This method is not called by default, but can be called from child classes. - Typically it would be used like this from methods overwriting `match`: - > if not self.match_properties(<properties>, vardict): - > return None + Typically it would be used like this from methods overwriting `match`:: + + if not self.match_properties(<properties>, vardict): + return None - vardict will be updated in place when there are matches. 
- <properties> is a dictionary taken from the structure element that contains the properties in the - context of this converter. + vardict will be updated in place when there are + matches. <properties> is a dictionary taken from the structure + element that contains the properties in the context of this + converter. - Arguments: + Parameters ---------- properties: dict - The dictionary containing the properties to be matched. + The dictionary containing the properties to be matched. vardict: dict - This dictionary will be used to store the variables created during the matching. + This dictionary will be used to store the variables created during the matching. label: str - Default "match_properties". Can be used to change the name of the property in the definition. E.g. the - xml converter uses "match_attrib" which makes more sense in the context of xml trees. + Default "match_properties". Can be used to change the name + of the property in the definition. E.g. the xml converter + uses "match_attrib" which makes more sense in the context + of xml trees. + + Returns + ------- - Returns: - -------- + : bool + Returns True when properties match and False + otherwise. The vardict dictionary is updated in place. - Returns True when properties match and False otherwise. The vardict dictionary is updated in place. """ if label in self.definition: # This matcher works analogously to the attributes matcher in the XMLConverter diff --git a/src/doc/cfood.rst b/src/doc/cfood.rst index a42d5930..0c7726d2 100644 --- a/src/doc/cfood.rst +++ b/src/doc/cfood.rst @@ -207,9 +207,9 @@ following. ValueWithUnitElt: type: TextElement match_name: ^my_prop$ - match_value: "^(?P<number>\\d+\\.?\\d*)\s+(?P<unit>.+)" # Extract value and unit from a string which - # has a number followed by at least one whitespace - # character followed by a unit. 
+ match_value: "^(?P<number>\\d+\\.?\\d*)\\s+(?P<unit>.+)" # Extract value and unit from a string which + # has a number followed by at least one whitespace + # character followed by a unit. records: MyRecord: MyProp: -- GitLab From 1d48703dd03f773f28b0ee7b012f757cec1c5206 Mon Sep 17 00:00:00 2001 From: Florian Spreckelsen <f.spreckelsen@indiscale.com> Date: Tue, 19 Nov 2024 14:10:42 +0100 Subject: [PATCH 024/131] MAINT: Rename to avoid duplication --- src/caoscrawler/converters/__init__.py | 2 +- src/caoscrawler/converters/{zipfile.py => zipfile_converter.py} | 0 unittests/test_zipfile_converter.py | 2 +- 3 files changed, 2 insertions(+), 2 deletions(-) rename src/caoscrawler/converters/{zipfile.py => zipfile_converter.py} (100%) diff --git a/src/caoscrawler/converters/__init__.py b/src/caoscrawler/converters/__init__.py index 51d85605..3ec22233 100644 --- a/src/caoscrawler/converters/__init__.py +++ b/src/caoscrawler/converters/__init__.py @@ -23,7 +23,7 @@ from .. import utils from .converters import * from .xml_converter import * -from .zipfile import * +from .zipfile_converter import ZipFileConverter try: from .spss import SPSSConverter diff --git a/src/caoscrawler/converters/zipfile.py b/src/caoscrawler/converters/zipfile_converter.py similarity index 100% rename from src/caoscrawler/converters/zipfile.py rename to src/caoscrawler/converters/zipfile_converter.py diff --git a/unittests/test_zipfile_converter.py b/unittests/test_zipfile_converter.py index 68020e49..9bc8b880 100644 --- a/unittests/test_zipfile_converter.py +++ b/unittests/test_zipfile_converter.py @@ -21,7 +21,7 @@ # """ -test the XML converters +test the zip-file converter """ import importlib import os -- GitLab From 5f3fecfd0f9935bdee9a5baacee56d5c47159925 Mon Sep 17 00:00:00 2001 From: Florian Spreckelsen <f.spreckelsen@indiscale.com> Date: Tue, 19 Nov 2024 14:17:29 +0100 Subject: [PATCH 025/131] DOC: Fix header levels --- src/caoscrawler/converters/__init__.py | 2 +- 
src/doc/converters/standard_converters.rst | 11 ++++++----- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/src/caoscrawler/converters/__init__.py b/src/caoscrawler/converters/__init__.py index 3ec22233..edb7b363 100644 --- a/src/caoscrawler/converters/__init__.py +++ b/src/caoscrawler/converters/__init__.py @@ -18,7 +18,7 @@ # You should have received a copy of the GNU Affero General Public License # along with this program. If not, see <https://www.gnu.org/licenses/>. -"""Submdule containing all default and optional converters.""" +"""Submodule containing all default and optional converters.""" from .. import utils from .converters import * diff --git a/src/doc/converters/standard_converters.rst b/src/doc/converters/standard_converters.rst index 0520f56b..f7f18794 100644 --- a/src/doc/converters/standard_converters.rst +++ b/src/doc/converters/standard_converters.rst @@ -369,13 +369,13 @@ are generated by the XMLTagConverter. ZipFileConverter ----------------- +================ This converter opens zip files, unzips them into a temporary directory and exposes its contents as File structure elements. Usage Example: -============== +-------------- .. code-block:: yaml @@ -390,6 +390,7 @@ Usage Example: type: File match: description.odt$ -This converter will match and open files called `example.zip`. -If the file contains a directory called `experiments` it will be processed further by the respective -converter in the subtree. The same is true for a file called `description.odt`. +This converter will match and open files called ``example.zip``. If +the file contains a directory called ``experiments`` it will be +processed further by the respective converter in the subtree. The same +is true for a file called ``description.odt``. 
-- GitLab From a093ca8b060c5590157c346818b0760f908d0601 Mon Sep 17 00:00:00 2001 From: Daniel <d.hornung@indiscale.com> Date: Tue, 19 Nov 2024 14:53:42 +0100 Subject: [PATCH 026/131] FIX: module path in spss_to_datamodel script. --- setup.cfg | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.cfg b/setup.cfg index d00202b9..acde906c 100644 --- a/setup.cfg +++ b/setup.cfg @@ -40,7 +40,7 @@ per-file-ignores = __init__.py:F401 [options.entry_points] console_scripts = caosdb-crawler = caoscrawler.crawl:main - spss_to_datamodel = caoscrawler.conv_impl.spss:spss_to_datamodel_main + spss_to_datamodel = caoscrawler.converters.spss:spss_to_datamodel_main csv_to_datamodel = caoscrawler.scripts.generators:csv_to_datamodel_main [options.extras_require] -- GitLab From 99695ba63624fe3f36a2bbac2aac0d58a69d0f42 Mon Sep 17 00:00:00 2001 From: Daniel <d.hornung@indiscale.com> Date: Tue, 19 Nov 2024 14:54:00 +0100 Subject: [PATCH 027/131] WIP: Pipeline for script tests. --- unittests/test_scripts.py | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) create mode 100644 unittests/test_scripts.py diff --git a/unittests/test_scripts.py b/unittests/test_scripts.py new file mode 100644 index 00000000..c620a8a1 --- /dev/null +++ b/unittests/test_scripts.py @@ -0,0 +1,36 @@ +#!/usr/bin/env python3 + +# This file is a part of the LinkAhead project. +# +# Copyright (C) 2024 IndiScale GmbH <www.indiscale.com> +# Copyright (C) 2024 Daniel Hornung <d.hornung@indiscale.com> +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <https://www.gnu.org/licenses/>. + +"""Test if the scripts work as expected. +""" + +from subprocess import run + +SCRIPTS = [ + "caosdb-crawler", + "spss_to_datamodel", + "csv_to_datamodel", +] + + +def test_script_loading(): + """Run the scripts with "-h".""" + for script in SCRIPTS: + run([script, "-h"], check=True) -- GitLab From fae51774da017da87cfc456b32ac5ea26623ac68 Mon Sep 17 00:00:00 2001 From: Daniel <d.hornung@indiscale.com> Date: Tue, 19 Nov 2024 15:18:16 +0100 Subject: [PATCH 028/131] DOC: CHANGELOG --- CHANGELOG.md | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index f106e8f0..3c3e9abc 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -17,6 +17,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Fixed ### +- `spss_to_datamodel` script works again. + ### Security ### ### Documentation ### @@ -46,9 +48,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Units for properties. They can be specified by giving the property as a dict in the form ```yaml MyRecord: - my_prop: - value: 5 - unit: m + my_prop: + value: 5 + unit: m ``` - Support for Python 3.13 - ROCrateConverter, ELNFileConverter and ROCrateEntityConverter for crawling ROCrate and .eln files -- GitLab From ffeb0667c297ac692f20560065084ab2e651c2ec Mon Sep 17 00:00:00 2001 From: Daniel <d.hornung@indiscale.com> Date: Tue, 19 Nov 2024 15:39:20 +0100 Subject: [PATCH 029/131] MAINT: Added linkahead-crawler script alias. 
--- CHANGELOG.md | 2 ++ setup.cfg | 1 + unittests/test_scripts.py | 1 + 3 files changed, 4 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 3c3e9abc..07c7c4a7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Added ### +* `linkahead-crawler` script as alias for `caosdb-crawler`. + ### Changed ### ### Deprecated ### diff --git a/setup.cfg b/setup.cfg index acde906c..d05f2acb 100644 --- a/setup.cfg +++ b/setup.cfg @@ -39,6 +39,7 @@ per-file-ignores = __init__.py:F401 [options.entry_points] console_scripts = + linkahead-crawler = caoscrawler.crawl:main caosdb-crawler = caoscrawler.crawl:main spss_to_datamodel = caoscrawler.converters.spss:spss_to_datamodel_main csv_to_datamodel = caoscrawler.scripts.generators:csv_to_datamodel_main diff --git a/unittests/test_scripts.py b/unittests/test_scripts.py index c620a8a1..da03c1f2 100644 --- a/unittests/test_scripts.py +++ b/unittests/test_scripts.py @@ -24,6 +24,7 @@ from subprocess import run SCRIPTS = [ + "linkahead-crawler", "caosdb-crawler", "spss_to_datamodel", "csv_to_datamodel", -- GitLab From 60ae1ad4eef9d588016764b0593b61d5134082b9 Mon Sep 17 00:00:00 2001 From: Alexander Schlemmer <a.schlemmer@indiscale.com> Date: Mon, 25 Nov 2024 14:21:16 +0100 Subject: [PATCH 030/131] ENH(scanner): json schema validation functions and test --- src/caoscrawler/validator.py | 42 +++++++++++++++++-- unittests/datamodels/datamodel.yaml | 6 +++ unittests/test_validation.py | 64 +++++++++++++++++++++++++++++ 3 files changed, 109 insertions(+), 3 deletions(-) create mode 100644 unittests/datamodels/datamodel.yaml create mode 100644 unittests/test_validation.py diff --git a/src/caoscrawler/validator.py b/src/caoscrawler/validator.py index 7e7a2eb3..e1d36a39 100644 --- a/src/caoscrawler/validator.py +++ b/src/caoscrawler/validator.py @@ -40,6 +40,8 @@ from linkahead.high_level_api import convert_to_python_object from caoscrawler 
import scanner +# from collections import OrderedDict + def load_json_schema_from_datamodel_yaml(filename: str) -> list: """ @@ -58,11 +60,45 @@ def load_json_schema_from_datamodel_yaml(filename: str) -> list: model = parse_model_from_yaml(filename) - # TODO: fix needed (https://gitlab.indiscale.com/caosdb/customers/f-fit/management/-/issues/58) - rt_schemas = [] - for el in model: + for el_key, el in model.items(): if isinstance(el, db.RecordType): rt_schemas.append(recordtype_to_json_schema(el)) return rt_schemas + + +def representer_ordereddict(dumper, data): + # yaml.add_representer(OrderedDict, caoscrawler.validator.representer_ordereddict) + return dumper.represent_data(dict(data)) + + +def convert_record(record: db.Record): + """ + Convert a record into a form suitable for validation with jsonschema. + + Uses high_level_api.convert_to_python_object + + Changes applied: + - properties are moved vom subitem "proeprties" to top-level. + - The following keys are deleted: parents, role, name, description, metadata, properties + + Arguments: + ---------- + record: db.Record + The record that is supposed to be converted. 
+ """ + pobj = convert_to_python_object(record).serialize() + + for prop in pobj["properties"]: + pobj[prop] = pobj["properties"][prop] + + for keyd in ("parents", "role", "name", + "description", "metadata", "properties"): + if keyd in pobj: + del pobj[keyd] + + return pobj + +# def validate(schema, records): +# pass diff --git a/unittests/datamodels/datamodel.yaml b/unittests/datamodels/datamodel.yaml new file mode 100644 index 00000000..2759ecba --- /dev/null +++ b/unittests/datamodels/datamodel.yaml @@ -0,0 +1,6 @@ +Dataset: + obligatory_properties: + keywords: + datatype: TEXT + dateModified: + datatype: DATETIME diff --git a/unittests/test_validation.py b/unittests/test_validation.py new file mode 100644 index 00000000..45462ac8 --- /dev/null +++ b/unittests/test_validation.py @@ -0,0 +1,64 @@ +#!/usr/bin/env python3 +# encoding: utf-8 +# +# This file is a part of the LinkAhead Project. +# +# Copyright (C) 2024 Indiscale GmbH <info@indiscale.com> +# Copyright (C) 2024 Alexander Schlemmer <a.schlemmer@indiscale.com> +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <https://www.gnu.org/licenses/>. 
+# + +""" +test validation +""" +import importlib +import os +from os.path import join +from pathlib import Path + +import caoscrawler +import jsonschema +import linkahead as db +import pytest +import yaml +from caoscrawler.validator import (convert_record, + load_json_schema_from_datamodel_yaml) +from jsonschema import ValidationError + +UNITTESTDIR = Path(__file__).parent + + +def test_create_json_schema(): + json = load_json_schema_from_datamodel_yaml(join(UNITTESTDIR, "datamodels", "datamodel.yaml")) + r = db.Record() + r.add_parent(name="Dataset") + r.add_property(name="keywords", value="jakdlfjakdf") + r.add_property(name="dateModified", value="2024-11-16") + + pobj = convert_record(r) + # print(yaml.dump(pobj)) + # print(yaml.dump(json[0])) + jsonschema.validate(pobj, json[0]) + + # Failing test: + r = db.Record() + r.add_parent(name="Dataset") + r.add_property(name="keywordss", value="jakdlfjakdf") + r.add_property(name="dateModified", value="2024-11-16") + + pobj = convert_record(r) + + with pytest.raises(ValidationError, match=".*'keywords' is a required property.*"): + jsonschema.validate(pobj, json[0]) -- GitLab From f291fbf4dec94c5feb4e57e15c3f903390e1f12d Mon Sep 17 00:00:00 2001 From: Alexander Schlemmer <a.schlemmer@indiscale.com> Date: Mon, 25 Nov 2024 14:31:59 +0100 Subject: [PATCH 031/131] DOC(scanner): added docstring for validation function --- src/caoscrawler/validator.py | 26 ++++++++++++++++++++++++-- 1 file changed, 24 insertions(+), 2 deletions(-) diff --git a/src/caoscrawler/validator.py b/src/caoscrawler/validator.py index e1d36a39..91313daf 100644 --- a/src/caoscrawler/validator.py +++ b/src/caoscrawler/validator.py @@ -100,5 +100,27 @@ def convert_record(record: db.Record): return pobj -# def validate(schema, records): -# pass + +def validate(records: list[db.Record], schema: dict) -> tuple[list, list]: + """ + Validate a list of records against a JSON schema. 
+ + Arguments: + ---------- + + records: list[db.Record] + List of records that will be validated. + + schema: dict + A JSON schema generated using `load_json_schema_from_datamodel_yaml`. + + Returns: + -------- + A tuple containing two elements: + + - Index 0: A list of boolean values, one for each record in `records` determining whether + the validation was successful. + - Index 1: A list of ValidationErrors (in case of insuccesful validation) or None if + the validation was successful. + """ + pass -- GitLab From 2cceed883c59ed95961b9e58129d940042ec7d7c Mon Sep 17 00:00:00 2001 From: Alexander Schlemmer <a.schlemmer@indiscale.com> Date: Tue, 26 Nov 2024 11:32:52 +0100 Subject: [PATCH 032/131] ENH(scanner): validation function for json schemas --- src/caoscrawler/validator.py | 30 ++++++++++++++++++++---------- unittests/test_validation.py | 26 +++++++++++++++++++++++++- 2 files changed, 45 insertions(+), 11 deletions(-) diff --git a/src/caoscrawler/validator.py b/src/caoscrawler/validator.py index 91313daf..c91a5224 100644 --- a/src/caoscrawler/validator.py +++ b/src/caoscrawler/validator.py @@ -28,14 +28,12 @@ This module contains functions to validate the output of a scanner run with a json schema. """ - -import json - import jsonschema import linkahead as db # from caosadvancedtools.models.parser import parse_model_from_string from caosadvancedtools.json_schema_exporter import recordtype_to_json_schema from caosadvancedtools.models.parser import parse_model_from_yaml +from jsonschema import ValidationError from linkahead.high_level_api import convert_to_python_object from caoscrawler import scanner @@ -101,9 +99,12 @@ def convert_record(record: db.Record): return pobj -def validate(records: list[db.Record], schema: dict) -> tuple[list, list]: +def validate(records: list[db.Record], schemas: list[dict]) -> tuple[list, list]: """ - Validate a list of records against a JSON schema. + Validate a list of records against a list of possible JSON schemas. 
+ + It is tried to validate each schema from the list of schemas. If none of them validates + without error, it is assumed that it does not match at all. Arguments: ---------- @@ -111,8 +112,8 @@ def validate(records: list[db.Record], schema: dict) -> tuple[list, list]: records: list[db.Record] List of records that will be validated. - schema: dict - A JSON schema generated using `load_json_schema_from_datamodel_yaml`. + schemas: list[dict] + A list of JSON schemas generated using `load_json_schema_from_datamodel_yaml`. Returns: -------- @@ -120,7 +121,16 @@ def validate(records: list[db.Record], schema: dict) -> tuple[list, list]: - Index 0: A list of boolean values, one for each record in `records` determining whether the validation was successful. - - Index 1: A list of ValidationErrors (in case of insuccesful validation) or None if - the validation was successful. + - Index 1: A list of schemas matching the record at this position of the list `records`. """ - pass + retval = [] + for r in records: + matching_schemas = [] + for schema in schemas: + try: + jsonschema.validate(convert_record(r), schema) + matching_schemas.append(schema) + except ValidationError: + pass + retval.append((len(matching_schemas) > 0, matching_schemas)) + return retval diff --git a/unittests/test_validation.py b/unittests/test_validation.py index 45462ac8..42bf33ba 100644 --- a/unittests/test_validation.py +++ b/unittests/test_validation.py @@ -34,7 +34,8 @@ import linkahead as db import pytest import yaml from caoscrawler.validator import (convert_record, - load_json_schema_from_datamodel_yaml) + load_json_schema_from_datamodel_yaml, + validate) from jsonschema import ValidationError UNITTESTDIR = Path(__file__).parent @@ -62,3 +63,26 @@ def test_create_json_schema(): with pytest.raises(ValidationError, match=".*'keywords' is a required property.*"): jsonschema.validate(pobj, json[0]) + + +def test_validation(): + """ + Test for the main validation API function `validate` + """ + json 
= load_json_schema_from_datamodel_yaml( + join(UNITTESTDIR, "datamodels", "datamodel.yaml")) + r1 = db.Record() + r1.add_parent(name="Dataset") + r1.add_property(name="keywords", value="jakdlfjakdf") + r1.add_property(name="dateModified", value="2024-11-16") + + r2 = db.Record() + r2.add_parent(name="Dataset") + r2.add_property(name="keywordss", value="jakdlfjakdf") + r2.add_property(name="dateModified", value="2024-11-16") + + valres = validate([r1, r2], json) + assert valres[0][0] + assert len(valres[0][1]) == 1 + assert valres[0][1][0] == json[0] + assert len(valres[1][1]) == 0 -- GitLab From 910b3fdcae195a8cea396fb4127710b052fc6bcc Mon Sep 17 00:00:00 2001 From: Alexander Schlemmer <a.schlemmer@indiscale.com> Date: Tue, 26 Nov 2024 11:56:44 +0100 Subject: [PATCH 033/131] DOC(scanner): updated changelog --- CHANGELOG.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index f106e8f0..aafd19ba 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Added ### +* Validation module for checking a list of generated records against a list of json schemas + that can be generated from a yaml data model file. 
+ ### Changed ### ### Deprecated ### -- GitLab From 5f84d8fdb67f3eb7edee23422005ba98c1db10cb Mon Sep 17 00:00:00 2001 From: Alexander Schlemmer <a.schlemmer@indiscale.com> Date: Tue, 26 Nov 2024 14:16:23 +0100 Subject: [PATCH 034/131] DOC(validator): corrected docstring of validate function --- src/caoscrawler/validator.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/caoscrawler/validator.py b/src/caoscrawler/validator.py index c91a5224..1663aa46 100644 --- a/src/caoscrawler/validator.py +++ b/src/caoscrawler/validator.py @@ -99,7 +99,7 @@ def convert_record(record: db.Record): return pobj -def validate(records: list[db.Record], schemas: list[dict]) -> tuple[list, list]: +def validate(records: list[db.Record], schemas: list[dict]) -> list[tuple[bool, list]]: """ Validate a list of records against a list of possible JSON schemas. @@ -117,10 +117,9 @@ def validate(records: list[db.Record], schemas: list[dict]) -> tuple[list, list] Returns: -------- - A tuple containing two elements: + A list of tuples, one element for each record: - - Index 0: A list of boolean values, one for each record in `records` determining whether - the validation was successful. + - Index 0: A boolean that determines whether at least one schema matched for this record. - Index 1: A list of schemas matching the record at this position of the list `records`. 
""" retval = [] -- GitLab From 08e7aa5b902a22c23130d447fcf371c772a70ef6 Mon Sep 17 00:00:00 2001 From: Alexander Schlemmer <a.schlemmer@indiscale.com> Date: Tue, 26 Nov 2024 13:17:21 +0000 Subject: [PATCH 035/131] TST(validator): improve readability of unit test --- unittests/test_validation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unittests/test_validation.py b/unittests/test_validation.py index 42bf33ba..6db0674e 100644 --- a/unittests/test_validation.py +++ b/unittests/test_validation.py @@ -82,7 +82,7 @@ def test_validation(): r2.add_property(name="dateModified", value="2024-11-16") valres = validate([r1, r2], json) - assert valres[0][0] + assert valres[0][0] is True assert len(valres[0][1]) == 1 assert valres[0][1][0] == json[0] assert len(valres[1][1]) == 0 -- GitLab From 1907eee26191bcfe235516d86cd8ab5012c50487 Mon Sep 17 00:00:00 2001 From: Alexander Schlemmer <a.schlemmer@indiscale.com> Date: Tue, 26 Nov 2024 14:22:33 +0100 Subject: [PATCH 036/131] ENH(transformers): new transformer functions for casting types of variables --- src/caoscrawler/default_transformers.yml | 12 ++++++++++++ src/caoscrawler/transformer_functions.py | 16 ++++++++++++++++ 2 files changed, 28 insertions(+) diff --git a/src/caoscrawler/default_transformers.yml b/src/caoscrawler/default_transformers.yml index ffcb1b15..0de9a6e0 100644 --- a/src/caoscrawler/default_transformers.yml +++ b/src/caoscrawler/default_transformers.yml @@ -15,3 +15,15 @@ date_parse: datetime_parse: package: caoscrawler.transformer_functions function: datetime_parse +cast_to_int: + package: caoscrawler.transformer_functions + function: cast_to_int +cast_to_float: + package: caoscrawler.transformer_functions + function: cast_to_float +cast_to_bool: + package: caoscrawler.transformer_functions + function: cast_to_bool +cast_to_str: + package: caoscrawler.transformer_functions + function: cast_to_str diff --git a/src/caoscrawler/transformer_functions.py 
b/src/caoscrawler/transformer_functions.py index ce08bc6b..ddc1fe94 100644 --- a/src/caoscrawler/transformer_functions.py +++ b/src/caoscrawler/transformer_functions.py @@ -99,3 +99,19 @@ Parameters fmt = params.get("datetime_format", fmt_default) dt_str = datetime.datetime.strptime(in_value, fmt).strftime(fmt_default) return dt_str + + +def cast_to_int(in_value: Any, params: dict) -> int: + return int(in_value) + + +def cast_to_float(in_value: Any, params: dict) -> float: + return float(in_value) + + +def cast_to_bool(in_value: Any, params: dict) -> bool: + return bool(in_value) + + +def cast_to_str(in_value: Any, params: dict) -> str: + return str(in_value) -- GitLab From 2b7035c41cb392df3074d8010b7f93f092d254df Mon Sep 17 00:00:00 2001 From: Alexander Schlemmer <a.schlemmer@indiscale.com> Date: Thu, 28 Nov 2024 13:16:05 +0100 Subject: [PATCH 037/131] ENH(scanner): variables are replaced in parameters of transformer functions --- src/caoscrawler/converters/converters.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/src/caoscrawler/converters/converters.py b/src/caoscrawler/converters/converters.py index 3a3c7e29..bb069e5d 100644 --- a/src/caoscrawler/converters/converters.py +++ b/src/caoscrawler/converters/converters.py @@ -575,10 +575,17 @@ class Converter(object, metaclass=ABCMeta): " one element with they key being the name" " of the function!") tr_func_key = list(tr_func_el.keys())[0] - tr_func_params = tr_func_el[tr_func_key] + + # Create a copy of the function parameters: + tr_func_params = dict(tr_func_el[tr_func_key]) + if tr_func_key not in transformer_functions: raise RuntimeError("Unknown transformer function: {}".format(tr_func_key)) + # Do variable replacment on function parameters: + for key in tr_func_params: + tr_func_params[key] = replace_variables(tr_func_params[key]) + # Retrieve the function from the dictionary: tr_func = transformer_functions[tr_func_key] # Call the function: -- GitLab From 
d6c95eb2f6c395fb061c87886ee03abb73a93d32 Mon Sep 17 00:00:00 2001 From: Alexander Schlemmer <a.schlemmer@indiscale.com> Date: Thu, 28 Nov 2024 13:20:40 +0100 Subject: [PATCH 038/131] FIX(converters): argument missing in replace_variables --- src/caoscrawler/converters/converters.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/caoscrawler/converters/converters.py b/src/caoscrawler/converters/converters.py index bb069e5d..6f544ddb 100644 --- a/src/caoscrawler/converters/converters.py +++ b/src/caoscrawler/converters/converters.py @@ -584,7 +584,7 @@ class Converter(object, metaclass=ABCMeta): # Do variable replacment on function parameters: for key in tr_func_params: - tr_func_params[key] = replace_variables(tr_func_params[key]) + tr_func_params[key] = replace_variables(tr_func_params[key], values) # Retrieve the function from the dictionary: tr_func = transformer_functions[tr_func_key] -- GitLab From ae694da61cb92f8b1e29285ae8d2fd2fa900e4aa Mon Sep 17 00:00:00 2001 From: Alexander Schlemmer <a.schlemmer@indiscale.com> Date: Thu, 28 Nov 2024 13:42:20 +0100 Subject: [PATCH 039/131] FIX(validator): patches are applied recursively to match substructure of records --- src/caoscrawler/validator.py | 38 ++++++++++++++++++++++++------------ 1 file changed, 25 insertions(+), 13 deletions(-) diff --git a/src/caoscrawler/validator.py b/src/caoscrawler/validator.py index 1663aa46..56104703 100644 --- a/src/caoscrawler/validator.py +++ b/src/caoscrawler/validator.py @@ -71,25 +71,20 @@ def representer_ordereddict(dumper, data): return dumper.represent_data(dict(data)) -def convert_record(record: db.Record): +def apply_schema_patches(pobj: dict): """ - Convert a record into a form suitable for validation with jsonschema. - - Uses high_level_api.convert_to_python_object - Changes applied: - properties are moved vom subitem "proeprties" to top-level. 
- The following keys are deleted: parents, role, name, description, metadata, properties - - Arguments: - ---------- - record: db.Record - The record that is supposed to be converted. """ - pobj = convert_to_python_object(record).serialize() - + if "properties" not in pobj: + # this is probably a file + return pobj for prop in pobj["properties"]: - pobj[prop] = pobj["properties"][prop] + if isinstance(pobj["properties"][prop], dict): + pobj[prop] = apply_schema_patches(pobj["properties"][prop]) + else: + pobj[prop] = pobj["properties"][prop] for keyd in ("parents", "role", "name", "description", "metadata", "properties"): @@ -99,6 +94,23 @@ def convert_record(record: db.Record): return pobj +def convert_record(record: db.Record): + """ + Convert a record into a form suitable for validation with jsonschema. + + Uses high_level_api.convert_to_python_object + Afterwards apply_schema_patches is called recursively to refactor the dictionary + to match the current form of the jsonschema. + + Arguments: + ---------- + record: db.Record + The record that is supposed to be converted. + """ + pobj = convert_to_python_object(record).serialize() + return apply_schema_patches(pobj) + + def validate(records: list[db.Record], schemas: list[dict]) -> list[tuple[bool, list]]: """ Validate a list of records against a list of possible JSON schemas. 
-- GitLab From 3b2ff777a4869d928df2d73ccb57d47d3c8203d7 Mon Sep 17 00:00:00 2001 From: Alexander Schlemmer <a.schlemmer@indiscale.com> Date: Thu, 28 Nov 2024 13:45:01 +0100 Subject: [PATCH 040/131] FIX(scanner): variables are only replaced if parameters are present at all --- src/caoscrawler/converters/converters.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/caoscrawler/converters/converters.py b/src/caoscrawler/converters/converters.py index 6f544ddb..4205382b 100644 --- a/src/caoscrawler/converters/converters.py +++ b/src/caoscrawler/converters/converters.py @@ -583,8 +583,9 @@ class Converter(object, metaclass=ABCMeta): raise RuntimeError("Unknown transformer function: {}".format(tr_func_key)) # Do variable replacment on function parameters: - for key in tr_func_params: - tr_func_params[key] = replace_variables(tr_func_params[key], values) + if tr_func_params is not None: + for key in tr_func_params: + tr_func_params[key] = replace_variables(tr_func_params[key], values) # Retrieve the function from the dictionary: tr_func = transformer_functions[tr_func_key] -- GitLab From d637644ce4fac3185af498a486c1eab3523023e0 Mon Sep 17 00:00:00 2001 From: Alexander Schlemmer <a.schlemmer@indiscale.com> Date: Thu, 28 Nov 2024 13:48:27 +0100 Subject: [PATCH 041/131] FIX(scanner): parameters can only be copied if present --- src/caoscrawler/converters/converters.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/caoscrawler/converters/converters.py b/src/caoscrawler/converters/converters.py index 4205382b..d06415f7 100644 --- a/src/caoscrawler/converters/converters.py +++ b/src/caoscrawler/converters/converters.py @@ -576,16 +576,17 @@ class Converter(object, metaclass=ABCMeta): " of the function!") tr_func_key = list(tr_func_el.keys())[0] - # Create a copy of the function parameters: - tr_func_params = dict(tr_func_el[tr_func_key]) - if tr_func_key not in transformer_functions: raise RuntimeError("Unknown 
transformer function: {}".format(tr_func_key)) # Do variable replacment on function parameters: - if tr_func_params is not None: + if tr_func_el[tr_func_key] is not None: + # Create a copy of the function parameters: + tr_func_params = dict(tr_func_el[tr_func_key]) for key in tr_func_params: tr_func_params[key] = replace_variables(tr_func_params[key], values) + else: + tr_func_params = None # Retrieve the function from the dictionary: tr_func = transformer_functions[tr_func_key] -- GitLab From 62cc17ae0f300173074a4f4dcd57a35173237e32 Mon Sep 17 00:00:00 2001 From: Alexander Schlemmer <a.schlemmer@indiscale.com> Date: Thu, 28 Nov 2024 20:29:21 +0100 Subject: [PATCH 042/131] DOC(validator): comment on refactoring the validator --- src/caoscrawler/validator.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/caoscrawler/validator.py b/src/caoscrawler/validator.py index 56104703..bb21a611 100644 --- a/src/caoscrawler/validator.py +++ b/src/caoscrawler/validator.py @@ -134,6 +134,14 @@ def validate(records: list[db.Record], schemas: list[dict]) -> list[tuple[bool, - Index 0: A boolean that determines whether at least one schema matched for this record. - Index 1: A list of schemas matching the record at this position of the list `records`. """ + + # TODO: + # I think it makes sense to change the behavior as follows: + # - Only validate the schema that was generated for a specific record type that matches the parent + # record that is validated. + # - With this behavior for each record a single schema is matched, and if it does not match the + # validation error can be returned. 
+ retval = [] for r in records: matching_schemas = [] -- GitLab From 197bf0ab0f96d4b06e1c656b2bff1270be99d76f Mon Sep 17 00:00:00 2001 From: Alexander Schlemmer <a.schlemmer@indiscale.com> Date: Thu, 28 Nov 2024 20:33:14 +0100 Subject: [PATCH 043/131] DOC(transformers): added docstrings to cast_to_* functions --- src/caoscrawler/transformer_functions.py | 28 ++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/src/caoscrawler/transformer_functions.py b/src/caoscrawler/transformer_functions.py index ddc1fe94..03f017b1 100644 --- a/src/caoscrawler/transformer_functions.py +++ b/src/caoscrawler/transformer_functions.py @@ -102,16 +102,44 @@ Parameters def cast_to_int(in_value: Any, params: dict) -> int: + """ + Cast the `in_value` to int. + + Parameters + ========== + No parameters. + """ return int(in_value) def cast_to_float(in_value: Any, params: dict) -> float: + """ + Cast the `in_value` to float. + + Parameters + ========== + No parameters. + """ return float(in_value) def cast_to_bool(in_value: Any, params: dict) -> bool: + """ + Cast the `in_value` to bool. + + Parameters + ========== + No parameters. + """ return bool(in_value) def cast_to_str(in_value: Any, params: dict) -> str: + """ + Cast the `in_value` to str. + + Parameters + ========== + No parameters. 
+ """ return str(in_value) -- GitLab From 685ce79dbb386163b2ee3b3e3500ec70ab1a9307 Mon Sep 17 00:00:00 2001 From: Alexander Schlemmer <a.schlemmer@indiscale.com> Date: Thu, 28 Nov 2024 20:49:33 +0100 Subject: [PATCH 044/131] ENH(transformers): added better logic for converting to bool --- src/caoscrawler/transformer_functions.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/src/caoscrawler/transformer_functions.py b/src/caoscrawler/transformer_functions.py index 03f017b1..117d0b02 100644 --- a/src/caoscrawler/transformer_functions.py +++ b/src/caoscrawler/transformer_functions.py @@ -127,11 +127,20 @@ def cast_to_bool(in_value: Any, params: dict) -> bool: """ Cast the `in_value` to bool. + This is done by comparing `in_value` to "True". + Only "true", "True", "False" and "false" are accepted as possible values. + All other input values raise an error. + Parameters ========== No parameters. """ - return bool(in_value) + val = str(in_value).lower() + if val == "true": + return True + if val == "false": + return False + raise ValueError("Invalid value for type cast to bool: {}".format(in_value)) def cast_to_str(in_value: Any, params: dict) -> str: -- GitLab From e5034c9d332fd97010c08ba33fe82aa85a10ab46 Mon Sep 17 00:00:00 2001 From: Alexander Schlemmer <a.schlemmer@indiscale.com> Date: Thu, 28 Nov 2024 20:50:06 +0100 Subject: [PATCH 045/131] TST(transformers): tests for cast transformer functions --- unittests/test_transformers.py | 37 +++++++++++++++++++++++++++++++--- 1 file changed, 34 insertions(+), 3 deletions(-) diff --git a/unittests/test_transformers.py b/unittests/test_transformers.py index 0571dbd3..5a1a9cfa 100644 --- a/unittests/test_transformers.py +++ b/unittests/test_transformers.py @@ -35,12 +35,13 @@ from unittest.mock import MagicMock, Mock, patch import linkahead as db import pytest import yaml -from pytest import raises - from caoscrawler.converters import Converter, ListElementConverter from caoscrawler.scanner 
import create_transformer_registry, scan_directory from caoscrawler.stores import GeneralStore -from caoscrawler.transformer_functions import replace, split +from caoscrawler.transformer_functions import (cast_to_bool, cast_to_float, + cast_to_int, cast_to_str, + replace, split) +from pytest import raises UNITTESTDIR = Path(__file__).parent @@ -163,3 +164,33 @@ def test_empty_functions_list(converter_registry): conv.apply_transformers(values, transformer_functions) assert values['b'] == "16_45" + + +def test_cast_transformer_functions(): + for val in ("True", "true", "False", "false"): + assert type(cast_to_bool(val, {})) == bool + if val[1] == "r": + assert cast_to_bool(val, {}) + else: + assert not cast_to_bool(val, {}) + for val_err in ("jaksdlfj", "0", 1): + with pytest.raises(ValueError): + cast_to_bool(val_err, {}) + assert not cast_to_bool(False, {}) + assert cast_to_bool(True, {}) + + assert cast_to_int("24", {}) == 24 + assert cast_to_int(24.0, {}) == 24 + assert cast_to_int(24, {}) == 24 + with pytest.raises(ValueError): + cast_to_int("24dsf", {}) + cast_to_int("24.0", {}) == 24 + + assert cast_to_float("24", {}) == 24.0 + assert cast_to_float("24.0", {}) == 24.0 + assert cast_to_float(24.0, {}) == 24.0 + assert cast_to_float(24, {}) == 24.0 + with pytest.raises(ValueError): + cast_to_float("24dsf", {}) + + assert cast_to_str(24, {}) == "24" -- GitLab From 581f68bd62d03a5164989b94ea7d8f7e3120a5e7 Mon Sep 17 00:00:00 2001 From: Alexander Schlemmer <a.schlemmer@indiscale.com> Date: Fri, 29 Nov 2024 09:11:58 +0100 Subject: [PATCH 046/131] DOC(validator): improved docstr of helper function --- src/caoscrawler/validator.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/src/caoscrawler/validator.py b/src/caoscrawler/validator.py index bb21a611..3051f4c4 100644 --- a/src/caoscrawler/validator.py +++ b/src/caoscrawler/validator.py @@ -67,7 +67,17 @@ def load_json_schema_from_datamodel_yaml(filename: str) -> list: def 
representer_ordereddict(dumper, data): - # yaml.add_representer(OrderedDict, caoscrawler.validator.representer_ordereddict) + """ + Helper function to be able to represent the converted json schema objects correctly as yaml. + This representer essentially replaced OrderedDict objects with simple dict objects. + + Since Python 3.7 dicts are ordered by default, see e.g.: https://softwaremaniacs.org/blog/2020/02/05/dicts-ordered/en/ + + Example how to use the representer: + ```python + yaml.add_representer(OrderedDict, caoscrawler.validator.representer_ordereddict) + ``` + """ return dumper.represent_data(dict(data)) -- GitLab From 17017e1a59d97a003e773fa01cb7b2d117ad26b4 Mon Sep 17 00:00:00 2001 From: Alexander Schlemmer <a.schlemmer@indiscale.com> Date: Fri, 29 Nov 2024 09:15:27 +0100 Subject: [PATCH 047/131] MAINT(validator): make apply_schema_patches function private --- src/caoscrawler/validator.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/src/caoscrawler/validator.py b/src/caoscrawler/validator.py index 3051f4c4..fdfed23f 100644 --- a/src/caoscrawler/validator.py +++ b/src/caoscrawler/validator.py @@ -71,7 +71,8 @@ def representer_ordereddict(dumper, data): Helper function to be able to represent the converted json schema objects correctly as yaml. This representer essentially replaced OrderedDict objects with simple dict objects. - Since Python 3.7 dicts are ordered by default, see e.g.: https://softwaremaniacs.org/blog/2020/02/05/dicts-ordered/en/ + Since Python 3.7 dicts are ordered by default, see e.g.: + https://softwaremaniacs.org/blog/2020/02/05/dicts-ordered/en/ Example how to use the representer: ```python @@ -81,7 +82,7 @@ def representer_ordereddict(dumper, data): return dumper.represent_data(dict(data)) -def apply_schema_patches(pobj: dict): +def _apply_schema_patches(pobj: dict): """ Changes applied: - properties are moved vom subitem "proeprties" to top-level. 
@@ -92,7 +93,7 @@ def apply_schema_patches(pobj: dict): return pobj for prop in pobj["properties"]: if isinstance(pobj["properties"][prop], dict): - pobj[prop] = apply_schema_patches(pobj["properties"][prop]) + pobj[prop] = _apply_schema_patches(pobj["properties"][prop]) else: pobj[prop] = pobj["properties"][prop] @@ -108,8 +109,8 @@ def convert_record(record: db.Record): """ Convert a record into a form suitable for validation with jsonschema. - Uses high_level_api.convert_to_python_object - Afterwards apply_schema_patches is called recursively to refactor the dictionary + Uses `high_level_api.convert_to_python_object` + Afterwards `_apply_schema_patches` is called recursively to refactor the dictionary to match the current form of the jsonschema. Arguments: -- GitLab From 55e6f39b6c6fe471543da7fb7f254366cd50c422 Mon Sep 17 00:00:00 2001 From: Alexander Schlemmer <a.schlemmer@indiscale.com> Date: Fri, 29 Nov 2024 09:23:47 +0100 Subject: [PATCH 048/131] API(validator): new validator behavior --- src/caoscrawler/validator.py | 39 +++++++++++++++++------------------- 1 file changed, 18 insertions(+), 21 deletions(-) diff --git a/src/caoscrawler/validator.py b/src/caoscrawler/validator.py index fdfed23f..67775163 100644 --- a/src/caoscrawler/validator.py +++ b/src/caoscrawler/validator.py @@ -41,7 +41,7 @@ from caoscrawler import scanner # from collections import OrderedDict -def load_json_schema_from_datamodel_yaml(filename: str) -> list: +def load_json_schema_from_datamodel_yaml(filename: str) -> dict[str, dict]: """ Load a data model yaml file (using caosadvancedtools) and convert all record types into a json schema using the json_schema_exporter module. @@ -53,15 +53,16 @@ def load_json_schema_from_datamodel_yaml(filename: str) -> list: Returns ------- - A list of json schema objects. + A dict of json schema objects. The keys are the record types for which the schemas + are generated. 
""" model = parse_model_from_yaml(filename) - rt_schemas = [] + rt_schemas = {} for el_key, el in model.items(): if isinstance(el, db.RecordType): - rt_schemas.append(recordtype_to_json_schema(el)) + rt_schemas[el_key] = recordtype_to_json_schema(el) return rt_schemas @@ -119,10 +120,10 @@ def convert_record(record: db.Record): The record that is supposed to be converted. """ pobj = convert_to_python_object(record).serialize() - return apply_schema_patches(pobj) + return _apply_schema_patches(pobj) -def validate(records: list[db.Record], schemas: list[dict]) -> list[tuple[bool, list]]: +def validate(records: list[db.Record], schemas: dict[str, dict]) -> list[tuple[bool, list]]: """ Validate a list of records against a list of possible JSON schemas. @@ -146,21 +147,17 @@ def validate(records: list[db.Record], schemas: list[dict]) -> list[tuple[bool, - Index 1: A list of schemas matching the record at this position of the list `records`. """ - # TODO: - # I think it makes sense to change the behavior as follows: - # - Only validate the schema that was generated for a specific record type that matches the parent - # record that is validated. - # - With this behavior for each record a single schema is matched, and if it does not match the - # validation error can be returned. 
- retval = [] for r in records: - matching_schemas = [] - for schema in schemas: - try: - jsonschema.validate(convert_record(r), schema) - matching_schemas.append(schema) - except ValidationError: - pass - retval.append((len(matching_schemas) > 0, matching_schemas)) + if len(r.parents) != 0: + raise RuntimeError( + "Schema validation is only supported if records have exactly one parent.") + if r.parents[0] not in schemas: + raise RuntimeError( + "No schema for record type {} in schema dictionary.".format(r.parents[0])) + try: + jsonschema.validate(convert_record(r), schemas[r.parents[0]]) + retval.append((True, None)) + except ValidationError as ex: + retval.append((False, ex)) return retval -- GitLab From 8edc1588b5360f375197e0159b3db1b0a0c7764e Mon Sep 17 00:00:00 2001 From: Alexander Schlemmer <a.schlemmer@indiscale.com> Date: Fri, 29 Nov 2024 09:29:42 +0100 Subject: [PATCH 049/131] TST(validator): unit tests for new validator behavior --- src/caoscrawler/validator.py | 9 +++++---- unittests/test_validation.py | 12 +++++++----- 2 files changed, 12 insertions(+), 9 deletions(-) diff --git a/src/caoscrawler/validator.py b/src/caoscrawler/validator.py index 67775163..3cd57cd5 100644 --- a/src/caoscrawler/validator.py +++ b/src/caoscrawler/validator.py @@ -149,14 +149,15 @@ def validate(records: list[db.Record], schemas: dict[str, dict]) -> list[tuple[b retval = [] for r in records: - if len(r.parents) != 0: + if len(r.parents) != 1: raise RuntimeError( "Schema validation is only supported if records have exactly one parent.") - if r.parents[0] not in schemas: + parname = r.parents[0].name + if parname not in schemas: raise RuntimeError( - "No schema for record type {} in schema dictionary.".format(r.parents[0])) + "No schema for record type {} in schema dictionary.".format(parname)) try: - jsonschema.validate(convert_record(r), schemas[r.parents[0]]) + jsonschema.validate(convert_record(r), schemas[parname]) retval.append((True, None)) except ValidationError as ex: 
retval.append((False, ex)) diff --git a/unittests/test_validation.py b/unittests/test_validation.py index 6db0674e..216b51fa 100644 --- a/unittests/test_validation.py +++ b/unittests/test_validation.py @@ -51,7 +51,8 @@ def test_create_json_schema(): pobj = convert_record(r) # print(yaml.dump(pobj)) # print(yaml.dump(json[0])) - jsonschema.validate(pobj, json[0]) + assert "Dataset" in json + jsonschema.validate(pobj, json["Dataset"]) # Failing test: r = db.Record() @@ -62,7 +63,7 @@ def test_create_json_schema(): pobj = convert_record(r) with pytest.raises(ValidationError, match=".*'keywords' is a required property.*"): - jsonschema.validate(pobj, json[0]) + jsonschema.validate(pobj, json["Dataset"]) def test_validation(): @@ -83,6 +84,7 @@ def test_validation(): valres = validate([r1, r2], json) assert valres[0][0] is True - assert len(valres[0][1]) == 1 - assert valres[0][1][0] == json[0] - assert len(valres[1][1]) == 0 + assert valres[0][1] is None + assert not valres[1][0] + assert isinstance(valres[1][1], ValidationError) + assert valres[1][1].message == "'keywords' is a required property" -- GitLab From a140962de1be1c7f876fa6ea71baafc7b5caf96d Mon Sep 17 00:00:00 2001 From: Alexander Schlemmer <a.schlemmer@indiscale.com> Date: Fri, 29 Nov 2024 09:31:47 +0100 Subject: [PATCH 050/131] MAINT(validator): cleaned up imports --- unittests/test_validation.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/unittests/test_validation.py b/unittests/test_validation.py index 216b51fa..a3215963 100644 --- a/unittests/test_validation.py +++ b/unittests/test_validation.py @@ -23,16 +23,12 @@ """ test validation """ -import importlib -import os from os.path import join from pathlib import Path -import caoscrawler import jsonschema import linkahead as db import pytest -import yaml from caoscrawler.validator import (convert_record, load_json_schema_from_datamodel_yaml, validate) -- GitLab From bd388c866b4eaf7c32f00da9ac6c7a2c84c588a0 Mon Sep 17 00:00:00 2001 From: 
Alexander Schlemmer <a.schlemmer@indiscale.com> Date: Fri, 29 Nov 2024 10:42:27 +0100 Subject: [PATCH 051/131] DOC(validator): doc updated to describe new behavior --- src/caoscrawler/validator.py | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/src/caoscrawler/validator.py b/src/caoscrawler/validator.py index 3cd57cd5..8e0efd94 100644 --- a/src/caoscrawler/validator.py +++ b/src/caoscrawler/validator.py @@ -123,12 +123,14 @@ def convert_record(record: db.Record): return _apply_schema_patches(pobj) -def validate(records: list[db.Record], schemas: dict[str, dict]) -> list[tuple[bool, list]]: +def validate(records: list[db.Record], schemas: dict[str, dict]) -> list[tuple]: """ - Validate a list of records against a list of possible JSON schemas. - - It is tried to validate each schema from the list of schemas. If none of them validates - without error, it is assumed that it does not match at all. + Validate a list of records against a dictionary of schemas. + The keys of the dictionary are record types and the corresponding values are json schemata + associated with that record type. The current implementation assumes that each record that is + checked has exactly one parent and raises an error if that is not the case. + The schema belonging to a record is identified using the name of the first (and only) parent + of the record. Arguments: ---------- @@ -136,15 +138,16 @@ def validate(records: list[db.Record], schemas: dict[str, dict]) -> list[tuple[b records: list[db.Record] List of records that will be validated. - schemas: list[dict] - A list of JSON schemas generated using `load_json_schema_from_datamodel_yaml`. + schemas: dict[str, dict] + A dictionary of JSON schemas generated using `load_json_schema_from_datamodel_yaml`. Returns: -------- A list of tuples, one element for each record: - - Index 0: A boolean that determines whether at least one schema matched for this record. 
- - Index 1: A list of schemas matching the record at this position of the list `records`. + - Index 0: A boolean that determines whether the schema belonging to the record type of the + record matched. + - Index 1: A validation error if the schema did not match or None otherwise. """ retval = [] -- GitLab From 716ec67e908c0b9b8adb99d0139b9656e99f77e6 Mon Sep 17 00:00:00 2001 From: Alexander Schlemmer <a.schlemmer@indiscale.com> Date: Fri, 29 Nov 2024 10:47:13 +0100 Subject: [PATCH 052/131] DOC: updated changelog --- CHANGELOG.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 16c12bb8..88ea70fb 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -19,6 +19,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - ZipFileConverter that opens zip files and exposes their contents as File and Directory structure elements. - `linkahead-crawler` script as alias for `caosdb-crawler`. +- New transformers of the form `cast_to_*` which allow casting variables to `int`, `float`, + `str` and `bool`. 
### Changed ### -- GitLab From 3ceac0b55fae0138bc87abfaec3d8c823ad23b64 Mon Sep 17 00:00:00 2001 From: Alexander Schlemmer <a.schlemmer@indiscale.com> Date: Fri, 29 Nov 2024 11:11:09 +0100 Subject: [PATCH 053/131] TST(transformers): unit test for transformer to check if variable replacement in parameters works --- unittests/test_transformers.py | 29 +++++++++++++++++++++++------ 1 file changed, 23 insertions(+), 6 deletions(-) diff --git a/unittests/test_transformers.py b/unittests/test_transformers.py index 0571dbd3..301aab50 100644 --- a/unittests/test_transformers.py +++ b/unittests/test_transformers.py @@ -30,17 +30,14 @@ See: https://gitlab.indiscale.com/caosdb/src/caosdb-crawler/-/issues/107 import importlib from pathlib import Path -from unittest.mock import MagicMock, Mock, patch +from unittest.mock import Mock -import linkahead as db import pytest -import yaml -from pytest import raises - from caoscrawler.converters import Converter, ListElementConverter from caoscrawler.scanner import create_transformer_registry, scan_directory from caoscrawler.stores import GeneralStore -from caoscrawler.transformer_functions import replace, split +from caoscrawler.transformer_functions import replace +from pytest import raises UNITTESTDIR = Path(__file__).parent @@ -163,3 +160,23 @@ def test_empty_functions_list(converter_registry): conv.apply_transformers(values, transformer_functions) assert values['b'] == "16_45" + + +def test_replace_variables(): + vals = GeneralStore() + vals["test"] = "with" + vals["a"] = "str_without_replacement" + conv = Mock() + conv.definition = {} + conv.definition["transform"] = { + "test": { + "in": "$a", + "out": "$a", + "functions": [ + {"replace": { + "remove": "without", + "insert": "$test" + }} + ]}} + Converter.apply_transformers(conv, vals, {"replace": replace}) + assert vals["a"] == "str_with_replacement" -- GitLab From 0eb8eb5e75404f39fb11a938addd839c78a16581 Mon Sep 17 00:00:00 2001 From: Florian Spreckelsen 
<f.spreckelsen@indiscale.com> Date: Fri, 29 Nov 2024 13:36:08 +0100 Subject: [PATCH 054/131] MAINT: Remove unused imports --- src/caoscrawler/validator.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/caoscrawler/validator.py b/src/caoscrawler/validator.py index 8e0efd94..4ed639df 100644 --- a/src/caoscrawler/validator.py +++ b/src/caoscrawler/validator.py @@ -36,10 +36,6 @@ from caosadvancedtools.models.parser import parse_model_from_yaml from jsonschema import ValidationError from linkahead.high_level_api import convert_to_python_object -from caoscrawler import scanner - -# from collections import OrderedDict - def load_json_schema_from_datamodel_yaml(filename: str) -> dict[str, dict]: """ -- GitLab From 10b4c47e669bcf7c9c890c5bd951caa73087f1fb Mon Sep 17 00:00:00 2001 From: Florian Spreckelsen <f.spreckelsen@indiscale.com> Date: Fri, 29 Nov 2024 14:29:54 +0100 Subject: [PATCH 055/131] DOC: Update changelog --- CHANGELOG.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index c18b9eb5..7c50fb1b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -17,6 +17,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - ZipFileConverter that opens zip files and exposes their contents as File and Directory structure elements. - `linkahead-crawler` script as alias for `caosdb-crawler`. +- Transformer function definietion in the cfood support variable + substitutions now. 
### Changed ### -- GitLab From 05d9cdf01446f1e100eb874cb0bd2c0710a048b7 Mon Sep 17 00:00:00 2001 From: Florian Spreckelsen <f.spreckelsen@indiscale.com> Date: Fri, 29 Nov 2024 14:30:02 +0100 Subject: [PATCH 056/131] DOC: Extend documentation for variable replacement in transformers --- src/doc/converters/transform_functions.rst | 29 ++++++++++++++++++++-- 1 file changed, 27 insertions(+), 2 deletions(-) diff --git a/src/doc/converters/transform_functions.rst b/src/doc/converters/transform_functions.rst index 22df35c8..ecd47d2d 100644 --- a/src/doc/converters/transform_functions.rst +++ b/src/doc/converters/transform_functions.rst @@ -38,8 +38,33 @@ An example that splits the variable ``a`` and puts the generated list in ``b`` i Report: tags: $b -This splits the string in '$a' and stores the resulting list in '$b'. This is here used to add a -list valued property to the Report Record. +This splits the string in '$a' and stores the resulting list in +'$b'. This is here used to add a list valued property to the Report +Record. Note that from LinkAhead Crawler 0.11.0 onwards, the value of +``marker`` in the above example can also be read in from a variable in +the usual ``$`` notation: + +.. code-block:: yaml + + # ... variable ``separator`` is defined somewhere above this part, e.g., + # by reading a config file. + Experiment: + type: Dict + match: ".*" + transform: + param_split: + in: $a + out: $b + functions: + - split: + marker: $separator # Now the separator is read in from a + # variable, so we can, e.g., change from + # '|' to ';' without changing the cfood + # definition. + records: + Report: + tags: $b + There are a number of transform functions that are defined by default (see -- GitLab From 89efe5faf5e29a7967c524d686ce5618ad56d0dc Mon Sep 17 00:00:00 2001 From: Florian Spreckelsen <f.spreckelsen@indiscale.com> Date: Fri, 29 Nov 2024 15:48:32 +0100 Subject: [PATCH 057/131] TST(transformers): Extend typecast unittests a little. 
--- unittests/test_transformers.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/unittests/test_transformers.py b/unittests/test_transformers.py index 6b1c4655..a2d227ad 100644 --- a/unittests/test_transformers.py +++ b/unittests/test_transformers.py @@ -168,20 +168,22 @@ def test_cast_transformer_functions(): for val in ("True", "true", "False", "false"): assert type(cast_to_bool(val, {})) == bool if val[1] == "r": - assert cast_to_bool(val, {}) + assert cast_to_bool(val, {}) is True else: - assert not cast_to_bool(val, {}) + assert cast_to_bool(val, {}) is False for val_err in ("jaksdlfj", "0", 1): with pytest.raises(ValueError): cast_to_bool(val_err, {}) - assert not cast_to_bool(False, {}) - assert cast_to_bool(True, {}) + assert cast_to_bool(False, {}) is False + assert cast_to_bool(True, {}) is True assert cast_to_int("24", {}) == 24 assert cast_to_int(24.0, {}) == 24 assert cast_to_int(24, {}) == 24 + assert cast_to_int("-24", {}) == -24 with pytest.raises(ValueError): cast_to_int("24dsf", {}) + with pytest.raises(ValueError): cast_to_int("24.0", {}) == 24 assert cast_to_float("24", {}) == 24.0 -- GitLab From 4c70a0a62b2580bd38a4d848b8ff417566fcc717 Mon Sep 17 00:00:00 2001 From: Alexander Schlemmer <a.schlemmer@indiscale.com> Date: Mon, 2 Dec 2024 10:47:17 +0100 Subject: [PATCH 058/131] ENH(validator): use NotImplementedError instead of RuntimeError --- src/caoscrawler/validator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/caoscrawler/validator.py b/src/caoscrawler/validator.py index 4ed639df..33e29b02 100644 --- a/src/caoscrawler/validator.py +++ b/src/caoscrawler/validator.py @@ -149,7 +149,7 @@ def validate(records: list[db.Record], schemas: dict[str, dict]) -> list[tuple]: retval = [] for r in records: if len(r.parents) != 1: - raise RuntimeError( + raise NotImplementedError( "Schema validation is only supported if records have exactly one parent.") parname = r.parents[0].name if parname not in 
schemas: -- GitLab From a72b0fc06347c55f2b010faef262689dfb18d29c Mon Sep 17 00:00:00 2001 From: Florian Spreckelsen <f.spreckelsen@indiscale.com> Date: Mon, 2 Dec 2024 17:12:31 +0100 Subject: [PATCH 059/131] ENH: Add option to only match directories with contents newer than a reference file --- src/caoscrawler/converters/converters.py | 48 ++++++++++++++++++ unittests/test_converters.py | 62 +++++++++++++++++++++++- 2 files changed, 108 insertions(+), 2 deletions(-) diff --git a/src/caoscrawler/converters/converters.py b/src/caoscrawler/converters/converters.py index d06415f7..da752f68 100644 --- a/src/caoscrawler/converters/converters.py +++ b/src/caoscrawler/converters/converters.py @@ -769,6 +769,11 @@ class DirectoryConverter(Converter): m = re.match(self.definition["match"], element.name) if m is None: return None + if "match_newer_than_file" in self.definition: + last_modified = self._get_most_recent_change_in_dir(element) + reference = self._get_reference_file_timestamp() + if last_modified < reference: + return None return m.groupdict() @staticmethod @@ -791,6 +796,49 @@ class DirectoryConverter(Converter): return children + @staticmethod + def _get_most_recent_change_in_dir(element: Directory) -> datetime.datetime: + """Return the datetime of the most recent change of any file + or directory in the given Directory element. + + """ + most_recent = os.path.getmtime(element.path) + + for root, _, files in os.walk(element.path): + mtimes = [os.path.getmtime(root)] + \ + [os.path.getmtime(os.path.join(root, fname)) for fname in files] + if max(mtimes) > most_recent: + most_recent = max(mtimes) + + return datetime.datetime.fromtimestamp(most_recent) + + def _get_reference_file_timestamp(self) -> datetime.datetime: + """Return a time stamp read from a reference file if it + exists. Otherwise return datetime.datetime.min, i.e., the + earliest datetime known to datetime. 
+ + """ + + if "match_newer_than_file" not in self.definition: + logger.debug("No reference file specified.") + return datetime.datetime.min + + elif not os.path.isfile(self.definition["match_newer_than_file"]): + logger.debug("Reference file doesn't exist.") + return datetime.datetime.min + + with open(self.definition["match_newer_than_file"]) as ref_file: + stamp_str = ref_file.readline().strip() + try: + return datetime.datetime.fromisoformat(stamp_str) + except ValueError: + logger.warn( + f"Reference file in {self.definition['match_newer_than_file']} " + "doesn't contain a ISO formatted datetime in its first line. " + "Match regardless of modification times." + ) + return datetime.datetime.min + class SimpleFileConverter(Converter): """Just a file, ignore the contents.""" diff --git a/unittests/test_converters.py b/unittests/test_converters.py index 12285e46..7b22aa84 100644 --- a/unittests/test_converters.py +++ b/unittests/test_converters.py @@ -29,12 +29,15 @@ import importlib import json import logging import os +import pytest +import yaml + from itertools import product from pathlib import Path +from tempfile import NamedTemporaryFile import linkahead as db -import pytest -import yaml + from caoscrawler.converters import (Converter, ConverterValidationError, DateElementConverter, DictElementConverter, DictIntegerElementConverter, @@ -1070,3 +1073,58 @@ def test_dict_match_properties(converter_registry): "prop_d": 24 # duplicate matches }) records = scan_structure_elements(root_dict_element, def_dict, converter_registry) + + +def test_directory_converter_change_date(caplog, converter_registry): + """Test that only directories that were modified after a certain + date are crawled. 
+ + """ + test_dir_element = Directory("test_directories", UNITTESTDIR / "test_directories") + date_of_dir_change = DirectoryConverter._get_most_recent_change_in_dir(test_dir_element) + past_date = date_of_dir_change - datetime.timedelta(days=1) + future_date = date_of_dir_change + datetime.timedelta(days=1) + + tmpfi = NamedTemporaryFile(delete=False) + + # Write down past + with open(tmpfi.name, "w") as fi: + fi.write(f"{past_date.isoformat()}\n") + + converter_def = { + "type": "Directory", + "match": "^test_directories$", + "match_newer_than_file": tmpfi.name + } + dc = DirectoryConverter(name="DC1", definition=converter_def, + converter_registry=converter_registry) + assert dc.match(test_dir_element) is not None + + # Write down future, so nothing should match + with open(tmpfi.name, "w") as fi: + fi.write(f"{future_date.isoformat()}\n") + assert dc.match(test_dir_element) is None + + # Also match in the corner case of equality: + with open(tmpfi.name, "w") as fi: + fi.write(f"{date_of_dir_change.isoformat()}\n") + assert dc.match(test_dir_element) is not None + + # Match but warn + with open(tmpfi.name, "w") as fi: + fi.write(f"This is garbage.\n") + assert dc.match(test_dir_element) is not None + assert len(caplog.record_tuples) == 1 + assert caplog.record_tuples[0][1] == logging.WARNING + assert tmpfi.name in caplog.record_tuples[0][2] + assert "doesn't contain a ISO formatted datetime in its first line" in caplog.record_tuples[0][2] + + # Match anything since file doesn't exist, inform in debug log. + os.remove(tmpfi.name) + # Clear log and enforce debug level. + caplog.clear() + caplog.set_level(logging.DEBUG) + assert dc.match(test_dir_element) is not None + assert len(caplog.record_tuples) == 1 + assert caplog.record_tuples[0][1] == logging.DEBUG + assert "Reference file doesn't exist." 
== caplog.record_tuples[0][2] -- GitLab From 1b9ec761bfc84a4c75c55ce53b05d8c297ec4338 Mon Sep 17 00:00:00 2001 From: Florian Spreckelsen <f.spreckelsen@indiscale.com> Date: Mon, 2 Dec 2024 17:22:01 +0100 Subject: [PATCH 060/131] DOC: Update changelog and explain `match_newer_than_file` --- CHANGELOG.md | 4 ++++ src/doc/converters/standard_converters.rst | 14 +++++++++++--- 2 files changed, 15 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 4fe40138..dd57c157 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -23,6 +23,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 variables to `int`, `float`, `str` and `bool`. - Transformer function definition in the cfood support variable substitutions now. +- `match_newer_than_file` option for `DirectoryConverter`: A reference + file containing (only) an ISO-formatted datetime string can be + specified here. Directories with this option won't match if all + their contents were last modified before that datetime. ### Changed ### diff --git a/src/doc/converters/standard_converters.rst b/src/doc/converters/standard_converters.rst index f7f18794..5f86abb5 100644 --- a/src/doc/converters/standard_converters.rst +++ b/src/doc/converters/standard_converters.rst @@ -6,9 +6,17 @@ These are the standard converters that exist in a default installation. For wri Directory Converter =================== -The Directory Converter creates StructureElements for each File and Directory -inside the current Directory. You can match a regular expression against the -directory name using the 'match' key. + +The Directory Converter creates StructureElements for each File and +Directory inside the current Directory. You can match a regular +expression against the directory name using the 'match' key. + +With the optional ``match_newer_than_file`` key, a path to file +containing only an ISO-formatted datetime string can be specified. 
If +this is done, a directory will only match if it contains at least one +file or directory that has been modified since that datetime. If the +file doesn't exist or contains an invalid string, the directory will +be matched regardless of the modification times. Simple File Converter ===================== -- GitLab From cf08c07ab89f1032b2aae8743f9251d2b4d9f0ce Mon Sep 17 00:00:00 2001 From: Florian Spreckelsen <f.spreckelsen@indiscale.com> Date: Mon, 2 Dec 2024 17:39:49 +0100 Subject: [PATCH 061/131] MAINT: Add match_newer_than_file to cfood schema --- src/caoscrawler/cfood-schema.yml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/caoscrawler/cfood-schema.yml b/src/caoscrawler/cfood-schema.yml index c5e0eaad..d2e4cea2 100644 --- a/src/caoscrawler/cfood-schema.yml +++ b/src/caoscrawler/cfood-schema.yml @@ -88,6 +88,12 @@ cfood: match_value: description: a regexp that is matched to the value of a key-value pair type: string + match_newer_than_file: + description: | + Only relevant for Directory. A path to a file containing + an ISO-formatted datetime. Only match if the contents of the + Directory have been modified after that datetime. + type: string record_from_dict: description: Only relevant for PropertiesFromDictElement. Specify the root record which is generated from the contained dictionary. 
type: object -- GitLab From 34dc48a5e783eed17bb9db2a386241d532d15de9 Mon Sep 17 00:00:00 2001 From: Joscha Schmiedt <joscha@schmiedt.dev> Date: Mon, 2 Dec 2024 21:40:23 +0100 Subject: [PATCH 062/131] TST: Make file paths Windows-compatible --- unittests/test_crawler.py | 6 +++--- unittests/test_scanner.py | 8 ++++---- unittests/test_utilities.py | 20 ++++++++++---------- 3 files changed, 17 insertions(+), 17 deletions(-) diff --git a/unittests/test_crawler.py b/unittests/test_crawler.py index e88ce454..ad69c6f5 100644 --- a/unittests/test_crawler.py +++ b/unittests/test_crawler.py @@ -824,9 +824,9 @@ def test_restricted_path(create_mock): def test_split_restricted_path(): - assert ["el"] == split_restricted_path("/el") - assert ["el"] == split_restricted_path("/el/") - assert ["el", "el"] == split_restricted_path("/el/el") + assert ["el"] == split_restricted_path(os.path.sep + "el") + assert ["el"] == split_restricted_path(os.path.sep + "el" + os.path.sep) + assert ["el", "el"] == split_restricted_path(os.path.sep + "el" + os.path.sep + "el") # Filter the warning because we want to have it here and this way it does not hinder running diff --git a/unittests/test_scanner.py b/unittests/test_scanner.py index 5cbbc634..60ab2591 100644 --- a/unittests/test_scanner.py +++ b/unittests/test_scanner.py @@ -30,7 +30,7 @@ from functools import partial from pathlib import Path from tempfile import NamedTemporaryFile from unittest.mock import MagicMock, Mock, patch - +import os import linkahead as db import pytest import yaml @@ -110,7 +110,7 @@ def test_record_structure_generation(): assert len(subc[1]) == 0 # The data analysis node creates one variable for the node itself: - assert subd[0]["DataAnalysis"] == "examples_article/DataAnalysis" + assert subd[0]["DataAnalysis"] == os.path.join("examples_article", "DataAnalysis") assert subc[0]["DataAnalysis"] is False subd = dbt.debug_tree[dircheckstr("DataAnalysis", "2020_climate-model-predict")] @@ -128,9 +128,9 @@ def 
test_record_structure_generation(): assert subd[0]["identifier"] == "climate-model-predict" assert subd[0]["Project"].__class__ == db.Record - assert subd[0]["DataAnalysis"] == "examples_article/DataAnalysis" + assert subd[0]["DataAnalysis"] == os.path.join("examples_article" , "DataAnalysis") assert subc[0]["DataAnalysis"] is True - assert subd[0]["project_dir"] == "examples_article/DataAnalysis/2020_climate-model-predict" + assert subd[0]["project_dir"] == os.path.join("examples_article", "DataAnalysis", "2020_climate-model-predict") assert subc[0]["project_dir"] is False # Check the copy flags for the first level in the hierarchy: diff --git a/unittests/test_utilities.py b/unittests/test_utilities.py index 463e304a..da1245b3 100644 --- a/unittests/test_utilities.py +++ b/unittests/test_utilities.py @@ -20,22 +20,22 @@ # import pytest - +from os.path import sep from caoscrawler.crawl import split_restricted_path from caoscrawler.utils import MissingImport, get_shared_resource_link def test_split_restricted_path(): assert split_restricted_path("") == [] - assert split_restricted_path("/") == [] - assert split_restricted_path("test/") == ["test"] - assert split_restricted_path("/test/") == ["test"] - assert split_restricted_path("test/bla") == ["test", "bla"] - assert split_restricted_path("/test/bla") == ["test", "bla"] - assert split_restricted_path("/test1/test2/bla") == ["test1", "test2", "bla"] - assert split_restricted_path("/test//bla") == ["test", "bla"] - assert split_restricted_path("//test/bla") == ["test", "bla"] - assert split_restricted_path("///test//bla////") == ["test", "bla"] + assert split_restricted_path(f"{sep}") == [] + assert split_restricted_path(f"test{sep}") == ["test"] + assert split_restricted_path(f"{sep}test{sep}") == ["test"] + assert split_restricted_path(f"test{sep}bla") == ["test", "bla"] + assert split_restricted_path(f"{sep}test{sep}bla") == ["test", "bla"] + assert split_restricted_path(f"{sep}test1{sep}test2{sep}bla") == 
["test1", "test2", "bla"] + assert split_restricted_path(f"{sep}test{sep}{sep}bla") == ["test", "bla"] + assert split_restricted_path(f"{sep}{sep}test{sep}bla") == ["test", "bla"] + assert split_restricted_path(f"{sep}{sep}{sep}test{sep}{sep}bla{sep}{sep}{sep}{sep}") == ["test", "bla"] def test_dummy_class(): -- GitLab From 9bad8e2c73b7a75947e231b0f9c85fc31687ccbc Mon Sep 17 00:00:00 2001 From: Joscha Schmiedt <joscha@schmiedt.dev> Date: Mon, 2 Dec 2024 21:41:04 +0100 Subject: [PATCH 063/131] TST: Make NamedTemporaryFiles Windows-compatible Don't open files twice (add delete=False, which causes file to not be opened) --- unittests/test_cfood_metadata.py | 4 ++-- unittests/test_macros.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/unittests/test_cfood_metadata.py b/unittests/test_cfood_metadata.py index c606a0a1..b123f985 100644 --- a/unittests/test_cfood_metadata.py +++ b/unittests/test_cfood_metadata.py @@ -18,7 +18,7 @@ # along with this program. If not, see <https://www.gnu.org/licenses/>. # from tempfile import NamedTemporaryFile -from unittest.mock import MagicMock, Mock, patch +from unittest.mock import patch import pytest import yaml @@ -33,7 +33,7 @@ def _temp_file_load(txt: str): definition using load_definition from Crawler. """ definition = None - with NamedTemporaryFile() as f: + with NamedTemporaryFile(delete=False) as f: f.write(txt.encode()) f.flush() definition = load_definition(f.name) diff --git a/unittests/test_macros.py b/unittests/test_macros.py index a87b633e..03fe0e66 100644 --- a/unittests/test_macros.py +++ b/unittests/test_macros.py @@ -50,10 +50,10 @@ def _temp_file_load(txt: str): definition using load_definition from Crawler. 
""" definition = None - with NamedTemporaryFile() as f: + with NamedTemporaryFile(delete=False) as f: f.write(txt.encode()) f.flush() - definition = load_definition(f.name) + definition = load_definition(f.name) return definition -- GitLab From 444f98310092c168ddd34c81d059c9274487d8f5 Mon Sep 17 00:00:00 2001 From: Florian Spreckelsen <f.spreckelsen@indiscale.com> Date: Wed, 4 Dec 2024 14:11:43 +0100 Subject: [PATCH 064/131] MAINT: Reduce code duplication in error handling --- src/caoscrawler/crawl.py | 52 +++++++++++++++------------------------- 1 file changed, 19 insertions(+), 33 deletions(-) diff --git a/src/caoscrawler/crawl.py b/src/caoscrawler/crawl.py index a79e4434..8ca84502 100644 --- a/src/caoscrawler/crawl.py +++ b/src/caoscrawler/crawl.py @@ -1115,42 +1115,28 @@ def crawler_main(crawled_directory_path: str, crawler.run_id) _update_status_record(crawler.run_id, len(inserts), len(updates), status="OK") return 0 - except ForbiddenTransaction as err: - logger.debug(traceback.format_exc()) - logger.error(err) - _update_status_record(crawler.run_id, 0, 0, status="FAILED") - return 1 - except ConverterValidationError as err: - logger.debug(traceback.format_exc()) - logger.error(err) - _update_status_record(crawler.run_id, 0, 0, status="FAILED") - return 1 - except ImpossibleMergeError as err: - logger.debug(traceback.format_exc()) - logger.error( - "Encountered conflicting information when creating Records from the crawled " - f"data:\n\n{err}" - ) - _update_status_record(crawler.run_id, 0, 0, status="FAILED") - return 1 - except TransactionError as err: - logger.debug(traceback.format_exc()) - logger.error(err) - logger.error("Transaction error details:") - for suberr in err.errors: - logger.error("---") - logger.error(suberr.msg) - logger.error(suberr.entity) - return 1 except Exception as err: logger.debug(traceback.format_exc()) logger.error(err) - - if "SHARED_DIR" in os.environ: - # pylint: disable=E0601 - domain = get_config_setting("public_host_url") 
- logger.error("Unexpected Error: Please tell your administrator about this and provide " - f"the following path.\n{get_shared_resource_link(domain, debuglog_public)}") + # Special treatment for known error types + if isinstance(err, ImpossibleMergeError): + logger.error( + "Encountered conflicting information when creating Records from the crawled " + f"data:\n\n{err}" + ) + elif isinstance(err, TransactionError): + logger.error("Transaction error details:") + for suberr in err.errors: + logger.error("---") + logger.error(suberr.msg) + logger.error(suberr.entity) + # Unkown errors get a special message + elif not isinstance(err, (ConverterValidationError, ForbiddenTransaction)): + if "SHARED_DIR" in os.environ: + # pylint: disable=E0601 + domain = get_config_setting("public_host_url") + logger.error("Unexpected Error: Please tell your administrator about this and provide " + f"the following path.\n{get_shared_resource_link(domain, debuglog_public)}") _update_status_record(crawler.run_id, 0, 0, status="FAILED") return 1 -- GitLab From 0cd3c0c17221791adf22493204c83e8bd42142a7 Mon Sep 17 00:00:00 2001 From: Florian Spreckelsen <f.spreckelsen@indiscale.com> Date: Thu, 5 Dec 2024 17:26:04 +0100 Subject: [PATCH 065/131] ENH: Support list of directories for crawler_main and scan_directory --- src/caoscrawler/crawl.py | 20 ++++++++++---- src/caoscrawler/scanner.py | 55 +++++++++++++++++++++----------------- 2 files changed, 46 insertions(+), 29 deletions(-) diff --git a/src/caoscrawler/crawl.py b/src/caoscrawler/crawl.py index 8ca84502..fd0beaa2 100644 --- a/src/caoscrawler/crawl.py +++ b/src/caoscrawler/crawl.py @@ -621,7 +621,7 @@ one with the entities that need to be updated and the other with entities to be crawled_data: Optional[list[db.Record]] = None, no_insert_RTs: Optional[list[str]] = None, no_update_RTs: Optional[list[str]] = None, - path_for_authorized_run: Optional[str] = "", + path_for_authorized_run: Optional[Union[str, list[str]]] = "", ): """ This 
function applies several stages: @@ -643,7 +643,7 @@ one with the entities that need to be updated and the other with entities to be no_update_RTs : list[str], optional list of RecordType names. Records that have one of those RecordTypes as parent will not be updated - path_for_authorized_run : str, optional + path_for_authorized_run : str or list[str], optional only used if there are changes that need authorization before being applied. The form for rerunning the crawler with the authorization of these changes will be generated with this path. See @@ -718,11 +718,21 @@ one with the entities that need to be updated and the other with entities to be update_cache = UpdateCache() pending_inserts = update_cache.get_inserts(self.run_id) if pending_inserts: + if isinstance(path_for_authorized_run, list): + raise NotImplementedError( + "Authorization of inserts is currently implemented only for single paths, " + "not for lists of paths." + ) Crawler.inform_about_pending_changes( pending_inserts, self.run_id, path_for_authorized_run) pending_updates = update_cache.get_updates(self.run_id) if pending_updates: + if isinstance(path_for_authorized_run, list): + raise NotImplementedError( + "Authorization of updates is currently implemented only for single paths, " + "not for lists of paths." 
+ ) Crawler.inform_about_pending_changes( pending_updates, self.run_id, path_for_authorized_run) @@ -1004,7 +1014,7 @@ def _store_dry_run_data(ins, upd): "update": updates})) -def crawler_main(crawled_directory_path: str, +def crawler_main(crawled_directory_path: Union[str, list[str]], cfood_file_name: str, identifiables_definition_file: Optional[str] = None, debug: bool = False, @@ -1022,8 +1032,8 @@ def crawler_main(crawled_directory_path: str, Parameters ---------- - crawled_directory_path : str - path to be crawled + crawled_directory_path : str or list[str] + path(s) to be crawled cfood_file_name : str filename of the cfood to be used identifiables_definition_file : str diff --git a/src/caoscrawler/scanner.py b/src/caoscrawler/scanner.py index 89bd1c04..6b4d7a12 100644 --- a/src/caoscrawler/scanner.py +++ b/src/caoscrawler/scanner.py @@ -421,7 +421,7 @@ def scanner(items: list[StructureElement], # -------------------------------------------------------------------------------- -def scan_directory(dirname: str, crawler_definition_path: str, +def scan_directory(dirname: Union[str, list[str]], crawler_definition_path: str, restricted_path: Optional[list[str]] = None, debug_tree: Optional[DebugTree] = None): """ Crawl a single directory. @@ -434,10 +434,12 @@ def scan_directory(dirname: str, crawler_definition_path: str, Parameters ---------- + dirname: str or list[str] + directory or list of directories to be scanned restricted_path: optional, list of strings - Traverse the data tree only along the given path. When the end of the given path - is reached, traverse the full tree as normal. See docstring of 'scanner' for - more details. + Traverse the data tree only along the given path. When the end + of the given path is reached, traverse the full tree as + normal. See docstring of 'scanner' for more details. 
Returns ------- @@ -455,26 +457,31 @@ def scan_directory(dirname: str, crawler_definition_path: str, if not dirname: raise ValueError( "You have to provide a non-empty path for crawling.") - dir_structure_name = os.path.basename(dirname) - - # TODO: needs to be covered somewhere else - crawled_directory = dirname - if not dir_structure_name and dirname.endswith('/'): - if dirname == '/': - # Crawling the entire file system - dir_structure_name = "root" - else: - # dirname had a trailing '/' - dir_structure_name = os.path.basename(dirname[:-1]) - - return scan_structure_elements(Directory(dir_structure_name, - dirname), - crawler_definition, - converter_registry, - restricted_path=restricted_path, - debug_tree=debug_tree, - registered_transformer_functions=registered_transformer_functions - ) + if not isinstance(dirname, list): + dirname = [dirname] + dir_element_list = [] + for dname in dirname: + dir_structure_name = os.path.basename(dname) + + # TODO: needs to be covered somewhere else + crawled_directory = dname + if not dir_structure_name and dname.endswith('/'): + if dname == '/': + # Crawling the entire file system + dir_structure_name = "root" + else: + # dirname had a trailing '/' + dir_structure_name = os.path.basename(dname[:-1]) + dir_element_list.append(Directory(dir_structure_name, dname)) + + return scan_structure_elements( + dir_element_list, + crawler_definition, + converter_registry, + restricted_path=restricted_path, + debug_tree=debug_tree, + registered_transformer_functions=registered_transformer_functions + ) def scan_structure_elements(items: Union[list[StructureElement], StructureElement], -- GitLab From f0a56809211896b0387ffd0b656d5668b85d6466 Mon Sep 17 00:00:00 2001 From: Florian Spreckelsen <f.spreckelsen@indiscale.com> Date: Thu, 5 Dec 2024 18:39:21 +0100 Subject: [PATCH 066/131] TST: Add integration test for crawler_main with list of dirs --- integrationtests/test_crawler_main.py | 91 +++++++++++++++++++ 
.../crawler_main_with_list_of_dirs/cfood.yml | 10 ++ .../dir1/.gitkeep | 0 .../dir2/.gitkeep | 0 .../identifiable.yml | 2 + integrationtests/test_issues.py | 2 +- 6 files changed, 104 insertions(+), 1 deletion(-) create mode 100644 integrationtests/test_crawler_main.py create mode 100644 integrationtests/test_data/crawler_main_with_list_of_dirs/cfood.yml create mode 100644 integrationtests/test_data/crawler_main_with_list_of_dirs/dir1/.gitkeep create mode 100644 integrationtests/test_data/crawler_main_with_list_of_dirs/dir2/.gitkeep create mode 100644 integrationtests/test_data/crawler_main_with_list_of_dirs/identifiable.yml diff --git a/integrationtests/test_crawler_main.py b/integrationtests/test_crawler_main.py new file mode 100644 index 00000000..3c0ec57e --- /dev/null +++ b/integrationtests/test_crawler_main.py @@ -0,0 +1,91 @@ +# This file is a part of the LinkAhead Project. +# +# Copyright (C) 2024 Indiscale GmbH <info@indiscale.com> +# 2024 Florian Spreckelsen <f.spreckelsen@indiscale.com> +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <https://www.gnu.org/licenses/>. 
+# + +import logging + +from pathlib import Path + +import linkahead as db + +from caoscrawler import crawl +from caoscrawler.crawl import (crawler_main, SecurityMode) +from linkahead.utils.register_tests import clear_database, set_test_key + +set_test_key("10b128cf8a1372f30aa3697466bb55e76974e0c16a599bb44ace88f19c8f61e2") + +INTTESTDIR = Path(__file__).parent + + +def test_list_of_paths(clear_database, monkeypatch): + + # Mock the status record + dummy_status = { + "n_calls": 0 + } + + def _mock_update_status_record(run_id, n_inserts, n_updates, status): + print("Update mocked status") + dummy_status["run_id"] = run_id + dummy_status["n_inserts"] = n_inserts + dummy_status["n_updates"] = n_updates + dummy_status["status"] = status + dummy_status["n_calls"] += 1 + monkeypatch.setattr(crawl, "_update_status_record", _mock_update_status_record) + + # mock SSS environment + monkeypatch.setenv("SHARED_DIR", "/tmp") + + # We need only one dummy RT + rt = db.RecordType(name="TestType").insert() + basepath = INTTESTDIR / "test_data" / "crawler_main_with_list_of_dirs" + dirlist = [basepath / "dir1", basepath / "dir2"] + crawler_main( + dirlist, + cfood_file_name=basepath / "cfood.yml", + identifiables_definition_file=basepath / "identifiable.yml" + ) + recs = db.execute_query("FIND TestType") + assert len(recs) == 2 + assert "Test1" in [r.name for r in recs] + assert "Test2" in [r.name for r in recs] + + assert dummy_status["n_inserts"] == 2 + assert dummy_status["n_updates"] == 0 + assert dummy_status["status"] == "OK" + assert dummy_status["n_calls"] == 1 + + +def test_not_implemented_list_with_authorization(caplog, clear_database): + + rt = db.RecordType(name="TestType").insert() + basepath = INTTESTDIR / "test_data" / "crawler_main_with_list_of_dirs" + dirlist = [basepath / "dir1", basepath / "dir2"] + + # This is not implemented yet, so check log for correct error. 
+ crawler_main( + dirlist, + cfood_file_name=basepath / "cfood.yml", + identifiables_definition_file=basepath / "identifiable.yml", + securityMode=SecurityMode.RETRIEVE + ) + err_tuples = [t for t in caplog.record_tuples if t[1] == logging.ERROR] + assert len(err_tuples) == 1 + assert "currently implemented only for single paths, not for lists of paths" in err_tuples[0][2] + # No inserts after the errors + assert len(db.execute_query("FIND TestType")) == 0 diff --git a/integrationtests/test_data/crawler_main_with_list_of_dirs/cfood.yml b/integrationtests/test_data/crawler_main_with_list_of_dirs/cfood.yml new file mode 100644 index 00000000..c7f22ce0 --- /dev/null +++ b/integrationtests/test_data/crawler_main_with_list_of_dirs/cfood.yml @@ -0,0 +1,10 @@ +--- +metadata: + crawler-version: 0.10.2 +--- +BaseDirElement: + type: Directory + match: ^dir(?P<dir_number>[0-9]+)$$ + records: + TestType: + name: Test$dir_number diff --git a/integrationtests/test_data/crawler_main_with_list_of_dirs/dir1/.gitkeep b/integrationtests/test_data/crawler_main_with_list_of_dirs/dir1/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/integrationtests/test_data/crawler_main_with_list_of_dirs/dir2/.gitkeep b/integrationtests/test_data/crawler_main_with_list_of_dirs/dir2/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/integrationtests/test_data/crawler_main_with_list_of_dirs/identifiable.yml b/integrationtests/test_data/crawler_main_with_list_of_dirs/identifiable.yml new file mode 100644 index 00000000..6d608cec --- /dev/null +++ b/integrationtests/test_data/crawler_main_with_list_of_dirs/identifiable.yml @@ -0,0 +1,2 @@ +TestType: + - name diff --git a/integrationtests/test_issues.py b/integrationtests/test_issues.py index cb1e2e09..c699e0ab 100644 --- a/integrationtests/test_issues.py +++ b/integrationtests/test_issues.py @@ -1,4 +1,4 @@ -# This file is a part of the CaosDB Project. +# This file is a part of the LinkAhead Project. 
# # Copyright (C) 2022 Indiscale GmbH <info@indiscale.com> # 2022 Florian Spreckelsen <f.spreckelsen@indiscale.com> -- GitLab From 3a1384d7bbaaea7752e557cf2b14814fa88f706f Mon Sep 17 00:00:00 2001 From: Florian Spreckelsen <f.spreckelsen@indiscale.com> Date: Thu, 5 Dec 2024 18:39:37 +0100 Subject: [PATCH 067/131] FIX: Raise NotImplementedError at correct position --- src/caoscrawler/crawl.py | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/src/caoscrawler/crawl.py b/src/caoscrawler/crawl.py index fd0beaa2..e3dd0488 100644 --- a/src/caoscrawler/crawl.py +++ b/src/caoscrawler/crawl.py @@ -661,6 +661,12 @@ one with the entities that need to be updated and the other with entities to be "use for example the Scanner to create this data.")) crawled_data = self.crawled_data + if isinstance(path_for_authorized_run, list) and self.securityMode != SecurityMode.UPDATE: + raise NotImplementedError( + "Authorization of inserts and updates is currently implemented only " + "for single paths, not for lists of paths." + ) + to_be_inserted, to_be_updated = self._split_into_inserts_and_updates( SyncGraph(crawled_data, self.identifiableAdapter)) @@ -718,21 +724,11 @@ one with the entities that need to be updated and the other with entities to be update_cache = UpdateCache() pending_inserts = update_cache.get_inserts(self.run_id) if pending_inserts: - if isinstance(path_for_authorized_run, list): - raise NotImplementedError( - "Authorization of inserts is currently implemented only for single paths, " - "not for lists of paths." - ) Crawler.inform_about_pending_changes( pending_inserts, self.run_id, path_for_authorized_run) pending_updates = update_cache.get_updates(self.run_id) if pending_updates: - if isinstance(path_for_authorized_run, list): - raise NotImplementedError( - "Authorization of updates is currently implemented only for single paths, " - "not for lists of paths." 
- ) Crawler.inform_about_pending_changes( pending_updates, self.run_id, path_for_authorized_run) -- GitLab From ff0a883a67052f97d0681903c1c24a09048d173c Mon Sep 17 00:00:00 2001 From: Florian Spreckelsen <f.spreckelsen@indiscale.com> Date: Fri, 6 Dec 2024 11:21:35 +0100 Subject: [PATCH 068/131] DOC: Update changelog --- CHANGELOG.md | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 4fe40138..fe302156 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -23,6 +23,13 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 variables to `int`, `float`, `str` and `bool`. - Transformer function definition in the cfood support variable substitutions now. +- `crawler_main` and `scanner.scan_directory` now support list of + directories to be crawled, too. Note that giving a list of + directories is currently incompatible with + `securityMode=SecurityMode.RETRIEVE` or + `securityMode=SecurityMode.INSERT` since the functionality to + authoriye pending inserts or updates doesn't support path lists yet + and will raise a NotImplementedError for now. ### Changed ### -- GitLab From 4de47f4ab588e3367369918075d9d0b9292d11a5 Mon Sep 17 00:00:00 2001 From: Florian Spreckelsen <f.spreckelsen@indiscale.com> Date: Fri, 6 Dec 2024 11:28:39 +0100 Subject: [PATCH 069/131] TST: Test crawler_main return code --- integrationtests/test_crawler_main.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/integrationtests/test_crawler_main.py b/integrationtests/test_crawler_main.py index 3c0ec57e..793dd6ed 100644 --- a/integrationtests/test_crawler_main.py +++ b/integrationtests/test_crawler_main.py @@ -78,12 +78,15 @@ def test_not_implemented_list_with_authorization(caplog, clear_database): dirlist = [basepath / "dir1", basepath / "dir2"] # This is not implemented yet, so check log for correct error. 
- crawler_main( + ret = crawler_main( dirlist, cfood_file_name=basepath / "cfood.yml", identifiables_definition_file=basepath / "identifiable.yml", securityMode=SecurityMode.RETRIEVE ) + # crawler_main hides the error, but has a non-zero return code and + # errors in the log: + assert ret != 0 err_tuples = [t for t in caplog.record_tuples if t[1] == logging.ERROR] assert len(err_tuples) == 1 assert "currently implemented only for single paths, not for lists of paths" in err_tuples[0][2] -- GitLab From f6bac3e6adc276811e369aeaee47d20d1f34d5f6 Mon Sep 17 00:00:00 2001 From: Florian Spreckelsen <f.spreckelsen@indiscale.com> Date: Fri, 6 Dec 2024 17:02:25 +0100 Subject: [PATCH 070/131] STY: autopep8'd --- unittests/test_scanner.py | 5 +++-- unittests/test_utilities.py | 3 ++- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/unittests/test_scanner.py b/unittests/test_scanner.py index 60ab2591..120d804c 100644 --- a/unittests/test_scanner.py +++ b/unittests/test_scanner.py @@ -128,9 +128,10 @@ def test_record_structure_generation(): assert subd[0]["identifier"] == "climate-model-predict" assert subd[0]["Project"].__class__ == db.Record - assert subd[0]["DataAnalysis"] == os.path.join("examples_article" , "DataAnalysis") + assert subd[0]["DataAnalysis"] == os.path.join("examples_article", "DataAnalysis") assert subc[0]["DataAnalysis"] is True - assert subd[0]["project_dir"] == os.path.join("examples_article", "DataAnalysis", "2020_climate-model-predict") + assert subd[0]["project_dir"] == os.path.join( + "examples_article", "DataAnalysis", "2020_climate-model-predict") assert subc[0]["project_dir"] is False # Check the copy flags for the first level in the hierarchy: diff --git a/unittests/test_utilities.py b/unittests/test_utilities.py index da1245b3..a9b05252 100644 --- a/unittests/test_utilities.py +++ b/unittests/test_utilities.py @@ -35,7 +35,8 @@ def test_split_restricted_path(): assert split_restricted_path(f"{sep}test1{sep}test2{sep}bla") == ["test1", 
"test2", "bla"] assert split_restricted_path(f"{sep}test{sep}{sep}bla") == ["test", "bla"] assert split_restricted_path(f"{sep}{sep}test{sep}bla") == ["test", "bla"] - assert split_restricted_path(f"{sep}{sep}{sep}test{sep}{sep}bla{sep}{sep}{sep}{sep}") == ["test", "bla"] + assert split_restricted_path( + f"{sep}{sep}{sep}test{sep}{sep}bla{sep}{sep}{sep}{sep}") == ["test", "bla"] def test_dummy_class(): -- GitLab From 3ebadf374f81f41e5e1aac8c0ecedba6bc71c6e9 Mon Sep 17 00:00:00 2001 From: Florian Spreckelsen <f.spreckelsen@indiscale.com> Date: Fri, 6 Dec 2024 17:17:03 +0100 Subject: [PATCH 071/131] MAINT: Use platform-independent tmp and paths --- integrationtests/test_crawler_main.py | 3 ++- src/caoscrawler/scanner.py | 4 ++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/integrationtests/test_crawler_main.py b/integrationtests/test_crawler_main.py index 793dd6ed..a2eebf4f 100644 --- a/integrationtests/test_crawler_main.py +++ b/integrationtests/test_crawler_main.py @@ -18,6 +18,7 @@ # import logging +import tempfile from pathlib import Path @@ -49,7 +50,7 @@ def test_list_of_paths(clear_database, monkeypatch): monkeypatch.setattr(crawl, "_update_status_record", _mock_update_status_record) # mock SSS environment - monkeypatch.setenv("SHARED_DIR", "/tmp") + monkeypatch.setenv("SHARED_DIR", tempfile.gettempdir()) # We need only one dummy RT rt = db.RecordType(name="TestType").insert() diff --git a/src/caoscrawler/scanner.py b/src/caoscrawler/scanner.py index 6b4d7a12..af1f4173 100644 --- a/src/caoscrawler/scanner.py +++ b/src/caoscrawler/scanner.py @@ -465,8 +465,8 @@ def scan_directory(dirname: Union[str, list[str]], crawler_definition_path: str, # TODO: needs to be covered somewhere else crawled_directory = dname - if not dir_structure_name and dname.endswith('/'): - if dname == '/': + if not dir_structure_name and dname.endswith(os.path.sep): + if dname == os.path.sep: # Crawling the entire file system dir_structure_name = "root" else: -- GitLab 
From 25803a7231c70a5700382a5cc430ac2164d2eb07 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Henrik=20tom=20W=C3=B6rden?= <h.tomwoerden@indiscale.com> Date: Tue, 10 Dec 2024 13:02:23 +0100 Subject: [PATCH 072/131] ENH: raise an Error instead of just warning when timestamp file has bad content --- src/caoscrawler/converters/converters.py | 6 +++--- unittests/test_converters.py | 5 +++-- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/src/caoscrawler/converters/converters.py b/src/caoscrawler/converters/converters.py index da752f68..df0d77b1 100644 --- a/src/caoscrawler/converters/converters.py +++ b/src/caoscrawler/converters/converters.py @@ -831,13 +831,13 @@ class DirectoryConverter(Converter): stamp_str = ref_file.readline().strip() try: return datetime.datetime.fromisoformat(stamp_str) - except ValueError: - logger.warn( + except ValueError as e: + logger.error( f"Reference file in {self.definition['match_newer_than_file']} " "doesn't contain a ISO formatted datetime in its first line. " "Match regardless of modification times." 
) - return datetime.datetime.min + raise e class SimpleFileConverter(Converter): diff --git a/unittests/test_converters.py b/unittests/test_converters.py index 7b22aa84..e4b442d9 100644 --- a/unittests/test_converters.py +++ b/unittests/test_converters.py @@ -1113,9 +1113,10 @@ def test_directory_converter_change_date(caplog, converter_registry): # Match but warn with open(tmpfi.name, "w") as fi: fi.write(f"This is garbage.\n") - assert dc.match(test_dir_element) is not None + with pytest.raises(ValueError): + dc.match(test_dir_element) assert len(caplog.record_tuples) == 1 - assert caplog.record_tuples[0][1] == logging.WARNING + assert caplog.record_tuples[0][1] == logging.ERROR assert tmpfi.name in caplog.record_tuples[0][2] assert "doesn't contain a ISO formatted datetime in its first line" in caplog.record_tuples[0][2] -- GitLab From caa680aee3b474bf8f153e4f476c8b087402333c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Henrik=20tom=20W=C3=B6rden?= <h.tomwoerden@indiscale.com> Date: Tue, 10 Dec 2024 13:34:00 +0100 Subject: [PATCH 073/131] DOC: add comment (TODO) --- src/caoscrawler/crawl.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/caoscrawler/crawl.py b/src/caoscrawler/crawl.py index e3dd0488..a939b2ff 100644 --- a/src/caoscrawler/crawl.py +++ b/src/caoscrawler/crawl.py @@ -1166,6 +1166,7 @@ def parse_args(): "This file will only be generated if this option is set.") parser.add_argument("--debug", required=False, action="store_true", help="Path name of the cfood yaml file to be used.") + # TODO allow to provide multiple directories to be crawled on the commandline parser.add_argument("crawled_directory_path", help="The subtree of files below the given path will " "be considered. 
Use '/' for everything.") -- GitLab From f77d7340c761f4d3725c0d350ace27ae0c00ba35 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Henrik=20tom=20W=C3=B6rden?= <h.tomwoerden@indiscale.com> Date: Sat, 14 Dec 2024 14:09:02 +0100 Subject: [PATCH 074/131] FIX: make string a format string --- src/caoscrawler/crawl.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/caoscrawler/crawl.py b/src/caoscrawler/crawl.py index a939b2ff..e0d24397 100644 --- a/src/caoscrawler/crawl.py +++ b/src/caoscrawler/crawl.py @@ -531,8 +531,8 @@ one with the entities that need to be updated and the other with entities to be prop.value = Crawler._get_property_id_for_datatype( rtname=prop.datatype, name=prop.value) except (db.EmptyUniqueQueryError, db.QueryNotUniqueError): - logger.error("The Property {prop.name} with datatype={prop.datatype} has the " - "value {prop.value} and there is no appropriate Entity with such " + logger.error(f"The Property {prop.name} with datatype={prop.datatype} has the " + f"value {prop.value} and there is no appropriate Entity with such " "a name.") raise else: @@ -548,8 +548,8 @@ one with the entities that need to be updated and the other with entities to be name=el)) except (db.EmptyUniqueQueryError, db.QueryNotUniqueError): logger.error( - "The Property {prop.name} with datatype={prop.datatype} has the " - "value {prop.value} and there is no appropriate Entity with such " + f"The Property {prop.name} with datatype={prop.datatype} has the " + f"value {prop.value} and there is no appropriate Entity with such " "a name.") raise else: -- GitLab From 9eb92c4c9204c1fa5d790a5ae9dd81a48f50c361 Mon Sep 17 00:00:00 2001 From: "i.nueske" <i.nueske@indiscale.com> Date: Mon, 16 Dec 2024 12:32:21 +0100 Subject: [PATCH 075/131] DOC: Add basic documentation for ROCrateConverter, ELNFileConverter, and ROCrateEntityConverter --- src/doc/converters/further_converters.rst | 73 +++++++++++++++++++++++ 1 file changed, 73 insertions(+) diff --git 
a/src/doc/converters/further_converters.rst b/src/doc/converters/further_converters.rst index a334c877..1acf4c02 100644 --- a/src/doc/converters/further_converters.rst +++ b/src/doc/converters/further_converters.rst @@ -98,3 +98,76 @@ given ``recordname``, this record can be used within the cfood. Most importantly, this record stores the internal path of this array within the HDF5 file in a text property, the name of which can be configured with the ``internal_path_property_name`` option which defaults to ``internal_hdf5_path``. + + + +ROCrateConverter +================ + +The ROCrateConverter unpacks ro-crate files, and creates one instance of the +``ROCrateEntity`` structure element for each contained object. Currently only +zipped ro-crate files are supported. The created ROCrateEntities wrap a +``rocrate.model.entity.Entity`` with a path to the folder the ROCrate data +is saved in, and can then be treated by the :ref:`ROCrateEntityConverter`. +To use the ROCrateConverter, you need to install the LinkAhead crawler with its +optional ``rocrate`` dependency. + +ELNFileConverter +---------------- + +As .eln files are zipped ro-crate files, the ELNFileConverter works analogously +to the ROCrateConverter and also creates ROCrateEntities for contained objects. + +ROCrateEntityConverter +---------------------- + +The ROCrateEntityConverter unpacks the properties, files and parts of the +``rocrate.model.entity.Entity`` wrapped within a ROCrateEntity. Properties +are converted to a basic element matching their value +(``BooleanElement``, ``IntegerElement``, etc.), each ``rocrate.model.file.File`` +is converted to a crawler File object, and each subpart of the ROCrateEntity is +again converted to a ROCrateEntity. These elements can then again be treated +using this or their respective inbuilt converters. + +Example cfood +------------- + +One short cfood to generate records for each .eln file in a directory and +their metadata files could be: + +.. 
code-block:: yaml + + --- + metadata: + crawler-version: 0.9.0 + --- + Converters: + ELNFile: + converter: ELNFileConverter + package: caoscrawler.converters.rocrate + ROCrateEntity: + converter: ROCrateEntityConverter + package: caoscrawler.converters.rocrate + + ParentDirectory: + type: Directory + match: (.*) + subtree: + ELNFile: + type: ELNFile + match: (?P<filename>.*)\.eln + records: + ELNExampleRecord: + filename: $filename + subtree: + ROCrateEntity: + type: ROCrateEntity + match_properties: + "@id": ro-crate-metadata.json + dateCreated: (?P<dateCreated>.*) + records: + MDExampleRecord: + parent: $ELNFile + filename: ro-crate-metadata.json + time: $dateCreated + -- GitLab From 6563280fbc68517f821d69b4c4405e446979fa55 Mon Sep 17 00:00:00 2001 From: "i.nueske" <i.nueske@indiscale.com> Date: Mon, 16 Dec 2024 12:39:25 +0100 Subject: [PATCH 076/131] DOC: Update CHANGELOG.md --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 354024f9..939baa6e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -48,6 +48,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Security ### ### Documentation ### +- Added documentation for ROCrateConverter, ELNFileConverter, and ROCrateEntityConverter ## [0.10.1] - 2024-11-13 ## -- GitLab From dd773f1387f9021cab914d67f909172ceb9852de Mon Sep 17 00:00:00 2001 From: Alexander Schlemmer <a.schlemmer@indiscale.com> Date: Thu, 19 Dec 2024 08:56:54 +0100 Subject: [PATCH 077/131] MAINT: removed unnecessary print statement --- src/caoscrawler/converters/converters.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/caoscrawler/converters/converters.py b/src/caoscrawler/converters/converters.py index d06415f7..ef35f1b0 100644 --- a/src/caoscrawler/converters/converters.py +++ b/src/caoscrawler/converters/converters.py @@ -514,7 +514,7 @@ class Converter(object, metaclass=ABCMeta): matched_m_prop = None matched_m_prop_value = None for 
prop_key, prop_value in properties.items(): - print("{} = {}".format(prop_key, prop_value)) + # print("{} = {}".format(prop_key, prop_value)) # TODO: automatic conversion to str ok? m_prop = re.match(prop_def_key, str(prop_key)) if m_prop is not None: -- GitLab From a93537377d7cc39a1ecaf11ae9d18f10026202e3 Mon Sep 17 00:00:00 2001 From: Alexander Schlemmer <a.schlemmer@indiscale.com> Date: Thu, 19 Dec 2024 10:02:01 +0100 Subject: [PATCH 078/131] ENH(scanner): allow recursive references to records on the same level in the cfood --- src/caoscrawler/converters/converters.py | 51 ++++++++++++++++++++---- 1 file changed, 43 insertions(+), 8 deletions(-) diff --git a/src/caoscrawler/converters/converters.py b/src/caoscrawler/converters/converters.py index f95862a9..47950b36 100644 --- a/src/caoscrawler/converters/converters.py +++ b/src/caoscrawler/converters/converters.py @@ -249,11 +249,34 @@ out: tuple return (propvalue, propunit, collection_mode) -def create_records(values: GeneralStore, records: RecordStore, def_records: dict): - # list of keys to identify, which variables have been set by which paths: - # the items are tuples: - # 0: record name - # 1: property name +def create_records(values: GeneralStore, + records: RecordStore, + def_records: dict) -> list[tuple[str, str]]: + """ + Create records in GeneralStore `values` and RecordStore `records` as given + by the definition in `def_records`. + + This function will be called during scanning using the cfood definition. + It also should be used by CustomConverters to set records as automatic substitution + and other crawler features are applied automatically. + + Arguments: + ---------- + values: GeneralStore + This GeneralStore will be used to access variables that are needed during variable substitution + in setting the properties of records and files. 
+ Furthermore, the records that are generated in this function will be stored in this GeneralStore + **additionally to** storing them in the RecordStore given as the second argument to this function. + + records: RecordStore + The RecordStore where the generated records will be stored. + + Returns: + -------- + A list of tuples containing the record names (1st element of tuple) and respective property names + as 2nd element of the tuples. This list will be used by the scanner for creating the debug tree. + + """ keys_modified = [] for name, record in def_records.items(): @@ -286,11 +309,22 @@ def create_records(values: GeneralStore, records: RecordStore, def_records: dict if (role == "Record" and "parents" not in record): c_record.add_parent(name) - c_record = records[name] - if isinstance(record, str): raise RuntimeError( "dict expected, but found str: {}".format(record)) + + # We do a second run over the def_records, here. Having finished the first run + # for creating the records (in the variable and records stores) makes sure that + # records, that are defined on this level can already be accessed during variable substitution + # in the properties that will be set in the next block. 
+ for name, record in def_records.items(): + # See above: + if record is None: + record = {} + + c_record = records[name] + + # Set the properties: for key, value in record.items(): if key == "parents" or key == "role": continue @@ -320,7 +354,8 @@ def create_records(values: GeneralStore, records: RecordStore, def_records: dict c_record.add_property(name=key, value=propvalue, unit=propunit) else: if collection_mode == "list": - if propunit and c_record.get_property(key).unit and propunit != c_record.get_property(key).unit: + if (propunit and c_record.get_property(key).unit + and propunit != c_record.get_property(key).unit): raise RuntimeError( f"Property '{key}' has contradictory units: " f"{propunit} and {c_record.get_property(key).unit}" -- GitLab From 7df6e3e6a7f89fbd121b60ced5299dff3fe27d40 Mon Sep 17 00:00:00 2001 From: Alexander Schlemmer <a.schlemmer@indiscale.com> Date: Thu, 19 Dec 2024 10:07:56 +0100 Subject: [PATCH 079/131] DOC: changelog updated --- CHANGELOG.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 354024f9..e0589ec4 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -44,6 +44,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Fixed ### - `spss_to_datamodel` script works again. +- The cfood now supports bi-directional references when defining records on the same level. 
+ (See: https://gitlab.indiscale.com/caosdb/src/caosdb-crawler/-/issues/175) ### Security ### -- GitLab From cb87b6e87c2b4822ca4353d2010d2c40c6f86f1b Mon Sep 17 00:00:00 2001 From: Alexander Schlemmer <a.schlemmer@indiscale.com> Date: Thu, 19 Dec 2024 10:13:07 +0100 Subject: [PATCH 080/131] TST(scanner): test for bi-directional definition only works if the identifying properties are non-recursive --- integrationtests/test_issues.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/integrationtests/test_issues.py b/integrationtests/test_issues.py index 01e2c303..f0a9d880 100644 --- a/integrationtests/test_issues.py +++ b/integrationtests/test_issues.py @@ -352,8 +352,10 @@ FirstConverter: type: DictElement records: Block: + name: block 1 Experiment: $Experiment Experiment: + name: experiment 1 Block: $Block """ @@ -377,9 +379,9 @@ FirstConverter: ident = CaosDBIdentifiableAdapter() ident.load_from_yaml_object(yaml.safe_load(""" Experiment: -- Block +- name Block: -- Experiment +- name """)) crawler = Crawler(identifiableAdapter=ident) -- GitLab From dcbfe3dc807fc82579704615e6fab0d71fb32811 Mon Sep 17 00:00:00 2001 From: Alexander Schlemmer <a.schlemmer@indiscale.com> Date: Fri, 20 Dec 2024 14:57:19 +0000 Subject: [PATCH 081/131] Apply 1 suggestion(s) to 1 file(s) Co-authored-by: Florian Spreckelsen <f.spreckelsen@indiscale.com> --- src/caoscrawler/converters/converters.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/caoscrawler/converters/converters.py b/src/caoscrawler/converters/converters.py index 47950b36..21f255f0 100644 --- a/src/caoscrawler/converters/converters.py +++ b/src/caoscrawler/converters/converters.py @@ -260,7 +260,7 @@ def create_records(values: GeneralStore, It also should be used by CustomConverters to set records as automatic substitution and other crawler features are applied automatically. 
- Arguments: + Parameters ---------- values: GeneralStore This GeneralStore will be used to access variables that are needed during variable substitution -- GitLab From f452017aeb6c5e89124e49676d886ec47c5f1ff5 Mon Sep 17 00:00:00 2001 From: Alexander Schlemmer <a.schlemmer@indiscale.com> Date: Fri, 20 Dec 2024 14:57:26 +0000 Subject: [PATCH 082/131] Apply 1 suggestion(s) to 1 file(s) Co-authored-by: Florian Spreckelsen <f.spreckelsen@indiscale.com> --- src/caoscrawler/converters/converters.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/caoscrawler/converters/converters.py b/src/caoscrawler/converters/converters.py index 21f255f0..94b8242c 100644 --- a/src/caoscrawler/converters/converters.py +++ b/src/caoscrawler/converters/converters.py @@ -271,8 +271,8 @@ def create_records(values: GeneralStore, records: RecordStore The RecordStore where the generated records will be stored. - Returns: - -------- + Returns + ------- A list of tuples containing the record names (1st element of tuple) and respective property names as 2nd element of the tuples. This list will be used by the scanner for creating the debug tree. -- GitLab From 5f5f3c4a477531385ce11f77dca3a9394ba03848 Mon Sep 17 00:00:00 2001 From: Alexander Schlemmer <a.schlemmer@indiscale.com> Date: Fri, 20 Dec 2024 14:57:42 +0000 Subject: [PATCH 083/131] Apply 1 suggestion(s) to 1 file(s) Co-authored-by: Florian Spreckelsen <f.spreckelsen@indiscale.com> --- src/caoscrawler/converters/converters.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/caoscrawler/converters/converters.py b/src/caoscrawler/converters/converters.py index 94b8242c..09942918 100644 --- a/src/caoscrawler/converters/converters.py +++ b/src/caoscrawler/converters/converters.py @@ -273,8 +273,9 @@ def create_records(values: GeneralStore, Returns ------- - A list of tuples containing the record names (1st element of tuple) and respective property names - as 2nd element of the tuples. 
This list will be used by the scanner for creating the debug tree. + : list[tuple[str, str]] + A list of tuples containing the record names (1st element of tuple) and respective property names + as 2nd element of the tuples. This list will be used by the scanner for creating the debug tree. """ keys_modified = [] -- GitLab From 73f1a84b8ada26566294279c093d575c49e3fe5b Mon Sep 17 00:00:00 2001 From: Alexander Schlemmer <a.schlemmer@indiscale.com> Date: Thu, 2 Jan 2025 13:22:11 +0100 Subject: [PATCH 084/131] TST: simplified data needed for integration test --- integrationtests/test_issues.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/integrationtests/test_issues.py b/integrationtests/test_issues.py index f0a9d880..8357388d 100644 --- a/integrationtests/test_issues.py +++ b/integrationtests/test_issues.py @@ -363,12 +363,8 @@ FirstConverter: [yaml.load(recursive_yaml, Loader=yaml.SafeLoader)]) converter_registry = create_converter_registry(crawler_definition) + # Nested DictElements that match the yaml structure in recursive_yaml: data = {"data": { - "value_with_unit": "1.1 m", - "array_with_units": [ - "1.1 cm", - "2.2 cm" - ] }} records = scan_structure_elements(DictElement(name="", value=data), crawler_definition, converter_registry) -- GitLab From 5f73baea2ee1c1a7283641cd7b74fce2d0978ace Mon Sep 17 00:00:00 2001 From: Alexander Schlemmer <a.schlemmer@indiscale.com> Date: Thu, 2 Jan 2025 13:22:34 +0100 Subject: [PATCH 085/131] TST: added test that bidirectional reference was inserted correctly --- integrationtests/test_issues.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/integrationtests/test_issues.py b/integrationtests/test_issues.py index 8357388d..0506fa4d 100644 --- a/integrationtests/test_issues.py +++ b/integrationtests/test_issues.py @@ -383,6 +383,14 @@ Block: crawler = Crawler(identifiableAdapter=ident) crawler.synchronize(crawled_data=records) + exp_res = db.execute_query("FIND Experiment") + assert len(exp_res) 
== 1 + exp_block = db.execute_query("FIND Block") + assert len(exp_block) == 1 + + assert exp_res[0].get_property("Block").value == exp_block[0].id + assert exp_block[0].get_property("Experiment").value == exp_res[0].id + def test_issue_14(clear_database): """ -- GitLab From 7535edf724a7b4b1922e33679655a91b751100a5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Henrik=20tom=20W=C3=B6rden?= <h.tomwoerden@indiscale.com> Date: Fri, 10 Jan 2025 10:08:40 +0100 Subject: [PATCH 086/131] MAINT: rename filter function --- src/caoscrawler/sync_node.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/caoscrawler/sync_node.py b/src/caoscrawler/sync_node.py index d912d646..46187c0d 100644 --- a/src/caoscrawler/sync_node.py +++ b/src/caoscrawler/sync_node.py @@ -256,9 +256,9 @@ class SyncNode(db.Entity): def parent_in_list(parent: Parent, plist: ParentList) -> bool: """helper function that checks whether a parent with the same name or ID is in the plist""" - return plist.filter(parent) + return plist.filter_by_identity(parent) def property_in_list(prop: db.Property, plist: PropertyList) -> bool: """helper function that checks whether a property with the same name or ID is in the plist""" - return plist.filter(prop) + return plist.filter_by_identity(prop) -- GitLab From 8b73beaa271a41149f444c26c7cdc0ca7584abf9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Henrik=20tom=20W=C3=B6rden?= <h.tomwoerden@indiscale.com> Date: Fri, 10 Jan 2025 11:25:52 +0100 Subject: [PATCH 087/131] TST: allow pipeline to pass --- .gitlab-ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index e4322356..9ad8f764 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -129,7 +129,7 @@ unittest_py3.9: # install dependencies - pip install pytest pytest-cov # TODO: Use f-branch logic here - - pip install git+https://gitlab.indiscale.com/caosdb/src/caosdb-pylib.git@dev + - pip install 
git+https://gitlab.indiscale.com/caosdb/src/caosdb-pylib.git@f-rename-filter - pip install git+https://gitlab.indiscale.com/caosdb/src/caosdb-advanced-user-tools.git@dev - pip install .[h5-crawler,spss,rocrate] # actual test -- GitLab From 1460d5bfc0070e977c3f47bdbb6044397b6b447e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Henrik=20tom=20W=C3=B6rden?= <h.tomwoerden@indiscale.com> Date: Fri, 10 Jan 2025 11:27:59 +0100 Subject: [PATCH 088/131] FIX: remove unused code --- .docker/Dockerfile | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/.docker/Dockerfile b/.docker/Dockerfile index 1468a17f..e549f2f4 100644 --- a/.docker/Dockerfile +++ b/.docker/Dockerfile @@ -16,16 +16,6 @@ RUN pip3 install --break-system-packages \ sphinx-rtd-theme \ ; COPY .docker/wait-for-it.sh /wait-for-it.sh -ARG PYLIB -ADD https://gitlab.indiscale.com/api/v4/projects/97/repository/commits/${PYLIB} \ - pylib_version.json -RUN git clone https://gitlab.indiscale.com/caosdb/src/caosdb-pylib.git && \ - cd caosdb-pylib && git checkout ${PYLIB} && pip3 install --break-system-packages . -ARG ADVANCED -ADD https://gitlab.indiscale.com/api/v4/projects/104/repository/commits/${ADVANCED} \ - advanced_version.json -RUN git clone https://gitlab.indiscale.com/caosdb/src/caosdb-advanced-user-tools.git && \ - cd caosdb-advanced-user-tools && git checkout ${ADVANCED} && pip3 install --break-system-packages .[h5-crawler] COPY . /git # Delete .git because it is huge. -- GitLab From 3a23058149b100e30eadab60c448188dd0ea2b8b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Henrik=20tom=20W=C3=B6rden?= <h.tomwoerden@indiscale.com> Date: Fri, 10 Jan 2025 11:33:58 +0100 Subject: [PATCH 089/131] Revert "TST: allow pipeline to pass" This reverts commit 8b73beaa271a41149f444c26c7cdc0ca7584abf9. 
--- .gitlab-ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 9ad8f764..e4322356 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -129,7 +129,7 @@ unittest_py3.9: # install dependencies - pip install pytest pytest-cov # TODO: Use f-branch logic here - - pip install git+https://gitlab.indiscale.com/caosdb/src/caosdb-pylib.git@f-rename-filter + - pip install git+https://gitlab.indiscale.com/caosdb/src/caosdb-pylib.git@dev - pip install git+https://gitlab.indiscale.com/caosdb/src/caosdb-advanced-user-tools.git@dev - pip install .[h5-crawler,spss,rocrate] # actual test -- GitLab From 9ea70aaff32ab117b5c3888a63340ede84d5e4a1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Henrik=20tom=20W=C3=B6rden?= <h.tomwoerden@indiscale.com> Date: Fri, 10 Jan 2025 11:53:14 +0100 Subject: [PATCH 090/131] MAINT: refactor pipeleine script --- .gitlab-ci.yml | 81 +++++++++++++++++++++++++++++--------------------- tox.ini | 5 ++-- 2 files changed, 49 insertions(+), 37 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index e4322356..1d454da0 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -104,6 +104,27 @@ stages: CAOSDB_TAG=${REFTAG}; fi - echo $CAOSDB_TAG + - if [ -z "$PYLIB" ]; then + if echo "$CI_COMMIT_REF_NAME" | grep -c "^f-" ; then + echo "Check if pylib has branch $CI_COMMIT_REF_NAME" ; + if wget https://gitlab.indiscale.com/api/v4/projects/97/repository/branches/${CI_COMMIT_REF_NAME} ; then + PYLIB=$CI_COMMIT_REF_NAME ; + fi; + fi; + fi; + - PYLIB=${PYLIB:-dev} + - echo $PYLIB + + - if [ -z "$ADVANCED" ]; then + if echo "$CI_COMMIT_REF_NAME" | grep -c "^f-" ; then + echo "Check if advanced user tools have branch $CI_COMMIT_REF_NAME" ; + if wget https://gitlab.indiscale.com/api/v4/projects/104/repository/branches/${CI_COMMIT_REF_NAME} ; then + ADVANCED=$CI_COMMIT_REF_NAME ; + fi; + fi; + fi; + - ADVANCED=${ADVANCED:-dev} + - echo $ADVANCED info: tags: [cached-dind] @@ -113,47 +134,58 @@ info: script: - *env 
-unittest_py3.11: - tags: [cached-dind] - stage: test - image: $CI_REGISTRY_IMAGE - script: - - python3 -c "import sys; assert sys.version.startswith('3.11')" - - tox - unittest_py3.9: tags: [cached-dind] stage: test + variables: + PYVER: 3.9 image: python:3.9 script: &python_test_script # install dependencies + - *env - pip install pytest pytest-cov - # TODO: Use f-branch logic here - - pip install git+https://gitlab.indiscale.com/caosdb/src/caosdb-pylib.git@dev - - pip install git+https://gitlab.indiscale.com/caosdb/src/caosdb-advanced-user-tools.git@dev + - pip install git+https://gitlab.indiscale.com/caosdb/src/caosdb-pylib.git@${PYLIB} + - pip install git+https://gitlab.indiscale.com/caosdb/src/caosdb-advanced-user-tools.git@{ADVANCED} - pip install .[h5-crawler,spss,rocrate] + - echo "import sys; assert sys.version.startswith('"$PYVER"')" + - python3 -c "import sys; assert sys.version.startswith('"$PYVER"')" # actual test - caosdb-crawler --help - pytest --cov=caosdb -vv ./unittests unittest_py3.10: - tags: [cached-dind] + variables: + PYVER: 3.10 stage: test + tags: [cached-dind] image: python:3.10 script: *python_test_script -unittest_py3.12: +unittest_py3.11: + variables: + PYVER: 3.9 tags: [cached-dind] stage: test + image: python:3.11 + script: *python_test_script + +unittest_py3.12: + variables: + PYVER: 3.12 + stage: test + tags: [cached-dind] image: python:3.12 script: *python_test_script unittest_py3.13: + variables: + PYVER: 3.13 tags: [cached-dind] stage: test image: python:3.13 script: *python_test_script + inttest: tags: [docker] services: @@ -170,6 +202,8 @@ inttest: - *env - docker login -u gitlab-ci-token -p $CI_JOB_TOKEN $CI_REGISTRY - echo $CAOSDB_TAG + - echo $PYLIB + - echo $ADVANCED - cd .docker # Store mariadb version @@ -233,27 +267,6 @@ build-testenv: script: - df -h - command -v wget - - if [ -z "$PYLIB" ]; then - if echo "$CI_COMMIT_REF_NAME" | grep -c "^f-" ; then - echo "Check if pylib has branch $CI_COMMIT_REF_NAME" ; - if wget 
https://gitlab.indiscale.com/api/v4/projects/97/repository/branches/${CI_COMMIT_REF_NAME} ; then - PYLIB=$CI_COMMIT_REF_NAME ; - fi; - fi; - fi; - - PYLIB=${PYLIB:-dev} - - echo $PYLIB - - - if [ -z "$ADVANCED" ]; then - if echo "$CI_COMMIT_REF_NAME" | grep -c "^f-" ; then - echo "Check if advanced user tools have branch $CI_COMMIT_REF_NAME" ; - if wget https://gitlab.indiscale.com/api/v4/projects/104/repository/branches/${CI_COMMIT_REF_NAME} ; then - ADVANCED=$CI_COMMIT_REF_NAME ; - fi; - fi; - fi; - - ADVANCED=${ADVANCED:-dev} - - echo $ADVANCED - docker login -u gitlab-ci-token -p $CI_JOB_TOKEN $CI_REGISTRY # use here general latest or specific branch latest... diff --git a/tox.ini b/tox.ini index e003e26e..65c7549d 100644 --- a/tox.ini +++ b/tox.ini @@ -6,9 +6,8 @@ skip_missing_interpreters = true deps = .[h5-crawler,spss,rocrate] pytest pytest-cov - # TODO: Make this f-branch sensitive - git+https://gitlab.indiscale.com/caosdb/src/caosdb-pylib.git@dev - git+https://gitlab.indiscale.com/caosdb/src/caosdb-advanced-user-tools.git@dev + caosdb-pylib + caosdb-advanced-user-tools commands = caosdb-crawler --help py.test --cov=caoscrawler -vv {posargs} -- GitLab From 8d623f8933f6dc88602b1815c2208d841ba314df Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Henrik=20tom=20W=C3=B6rden?= <h.tomwoerden@indiscale.com> Date: Fri, 10 Jan 2025 12:28:37 +0100 Subject: [PATCH 091/131] FIX: add missing dollar --- .gitlab-ci.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 1d454da0..91fd8fdf 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -145,7 +145,7 @@ unittest_py3.9: - *env - pip install pytest pytest-cov - pip install git+https://gitlab.indiscale.com/caosdb/src/caosdb-pylib.git@${PYLIB} - - pip install git+https://gitlab.indiscale.com/caosdb/src/caosdb-advanced-user-tools.git@{ADVANCED} + - pip install git+https://gitlab.indiscale.com/caosdb/src/caosdb-advanced-user-tools.git@${ADVANCED} - pip install 
.[h5-crawler,spss,rocrate] - echo "import sys; assert sys.version.startswith('"$PYVER"')" - python3 -c "import sys; assert sys.version.startswith('"$PYVER"')" @@ -271,7 +271,7 @@ build-testenv: - docker login -u gitlab-ci-token -p $CI_JOB_TOKEN $CI_REGISTRY # use here general latest or specific branch latest... - docker build - --build-arg PYLIB=${PYLIB} + --build-arg PYLIB=${PYLIB:dev} --build-arg ADVANCED=${ADVANCED:dev} --file .docker/Dockerfile -t $CI_REGISTRY_IMAGE . -- GitLab From ad096e50d5dc74977713c2f92e2b1734893d95e2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Henrik=20tom=20W=C3=B6rden?= <h.tomwoerden@indiscale.com> Date: Fri, 10 Jan 2025 12:40:11 +0100 Subject: [PATCH 092/131] FIX: string concatenation --- .gitlab-ci.yml | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 91fd8fdf..cd3e46b5 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -138,7 +138,7 @@ unittest_py3.9: tags: [cached-dind] stage: test variables: - PYVER: 3.9 + PYVER: "3.9" image: python:3.9 script: &python_test_script # install dependencies @@ -147,15 +147,15 @@ unittest_py3.9: - pip install git+https://gitlab.indiscale.com/caosdb/src/caosdb-pylib.git@${PYLIB} - pip install git+https://gitlab.indiscale.com/caosdb/src/caosdb-advanced-user-tools.git@${ADVANCED} - pip install .[h5-crawler,spss,rocrate] - - echo "import sys; assert sys.version.startswith('"$PYVER"')" - - python3 -c "import sys; assert sys.version.startswith('"$PYVER"')" + - echo "import sys; assert sys.version.startswith('$PYVER')" + - python3 -c "import sys; assert sys.version.startswith('$PYVER')" # actual test - caosdb-crawler --help - pytest --cov=caosdb -vv ./unittests unittest_py3.10: variables: - PYVER: 3.10 + PYVER: "3.10" stage: test tags: [cached-dind] image: python:3.10 @@ -163,7 +163,7 @@ unittest_py3.10: unittest_py3.11: variables: - PYVER: 3.9 + PYVER: "3.11" tags: [cached-dind] stage: test image: python:3.11 @@ -171,7 +171,7 @@ 
unittest_py3.11: unittest_py3.12: variables: - PYVER: 3.12 + PYVER: "3.12" stage: test tags: [cached-dind] image: python:3.12 @@ -179,7 +179,7 @@ unittest_py3.12: unittest_py3.13: variables: - PYVER: 3.13 + PYVER: "3.13" tags: [cached-dind] stage: test image: python:3.13 -- GitLab From 08ad7cfea99bfc80eac64725159516df54ef7327 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Henrik=20tom=20W=C3=B6rden?= <h.tomwoerden@indiscale.com> Date: Fri, 10 Jan 2025 13:25:44 +0100 Subject: [PATCH 093/131] MAINT: resturcture docker image --- .docker/Dockerfile | 43 +++++++++++++++++++++++++++++++++++++------ 1 file changed, 37 insertions(+), 6 deletions(-) diff --git a/.docker/Dockerfile b/.docker/Dockerfile index e549f2f4..42926cf7 100644 --- a/.docker/Dockerfile +++ b/.docker/Dockerfile @@ -1,3 +1,29 @@ +############################### +###### Temporary Image ######## +############################### +FROM debian:bookworm as git_base + +# Check for availability of DNS +RUN if getent hosts indiscale.com > /dev/null; \ + then echo "Connected to the internet and DNS available"; \ + else echo "No internet connection or DNS not available"; \ + fi + +RUN apt-get update && apt-get install -y \ + git + +COPY . /git + +# Delete .git because it is huge. +RUN rm -r /git/.git + +# Install pycaosdb.ini for the tests +RUN mv /git/.docker/tester_pycaosdb.ini /git/integrationtests/pycaosdb.ini + +############################### +###### Main Image Build ####### +############################### + FROM debian:bookworm RUN apt-get update && \ apt-get install \ @@ -16,13 +42,18 @@ RUN pip3 install --break-system-packages \ sphinx-rtd-theme \ ; COPY .docker/wait-for-it.sh /wait-for-it.sh -COPY . /git +ARG PYLIB +ADD https://gitlab.indiscale.com/api/v4/projects/97/repository/commits/${PYLIB} \ + pylib_version.json +RUN git clone https://gitlab.indiscale.com/caosdb/src/caosdb-pylib.git && \ + cd caosdb-pylib && git checkout ${PYLIB} && pip3 install --break-system-packages . 
+ARG ADVANCED +ADD https://gitlab.indiscale.com/api/v4/projects/104/repository/commits/${ADVANCED} \ + advanced_version.json +RUN git clone https://gitlab.indiscale.com/caosdb/src/caosdb-advanced-user-tools.git && \ + cd caosdb-advanced-user-tools && git checkout ${ADVANCED} && pip3 install --break-system-packages .[h5-crawler] -# Delete .git because it is huge. -RUN rm -r /git/.git - -# Install pycaosdb.ini for the tests -RUN mv /git/.docker/tester_pycaosdb.ini /git/integrationtests/pycaosdb.ini +COPY --from=git_base /git /git RUN cd /git/ && pip3 install --break-system-packages .[h5-crawler,spss,rocrate] -- GitLab From 40a07e9c959d7278913b925de4d4d81ca9168384 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Henrik=20tom=20W=C3=B6rden?= <h.tomwoerden@indiscale.com> Date: Fri, 10 Jan 2025 13:31:21 +0100 Subject: [PATCH 094/131] FIX: wget does not produce output --- .gitlab-ci.yml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index cd3e46b5..782768dd 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -107,7 +107,7 @@ stages: - if [ -z "$PYLIB" ]; then if echo "$CI_COMMIT_REF_NAME" | grep -c "^f-" ; then echo "Check if pylib has branch $CI_COMMIT_REF_NAME" ; - if wget https://gitlab.indiscale.com/api/v4/projects/97/repository/branches/${CI_COMMIT_REF_NAME} ; then + if wget -O /dev/null https://gitlab.indiscale.com/api/v4/projects/97/repository/branches/${CI_COMMIT_REF_NAME}>/dev/null ; then PYLIB=$CI_COMMIT_REF_NAME ; fi; fi; @@ -118,7 +118,7 @@ stages: - if [ -z "$ADVANCED" ]; then if echo "$CI_COMMIT_REF_NAME" | grep -c "^f-" ; then echo "Check if advanced user tools have branch $CI_COMMIT_REF_NAME" ; - if wget https://gitlab.indiscale.com/api/v4/projects/104/repository/branches/${CI_COMMIT_REF_NAME} ; then + if wget -O /dev/null https://gitlab.indiscale.com/api/v4/projects/104/repository/branches/${CI_COMMIT_REF_NAME} ; then ADVANCED=$CI_COMMIT_REF_NAME ; fi; fi; @@ -265,6 +265,7 @@ build-testenv: - pushes needs: 
[] script: + - *env - df -h - command -v wget -- GitLab From 9691b126d5ccbf07f61a10b707ad2bdb9107aab5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Henrik=20tom=20W=C3=B6rden?= <h.tomwoerden@indiscale.com> Date: Fri, 10 Jan 2025 14:26:27 +0100 Subject: [PATCH 095/131] FIX: remove tox from makefile --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 95fc2bf6..1ba97ae3 100644 --- a/Makefile +++ b/Makefile @@ -44,5 +44,5 @@ lint: .PHONY: lint unittest: - tox -r + pytest .PHONY: unittest -- GitLab From 02b58258177933713d830aebb57d8e9e8d576f93 Mon Sep 17 00:00:00 2001 From: "i.nueske" <i.nueske@indiscale.com> Date: Sat, 11 Jan 2025 13:43:05 +0100 Subject: [PATCH 096/131] DOC: ROCrateEntityConverter explanation --- src/doc/converters/further_converters.rst | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/src/doc/converters/further_converters.rst b/src/doc/converters/further_converters.rst index 1acf4c02..a3c306a2 100644 --- a/src/doc/converters/further_converters.rst +++ b/src/doc/converters/further_converters.rst @@ -108,7 +108,9 @@ The ROCrateConverter unpacks ro-crate files, and creates one instance of the ``ROCrateEntity`` structure element for each contained object. Currently only zipped ro-crate files are supported. The created ROCrateEntities wrap a ``rocrate.model.entity.Entity`` with a path to the folder the ROCrate data -is saved in, and can then be treated by the :ref:`ROCrateEntityConverter`. +is saved in. They are appended as children and can then be accessed via the +subtree and treated using the :ref:`ROCrateEntityConverter`. + To use the ROCrateConverter, you need to install the LinkAhead crawler with its optional ``rocrate`` dependency. @@ -121,13 +123,17 @@ to the ROCrateConverter and also creates ROCrateEntities for contained objects. 
ROCrateEntityConverter ---------------------- -The ROCrateEntityConverter unpacks the properties, files and parts of the -``rocrate.model.entity.Entity`` wrapped within a ROCrateEntity. Properties -are converted to a basic element matching their value -(``BooleanElement``, ``IntegerElement``, etc.), each ``rocrate.model.file.File`` -is converted to a crawler File object, and each subpart of the ROCrateEntity is -again converted to a ROCrateEntity. These elements can then again be treated -using this or their respective inbuilt converters. +The ROCrateEntityConverter unpacks the ``rocrate.model.entity.Entity`` wrapped +within a ROCrateEntity, and appends all properties, contained files, and parts +as children. Properties are converted to a basic element matching their value +(``BooleanElement``, ``IntegerElement``, etc.) and can be matched using +match_properties. Each ``rocrate.model.file.File`` is converted to a crawler +File object, which can be matched with SimpleFile. And each subpart of the +ROCrateEntity is also converted to a ROCrateEntity, which can then again be +treated using this converter. + +To match a ROCrateEntity using its entity_type, the match_entity_types keyword +can be used. 
Example cfood ------------- -- GitLab From 3db75326e10021b182a40e0ed3d031ab6e30a431 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Henrik=20tom=20W=C3=B6rden?= <h.tomwoerden@indiscale.com> Date: Mon, 13 Jan 2025 14:29:21 +0100 Subject: [PATCH 097/131] ENH: allow inheritance of identifiables --- src/caoscrawler/identifiable_adapters.py | 56 +++++++++++++++++++----- src/doc/concepts.rst | 17 ++++++- unittests/test_identifiable_adapters.py | 56 +++++++++++++++++++++++- 3 files changed, 116 insertions(+), 13 deletions(-) diff --git a/src/caoscrawler/identifiable_adapters.py b/src/caoscrawler/identifiable_adapters.py index 592f603b..2d7b0158 100644 --- a/src/caoscrawler/identifiable_adapters.py +++ b/src/caoscrawler/identifiable_adapters.py @@ -45,6 +45,12 @@ from .utils import has_parent logger = logging.getLogger(__name__) +def _retrieve_RecordType(id=None, name =None): + """ + Retrieve the RecordType from LinkAhead. For mocking purposes. + """ + return db.RecordType(name=name, id=id).retrieve() + def get_children_of_rt(rtname): """Supply the name of a recordtype. This name and the name of all children RTs are returned in a list""" @@ -586,8 +592,7 @@ class CaosDBIdentifiableAdapter(IdentifiableAdapter): self.load_from_yaml_object(identifiable_data) def load_from_yaml_object(self, identifiable_data): - """Load identifiables defined in a yaml object. - """ + """Load identifiables defined in a yaml object. 
""" for rt_name, id_list in identifiable_data.items(): rt = db.RecordType().add_parent(rt_name) @@ -611,7 +616,7 @@ class CaosDBIdentifiableAdapter(IdentifiableAdapter): self.register_identifiable(rt_name, rt) def register_identifiable(self, name: str, definition: db.RecordType): - self._registered_identifiables[name] = definition + self._registered_identifiables[name.lower()] = definition def get_file(self, identifiable: Identifiable): warnings.warn( @@ -630,20 +635,51 @@ class CaosDBIdentifiableAdapter(IdentifiableAdapter): return None return candidates[0] - def get_registered_identifiable(self, record: db.Entity): + def get_registered_identifiable(self, entity: db.Entity): """ returns the registered identifiable for the given Record It is assumed, that there is exactly one identifiable for each RecordType. Only the first parent of the given Record is considered; others are ignored """ - if len(record.parents) == 0: + if len(entity.parents) == 0: + return None + registerd = [] + for parent in entity.parents: + prt = _retrieve_RecordType(id=parent.id, name=parent.name) + reg = self._get_registered_for_rt(prt) + if reg is not None: + registerd.append(reg) + if len(registerd) > 1: + raise RuntimeError("Multiple registered identifiables found.") + elif len(registerd) == 1: + return registerd[0] + else: return None - # TODO We need to treat the case where multiple parents exist properly. 
- rt_name = record.parents[0].name - for name, definition in self._registered_identifiables.items(): - if definition.parents[0].name.lower() == rt_name.lower(): - return definition + + + def _get_registered_for_rt(self, rt): + """ + returns the registered identifiable for the given RecordType or the + registered identifiable of the first parent + """ + if rt.name.lower() in self._registered_identifiables: + return self._registered_identifiables[rt.name.lower()] + if len(rt.parents) == 0: + return None + registerd = [] + for parent in rt.parents: + prt = _retrieve_RecordType(id=parent.id, name=parent.name) + registerd.append(self._get_registered_for_rt(prt)) + if len(registerd) > 1: + raise RuntimeError("Multiple registered identifiables found.") + elif len(registerd) == 1: + return registerd[0] + else: + return None + + + def retrieve_identified_record_for_identifiable(self, identifiable: Identifiable): query_string = self.create_query_for_identifiable(identifiable) diff --git a/src/doc/concepts.rst b/src/doc/concepts.rst index b3aa02a1..f6a7c547 100644 --- a/src/doc/concepts.rst +++ b/src/doc/concepts.rst @@ -79,7 +79,7 @@ A Registered Identifiable is the blue print for Identifiables. You can think of registered identifiables as identifiables without concrete values for properties. RegisteredIdentifiables are associated with RecordTypes and define of what information an identifiable for that RecordType -exists. There can be multiple Registered Identifiables for one RecordType. +exists. There cannot be multiple Registered Identifiables for one RecordType. If identifiables shall contain references to the object to be identified, the Registered Identifiable must list the RecordTypes of the Entities that have those references. @@ -94,6 +94,21 @@ reference the object to be identified. You can also use the wildcard "*" as RecordType name in the configuration which will only require, that ANY Record references the Record at hand. 
+If a Record has multiple parents, at most one of them may have a registered identifiable. + +Reasoning: +If there are multiple registered identifiables that could be used to identify a given record and only a single +one of them is used, it might be that the existence check returns a different result than if another one would +be used. This would allow for unpredictable and inconsistent behavior (Example: one registered identifiable +contains the name, another one the property date. Using the name might imply that the record does not exist and using +the date might imply that it does. Thus, for any Record the registered identifiable must be unique). +Analogous Example: If you think in the context of relational databases, there can always only be a foreign key +associated with one table. + +When no registered identifiable exists for the direct parents, registered identifiables may be used +from their parents. If multiple recordtypes exist in the inheritance chain with a registered identifiable, then +the one that is closest to the direct parent is used. In case of multiple inheritance, at most one branch may have registered identifiables. 
+ Identified Records ++++++++++++++++++ diff --git a/unittests/test_identifiable_adapters.py b/unittests/test_identifiable_adapters.py index bdc0ab85..57a1c856 100644 --- a/unittests/test_identifiable_adapters.py +++ b/unittests/test_identifiable_adapters.py @@ -44,6 +44,19 @@ from caoscrawler.sync_graph import SyncNode UNITTESTDIR = Path(__file__).parent + +def mock_retrieve_RecordType(id, name): + return { + "Person": db.RecordType(name="Person"), + "Keyword": db.RecordType(name="Keyword"), + "Project": db.RecordType(name="Project"), + "A": db.RecordType(name="A"), + "Experiment": db.RecordType(name="Experiment"), + "Lab": db.RecordType(name="Lab"), + "Analysis": db.RecordType(name="Analysis"), + "Measurement": db.RecordType(name="Measurement").add_parent("Experiment") + }[name] + def test_create_query_for_identifiable(): query = IdentifiableAdapter.create_query_for_identifiable( Identifiable(record_type="Person", properties={"first_name": "A", "last_name": "B"})) @@ -99,7 +112,8 @@ def test_create_query_for_identifiable(): Identifiable(record_type="record type", name="it's weird")) assert query == ("FIND RECORD 'record type' WITH name='it\\'s weird'") - +@patch("caoscrawler.identifiable_adapters._retrieve_RecordType", + new=Mock(side_effect=mock_retrieve_RecordType)) def test_load_from_yaml_file(): ident = CaosDBIdentifiableAdapter() ident.load_from_yaml_definition( @@ -174,7 +188,8 @@ def test_convert_value(): assert convert_value(A()) == " a " - +@patch("caoscrawler.identifiable_adapters._retrieve_RecordType", + new=Mock(side_effect=mock_retrieve_RecordType)) def test_get_identifiable(): ident = CaosDBIdentifiableAdapter() ident.load_from_yaml_definition(UNITTESTDIR / "example_identifiables.yml") @@ -283,3 +298,40 @@ def test_referencing_entity_has_appropriate_type(): assert rft(dummy.parents, registered_identifiable) registered_identifiable.properties[0].value = ["B", "*"] assert rft(dummy.parents, registered_identifiable) + 
+@patch("caoscrawler.identifiable_adapters._retrieve_RecordType", + new=Mock(side_effect=mock_retrieve_RecordType)) +def test_get_registered_identifiable(): + # Test the case that the record has a parent for which an identifiable is registered + ident = CaosDBIdentifiableAdapter() + ident.load_from_yaml_definition(UNITTESTDIR / "example_identifiables.yml") + rec = db.Record().add_parent(name="Experiment") + registered = ident.get_registered_identifiable(rec) + assert registered is not None + assert registered.parents[0].name == "Experiment" + + # Test the same but with an additional parent + rec = db.Record().add_parent(name="Lab").add_parent(name="Experiment") + registered = ident.get_registered_identifiable(rec) + assert registered is not None + assert registered.parents[0].name == "Experiment" + + + # Test the same but with an additional parent that also has a registered identifiable + rec = db.Record().add_parent(name="Analysis").add_parent(name="Experiment") + with pytest.raises(RuntimeError): + registered = ident.get_registered_identifiable(rec) + + # Test the same but with an additional parent that also has a registered identifiable + rec = db.Record().add_parent(name="Measurement").add_parent(name="Experiment") + with pytest.raises(RuntimeError): + registered = ident.get_registered_identifiable(rec) + + # Test the case that the record has a parent for which no identifiable is registered + # and there is a registered identifiable for a grand parent + ident = CaosDBIdentifiableAdapter() + ident.load_from_yaml_definition(UNITTESTDIR / "example_identifiables.yml") + rec = db.Record().add_parent(name="Measurement") + registered = ident.get_registered_identifiable(rec) + assert registered is not None + assert registered.parents[0].name == "Experiment" \ No newline at end of file -- GitLab From cf6fdcd4de7dc38bfd32c51b6bb685ee39b19d7d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Henrik=20tom=20W=C3=B6rden?= <h.tomwoerden@indiscale.com> Date: Mon, 13 Jan 2025 14:35:57 
+0100 Subject: [PATCH 098/131] FIX: rename vars --- src/caoscrawler/identifiable_adapters.py | 26 ++++++++++++------------ 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/src/caoscrawler/identifiable_adapters.py b/src/caoscrawler/identifiable_adapters.py index 2d7b0158..59adbf3b 100644 --- a/src/caoscrawler/identifiable_adapters.py +++ b/src/caoscrawler/identifiable_adapters.py @@ -635,25 +635,25 @@ class CaosDBIdentifiableAdapter(IdentifiableAdapter): return None return candidates[0] - def get_registered_identifiable(self, entity: db.Entity): + def get_registered_identifiable(self, record: db.Entity): """ returns the registered identifiable for the given Record It is assumed, that there is exactly one identifiable for each RecordType. Only the first parent of the given Record is considered; others are ignored """ - if len(entity.parents) == 0: + if len(record.parents) == 0: return None - registerd = [] - for parent in entity.parents: + registered = [] + for parent in record.parents: prt = _retrieve_RecordType(id=parent.id, name=parent.name) reg = self._get_registered_for_rt(prt) if reg is not None: - registerd.append(reg) - if len(registerd) > 1: + registered.append(reg) + if len(registered) > 1: raise RuntimeError("Multiple registered identifiables found.") - elif len(registerd) == 1: - return registerd[0] + elif len(registered) == 1: + return registered[0] else: return None @@ -667,14 +667,14 @@ class CaosDBIdentifiableAdapter(IdentifiableAdapter): return self._registered_identifiables[rt.name.lower()] if len(rt.parents) == 0: return None - registerd = [] + registered = [] for parent in rt.parents: prt = _retrieve_RecordType(id=parent.id, name=parent.name) - registerd.append(self._get_registered_for_rt(prt)) - if len(registerd) > 1: + registered.append(self._get_registered_for_rt(prt)) + if len(registered) > 1: raise RuntimeError("Multiple registered identifiables found.") - elif len(registerd) == 1: - return registerd[0] + elif len(registered) == 
1: + return registered[0] else: return None -- GitLab From 36de070666543047ed2e82b040ee29eae2c2ffe2 Mon Sep 17 00:00:00 2001 From: Florian Spreckelsen <f.spreckelsen@indiscale.com> Date: Mon, 13 Jan 2025 14:57:15 +0100 Subject: [PATCH 099/131] PIPELINE: Reove unnessary git installation --- .docker/Dockerfile | 3 --- 1 file changed, 3 deletions(-) diff --git a/.docker/Dockerfile b/.docker/Dockerfile index 42926cf7..14c3c1ef 100644 --- a/.docker/Dockerfile +++ b/.docker/Dockerfile @@ -9,9 +9,6 @@ RUN if getent hosts indiscale.com > /dev/null; \ else echo "No internet connection or DNS not available"; \ fi -RUN apt-get update && apt-get install -y \ - git - COPY . /git # Delete .git because it is huge. -- GitLab From 42838e73baf63c3f4fa0c7541d8266f5ad76d2ac Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Henrik=20tom=20W=C3=B6rden?= <h.tomwoerden@indiscale.com> Date: Mon, 13 Jan 2025 15:05:15 +0100 Subject: [PATCH 100/131] TST: refine test --- src/caoscrawler/identifiable_adapters.py | 10 +++++++--- src/doc/concepts.rst | 22 ++++++++++++++-------- unittests/test_identifiable_adapters.py | 5 +++-- 3 files changed, 24 insertions(+), 13 deletions(-) diff --git a/src/caoscrawler/identifiable_adapters.py b/src/caoscrawler/identifiable_adapters.py index 59adbf3b..98281c4d 100644 --- a/src/caoscrawler/identifiable_adapters.py +++ b/src/caoscrawler/identifiable_adapters.py @@ -650,15 +650,17 @@ class CaosDBIdentifiableAdapter(IdentifiableAdapter): reg = self._get_registered_for_rt(prt) if reg is not None: registered.append(reg) + # TODO we might in future want to check whether the registered identifiables are the same if len(registered) > 1: - raise RuntimeError("Multiple registered identifiables found.") + raise RuntimeError("Multiple registered identifiables found for a Record " + f"with the following parents: {record.parents}") elif len(registered) == 1: return registered[0] else: return None - def _get_registered_for_rt(self, rt): + def _get_registered_for_rt(self, rt: 
db.RecordType): """ returns the registered identifiable for the given RecordType or the registered identifiable of the first parent @@ -671,8 +673,10 @@ class CaosDBIdentifiableAdapter(IdentifiableAdapter): for parent in rt.parents: prt = _retrieve_RecordType(id=parent.id, name=parent.name) registered.append(self._get_registered_for_rt(prt)) + # TODO we might in future want to check whether the registered identifiables are the same if len(registered) > 1: - raise RuntimeError("Multiple registered identifiables found.") + raise RuntimeError("Multiple registered identifiables found for the RecordType " + f" {rt.name} with the following parents: {rt.parents}") elif len(registered) == 1: return registered[0] else: diff --git a/src/doc/concepts.rst b/src/doc/concepts.rst index f6a7c547..15b7812f 100644 --- a/src/doc/concepts.rst +++ b/src/doc/concepts.rst @@ -94,20 +94,26 @@ reference the object to be identified. You can also use the wildcard "*" as RecordType name in the configuration which will only require, that ANY Record references the Record at hand. -If a Record has multiple parents, only one of them must have an registered identifiable. + +Instead of defining registered identifiables for a RecordType directly, they can be +defined for their parents. I.e. if there is no registered identifiable for a RecordType, +then it will be checked whether there is a parent that has one. +If multiple recordtypes exist in the inheritance chain with a registered identifiable, then +the one that is closest to the direct parent is used. In case of multiple inheritance, only one branch must have registered identifiables. Reasoning: -If there are mutliple registered identifiables that could be used to identify a given record, then only a single -one of them is used, it might be that the existence check returns a different result than if another one would -be used. 
This would allow for unpredictable and inconsistent behavior(Example: one registered identifiable +If there would be mutliple registered identifiables that could be used to identify a given record and only a single +one of them would used, it might be that the existence check returns a different result than if the other one would +be used. This would allow for unpredictable and inconsistent behavior (Example: one registered identifiable contains the name another one property date. Using the name might imply that the record does not exist and using the date might imply that it does. Thus, for any Record the registered identifiable must be unique). -Anlogous Example: If you tinnk in the context, of relational databases, there can always only be a foreign key +Anlogous Example: If you think in the context, of relational databases, there can always only be a foreign key associated with one table. -When no registered identifiable exist for the direct parents, registered identifiables may be used -from their parents. If multiple recordtypes exist in the inheritance chain with a registered identifiable, then -the one that is closest to the direct parent is used. In case of multiple inheritance, only one branch must have registered identifiables. +Note: +In case of using the registered identifiable of a parent, the identifiable will be created by using the parent RecordType. Example: The +registered identifiable is defined for the parent "Experiment" and the RecordType at hand "LaseExperiment" is a child of "Experiment". +Then the identifiable will construct a query that searches for "Experiment" Records (and not "LaseExperiment" Records). 
Identified Records diff --git a/unittests/test_identifiable_adapters.py b/unittests/test_identifiable_adapters.py index 57a1c856..8fbbf07e 100644 --- a/unittests/test_identifiable_adapters.py +++ b/unittests/test_identifiable_adapters.py @@ -54,6 +54,7 @@ def mock_retrieve_RecordType(id, name): "Experiment": db.RecordType(name="Experiment"), "Lab": db.RecordType(name="Lab"), "Analysis": db.RecordType(name="Analysis"), + "MetaAnalysis": db.RecordType(name="MetaAnalysis").add_parent("Analysis"), "Measurement": db.RecordType(name="Measurement").add_parent("Experiment") }[name] @@ -322,8 +323,8 @@ def test_get_registered_identifiable(): with pytest.raises(RuntimeError): registered = ident.get_registered_identifiable(rec) - # Test the same but with an additional parent that also has a registered identifiable - rec = db.Record().add_parent(name="Measurement").add_parent(name="Experiment") + # Test the same but with an additional parent that has a parent with a registered identifiable + rec = db.Record().add_parent(name="MetaAnalysis").add_parent(name="Experiment") with pytest.raises(RuntimeError): registered = ident.get_registered_identifiable(rec) -- GitLab From 8367763330eb6fd6e333b9f0d4132bb98af090a2 Mon Sep 17 00:00:00 2001 From: Florian Spreckelsen <f.spreckelsen@indiscale.com> Date: Mon, 13 Jan 2025 15:05:42 +0100 Subject: [PATCH 101/131] FIX: Use correct package names in tox.ini --- tox.ini | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tox.ini b/tox.ini index 65c7549d..d44fbb6d 100644 --- a/tox.ini +++ b/tox.ini @@ -6,8 +6,8 @@ skip_missing_interpreters = true deps = .[h5-crawler,spss,rocrate] pytest pytest-cov - caosdb-pylib - caosdb-advanced-user-tools + linkahead + caosadvancedtools commands = caosdb-crawler --help py.test --cov=caoscrawler -vv {posargs} -- GitLab From d4fd36bb7586c8f2e00cc4343182d24f222e215c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Henrik=20tom=20W=C3=B6rden?= <h.tomwoerden@indiscale.com> Date: Mon, 13 Jan 2025 15:12:57 
+0100 Subject: [PATCH 102/131] DOC: cl --- CHANGELOG.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index e0589ec4..34ce1852 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -36,6 +36,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 their contents were last modified before that datetime. ### Changed ### +- Registered identifiables can also be used by children of the given RecordType + if no registered identifiable is defined for them. ### Deprecated ### -- GitLab From 8b1b474a37fd81d588c692598525d6870607bcc0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Henrik=20tom=20W=C3=B6rden?= <h.tomwoerden@indiscale.com> Date: Mon, 13 Jan 2025 15:20:14 +0100 Subject: [PATCH 103/131] TST: mock more unittests --- unittests/test_crawler.py | 9 ++++++--- unittests/test_sync_graph.py | 15 ++++++++++----- 2 files changed, 16 insertions(+), 8 deletions(-) diff --git a/unittests/test_crawler.py b/unittests/test_crawler.py index ad69c6f5..7b19e9ae 100644 --- a/unittests/test_crawler.py +++ b/unittests/test_crawler.py @@ -399,7 +399,8 @@ def test_split_into_inserts_and_updates_with_complex(crawler_mocked_identifiable # TODO write test where the unresoled entity is not part of the identifiable - +@patch("caoscrawler.identifiable_adapters._retrieve_RecordType", + new=Mock(side_effect=lambda id, name: db.RecordType(id=id,name=name))) @patch("caoscrawler.crawl.cached_get_entity_by", new=Mock(side_effect=mock_get_entity_by)) @patch("caoscrawler.identifiable_adapters.cached_query", @@ -594,7 +595,8 @@ def test_replace_entities_with_ids(): assert a.get_property("B").value == 12345 assert a.get_property("C").value == [12345, 233324] - +@patch("caoscrawler.identifiable_adapters._retrieve_RecordType", + new=Mock(side_effect=lambda id, name: db.RecordType(id=id,name=name))) @patch("caoscrawler.crawl.cached_get_entity_by", new=Mock(side_effect=mock_get_entity_by)) @patch("caoscrawler.identifiable_adapters.cached_get_entity_by", @@ 
-619,7 +621,8 @@ def test_synchronization_no_commit(upmock, insmock): assert len(ins) == 1 assert len(ups) == 1 - +@patch("caoscrawler.identifiable_adapters._retrieve_RecordType", + new=Mock(side_effect=lambda id, name: db.RecordType(id=id,name=name))) @patch("caoscrawler.crawl.cached_get_entity_by", new=Mock(side_effect=mock_get_entity_by)) @patch("caoscrawler.identifiable_adapters.cached_get_entity_by", diff --git a/unittests/test_sync_graph.py b/unittests/test_sync_graph.py index 06f0dfb9..3c838d57 100644 --- a/unittests/test_sync_graph.py +++ b/unittests/test_sync_graph.py @@ -139,7 +139,8 @@ def test_create_reference_mapping(): assert forward_references_backref[id(b)] == set([a]) assert backward_references_backref[id(b)] == set() - +@patch("caoscrawler.identifiable_adapters._retrieve_RecordType", + new=Mock(side_effect=lambda id, name: db.RecordType(id=id,name=name))) @patch("caoscrawler.sync_graph.cached_get_entity_by", new=Mock(side_effect=mock_get_entity_by)) def test_SyncGraph_init(): @@ -203,7 +204,8 @@ def test_SyncGraph_init(): if not st._identity_relies_on_unchecked_entity(el): assert el.identifiable is not None or el.id is not None - +@patch("caoscrawler.identifiable_adapters._retrieve_RecordType", + new=Mock(side_effect=lambda id, name: db.RecordType(id=id,name=name))) @patch("caoscrawler.identifiable_adapters.get_children_of_rt", new=Mock(side_effect=lambda x: [x])) def test_merge_into_trivial(simple_adapter): @@ -281,7 +283,8 @@ def test_merge_into_trivial(simple_adapter): assert len(st.backward_references_backref[id(se_c)]) == 1 assert se_b in st.backward_references_backref[id(se_c)] - +@patch("caoscrawler.identifiable_adapters._retrieve_RecordType", + new=Mock(side_effect=lambda id, name: db.RecordType(id=id,name=name))) @patch("caoscrawler.identifiable_adapters.get_children_of_rt", new=Mock(side_effect=lambda x: [x])) def test_merge_into_simple(simple_adapter): @@ -362,7 +365,8 @@ def test_merge_into_simple(simple_adapter): assert 
len(st.backward_references_backref[id(se_c)]) == 1 se_b in st.backward_references_backref[id(se_c)] - +@patch("caoscrawler.identifiable_adapters._retrieve_RecordType", + new=Mock(side_effect=lambda id, name: db.RecordType(id=id,name=name))) @patch("caoscrawler.identifiable_adapters.get_children_of_rt", new=Mock(side_effect=lambda x: [x])) def test_backward_references_backref(): @@ -483,7 +487,8 @@ def test_set_id_of_node(simple_adapter): assert st.nodes[0].identifiable is not None assert len(st.nodes) == 2 - +@patch("caoscrawler.identifiable_adapters._retrieve_RecordType", + new=Mock(side_effect=lambda id, name: db.RecordType(id=id,name=name))) @patch("caoscrawler.sync_graph.cached_get_entity_by", new=Mock(side_effect=mock_get_entity_by)) def test_merging(simple_adapter): -- GitLab From f9796f8f58247d721ad671236bfab8f0e4bd0623 Mon Sep 17 00:00:00 2001 From: Florian Spreckelsen <f.spreckelsen@indiscale.com> Date: Mon, 13 Jan 2025 15:20:17 +0100 Subject: [PATCH 104/131] MAINT: Use cov in make unittest --- .gitlab-ci.yml | 2 +- Makefile | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 782768dd..f295e0b9 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -151,7 +151,7 @@ unittest_py3.9: - python3 -c "import sys; assert sys.version.startswith('$PYVER')" # actual test - caosdb-crawler --help - - pytest --cov=caosdb -vv ./unittests + - make unittest unittest_py3.10: variables: diff --git a/Makefile b/Makefile index 1ba97ae3..d9b1354d 100644 --- a/Makefile +++ b/Makefile @@ -44,5 +44,5 @@ lint: .PHONY: lint unittest: - pytest + pytest --cov=caosdb -vv ./unittests .PHONY: unittest -- GitLab From ce66dcc26ea64de676da0e5bf739aed47336daba Mon Sep 17 00:00:00 2001 From: Florian Spreckelsen <f.spreckelsen@indiscale.com> Date: Mon, 13 Jan 2025 15:24:17 +0100 Subject: [PATCH 105/131] MAINT: Use correct module for cov --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 
d9b1354d..7167ebfd 100644 --- a/Makefile +++ b/Makefile @@ -44,5 +44,5 @@ lint: .PHONY: lint unittest: - pytest --cov=caosdb -vv ./unittests + pytest --cov=caoscrawler -vv ./unittests .PHONY: unittest -- GitLab From 009e2c07625fb5002def297f126ed6df81b91b37 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Henrik=20tom=20W=C3=B6rden?= <h.tomwoerden@indiscale.com> Date: Mon, 13 Jan 2025 15:26:31 +0100 Subject: [PATCH 106/131] TST: mock more unittests --- unittests/test_crawler.py | 8 ++++++-- unittests/test_sync_graph.py | 12 ++++++++---- 2 files changed, 14 insertions(+), 6 deletions(-) diff --git a/unittests/test_crawler.py b/unittests/test_crawler.py index 7b19e9ae..def00e41 100644 --- a/unittests/test_crawler.py +++ b/unittests/test_crawler.py @@ -371,7 +371,8 @@ def test_split_into_inserts_and_updates_with_circ(crawler_mocked_identifiable_re with pytest.raises(RuntimeError): crawler._split_into_inserts_and_updates(st) - +@patch("caoscrawler.identifiable_adapters._retrieve_RecordType", + new=Mock(side_effect=lambda id, name: db.RecordType(id=id,name=name))) def test_split_into_inserts_and_updates_with_complex(crawler_mocked_identifiable_retrieve): crawler = crawler_mocked_identifiable_retrieve # A @@ -583,7 +584,8 @@ def test_split_into_inserts_and_updates_diff_backref(crawler_mocked_for_backref_ assert len(update) == 2 assert len(insert) == 1 - +@patch("caoscrawler.identifiable_adapters._retrieve_RecordType", + new=Mock(side_effect=lambda id, name: db.RecordType(id=id,name=name))) def test_replace_entities_with_ids(): crawler = Crawler() a = (db.Record().add_parent("B").add_property("A", 12345) @@ -595,6 +597,7 @@ def test_replace_entities_with_ids(): assert a.get_property("B").value == 12345 assert a.get_property("C").value == [12345, 233324] + @patch("caoscrawler.identifiable_adapters._retrieve_RecordType", new=Mock(side_effect=lambda id, name: db.RecordType(id=id,name=name))) @patch("caoscrawler.crawl.cached_get_entity_by", @@ -621,6 +624,7 @@ def 
test_synchronization_no_commit(upmock, insmock): assert len(ins) == 1 assert len(ups) == 1 + @patch("caoscrawler.identifiable_adapters._retrieve_RecordType", new=Mock(side_effect=lambda id, name: db.RecordType(id=id,name=name))) @patch("caoscrawler.crawl.cached_get_entity_by", diff --git a/unittests/test_sync_graph.py b/unittests/test_sync_graph.py index 3c838d57..85979a58 100644 --- a/unittests/test_sync_graph.py +++ b/unittests/test_sync_graph.py @@ -384,7 +384,8 @@ def test_backward_references_backref(): st = SyncGraph(ent_list, ident_adapter) assert st.nodes[1] in st.backward_references_backref[id(st.nodes[0])] - +@patch("caoscrawler.identifiable_adapters._retrieve_RecordType", + new=Mock(side_effect=lambda id, name: db.RecordType(id=id,name=name))) @patch("caoscrawler.identifiable_adapters.get_children_of_rt", new=Mock(side_effect=lambda x: [x])) def test_set_id_of_node(simple_adapter): @@ -573,7 +574,8 @@ def test_merging(simple_adapter): assert len(st.nodes) == 4 assert len(st.unchecked) == 0 - +@patch("caoscrawler.identifiable_adapters._retrieve_RecordType", + new=Mock(side_effect=lambda id, name: db.RecordType(id=id,name=name))) def test_update_of_reference_values(simple_adapter): # multiple nodes are merged including one that is referenced # assure that this still leads to the value of the property of the referencing node to be @@ -596,7 +598,8 @@ def test_update_of_reference_values(simple_adapter): b_prop = st.nodes[1].properties[0].value assert b_prop.id == 101 - +@patch("caoscrawler.identifiable_adapters._retrieve_RecordType", + new=Mock(side_effect=lambda id, name: db.RecordType(id=id,name=name))) def test_ignoring_irrelevant_references(simple_adapter): # make sure that a circle of references is no problem if one references is not identifying b = db.Record(name='b').add_parent("RT5") @@ -657,7 +660,8 @@ def test_set_each_scalar_value(): _set_each_scalar_value(a, lambda x: x == 42, lambda x: None) assert a.properties[0].value is None - 
+@patch("caoscrawler.identifiable_adapters._retrieve_RecordType", + new=Mock(side_effect=lambda id, name: db.RecordType(id=id,name=name))) @patch("caoscrawler.identifiable_adapters.cached_query", new=Mock(side_effect=mock_cached_only_rt_allow_empty)) def test_merge_referenced_by(): -- GitLab From d83c300899a93d7f9399de77950fa373b3f99c3b Mon Sep 17 00:00:00 2001 From: Florian Spreckelsen <f.spreckelsen@indiscale.com> Date: Mon, 13 Jan 2025 17:35:43 +0100 Subject: [PATCH 107/131] STY: autopep8'd --- src/caoscrawler/identifiable_adapters.py | 9 +++------ unittests/test_crawler.py | 23 +++++++++++++---------- 2 files changed, 16 insertions(+), 16 deletions(-) diff --git a/src/caoscrawler/identifiable_adapters.py b/src/caoscrawler/identifiable_adapters.py index 98281c4d..6169a99e 100644 --- a/src/caoscrawler/identifiable_adapters.py +++ b/src/caoscrawler/identifiable_adapters.py @@ -45,12 +45,13 @@ from .utils import has_parent logger = logging.getLogger(__name__) -def _retrieve_RecordType(id=None, name =None): +def _retrieve_RecordType(id=None, name=None): """ Retrieve the RecordType from LinkAhead. For mocking purposes. """ return db.RecordType(name=name, id=id).retrieve() + def get_children_of_rt(rtname): """Supply the name of a recordtype. 
This name and the name of all children RTs are returned in a list""" @@ -658,8 +659,7 @@ class CaosDBIdentifiableAdapter(IdentifiableAdapter): return registered[0] else: return None - - + def _get_registered_for_rt(self, rt: db.RecordType): """ returns the registered identifiable for the given RecordType or the @@ -681,9 +681,6 @@ class CaosDBIdentifiableAdapter(IdentifiableAdapter): return registered[0] else: return None - - - def retrieve_identified_record_for_identifiable(self, identifiable: Identifiable): query_string = self.create_query_for_identifiable(identifiable) diff --git a/unittests/test_crawler.py b/unittests/test_crawler.py index def00e41..bdb22ba2 100644 --- a/unittests/test_crawler.py +++ b/unittests/test_crawler.py @@ -371,8 +371,9 @@ def test_split_into_inserts_and_updates_with_circ(crawler_mocked_identifiable_re with pytest.raises(RuntimeError): crawler._split_into_inserts_and_updates(st) -@patch("caoscrawler.identifiable_adapters._retrieve_RecordType", - new=Mock(side_effect=lambda id, name: db.RecordType(id=id,name=name))) + +@patch("caoscrawler.identifiable_adapters._retrieve_RecordType", + new=Mock(side_effect=lambda id, name: db.RecordType(id=id, name=name))) def test_split_into_inserts_and_updates_with_complex(crawler_mocked_identifiable_retrieve): crawler = crawler_mocked_identifiable_retrieve # A @@ -400,8 +401,9 @@ def test_split_into_inserts_and_updates_with_complex(crawler_mocked_identifiable # TODO write test where the unresoled entity is not part of the identifiable -@patch("caoscrawler.identifiable_adapters._retrieve_RecordType", - new=Mock(side_effect=lambda id, name: db.RecordType(id=id,name=name))) + +@patch("caoscrawler.identifiable_adapters._retrieve_RecordType", + new=Mock(side_effect=lambda id, name: db.RecordType(id=id, name=name))) @patch("caoscrawler.crawl.cached_get_entity_by", new=Mock(side_effect=mock_get_entity_by)) @patch("caoscrawler.identifiable_adapters.cached_query", @@ -584,8 +586,9 @@ def 
test_split_into_inserts_and_updates_diff_backref(crawler_mocked_for_backref_ assert len(update) == 2 assert len(insert) == 1 -@patch("caoscrawler.identifiable_adapters._retrieve_RecordType", - new=Mock(side_effect=lambda id, name: db.RecordType(id=id,name=name))) + +@patch("caoscrawler.identifiable_adapters._retrieve_RecordType", + new=Mock(side_effect=lambda id, name: db.RecordType(id=id, name=name))) def test_replace_entities_with_ids(): crawler = Crawler() a = (db.Record().add_parent("B").add_property("A", 12345) @@ -598,8 +601,8 @@ def test_replace_entities_with_ids(): assert a.get_property("C").value == [12345, 233324] -@patch("caoscrawler.identifiable_adapters._retrieve_RecordType", - new=Mock(side_effect=lambda id, name: db.RecordType(id=id,name=name))) +@patch("caoscrawler.identifiable_adapters._retrieve_RecordType", + new=Mock(side_effect=lambda id, name: db.RecordType(id=id, name=name))) @patch("caoscrawler.crawl.cached_get_entity_by", new=Mock(side_effect=mock_get_entity_by)) @patch("caoscrawler.identifiable_adapters.cached_get_entity_by", @@ -625,8 +628,8 @@ def test_synchronization_no_commit(upmock, insmock): assert len(ups) == 1 -@patch("caoscrawler.identifiable_adapters._retrieve_RecordType", - new=Mock(side_effect=lambda id, name: db.RecordType(id=id,name=name))) +@patch("caoscrawler.identifiable_adapters._retrieve_RecordType", + new=Mock(side_effect=lambda id, name: db.RecordType(id=id, name=name))) @patch("caoscrawler.crawl.cached_get_entity_by", new=Mock(side_effect=mock_get_entity_by)) @patch("caoscrawler.identifiable_adapters.cached_get_entity_by", -- GitLab From 6bb6e88151111ced0a405bc3483eeac2fa6d5161 Mon Sep 17 00:00:00 2001 From: Florian Spreckelsen <f.spreckelsen@indiscale.com> Date: Mon, 13 Jan 2025 17:39:43 +0100 Subject: [PATCH 108/131] STY: autopep8'd --- unittests/test_identifiable_adapters.py | 14 ++++---- unittests/test_sync_graph.py | 45 +++++++++++++++---------- 2 files changed, 35 insertions(+), 24 deletions(-) diff --git 
a/unittests/test_identifiable_adapters.py b/unittests/test_identifiable_adapters.py index 8fbbf07e..5108e83c 100644 --- a/unittests/test_identifiable_adapters.py +++ b/unittests/test_identifiable_adapters.py @@ -44,7 +44,6 @@ from caoscrawler.sync_graph import SyncNode UNITTESTDIR = Path(__file__).parent - def mock_retrieve_RecordType(id, name): return { "Person": db.RecordType(name="Person"), @@ -58,6 +57,7 @@ def mock_retrieve_RecordType(id, name): "Measurement": db.RecordType(name="Measurement").add_parent("Experiment") }[name] + def test_create_query_for_identifiable(): query = IdentifiableAdapter.create_query_for_identifiable( Identifiable(record_type="Person", properties={"first_name": "A", "last_name": "B"})) @@ -113,7 +113,8 @@ def test_create_query_for_identifiable(): Identifiable(record_type="record type", name="it's weird")) assert query == ("FIND RECORD 'record type' WITH name='it\\'s weird'") -@patch("caoscrawler.identifiable_adapters._retrieve_RecordType", + +@patch("caoscrawler.identifiable_adapters._retrieve_RecordType", new=Mock(side_effect=mock_retrieve_RecordType)) def test_load_from_yaml_file(): ident = CaosDBIdentifiableAdapter() @@ -189,7 +190,8 @@ def test_convert_value(): assert convert_value(A()) == " a " -@patch("caoscrawler.identifiable_adapters._retrieve_RecordType", + +@patch("caoscrawler.identifiable_adapters._retrieve_RecordType", new=Mock(side_effect=mock_retrieve_RecordType)) def test_get_identifiable(): ident = CaosDBIdentifiableAdapter() @@ -300,7 +302,8 @@ def test_referencing_entity_has_appropriate_type(): registered_identifiable.properties[0].value = ["B", "*"] assert rft(dummy.parents, registered_identifiable) -@patch("caoscrawler.identifiable_adapters._retrieve_RecordType", + +@patch("caoscrawler.identifiable_adapters._retrieve_RecordType", new=Mock(side_effect=mock_retrieve_RecordType)) def test_get_registered_identifiable(): # Test the case that the record has a parent for which an identifiable is registered @@ -317,7 
+320,6 @@ def test_get_registered_identifiable(): assert registered is not None assert registered.parents[0].name == "Experiment" - # Test the same but with an additional parent that also has a registered identifiable rec = db.Record().add_parent(name="Analysis").add_parent(name="Experiment") with pytest.raises(RuntimeError): @@ -335,4 +337,4 @@ def test_get_registered_identifiable(): rec = db.Record().add_parent(name="Measurement") registered = ident.get_registered_identifiable(rec) assert registered is not None - assert registered.parents[0].name == "Experiment" \ No newline at end of file + assert registered.parents[0].name == "Experiment" diff --git a/unittests/test_sync_graph.py b/unittests/test_sync_graph.py index 85979a58..030306b9 100644 --- a/unittests/test_sync_graph.py +++ b/unittests/test_sync_graph.py @@ -139,8 +139,9 @@ def test_create_reference_mapping(): assert forward_references_backref[id(b)] == set([a]) assert backward_references_backref[id(b)] == set() -@patch("caoscrawler.identifiable_adapters._retrieve_RecordType", - new=Mock(side_effect=lambda id, name: db.RecordType(id=id,name=name))) + +@patch("caoscrawler.identifiable_adapters._retrieve_RecordType", + new=Mock(side_effect=lambda id, name: db.RecordType(id=id, name=name))) @patch("caoscrawler.sync_graph.cached_get_entity_by", new=Mock(side_effect=mock_get_entity_by)) def test_SyncGraph_init(): @@ -204,8 +205,9 @@ def test_SyncGraph_init(): if not st._identity_relies_on_unchecked_entity(el): assert el.identifiable is not None or el.id is not None -@patch("caoscrawler.identifiable_adapters._retrieve_RecordType", - new=Mock(side_effect=lambda id, name: db.RecordType(id=id,name=name))) + +@patch("caoscrawler.identifiable_adapters._retrieve_RecordType", + new=Mock(side_effect=lambda id, name: db.RecordType(id=id, name=name))) @patch("caoscrawler.identifiable_adapters.get_children_of_rt", new=Mock(side_effect=lambda x: [x])) def test_merge_into_trivial(simple_adapter): @@ -283,8 +285,9 @@ def 
test_merge_into_trivial(simple_adapter): assert len(st.backward_references_backref[id(se_c)]) == 1 assert se_b in st.backward_references_backref[id(se_c)] -@patch("caoscrawler.identifiable_adapters._retrieve_RecordType", - new=Mock(side_effect=lambda id, name: db.RecordType(id=id,name=name))) + +@patch("caoscrawler.identifiable_adapters._retrieve_RecordType", + new=Mock(side_effect=lambda id, name: db.RecordType(id=id, name=name))) @patch("caoscrawler.identifiable_adapters.get_children_of_rt", new=Mock(side_effect=lambda x: [x])) def test_merge_into_simple(simple_adapter): @@ -365,8 +368,9 @@ def test_merge_into_simple(simple_adapter): assert len(st.backward_references_backref[id(se_c)]) == 1 se_b in st.backward_references_backref[id(se_c)] -@patch("caoscrawler.identifiable_adapters._retrieve_RecordType", - new=Mock(side_effect=lambda id, name: db.RecordType(id=id,name=name))) + +@patch("caoscrawler.identifiable_adapters._retrieve_RecordType", + new=Mock(side_effect=lambda id, name: db.RecordType(id=id, name=name))) @patch("caoscrawler.identifiable_adapters.get_children_of_rt", new=Mock(side_effect=lambda x: [x])) def test_backward_references_backref(): @@ -384,8 +388,9 @@ def test_backward_references_backref(): st = SyncGraph(ent_list, ident_adapter) assert st.nodes[1] in st.backward_references_backref[id(st.nodes[0])] -@patch("caoscrawler.identifiable_adapters._retrieve_RecordType", - new=Mock(side_effect=lambda id, name: db.RecordType(id=id,name=name))) + +@patch("caoscrawler.identifiable_adapters._retrieve_RecordType", + new=Mock(side_effect=lambda id, name: db.RecordType(id=id, name=name))) @patch("caoscrawler.identifiable_adapters.get_children_of_rt", new=Mock(side_effect=lambda x: [x])) def test_set_id_of_node(simple_adapter): @@ -488,8 +493,9 @@ def test_set_id_of_node(simple_adapter): assert st.nodes[0].identifiable is not None assert len(st.nodes) == 2 -@patch("caoscrawler.identifiable_adapters._retrieve_RecordType", - new=Mock(side_effect=lambda id, 
name: db.RecordType(id=id,name=name))) + +@patch("caoscrawler.identifiable_adapters._retrieve_RecordType", + new=Mock(side_effect=lambda id, name: db.RecordType(id=id, name=name))) @patch("caoscrawler.sync_graph.cached_get_entity_by", new=Mock(side_effect=mock_get_entity_by)) def test_merging(simple_adapter): @@ -574,8 +580,9 @@ def test_merging(simple_adapter): assert len(st.nodes) == 4 assert len(st.unchecked) == 0 -@patch("caoscrawler.identifiable_adapters._retrieve_RecordType", - new=Mock(side_effect=lambda id, name: db.RecordType(id=id,name=name))) + +@patch("caoscrawler.identifiable_adapters._retrieve_RecordType", + new=Mock(side_effect=lambda id, name: db.RecordType(id=id, name=name))) def test_update_of_reference_values(simple_adapter): # multiple nodes are merged including one that is referenced # assure that this still leads to the value of the property of the referencing node to be @@ -598,8 +605,9 @@ def test_update_of_reference_values(simple_adapter): b_prop = st.nodes[1].properties[0].value assert b_prop.id == 101 -@patch("caoscrawler.identifiable_adapters._retrieve_RecordType", - new=Mock(side_effect=lambda id, name: db.RecordType(id=id,name=name))) + +@patch("caoscrawler.identifiable_adapters._retrieve_RecordType", + new=Mock(side_effect=lambda id, name: db.RecordType(id=id, name=name))) def test_ignoring_irrelevant_references(simple_adapter): # make sure that a circle of references is no problem if one references is not identifying b = db.Record(name='b').add_parent("RT5") @@ -660,8 +668,9 @@ def test_set_each_scalar_value(): _set_each_scalar_value(a, lambda x: x == 42, lambda x: None) assert a.properties[0].value is None -@patch("caoscrawler.identifiable_adapters._retrieve_RecordType", - new=Mock(side_effect=lambda id, name: db.RecordType(id=id,name=name))) + +@patch("caoscrawler.identifiable_adapters._retrieve_RecordType", + new=Mock(side_effect=lambda id, name: db.RecordType(id=id, name=name))) 
@patch("caoscrawler.identifiable_adapters.cached_query", new=Mock(side_effect=mock_cached_only_rt_allow_empty)) def test_merge_referenced_by(): -- GitLab From 0cb1f314e0de8122bc9c4bb8024aff14886550fe Mon Sep 17 00:00:00 2001 From: Florian Spreckelsen <f.spreckelsen@indiscale.com> Date: Mon, 13 Jan 2025 18:01:08 +0100 Subject: [PATCH 109/131] DOC: Improve explanations on new identifiable behavior --- src/doc/concepts.rst | 48 +++++++++++++--------- src/doc/converters/standard_converters.rst | 2 +- 2 files changed, 30 insertions(+), 20 deletions(-) diff --git a/src/doc/concepts.rst b/src/doc/concepts.rst index 15b7812f..e1cbb10e 100644 --- a/src/doc/concepts.rst +++ b/src/doc/concepts.rst @@ -95,25 +95,35 @@ RecordType name in the configuration which will only require, that ANY Record references the Record at hand. -Instead of defining registered identifiables for a RecordType directly, they can be -defined for their parents. I.e. if there is no registered identifiable for a RecordType, -then it will be checked whether there is a parent that has one. -If multiple recordtypes exist in the inheritance chain with a registered identifiable, then -the one that is closest to the direct parent is used. In case of multiple inheritance, only one branch must have registered identifiables. - -Reasoning: -If there would be mutliple registered identifiables that could be used to identify a given record and only a single -one of them would used, it might be that the existence check returns a different result than if the other one would -be used. This would allow for unpredictable and inconsistent behavior (Example: one registered identifiable -contains the name another one property date. Using the name might imply that the record does not exist and using -the date might imply that it does. Thus, for any Record the registered identifiable must be unique). 
-Anlogous Example: If you think in the context, of relational databases, there can always only be a foreign key -associated with one table. - -Note: -In case of using the registered identifiable of a parent, the identifiable will be created by using the parent RecordType. Example: The -registered identifiable is defined for the parent "Experiment" and the RecordType at hand "LaseExperiment" is a child of "Experiment". -Then the identifiable will construct a query that searches for "Experiment" Records (and not "LaseExperiment" Records). +Instead of defining registered identifiables for a RecordType +directly, they can be defined for their parents. I.e., if there is no +registered identifiable for a RecordType, then it will be checked +whether there is a parent that has one. If multiple recordtypes exist +in the inheritance chain with a registered identifiable, then the one +that is closest to the direct parent is used. In case of multiple +inheritance, only one branch may have registered identifiables. + +The reason for this behavior is the following. If there were +multiple registered identifiables that could be used to identify a +given record and only a single one of them were used, it might be +that the existence check returns a different result than if the other +one were used. This would allow for unpredictable and inconsistent +behavior (Example: one registered identifiable contains the name, +another one the property date. Using the name might imply that the record +does not exist and using the date might imply that it does. Thus, for +any Record the registered identifiable must be unique). Analogous +Example: If you think in the context of relational databases, there +can always only be a foreign key associated with one table. + +.. note:: + + In case of using the registered identifiable of a parent, the + identifiable will be created by using the parent + RecordType.
Example: The registered identifiable is defined for the + parent "Experiment" and the RecordType at hand "LaseExperiment" is + a child of "Experiment". Then the identifiable will construct a + query that searches for "Experiment" Records (and not + "LaseExperiment" Records). Identified Records diff --git a/src/doc/converters/standard_converters.rst b/src/doc/converters/standard_converters.rst index 5f86abb5..96b089f2 100644 --- a/src/doc/converters/standard_converters.rst +++ b/src/doc/converters/standard_converters.rst @@ -361,7 +361,7 @@ The XMLTagConverter is a generic converter for XMLElements with the following ma no converter implemented that can match XMLAttributeNodes. Namespaces -********** +.......... The default is to take the namespace map from the current node and use it in xpath queries. Because default namespaces cannot be handled by -- GitLab From 224730bc32436e2ee08916777a565b0edcb5ce52 Mon Sep 17 00:00:00 2001 From: Alexander Schlemmer <a.schlemmer@indiscale.com> Date: Tue, 14 Jan 2025 10:43:33 +0100 Subject: [PATCH 110/131] DOC: added documentaiton on how to use custom transformers --- src/doc/converters/transform_functions.rst | 85 ++++++++++++++++++++++ 1 file changed, 85 insertions(+) diff --git a/src/doc/converters/transform_functions.rst b/src/doc/converters/transform_functions.rst index ecd47d2d..3021b47d 100644 --- a/src/doc/converters/transform_functions.rst +++ b/src/doc/converters/transform_functions.rst @@ -70,3 +70,88 @@ the usual ``$`` notation: There are a number of transform functions that are defined by default (see ``src/caoscrawler/default_transformers.yml``). You can define custom transform functions by adding them to the cfood definition (see :doc:`CFood Documentation<../cfood>`). + + +Custom Transformers +=================== + +Custom transformers are basically python functions having a special form/signature. They need to +be registered in the cfood definition in order to be available during the scanning process. 
+ +Let's assume we want to implement a transformer that replaces all occurrences of single letters +in the value of a variable with a different letter each. So passing "abc" as `in_letters` and +"xyz" as `out_letters` would result in a replacement of a value of "scan started" to +"szxn stxrted". We could implement this in python using the +following code: + +.. code-block:: python + + def replace_letters(in_value: Any, in_parameters: dict) -> Any: + """ + Replace letters in variables + """ + + # The arguments to the transformer (as given by the definition in the cfood) + # are contained in `in_parameters`. We need to make sure they are set or + # set their defaults otherwise: + + if "in_letter" not in in_parameters: + raise RuntimeError("Parameter `in_letters` missing.") + + if "out_letter" not in in_parameters: + raise RuntimeError("Parameter `out_letters` missing.") + + l_in = in_parameters["in_letters"] + l_out = in_parameters["out_letters"] + + + if len(l_in) != len(l_out): + raise RuntimeError("`in_letters` and `out_letters` must have the same length.") + + for l1, l2 in zip(l_in, l_out): + in_value = in_value.replace(l1, l2) + + return in_value + + +This code needs to be put into a module that can be found during runtime of the crawler. +One possibility is to install the package into the same virtual environment that is used +to run the crawler. + +In the cfood the transfomer needs to be registered: + +.. code-block:: yaml + + --- + metadata: + crawler-version: 0.10.2 + macros: + --- + Converters: # put custom converters here + Transformers: + replace_letters: # This name will be made available in the cfood + function: replace_letters + package: utilities.replace_letters + +This would assume that the code for the function `replace_letters` is residing in a file +called `replace_letters.py` that is stored in a package called `utilities`. + +The transformer can then be used in a converter, e.g.: + + +.. 
code-block:: yaml + + Experiment: + type: Dict + match: ".*" + transform: + replace_letters: + in: $a + out: $b + functions: + - replace_letters: # This is the name of our custom transformer + in_letters: "abc" + out_letters: "xyz" + records: + Report: + tags: $b -- GitLab From ce3844f26e4e78f15eec44e801f2cc53ba3b8ff7 Mon Sep 17 00:00:00 2001 From: Alexander Schlemmer <a.schlemmer@indiscale.com> Date: Tue, 14 Jan 2025 10:43:33 +0100 Subject: [PATCH 111/131] DOC: added documentaiton on how to use custom transformers --- src/doc/converters/transform_functions.rst | 85 ++++++++++++++++++++++ 1 file changed, 85 insertions(+) diff --git a/src/doc/converters/transform_functions.rst b/src/doc/converters/transform_functions.rst index ecd47d2d..3021b47d 100644 --- a/src/doc/converters/transform_functions.rst +++ b/src/doc/converters/transform_functions.rst @@ -70,3 +70,88 @@ the usual ``$`` notation: There are a number of transform functions that are defined by default (see ``src/caoscrawler/default_transformers.yml``). You can define custom transform functions by adding them to the cfood definition (see :doc:`CFood Documentation<../cfood>`). + + +Custom Transformers +=================== + +Custom transformers are basically python functions having a special form/signature. They need to +be registered in the cfood definition in order to be available during the scanning process. + +Let's assume we want to implement a transformer that replaces all occurrences of single letters +in the value of a variable with a different letter each. So passing "abc" as `in_letters` and +"xyz" as `out_letters` would result in a replacement of a value of "scan started" to +"szxn stxrted". We could implement this in python using the +following code: + +.. 
code-block:: python + + def replace_letters(in_value: Any, in_parameters: dict) -> Any: + """ + Replace letters in variables + """ + + # The arguments to the transformer (as given by the definition in the cfood) + # are contained in `in_parameters`. We need to make sure they are set or + # set their defaults otherwise: + + if "in_letter" not in in_parameters: + raise RuntimeError("Parameter `in_letters` missing.") + + if "out_letter" not in in_parameters: + raise RuntimeError("Parameter `out_letters` missing.") + + l_in = in_parameters["in_letters"] + l_out = in_parameters["out_letters"] + + + if len(l_in) != len(l_out): + raise RuntimeError("`in_letters` and `out_letters` must have the same length.") + + for l1, l2 in zip(l_in, l_out): + in_value = in_value.replace(l1, l2) + + return in_value + + +This code needs to be put into a module that can be found during runtime of the crawler. +One possibility is to install the package into the same virtual environment that is used +to run the crawler. + +In the cfood the transfomer needs to be registered: + +.. code-block:: yaml + + --- + metadata: + crawler-version: 0.10.2 + macros: + --- + Converters: # put custom converters here + Transformers: + replace_letters: # This name will be made available in the cfood + function: replace_letters + package: utilities.replace_letters + +This would assume that the code for the function `replace_letters` is residing in a file +called `replace_letters.py` that is stored in a package called `utilities`. + +The transformer can then be used in a converter, e.g.: + + +.. 
code-block:: yaml + + Experiment: + type: Dict + match: ".*" + transform: + replace_letters: + in: $a + out: $b + functions: + - replace_letters: # This is the name of our custom transformer + in_letters: "abc" + out_letters: "xyz" + records: + Report: + tags: $b -- GitLab From 624610c9b601959b1c36ac0e76865a5ed65cc10d Mon Sep 17 00:00:00 2001 From: Alexander Schlemmer <a.schlemmer@indiscale.com> Date: Tue, 14 Jan 2025 10:46:28 +0100 Subject: [PATCH 112/131] Revert "DOC: added documentaiton on how to use custom transformers" This reverts commit 224730bc32436e2ee08916777a565b0edcb5ce52. --- src/doc/converters/transform_functions.rst | 85 ---------------------- 1 file changed, 85 deletions(-) diff --git a/src/doc/converters/transform_functions.rst b/src/doc/converters/transform_functions.rst index 3021b47d..ecd47d2d 100644 --- a/src/doc/converters/transform_functions.rst +++ b/src/doc/converters/transform_functions.rst @@ -70,88 +70,3 @@ the usual ``$`` notation: There are a number of transform functions that are defined by default (see ``src/caoscrawler/default_transformers.yml``). You can define custom transform functions by adding them to the cfood definition (see :doc:`CFood Documentation<../cfood>`). - - -Custom Transformers -=================== - -Custom transformers are basically python functions having a special form/signature. They need to -be registered in the cfood definition in order to be available during the scanning process. - -Let's assume we want to implement a transformer that replaces all occurrences of single letters -in the value of a variable with a different letter each. So passing "abc" as `in_letters` and -"xyz" as `out_letters` would result in a replacement of a value of "scan started" to -"szxn stxrted". We could implement this in python using the -following code: - -.. 
code-block:: python - - def replace_letters(in_value: Any, in_parameters: dict) -> Any: - """ - Replace letters in variables - """ - - # The arguments to the transformer (as given by the definition in the cfood) - # are contained in `in_parameters`. We need to make sure they are set or - # set their defaults otherwise: - - if "in_letter" not in in_parameters: - raise RuntimeError("Parameter `in_letters` missing.") - - if "out_letter" not in in_parameters: - raise RuntimeError("Parameter `out_letters` missing.") - - l_in = in_parameters["in_letters"] - l_out = in_parameters["out_letters"] - - - if len(l_in) != len(l_out): - raise RuntimeError("`in_letters` and `out_letters` must have the same length.") - - for l1, l2 in zip(l_in, l_out): - in_value = in_value.replace(l1, l2) - - return in_value - - -This code needs to be put into a module that can be found during runtime of the crawler. -One possibility is to install the package into the same virtual environment that is used -to run the crawler. - -In the cfood the transfomer needs to be registered: - -.. code-block:: yaml - - --- - metadata: - crawler-version: 0.10.2 - macros: - --- - Converters: # put custom converters here - Transformers: - replace_letters: # This name will be made available in the cfood - function: replace_letters - package: utilities.replace_letters - -This would assume that the code for the function `replace_letters` is residing in a file -called `replace_letters.py` that is stored in a package called `utilities`. - -The transformer can then be used in a converter, e.g.: - - -.. 
code-block:: yaml - - Experiment: - type: Dict - match: ".*" - transform: - replace_letters: - in: $a - out: $b - functions: - - replace_letters: # This is the name of our custom transformer - in_letters: "abc" - out_letters: "xyz" - records: - Report: - tags: $b -- GitLab From 86c18cde7ca2667cb5133827080dc0539ccf3d3c Mon Sep 17 00:00:00 2001 From: "i.nueske" <i.nueske@indiscale.com> Date: Wed, 15 Jan 2025 18:38:13 +0100 Subject: [PATCH 113/131] DOC: Make the Custom Transformers example executable without changes, fix indentation --- src/doc/converters/transform_functions.rst | 84 +++++++++++----------- 1 file changed, 42 insertions(+), 42 deletions(-) diff --git a/src/doc/converters/transform_functions.rst b/src/doc/converters/transform_functions.rst index 3021b47d..35c11093 100644 --- a/src/doc/converters/transform_functions.rst +++ b/src/doc/converters/transform_functions.rst @@ -86,32 +86,32 @@ following code: .. code-block:: python - def replace_letters(in_value: Any, in_parameters: dict) -> Any: - """ - Replace letters in variables - """ + def replace_letters(in_value: Any, in_parameters: dict) -> Any: + """ + Replace letters in variables + """ - # The arguments to the transformer (as given by the definition in the cfood) - # are contained in `in_parameters`. We need to make sure they are set or - # set their defaults otherwise: + # The arguments to the transformer (as given by the definition in the cfood) + # are contained in `in_parameters`. 
We need to make sure they are set or + # set their defaults otherwise: - if "in_letter" not in in_parameters: - raise RuntimeError("Parameter `in_letters` missing.") + if "in_letters" not in in_parameters: + raise RuntimeError("Parameter `in_letters` missing.") - if "out_letter" not in in_parameters: - raise RuntimeError("Parameter `out_letters` missing.") + if "out_letters" not in in_parameters: + raise RuntimeError("Parameter `out_letters` missing.") - l_in = in_parameters["in_letters"] - l_out = in_parameters["out_letters"] + l_in = in_parameters["in_letters"] + l_out = in_parameters["out_letters"] - if len(l_in) != len(l_out): - raise RuntimeError("`in_letters` and `out_letters` must have the same length.") + if len(l_in) != len(l_out): + raise RuntimeError("`in_letters` and `out_letters` must have the same length.") - for l1, l2 in zip(l_in, l_out): - in_value = in_value.replace(l1, l2) + for l1, l2 in zip(l_in, l_out): + in_value = in_value.replace(l1, l2) - return in_value + return in_value This code needs to be put into a module that can be found during runtime of the crawler. @@ -122,16 +122,16 @@ In the cfood the transfomer needs to be registered: .. code-block:: yaml - --- - metadata: - crawler-version: 0.10.2 - macros: - --- - Converters: # put custom converters here - Transformers: - replace_letters: # This name will be made available in the cfood - function: replace_letters - package: utilities.replace_letters + --- + metadata: + crawler-version: 0.10.2 + macros: + --- + #Converters: # put custom converters here + Transformers: + replace_letters: # This name will be made available in the cfood + function: replace_letters + package: utilities.replace_letters This would assume that the code for the function `replace_letters` is residing in a file called `replace_letters.py` that is stored in a package called `utilities`. @@ -141,17 +141,17 @@ The transformer can then be used in a converter, e.g.: .. 
code-block:: yaml - Experiment: - type: Dict - match: ".*" - transform: - replace_letters: - in: $a - out: $b - functions: - - replace_letters: # This is the name of our custom transformer - in_letters: "abc" - out_letters: "xyz" - records: - Report: - tags: $b + Experiment: + type: Dict + match: ".*" + transform: + replace_letters: + in: $a + out: $b + functions: + - replace_letters: # This is the name of our custom transformer + in_letters: "abc" + out_letters: "xyz" + records: + Report: + tags: $b -- GitLab From e08667908c434452dc451d1f4d1fd638046487d7 Mon Sep 17 00:00:00 2001 From: Alexander Schlemmer <a.schlemmer@indiscale.com> Date: Thu, 16 Jan 2025 11:13:06 +0100 Subject: [PATCH 114/131] ENH: improved support for flat rocrate structure --- src/caoscrawler/converters/rocrate.py | 27 +++++++++++++++++++++------ 1 file changed, 21 insertions(+), 6 deletions(-) diff --git a/src/caoscrawler/converters/rocrate.py b/src/caoscrawler/converters/rocrate.py index 8a45af75..fe403944 100644 --- a/src/caoscrawler/converters/rocrate.py +++ b/src/caoscrawler/converters/rocrate.py @@ -196,7 +196,18 @@ class ROCrateEntityConverter(Converter): # Add the properties: for name, value in eprops.items(): - children.append(convert_basic_element(value, name)) + if isinstance(value, dict): + # This is - according to the standard - only allowed, if it's flat, i.e. + # it contains a single element with key == "@id" and the id as value which + # is supposed to be dereferenced: + if not (len(value) == 1 and "@id" in value): + raise RuntimeError("The JSON-LD is not flat.") + children.append( + ROCrateEntity(element.folder, element.entity.crate.dereference( + value["@id"]))) + # TODO: tests missing!
+ else: + children.append(convert_basic_element(value, name)) # Add the files: if isinstance(element.entity, rocrate.model.file.File): @@ -204,10 +215,14 @@ class ROCrateEntityConverter(Converter): children.append(File(name, os.path.join(element.folder, path, name))) # Parts of this entity are added as child entities: - if "hasPart" in eprops: - for p in eprops["hasPart"]: - children.append( - ROCrateEntity(element.folder, element.entity.crate.dereference( - p["@id"]))) + for sublist in ("hasPart", "variableMeasured"): + if sublist in eprops: + for p in eprops[sublist]: + children.append( + ROCrateEntity(element.folder, element.entity.crate.dereference( + p["@id"]))) + # TODO: it feels a bit strange to add (especially variableMeasured) directly, top-level + # and not in a sub-DictElement. The latter would be difficult to realize, because + # resolving the sub-references is not straight-forward. return children -- GitLab From afefa748668d1a3929f59902b3943a9cf7d94fce Mon Sep 17 00:00:00 2001 From: Alexander Schlemmer <a.schlemmer@indiscale.com> Date: Thu, 16 Jan 2025 11:13:32 +0100 Subject: [PATCH 115/131] MAINT: set rocrate dependency to default package --- setup.cfg | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.cfg b/setup.cfg index d05f2acb..ae138a9a 100644 --- a/setup.cfg +++ b/setup.cfg @@ -51,4 +51,4 @@ h5-crawler = spss = pandas[spss] rocrate = - rocrate @ git+https://github.com/salexan2001/ro-crate-py.git@f-automatic-dummy-ids + rocrate -- GitLab From ccabcbbb07e7197189f8565fe25ce0dbe464d572 Mon Sep 17 00:00:00 2001 From: Alexander Schlemmer <a.schlemmer@indiscale.com> Date: Thu, 16 Jan 2025 11:55:08 +0100 Subject: [PATCH 116/131] TST: adapted tests for eln files --- unittests/eln_files/PASTA.eln | Bin 27801 -> 0 bytes unittests/eln_files/records-example.eln | Bin 11922 -> 12761 bytes unittests/test_rocrate_converter.py | 25 +++++++++++++++++++++--- 3 files changed, 22 insertions(+), 3 deletions(-) delete mode 100644 
unittests/eln_files/PASTA.eln diff --git a/unittests/eln_files/PASTA.eln b/unittests/eln_files/PASTA.eln deleted file mode 100644 index 61866e7d5f57cb32191af6663be230153092e712..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 27801 zcmeF(Q*bVC6fXGKPIk;48#}gb+qRQ0wsvgWwr$(CZRh-_=G2)vS5q@{ySwVW>F$el z(bZjRJ^g#-r9i>ZfPjFYfT9|NHA+7s^z`w8fX+#PfCT?LD=(m=EI==B;N)!JB;syh zV{dID?`UUfV&qKEz`&@dY;NHs;Ur*d=WK4`_&={J8#r0fJ6Zf6x6;|$njLunn3Fy> zURbSNIaeh~48($CV;wR=m5b}C*Rc}PCL1DM6cZ^*goVkOMAih4^L{%S<Nq<q4qd%G zX~#S9o^IGS>$<sVS#DVV81bwb>b~*xy}7a1$+>9f&AI8i(V1N`HfA666U@V$$x>Mn z%QLs9Kx-lj!M?`T=jn|RRM=`(QR=Xlm4QFGJU2GE&<_`c7T$_bWY;S4dQCWMvfrMd zdabR+v2$K4znNIYYxUg|#}P{2{8IqmIGty%>N*wC+1E$E(xc|pZpm+#8$ep1y{YrG zRq(8kU5`@HmE}p?wcOfuEbmYHl;TKRN-w^g#q%um<UR-eqPSh_XR^JcaXn-^2#ZS) zPn(5@oz3K9c$Lt4)=WeAkt)f!`VkYp!xEu#C}mB)zjCc&0t2NQqlUPE#itpQ?xA;r z){|nfFjD?teZJaNZn;zlS)izjZ8?JCP5@Jw^VWo?Fv37LZ?5+G^tK*$YFAK*Inh!T zIwK#76QW-_xBwpSv*8&>QC9O+5;cE`6hocXavNeL4Vda643RP;xQ|T(?YtbJ!QWZL z(9dUaS#8w+TKmgOver=bYH<93U&*$mHBVsj{-_#pQvV>%LNoOiDJuF_;++cXQFe;5 zWkrC8u52q~ouAgAyWD2UI{LgYI5p$r>YjE^)0dm<WRK)2f7*XOLN6k+<&$Bx-e$th zs8tPEong`UTEstTP}`C(P4F5eNP4j%m1~u_@sSk^^nvS&Ib+tnbBjT09onllIITWD zJT(U_;1wNPT3p)#Ku=6AYy%g#OPu-$Q^WCs`}3yDDzxvfv(KEIqcxy)o%V`VQ91Ix zm^Cnod)@uH1VyBAL%Ck=M<E&4^~>Mym26yfR+RYe&>aBYG541~n2v9)LpP0ZGmI7^ z^w(p==RfEQUnno!nBFISPvmY{Pj;CR+rlr82T_1pyO`*a4{B5TbI3_*aGIpE^Kl6Y zDbGGIV_oxsA6&GEWcXjovQdhx1%d`a3Z?pw)1=EtyRB{~_d6Cr+TU7Hu3X<KI9INp zd>iI&kG^eld^Gp;EbpNr-zV5#JRWE}W}25J!x-+aAzY#C982-(zTxNHu@9@+zrkFY z*|_MV7j=>B$!qB0e8UrYz2BQH70t}d1Ox<p!*|xL33MgT16`B?{{H>j|13dy_?n9~ zwxxDhB&_j9N_A3K+FRz85-v5pwRb#)xOm=|!{^)ldLIjWlcu5B{?%}_q+^=hqM_l< zacud#G5=JD@@Z&DcO;<P!m8#noelq2IJ)gSKA(v$m19D`B71&$MlCMl@QsRhDs%zV z#N_PkY-s5I9xWiKwY9tZ`=-aiiyF2N$^?Uc8^fbxJ6n(QJZK1;>Sgy!wd4%&ne|I) z;SUoCWd>+^LwG^DBo2m-4u)@(SElsL>nX<{ju8Z#U~4?w!>^T-mKN5ltIO6d$UvKB znr4igWQ+eQ7ylHMLHZTdiBC(5)2!a#Uguu5Lr+hSc*AkUgHp7%H6TQ}l}Vc}9=_4y 
z%g3{@H(hBpk7mozqHsq&HTwMgObr3CCK`MrCH3Lp`FvUSAtWrNrlzLl*-p-<1#a<7 zmY5A_t}(ZHEVJl+?4VDBP4<P-bIBSX_jQ-QeE-xJ;c|EUMqMySg9QWe7yoo`kel8v z?KQKQoF6}btuq?-`2!G7I3{W8kajXj1j5ULXdw&sn!(ENn)}+`-sVvMXmdJ&leRmO z=y(4^dqdOx_(9LI+2hM6I_fuS7A7qr^;fBHWoxGg(_%heNBtC(H!M8-4!bppUg8oz zT3Hc}u%_WQ%OU65ngdN4|97qSM2`DdZ*N%!2?f=<GsQ$RsYXFXMMZNn`m#cpv(udz zfC2uoE=A@wQFLadm<+$jZ@)<r`@E|&E7<lR+v!Vdr&0I=gi`AY6W=uW=(w3#&~@%o z%x_->yOKy%+QF)AsdZGzz=>0>?l>rLbm@hSSIl1BD_VV4G7m-+G0WH_^^A7X`)%IZ z%BTnr^^|p6y?!wavev=Tas3ZfMw#=thqP|H*V~7a#K*(Qj9I<5_s=&^MyZ6C^J9xz z`s6bbT~ZS0M#UQ*nNKwQ4Cz53?oJNY+xn`x;$_z?ArF7H1I(p>D)mt_=N5SHM8nN$ zZ*_Ng#0E!h3b!X{A%@|Mvg?{_moO?ox?Nh@=xrrPH}?jEu15ESY8cHPffAdMB)pNk zT9^_|`g#mOeXO*m1~P@GWEh(@l>1Qi^^|ex+scBQ4j{#^uC66!260OSrCQkE5oMvJ z8Li4l9Y_ABKD<WK&_vsZCVJUB{16MMl4u~bGtf_3Ru-a_tzp`Ei;vKF36uVU(=i!H zFT3)qAJyFXE<bgp61lI<*T|&&X`?0T2pSf`q#;7iZhpbrck{q*rrmU^HVd)z{Na=V z9Su#`^h)kvZ;z3a6MLGDDwwqi1k}Oa-mi>wwl%Yla>Re0wx)gxBQE*iIeB?Gf0$s) z6IR<bTMG2>2N{)EkYuSA`xhDphFoK@Eq-z6-6~gKP|OpWZxiyPp-|>!2-OWmTxe>K zpH|$Nym7A{Ha2#9J0du&I-mEuv6WTDZig2PkFOXVtGNp%(66%E_sfGAy8>e3`-&`d zB>T~nE>l{vlxl4SZCllj=RN_rDo0tln#lStXjaDnA1zb49A2HyZ({D1b_p0-L*CCf z0jmgEclTB>kL<JzlO2k}ilev@k2Ar+;j`*Z;h!N*sABbV7Y_@er_Hgku^TLiwPBH~ zN)i;X#2ss*o=R&BD|wEZt*R@%gM1TXHNBmf09H;0*FDJ(md=$*(7>ry?i&S&v4s*1 zL|k#y-|6q~@4B7t3Np0HNcI3qT4Et_{@t{oCn151FI%Ih01yy6CMJ?u606i{6E4q> zF3*>Hu*wY7biPj_%W}+Cnd)j)av&6|Y9y$(Zt4W%dOqNZK{D&p&B9vO$JIJ@OHI}% zXCQc!hRFIfaUw0$+K7=Slu_<PaD4R|Fl=I6GJ@8*^iny(NeWLl(Cus}lwyTHFGa8~ zZY*RJ+8vIhN3_G7w6s(6^C-6@;oSa`OH$mFB+6=fzSuTCih{q{{y2F6@>#(BEa>{j zbNPwr1!hZj-R;!1sQX0bgRxb$5oB?eX?vD0(B*^w)(6e{a}0~&w>4<uuXxI3>#^zb zi&Mx_)+#ede@$m`Rh4--?fl7x*}uk6y;U8a_~;~$qr+-Ii)JBBM+xu#OMC(~UTK?$ zEoNh54JItCE6zm<71AcxWY$;t8;PKt0F+IHKTAW~`DxbPm+Z=T|KPyL;7$_B4Wu0{ zCdEaHC~kA)+(^lOrVlTcou_0SM?atbfrONlNTe^qNXQ4}@oC7;hKA466{+qqDT!b& zAK1wUw2hfI!EIG0TY`f(G`*1NsNsIgPw(Un{{0ZYlyjesZ}D#2=3%6Zsw1y(I>~l> z&`8ZRkF2xT(?v<(5`*Hk$`t!oJ3`{NWS>mH-%Qb-VSWb`P|%Ii**{JzD@)MOFcEoX 
zi$ro6sgG(w^B3|0iV8h0Ru>PTvVk=Dq(W|!N0HRgUEnvl(Y1!;5mtJKtpZ4Y93`QN z6-4I)e}P8Bl<cfn3^Xj(qOVM)eP-{fgcj9747rGEvT9ajBSyb<RqtsB=KJ@Y+up44 zMH#-Grv&afQEg}j7cxc#l-s;szn9>bFo}6o<~?u}UdE3d*67Wk#-{rSikn+lNQ86O zgkVzO)|opyGsp8Q1%Rv&Y3K$jjGS{ojW5A&_9vCA)#x_WOYUJ8W&*WPZofkvSHzNG z;!vwE5e%mAT?Jg8oxf+pV(eG>B>y2M9zbN8V;*-t#rFjf7NUa}Z3Rkkw`%$uTK`ST z()U7r@#zn?GsawW6)fJ0M;*2h3R&aQ(^45Y&lqU<bN6002H@%Pa<~5))V@<RI^lwC zDijL{EMLmun|VfWcR5Zi0h@|xz;*CSN;KAe;yNH1kRS$Lwk0GMH{Kr&{xueTFK=gT zV!{)cIS&idnYZr_vQD`mj~HRbCo5C1TICNe5c0wAr1cxc=pt-agP$;TI1CL1r9MZm zC83-tSt-?hYRClmbPDd=;y#0o#WGuc06+=_4&A{D?Db~FM{#S0kKbH0a_Z3F^{^x( z9Fe7q`0EM8i-c<X+_ayJU-jWjQ>|r5d{7%mDVyVg3$vCKDD&|T>9whcYcq5D^J*ON zvHY#z8RpPj3x@&c2%&)-Q;;?*3kwTln?#I4N1D+96jm7~<~Xw?m<60OP0aB#u!WFf zAGm+q>}5(pWo6}s2H2$x=H19jjouKc{`&7S62*+|g4}y!x=~e!l1XYedrCw2OfWy@ zrMN7LumuRn#iAa7Q(x{I4r)v1-q7FUM@)Q7MQ{)U){#z2>?D@Y{)!x-=`VPUYkL6^ zChY#GKq}qgV>-BEJ2SJFX6p@%8tWUbPk!uP<|s#NV`J1e;OU42*LQv)j~yiPtgshL zV@v!%NOztRu7do6nuU$c|30DhhLE0a+$HwnVp<!d#Ke#u8DH{D&d$!<fxMCP!HCI~ zt{Xz<6GcQr{EP5FCwFl1*m<ys0l8}G<tcs@CZ%iOOGM%iggH&1Ef&Y$?Y$C`Qg#Ss z`lkjc6<R2T#PcDI76g3n%Y3L0aE{Y;H||_aj-WNTC%ENzP}+|`OQUS!7`h0Bw$<kP z7ql4DJ^wv?CFgWbyzmmp*?3kko6{zgas^4$rMtgoz1&j|5;c%8s$XjJeFX?jHvXKB zW9qb~4e(nKn9llp;MSu@#Kk%MIM~?rhO;iks%&KZ)ql*27vV&}L*R|Q?$P&?Iep0* zkH3DYHtatN8e^kWh|m~+$!(#2RhIJ^nfZknlR{7?#)JSwFpOCYyMq0T2!2>esT^x9 zJ6d^X12GZbUK>K%*XG_6PpLt>CT)Tv*EG6uTDV+4N{j%vy?VgHeit#B_|l2g>MOIv z2H#=t5pkkz11=d-sHFD#_Kqs%UY(gXBLX)T4kN`4X}<FE(zh*2AlohIWu;v{wf(m% z{T*G&t%NsX40rOqra6JNm6;d~EX7{`*C;bBwTx7SiRCxMRn1Qc(Mysa5$f?`<E9{= zw?{%UnJBFiMAD10kPxWgiuvNs>7c}$yJ9%=ZCP@6L7v%rlu&0nv{ox88xniM%AAad z*C?g-JHj#`jOx3+9%~{8f{ZtZA+VCUSifj~H7W`|uLV3wdKG#oDyPPFr+A3^#s&I| zBnw9zdKY&53!WI4Nl;JQAqhbh(wv8gMmk>svAR<EC`}*bZ9k<!7P9rlHumzgP_i0k z^Bl_fF!{#Va(l(9x%YHSl6(Oqf#rf+;Vj#g7lc|V*x6J@1Fh&ZBb}2mh|!-U*BV~_ zUDv7(H4(D!RRR%lf~;qV$L`{EH_oLBqG#P);QUuf@|j=uv~BgY_+_^4L=Po&$&p*` zb7m!QJlob?=s2l3tW#B4oHqMF1_wAhp}m=WS*E(sI`l0b_gX0TT8~*xH8UOL!(;_h 
zVbYxV@R*Ai4A}akVU{^>ln2CYLqSBum$Z6jW1g=>;l3!mDMLUAC$5`ScfdE?_&dY= z*7CX^bJHZlJYH$uR)@vVP?xyYvnWrph{TqATJmpt^3HlLUcbvM8%3rlMc@V3M43S9 zO=PuR@uk#0zPRolH)s2AmlyMVCQ&tn;uQ{Mmlt5mlSi#N?J}OKhZg;kbNZ4KdTNj* zKc*<PXDeGPD{AM4Vd{h4-4uZ-L5h6`jt3%7ljB_uSVEH2KEqjDP};>P_elG@-Dxp4 zr$f6V-p593vT5Z5?Vl%5t5Y5S=$C%%Z6AK|rC{x&B5T$lQ@3k1Mx!^c)7iYR9?@MC zEE)vblPGthn+F@SF#F7zsGm>CDzoy2mK9)~cJAD_qrJ4;B`O$L+1=x4h8Yxg$3nxG zw4}4@<t-&~-Hw92aE3Phs6Zvc6b;wovU>Hv_YpO>)hJg@v@6gep^Htqr<bgs`@5cH zmPD}eFSzp@E7T1Z{xj;<=DPzBo&=TeM+`vO*#5gqZZK6Y>j;kCWNtu~5FeH_KQDyh zbZ`dP{+B3}mAE9(-F*vAnfWQgb@uPN#)~;4v6ye~7TLy1x0ma2YrUDBl7yYBm6K5J zK(Io7uZ+Z3S74oS&b|?~roSylBQXO=bmE9uYeicfZc#kldfP)<w0x|>0no(KQ^NxV zjY5NTJSh%s2%P9;iO&2kz<f~i1;o^4ZvDN$@yoWtg=xv)pWZN}Y^<#4ruXcu&D1I} zkccKVeHnW0<nxn+y4a-1{N2jX;5Wo2?}=AcUmwph1$6g#9PvSg_R6)xO=EU77u{XU z=>j^9sjtu2Xmeb-zqdl(-{)wKUSi=K|LZ$fcS=eo$FYh$r%ne-iv-9a6y~H0N(cCA zZ+7UBTd!i&OudWIFmtQ$fT;}(sGEzzTY?glWVa}+5K`+H7Qfa{jj{y{2y7Rx59&%v z5OOeDzFZUO!~6;>7gj15J4&u&Gi=p0T@~&E1T9l88R~&7-9FBQ=sJ^{To5;(9v{IJ z89)P1;Dm}lzyWEN4RaNb4ubtM?+j_-;o<&zQM#jtnHmO;v3qypOLguwEtAuJy-yGA z?TU*`!Q#JL-EMTNRX`69#nhatdI3;hEaVqOD2fLMhzCg)R#sNz>y{U2jQvSG)Of|R zjnk^)QF%t3+Pkx)(Q2w}L5`r<h-t8<wde9}v>sd^M8@ybfn)T>+VNe-?>vFWU85JU zT>5L2WG93*G<0;b1(R|S5$9NtDG#Dz_L6Ldxud|Uegtw4)L-o*!A`5#dusc!j_x6= z3!kQQk!X6eJ`jVvTGV)Pp(H&_2EKd`yR)bEtX>392O}|l!S+hIj#K&eit>b@VY^39 zYt+9{LkSJ7oTiv(=t5vq(j+FOc3k*uaLq9aM~Syp9`#BLlGzGIF3pm2=yM3l1<`*! 
zGgFOcE~nvWu-s-x4OUrL=@VJCg=;;Pq0HU^Ue2b2m+;AR;e&horTof?c=$Ld3{(ib zrD0d&;N7?OoxngUjb&`EJpSt{ojh<@YIHHCbZWJ`H9BwC?5Ya`%idmT6zj4gw2(Xq zNy*5X2ds+n@%iw#^>a<E(q1Rej32Mx@v2i8H9~Hv)DZYfthm(_T|oEi2L01M{l6PW zE_=0?8h$<><dnumAs0W2D3u~IbM<NGi>S@nN<rud-=+f$ZOttM!bANG{SN)ueMOVR zFYzm(z*#LI8<7{e>#=*1trwVNvbAiar^Ys0lC?mZSz||Y%8-Fw<_#{vv#S`N&mT%r z$8LnLtvJ}FK|>phS~&KXZj7#WW5b1Cticj1Z_ZM90>>M|0ImyJH?iG!j0y@0G5G#l z^rhwH-!f@fh*{NhiHg%2+M4-h+|HmNe#Sqqtq66CJ+ZK%Ri0Go?(3}Su;e#&4Q6R3 zONqI+=aYdi%%TJjAKq7Dmia)g{J1sZ?a_-Q<PZB3F<I;mf;&Jdf5gL4TRz)S?=Pc~ z?t{>-+<eDCuuJCkjPUV>mmk5IM0A}tVY*4^w7E}VhoNkPW5B_)q^D_K1_pWp$5v_r zPtn`s(}WIYAPLM|oc*$?1qe`nAmN2D=t%cw0TF2irT0Kwo)CqHZ@*?ghPtU;U+)nN z!*9mLuZ(&(w-pmg$fH1yg#bZOFzEsUby>`sh(|;{iyj_Kq^Fm$0YqiSj*g0x`%dI4 z7GGKi`KKedn6S4(VhS2-g0n!<4{p;|7DTAD>`)gEMD$*10t%w&-M5vbQ-09yJjLrG zK|#guU&kbKhgAI9+R~FR|Bax0!w2Uu0~10xOpzKX1*Ekdk+~&-e-W7z8q|P0e0bp> zgwp(QX^fAu8fySNXQCle&+x3mgoZ~>Ki=QdQ%UUC4uwX2t!zY=G4{7j-Wq2bM&dLa zzKU|EbZmd#BJjg*UR=oIWDq0s$}Nwa)ocj?spNEnPLHsqVy!`&QK!a+EV&6fL9Zla z5X{eAlata>_N+mDb%t}s%f!dg@*pqu&~C?Eq%a{{V7xnvi-UnLlftYHmt8*_QGT^z z)AMwE<!dsX)TxU*eW5TUMX}F(*?i>Z7kVXf%aCz)W+~Cuq^qGj9T?On25xtTiTbIn zAt4aAQIId!SJrT|kKT_!0UJ{dJaTfqlD0EDD)Z3^mZ?rfu%cYW0vsp1yQ!8l@%(zs zfywrBYKNpz+~#}x`yd9CRD||0Q}IvtMQg+NV4Le}^%C{Al75_SC(~JIndiW)tkl%h zv4vHYt8(Ac)-)Cp5)vfwp1yT*FxTG4O`i!UC{^BJYtRxH+h`acLJMe_-9mQ$m;?kl ziOO8{3kQhpKA^@gE5o9ckFiQt5eSue#%>t-^)=Khnx})%vk9D3OoG3Eb82cTZ2_>t zp2}CWP+P<tou0Kp@jnDN?|E0G4#4|up5z4tVF09DNajnP0+F*uK3z8A>}Pg*j%^{9 zG=V^^n)?^XNb*!+d5|M$k>s!bmckGRUB{=V=C-`bLzRcUvs>Nlnrz#Nsjru5@Sw{` z6HrKyMs#LMto2H|remN>H|OW;YZHS|p$>t|D74u%ktr?<s%huIYwfD-#l23E{+bh! 
zM+kzfIq49iR1}YO@ruIL-lyPGParp>!DMaVo4${zuP5x`!c27cxOh0kA-VuG3^g#2 zIAP8!e^*HV)eyHIz~25)vX+(iGTfkOw>ZF|rOqQszFpmNROoVawdi6Dbg;p*14cVU zqPc1a748D`zEh0N4~dmKr7`sNG-i1exbuYTS$wSp)-!kPL!j$n?hoGccB**MB%C!{ zEw*=IYzD<${PPt6kL-E0+Uy6GBgR==?ekMlJ?dIWB2YsR-$S&kdVK1>%K6O<77ZoZ zx>P91?b%et921=#QN|>NQq${u3iK)_1Q{wlZWC#JMVwV30DPL4mv>Zm>w37c49`5S zeyW?J_5wacF~8)5>adR<k-aJ|Yco1B5{^hgVZrdmE#PAR`6{i6h2=rSyj=g$@A+tA z=jCFor(=y=ER`M<!1vN@{`}eIWH{P<@8T;?4~c+lk7Ji89AlQ2Oc@bo49q3;oD`M+ z>r?f=;gh^N0h~;oMSHvj1w=^X{(>;;TpcpXa?~h31E8C#!<ks8p6hzs=R2Xe=sOlb zDCl?wRAoAvQ67V@bf_oANef(QZB0o@iBsteG{U*l#7j?0Th)72G|0Xkqm=+*qIYp2 zaMUfZoX!H`OlnWF%&AXlyNh)L0pL(6GkzudHtj>gGcfxh@XL<N-bQ#s1>EKPw78wP z-ycG663y$wXH+$-Jxs*nLDvrT&*J`t!oO*`JxF-;`%&~+OFq!M`8^L+b~Y<@tO8OP z0m(AQ$Ozbm<;&(app}B05A`lhU)+2J=}Ga=<Jnj|>+5V>v8FHeVk`BwC$#goJvpPa zX>;pju~pp6Yz|Yd^<a=|SHPJfGWgX;#OP|He}H&61vkO4h)f{~j9Qa3ovZ+IX(zUQ z^xTwYEfsifm0AArI{_wtAEoy~#<W-=TBBC~zc$J5>M3~dGG66({qCXVx3I6b(E-ZF za^k)rz*7<CP)5qi%E!lN!B4MXRCzIBczBrKRj<vlh#+^z)(EGpu?xZ5Tol(DV3rqN z$qv~4d1taLHgT$+OyvJ>+GzDo+ED#OLTBq_MJF#IZ*Ry%`~m9ztBmTJX*zJ6lasT# z8FhDCI)$hdQ;ukX2f#GX_Ft5vv#yFP*~7Zg0*&JKps(tRfO(Xz6fWk1DMi8aAWQGb zLrB=((LtottGq4{jSn8Z^|XzmPK~sol~ECoX^wu)qDs)Qe3SCE2P&9Be1tLcZi9g6 z4!;?H?L@BK6W#epb)Sljlj}b$5U;4X8S%YMi2mO=%^!*?p?N5GG!386H8M`<Qxt&o ztMICcC6<BSXdiehZ!&_Zm$A8p_0M-pN^(qesZFP^&J#><ezkxPY%MFs>sP)1i1f<~ zuma}Az`C%dVXxy~GP&OO*&5{DIcNoiV=_rCVPUXcdx>RGs=Dxr5>Q~&{n^gqwuS{9 zKTtT-LbjNI8zy^Y`TI;o1XlRO9h0r$XnekD=i5c8690J~&u5xTTxgtF^8r~xk-O;g zKuE-Rx=)e@P~AfcY73KRR#qn7zieRYM<A}-cux47h3U?H{~e>F=C&tWj2Pr*a@GIM z_5^_42)W1|A3NtGGq0#63$0_<h_t#=M79--P%;b4zIS?*enx8GzKcYEuIUNBeSZLh z?*Nzw%5d2(Fb!`7G~5$r=cS$U^x#iO`Tvz)NB);zn_14BWn%*Y?GXY2G5tT~*GziC z2F?be7S<+C^iIxpjsQA4W9KCe7j@;04t&GY<ah)Dj}Vlz#8@<7aA7QuQGcxV4k<<c z_Ku=B)iV?@iy|Bgf7lb=6MP=E8J-@#n{+cC&l!*9k7~QCud1f*uP)07iqb(KAP69R zLbSZ6&J4&CIv}9dvK1L1>e3{}ek@Zs|2~?$5G1h0VeCVFn?Vy$f1y2XRt6(0VjLk) zY9lK!jQ&;Vyk}Ib-B3Xnog$k`ncU)MSKrxQ06&tDkdf5EMGvcM?hdhquYZuHfOOOz 
zAtgmW3`pP)XzD(Oeo#~tDV(8=^&4WVUK(se17p*lkO6qYALHv9(I8?%sE;2|v2Pry zAE(6dB6u)^&zyCr2LH8zYQ4fQU-otQLwX!W28M48n3Lfc!ex0>Bldua>Cr;I;+=I2 z>D?CMeLHRjN?;I>D|Oz*jFPS+M0XglD{|ziX5m3|o^d*DIUFsjq@%3ArLx-r@?Ef1 zlXeTce~eAu3GGxHhUpeZRdW8&GP7t*$Yuz4nO$cj3@`7RPm#U;-p)}18gZ<pN*JO% z;}>oU&$IdLuHu$;a{+@OlW9eJy#0$4yxaS`cq~CIEb5Ws#E5fl7Ar5{QIbZy%EL*Y zR;RH=n7w!&k}rr}B{0Ge!|t<>xUV^~Y6$=9SYbyR`Ag?HtZ!_MFmtnJ<|JdHCFgCr zZ78kH8mE0V#`S87arnIC*1Dpa4TejvO4|`$cT1P1!Zo%o5p$G@=dWhjcF9sAwUUzX zNGUHX4)AYapl%AF-W!4)f{9s7;62CsdbpUgC~*i>NZbyiu~b9)gx?8*^-aj1=a=Kw z&&B~_sm-U!qTUL$*L&NN%fZ&a?9?_@pKDMs<VJ=eW*>;rsI7~J3>Lw#xwV7$PHrE* zw`Zt7A|78)sl+>BPRxD8(1fx!InwYPAch)T97)>^g!D6|0-RiA&}j%9rw9d)$h<nE z$)k%7^FWqOO7GUB77N$5-O-7LS0P%uhMH)}<00<j1!ImxH$)%`KHAjlrKMV!tJF=P zM|#D$EGzs?V>8v2N@gI4qizmEkqisOb~MAKT_XZz>DS$M9O5~~lQ4zEcWpl1tPaJY z*jeP9F!cR`$<V7+)GYXti7YTTGCDf#04YUwpaeD`fn%v9XC8!P%=;0V>xFs|2z)in z#^<Pi8=f{qv#d9LYl`Gc0(7co+*OL9v9X>28*E8!KaJK-jy)&Y{hRgM&lAw!cCVyZ zmw1VE?&TRJX$VS76<z@3J@h0K8;vAmIw=XR&FKV%3vHT_s_lyf)9QF&>Toy*I4#eY z)}xMna-H8H4NNezme({)j3HUB!djK*;!k4Gfye_xMWf;b*s^@MMorRoujMms3K1Qx zUQ=j0_YArRin4vnIr^x~DT=FKtd6mV+}3US>k*Y~54#(d)Tc0S!?fM*{syoswtX@0 za<&c)<-5aBV2kT$J_p!SoDudg0s+ZcOsQ1|T9AQE%NENmkB{Q>;>hT?TAi(?q%q3B zDCDcTvHf4}q)XQqMB==hT<E8#PoFlGuB&SdLxzr0D7Uu^CJb<YU#hCns&HScZ)Z$| zqunlQ+3)L5-jJ~kGdnd}qb@RrX*ab{{noA-B|8|SGNi0tJ|v-9Az3(tVD(JQTw9BI zIFINuq|c>Q^6F$V)TNK#<lEq5$hf0YO@akG@5<y-PI2)3uI3bYB~(u0xqiJ^$x6DL z2v?NU31;zMZ;+z=?#w*IPLKKPG$R?=$Ww|o$H<mEg@CGeMw<E*y)#tFL>pdV!@G@? 
z9oYWj8HaOvs!bbLuWMXZ#)}q4z1~7-8CiFp`Y6X&@&_#y;9kGh=&K>q4VmA;)y_hz zsz*N0js++tP$XS`8?vWBmO?A5gWpe~uj53S_VjEZr#1tfN>mQhPCu>{oa%G#Ay?;R zA_oa*F*C3}g6i%^wO-tw+*+|LpWTG|!&D6t=N~2IrrWW!1t|gL$E-Zaq8#lw*)!fk z`|3Ir#+lZJuIojn*Si~Q6N%~9S@@XGW7sLY*Da%*Yn_m3ae+v&OCnP#Mtz7Wn`}L8 z4KPyIW)KknE61tp$}WgYi>Ksrse50BA`->VK*8Y$;W96xQLRZwfpl5=!Z0wK^LmO- zbQ0%twE`}ZyzQj+6K280a;%{shEx@I`6akG!PPf{N!ZorY!YlySU1&VWCy!1u?W9o zuwLp|nJj`1jC8irLax+85<Y%E82<Ih6abdbi)vnfi9yZ`Hn%Bq;B8ZPSYV8oeVXY< zzNJWYGt@bp9>%Ln+c6J3LPo08Sr9hE--Ij!=fgkWCoM$%tGK*Y_MMCxBWdtYx+l}G z&Q?!l42+u$B55TPclJm5zx(0dgjmc}+@uE|tq`sSFc3e#JdujNmFeQdl*Q#wY>Hlz z<C-e>o_FM26C#{ooW~2&Od7<9d-c#$p{oLMn*gzPW5*I1AC}-OzqBfSGin%+Ro1g| zsee5>P|^R8rO3Gr3dE&GHC=vOZ>c|cmYT&i(C~HeZR=Opudv<Svpl4?ciuD-To+B6 zA^KI!BT4QaG6%~OK{J?E&L0q};diK|CSkNG!YW7NEKXiY>P2uTLuK0#KuGvj&$BA` zBY6_#Ju9Z*OX{5PKi6^4Q59lS8$W$U|4s;hq#9lYiTE*8jq4+nhvp%DM;2Wvh<y1& z#Z!%PXx8*fR9lk@oRDVjNZmw&xG(N5X!V-g$5{+n@THgzoIl4a4@27cHtutxD?yNL z@0)1Ca2|@WL+2`yC%UBo&dNVld%wDXnWj>*&C9pAHk^l>&7!4Binw{4)NVbW&4B$p z@;U$_+G_a--=+nxWO&2js%r9h<I8PvZwOo!25kzDQAi<D|6Ho!^0DC=(>aMtMSkmA zHYt;7CEK-1Bh!1trw)9({aYes#g-w@Ym+P@-h8K_!cH|-VVzZ-BI$be!ax4#+q^!- z4!3|5+bk7m!R)U~UTmV+*%1mQRkRWEx4Jef7<vkTGu<%l%<h}r41*Q|V44}GC49xz z@OYd%$4$9R_GJB`MBFS}6Lu_j!0jLL-a6P-$7GiZR_lQneFgj3#1_u&!je6pa2Q+0 zUA(mhe#&>0G^`x#llqs7A3dpWtstyVQ@r@<vBHN8$nAp(%U<mYW>QL{O?{M#-J1Dm z%O8nnFw!wA^@bjqF&PaNjSs+gyGPyZD)=a&;@P;K@msRD0ajU3g<ui`xX4CL2Mm{8 ztn*&xUARf2x=^Iw%oxrX;oV9#w8kUTrX19txYa@OcNYD<DhU>IrCo)_xztUz%3fu9 z#R7jDkO=bi*(9>Ywdq?MZ1c7-(&29LzR6OoJ9VtnO5e3Rt)LrBi0xUjM8mo2C|~8B z2u(d<UOmK}Y(b}iQIJ~_X##0oQPt@kxdVdb1y5&MI4@D($<d)$JiL6HN|6`KSrZ1b z<_R7GO<~GRYQb}Q$@Jp}1YbJhrfQ>VwN4x}@eYsTvOSuCY|^5^78cSNt+rD?*fx=M zV%)SDAd^VhnZ8iA32UI!6lvv!Wttt>%J|U2;qp2awC4iU(besdw15ly9sLhZz`C2a z`BOm<_VMfPw_rxoq0KD{im5^JQjv^gc<xi|1AE=jk^ig_0`>qpz)7dO6Q03Cb+xYn zl3&&@mXjO2(dap@GH3eZ&<`sq4q!jNFvn}uZTsf+GkF{ibcfb=OOw|mi=UtJlmN5{ zL2<+L%P5ZqJX^HSalPLh6Y~*I%3EB#88*t{30N-HD$l7s$r=HoW-`B_g>~{2o*Ca0 
zcXm577)0X|pV%D8kMMM+;dZ8VF^81Z^YJ~gvy9E5EF`Tf$)5$x5*m`|yYnbOBLkwM zY;@01L0`wMPbD@*p{Lsw6i6H>@*)?O;l$I@&3jKpY2F<P9UUD9CnGa6Gbu?$U0q#6 zV{LJ9acymFV}t9zwv2_Psj;!KsmaCN-5me`z_+)zziWG6yrxU335Y5VSXf#mZsejy z`cwIjG3u1Pf0g^(P`_iZcp}6TG1ML^hJaj!&U>3YO1m{nJBi$eQ}n!veJHNniEib8 zwYJGGNDTtU6$gY9)LXBjFBJYxT%ff1yLm^_v7#Q%&5;y~#?hwN#}R#YhAb?Y?b^PM z*&i<IV}5i?3FqWVry{sj;Q-b~&c-I!4$IbJyG_pzj{DlE$Lni!Hg|vAhKPSP#&W!J zaNKvEkR2+x=l)irD!O_2$UQx#9YCH=9`+3q*5dpVx8M?#%|%aG!SGU96NizMYpnJr zwo`l!l)y{iKGG1h&)eIsq~yu4M?oMsSopfB-h1cs&JVf%Tb9u7;>>HY<*-p)zn;QC z;M?o4R!1r}d`Bk)w}txtZ^J^C%6r2?6-I*qu10E(BNj~U`rFbBXJGt#q@)$G`e$bU zrKCqysi0mzSkUOD`GmMo5}Q`NwldR*GgvpID?fMk`80wq_`$y5)vz^}E{>s3^hf@M zOnw=Tf0hRIBPC_Egw^`Qn&29`2_BX0j7m18(BvcX9Px}J`88N2Ry`1ybEQgj9Q;z~ zpSt&$BH4t$8%}Q0YF?Fp%B&HPr00mCW|Z1i8gJ7WFFfrMEd?F<oUYV?e!ybq6ZMQ3 zC|h?UBZS|7z3HxZY$Ek<c(;Gzm!o}lAp#;I<2k>e00hoY*{X)otkhAG=v`OqgnA)C zh&zTbg+&y5hSzEk0!tSv_a4W=_ASbN5lV0LGij*q1Z(Acmo38+A`BuiBK{SkR%MLz z^%s-tS5TBn9jJzVtnqk)OSqVbpGAhWnD~na>Fn(p^cDCRXAWe^R~j8?ygDIU)UbKc z(QW*e$TwY(yngU{dDMge&+z4*1|QbpE}a~Uw{tFp2PY8DZ$Uf0Uwy1Zn!$m+sAvII z%%52n1wnN1SlSt#I>jk>j9qO~8>K2V^{&w^3bR<sX&Cj5Rn`-EMQL{2ARqE<#I_as z+{)$z8R(emXFFF;C6^^*oTj5r94M<VsF^Iv;+Vs2Vr;L{Q+=eDOu~!<MtIu{llw%} z$B3m8GOl9L%(|M+@D&_$$rYYQE{CB3zY_%Dw2PzN+oIDQ{oB7Of8egDUnSr`iohFc zfkd2i8gA`R6)&d#L|s|)E}v-pam4!mh7jAZP?NZ(0}1x9@O(7_EgkR*9R=lTqZi&E zK)7JdW>aeD4t1$2@lmEZ%2DuYp2VOrbVB~NXQDHYTou{5eF|&G`HKC68D5X)OWdZg za;56KBlak&D70CF&vFyK?LwikzhD4fA*a*`3OGgxKS*XSg_?{AF!uDAY}K=$8y?w@ zDXk{7FYDvQ&vQ=yTgn+@Soce_4qr{4;D#O*Z$#OCJNxAXd49XQ3zbFGAgWn9i4Uo3 zZ+N3b0LUaF$2iq00}xKUjW1>?7)F#W+<2Rqf!8gedR-4Fv7wkM8K+Tdv}(|9)}_Mv z)Fj%td$KKtN*H5gr*{d<r|}G!72pZGM&6*EA$9VXS;NzXZH5G5zCeSmuSZzhMiQn0 z|3L6p^*TFGaGp)b)D+4Db^9@@mBwr8?A`}Mz8?>ws!oh_&US<C`#|y3gAI|Owbe}e zdY#+1=?j`ws*uywBJngbc>)-rO<zUDQKOI1md#1MPj6&490_95<$)Mcib1p9SJ@hH zIwDwS)g=wc8p0o5J)}N9B3|h9H%e+Ox^5xv&+``cvJz(1k`T=7Pp+TnXXRb5>zAe{ zkJ@k2Hn?a*^S7&6t?$yVb0p)k*k?5(yIqtRURXq5X}d@_u~pjcjqG2`_y;-KPUHAv 
zx%ssz*91ve2&qvyH?F#tpVyP@0JkX}FZmy&a9B-HZN^KJx%I!|T@^Ft3E#-y*h(Vt zJBWibRTqC2goB!4^bvPK685OS$<OV12yBzUyIp{%8iNo<*~(=ZEmr>bGGdft+_iw~ zhoNqI!R>;_e&VrG;OqL%a26w?F^%pXNc<cuKOQ{49yL##CX`^S|NR$*ABXd5hoaJ| zc%j<RG}sKwqaCU)z_AH+%qXj|8KZ?4y_OD9%o8=*G~l|J7gG&M#-MO~Ex^M83-d-{ z0m}O_yULEw&!C)L0S?80PvcKv^^9i&e%Ts1Wv0}8c>=2ad5&7y6Kf7ZONs1a#4WzX z)CVl7ix*kt6ijh6cRM@kyVCb=g9sJX(egK}(?f%t7@Qlj#r#GlwB4Wz$ck`9nZPeK z#YKpEz0Y@7W7g$_KUZ=u1~mikL9}-TC#^T2wE<exh0|-@7={{CcV4!1XiYS`=P+T` zF0-(n^}lqu(7w-~k#xiu#(E7VCyO-|fF&bOaP0R-;B#lY>^Qq_Y<dio)<mWqLJvnc z977ui8+F^3GGY>jIG}RV%5`M;#tRLMm7j2S20?E@Kwcjkna_5$_ONVUwn^{I%WNf- zx*sPhw*)=!%EIsNLC2#pWJQn0>;GnSni$J={^Go`R84VfPAkBR4Kl<>BH}n8HFhK+ z&U+Hh@2hB?g!Na}i{S?ObPOVuLth$|Eg`S-UO3fu+cF1(G?gQ9)D=ak>`IiZ#EFli zIX|$W&}yN8z17r<*MII{BgY$TLuMd<=ZDLGvK-nm5my0K`?8lf_nEQE?}pzw{WSwW zdVHby9^52MFsLLkOS2Wm(b{hJ<Qj9jpLomPvu*nuv7E{*y2q_?9&9I6OMWJWtjX-5 zHHSd6u<6n5u6ts5Q|t*X?b&8?;FFtHeRmkea`MGhdR6l!(Czv@2$Od+L0-?+gZODK z{tcT@MOE@Em)NOXZ=Ac8&`>|rF~fDE*fYZsC?XjVT0nYfbL`@RDL7W-xaaB&J>aiT zCzx4!sfn*v;XW)6OKnkBP7V%M1du$(E9S-OY%XcPbT&o-hulzm@X)y$YzUpVRTR$N zzK5!g+26Cu5#0rQg*{nI^6%HP!jpmo^*Q=!R)!=;vn&|R4MNIVR5hR1TLBM7GI?t5 z$--e1@$Nn0F!L;!U-%-F?JX@@$xLqHzNpIeGl(q6dZBUBnLmm8ecja|tjl2;o6j;n zM`Mvsk39RyStFQeM<O=^$+(llWEoQ-l2z{ZXmhLwX&QkV_^CUL-;ED-btWh`vyTQ0 zD)3PQ@sXpM1q(&}56fx4LL_V}+EZ|9-#OvXj&~=`+Hg8d{6_|GI<zKqSP;XNv#H{0 z`_rn74J+KhfttLUBhwM3INysgjgKS}uYVbwaEr2UtiO|ACHj&wES6sZ@--4_hI|`y z--jsHS6P&y6h<ZU(}@6nNvOt1=^{3q)si-;&u1NiFW>K+%db#TZ{a(f3u|xB3gyUR zhz5c!ed+pWU>NkOB{+ZQ*9kFg4Rg(F_eE71!sm@{65%Qpwd;iX*KvBAxtBnCdU0!{ zsLlgsF~ZK`@WvYXN^VBR>tLMOWvsoH{8y9%w@n?p?^LB%6P;rvTHWm-GAdLk`?!V; z#EX50M@7k4`PjL3N)vx0)GC#YcP`J|#V%yVsVG9zjM%9DJetEUGkkLkl6z$xKEawk zyR2X)uW+@yR>eegw$wUZV8O?w#+@#`b(&BoMqm!Ex=MtxZ>w|)OT!P34<gIN(IfKc zmK+x{5@VSA=8OVIoWDMACIeIorHNVF1Q`XD%X}>4mzp@*PNEX*D@p-x`kfAc>;48J z3hP+o!BwEltNzs@<L_5fuvmc=z(I29WJ|^u;0uwXkt-`(&$}tfsjNKYsf##qV`o>o z--nu}P=-l322B(8;YT%L3|A_OqrOVRJ$-p{*Xmh{Y3Cp8X)Pp7A?sew#?L+&)d(eK 
z5Xze7NxK%f*`h$M*qQ$CSNF`$xDDrLmaj#Ddy8G1<|0oQBm;yEq;CvhaZ1+37d7;W zJMNAzfOY#$+U*2fbv2y9hFAf9g+f;iFEPzZ;ZXRm<cOm_uv+${Rfk;2x6|JnE4jFp z#CrR*Ne?O-`Ch|VD(5g5msLK>TU5OOHtp0-Hv(O+>XIs~(Ou_Gh)^Ak7-st$4^_li zxWBkzUU$rSGE6tFxw3?`nYF~&ZX5?xcfvVex^)0+TX3oyx>KK9WThS%Gyo410n-9w zf_d&{p5j~>4Q!#L;yj)_wk0Vup&97Mg|7vZBWH`c3Ono!$tAk>*I@*E^(^fiz2;PF zz>;RNEd@G%wV~TiwF@r0!wGpU)N2oXVhr&X!agC$20heo3AR|6Qg;-y`=x4&R5}aa zT#H%pDNx;r^KLcX+4dT)({${eUd~olxsVNDLsN^35f&REwzwNrhB>;IC0>c`T5*w* zD=($zB8){-c>__wWmH2`jA+x#p-j!>%W&t0OCykwUs}~Y@71^87HbLGYDbXR@tH7G z^|?QBwAE~W@u>n22n@0>(VnB9qVf+!Ir9|o$rUrKiV2Z04E^_>_&HIpI=n}==-qk7 z2QO7zj0(~I=*2{<<hpm(Vm40e;@Au3;G_v7Rw6r5AoS8?>P7Zve3fP14kX(l&Rw0b zOI}hX`f~cpnMU-;y7!n_uNRqr2{)vZd)Um3L|cDEPYfg?2GJVYM0Z+&yt8Jnzb;qJ zY5y8Z&_z}v$BcMqntEAIxH<)zLuG4&X;E?gnbJ@s1Zs&rfr_DJ3x2UM<}cw3UiHHe zcMJB^+#%#<_)EEInRAzIqBe=fv!_4iueacoAJbPJodZ72fRT+i%;#=Mm$%0i{f%d+ z-#bH&d~Cfiw~-8~_YynPYb+nPG3{3J0I;D8ce3;~Hvi+>>eagm-6yNHer-)_)dsT7 zZTBuStuTF2<5bLdzbZaD=(lqb$(O>XGSXAvb1C$MD)@#mzBqHgf0JhJYY6iRzVH)| z<0~h6hpKTxKkAc6@N@Op{S$Cv#&N%+>5O=mb2PT|5c@M|^aY#Ub2#oh5&s37+;ceL z%T-dQSNlm}_N`?V7W*Ty{b`rX*Ahw4#Fd$&ei-MQ%=Hufsj&UF_5`Z%t@!>O{mzK| z{NP^Jn-iSdGxKs2Uf&~0G<~yR>Pzr5k_vr6o=nwA;6&#;^3{4~_!HFe`-`&so$GdF zEAWtf7kS9e)tp6ya9swn|03s(W%4ja<WcW`PrLr_TJz8UtF-GM&-}+T|MAR!Jo6vV z{Kqr@@yvfb^B>Rr$20%&%zr%dAJ6>9Gyn0-e?0Ra&-}+T|MAR!Jo6vV{Kqr@@yvfb z^B>Rr$20%&%zr%dAJ6>9Gyi|^Osq7zD8v7P>W=@`|Chkp$jOy-dJG<v0ZMp|tBC}C zL;tj=E;3wBPMn~UfM=ZCBR(>lB%{++>5`pePjdM{dTiQR(5s*w=q(?r6s&-LB|oA1 zL^||vL8R$W1F`Z{SFUYW&lU2NmYT-+D8;v;%Um^<;D68956Ugm^!_W?^1sFK-}*my zL{@S(ur)StG?ueBaWrtYur>Q{S7aj-V;9H&DCz&)2U$-^PM*%j7;jt(#*Yw9bg!-A zH7{+JOwddquGx>!lYbk>gLK^+C36h2q)#0T5dQMZUXfOciZQknA|#Fjf<ty)1-wrG zpb^r)*3!2oGI-m!FQGM>wlO<PgzXWc>uFn7*6IoP|0-6(K~`fjUJybN9tg-v3<wDK ze--O!M{D$7l_s<{Ce8-N|B+WZOD8+qD{U>O4OVoYo0?uS`i`|Q5dK)Ohj~tic@Ofk zuDT5Ck~4Q;kVH(WLdn?f!iAr1$XvQcnXTVL8(R`&7=H(W5vISm{O&*4pm^TtDDNfp z)MB#p#0JXKQ_a4<lX56)C?LP;Psxi)opam^`EXtr<KQJ3u|VeNZ=v&cwsxKY#h211 
zb522Bb=5&ANw>V|zugL_YR^scuX-_x&{bT^b}){H_M2iwY&g<oI#O3J_|kbljCipz zrT#ivPe;hcCv~**Wk`g0&}^U08}~BK>Y*Ugt9NL3wm#vqmdROij8DpUm{1m1irL51 z{^`#~{)`;Nj{%7u*btTlsgS_bi^XKoD!Saf8tXe-6Sz6LAUNUrC~oTM@X(Cxu1P;| zB)q>wEiI{P(~f+~MA^#6{S}x`d#k7o2&}|ju4ip`K2A{6RgC<2F!_+o_*<U(IP!bq z1J$5ck?bwENhejA-698`MRLeQ!8ED);`c1a{0gd$sZpw0WUK_!E76}MjoFo>(kaD( zX*_(*_gON^1_>6G+eIj(CqelqDvk)|DbY04rMHTFFr4xS|2MG9#3#(Jg5?4lEZ5Oi zb;<*HQx8uscW3jFYSWtP5D|cn<{9(S+uH-vM8e;?(GKo=FxD+oQ=sj%ANC`2&VhX! zs^PU$)8}@peS^5b>3JFsN|$t<4TI347I(zyWDYzmx#_VUDZVFOR?nwIUAXDQCHr)a zgB+_Sii9#Nqt~TCJ|Dyp-p|3a9`r#qBqO*{XZ)u1{B_+7r+&U-zOQykaNQoE<xnK7 zsA|oAlw;w<B~65`@9?Kz5BHjp<iy^J=mai`Ph9TpX3F$6<RZ|(kVPhpqIkAmBhbeO zB?z5~2toNy3&Kwrr*AW1-m+ei4zuH+)3#g(n~n*(STD}mXjBuBxyyC-5;MolP<%(h zRMqz;$*x;c02+xyry=ED5@JPr6v`Y+{5$5TMeiFP98-usyEvg#M@zxnI@M&H(@tmN z1;ax9EEfylhCh-y3g3|s_~Fn4Zi62Xt`KlFg+$I}`DUL8B4yw8m@avCy$*>ELeYpZ zT4nNyX|cp#qOeLSwp)7W&&}(KePT0L0b<95Io+I~dzFsyU*`$rL?w9<`RZbkl9McV z352SOU=bgTJK{OT$sfuVm|}CyRPaXlbSh_Ban$yz;%=-Wzw7C@;@YkwU|`~!LAcO7 z*Tv8FoTXf(0r6C0hSR^X9Qb0X_8pK$7z@EVC0lHT^d$Qn>?eL$74325>&pVp0x4QQ zjzFydo2UjL-RXnu5nu^|!>cJ&v5DZ627==snyhIikFNe4YNxHpiAhF$IDh@O1zYmO z?2+v2qils@z=`hO6t{RqYnT}Ns3|lQTvT!u;#%Vey+#!f*Bn=SyX7q*Ns$r@ZA19W zryaqhO4=Jm8B2tIQ^Fc)){sT@M`?kDMQG1t=Lu*mqRv#gz_h6pRMz7wHm!JD@@*4> zQVP*;PcrN@93L#Q=dk8bF*3--o%Bt{T#-eJsuJ091_DvE0%!m;OAp~$K;PNQrQ!;p z(^SCG<$fFGK<@T`QpEm@$ZK1I5knL~C@i8<>{bD@qxi?KD^D9aQdniYZEf_JW<OW% zLSSDa2`dQZf!2ijX2{Wt7N<`6I68;DpPpVJm&0Trn|9`Q%D2a@%K;$bo8ldUinE!6 z7nPJpCn`&)PKULwYPLvsi!hg_4TU0qwr_}hdBZ*7-$A97x0q{t&-cms^|Jq*5jeAA zs<w>GWFEjrHnPKemr{mPa@K<lAOwIbrdB^rGdf#I!+Y{&xc(YG*toe)eM2O$j9i$d z!uubkop)4I*%pThO?s1YL|_0ZQbWXGDAGYn=!Vc_01-$iLkSp>BGMy8K)Q59MFa`G zNDD>400EKEETMO#D-nJ1y>Xn!_}+T6-o5v%`_Eb5@2qw1KI`ti&OSS3|3Y|$+i%CM zR<L`@QXdPZ^Qg+VI}tEg+A+%m6Zu5sOWB$d-B&Z=mR^HG{AMREmf=1uohsRaTi3wz zH_lnGQa07SI@}CAvEXjT=4TfM8d`i?H#3$S(Obtc4|Y4Ru}Ut2oLSK(8*6DwHUDgc zw*rnsA-{=D4v?U<O^VxV7qD4&yw`vIl}1U<ldm-j$@|(LGGsHlQP9NCb;A;#eWm&3 
z>Rl&c7mZRxn72tdL6vy9JL`%7D8uAX{Dh-)vB0MKF>P8K(JW@pV#XN-W1PZ>CaSnP zLC_9m;H_KcP+Ym-3o2IFEE0j4a><p4(q(rVNSrWZqbnGD#y90E)1y~!(M%v{Uq?2- z%V_DlDD1F}n}1{m2&n1nZcKHVK&&6O4~;*dn@Nbr3`ViIH}68>mG-`yX~bZjx{!RO zXT+Cgnn9;D4`t+;kMqr>h-LTcb-i=8LoiELY;{Y)Y9e2#V~jxYb-3pV-Cog|uT42s z>bR;o`*xJ~yw4zf-;b8(XF3$r7-qnd?@MEYkM$}Jal*lILBQi`4hHW9+Z^wg7>Zh8 z3@!9do=Ey4sDYY_rRnm+72hr%gbCIUhI2#no^w}fbgsMmkH(jsDbf_*kZcMi6v>g8 zn8?U0YP&Lgp$A3wtSi{n*cOLFNT&Y4nJ&bA)W*zJ!oEJJVf}SLBf6&><aFJ(haMau z!`iO01HTi?RFORQU@2t)y)N{KH&wazQFV#B1Y2^-#4W7$%^n$i1?J(f6S!}Sd8RE= z=PCwX6K++gX^A7<cM)Sg%Y}}D&*tWk6ub!e#dWjY;~;gQooH_GyNk*&`oX6Q4r_bM zmmbmZaHQfYCvr3f4quhml(#MGsM50V7*Nx92Teo8uq@)N)7(<M!xp>gkaQ@zOTM(q zxG~z(9R1d8gY1~bV!NDyoSu}q-DfY}VZv$k;VEwci!*y2+%nZO(PvC0Xu({83CHSa z#`Y`gC&Nm2m5I0ofpqg#{553{X~%;>c)Ll_;?L}b_HyW;HGJ@29#0y{FRivdraUn* zJ^)eg&YxiPDW#zJOU8w!90g$xNaw_i7xJlqxJ8=9p?N{eXdN=!7`+a4Ld;;8Ol$-P z*~20U)L+mxmcuu(T9TvUo}MI~f3FyE)r;4;ER?Jmz!2!e7tKr8kF6i*-u>X}9(OOY zAHv!d->8fAe|MP&@^B^pUal>2TU?{rp{NYP@zJ8>W@&qc*ZIlA5@CZTOzlr??0Im5 zd~0Tz6O)(eJzK3$hku#V3y_kY!lXo?Yv0!IF?JHPyRdDhb54ce2*C#>TUy=zIE#o> zoE&XWVM(60J`UMg5`*PV`oeK*t1<mRs8~ppGYycRB)%7}U?`uY4K?%+QZuy~2m83F zo#E!bt1vqm(HSMiWEuTLC0D|4--)R7yhCu!Xnd>|^nq*J1~PHiNtJ-r8jV#b5Mgzi zlNxAiXb>=XaFR`4c+cf+FuP(T$RdURP10QVo7EbN&O?V<%+pTJ_!${5C#5U|@}sw@ z#c7^KTf6cVW_5w<AXA<D{<&jjS5hn>wbv8Jsbp#^F11tDddWUFd6uC|>RVlk2N=sP zz3-QbAZuL%9s*1awL`*lpI$BRH7{)lB%YnG%EEO$ArC(}SX*cszEK*Y@MU=HTD$fm zgb2@F8|p+;Z92#0bID?);4e2?I@y#zc|bRCfX%vHARl24opWbCyZ$prqi;^|*?q-x zE0c_EdTwXZm9Bp-)Lra@6K+nXvwc<zM&{6|rO0~v;9ni2MNQ&AE^T5UaQ+ssTI8}- z$nm_bj`yP)^J1!W;A*;LS9{)|IOAov8w3Qoni*1;GuuI(N$uUuLAx&$Z4zK+eP5Z7 zIL;KfiVzT=EN%7)Cs<gI6{zgY_L@AD9on8DMlbHZsH#f$GReEJq#E<=GO8vxUbtFh ziUhc6oakqRxr{OAr^!(+%*jJ#eU@8kh)tY$tK%UgqCW!?)W8P_9+#v7=nJsGXEK;^ z71NkO`C|%&@<q1<5_{OSHMKBKoacr~@IYO?2kCOafWGOJ`zevQJE>fh%y&i+myju+ zo*iGxpYP#I(j~OL$FwfMc{6RlK2)sj^Y9dG{L+UK$hhzaAC<>)<n|o$N!!i-ENx{T zmCcevRSsL{gVt18!Htvb=*a`M7o9D+JW6j+A-C7+Y)|7Mq(qNSq49Ik$95Ehu}XE5 
zC@c&0baKn^GqX1;t5wO5ycce*yT#<la|)wh7lAVQtNJ}7Q@hU+&X?0HuPeo7t!<E( z^2b%1*c$hO2y&k3{yzTuab0gL-Hd_!t9i*DK!L`jAf<t{G|a=9vE>(f4@t=FobuJv z_KFPwJl;td(a6<6h)c-)sJ!1>nT}okX;RDGGzNyczJ=3Kl`sy;$h;UB!mz=lp0NKi zm3+HQqw0zL>yePGK#{Aeuer>vPu&dYnAV}mnCb`j*lfFd?mhb@Kel{A;jvv4fBf1~ z(-;=z^MF;mnR^LpXS|{p$loX~JEfz{5M8?i04<cojYd{$4giT$63Xkp)M{1NX1KO| zpd@Gdz4vjw^zM{}CwybE+6kX9nfcl>6WEH2N3YY$ew5p^ky!U_y|K{iRbzzC_PjeV zGyr{aY~-FSmM&;KX~U?p&GlhM&BqyD@)yldn}2c~zxrZhvk==b02S37rBSNs{O&mZ z*cStOB7Z}okgm>mCKC$PIifa#nQ&#yM&Y3u8PW_1w@Vq^T=rZ#3KY@q2OQ7G@lEU= zm4Q{Mu;S!{$DdniNia|XlSHdy9D}`Hy%4Lr<mq_Pxv<j`@5BAyX%HE9c9|yLh_qxk zs5m1>sMwdh(nVGlhwPuLWRg~$>jJpvnx%22&dzu2s4dq$8YKi(m&*6J!{Bzri^Pf2 zB5zdV)JeHAcN=UOO1+w^na7D}XZ#tpXWJ6{L~GDKY;Q|5*?nNc13+aY;P&dh&DAiW zthWpT-u_lJN%yktM8^cwxAuAHy(c(A=g)(M@F}>F`MxI+ratEVFz}0EkJ26?Ul0Gp zyRG_i`e#q=20b=l)OE(h%nn7(@*6YqFoLH(Ibll1Tif@yw2*Ym_nyI-!B-$zkggb{ zmblib+w&*2AVmAQ!QKLzv=gb0v9YScV;4R#Oe4H#_S|4>As2|FwQB3r*VuLt2b<@v z&rBR5@+LS&vqxpYS2wP-zVT|}>lkG}tRe|mzViJ@_wW9+3HWB7FJ<n$Deb4f%puU* z|BAbFfK<Pfgrp2m-^fgFV6fk)UkYp9(~o~4JzzL^NnG5huh$UXV_XLFrq~S1U(OGF zP2LxE9*mx5Vq%v#O9UFT^o*bl%3{P>hstmF$w7gSe4%WNtb@Kp6Flo~{7AW^5gvb* zaetcuBN;pL)NhA|$`lHq=BD}2RhuXy{pVGE>+9?Pui^A(y&s*j^)EFls_I*9l-g1M zRquyqZXM+x&1?A1FIW6K{6DfBjtU%&Lwy&pxBB}6l&I8Efum8Q?*d%b-vs{m7}B5V zKMM1FqlW@o|DgYPp*c!Ax=4K|L0!L%^T(ubOV&~1QIq*jWJew){%$~~P&)dr-54nM NAT{MPqI9I9`Ujyyjhz4h diff --git a/unittests/eln_files/records-example.eln b/unittests/eln_files/records-example.eln index 09ed53fc179e80a240ab773247d6f9adee71b429..4907bcc4e88e2152fdf2675a50ca661b666c947d 100644 GIT binary patch delta 1246 zcmai!O=uHA6vvZYo3`7eiAlR!+t}&GubO1@t!-7Lie93%N)>ugobJxHqnq7jciUP) z5JWF}us%e*i8p(b6fc6{Rq*0TZ}#NLubUvKvuQ(<T5(|5nfHGG|Gasy^WfqKB{W{< zw{-d1d_Ldetn)mC{oT1-IyXF#D~uG2BZZP&Ja}MeXb5ikr}><mhs)u95V$CaEzw>T zyJUQJN~g{YQKb<i<+J%>x~7MZTwnRzne=g+F^F8VOxq<Y(rmMW%2=hv)7V9tsq5xE zHKvg}L(qv+V@RVqVPU(%vW`U*s!;{I)HEO%I04tW7+h*?hox5Gf^^cfkxmt2I0R8+ zRl~p(vUGx3%qAMKk!#kxsiD$2s?K5^v7%YpMTV&oCxxhnuw^kLJf#yEeuoCw731)X 
zJ6l52#2_|R5H?g~TEsw(IcqBf%~9L+tZd>CY%4Q}7%XHO6=JxM<maJ3Ap{f5Ua_fF ztNt|?6DG(t_srB4qPVaa84xSjzN(t@hLjReZAdNy;Co9PaJa8}kJtQgj~7?HSjNUq z+Fh^+FfTYFi14tj5B78=;Wi)lpti&s1f%WS;SDFOLE+l|T~`dW@V2_+I=s@}(*%3_ zU+p^@VX^K!VVb(LJD)x85LM1L6cUC^M>o6@2mde2ZIE@q=MDjWcI;}Du9sKdx3fvG z7#x5XLGPSF*wXS1mRm)*&quumbT@WfT5IZ!YG>dW2Y2|szZ<zCO7-S#t6vc-u4&8D zbdDHfxI&<T+1|`(xI*=XjnKb()!?%^l50Gw>HdBRGU8r1)Hw+ILl-vwo~7Yzt~3mH zBNBWKE%Di+1ZVy6O{jdKz^J{|Qus07lh{9>X%T%sZ#KOE)lbo{L7qJY)erqox#}Y+ z!oee{yZV0D)<Dk;Z;js<d_HeBso+QQD2yhTJt14a>}21`^)ujg>SQe?q<gtZ5w<2J zcoK-fWIBR8veCD|UZVQA=`SbN;26A0*Xn||%;T`k!Uc%StD%(K%jGjHl!XO37Wk6! G4E_K!LVMW& delta 965 zcmcbaJSmnpz?+#xgn@&Bf#F+$=tkZ|Mo|L`T_eK~LnA8#b1M^5JwroNOG|^v&5Y@c z29xg#$xr5C5|WPQQh)*_m&B69;?xo)s7UPOJRaxC-<gCbACMHCyn&5-@^(hi$)}ht z4D%H73sUnGit|g0l2d_HT1k0gQK~{(evv|MVqRi;W?s5NQEG8&Vo`F2LJH8-$>uEO zlMPfkHs`Z?FixJ&BQ)8Z`@>{cak<S!Ja$ZzPjbo$fE=Nen39rNl9`{Em=iL2frP~5 z7o2>PrP+BWTT1awUN0^<IbA>&zv7eZ5@5yA5~`C=aLVI0wVFc`tZ*KO06|m55|c~v zi}VuH(lT>06G86N%S+5noopZ_HhF?1@8m2g34E4GPPXIZ0$S#YPmwa%HKJTnFxM!d z2S{Z>s*;t05->PZm2|*SH2EQ?=;R|@ieRnpx#TBb6IdvN7UxC=7MAA5+6D$z1_nx# z1?42<y@LJy6jCb+@{39o^3#Cvl$>9bQmmU=k(gVMld3oQA&bo9b%Lvy&4B6y8D%jI zv@|d>Kr&E0$i+=TBc!ybBqvo9zrl)|w+PK-maI|!Ak!wqzyQK1iFR|E*n1vkVCvl* zp}dG`k}fZ!(&jg+a!}SEbrUwp)8gvEC%70GKo~{uWP5%4$sKxIK-z)q-F!eg5@FM1 z4g)u^RG@)0(=DONhK5R$cm*aeFc44xDMU8b7?^`#pn>s>JVdRqPJlNflL#|BhebYg y5*3*aR0zU`FbzPuf$^&rP>(&>V1tGp-&mj^2%{=`tqoLUF!_L?Fxzq+kahqotrYP9 diff --git a/unittests/test_rocrate_converter.py b/unittests/test_rocrate_converter.py index dc7cef9f..ebf585fb 100644 --- a/unittests/test_rocrate_converter.py +++ b/unittests/test_rocrate_converter.py @@ -76,6 +76,12 @@ def eln_entities(basic_eln_converter): return entities +@pytest.mark.xfail( + reason="The example files for PASTA have not yet been updated in:" + "https://github.com/TheELNConsortium/TheELNFileFormat/tree/master/examples/PASTA" + "However, there was the announcement that these files are going to follow the" + "flattened structure soon: 
https://github.com/TheELNConsortium/TheELNFileFormat/issues/98" +) def test_load_pasta(basic_eln_converter): """ Test for loading the .eln example export from PASTA. @@ -99,7 +105,7 @@ def test_load_kadi4mat(basic_eln_converter): match = basic_eln_converter.match(f_k4mat) assert match is not None entities = basic_eln_converter.create_children(GeneralStore(), f_k4mat) - assert len(entities) == 10 + assert len(entities) == 17 assert isinstance(entities[0], ROCrateEntity) assert isinstance(entities[0].folder, str) assert isinstance(entities[0].entity, Entity) @@ -131,7 +137,7 @@ match_properties: match = ds2.match(eln_entities[1]) assert match is not None - assert match["dateCreated"] == "2024-08-21T12:07:45.115990+00:00" + assert match["dateCreated"] == "2024-11-19T13:44:35.476888+00:00" children = ds2.create_children(GeneralStore(), eln_entities[1]) assert len(children) == 8 @@ -193,7 +199,20 @@ def test_scanner(): assert len(rlist) == 1 assert isinstance(rlist[0], db.Record) assert rlist[0].name == "records-example" - assert rlist[0].description == "This is a sample record." + # This assertion was moved to a different test, see below: + # assert rlist[0].description == "This is a sample record." assert rlist[0].parents[0].name == "Dataset" assert rlist[0].get_property("keywords").value == "sample" assert rlist[0].get_property("dateModified").value == "2024-08-21T11:43:17.626965+00:00" + + +@pytest.mark.xfail( + reason="The description is no longer a simple string, but a reference to another record." + "The rocrate converter will be able to dereference this as soon as this feature is implemented:" + "https://gitlab.indiscale.com/caosdb/src/caosdb-crawler/-/issues/191" + "This test might need changes in the yaml definition." +) +def test_description_reference(): + rlist = scanner.scan_directory(os.path.join(UNITTESTDIR, "eln_files/"), + os.path.join(UNITTESTDIR, "eln_cfood.yaml")) + assert rlist[0].description == "This is a sample record." 
-- GitLab From 66595e57881df0097a7d3fc75fdd6c77c54c4c3d Mon Sep 17 00:00:00 2001 From: Alexander Schlemmer <a.schlemmer@indiscale.com> Date: Fri, 17 Jan 2025 16:02:15 +0100 Subject: [PATCH 117/131] TST: deactivated zip file test for outdated eln and added simple zip test --- unittests/test_zipfile_converter.py | 32 ++++++++++++++++++++++++++++ unittests/zip_minimal/empty.zip | Bin 0 -> 536 bytes 2 files changed, 32 insertions(+) create mode 100644 unittests/zip_minimal/empty.zip diff --git a/unittests/test_zipfile_converter.py b/unittests/test_zipfile_converter.py index 9bc8b880..22e4f2ea 100644 --- a/unittests/test_zipfile_converter.py +++ b/unittests/test_zipfile_converter.py @@ -50,6 +50,12 @@ def converter_registry(): return converter_registry +@pytest.mark.xfail( + reason="The example files for PASTA have not yet been updated in:" + "https://github.com/TheELNConsortium/TheELNFileFormat/tree/master/examples/PASTA" + "However, there was the announcement that these files are going to follow the" + "flattened structure soon: https://github.com/TheELNConsortium/TheELNFileFormat/issues/98" +) def test_zipfile_converter(converter_registry): zipfile = File("PASTA.eln", os.path.join(UNITTESTDIR, "eln_files", "PASTA.eln")) zip_conv = ZipFileConverter(yaml.safe_load(""" @@ -77,3 +83,29 @@ match: ^PASTA$ assert isinstance(children[i], Directory) for i in range(2, 5): assert isinstance(children[i], File) + + +def test_zipfile_minimal(converter_registry): + zipfile = File("empty.zip", os.path.join(UNITTESTDIR, "zip_minimal", "empty.zip")) + zip_conv = ZipFileConverter(yaml.safe_load(""" +type: ZipFile +match: .*$ +"""), "TestZipFileConverter", converter_registry) + + match = zip_conv.match(zipfile) + assert match is not None + + children = zip_conv.create_children(GeneralStore(), zipfile) + assert len(children) == 2 + assert children[1].name == "empty.txt" + + dir_conv = DirectoryConverter(yaml.safe_load(""" +type: Directory +match: ^folder$ +"""), "TestDirectory", 
converter_registry) + match = dir_conv.match(children[0]) + assert match is not None + children = dir_conv.create_children(GeneralStore(), children[0]) + assert len(children) == 3 + for i in range(3): + assert isinstance(children[i], File) diff --git a/unittests/zip_minimal/empty.zip b/unittests/zip_minimal/empty.zip new file mode 100644 index 0000000000000000000000000000000000000000..3eb2cee755e1b0265b13b1ee8f31c2aa1abe62de GIT binary patch literal 536 zcmWIWW@Zs#00I66!zeHVO0WazwEUcu)FS-=xFT*eMS=)LX_+~x@yU866(tDeoCK67 zp(^Jfpga*(xpO_Li#dUgNzE-Nse~wFWU^<*?KP+`8QwaASa6RbG-2~9Ow*D^RfHyF zk0Z2W^EyI%2%2_iKp?bZ3krnxW;E^4fI(=-7BmR$n}AHDfMR6>NwET<CL;sGO|Uuu DMZ{6C literal 0 HcmV?d00001 -- GitLab From ca9c9d257f110e15cafaa75e214e6387934cda10 Mon Sep 17 00:00:00 2001 From: "i.nueske" <i.nueske@indiscale.com> Date: Sun, 26 Jan 2025 11:26:33 +0100 Subject: [PATCH 118/131] DOC: Extend ROCrateEntityConverter explanation --- src/doc/converters/further_converters.rst | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/src/doc/converters/further_converters.rst b/src/doc/converters/further_converters.rst index a3c306a2..0fffc2e7 100644 --- a/src/doc/converters/further_converters.rst +++ b/src/doc/converters/further_converters.rst @@ -132,8 +132,16 @@ File object, which can be matched with SimpleFile. And each subpart of the ROCrateEntity is also converted to a ROCrateEntity, which can then again be treated using this converter. -To match a ROCrateEntity using its entity_type, the match_entity_types keyword -can be used. +The ``match_entity_type`` keyword can be used to match a ROCrateEntity using its +entity_type. With the ``match_properties`` keyword, properties of a ROCrateEntity +can be either matched or extracted, as seen in the cfood example below: +* with ``match_properties: "@id": ro-crate-metadata.json`` the ROCrateEntities +can be filtered to only match the metadata json files. 
+* with ``match_properties: dateCreated: (?P<dateCreated>.*)`` the ``dateCreated`` +entry of that metadata json file is extracted and accessible through the +``dateCreated`` variable. +* the example could then be extended to use any other entry present in the metadata +json to filter the results, or insert the extracted information into generated records. Example cfood ------------- -- GitLab From f17c6102b335e41168809db79f96222fc69894f8 Mon Sep 17 00:00:00 2001 From: Alexander Schlemmer <a.schlemmer@indiscale.com> Date: Fri, 31 Jan 2025 17:47:26 +0100 Subject: [PATCH 119/131] TST: fixed tests due to changes to rocrate feature --- src/caoscrawler/converters/rocrate.py | 12 ++++++++---- unittests/test_rocrate_converter.py | 19 +++++++++++++------ 2 files changed, 21 insertions(+), 10 deletions(-) diff --git a/src/caoscrawler/converters/rocrate.py b/src/caoscrawler/converters/rocrate.py index fe403944..f0140fa1 100644 --- a/src/caoscrawler/converters/rocrate.py +++ b/src/caoscrawler/converters/rocrate.py @@ -200,12 +200,16 @@ class ROCrateEntityConverter(Converter): # This is - according to the standard - only allowed, if it's flat, i.e. # it contains a single element with key == "@id" and the id as value which # is supposed to be dereferenced: - if not (len(value) == 1 and "@id" in eprops): + if not (len(value) == 1 and "@id" in value): raise RuntimeError("The JSON-LD is not flat.") - children.append( - ROCrateEntity(element.folder, element.entity.crate.dereference( - value["@id"]))) # TODO: tests missing! 
+ dereferenced = element.entity.crate.dereference(value["@id"]) + if dereferenced is not None: + children.append( + ROCrateEntity(element.folder, dereferenced)) + else: + # This is just an external ID and will be added as simple DictElement + children.append(convert_basic_element(value, name)) else: children.append(convert_basic_element(value, name)) diff --git a/unittests/test_rocrate_converter.py b/unittests/test_rocrate_converter.py index ebf585fb..02f0f602 100644 --- a/unittests/test_rocrate_converter.py +++ b/unittests/test_rocrate_converter.py @@ -144,8 +144,8 @@ match_properties: assert isinstance(children[0], TextElement) assert children[0].name == "@id" assert children[0].value == "ro-crate-metadata.json" - assert isinstance(children[5], DictElement) - assert children[5].value == {'@id': 'https://kadi.iam.kit.edu'} + assert isinstance(children[5], ROCrateEntity) + assert children[5].name == "https://kadi.iam.kit.edu" def test_file(eln_entities): @@ -184,13 +184,20 @@ match_properties: assert match is not None children = ds_parts.create_children(GeneralStore(), ent_parts) + # Number of children = number of properties + number of parts + + # number of variables measured + number of files + assert len(children) == (len(ent_parts.entity.properties()) + + len(ent_parts.entity.properties()["hasPart"]) + + len(ent_parts.entity.properties()["variableMeasured"])) - # Number of children = number of properties + number of parts: - assert len(children) == len(ent_parts.entity.properties()) + 4 entity_children = [f for f in children if isinstance(f, ROCrateEntity)] - assert len(entity_children) == 4 + assert len(entity_children) == 13 + file_counter = 0 + for f in entity_children: - assert isinstance(f.entity, rocrate.model.file.File) + if isinstance(f.entity, rocrate.model.file.File): + file_counter += 1 + assert file_counter == 4 def test_scanner(): -- GitLab From dd3f75bfc0a6304450c625c652a98a310b5ff628 Mon Sep 17 00:00:00 2001 From: Alexander Schlemmer 
<a.schlemmer@indiscale.com> Date: Fri, 31 Jan 2025 17:50:33 +0100 Subject: [PATCH 120/131] DOC: updated changelog --- CHANGELOG.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 34ce1852..1f6f5558 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -38,6 +38,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Changed ### - Registered identifiables can also be used by children of the given RecordType if no registered identifiable is defined for them. +- ROCrate converter supports dereferencing property values with a single "@id"-property during + subtree generation. +- ROCrate converter supports the special property "variablesMeasured" in addition to "hasPart". ### Deprecated ### -- GitLab From 1a3fc544d9a657b195e6f2f9b4a4680557c5ff2a Mon Sep 17 00:00:00 2001 From: Florian Spreckelsen <f.spreckelsen@indiscale.com> Date: Thu, 6 Feb 2025 13:06:25 +0100 Subject: [PATCH 121/131] FIX: Add unit tests and fix for https://gitlab.com/linkahead/linkahead-crawler/-/issues/112 --- src/caoscrawler/converters/converters.py | 5 +- unittests/test_issues.py | 84 ++++++++++++++++++++++-- 2 files changed, 84 insertions(+), 5 deletions(-) diff --git a/src/caoscrawler/converters/converters.py b/src/caoscrawler/converters/converters.py index 09942918..5fcdfda6 100644 --- a/src/caoscrawler/converters/converters.py +++ b/src/caoscrawler/converters/converters.py @@ -1327,7 +1327,10 @@ out: m1 = {} if "match_value" in definition: - m2 = re.match(definition["match_value"], str(value), re.DOTALL) + # None values will be interpreted as empty strings for the + # matcher. 
+ m_value = str(value) if (value is not None and not pd.isna(value)) else "" + m2 = re.match(definition["match_value"], m_value, re.DOTALL) if m2 is None: return None else: diff --git a/unittests/test_issues.py b/unittests/test_issues.py index a6de6540..b7cde016 100644 --- a/unittests/test_issues.py +++ b/unittests/test_issues.py @@ -20,14 +20,44 @@ # along with this program. If not, see <https://www.gnu.org/licenses/>. # -from pytest import mark +import importlib -from caoscrawler.converters import CrawlerTemplate, replace_variables +from pathlib import Path +from pytest import fixture, mark + +from caoscrawler.converters import (CrawlerTemplate, replace_variables, TextElementConverter) from caoscrawler.crawl import Crawler -from caoscrawler.scanner import (create_converter_registry, +from caoscrawler.scanner import (create_converter_registry, scan_directory, scan_structure_elements) from caoscrawler.stores import GeneralStore -from caoscrawler.structure_elements import DictElement +from caoscrawler.structure_elements import DictElement, TextElement + + +UNITTESTDIR = Path(__file__).parent + + +@fixture +def converter_registry(): + converter_registry: dict[str, dict[str, str]] = { + "TextElement": { + "converter": "TextElementConverter", + "package": "caoscrawler.converters"}, + "Directory": { + "converter": "DirectoryConverter", + "package": "caoscrawler.converters"}, + "CSVTableConverter": { + "converter": "CSVTableConverter", + "package": "caoscrawler.converters"}, + "Datetime": { + "converter": "DatetimeElementConverter", + "package": "caoscrawler.converters" + } + } + + for key, value in converter_registry.items(): + module = importlib.import_module(value["package"]) + value["class"] = getattr(module, value["converter"]) + return converter_registry def test_issue_10(): @@ -148,3 +178,49 @@ def test_issue_93(): propvalue_template = CrawlerTemplate(propvalue) assert (propvalue_template.safe_substitute(**values.get_storage()) == f"some text before >> This is 
{exp} << some text after") + + +def test_issue_112(converter_registry): + """Test that empty table cells are not matched in case of + ``match_value: ".+"``. + + See https://gitlab.com/linkahead/linkahead-crawler/-/issues/112. + + """ + tec = TextElementConverter( + name="TestTextConverter", + definition={ + "match_name": ".*", + "match_value": "(?P<content>.+)" + }, + converter_registry=converter_registry + ) + + empty = TextElement(name="empty", value='') + assert tec.match(empty) is None + + empty_none = TextElement(name="empty", value=None) + assert tec.match(empty_none) is None + + non_empty = TextElement(name="empty", value=' ') + matches = tec.match(non_empty) + assert "content" in matches + assert matches["content"] == ' ' + + # Cfood definition for CSV example file + records = scan_directory(UNITTESTDIR / "test_directories" / "examples_tables" / "ExperimentalData", + UNITTESTDIR / "test_directories" / "examples_tables" / "crawler_for_issue_112.yml") + assert records + for rec in records: + print(rec.name) + assert len(rec.parents.filter_by_identity(name="Event")) > 0 + assert rec.name in ["event_a", "event_b", "event_c"] + if rec.name == "event_a": + assert rec.get_property("event_time") is not None + assert rec.get_property("event_time").value == "2025-02-06" + elif rec.name == "event_b": + # This should not have matched + assert rec.get_property("event_time") is None + if rec.name == "event_c": + assert rec.get_property("event_time") is not None + assert rec.get_property("event_time").value == "2025-02-06T09:00:00" -- GitLab From d8cc3a465a384b07c138eaa01340982883479eb0 Mon Sep 17 00:00:00 2001 From: Florian Spreckelsen <f.spreckelsen@indiscale.com> Date: Thu, 6 Feb 2025 13:11:57 +0100 Subject: [PATCH 122/131] DOC: Update changelog --- CHANGELOG.md | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 1a0dea9a..e843e99f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -36,8 +36,12 @@ and this 
project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 their contents were last modified before that datetime. ### Changed ### + - Registered identifiables can also be used by children of the given RecordType if no registered identifiable is defined for them. +- `None` and other NA values (i.e., values where `pandas.isna` is + `True`) are now interpreted as empty strings in + `converters.match_name_and_value` instead of being cast to string naïvely ### Deprecated ### @@ -48,6 +52,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - `spss_to_datamodel` script works again. - The cfood now supports bi-directional references when defining records on the same level. (See: https://gitlab.indiscale.com/caosdb/src/caosdb-crawler/-/issues/175) +- [#112](https://gitlab.com/linkahead/linkahead-crawler/-/issues/112) + Children of CSVTableConverter match despite match_value: ".+" and + empty cell. This has been fixed by treating None and NA values in + `converters.match_name_and_value` (see above). ### Security ### @@ -79,9 +87,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Units for properties. 
They can be specified by giving the property as a dict in the form ```yaml MyRecord: - my_prop: - value: 5 - unit: m + my_prop: + value: 5 + unit: m ``` - Support for Python 3.13 - ROCrateConverter, ELNFileConverter and ROCrateEntityConverter for crawling ROCrate and .eln files -- GitLab From 3ff3181e7aee2dd8bdf39ee74c49a0afadf99a7c Mon Sep 17 00:00:00 2001 From: Florian Spreckelsen <f.spreckelsen@indiscale.com> Date: Thu, 6 Feb 2025 13:12:53 +0100 Subject: [PATCH 123/131] TST: Add test csv and cfood for new unit tests --- .../ExperimentalData/test_with_empty.csv | 4 +++ .../examples_tables/crawler_for_issue_112.yml | 27 +++++++++++++++++++ 2 files changed, 31 insertions(+) create mode 100644 unittests/test_directories/examples_tables/ExperimentalData/test_with_empty.csv create mode 100644 unittests/test_directories/examples_tables/crawler_for_issue_112.yml diff --git a/unittests/test_directories/examples_tables/ExperimentalData/test_with_empty.csv b/unittests/test_directories/examples_tables/ExperimentalData/test_with_empty.csv new file mode 100644 index 00000000..be25239a --- /dev/null +++ b/unittests/test_directories/examples_tables/ExperimentalData/test_with_empty.csv @@ -0,0 +1,4 @@ +event,date +event_a,2025-02-06 +event_b, +event_c,2025-02-06T09:00:00 diff --git a/unittests/test_directories/examples_tables/crawler_for_issue_112.yml b/unittests/test_directories/examples_tables/crawler_for_issue_112.yml new file mode 100644 index 00000000..4bab5ada --- /dev/null +++ b/unittests/test_directories/examples_tables/crawler_for_issue_112.yml @@ -0,0 +1,27 @@ +ExperimentalData: + type: Directory + match: ExperimentalData + subtree: + CSVTable: + type: CSVTableConverter + match: "test_with_empty\\.csv" + subtree: + Row: + type: DictElement + records: + Event: + subtree: + EventName: + type: TextElement + match_name: "event" + match_value: "(?P<name>.*)" + records: + Event: + name: $name + Date: + type: Datetime + match_name: "date" + match_value: "(?P<date>.+)" + 
records: + Event: + event_time: $date -- GitLab From 9f92638120ddc730c2124aa7670c129a5a631c92 Mon Sep 17 00:00:00 2001 From: Daniel <d.hornung@indiscale.com> Date: Thu, 6 Feb 2025 16:19:55 +0100 Subject: [PATCH 124/131] Revert whitespace changes. --- CHANGELOG.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index e843e99f..d04329ed 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -87,9 +87,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Units for properties. They can be specified by giving the property as a dict in the form ```yaml MyRecord: - my_prop: - value: 5 - unit: m + my_prop: + value: 5 + unit: m ``` - Support for Python 3.13 - ROCrateConverter, ELNFileConverter and ROCrateEntityConverter for crawling ROCrate and .eln files -- GitLab From 22b53f7a8c96ff894e1cdc227b35ce21753fa9af Mon Sep 17 00:00:00 2001 From: Daniel <d.hornung@indiscale.com> Date: Thu, 6 Feb 2025 16:35:16 +0100 Subject: [PATCH 125/131] STY, DOC: A few small changes to documentation and regression test. --- src/caoscrawler/converters/converters.py | 8 ++++---- unittests/test_issues.py | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/caoscrawler/converters/converters.py b/src/caoscrawler/converters/converters.py index 5fcdfda6..e16b2c0f 100644 --- a/src/caoscrawler/converters/converters.py +++ b/src/caoscrawler/converters/converters.py @@ -1295,17 +1295,17 @@ class YAMLFileConverter(SimpleFileConverter): def match_name_and_value(definition, name, value): """Take match definitions from the definition argument and apply regular expression to name and - possibly value + possibly value. - one of the keys 'match_name' and "match' needs to be available in definition - 'match_value' is optional + Exactly one of the keys ``match_name`` and ``match`` must exist in ``definition``, + ``match_value`` is optional Returns ------- out: None, if match_name or match lead to no match. 
Otherwise, returns a dictionary with the - matched groups, possibly including matches from using match_value + matched groups, possibly including matches from using `definition["match_value"]` """ if "match_name" in definition: diff --git a/unittests/test_issues.py b/unittests/test_issues.py index b7cde016..779f7771 100644 --- a/unittests/test_issues.py +++ b/unittests/test_issues.py @@ -218,8 +218,8 @@ def test_issue_112(converter_registry): if rec.name == "event_a": assert rec.get_property("event_time") is not None assert rec.get_property("event_time").value == "2025-02-06" - elif rec.name == "event_b": - # This should not have matched + if rec.name == "event_b": + # `date` field is empty, so there must be no match assert rec.get_property("event_time") is None if rec.name == "event_c": assert rec.get_property("event_time") is not None -- GitLab From fcddbe7e09e0bb1b25119fe29451deaab531551a Mon Sep 17 00:00:00 2001 From: Alexander Schlemmer <a.schlemmer@indiscale.com> Date: Thu, 13 Feb 2025 15:25:19 +0100 Subject: [PATCH 126/131] TST: made test_zipfile_minimal more general --- unittests/test_zipfile_converter.py | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/unittests/test_zipfile_converter.py b/unittests/test_zipfile_converter.py index 22e4f2ea..451d23c9 100644 --- a/unittests/test_zipfile_converter.py +++ b/unittests/test_zipfile_converter.py @@ -97,15 +97,26 @@ match: .*$ children = zip_conv.create_children(GeneralStore(), zipfile) assert len(children) == 2 - assert children[1].name == "empty.txt" + + file_obj = None + dir_obj = None + for ch in children: + if isinstance(ch, File): + file_obj = ch + elif isinstance(ch, Directory): + dir_obj = ch + else: + assert False + assert file_obj is not None and dir_obj is not None + assert file_obj.name == "empty.txt" dir_conv = DirectoryConverter(yaml.safe_load(""" type: Directory match: ^folder$ """), "TestDirectory", converter_registry) - match = dir_conv.match(children[0]) + 
match = dir_conv.match(dir_obj) assert match is not None - children = dir_conv.create_children(GeneralStore(), children[0]) + children = dir_conv.create_children(GeneralStore(), dir_obj) assert len(children) == 3 for i in range(3): assert isinstance(children[i], File) -- GitLab From 44eb46ec5975b615191f135d5093269e78c43788 Mon Sep 17 00:00:00 2001 From: Alexander Schlemmer <a.schlemmer@indiscale.com> Date: Thu, 13 Feb 2025 15:35:31 +0100 Subject: [PATCH 127/131] TST: fixed test for description --- unittests/eln_cfood.yaml | 11 +++++++++-- unittests/test_rocrate_converter.py | 6 ------ 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/unittests/eln_cfood.yaml b/unittests/eln_cfood.yaml index ab8e7108..bb29b7da 100644 --- a/unittests/eln_cfood.yaml +++ b/unittests/eln_cfood.yaml @@ -26,11 +26,18 @@ DataDir: "@id": records-example/$ name: (?P<name>.*) keywords: (?P<keywords>.*) - description: (?P<description>.*) dateModified: (?P<dateModified>.*) records: Dataset: name: $name keywords: $keywords - description: $description dateModified: $dateModified + subtree: + Description: + type: ROCrateEntity + match_type: TextObject + match_properties: + text: (?P<description>.*) + records: + Dataset: + description: $description diff --git a/unittests/test_rocrate_converter.py b/unittests/test_rocrate_converter.py index 02f0f602..4b6bde17 100644 --- a/unittests/test_rocrate_converter.py +++ b/unittests/test_rocrate_converter.py @@ -213,12 +213,6 @@ def test_scanner(): assert rlist[0].get_property("dateModified").value == "2024-08-21T11:43:17.626965+00:00" -@pytest.mark.xfail( - reason="The description is no longer a simple string, but a reference to another record." - "The rocrate converter will be able to dereference this as soon as this feature is implemented:" - "https://gitlab.indiscale.com/caosdb/src/caosdb-crawler/-/issues/191" - "This test might need changes in the yaml definition." 
-) def test_description_reference(): rlist = scanner.scan_directory(os.path.join(UNITTESTDIR, "eln_files/"), os.path.join(UNITTESTDIR, "eln_cfood.yaml")) -- GitLab From 2c2d711943c258efd21f39ddd0f68e86235752c0 Mon Sep 17 00:00:00 2001 From: Alexander Schlemmer <a.schlemmer@indiscale.com> Date: Thu, 13 Feb 2025 15:40:50 +0100 Subject: [PATCH 128/131] DOC: added a comment to child generation behavior in rocrate converter --- src/caoscrawler/converters/rocrate.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/src/caoscrawler/converters/rocrate.py b/src/caoscrawler/converters/rocrate.py index f0140fa1..7dcad865 100644 --- a/src/caoscrawler/converters/rocrate.py +++ b/src/caoscrawler/converters/rocrate.py @@ -202,7 +202,6 @@ class ROCrateEntityConverter(Converter): # is supposed to be dereferenced: if not (len(value) == 1 and "@id" in value): raise RuntimeError("The JSON-LD is not flat.") - # TODO: tests missing! dereferenced = element.entity.crate.dereference(value["@id"]) if dereferenced is not None: children.append( @@ -225,8 +224,6 @@ class ROCrateEntityConverter(Converter): children.append( ROCrateEntity(element.folder, element.entity.crate.dereference( p["@id"]))) - # TODO: it feels a bit strange to add (especially variableMeasured) directly, top-level - # and not in a sub-DictElement. The latter would be difficult to realize, because - # resolving the sub-references is not straight-forward. + # TODO: See https://gitlab.indiscale.com/caosdb/src/caosdb-crawler/-/issues/195 for discussion. 
return children -- GitLab From b34c2178495aefad84026afce786fcfa8bd2e242 Mon Sep 17 00:00:00 2001 From: Daniel <d.hornung@indiscale.com> Date: Fri, 14 Feb 2025 09:48:33 +0100 Subject: [PATCH 129/131] WIP: Fix pipeline: Update MariaDB version --- .docker/docker-compose.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.docker/docker-compose.yml b/.docker/docker-compose.yml index 02ccac5c..97f70320 100644 --- a/.docker/docker-compose.yml +++ b/.docker/docker-compose.yml @@ -1,7 +1,7 @@ version: '3.7' services: sqldb: - image: mariadb:10.4 + image: mariadb:11.4 environment: MYSQL_ROOT_PASSWORD: caosdb1234 networks: -- GitLab From 75a5736838a7fd4fc58623fd23f1eda72bff9d2b Mon Sep 17 00:00:00 2001 From: Florian Spreckelsen <f.spreckelsen@indiscale.com> Date: Wed, 5 Mar 2025 14:30:12 +0100 Subject: [PATCH 130/131] DOC: Update changelog for release --- CHANGELOG.md | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 00252658..f54c2a25 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,7 +5,7 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). -## [Unreleased] ## +## [0.11.0] - 2025-03-05 ## ### Added ### @@ -46,10 +46,6 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 `True`) are now interpreted as empty strings in `converters.match_name_and_value` instead of being cast to string naïvely -### Deprecated ### - -### Removed ### - ### Fixed ### - `spss_to_datamodel` script works again. @@ -60,9 +56,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 empty cell. This has been fixed by treating None and NA values in `converters.match_name_and_value` (see above). 
-### Security ### - ### Documentation ### + - Added documentation for ROCrateConverter, ELNFileConverter, and ROCrateEntityConverter ## [0.10.1] - 2024-11-13 ## -- GitLab From aa8a0f90024960a73222c69854b538c85fb160be Mon Sep 17 00:00:00 2001 From: Florian Spreckelsen <f.spreckelsen@indiscale.com> Date: Wed, 5 Mar 2025 14:31:44 +0100 Subject: [PATCH 131/131] REL: Fix versions for release --- CITATION.cff | 4 ++-- setup.cfg | 2 +- src/doc/conf.py | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/CITATION.cff b/CITATION.cff index ed859432..8f4e22a4 100644 --- a/CITATION.cff +++ b/CITATION.cff @@ -17,6 +17,6 @@ authors: given-names: Alexander orcid: https://orcid.org/0000-0003-4124-9649 title: CaosDB - Crawler -version: 0.10.1 +version: 0.11.0 doi: 10.3390/data9020024 -date-released: 2024-11-13 \ No newline at end of file +date-released: 2025-03-05 \ No newline at end of file diff --git a/setup.cfg b/setup.cfg index ae138a9a..da645c0d 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,6 +1,6 @@ [metadata] name = caoscrawler -version = 0.10.2 +version = 0.11.0 author = Alexander Schlemmer author_email = alexander.schlemmer@ds.mpg.de description = A new crawler for LinkAhead diff --git a/src/doc/conf.py b/src/doc/conf.py index 01ca66bf..a1e9dbde 100644 --- a/src/doc/conf.py +++ b/src/doc/conf.py @@ -33,10 +33,10 @@ copyright = '2024, IndiScale' author = 'Alexander Schlemmer' # The short X.Y version -version = '0.10.2' +version = '0.11.0' # The full version, including alpha/beta/rc tags # release = '0.5.2-rc2' -release = '0.10.2-dev' +release = '0.11.0' # -- General configuration --------------------------------------------------- -- GitLab