diff --git a/.docker/Dockerfile b/.docker/Dockerfile index 1e9763f3496c9dca6cc33e6ba8217a654bed487e..1468a17feb16940ae658d3ca6b885af7139ce3d8 100644 --- a/.docker/Dockerfile +++ b/.docker/Dockerfile @@ -34,7 +34,7 @@ RUN rm -r /git/.git # Install pycaosdb.ini for the tests RUN mv /git/.docker/tester_pycaosdb.ini /git/integrationtests/pycaosdb.ini -RUN cd /git/ && pip3 install --break-system-packages .[h5-crawler,spss] +RUN cd /git/ && pip3 install --break-system-packages .[h5-crawler,spss,rocrate] WORKDIR /git/integrationtests # wait for server, diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 8812abacc0ef157c418e8f658a4fa7261bb04743..e43223568252b2e7a1504610692fe20dc9d78348 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -121,27 +121,21 @@ unittest_py3.11: - python3 -c "import sys; assert sys.version.startswith('3.11')" - tox -unittest_py3.8: +unittest_py3.9: tags: [cached-dind] stage: test - image: python:3.8 + image: python:3.9 script: &python_test_script # install dependencies - pip install pytest pytest-cov # TODO: Use f-branch logic here - pip install git+https://gitlab.indiscale.com/caosdb/src/caosdb-pylib.git@dev - pip install git+https://gitlab.indiscale.com/caosdb/src/caosdb-advanced-user-tools.git@dev - - pip install .[h5-crawler,spss] + - pip install .[h5-crawler,spss,rocrate] # actual test - caosdb-crawler --help - pytest --cov=caosdb -vv ./unittests -unittest_py3.9: - tags: [cached-dind] - stage: test - image: python:3.9 - script: *python_test_script - unittest_py3.10: tags: [cached-dind] stage: test @@ -155,23 +149,10 @@ unittest_py3.12: script: *python_test_script unittest_py3.13: - allow_failure: true tags: [cached-dind] stage: test - image: python:3.13-rc - script: - # TODO: Replace by '*python_test_script' as soon as 3.13 has been officially released. - # TODO Remove the "!" 
after 3.13 release, which serves as an xfail - apt update && apt install -y cargo - # install dependencies - - pip install pytest pytest-cov - # TODO: Use f-branch logic here - - pip install git+https://gitlab.indiscale.com/caosdb/src/caosdb-pylib.git@dev - - (! pip install git+https://gitlab.indiscale.com/caosdb/src/caosdb-advanced-user-tools.git@dev) - - (! pip install .[h5-crawler,spss]) - # actual test - - (! caosdb-crawler --help) - - (! pytest --cov=caosdb -vv ./unittests) + image: python:3.13 + script: *python_test_script inttest: tags: [docker] diff --git a/CHANGELOG.md b/CHANGELOG.md index 978de14e872bbf2c6a575462ae322e4f5d58e79d..ecf7350b87b754b3f5a0a7c129aec8e72be5bebd 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,56 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## [0.10.0] - 2024-11-13 ## + +### Added ### + +- XMLTextNodeConverter for converting text nodes created by XMLTagConverter +- XMLAttributeNodeConverter for converting attribute nodes created by XMLTagConverter +- Units for properties. They can be specified by giving the property as a dict in the form + ```yaml + MyRecord: + my_prop: + value: 5 + unit: m + ``` +- Support for Python 3.13 +- ROCrateConverter, ELNFileConverter and ROCrateEntityConverter for crawling ROCrate and .eln files +- `max_log_level` parameter to `logging.configure_server_side_logging` + to control the server-side debuglog's verbosity, and an optional + `sss_max_log_level` parameter to `crawler_main` to control the SSS + loglevel separately from the global `debug` option. + +### Changed ### + +- Property values specified by dicts do not have to contain a + `collection_mode` key anymore. 
If none is given, the + `collection_mode` is determined from the `value` as it is done for + values specified by strings: + - if `value` starts with '+', collection mode is "list". + - if `value` starts with '*', collection mode is "multiproperty". + - in all other cases, collection mode is "single". +- The default server-side scripting debug level is now controlled by + the global `debug` option by default and set to log level `INFO` in + case of `debug=False`. The previous behavior can be restored by + calling `crawler_main` with `sss_max_log_level=logging.DEBUG`. + +### Removed ### + +* Support for Python 3.8 (end of life) + +### Fixed ### + +- Added better error message for some cases of broken converter and + record definitions. +- [#108](https://gitlab.com/linkahead/linkahead-crawler/-/issues/108) + Too verbose server-side scripting logs that could lead to high disk + usage. + +### Documentation ### + +- Tutorial on crawling a simple CSV file + ## [0.9.1] - 2024-09-26 ## ### Fixed ### diff --git a/CITATION.cff b/CITATION.cff index 99756999f4a42818510fffbe1d02e1bf4396540b..fc3a2ee634f1896a7f8145fdc3a5f965e3c91e47 100644 --- a/CITATION.cff +++ b/CITATION.cff @@ -17,6 +17,6 @@ authors: given-names: Alexander orcid: https://orcid.org/0000-0003-4124-9649 title: CaosDB - Crawler -version: 0.9.1 +version: 0.10.0 doi: 10.3390/data9020024 -date-released: 2024-09-26 \ No newline at end of file +date-released: 2024-11-13 \ No newline at end of file diff --git a/integrationtests/test_issues.py b/integrationtests/test_issues.py index 76392f3a4ce20d7ed6b6ccc30c79f1ce400001f7..cb1e2e0925dd85b9f6cadf2b56b22aface4bb468 100644 --- a/integrationtests/test_issues.py +++ b/integrationtests/test_issues.py @@ -28,6 +28,8 @@ from linkahead.cached import cache_clear from linkahead.utils.register_tests import clear_database, set_test_key from pytest import fixture, mark, raises +import tempfile + set_test_key("10b128cf8a1372f30aa3697466bb55e76974e0c16a599bb44ace88f19c8f61e2") @@ 
-328,3 +330,46 @@ def test_indiscale_87(clear_database): print(db.apiutils.compare_entities(rec, retrieved)) assert db.apiutils.empty_diff(rec, retrieved) print("---") + + +def test_issue_14(clear_database): + """ + Issue title: Some parent updates are required before inserts + + https://gitlab.indiscale.com/caosdb/src/caosdb-crawler/-/issues/14 + """ + + rt1 = db.RecordType(name="RT1") + rt2 = db.RecordType(name="RT2").insert() + rt1.add_property(rt2, importance=db.OBLIGATORY) + rt1.insert() + + r = db.Record() + r.add_parent(rt1) + with tempfile.NamedTemporaryFile() as tmpf: + f = db.File(name="test_parent", path="parent_test/file.txt", file=tmpf.name) + f.insert() + + # We create a clean new file object here: + f2 = db.File(name="test_parent", path="parent_test/file.txt", file=tmpf.name) + + f2.add_parent(rt2) + r.add_property(name="RT2", value=f2) + + # Current state in the database: File without parents + f_test_base = db.File(name="test_parent").retrieve() + assert len(f_test_base.parents) == 0 + assert len(db.execute_query("FIND Record")) == 0 + + ident = CaosDBIdentifiableAdapter() + ident.register_identifiable("RT1", db.RecordType().add_parent( + name="RT1").add_property(name="RT2")) + crawler = Crawler(identifiableAdapter=ident) + crawler.synchronize(crawled_data=[f2, r]) + + f_test = db.File(name="test_parent").retrieve() + assert len(f_test.parents) == 1 + assert f_test.parents[0].name == "RT2" + records = db.execute_query("FIND Record") + assert len(records) == 1 + assert records[0].get_property("RT2").value == f_test.id diff --git a/setup.cfg b/setup.cfg index bac5bf79af0ce6f31f7748f21fa2175e5444104b..29b576ed789d5e77ec71eb372a8c9d98b3189ecd 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,6 +1,6 @@ [metadata] name = caoscrawler -version = 0.9.1 +version = 0.10.0 author = Alexander Schlemmer author_email = alexander.schlemmer@ds.mpg.de description = A new crawler for LinkAhead @@ -17,11 +17,11 @@ classifiers = package_dir = = src packages = find: 
-python_requires = >=3.8 +python_requires = >=3.9 install_requires = caosadvancedtools >= 0.7.0 importlib-resources - linkahead > 0.13.2 + linkahead >= 0.16.0 odfpy #make optional packaging pandas @@ -49,3 +49,5 @@ h5-crawler = numpy spss = pandas[spss] +rocrate = + rocrate @ git+https://github.com/salexan2001/ro-crate-py.git@f-automatic-dummy-ids diff --git a/src/caoscrawler/authorize.py b/src/caoscrawler/authorize.py index 6f1011b227881d4b73186996076abe20d94d52e5..f3deed4f8c78afa85fdd4471fe9383760b8c8b12 100644 --- a/src/caoscrawler/authorize.py +++ b/src/caoscrawler/authorize.py @@ -19,10 +19,10 @@ # along with this program. If not, see <https://www.gnu.org/licenses/>. # -from caosadvancedtools.crawler import Crawler as OldCrawler - import argparse +from caosadvancedtools.crawler import Crawler as OldCrawler + def parse_args(): parser = argparse.ArgumentParser() diff --git a/src/caoscrawler/cfood-schema.yml b/src/caoscrawler/cfood-schema.yml index acc3911f21d320146d0c35abc9d781541ee151ac..c5e0eaad092c12efbceb5f55b62b3d7cf8afdccf 100644 --- a/src/caoscrawler/cfood-schema.yml +++ b/src/caoscrawler/cfood-schema.yml @@ -76,6 +76,7 @@ cfood: - XMLFile - XMLTag - XMLTextNode + - XMLAttributeNode - PropertiesFromDictElement description: Type of this converter node. match: @@ -134,6 +135,9 @@ cfood: value: description: Dictionary notation for variable values. Values can be given by a variable which is indicated by an initial "$". Use "$$" for setting values actually starting with a dollar sign. type: string + unit: + description: The unit of this property. Units can be given by a variable which is indicated by an initial "$". Use "$$" for setting values actually starting with a dollar sign. + type: string collection_mode: description: The collection mode defines whether the resulting property will be a single property or whether the values of multiple structure elements will be collected either into a list or a multiproperty. 
enum: diff --git a/src/caoscrawler/converters/__init__.py b/src/caoscrawler/converters/__init__.py index 540a4cfca9ff19248baab2bc0fe8d10987d4bd1f..670d4e966c72c6bcf45d0d46c1db715fb79d8ab5 100644 --- a/src/caoscrawler/converters/__init__.py +++ b/src/caoscrawler/converters/__init__.py @@ -30,3 +30,17 @@ except ImportError as err: SPSSConverter: type = utils.MissingImport( name="SPSSConverter", hint="Try installing with the `spss` extra option.", err=err) + +try: + from .rocrate import (ELNFileConverter, ROCrateConverter, + ROCrateEntityConverter) +except ImportError as err: + ROCrateEntityConverter: type = utils.MissingImport( + name="ROCrateEntityConverter", hint="Try installing with the `rocrate` extra option.", + err=err) + ROCrateConverter: type = utils.MissingImport( + name="ROCrateConverter", hint="Try installing with the `rocrate` extra option.", + err=err) + ELNFileConverter: type = utils.MissingImport( + name="ELNFileConverter", hint="Try installing with the `rocrate` extra option.", + err=err) diff --git a/src/caoscrawler/converters/converters.py b/src/caoscrawler/converters/converters.py index f31a0f4463ea805472044e5bd7697ed1316d1d9b..64a557ce4e26fd8bfd345000d3abf18bf0360117 100644 --- a/src/caoscrawler/converters/converters.py +++ b/src/caoscrawler/converters/converters.py @@ -169,64 +169,84 @@ Parameters ---------- value: Union[dict, str, list] - - If *str*, the value to be interpreted. E.g. "4", "hello" or "$a" etc. - - If *dict*, must have keys ``value`` and ``collection_mode``. The returned tuple is directly - created from the corresponding values. - - If *list*, each element is checked for replacement and the resulting list will be used - as (list) value for the property + - If *str*, the value to be interpreted. E.g. "4", "hello" or "$a" + etc. 
No unit is set and collection mode is determined from the + first character: + - '+' corresponds to "list" + - '*' corresponds to "multiproperty" + - everything else is "single" + - If *dict*, it must have a ``value`` key and may have ``unit`` and + ``collection_mode`` keys. The returned tuple is directly created from + the corresponding values if they are given; ``unit`` defaults to + None and ``collection_mode`` is determined from ``value`` as + explained for the str case above, i.e., + - if it starts with '+', collection mode is "list", + - in case of '*', collection mode is "multiproperty", + - and everything else is "single". + - If *list*, each element is checked for variable replacement and the + resulting list will be used as (list) value for the property Returns ------- out: tuple - the final value of the property; variable names contained in `values` are replaced. + - the final unit of the property; variable names contained in `values` are replaced. - the collection mode (can be single, list or multiproperty) """ # @review Florian Spreckelsen 2022-05-13 + propunit = None + propvalue = None + collection_mode = None if isinstance(value, dict): if "value" not in value: # TODO: how do we handle this case? Just ignore? # or disallow? raise NotImplementedError(f"This definition has no \"value\": {value}") propvalue = value["value"] + if "unit" in value: + propunit = replace_variables(value["unit"], values) # can be "single", "list" or "multiproperty" - collection_mode = value["collection_mode"] - elif isinstance(value, str): - propvalue = value - collection_mode = "single" - if propvalue.startswith("+"): - collection_mode = "list" - propvalue = propvalue[1:] - elif propvalue.startswith("*"): - collection_mode = "multiproperty" - propvalue = propvalue[1:] - elif isinstance(value, list): - # TODO: (for review) - # This is a bit dirty right now and needed for - # being able to directly set list values. Semantics is, however, a bit - # different from the two cases above. 
- collection_mode = "single" - - # variables replacement: - propvalue = list() - for element in value: - # Do the element-wise replacement only, when its type is string: - if isinstance(element, str): - propvalue.append(replace_variables(element, values)) - else: - propvalue.append(element) - - return (propvalue, collection_mode) + if "collection_mode" in value: + collection_mode = value["collection_mode"] else: - # value is another simple type - collection_mode = "single" propvalue = value - # Return it immediately, otherwise variable substitution would be done and fail: - return (propvalue, collection_mode) + if collection_mode is None: + if isinstance(propvalue, str): + # Determine collection mode from string value + collection_mode = "single" + if propvalue.startswith("+"): + collection_mode = "list" + propvalue = propvalue[1:] + elif propvalue.startswith("*"): + collection_mode = "multiproperty" + propvalue = propvalue[1:] + elif isinstance(propvalue, list): + # TODO: (for review) + # This is a bit dirty right now and needed for + # being able to directly set list values. Semantics is, however, a bit + # different from the two cases above. 
+ collection_mode = "single" + + # variables replacement: + returnvalue = list() + for element in propvalue: + # Do the element-wise replacement only, when its type is string: + if isinstance(element, str): + returnvalue.append(replace_variables(element, values)) + else: + returnvalue.append(element) + + return (returnvalue, propunit, collection_mode) + else: + # value is another simple type + collection_mode = "single" + # Return it immediately, otherwise variable substitution would be done and fail: + return (propvalue, propunit, collection_mode) propvalue = replace_variables(propvalue, values) - return (propvalue, collection_mode) + return (propvalue, propunit, collection_mode) def create_records(values: GeneralStore, records: RecordStore, def_records: dict): @@ -268,6 +288,9 @@ def create_records(values: GeneralStore, records: RecordStore, def_records: dict c_record = records[name] + if isinstance(record, str): + raise RuntimeError( + "dict expected, but found str: {}".format(record)) for key, value in record.items(): if key == "parents" or key == "role": continue @@ -277,7 +300,7 @@ def create_records(values: GeneralStore, records: RecordStore, def_records: dict key = key_template.safe_substitute(**values.get_storage()) keys_modified.append((name, key)) - propvalue, collection_mode = handle_value(value, values) + propvalue, propunit, collection_mode = handle_value(value, values) if key.lower() in SPECIAL_PROPERTIES: # e.g. description, name, etc. 
@@ -291,17 +314,26 @@ def create_records(values: GeneralStore, records: RecordStore, def_records: dict else: if c_record.get_property(key) is None: if collection_mode == "list": - c_record.add_property(name=key, value=[propvalue]) + c_record.add_property(name=key, value=[propvalue], unit=propunit) elif (collection_mode == "multiproperty" or collection_mode == "single"): - c_record.add_property(name=key, value=propvalue) + c_record.add_property(name=key, value=propvalue, unit=propunit) else: if collection_mode == "list": + if propunit and c_record.get_property(key).unit and propunit != c_record.get_property(key).unit: + raise RuntimeError( + f"Property '{key}' has contradictory units: " + f"{propunit} and {c_record.get_property(key).unit}" + ) c_record.get_property(key).value.append(propvalue) + if propunit and not c_record.get_property(key).unit: + c_record.get_property(key).unit = propunit elif collection_mode == "multiproperty": - c_record.add_property(name=key, value=propvalue) + c_record.add_property(name=key, value=propvalue, unit=propunit) elif collection_mode == "single": c_record.get_property(key).value = propvalue + if propunit: + c_record.get_property(key).unit = propunit # no matter whether the record existed in the record store or not, # parents will be added when they aren't present in the record yet: @@ -368,6 +400,15 @@ class Converter(object, metaclass=ABCMeta): self.converters.append(Converter.converter_factory( converter_definition, converter_name, converter_registry)) + self.setup() + + def setup(self): + """ + Analogous to `cleanup`. Can be used to set up variables that are permanently + stored in this converter. + """ + pass + @staticmethod def converter_factory(definition: dict, name: str, converter_registry: dict): """Create a Converter instance of the appropriate class. @@ -375,6 +416,10 @@ class Converter(object, metaclass=ABCMeta): The `type` key in the `definition` defines the Converter class which is being used. 
""" + if definition is None: + raise RuntimeError("Definition of converter \"{}\" is " + "empty".format(name)) + if "type" not in definition: raise RuntimeError( "Type is mandatory for converter entries in CFood definition.") @@ -583,6 +628,13 @@ class Converter(object, metaclass=ABCMeta): """ pass + def cleanup(self): + """ + This function is called when the converter runs out of scope and can be used to + clean up objects that were needed in the converter or its children. + """ + pass + class DirectoryConverter(Converter): """ diff --git a/src/caoscrawler/converters/hdf5_converter.py b/src/caoscrawler/converters/hdf5_converter.py index a4d974bd53fc4b0e22d155f01a6a47295b79e984..97dac53d053dbcb87c48f0cfb59d4f09770b9710 100644 --- a/src/caoscrawler/converters/hdf5_converter.py +++ b/src/caoscrawler/converters/hdf5_converter.py @@ -28,16 +28,16 @@ except ModuleNotFoundError: "its optional `h5-crawler` dependency?" ) -import numpy as np - from typing import Union import linkahead as db +import numpy as np -from .converters import (convert_basic_element, Converter, DictElementConverter, - match_name_and_value, SimpleFileConverter) from ..stores import GeneralStore, RecordStore -from ..structure_elements import DictElement, File, FloatElement, IntegerElement, StructureElement +from ..structure_elements import (DictElement, File, FloatElement, + IntegerElement, StructureElement) +from .converters import (Converter, DictElementConverter, SimpleFileConverter, + convert_basic_element, match_name_and_value) def convert_attributes(elt: Union[h5py.File, h5py.Group, h5py.Dataset]): diff --git a/src/caoscrawler/converters/rocrate.py b/src/caoscrawler/converters/rocrate.py new file mode 100644 index 0000000000000000000000000000000000000000..b84462acba2fdd7e60094e38edc38605c80deb11 --- /dev/null +++ b/src/caoscrawler/converters/rocrate.py @@ -0,0 +1,224 @@ +# encoding: utf-8 +# +# This file is a part of the LinkAhead Project. 
+# +# Copyright (C) 2024 Indiscale GmbH <info@indiscale.com> +# Copyright (C) 2024 Alexander Schlemmer +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <https://www.gnu.org/licenses/>. + +"""Converters take structure elements and create Records and new structure elements from them. + +This converter converts ro-crate files which may also be .eln-files. + +""" + +from __future__ import annotations + +import os +import re +import tempfile +from typing import Optional +from zipfile import ZipFile + +import linkahead as db +import rocrate +from rocrate.rocrate import ROCrate + +from ..stores import GeneralStore, RecordStore +from ..structure_elements import (Directory, File, ROCrateEntity, + StructureElement) +from .converters import (Converter, ConverterValidationError, + SimpleFileConverter, convert_basic_element) + + +class ROCrateConverter(SimpleFileConverter): + + """Convert ro-crate files / directories. + """ + + def setup(self): + self._tempdir = None + + def cleanup(self): + self._tempdir.cleanup() + + def typecheck(self, element: StructureElement): + """ + Check whether the current structure element can be converted using + this converter. 
+ """ + return isinstance(element, File) or isinstance(element, Directory) + + def match(self, element: StructureElement) -> Optional[dict]: + m = re.match(self.definition["match"], element.name) + if m is None: + return None + return m.groupdict() + + def create_children(self, generalStore: GeneralStore, element: StructureElement): + """ + Loads an ROCrate from an rocrate file or directory. + + Arguments: + ---------- + element must be a File or Directory (structure element). + + Returns: + -------- + A list with an ROCrateElement representing the contents of the .eln-file or None + in case of errors. + """ + + if isinstance(element, File): + self._tempdir = tempfile.TemporaryDirectory() + with ZipFile(element.path) as zipf: + zipf.extractall(self._tempdir.name) + crate_path = self._tempdir.name + crate = ROCrate(crate_path) + entity_ls = [] + for ent in crate.get_entities(): + entity_ls.append(ROCrateEntity(crate_path, ent)) + return entity_ls + elif isinstance(element, Directory): + # This would be an unzipped .eln file + # As this is possible for rocrate files, I think it is reasonable + # to support it as well. + raise NotImplementedError() + else: + raise ValueError("create_children was called with wrong type of StructureElement") + return None + + +class ELNFileConverter(ROCrateConverter): + + """Convert .eln-Files + See: https://github.com/TheELNConsortium/TheELNFileFormat + + These files are basically RO-Crates with some minor differences: + - The ro-crate metadata file is not on top-level within the .eln-zip-container, + but in a top-level subdirectory. + """ + + def create_children(self, generalStore: GeneralStore, element: StructureElement): + """ + Loads an ROCrate from an .eln-file or directory. + + This involves unzipping the .eln-file to a temporary folder and creating an ROCrate object + from its contents. + + Arguments: + ---------- + element must be a File or Directory (structure element). 
+ + Returns: + -------- + A list with an ROCrateElement representing the contents of the .eln-file or None + in case of errors. + """ + + if isinstance(element, File): + self._tempdir = tempfile.TemporaryDirectory() + with ZipFile(element.path) as zipf: + zipf.extractall(self._tempdir.name) + cratep = os.listdir(self._tempdir.name) + if len(cratep) != 1: + raise RuntimeError(".eln file must contain exactly one folder") + crate_path = os.path.join(self._tempdir.name, cratep[0]) + crate = ROCrate(crate_path) + entity_ls = [] + for ent in crate.get_entities(): + entity_ls.append(ROCrateEntity(crate_path, ent)) + return entity_ls + elif isinstance(element, Directory): + # This would be an unzipped .eln file + # As this is possible for rocrate files, I think it is reasonable + # to support it as well. + raise NotImplementedError() + else: + raise ValueError("create_children was called with wrong type of StructureElement") + return None + + +class ROCrateEntityConverter(Converter): + + def typecheck(self, element: StructureElement): + """ + Check whether the current structure element can be converted using + this converter. + """ + return isinstance(element, ROCrateEntity) + + def match(self, element: StructureElement) -> Optional[dict]: + # See https://gitlab.indiscale.com/caosdb/src/caosdb-crawler/-/issues/145 + # for a suggestion for the design of the matching algorithm. 
+ if not isinstance(element, ROCrateEntity): + raise TypeError("Element must be an instance of ROCrateEntity.") + + # Store the result of all individual regexp variable results: + vardict = {} + + if "match_entity_type" in self.definition: + m_type = re.match(self.definition["match_entity_type"], element.type) + if m_type is None: + return None + vardict.update(m_type.groupdict()) + + if "match_properties" in self.definition: + # This matcher works analogously to the attributes matcher in the XMLConverter + for prop_def_key, prop_def_value in self.definition["match_properties"].items(): + match_counter = 0 + matched_m_prop = None + matched_m_prop_value = None + for prop_key, prop_value in element.entity.properties().items(): + m_prop = re.match(prop_def_key, prop_key) + if m_prop is not None: + match_counter += 1 + matched_m_prop = m_prop + m_prop_value = re.match(prop_def_value, prop_value) + if m_prop_value is None: + return None + matched_m_prop_value = m_prop_value + if match_counter == 0: + return None + elif match_counter > 1: + raise RuntimeError("Multiple properties match the same match_prop entry.") + vardict.update(matched_m_prop.groupdict()) + vardict.update(matched_m_prop_value.groupdict()) + + return vardict + + def create_children(self, generalStore: GeneralStore, element: StructureElement): + + children = [] + + eprops = element.entity.properties() + + # Add the properties: + for name, value in eprops.items(): + children.append(convert_basic_element(value, name)) + + # Add the files: + if isinstance(element.entity, rocrate.model.file.File): + path, name = os.path.split(eprops["@id"]) + children.append(File(name, os.path.join(element.folder, path, name))) + + # Parts of this entity are added as child entities: + if "hasPart" in eprops: + for p in eprops["hasPart"]: + children.append( + ROCrateEntity(element.folder, element.entity.crate.dereference( + p["@id"]))) + + return children diff --git a/src/caoscrawler/converters/spss.py 
b/src/caoscrawler/converters/spss.py index b4f03aeaed6663be98487a4780bb96237e72e27e..00742e91506245435ed0c590f68ea9ffce65717a 100644 --- a/src/caoscrawler/converters/spss.py +++ b/src/caoscrawler/converters/spss.py @@ -22,17 +22,16 @@ from __future__ import annotations # Can be removed with 3.10. import argparse from collections import OrderedDict +from typing import Any, Optional import numpy as np import pandas as pd import pyreadstat import yaml -from . import converters from ..stores import GeneralStore -from ..structure_elements import (File, StructureElement) -from typing import Optional, Any - +from ..structure_elements import File, StructureElement +from . import converters READSTAT_TYPES = { "double": "DOUBLE", diff --git a/src/caoscrawler/converters/xml_converter.py b/src/caoscrawler/converters/xml_converter.py index d1d8b8871f9dad9762f35ee79e1a9106c259f4a9..b9f7487ee633d0ba25a3b81b78b9a3561274edc9 100644 --- a/src/caoscrawler/converters/xml_converter.py +++ b/src/caoscrawler/converters/xml_converter.py @@ -22,17 +22,17 @@ from __future__ import annotations -import lxml.etree import re - from typing import Optional import linkahead as db +import lxml.etree -from .converters import SimpleFileConverter, ConverterValidationError, Converter from ..stores import GeneralStore, RecordStore -from ..structure_elements import (File, StructureElement, - XMLTagElement, XMLTextNode, XMLAttributeNode) +from ..structure_elements import (File, StructureElement, XMLAttributeNode, + XMLTagElement, XMLTextNode) +from .converters import (Converter, ConverterValidationError, + SimpleFileConverter) class XMLFileConverter(SimpleFileConverter): @@ -183,6 +183,7 @@ class XMLTagConverter(Converter): # - Require unique attribute-key and attribute-value matches: Very complex # - Only allow one single attribute-key to match and run attribute-value match separately. # Currently the latter option is implemented. + # TODO: The ROCrateEntityConverter implements a very similar behavior. 
if match_counter == 0: return None elif match_counter > 1: @@ -195,7 +196,10 @@ class XMLTagConverter(Converter): class XMLTextNodeConverter(Converter): def create_children(self, generalStore: GeneralStore, element: StructureElement): - raise NotImplementedError() + """ + This converter does not create children. + """ + return [] def typecheck(self, element: StructureElement): """ @@ -210,6 +214,47 @@ class XMLTextNodeConverter(Converter): if not isinstance(element, XMLTextNode): raise TypeError("Element must be an instance of XMLTextNode.") - raise NotImplementedError() + vardict = {} + + m_text = re.match(self.definition["match_text"], element.value, + re.DOTALL) + if m_text is None: + return None + vardict.update(m_text.groupdict()) - return None + return vardict + + +class XMLAttributeNodeConverter(Converter): + def create_children(self, generalStore: GeneralStore, element: StructureElement): + """ + This converter does not create children. + """ + return [] + + def typecheck(self, element: StructureElement): + """ + Check whether the current structure element can be converted using + this converter. + """ + return isinstance(element, XMLAttributeNode) + + def match(self, element: StructureElement) -> Optional[dict]: + # See https://gitlab.indiscale.com/caosdb/src/caosdb-crawler/-/issues/145 + # for a suggestion for the design of the matching algorithm. 
+ if not isinstance(element, XMLAttributeNode): + raise TypeError("Element must be an instance of XMLAttributeNode.") + + vardict = {} + + m_name = re.match(self.definition["match_name"], element.key) + if m_name is None: + return None + vardict.update(m_name.groupdict()) + + m_value = re.match(self.definition["match_value"], element.value) + if m_value is None: + return None + vardict.update(m_value.groupdict()) + + return vardict diff --git a/src/caoscrawler/crawl.py b/src/caoscrawler/crawl.py index a449a779ed7979719bbe0ac780adecf0e4fec8f6..a79e4434ee8f58fd1cc2646ced85c0d02d3fb66b 100644 --- a/src/caoscrawler/crawl.py +++ b/src/caoscrawler/crawl.py @@ -39,7 +39,6 @@ import sys import traceback import uuid import warnings - from argparse import RawTextHelpFormatter from copy import deepcopy from datetime import datetime @@ -52,13 +51,10 @@ from caosadvancedtools.cache import UpdateCache from caosadvancedtools.crawler import Crawler as OldCrawler from caosadvancedtools.serverside.helper import send_mail from caosadvancedtools.utils import create_entity_link -from linkahead.apiutils import (compare_entities, - merge_entities) +from linkahead.apiutils import compare_entities, merge_entities from linkahead.cached import cache_clear, cached_get_entity_by from linkahead.common.datatype import get_list_datatype, is_reference -from linkahead.exceptions import ( - TransactionError, -) +from linkahead.exceptions import TransactionError from linkahead.utils.escape import escape_squoted_text from .config import get_config_setting @@ -99,7 +95,7 @@ in a quite complex fashion: - If one of the entities has additional parents or additional properties -> not identical - If the value of one of the properties differs -> not identical - If datatype, importance or unit are reported different for a property by compare_entities - return "not_identical" only if these attributes are set explicitely by record1. + return False only if these attributes are set explicitely by record1. 
Ignore the difference otherwise. - If description, name, id or path appear in list of differences -> not identical. - If file, checksum, size appear -> Only different, if explicitely set by record1. @@ -598,6 +594,9 @@ one with the entities that need to be updated and the other with entities to be unique_names=True): Crawler.set_ids_and_datatype_of_parents_and_properties(to_be_updated) logger.debug("UPDATE") + # Here, it's probably much more reasonable to show a diff of the update: + # from linkahead.apiutils import compare_entities + # [compare_entities(c, db.Record(id=c.id).retrieve()) for c in to_be_updated] logger.debug(to_be_updated) if len(to_be_updated) > 0: if securityMode.value > SecurityMode.INSERT.value: @@ -1017,6 +1016,7 @@ def crawler_main(crawled_directory_path: str, restricted_path: Optional[list[str]] = None, remove_prefix: Optional[str] = None, add_prefix: Optional[str] = None, + sss_max_log_level: Optional[int] = None, ): """ @@ -1050,6 +1050,12 @@ def crawler_main(crawled_directory_path: str, add_prefix : Optional[str] Add the given prefix to file paths. See docstring of '_fix_file_paths' for more details. + sss_max_log_level : Optional[int] + If given, set the maximum log level of the server-side + scripting log separately from the general ``debug`` option. If + None is given, the maximum sss log level will be determined + from the value of ``debug``: ``logging.INFO`` if ``debug`` is + False, ``logging.DEBUG`` if ``debug`` is True. 
Returns ------- @@ -1060,7 +1066,11 @@ def crawler_main(crawled_directory_path: str, crawler = Crawler(securityMode=securityMode) if "SHARED_DIR" in os.environ: # setup logging and reporting if serverside execution - userlog_public, htmluserlog_public, debuglog_public = configure_server_side_logging() + if sss_max_log_level is None: + sss_max_log_level = logging.DEBUG if debug else logging.INFO + userlog_public, htmluserlog_public, debuglog_public = configure_server_side_logging( + max_log_level=sss_max_log_level + ) # TODO make this optional _create_status_record( get_shared_resource_link(get_config_setting("public_host_url"), htmluserlog_public), diff --git a/src/caoscrawler/debug_tree.py b/src/caoscrawler/debug_tree.py index 0d57040f5c20aca236a3c11531e8b7c45bad89ab..c154f5b91d850476be0c0610e5bb1dfcbf9866ab 100644 --- a/src/caoscrawler/debug_tree.py +++ b/src/caoscrawler/debug_tree.py @@ -29,35 +29,20 @@ A structure containing debug tree information. from __future__ import annotations -import argparse -import importlib -import logging -import os -import sys -import warnings -import yaml - -from argparse import RawTextHelpFormatter from collections import defaultdict -from copy import deepcopy -from enum import Enum -from importlib_resources import files -from jsonschema import validate -from typing import Any, Optional, Type, Union import linkahead as db - -from caosadvancedtools.cache import UpdateCache, Cache -from caosadvancedtools.crawler import Crawler as OldCrawler -from linkahead.apiutils import (compare_entities, EntityMergeConflictError, +import yaml +from importlib_resources import files +from jsonschema import validate +from linkahead.apiutils import (EntityMergeConflictError, compare_entities, merge_entities) from linkahead.common.datatype import is_reference -from .converters import Converter, DirectoryConverter, ConverterValidationError - +from .converters import Converter, ConverterValidationError, DirectoryConverter from .macros import 
defmacro_constructor, macro_constructor -from .stores import Store, GeneralStore, RecordStore -from .structure_elements import StructureElement, Directory, NoneElement +from .stores import GeneralStore, RecordStore, Store +from .structure_elements import Directory, NoneElement, StructureElement from .version import check_cfood_version diff --git a/src/caoscrawler/default_converters.yml b/src/caoscrawler/default_converters.yml index a78c1579fc05c2ede424c076e7590d25550ea2f3..656b0ba0f1f76007266cc8b2e75f5bd7046f1206 100644 --- a/src/caoscrawler/default_converters.yml +++ b/src/caoscrawler/default_converters.yml @@ -111,3 +111,7 @@ XMLTag: XMLTextNode: converter: XMLTextNodeConverter package: caoscrawler.converters + +XMLAttributeNode: + converter: XMLAttributeNodeConverter + package: caoscrawler.converters diff --git a/src/caoscrawler/identifiable.py b/src/caoscrawler/identifiable.py index f6c85c694e5ef0be7e6a9be8154a34c400bab008..cd52effb954d66bcc69b7296de77ddaf7b2b8394 100644 --- a/src/caoscrawler/identifiable.py +++ b/src/caoscrawler/identifiable.py @@ -80,7 +80,7 @@ class Identifiable(): def get_representation(self) -> str: return sha256(Identifiable._create_hashable_string(self).encode('utf-8')).hexdigest() - @ staticmethod + @staticmethod def _value_representation(value) -> str: """returns the string representation of property values to be used in the hash function @@ -103,7 +103,7 @@ class Identifiable(): else: raise ValueError(f"Unknown datatype of the value: {value}") - @ staticmethod + @staticmethod def _create_hashable_string(identifiable: Identifiable) -> str: """ creates a string from the attributes of an identifiable that can be hashed diff --git a/src/caoscrawler/identifiable_adapters.py b/src/caoscrawler/identifiable_adapters.py index 854ee614638712bdcf957c592ef2946dbdd43afc..592f603bef508771d734ff633f8cdb2c100742d5 100644 --- a/src/caoscrawler/identifiable_adapters.py +++ b/src/caoscrawler/identifiable_adapters.py @@ -36,12 +36,8 @@ import yaml from 
linkahead.cached import cached_get_entity_by, cached_query from linkahead.utils.escape import escape_squoted_text -from .exceptions import ( - InvalidIdentifiableYAML, - MissingIdentifyingProperty, - MissingRecordType, - MissingReferencingEntityError, -) +from .exceptions import (InvalidIdentifiableYAML, MissingIdentifyingProperty, + MissingRecordType, MissingReferencingEntityError) from .identifiable import Identifiable from .sync_node import SyncNode from .utils import has_parent diff --git a/src/caoscrawler/logging.py b/src/caoscrawler/logging.py index 69ec1fabb97e1d236162552540a35815e25a33fb..b57a067d8635a468df7345365fabbfae9ee0b22f 100644 --- a/src/caoscrawler/logging.py +++ b/src/caoscrawler/logging.py @@ -20,29 +20,46 @@ # along with this program. If not, see <https://www.gnu.org/licenses/>. import logging +import sys -from caosadvancedtools.webui_formatter import WebUI_Formatter from caosadvancedtools.serverside.helper import get_shared_filename -import sys +from caosadvancedtools.webui_formatter import WebUI_Formatter -def configure_server_side_logging(): +def configure_server_side_logging(max_log_level: int = logging.INFO): """ Set logging up to save one plain debugging log file, one plain info log file (for users) and a stdout stream with messages wrapped in html elements returns the path to the file with debugging output + + Parameters + ---------- + max_log_level : int, optional + The maximum log level to use for SSS-logs. Default is + ``logging.INFO``. + + Returns + ------- + userlog_public, htmluserlog_public, debuglog_public: str + Public paths of the respective log files. """ adv_logger = logging.getLogger("caosadvancedtools") - adv_logger.setLevel(level=logging.DEBUG) + # The max_<level> variables will be used to set the logger levels + # to the respective maximum of intended level and max_log_level, + # effectively cutting off logging above the specified + # max_log_level. 
+ max_info = max(logging.INFO, max_log_level) + max_debug = max(logging.DEBUG, max_log_level) + adv_logger.setLevel(level=max_debug) cr_logger = logging.getLogger("caoscrawler") - cr_logger.setLevel(level=logging.DEBUG) + cr_logger.setLevel(level=max_debug) userlog_public, userlog_internal = get_shared_filename("userlog.txt") root_logger = logging.getLogger() - root_logger.setLevel(level=logging.INFO) + root_logger.setLevel(level=max_info) # this is a log file with INFO level for the user user_file_handler = logging.FileHandler(filename=userlog_internal) diff --git a/src/caoscrawler/macros/macro_yaml_object.py b/src/caoscrawler/macros/macro_yaml_object.py index d85883011db3cf651da0dda6c110015128fbe439..5d2bc1fe0775499fa8b40a65e115fb4569892e38 100644 --- a/src/caoscrawler/macros/macro_yaml_object.py +++ b/src/caoscrawler/macros/macro_yaml_object.py @@ -26,11 +26,10 @@ # A. Schlemmer, 05/2022 import re -from dataclasses import dataclass -from typing import Any, Dict from copy import deepcopy +from dataclasses import dataclass from string import Template - +from typing import Any, Dict _SAFE_SUBST_PAT = re.compile(r"^\$(?P<key>\w+)$") _SAFE_SUBST_PAT_BRACES = re.compile(r"^\$\{(?P<key>\w+)}$") diff --git a/src/caoscrawler/scanner.py b/src/caoscrawler/scanner.py index 9f8f5e40beb729d73151bad38f3e390a4a8cecb4..89bd1c04411665bf4832d6bccce69bbe1b11cad1 100644 --- a/src/caoscrawler/scanner.py +++ b/src/caoscrawler/scanner.py @@ -39,7 +39,7 @@ import logging import os import warnings from collections.abc import Callable -from typing import Any, Optional, Type, Union +from typing import Any, Optional, Union import linkahead as db import yaml @@ -55,10 +55,19 @@ from .version import check_cfood_version logger = logging.getLogger(__name__) -def load_definition(crawler_definition_path: str): +def load_definition(crawler_definition_path: str) -> dict: """ Load a cfood from a crawler definition defined by crawler definition path and validate it using cfood-schema.yml. 
+ + Arguments: + ---------- + crawler_definition_path: str + Path to the crawler definition file in yaml format. + + Returns: + -------- + dict containing the crawler definition. """ # Load the cfood from a yaml file: @@ -70,13 +79,21 @@ def load_definition(crawler_definition_path: str): return _resolve_validator_paths(crawler_definition, crawler_definition_path) -def _load_definition_from_yaml_dict(crawler_definitions: list[dict]): +def _load_definition_from_yaml_dict(crawler_definitions: list[dict]) -> dict: """Load crawler definitions from a list of (yaml) dicts `crawler_definitions` which contains either one or two documents. Doesn't resolve the validator paths in the cfood definition, so for internal and testing use only. + Arguments: + ---------- + crawler_definitions: list[dict] + List of one or two dicts containing (optionally) metadata and the crawler definition. + + Returns: + -------- + dict containing the crawler definition. """ if len(crawler_definitions) == 1: # Simple case, just one document: @@ -383,6 +400,9 @@ def scanner(items: list[StructureElement], crawled_data, debug_tree, registered_transformer_functions) + # Clean up converter: + converter.cleanup() + if restricted_path and not path_found: raise RuntimeError("A 'restricted_path' argument was given that is not contained in " "the data tree") diff --git a/src/caoscrawler/scripts/generators.py b/src/caoscrawler/scripts/generators.py index ba8e6e39cc03e9be1923d72ec5c8d699c01fa8f9..2bf8a90f5af5086e23b7e7cc35d21a50d8cd511a 100644 --- a/src/caoscrawler/scripts/generators.py +++ b/src/caoscrawler/scripts/generators.py @@ -30,7 +30,6 @@ from typing import Optional import pandas as pd import yaml - DM_TEMPLATE = """# auto-generated data model from file "[]{infile}". 
# To insert a datamodel into LinkAhead, run: # diff --git a/src/caoscrawler/structure_elements/__init__.py b/src/caoscrawler/structure_elements/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..351f1069708ec94c0dd27313b6329d89858d4330 --- /dev/null +++ b/src/caoscrawler/structure_elements/__init__.py @@ -0,0 +1,31 @@ +# encoding: utf-8 +# +# This file is a part of the LinkAhead Project. +# +# Copyright (C) 2024 Indiscale GmbH <info@indiscale.com> +# Copyright (C) 2024 Daniel Hornung <d.hornung@indiscale.com> +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <https://www.gnu.org/licenses/>. + +"""Submdule containing all default and optional converters.""" + +from .. 
import utils +from .structure_elements import * + +try: + from .rocrate_structure_elements import ROCrateEntity +except ImportError as err: + ROCrateEntity: type = utils.MissingImport( + name="ROCrateEntity", hint="Try installing with the `rocrate` extra option.", + err=err) diff --git a/src/caoscrawler/structure_elements/rocrate_structure_elements.py b/src/caoscrawler/structure_elements/rocrate_structure_elements.py new file mode 100644 index 0000000000000000000000000000000000000000..66768ad800128297a27f47d672352f21310703e9 --- /dev/null +++ b/src/caoscrawler/structure_elements/rocrate_structure_elements.py @@ -0,0 +1,54 @@ +#!/usr/bin/env python3 +# encoding: utf-8 +# +# ** header v3.0 +# This file is a part of the CaosDB Project. +# +# Copyright (C) 2024 Alexander Schlemmer +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <https://www.gnu.org/licenses/>. +# +# ** end header +# + +from rocrate.model.entity import Entity + +from .structure_elements import StructureElement + + +class ROCrateEntity(StructureElement): + """ + Store entities contained in ROCrates. + """ + + def __init__(self, folder: str, entity: Entity): + """ + Initializes this ROCrateEntity. + + Arguments: + ---------- + folder: str + The folder that contains the ROCrate data. In case of a zipped ROCrate, this + is a temporary folder that the ROCrate was unzipped to. 
+ The folder is the folder containing the ro-crate-metadata.json. + + entity: Entity + The ROCrate entity that is stored in this structure element. + The entity automatically contains an attribute ".crate" + that stores the ROCrate that this entity belongs to. It can be used + e.g. to look up links to other entities (ROCrate.dereference). + """ + super().__init__(entity.properties()["@id"]) + self.folder = folder + self.entity = entity diff --git a/src/caoscrawler/structure_elements.py b/src/caoscrawler/structure_elements/structure_elements.py similarity index 99% rename from src/caoscrawler/structure_elements.py rename to src/caoscrawler/structure_elements/structure_elements.py index 67cd1056b382c92485deada2058526a03b6d8535..3b4c6e9b9d13c61a5924a12d23b11b62edff6924 100644 --- a/src/caoscrawler/structure_elements.py +++ b/src/caoscrawler/structure_elements/structure_elements.py @@ -24,6 +24,7 @@ # import warnings + import lxml.etree diff --git a/src/caoscrawler/sync_graph.py b/src/caoscrawler/sync_graph.py index 9c021a10f35e95ca56d45151b8d064ec905993ec..a05e6320892239cbe8d7f1d9fbd7949a57f9bccb 100644 --- a/src/caoscrawler/sync_graph.py +++ b/src/caoscrawler/sync_graph.py @@ -27,18 +27,17 @@ crawler. 
from __future__ import annotations import logging -from typing import Any, Optional, Union, Callable +import re +from typing import Any, Callable, Optional, Union import linkahead as db from linkahead.cached import cached_get_entity_by from linkahead.exceptions import EmptyUniqueQueryError -from .identifiable_adapters import IdentifiableAdapter from .identifiable import Identifiable +from .identifiable_adapters import IdentifiableAdapter from .sync_node import SyncNode, TempID -import re - logger = logging.getLogger(__name__) diff --git a/src/caoscrawler/sync_node.py b/src/caoscrawler/sync_node.py index d35b49c17aea4cba05ab46291ba65023007283ee..d912d6465a68270411c121f65b4c5a828c9c667e 100644 --- a/src/caoscrawler/sync_node.py +++ b/src/caoscrawler/sync_node.py @@ -22,12 +22,12 @@ from __future__ import annotations -from typing import TYPE_CHECKING, Any, Optional, Union +from typing import TYPE_CHECKING, Any, Optional +from warnings import warn import linkahead as db import yaml -from linkahead.common.models import Parent, _ParentList, _Properties -from warnings import warn +from linkahead.common.models import Parent, ParentList, PropertyList from .exceptions import ImpossibleMergeError @@ -76,8 +76,8 @@ class SyncNode(db.Entity): self.role = entity.role self.path = entity.path self.file = entity.file - self.parents = _ParentList().extend(entity.parents) - self.properties = _Properties().extend(entity.properties) + self.parents = ParentList().extend(entity.parents) + self.properties = PropertyList().extend(entity.properties) self._check_for_multiproperties() # other members self.identifiable: Optional[Identifiable] = None @@ -254,25 +254,11 @@ class SyncNode(db.Entity): ids.add(p.id) -def parent_in_list(parent: Parent, plist: _ParentList) -> bool: +def parent_in_list(parent: Parent, plist: ParentList) -> bool: """helper function that checks whether a parent with the same name or ID is in the plist""" - missing = False - if parent.name is not None: - if parent.name 
not in plist._element_by_name: - missing = True - if parent.id is not None: - if str(parent.id) not in plist._element_by_id: - missing = True - return not missing + return plist.filter(parent) -def property_in_list(prop: db.Property, plist: _Properties) -> bool: +def property_in_list(prop: db.Property, plist: PropertyList) -> bool: """helper function that checks whether a property with the same name or ID is in the plist""" - missing = False - if prop.name is not None: - if prop.name not in plist._element_by_name: - missing = True - if prop.id is not None: - if str(prop.id) not in plist._element_by_id: - missing = True - return not missing + return plist.filter(prop) diff --git a/src/caoscrawler/utils.py b/src/caoscrawler/utils.py index d9a5af839068a2582859aad1b51fbc8b9713d5d1..5f736d5ad7550e0b29cb629b2fa140a2f38d6f5f 100644 --- a/src/caoscrawler/utils.py +++ b/src/caoscrawler/utils.py @@ -26,7 +26,6 @@ # Some utility functions, e.g. for extending pylib. import sys - from posixpath import join as posixjoin from typing import Optional from urllib.parse import urljoin diff --git a/src/caoscrawler/version.py b/src/caoscrawler/version.py index 0b72dd65116fbc102a4dc2492d726698cad5a13b..4cd435486aca26e20e785bbbeb65c013d8e727cb 100644 --- a/src/caoscrawler/version.py +++ b/src/caoscrawler/version.py @@ -18,9 +18,10 @@ # along with this program. If not, see <https://www.gnu.org/licenses/>. # from importlib import metadata as importlib_metadata -from packaging.version import parse as parse_version from warnings import warn +from packaging.version import parse as parse_version + def get_caoscrawler_version(): """ Read in version of locally installed caoscrawler package""" diff --git a/src/doc/cfood.rst b/src/doc/cfood.rst index 51c392780b44b73964921506ad3764b95e14d5ed..a42d593035bd37d0712986c958fb8ad7ad287968 100644 --- a/src/doc/cfood.rst +++ b/src/doc/cfood.rst @@ -27,17 +27,17 @@ A single document with a converter tree specification: .. _example_1: .. 
code-block:: yaml - + extroot: type: Directory match: ^extroot$ subtree: DataAnalysis: - type: Directory - match: DataAnalysis - # (...) + type: Directory + match: DataAnalysis + # (...) + - A single document with a converter tree specification, but also including a custom converters section: .. _example_2: @@ -50,15 +50,15 @@ A single document with a converter tree specification, but also including a cust CustomConverter_2: package: mypackage.converters converter: CustomConverter2 - + extroot: type: Directory match: ^extroot$ subtree: DataAnalysis: - type: Directory - match: DataAnalysis - # (...) + type: Directory + match: DataAnalysis + # (...) @@ -78,11 +78,11 @@ two custom converters in the second document (**not recommended**, see the recom - !defmacro name: SimulationDatasetFile params: - match: null - recordtype: null - nodename: null + match: null + recordtype: null + nodename: null definition: - # (...) + # (...) --- Converters: CustomConverter_1: @@ -91,15 +91,15 @@ two custom converters in the second document (**not recommended**, see the recom CustomConverter_2: package: mypackage.converters converter: CustomConverter2 - + extroot: type: Directory match: ^extroot$ subtree: DataAnalysis: - type: Directory - match: DataAnalysis - # (...) + type: Directory + match: DataAnalysis + # (...) @@ -118,27 +118,27 @@ The **recommended way** of defining metadata, custom converters, macros and the - !defmacro name: SimulationDatasetFile params: - match: null - recordtype: null - nodename: null + match: null + recordtype: null + nodename: null definition: - # (...) + # (...) 
Converters: CustomConverter_1: - package: mypackage.converters - converter: CustomConverter1 + package: mypackage.converters + converter: CustomConverter1 CustomConverter_2: - package: mypackage.converters - converter: CustomConverter2 + package: mypackage.converters + converter: CustomConverter2 --- extroot: type: Directory match: ^extroot$ subtree: DataAnalysis: - type: Directory - match: DataAnalysis - # (...) + type: Directory + match: DataAnalysis + # (...) List Mode @@ -148,11 +148,73 @@ Specifying values of properties can make use of two special characters, in order create lists or multi properties instead of single values: .. code-block:: yaml - - Experiment1: - Measurement: +Measurement # Element in List (list is cleared before run) - *Measurement # Multi Property (properties are removed before run) - Measurement # Overwrite + + Experiment1: + Measurement: +Measurement # Element in List (list is cleared before run) + *Measurement # Multi Property (properties are removed before run) + Measurement # Overwrite + +Values and units +---------------- + +Property values can be specified as a simple strings (as above) or as +a dictionaries that may also specify the :ref:`collection mode <List +Mode>`. Strings starting with a "$" will be replaced by a +corresponding variable if there is any. See the :doc:`tutorials +chapter<tutorials/index>` of this documentation for more elaborate +examples on how the variable replacment works exactly. A simple +example could look the following. + +.. code-block:: yaml + + ValueElt: + type: TextElement + match_name: ^my_prop$ + match_value: "(?P<value>.*)" # Anything in here is stored in the variable "value" + records: + MyRecord: + MyProp: $value # will be replace by whatever is stored in the "value" variable set above. + +If not given explicitly, the collection mode will be determined from +the first character of the property value as explained above, and the +following three definitions are all equivalent: + +.. 
code-block:: yaml + + MyProp: +$value + +.. code-block:: yaml + + MyProp: + value: +$value + +and + +.. code-block:: yaml + + MyProp: + value: $value + collection_mode: list + + +Units of numeric values can be set by providing a property value not +as a single string, but as a dictionary with a ``value`` and a +``unit`` key. Within a converter definition this could look the +following. + +.. code-block:: yaml + + ValueWithUnitElt: + type: TextElement + match_name: ^my_prop$ + match_value: "^(?P<number>\\d+\\.?\\d*)\s+(?P<unit>.+)" # Extract value and unit from a string which + # has a number followed by at least one whitespace + # character followed by a unit. + records: + MyRecord: + MyProp: + value: $number + unit: $unit File Entities @@ -160,7 +222,7 @@ File Entities In order to use File Entities, you must set the appropriate ``role: File``. Additionally, the path and file keys have to be given, with values that set the -paths remotely and locally, respectively. You can use the variable +paths remotely and locally, respectively. You can use the variable ``<converter name>_path`` that is automatically created by converters that deal with file system related StructureElements. The file object itsself is stored in a vairable with the same name (as it is the case for other Records). @@ -169,15 +231,15 @@ in a vairable with the same name (as it is the case for other Records). .. 
code-block:: yaml somefile: - type: SimpleFile - match: ^params.*$ # macht any file that starts with "params" - records: - fileEntity: - role: File # necessary to create a File Entity - path: somefile.path # defines the path in CaosDB - file: somefile.path # path where the file is found locally - SomeRecord: - ParameterFile: $fileEntity # creates a reference to the file + type: SimpleFile + match: ^params.*$ # match any file that starts with "params" + records: + fileEntity: + role: File # necessary to create a File Entity + path: somefile.path # defines the path in CaosDB + file: somefile.path # path where the file is found locally + SomeRecord: + ParameterFile: $fileEntity # creates a reference to the file Transform Functions diff --git a/src/doc/conf.py b/src/doc/conf.py index b8771ea487ec92d275d029a445e053332d307387..5210d4776cf8701c8327442e63e7e84f9b302fa1 100644 --- a/src/doc/conf.py +++ b/src/doc/conf.py @@ -21,11 +21,11 @@ # import os import sys + sys.path.insert(0, os.path.abspath('..')) import sphinx_rtd_theme # noqa: E402 - # -- Project information ----------------------------------------------------- project = 'caosdb-caoscrawler' @@ -33,10 +33,10 @@ copyright = '2024, IndiScale' author = 'Alexander Schlemmer' # The short X.Y version -version = '0.9.1' +version = '0.10.0' # The full version, including alpha/beta/rc tags # release = '0.5.2-rc2' -release = '0.9.1' +release = '0.10.0' # -- General configuration --------------------------------------------------- diff --git a/tox.ini b/tox.ini index 41249e4277391c5ffa4ec13fc4da1a6ee1f48491..e003e26ecd16861c3b8a8d991fc789c78d203e5b 100644 --- a/tox.ini +++ b/tox.ini @@ -3,7 +3,7 @@ envlist = py38, py39, py310, py311, py312, py313 skip_missing_interpreters = true [testenv] -deps = .[h5-crawler,spss] +deps = .[h5-crawler,spss,rocrate] pytest pytest-cov # TODO: Make this f-branch sensitive diff --git a/unittests/eln_cfood.yaml b/unittests/eln_cfood.yaml new file mode 100644 index 
0000000000000000000000000000000000000000..ab8e7108f511b0450d37c3e60162e412d4a1bf3b --- /dev/null +++ b/unittests/eln_cfood.yaml @@ -0,0 +1,36 @@ +--- +metadata: + crawler-version: 0.9.2 + macros: +--- +Converters: + ELNFile: + converter: ELNFileConverter + package: caoscrawler.converters + ROCrateEntity: + converter: ROCrateEntityConverter + package: caoscrawler.converters + +DataDir: + type: Directory + match: .* + subtree: + ELNFile: + type: ELNFile + match: ^.*\.eln$ + subtree: + RecordsExample: + type: ROCrateEntity + match_type: Dataset + match_properties: + "@id": records-example/$ + name: (?P<name>.*) + keywords: (?P<keywords>.*) + description: (?P<description>.*) + dateModified: (?P<dateModified>.*) + records: + Dataset: + name: $name + keywords: $keywords + description: $description + dateModified: $dateModified diff --git a/unittests/eln_files/PASTA.eln b/unittests/eln_files/PASTA.eln new file mode 100644 index 0000000000000000000000000000000000000000..61866e7d5f57cb32191af6663be230153092e712 Binary files /dev/null and b/unittests/eln_files/PASTA.eln differ diff --git a/unittests/eln_files/records-example.eln b/unittests/eln_files/records-example.eln new file mode 100644 index 0000000000000000000000000000000000000000..09ed53fc179e80a240ab773247d6f9adee71b429 Binary files /dev/null and b/unittests/eln_files/records-example.eln differ diff --git a/unittests/test_cfood_metadata.py b/unittests/test_cfood_metadata.py index 494bd383d95b4a845b5ea6f86ccff0f9a1db257f..c606a0a1afcc15d48164694768bae02adfb0fc0b 100644 --- a/unittests/test_cfood_metadata.py +++ b/unittests/test_cfood_metadata.py @@ -17,15 +17,13 @@ # You should have received a copy of the GNU Affero General Public License # along with this program. If not, see <https://www.gnu.org/licenses/>. 
# +from tempfile import NamedTemporaryFile +from unittest.mock import MagicMock, Mock, patch + import pytest import yaml -from tempfile import NamedTemporaryFile -from unittest.mock import patch -from unittest.mock import MagicMock, Mock - import caoscrawler - from caoscrawler.scanner import load_definition diff --git a/unittests/test_converters.py b/unittests/test_converters.py index 0522c4e6fd31239b9b3ae1f803ef5799ad2c5423..6c7db6ed346fc5e6d0d286024e96ef8828c5c872 100644 --- a/unittests/test_converters.py +++ b/unittests/test_converters.py @@ -29,30 +29,29 @@ import importlib import json import logging import os -import pytest -import sys -import yaml - from itertools import product from pathlib import Path import linkahead as db +import pytest +import yaml from caoscrawler.converters import (Converter, ConverterValidationError, DateElementConverter, DictElementConverter, DictIntegerElementConverter, DirectoryConverter, FloatElementConverter, IntegerElementConverter, JSONFileConverter, - ListElementConverter, MarkdownFileConverter, + ListElementConverter, + MarkdownFileConverter, PropertiesFromDictConverter, - YAMLFileConverter, - handle_value, replace_variables) -from caoscrawler.converters.converters import _AbstractScalarValueElementConverter + YAMLFileConverter, handle_value, + replace_variables) +from caoscrawler.converters.converters import \ + _AbstractScalarValueElementConverter from caoscrawler.crawl import Crawler from caoscrawler.scanner import (_load_definition_from_yaml_dict, create_converter_registry, - create_transformer_registry, - load_definition, + create_transformer_registry, load_definition, scan_structure_elements) from caoscrawler.stores import GeneralStore, RecordStore from caoscrawler.structure_elements import (BooleanElement, DictElement, @@ -352,6 +351,8 @@ def test_variable_replacement(): values = GeneralStore() values["a"] = 4 values["b"] = "68" + values["my_unit"] = "m" + values["cm"] = "cm" # basic values stay unchanged assert 
replace_variables(5, values) is 5 @@ -359,28 +360,38 @@ def test_variable_replacement(): assert replace_variables("$a", values) is 4 assert replace_variables("${b}", values) == "68" - assert handle_value("b", values) == ("b", "single") - assert handle_value("+b", values) == ("b", "list") - assert handle_value("*b", values) == ("b", "multiproperty") - assert handle_value("$b", values) == ("68", "single") - assert handle_value("+$b", values) == ("68", "list") - assert handle_value("*$b", values) == ("68", "multiproperty") + # values given as simple strings never have units + assert handle_value("b", values) == ("b", None, "single") + assert handle_value("+b", values) == ("b", None, "list") + assert handle_value("*b", values) == ("b", None, "multiproperty") + assert handle_value("$b", values) == ("68", None, "single") + assert handle_value("+$b", values) == ("68", None, "list") + assert handle_value("*$b", values) == ("68", None, "multiproperty") + # No units in dicts assert handle_value({"value": "b", - "collection_mode": "single"}, values) == ("b", "single") + "collection_mode": "single"}, values) == ("b", None, "single") assert handle_value({"value": "b", - "collection_mode": "list"}, values) == ("b", "list") + "collection_mode": "list"}, values) == ("b", None, "list") assert handle_value({"value": "b", - "collection_mode": "multiproperty"}, values) == ("b", "multiproperty") + "collection_mode": "multiproperty"}, values) == ("b", None, "multiproperty") assert handle_value({"value": "$b", - "collection_mode": "single"}, values) == ("68", "single") + "collection_mode": "single"}, values) == ("68", None, "single") assert handle_value({"value": "$b", - "collection_mode": "list"}, values) == ("68", "list") + "collection_mode": "list"}, values) == ("68", None, "list") assert handle_value({"value": "$b", - "collection_mode": "multiproperty"}, values) == ("68", "multiproperty") + "collection_mode": "multiproperty"}, values) == ("68", None, "multiproperty") + + # Unit 
specified in the same way as value: + assert handle_value({"value": 5, "unit": "m"}, values) == (5, "m", "single") + assert handle_value({"value": 5, "unit": "${my_unit}"}, values) == (5, "m", "single") + assert handle_value({"value": "+5", "unit": "${my_unit}"}, values) == ("5", "m", "list") + assert handle_value({"value": "*5", "unit": "${my_unit}"}, + values) == ("5", "m", "multiproperty") - assert handle_value(["a", "b"], values) == (["a", "b"], "single") - assert handle_value(["$a", "$b"], values) == ([4, "68"], "single") + assert handle_value(["a", "b"], values) == (["a", "b"], None, "single") + assert handle_value(["$a", "$b"], values) == ([4, "68"], None, "single") + assert handle_value({"value": ["$a", "$a"], "unit": "$cm"}, values) == ([4, 4], "cm", "single") def test_apply_transformers(converter_registry): @@ -643,7 +654,7 @@ def test_load_converters(): # converter classes can be loaded from their respective packages. # Please adapt, if defaults change! - assert len(converter_registry) == 28 + assert len(converter_registry) == 29 # All of them are contained in caoscrawler.converters # except for the xml converters: diff --git a/unittests/test_crawler.py b/unittests/test_crawler.py index aaddec9e8c6b17ad726808bc36b0784adbc3c36d..e88ce454061fb268fa49e986f8392f71296beb07 100644 --- a/unittests/test_crawler.py +++ b/unittests/test_crawler.py @@ -23,7 +23,6 @@ """ test the Crawler class """ -import json import logging import os import warnings @@ -33,12 +32,17 @@ from os.path import basename, dirname, join from pathlib import Path from unittest.mock import MagicMock, Mock, patch -import caoscrawler import linkahead as db import linkahead.common.models as dbmodels import pytest import yaml from caosadvancedtools.models.parser import parse_model_from_string +from linkahead.apiutils import compare_entities +from linkahead.cached import cache_clear +from linkahead.exceptions import EmptyUniqueQueryError +from pytest import raises + +import caoscrawler from 
caoscrawler.crawl import (Crawler, SecurityMode, _treat_deprecated_prefix, crawler_main, split_restricted_path) from caoscrawler.debug_tree import DebugTree @@ -55,10 +59,6 @@ from caoscrawler.stores import GeneralStore, RecordStore from caoscrawler.structure_elements import (DictElement, DictListElement, DictTextElement, File) from caoscrawler.sync_graph import SyncGraph -from linkahead.apiutils import compare_entities -from linkahead.cached import cache_clear -from linkahead.exceptions import EmptyUniqueQueryError -from pytest import raises UNITTESTDIR = Path(__file__).parent diff --git a/unittests/test_entity_comparison.py b/unittests/test_entity_comparison.py index 0f62475b6c61d82feb3e550cf5ab53e91183f80a..8543732fde4d584e2022dcf6432e9572ae625eb5 100644 --- a/unittests/test_entity_comparison.py +++ b/unittests/test_entity_comparison.py @@ -3,7 +3,6 @@ # A. Schlemmer, 06/2021 import linkahead as db - import pytest from pytest import raises diff --git a/unittests/test_h5_converter.py b/unittests/test_h5_converter.py index 95060451badb0523cf91c70e5be345e35ec3964d..9c1058812c75c6d1e5ee7028c8f6fccd7081a54c 100644 --- a/unittests/test_h5_converter.py +++ b/unittests/test_h5_converter.py @@ -17,22 +17,21 @@ # You should have received a copy of the GNU Affero General Public License # along with this program. If not, see <https://www.gnu.org/licenses/>. 
# -import numpy as np - from functools import partial from pathlib import Path -from pytest import fixture, importorskip import linkahead as db +import numpy as np +from pytest import fixture, importorskip +from utils import dircheckstr as dircheck_base +from caoscrawler.converters.hdf5_converter import ( + H5DatasetElement, H5GroupElement, H5NdarrayElement, + convert_basic_element_with_nd_array, convert_h5_element) from caoscrawler.debug_tree import DebugTree -from caoscrawler.converters.hdf5_converter import (convert_basic_element_with_nd_array, - convert_h5_element, H5GroupElement, - H5DatasetElement, H5NdarrayElement) from caoscrawler.scanner import scan_directory from caoscrawler.structure_elements import (FloatElement, ListElement, TextElement) -from utils import dircheckstr as dircheck_base # Skip the whole module if h5py hasn't been installed h5py = importorskip("h5py") diff --git a/unittests/test_identifiable.py b/unittests/test_identifiable.py index d94d852583523a3b3f29f002eaacb9ae0b616c4f..44aac6a3edd40e0df8558f68083e22245ff58127 100644 --- a/unittests/test_identifiable.py +++ b/unittests/test_identifiable.py @@ -26,6 +26,7 @@ test identifiable module import linkahead as db import pytest + from caoscrawler.identifiable import Identifiable from caoscrawler.sync_node import SyncNode diff --git a/unittests/test_identifiable_adapters.py b/unittests/test_identifiable_adapters.py index 53490bc0413a95d960d94186c639dac2c6223b80..bdc0ab850d1a8253e876e8b1a6bc621327802f79 100644 --- a/unittests/test_identifiable_adapters.py +++ b/unittests/test_identifiable_adapters.py @@ -27,15 +27,14 @@ test identifiable_adapters module """ -import os from datetime import datetime -from unittest.mock import MagicMock, Mock, patch from pathlib import Path +from unittest.mock import MagicMock, Mock, patch import linkahead as db import pytest -from caoscrawler.exceptions import (InvalidIdentifiableYAML, - ) + +from caoscrawler.exceptions import InvalidIdentifiableYAML from 
caoscrawler.identifiable import Identifiable from caoscrawler.identifiable_adapters import (CaosDBIdentifiableAdapter, IdentifiableAdapter, @@ -233,7 +232,7 @@ def test_get_identifiable(): id_r0 = ident.get_identifiable(se, []) -@ pytest.mark.xfail +@pytest.mark.xfail def test_retrieve_identified_record_for_identifiable(): # TODO modify this such that it becomes a test that acutally tests (sufficiently) the # retrieve_identified_record_for_identifiable function diff --git a/unittests/test_issues.py b/unittests/test_issues.py index 1678280555e739bae55819fa7fe42a53c938c4e5..a6de65400f42018c3fdcde7b2f29d4fd200bf62b 100644 --- a/unittests/test_issues.py +++ b/unittests/test_issues.py @@ -22,11 +22,12 @@ from pytest import mark -from caoscrawler.converters import replace_variables, CrawlerTemplate +from caoscrawler.converters import CrawlerTemplate, replace_variables from caoscrawler.crawl import Crawler -from caoscrawler.structure_elements import DictElement +from caoscrawler.scanner import (create_converter_registry, + scan_structure_elements) from caoscrawler.stores import GeneralStore -from caoscrawler.scanner import create_converter_registry, scan_structure_elements +from caoscrawler.structure_elements import DictElement def test_issue_10(): diff --git a/unittests/test_json.py b/unittests/test_json.py index be65a26ea01e11e11968bd927c80513708e73850..5d145b38fd36fa2de4e4ab754cbadda0fff6eff7 100644 --- a/unittests/test_json.py +++ b/unittests/test_json.py @@ -26,18 +26,17 @@ """ test the JSON converter """ -import json import os - -from pytest import raises +from pathlib import Path import linkahead as db +from pytest import raises from caoscrawler.converters import JSONFileConverter -from pathlib import Path from caoscrawler.crawl import Crawler +from caoscrawler.scanner import (create_converter_registry, load_definition, + scan_structure_elements) from caoscrawler.structure_elements import File, JSONFile -from caoscrawler.scanner import load_definition, 
create_converter_registry, scan_structure_elements UNITTESTDIR = Path(__file__).parent diff --git a/unittests/test_macros.py b/unittests/test_macros.py index cfa405e5041fb4324b7de98ffcb942cf4b040715..a87b633e8585a03431575426733cae6ba31b7acf 100644 --- a/unittests/test_macros.py +++ b/unittests/test_macros.py @@ -22,15 +22,15 @@ # ** end header # -from caoscrawler.macros import defmacro_constructor, macro_constructor -from caoscrawler.macros.macro_yaml_object import macro_store -from caoscrawler.crawl import Crawler -from caoscrawler.scanner import load_definition - from tempfile import NamedTemporaryFile -import yaml import pytest +import yaml + +from caoscrawler.crawl import Crawler +from caoscrawler.macros import defmacro_constructor, macro_constructor +from caoscrawler.macros.macro_yaml_object import macro_store +from caoscrawler.scanner import load_definition @pytest.fixture diff --git a/unittests/test_rocrate_converter.py b/unittests/test_rocrate_converter.py new file mode 100644 index 0000000000000000000000000000000000000000..ef59a37c7a9ca91f85d3a62b4f5b6f5c12559575 --- /dev/null +++ b/unittests/test_rocrate_converter.py @@ -0,0 +1,205 @@ +#!/usr/bin/env python3 +# encoding: utf-8 +# +# This file is a part of the LinkAhead Project. +# +# Copyright (C) 2024 Indiscale GmbH <info@indiscale.com> +# Copyright (C) 2024 Alexander Schlemmer <a.schlemmer@indiscale.com> +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. 
+# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <https://www.gnu.org/licenses/>. +# + +""" +test the XML converters +""" +import importlib +import os +from pathlib import Path + +import jsonschema +import linkahead as db +import pytest +import rocrate +import yaml +from linkahead.high_level_api import convert_to_python_object +from lxml.etree import fromstring +from rocrate.model.entity import Entity +from rocrate.rocrate import ROCrate + +from caoscrawler import scanner +from caoscrawler.converters import ELNFileConverter, ROCrateEntityConverter +from caoscrawler.scanner import load_definition +from caoscrawler.stores import GeneralStore +from caoscrawler.structure_elements import (DictElement, File, ROCrateEntity, + TextElement) + +UNITTESTDIR = Path(__file__).parent + + +@pytest.fixture +def converter_registry(): + converter_registry: dict[str, dict[str, str]] = { + "ELNFile": { + "converter": "ELNFileConverter", + "package": "caoscrawler.converters"}, + "ROCrateEntity": { + "converter": "ROCrateEntityConverter", + "package": "caoscrawler.converters", + } + } + + for key, value in converter_registry.items(): + module = importlib.import_module(value["package"]) + value["class"] = getattr(module, value["converter"]) + return converter_registry + + +@pytest.fixture +def basic_eln_converter(converter_registry): + return ELNFileConverter(yaml.safe_load(""" +type: ELNFile +match: .*\\.eln +"""), "TestELNConverter", converter_registry) + + +@pytest.fixture +def eln_entities(basic_eln_converter): + f_k4mat = File("records-example.eln", + os.path.join(UNITTESTDIR, "eln_files", "records-example.eln")) + store = GeneralStore() + entities = basic_eln_converter.create_children(store, f_k4mat) + return entities + + +def test_load_pasta(basic_eln_converter): + """ + Test for loading the .eln example export from PASTA. 
+ """ + f_pasta = File("PASTA.eln", os.path.join(UNITTESTDIR, "eln_files", "PASTA.eln")) + match = basic_eln_converter.match(f_pasta) + assert match is not None + entities = basic_eln_converter.create_children(GeneralStore(), f_pasta) + assert len(entities) == 20 + assert isinstance(entities[0], ROCrateEntity) + assert isinstance(entities[0].folder, str) + assert isinstance(entities[0].entity, Entity) + + +def test_load_kadi4mat(basic_eln_converter): + """ + Test for loading the .eln example export from PASTA. + """ + f_k4mat = File("records-example.eln", + os.path.join(UNITTESTDIR, "eln_files", "records-example.eln")) + match = basic_eln_converter.match(f_k4mat) + assert match is not None + entities = basic_eln_converter.create_children(GeneralStore(), f_k4mat) + assert len(entities) == 10 + assert isinstance(entities[0], ROCrateEntity) + assert isinstance(entities[0].folder, str) + assert isinstance(entities[0].entity, Entity) + + +def test_match_rocrate_entities(eln_entities): + ds1 = ROCrateEntityConverter(yaml.safe_load(""" +type: ROCrateEntity +match_properties: + "@id": \\./ + datePublished: (?P<datePublished>.*) +"""), "TestELNConverter", converter_registry) + + match = ds1.match(eln_entities[0]) + assert match is not None + + ds2 = ROCrateEntityConverter(yaml.safe_load(""" +type: ROCrateEntity +match_type: CreativeWork +match_properties: + "@id": ro-crate-metadata.json + dateCreated: (?P<dateCreated>.*) +"""), "TestELNConverter", converter_registry) + + match = ds2.match(eln_entities[0]) + assert match is None + match = ds1.match(eln_entities[1]) + assert match is None + + match = ds2.match(eln_entities[1]) + assert match is not None + assert match["dateCreated"] == "2024-08-21T12:07:45.115990+00:00" + + children = ds2.create_children(GeneralStore(), eln_entities[1]) + assert len(children) == 8 + assert isinstance(children[0], TextElement) + assert children[0].name == "@id" + assert children[0].value == "ro-crate-metadata.json" + assert 
isinstance(children[5], DictElement) + assert children[5].value == {'@id': 'https://kadi.iam.kit.edu'} + + +def test_file(eln_entities): + ds_csv = ROCrateEntityConverter(yaml.safe_load(""" +type: ROCrateEntity +match_type: File +match_properties: + "@id": .*\.csv$ +"""), "TestELNConverter", converter_registry) + + ent_csv = eln_entities[5] + match = ds_csv.match(ent_csv) + assert match is not None + + children = ds_csv.create_children(GeneralStore(), ent_csv) + + # Number of children = number of properties + number of files: + assert len(children) == len(ent_csv.entity.properties()) + 1 + # Get the file: + f_csv = [f for f in children if isinstance(f, File)][0] + with open(f_csv.path) as f: + text = f.read() + assert "Ultrasound Transducer" in text + + +def test_has_part(eln_entities): + ds_parts = ROCrateEntityConverter(yaml.safe_load(""" +type: ROCrateEntity +match_type: Dataset +match_properties: + "@id": records-example/ +"""), "TestELNConverter", converter_registry) + + ent_parts = eln_entities[2] + match = ds_parts.match(ent_parts) + assert match is not None + + children = ds_parts.create_children(GeneralStore(), ent_parts) + + # Number of children = number of properties + number of parts: + assert len(children) == len(ent_parts.entity.properties()) + 4 + entity_children = [f for f in children if isinstance(f, ROCrateEntity)] + assert len(entity_children) == 4 + for f in entity_children: + assert isinstance(f.entity, rocrate.model.file.File) + + +def test_scanner(): + rlist = scanner.scan_directory(os.path.join(UNITTESTDIR, "eln_files/"), + os.path.join(UNITTESTDIR, "eln_cfood.yaml")) + assert len(rlist) == 1 + assert isinstance(rlist[0], db.Record) + assert rlist[0].name == "records-example" + assert rlist[0].description == "This is a sample record." 
+ assert rlist[0].parents[0].name == "Dataset" + assert rlist[0].get_property("keywords").value == "sample" + assert rlist[0].get_property("dateModified").value == "2024-08-21T11:43:17.626965+00:00" diff --git a/unittests/test_scalars_cfood.py b/unittests/test_scalars_cfood.py index ba604fe4f5b695506bf8df9dab79fc23232c546a..577fcd5f6c93bee2bc05451983d358aa2e07f798 100644 --- a/unittests/test_scalars_cfood.py +++ b/unittests/test_scalars_cfood.py @@ -2,10 +2,11 @@ # Tests for: # https://gitlab.com/caosdb/caosdb-crawler/-/issues/9 # A. Schlemmer, 06/2021 -import os from pathlib import Path import pytest +from utils import dircheckstr + # The main function that is affected by this issue: from caoscrawler.converters import handle_value from caoscrawler.crawl import Crawler @@ -14,8 +15,6 @@ from caoscrawler.scanner import scan_directory # We need the store for the above function from caoscrawler.stores import GeneralStore -from utils import dircheckstr - UNITTESTDIR = Path(__file__).parent @@ -24,15 +23,15 @@ def test_handle_value(): store = GeneralStore() # This one should work: - assert handle_value("bla", store) == ("bla", "single") + assert handle_value("bla", store) == ("bla", None, "single") # These failed: - assert handle_value(4, store) == (4, "single") - assert handle_value(4.2, store) == (4.2, "single") - assert handle_value(True, store) == (True, "single") + assert handle_value(4, store) == (4, None, "single") + assert handle_value(4.2, store) == (4.2, None, "single") + assert handle_value(True, store) == (True, None, "single") # List test: - assert handle_value([4, 3, 2], store) == ([4, 3, 2], "single") + assert handle_value([4, 3, 2], store) == ([4, 3, 2], None, "single") def test_record_structure_generation(): diff --git a/unittests/test_scanner.py b/unittests/test_scanner.py index 226b5040547f0e003729dba63622edf836552f18..5cbbc63406ffb3f5ec1f9019ed7877d7880d7b69 100644 --- a/unittests/test_scanner.py +++ b/unittests/test_scanner.py @@ -34,15 +34,16 @@ 
from unittest.mock import MagicMock, Mock, patch import linkahead as db import pytest import yaml +from pytest import raises +from utils import dircheckstr as dircheck_base + from caoscrawler.crawl import Crawler from caoscrawler.debug_tree import DebugTree -from caoscrawler.scanner import (create_converter_registry, load_definition, +from caoscrawler.scanner import (_load_definition_from_yaml_dict, + create_converter_registry, load_definition, scan_directory, scan_structure_elements) from caoscrawler.structure_elements import (DictElement, DictListElement, DictTextElement, File) -from pytest import raises - -from utils import dircheckstr as dircheck_base UNITTESTDIR = Path(__file__).parent @@ -316,3 +317,91 @@ def test_record_parents(): assert rec.parents[0].name == 'Stuff' # default parent stays if no parent is given on # lower levels assert len(rec.parents) == 1 + + +def test_error_messages(): + data = { + 'Experiments': {} + } + + broken_yaml = """ +EmptyConverter: + """ + broken_definition = _load_definition_from_yaml_dict( + [yaml.load(broken_yaml, Loader=yaml.SafeLoader)]) + + converter_registry = create_converter_registry(broken_definition) + + with pytest.raises(RuntimeError, match="Definition of converter \"EmptyConverter\" is empty"): + scan_structure_elements(DictElement(name="", value=data), + broken_definition, converter_registry) + + broken_yaml = """ +Converter: + type: DictElement + records: + TestRecord: "42" + """ + + broken_definition = _load_definition_from_yaml_dict( + [yaml.load(broken_yaml, Loader=yaml.SafeLoader)]) + + converter_registry = create_converter_registry(broken_definition) + + with pytest.raises(RuntimeError, match="dict expected, but found str: 42"): + scan_structure_elements(DictElement(name="", value=data), + broken_definition, converter_registry) + + +def test_units(): + """Test the correct setting of units.""" + crawler_definition = load_definition(UNITTESTDIR / "test_unit_cfood.yml") + converter_registry = 
create_converter_registry(crawler_definition) + + data = { + "value_with_unit": "1.1 m", + "array_with_units": [ + "1.1 cm", + "2.2 cm" + ] + } + records = scan_structure_elements(DictElement(name="", value=data), crawler_definition, + converter_registry) + assert len(records) == 1 + rec = records[0] + # This is hard-coded in cfood: + assert rec.get_property("may_be_overwritten") is not None + assert rec.get_property("may_be_overwritten").value == "12" + assert rec.get_property("may_be_overwritten").unit == "K" + # Those are set from data + assert rec.get_property("value_with_unit") is not None + assert rec.get_property("value_with_unit").value == "1.1" + assert rec.get_property("value_with_unit").unit == "m" + assert rec.get_property("list_with_unit") is not None + assert rec.get_property("list_with_unit").value == ["1.1", "2.2"] + assert rec.get_property("list_with_unit").unit == "cm" + + # Contradictory units + data = { + "array_with_units": [ + "1.1 K", + "45 W" + ] + } + with raises(RuntimeError) as rte: + records = scan_structure_elements(DictElement(name="", value=data), crawler_definition, + converter_registry) + assert "Property 'list_with_unit' has contradictory units" in str(rte.value) + + # Overwrite value and unit + data = { + "may_be_overwritten": "400 °C" + } + records = scan_structure_elements(DictElement(name="", value=data), crawler_definition, + converter_registry) + assert len(records) == 1 + rec = records[0] + # Now set from data + assert rec.get_property("may_be_overwritten") is not None + assert rec.get_property("may_be_overwritten").value == "400" + assert rec.get_property("may_be_overwritten").unit == "°C" diff --git a/unittests/test_schema.py b/unittests/test_schema.py index ea8549b0b8dfd1f1af35784082a9e46320cfcff4..96c388ac362583eda13ca368519467c34446868e 100644 --- a/unittests/test_schema.py +++ b/unittests/test_schema.py @@ -2,17 +2,15 @@ # Tests for schema validation # A. 
Schlemmer, 06/2021 -from importlib_resources import files -import linkahead as db - -from os.path import join, dirname -from caoscrawler import Crawler +from os.path import dirname, join +import linkahead as db import pytest -from pytest import raises - +from importlib_resources import files from jsonschema.exceptions import ValidationError +from pytest import raises +from caoscrawler import Crawler from caoscrawler.scanner import load_definition diff --git a/unittests/test_spss_converter.py b/unittests/test_spss_converter.py index 7ffc18dba43a6f7cd3c9fbc9273da349b4ec3c6e..59fe723849dadcda21a699416372f08f2756f4e1 100644 --- a/unittests/test_spss_converter.py +++ b/unittests/test_spss_converter.py @@ -20,16 +20,12 @@ import datetime import importlib -import re from pathlib import Path import numpy as np import pytest -from caoscrawler.converters import ( - ConverterValidationError, - SPSSConverter, -) +from caoscrawler.converters import ConverterValidationError, SPSSConverter from caoscrawler.structure_elements import (BooleanElement, DictElement, Directory, File, FloatElement, IntegerElement, ListElement, diff --git a/unittests/test_sync_graph.py b/unittests/test_sync_graph.py index 84451790ddd02f90c2a12a3ce7280b17d8f7c73b..06f0dfb9eb3d3536d26dcfd354ca27f08ef99a02 100644 --- a/unittests/test_sync_graph.py +++ b/unittests/test_sync_graph.py @@ -21,25 +21,21 @@ import logging from functools import partial +from itertools import product from unittest.mock import MagicMock, Mock, patch import linkahead as db import pytest from test_crawler import (basic_retrieve_by_name_mock_up, - mock_cached_only_rt_allow_empty, - mock_get_entity_by, - ) + mock_cached_only_rt_allow_empty, mock_get_entity_by) from caoscrawler.exceptions import (MissingIdentifyingProperty, - MissingRecordType, - ) + MissingRecordType) from caoscrawler.identifiable import Identifiable from caoscrawler.identifiable_adapters import CaosDBIdentifiableAdapter from caoscrawler.sync_graph import SyncGraph, 
_set_each_scalar_value from caoscrawler.sync_node import SyncNode, parent_in_list, property_in_list -from itertools import product - @pytest.fixture def simple_adapter(): diff --git a/unittests/test_sync_node.py b/unittests/test_sync_node.py index bd9e1a6ccbc2ac9ec9ccace96e0ec0422ba1d95b..1f95551d34f9e06ab3e2fc196e1e7809eabfa019 100644 --- a/unittests/test_sync_node.py +++ b/unittests/test_sync_node.py @@ -18,19 +18,18 @@ # You should have received a copy of the GNU Affero General Public License # along with this program. If not, see <https://www.gnu.org/licenses/>. # -from functools import partial from unittest.mock import MagicMock, Mock, patch import linkahead as db import pytest +from test_crawler import basic_retrieve_by_name_mock_up, mock_get_entity_by + from caoscrawler.exceptions import ImpossibleMergeError from caoscrawler.identifiable import Identifiable from caoscrawler.identifiable_adapters import CaosDBIdentifiableAdapter from caoscrawler.sync_graph import SyncGraph from caoscrawler.sync_node import SyncNode, parent_in_list, property_in_list -from test_crawler import basic_retrieve_by_name_mock_up, mock_get_entity_by - def assert_parents_equal(p1, p2): """Special assertion for comparing parents.""" diff --git a/unittests/test_table_converter.py b/unittests/test_table_converter.py index 3b563fd3179968fd90b1c92b9bc5bf0db9ed0858..c606c1d3cdf9a95f00728eaae88153631b08af53 100644 --- a/unittests/test_table_converter.py +++ b/unittests/test_table_converter.py @@ -28,12 +28,13 @@ test the converters module import importlib import math -import os from os.path import basename, dirname, join from pathlib import Path import linkahead as db import pytest +from utils import dircheckstr + from caoscrawler import Crawler from caoscrawler.converters import (Converter, ConverterValidationError, CSVTableConverter, DictConverter, @@ -48,8 +49,6 @@ from caoscrawler.structure_elements import (BooleanElement, DictElement, IntegerElement, ListElement, TextElement) -from utils 
import dircheckstr - UNITTESTDIR = Path(__file__).parent diff --git a/unittests/test_transformers.py b/unittests/test_transformers.py index 4ed12751d9052c839aa4db4abd586c419bed1018..0571dbd31de9b37230f0ee1d93c22c6df47c87e7 100644 --- a/unittests/test_transformers.py +++ b/unittests/test_transformers.py @@ -29,19 +29,18 @@ See: https://gitlab.indiscale.com/caosdb/src/caosdb-crawler/-/issues/107 """ import importlib -from functools import partial from pathlib import Path -from tempfile import NamedTemporaryFile from unittest.mock import MagicMock, Mock, patch import linkahead as db import pytest import yaml +from pytest import raises + from caoscrawler.converters import Converter, ListElementConverter from caoscrawler.scanner import create_transformer_registry, scan_directory from caoscrawler.stores import GeneralStore from caoscrawler.transformer_functions import replace, split -from pytest import raises UNITTESTDIR = Path(__file__).parent diff --git a/unittests/test_unit_cfood.yml b/unittests/test_unit_cfood.yml new file mode 100644 index 0000000000000000000000000000000000000000..214aa49adceedce49a162f380ec453fb8597f215 --- /dev/null +++ b/unittests/test_unit_cfood.yml @@ -0,0 +1,43 @@ +--- +metadata: + crawler-version: 0.9.0 +--- +data: + type: Dict + match_name: '.*' + records: + MyRec: + may_be_overwritten: + value: "12" + unit: K + subtree: + ValueWithUnit: + type: TextElement + match_name: ^value_with_unit$ + match_value: "^(?P<number>\\d+\\.?\\d*)\\s+(?P<unit>.+)" + records: + MyRec: + value_with_unit: + value: $number + unit: $unit + MayBeOverwritten: + type: TextElement + match_name: ^may_be_overwritten$ + match_value: "^(?P<number>\\d+\\.?\\d*)\\s+(?P<unit>.+)" + records: + MyRec: + may_be_overwritten: + value: $number + unit: $unit + ListOfValues: + type: ListElement + match_name: ^array_with_units$ + subtree: + SingleValueWithUnit: + type: TextElement + match_value: "^(?P<number>\\d+\\.?\\d*)\\s+(?P<unit>.+)" + records: + MyRec: + list_with_unit: + 
value: +$number + unit: $unit diff --git a/unittests/test_utilities.py b/unittests/test_utilities.py index 15e84a609149ac602ee80b7357f7622566563792..463e304a99161f2294e5d202611dcf0b829e2045 100644 --- a/unittests/test_utilities.py +++ b/unittests/test_utilities.py @@ -22,7 +22,7 @@ import pytest from caoscrawler.crawl import split_restricted_path -from caoscrawler.utils import get_shared_resource_link, MissingImport +from caoscrawler.utils import MissingImport, get_shared_resource_link def test_split_restricted_path(): diff --git a/unittests/test_variable_substitutions.py b/unittests/test_variable_substitutions.py index 90d144b04a4e1271f74b769759e3f201007af705..c75e37956c1ec24e47ff9cbd9b03572ed4a0f80e 100644 --- a/unittests/test_variable_substitutions.py +++ b/unittests/test_variable_substitutions.py @@ -19,7 +19,6 @@ # along with this program. If not, see <https://www.gnu.org/licenses/>. # -from copy import deepcopy from functools import partial from os.path import basename, dirname, join from pathlib import Path @@ -28,6 +27,10 @@ from unittest.mock import MagicMock, Mock import linkahead as db import pytest import yaml +from linkahead.apiutils import compare_entities +from pytest import raises +from utils import dircheckstr as dircheckstr_base + from caoscrawler import Crawler from caoscrawler.debug_tree import DebugTree from caoscrawler.identifiable_adapters import (IdentifiableAdapter, @@ -35,10 +38,6 @@ from caoscrawler.identifiable_adapters import (IdentifiableAdapter, from caoscrawler.scanner import scan_directory from caoscrawler.structure_elements import (DictListElement, DictTextElement, File) -from linkahead.apiutils import compare_entities -from pytest import raises - -from utils import dircheckstr as dircheckstr_base UNITTESTDIR = Path(__file__).parent dircheckstr = partial(dircheckstr_base, UNITTESTDIR / "test_directories" / diff --git a/unittests/test_xml_converter.py b/unittests/test_xml_converter.py index 
fb4c7746fa2d0b6c3d4ec95fc1de3139493a703f..e8869ef6ffad511159a583a14fd49d2fad48766b 100644 --- a/unittests/test_xml_converter.py +++ b/unittests/test_xml_converter.py @@ -24,20 +24,18 @@ test the XML converters """ import importlib -import json +from pathlib import Path + import pytest -import sys import yaml - from lxml.etree import fromstring -from pathlib import Path -from caoscrawler.converters import XMLTagConverter +from caoscrawler.converters import (XMLAttributeNodeConverter, XMLTagConverter, + XMLTextNodeConverter) from caoscrawler.scanner import load_definition from caoscrawler.stores import GeneralStore from caoscrawler.structure_elements import XMLTagElement - UNITTESTDIR = Path(__file__).parent @@ -51,6 +49,9 @@ def converter_registry(): "XMLTextNode": { "converter": "XMLTextNodeConverter", "package": "caoscrawler.converters"}, + "XMLAttributeNode": { + "converter": "XMLAttributeNodeConverter", + "package": "caoscrawler.converters"}, } for key, value in converter_registry.items(): @@ -294,3 +295,85 @@ nsmap: children = converter.create_children(GeneralStore(), tag) assert len(children) == 1 assert children[0].name == "{default-namespace}node1[2]/{sub-namespace}node2" + + +def test_attrib_nodes(converter_registry): + """ + Test attribute node converters. + """ + + xml_text = """ + <node1 active="true" size="45"> + Bla + </node1> +""" + + converter = XMLTagConverter(yaml.safe_load(""" +type: XMLTag +match_tag: "node1" +xpath: . 
+tags_as_children: false +attribs_as_children: true +"""), "TestXMLTagConverter", converter_registry) + + tag = XMLTagElement(fromstring(xml_text)) + m = converter.match(tag) + assert m is not None + children = converter.create_children(GeneralStore(), tag) + assert len(children) == 2 + + attrib_converter = XMLAttributeNodeConverter(yaml.safe_load(""" +type: XMLAttributeNode +match_name: active +match_value: (?P<val>.*) +"""), "TestXMLAttributeNodeConverter", converter_registry) + m = attrib_converter.match(children[1]) + assert m is None + m = attrib_converter.match(children[0]) + assert m is not None + assert m["val"] == "true" + + attrib_converter = XMLAttributeNodeConverter(yaml.safe_load(""" +type: XMLAttributeNode +match_name: size +match_value: (?P<val>.*) +"""), "TestXMLAttributeNodeConverter", converter_registry) + m = attrib_converter.match(children[0]) + assert m is None + m = attrib_converter.match(children[1]) + assert m is not None + assert m["val"] == "45" + + +def test_text_nodes(converter_registry): + """ + Test text node converters. + """ + + xml_text = """ + <node1 active="true" size="45"> + Bla + </node1> +""" + + converter = XMLTagConverter(yaml.safe_load(""" +type: XMLTag +match_tag: "node1" +xpath: . 
+tags_as_children: false +text_as_children: true +"""), "TestXMLTagConverter", converter_registry) + + tag = XMLTagElement(fromstring(xml_text)) + m = converter.match(tag) + assert m is not None + children = converter.create_children(GeneralStore(), tag) + assert len(children) == 1 + + attrib_converter = XMLTextNodeConverter(yaml.safe_load(""" +type: XMLTextNode +match_text: \s*(?P<val>\w*)\s* +"""), "TestXMLTextNodeConverter", converter_registry) + m = attrib_converter.match(children[0]) + assert m is not None + assert m["val"] == "Bla" diff --git a/unittests/utils.py b/unittests/utils.py index a9649dea686c33dc33d0d7636d08aa51beb35412..fee80e44028667b9b3c8c8f8201b1a774c46afdf 100644 --- a/unittests/utils.py +++ b/unittests/utils.py @@ -36,5 +36,5 @@ def dircheckstr(prefix, *pathcomponents): ftype = "Directory" else: ftype = "File" - return (f"caoscrawler.structure_elements.{ftype}: " + os.path.basename( + return (f"caoscrawler.structure_elements.structure_elements.{ftype}: " + os.path.basename( os.path.join(*pathcomponents)) + ", " + os.path.join(prefix, *pathcomponents))