diff --git a/.docker/Dockerfile b/.docker/Dockerfile index 1e9763f3496c9dca6cc33e6ba8217a654bed487e..1468a17feb16940ae658d3ca6b885af7139ce3d8 100644 --- a/.docker/Dockerfile +++ b/.docker/Dockerfile @@ -34,7 +34,7 @@ RUN rm -r /git/.git # Install pycaosdb.ini for the tests RUN mv /git/.docker/tester_pycaosdb.ini /git/integrationtests/pycaosdb.ini -RUN cd /git/ && pip3 install --break-system-packages .[h5-crawler,spss] +RUN cd /git/ && pip3 install --break-system-packages .[h5-crawler,spss,rocrate] WORKDIR /git/integrationtests # wait for server, diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 8812abacc0ef157c418e8f658a4fa7261bb04743..e43223568252b2e7a1504610692fe20dc9d78348 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -121,27 +121,21 @@ unittest_py3.11: - python3 -c "import sys; assert sys.version.startswith('3.11')" - tox -unittest_py3.8: +unittest_py3.9: tags: [cached-dind] stage: test - image: python:3.8 + image: python:3.9 script: &python_test_script # install dependencies - pip install pytest pytest-cov # TODO: Use f-branch logic here - pip install git+https://gitlab.indiscale.com/caosdb/src/caosdb-pylib.git@dev - pip install git+https://gitlab.indiscale.com/caosdb/src/caosdb-advanced-user-tools.git@dev - - pip install .[h5-crawler,spss] + - pip install .[h5-crawler,spss,rocrate] # actual test - caosdb-crawler --help - pytest --cov=caosdb -vv ./unittests -unittest_py3.9: - tags: [cached-dind] - stage: test - image: python:3.9 - script: *python_test_script - unittest_py3.10: tags: [cached-dind] stage: test @@ -155,23 +149,10 @@ unittest_py3.12: script: *python_test_script unittest_py3.13: - allow_failure: true tags: [cached-dind] stage: test - image: python:3.13-rc - script: - # TODO: Replace by '*python_test_script' as soon as 3.13 has been officially released. - # TODO Remove the "!" 
after 3.13 release, which serves as an xfail - - apt update && apt install -y cargo - # install dependencies - - pip install pytest pytest-cov - # TODO: Use f-branch logic here - - pip install git+https://gitlab.indiscale.com/caosdb/src/caosdb-pylib.git@dev - - (! pip install git+https://gitlab.indiscale.com/caosdb/src/caosdb-advanced-user-tools.git@dev) - - (! pip install .[h5-crawler,spss]) - # actual test - - (! caosdb-crawler --help) - - (! pytest --cov=caosdb -vv ./unittests) + image: python:3.13 + script: *python_test_script inttest: tags: [docker] diff --git a/CHANGELOG.md b/CHANGELOG.md index 8dc1293bfaff323b14da65476294a63c0df587c5..354024f9be37fc102f035a5d6562b6d522aaa915 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,15 +9,81 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Added ### +- Validation module for checking a list of generated records against a list of json schemas + that can be generated from a yaml data model file. +- DictElementConverters can now make use of `match_properties` which + works analogous to `match_properties` in ROCrateEntityConverter and + `match_attrib` in XMLConverter. +- `match_properties` is a method of class Converter and can for + example be used by CustomConverters. +- ZipFileConverter that opens zip files and exposes their contents as + File and Directory structure elements. +- `linkahead-crawler` script as alias for `caosdb-crawler`. +- New transformers of the form `cast_to_*` which allow casting + variables to `int`, `float`, `str` and `bool`. +- Transformer function definition in the cfood support variable + substitutions now. +- `crawler_main` and `scanner.scan_directory` now support list of + directories to be crawled, too. 
Note that giving a list of + directories is currently incompatible with + `securityMode=SecurityMode.RETRIEVE` or + `securityMode=SecurityMode.INSERT` since the functionality to + authorize pending inserts or updates doesn't support path lists yet + and will raise a NotImplementedError for now. +- `match_newer_than_file` option for `DirectoryConverter`: A reference + file containing (only) an ISO-formatted datetime string can be + specified here. Directories with this option won't match if all + their contents were last modified before that datetime. + +### Changed ### + +### Deprecated ### + +### Removed ### + +### Fixed ### + +- `spss_to_datamodel` script works again. + +### Security ### + +### Documentation ### + +## [0.10.1] - 2024-11-13 ## + +### Fixed ### + +* Removed optional rocrate dependency which prevented package + publication on PyPI for a violation of PEP 440 (see + https://github.com/pypi/warehouse/issues/7136). It will be + re-activated once + https://github.com/ResearchObject/ro-crate-py/issues/203 has been + resolved upstream. For now, if you want to use the ROCrate or ELN + converters, manually install the fix from + https://github.com/salexan2001/ro-crate-py.git@f-automatic-dummy-ids + ```sh + pip install git+https://github.com/salexan2001/ro-crate-py.git@f-automatic-dummy-ids + ``` + +## [0.10.0] - 2024-11-13 ## + +### Added ### + - XMLTextNodeConverter for converting text nodes created by XMLTagConverter - XMLAttributeNodeConverter for converting attribute nodes created by XMLTagConverter - Units for properties. 
They can be specified by giving the property as a dict in the form ```yaml MyRecord: - my_prop: - value: 5 - unit: m + my_prop: + value: 5 + unit: m ``` +- Support for Python 3.13 +- ROCrateConverter, ELNFileConverter and ROCrateEntityConverter for crawling ROCrate and .eln files +- `max_log_level` parameter to `logging.configure_server_side_logging` + to control the server-side debuglog's verbosity, and an optional + `sss_max_log_level` parameter to `crawler_main` to control the SSS + loglevel separately from the global `debug` option. ### Changed ### @@ -28,17 +94,22 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - if `value` starts with '+', collection mode is "list". - if `value` starts with '*', collection mode is "multiproperty". - in all other cases, collection mode is "single". - -### Deprecated ### +- The default server-side scripting debug level is now controlled by + the global `debug` option by default and set to log level `INFO` in + case of `debug=False`. The previous behavior can be restored by + calling `crawler_main` with `sss_max_log_level=logging.DEBUG`. ### Removed ### +* Support for Python 3.8 (end of life) + ### Fixed ### - Added better error message for some cases of broken converter and record definitions. - -### Security ### +- [#108](https://gitlab.com/linkahead/linkahead-crawler/-/issues/108) + Too verbose server-side scripting logs that could lead to high disk + usage. 
### Documentation ### diff --git a/CITATION.cff b/CITATION.cff index 99756999f4a42818510fffbe1d02e1bf4396540b..ed859432b26cde913f7283fb8e969a97b7b74f41 100644 --- a/CITATION.cff +++ b/CITATION.cff @@ -17,6 +17,6 @@ authors: given-names: Alexander orcid: https://orcid.org/0000-0003-4124-9649 title: CaosDB - Crawler -version: 0.9.1 +version: 0.10.1 doi: 10.3390/data9020024 -date-released: 2024-09-26 \ No newline at end of file +date-released: 2024-11-13 \ No newline at end of file diff --git a/integrationtests/test_crawler_main.py b/integrationtests/test_crawler_main.py new file mode 100644 index 0000000000000000000000000000000000000000..a2eebf4f04e195754eaf71dc5e829b6a77a4cc4b --- /dev/null +++ b/integrationtests/test_crawler_main.py @@ -0,0 +1,95 @@ +# This file is a part of the LinkAhead Project. +# +# Copyright (C) 2024 Indiscale GmbH <info@indiscale.com> +# 2024 Florian Spreckelsen <f.spreckelsen@indiscale.com> +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <https://www.gnu.org/licenses/>. 
+# + +import logging +import tempfile + +from pathlib import Path + +import linkahead as db + +from caoscrawler import crawl +from caoscrawler.crawl import (crawler_main, SecurityMode) +from linkahead.utils.register_tests import clear_database, set_test_key + +set_test_key("10b128cf8a1372f30aa3697466bb55e76974e0c16a599bb44ace88f19c8f61e2") + +INTTESTDIR = Path(__file__).parent + + +def test_list_of_paths(clear_database, monkeypatch): + + # Mock the status record + dummy_status = { + "n_calls": 0 + } + + def _mock_update_status_record(run_id, n_inserts, n_updates, status): + print("Update mocked status") + dummy_status["run_id"] = run_id + dummy_status["n_inserts"] = n_inserts + dummy_status["n_updates"] = n_updates + dummy_status["status"] = status + dummy_status["n_calls"] += 1 + monkeypatch.setattr(crawl, "_update_status_record", _mock_update_status_record) + + # mock SSS environment + monkeypatch.setenv("SHARED_DIR", tempfile.gettempdir()) + + # We need only one dummy RT + rt = db.RecordType(name="TestType").insert() + basepath = INTTESTDIR / "test_data" / "crawler_main_with_list_of_dirs" + dirlist = [basepath / "dir1", basepath / "dir2"] + crawler_main( + dirlist, + cfood_file_name=basepath / "cfood.yml", + identifiables_definition_file=basepath / "identifiable.yml" + ) + recs = db.execute_query("FIND TestType") + assert len(recs) == 2 + assert "Test1" in [r.name for r in recs] + assert "Test2" in [r.name for r in recs] + + assert dummy_status["n_inserts"] == 2 + assert dummy_status["n_updates"] == 0 + assert dummy_status["status"] == "OK" + assert dummy_status["n_calls"] == 1 + + +def test_not_implemented_list_with_authorization(caplog, clear_database): + + rt = db.RecordType(name="TestType").insert() + basepath = INTTESTDIR / "test_data" / "crawler_main_with_list_of_dirs" + dirlist = [basepath / "dir1", basepath / "dir2"] + + # This is not implemented yet, so check log for correct error. 
+ ret = crawler_main( + dirlist, + cfood_file_name=basepath / "cfood.yml", + identifiables_definition_file=basepath / "identifiable.yml", + securityMode=SecurityMode.RETRIEVE + ) + # crawler_main hides the error, but has a non-zero return code and + # errors in the log: + assert ret != 0 + err_tuples = [t for t in caplog.record_tuples if t[1] == logging.ERROR] + assert len(err_tuples) == 1 + assert "currently implemented only for single paths, not for lists of paths" in err_tuples[0][2] + # No inserts after the errors + assert len(db.execute_query("FIND TestType")) == 0 diff --git a/integrationtests/test_data/crawler_main_with_list_of_dirs/cfood.yml b/integrationtests/test_data/crawler_main_with_list_of_dirs/cfood.yml new file mode 100644 index 0000000000000000000000000000000000000000..c7f22ce07e9b401915aefde3bf7e3a78d92e2bd6 --- /dev/null +++ b/integrationtests/test_data/crawler_main_with_list_of_dirs/cfood.yml @@ -0,0 +1,10 @@ +--- +metadata: + crawler-version: 0.10.2 +--- +BaseDirElement: + type: Directory + match: ^dir(?P<dir_number>[0-9]+)$$ + records: + TestType: + name: Test$dir_number diff --git a/integrationtests/test_data/crawler_main_with_list_of_dirs/dir1/.gitkeep b/integrationtests/test_data/crawler_main_with_list_of_dirs/dir1/.gitkeep new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/integrationtests/test_data/crawler_main_with_list_of_dirs/dir2/.gitkeep b/integrationtests/test_data/crawler_main_with_list_of_dirs/dir2/.gitkeep new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/integrationtests/test_data/crawler_main_with_list_of_dirs/identifiable.yml b/integrationtests/test_data/crawler_main_with_list_of_dirs/identifiable.yml new file mode 100644 index 0000000000000000000000000000000000000000..6d608cece0ae7c2aa6461fb56025a8ac8e4faf6f --- /dev/null +++ 
b/integrationtests/test_data/crawler_main_with_list_of_dirs/identifiable.yml @@ -0,0 +1,2 @@ +TestType: + - name diff --git a/integrationtests/test_issues.py b/integrationtests/test_issues.py index 38d00a5e249a711beb2b48e783efb058cd07ad36..01e2c3033271a7a4b2f4472c93fa13a62454ccc5 100644 --- a/integrationtests/test_issues.py +++ b/integrationtests/test_issues.py @@ -1,4 +1,4 @@ -# This file is a part of the CaosDB Project. +# This file is a part of the LinkAhead Project. # # Copyright (C) 2022 Indiscale GmbH <info@indiscale.com> # 2022 Florian Spreckelsen <f.spreckelsen@indiscale.com> @@ -16,21 +16,22 @@ # You should have received a copy of the GNU Affero General Public License # along with this program. If not, see <https://www.gnu.org/licenses/>. # +import tempfile + import linkahead as db +import yaml from caosadvancedtools.models.parser import parse_model_from_string from caoscrawler.crawl import Crawler from caoscrawler.identifiable import Identifiable from caoscrawler.identifiable_adapters import CaosDBIdentifiableAdapter -from caoscrawler.scanner import (create_converter_registry, - scan_structure_elements, - _load_definition_from_yaml_dict) +from caoscrawler.scanner import (_load_definition_from_yaml_dict, + create_converter_registry, + scan_structure_elements) from caoscrawler.structure_elements import DictElement from linkahead.cached import cache_clear from linkahead.utils.register_tests import clear_database, set_test_key from pytest import fixture, mark, raises -import yaml - set_test_key("10b128cf8a1372f30aa3697466bb55e76974e0c16a599bb44ace88f19c8f61e2") @@ -383,3 +384,46 @@ Block: crawler = Crawler(identifiableAdapter=ident) crawler.synchronize(crawled_data=records) + + +def test_issue_14(clear_database): + """ + Issue title: Some parent updates are required before inserts + + https://gitlab.indiscale.com/caosdb/src/caosdb-crawler/-/issues/14 + """ + + rt1 = db.RecordType(name="RT1") + rt2 = db.RecordType(name="RT2").insert() + rt1.add_property(rt2, 
importance=db.OBLIGATORY) + rt1.insert() + + r = db.Record() + r.add_parent(rt1) + with tempfile.NamedTemporaryFile() as tmpf: + f = db.File(name="test_parent", path="parent_test/file.txt", file=tmpf.name) + f.insert() + + # We create a clean new file object here: + f2 = db.File(name="test_parent", path="parent_test/file.txt", file=tmpf.name) + + f2.add_parent(rt2) + r.add_property(name="RT2", value=f2) + + # Current state in the database: File without parents + f_test_base = db.File(name="test_parent").retrieve() + assert len(f_test_base.parents) == 0 + assert len(db.execute_query("FIND Record")) == 0 + + ident = CaosDBIdentifiableAdapter() + ident.register_identifiable("RT1", db.RecordType().add_parent( + name="RT1").add_property(name="RT2")) + crawler = Crawler(identifiableAdapter=ident) + crawler.synchronize(crawled_data=[f2, r]) + + f_test = db.File(name="test_parent").retrieve() + assert len(f_test.parents) == 1 + assert f_test.parents[0].name == "RT2" + records = db.execute_query("FIND Record") + assert len(records) == 1 + assert records[0].get_property("RT2").value == f_test.id diff --git a/setup.cfg b/setup.cfg index 558599013f3556a41481305ba587e3947a403d63..d05f2acb1e8d5afafa5a1003c6da2dff0980c126 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,6 +1,6 @@ [metadata] name = caoscrawler -version = 0.9.2 +version = 0.10.2 author = Alexander Schlemmer author_email = alexander.schlemmer@ds.mpg.de description = A new crawler for LinkAhead @@ -17,11 +17,11 @@ classifiers = package_dir = = src packages = find: -python_requires = >=3.8 +python_requires = >=3.9 install_requires = caosadvancedtools >= 0.7.0 importlib-resources - linkahead > 0.13.2 + linkahead >= 0.16.0 odfpy #make optional packaging pandas @@ -39,8 +39,9 @@ per-file-ignores = __init__.py:F401 [options.entry_points] console_scripts = + linkahead-crawler = caoscrawler.crawl:main caosdb-crawler = caoscrawler.crawl:main - spss_to_datamodel = caoscrawler.conv_impl.spss:spss_to_datamodel_main + 
spss_to_datamodel = caoscrawler.converters.spss:spss_to_datamodel_main csv_to_datamodel = caoscrawler.scripts.generators:csv_to_datamodel_main [options.extras_require] @@ -49,3 +50,5 @@ h5-crawler = numpy spss = pandas[spss] +rocrate = + rocrate @ git+https://github.com/salexan2001/ro-crate-py.git@f-automatic-dummy-ids diff --git a/src/caoscrawler/authorize.py b/src/caoscrawler/authorize.py index 6f1011b227881d4b73186996076abe20d94d52e5..f3deed4f8c78afa85fdd4471fe9383760b8c8b12 100644 --- a/src/caoscrawler/authorize.py +++ b/src/caoscrawler/authorize.py @@ -19,10 +19,10 @@ # along with this program. If not, see <https://www.gnu.org/licenses/>. # -from caosadvancedtools.crawler import Crawler as OldCrawler - import argparse +from caosadvancedtools.crawler import Crawler as OldCrawler + def parse_args(): parser = argparse.ArgumentParser() diff --git a/src/caoscrawler/cfood-schema.yml b/src/caoscrawler/cfood-schema.yml index c5e0eaad092c12efbceb5f55b62b3d7cf8afdccf..d2e4cea24f0f2803499116420091b36e95b2c781 100644 --- a/src/caoscrawler/cfood-schema.yml +++ b/src/caoscrawler/cfood-schema.yml @@ -88,6 +88,12 @@ cfood: match_value: description: a regexp that is matched to the value of a key-value pair type: string + match_newer_than_file: + description: | + Only relevant for Directory. A path to a file containing + an ISO-formatted datetime. Only match if the contents of the + Directory have been modified after that datetime. + type: string record_from_dict: description: Only relevant for PropertiesFromDictElement. Specify the root record which is generated from the contained dictionary. 
type: object diff --git a/src/caoscrawler/converters/__init__.py b/src/caoscrawler/converters/__init__.py index 540a4cfca9ff19248baab2bc0fe8d10987d4bd1f..edb7b3633cea2657dc3b9638379a3e57c37c87e4 100644 --- a/src/caoscrawler/converters/__init__.py +++ b/src/caoscrawler/converters/__init__.py @@ -18,11 +18,12 @@ # You should have received a copy of the GNU Affero General Public License # along with this program. If not, see <https://www.gnu.org/licenses/>. -"""Submdule containing all default and optional converters.""" +"""Submodule containing all default and optional converters.""" from .. import utils from .converters import * from .xml_converter import * +from .zipfile_converter import ZipFileConverter try: from .spss import SPSSConverter @@ -30,3 +31,17 @@ except ImportError as err: SPSSConverter: type = utils.MissingImport( name="SPSSConverter", hint="Try installing with the `spss` extra option.", err=err) + +try: + from .rocrate import (ELNFileConverter, ROCrateConverter, + ROCrateEntityConverter) +except ImportError as err: + ROCrateEntityConverter: type = utils.MissingImport( + name="ROCrateEntityConverter", hint="Try installing with the `rocrate` extra option.", + err=err) + ROCrateConverter: type = utils.MissingImport( + name="ROCrateConverter", hint="Try installing with the `rocrate` extra option.", + err=err) + ELNFileConverter: type = utils.MissingImport( + name="ELNFileConverter", hint="Try installing with the `rocrate` extra option.", + err=err) diff --git a/src/caoscrawler/converters/converters.py b/src/caoscrawler/converters/converters.py index 22686e0dbae26e3322059928cf3ba0b4522f672c..f95862a900b46d9d92a2d3389d41487266a790dc 100644 --- a/src/caoscrawler/converters/converters.py +++ b/src/caoscrawler/converters/converters.py @@ -400,6 +400,15 @@ class Converter(object, metaclass=ABCMeta): self.converters.append(Converter.converter_factory( converter_definition, converter_name, converter_registry)) + self.setup() + + def setup(self): + """ + Analogous 
to `cleanup`. Can be used to set up variables that are permanently + stored in this converter. + """ + pass + @staticmethod def converter_factory(definition: dict, name: str, converter_registry: dict): """Create a Converter instance of the appropriate class. @@ -447,6 +456,90 @@ class Converter(object, metaclass=ABCMeta): raise RuntimeError("Condition does not match.") values.update(m) + def match_properties(self, properties: dict, vardict: dict, label: str = "match_properties"): + """This method can be used to generically match 'match_properties' from the cfood definition + with the behavior described as follows: + + 'match_properties' is a dictionary of key-regexps and value-regexp pairs. Each key matches + a property name and the corresponding value matches its property value. + + What a property means in the context of the respective converter can be different, examples: + + * XMLTag: attributes of the node + * ROCrate: properties of the ROCrateEntity + * DictElement: properties of the dict + + label can be used to customize the name of the dictionary in the definition. + + This method is not called by default, but can be called from child classes. + + Typically it would be used like this from methods overwriting `match`:: + + if not self.match_properties(<properties>, vardict): + return None + + vardict will be updated in place when there are + matches. <properties> is a dictionary taken from the structure + element that contains the properties in the context of this + converter. + + + Parameters + ---------- + + properties: dict + The dictionary containing the properties to be matched. + + vardict: dict + This dictionary will be used to store the variables created during the matching. + + label: str + Default "match_properties". Can be used to change the name + of the property in the definition. E.g. the xml converter + uses "match_attrib" which makes more sense in the context + of xml trees. 
+ + Returns + ------- + + : bool + Returns True when properties match and False + otherwise. The vardict dictionary is updated in place. + + """ + if label in self.definition: + # This matcher works analogously to the attributes matcher in the XMLConverter + for prop_def_key, prop_def_value in self.definition[label].items(): + match_counter = 0 + matched_m_prop = None + matched_m_prop_value = None + for prop_key, prop_value in properties.items(): + # print("{} = {}".format(prop_key, prop_value)) + # TODO: automatic conversion to str ok? + m_prop = re.match(prop_def_key, str(prop_key)) + if m_prop is not None: + match_counter += 1 + matched_m_prop = m_prop + # TODO: automatic conversion to str ok? + m_prop_value = re.match(prop_def_value, str(prop_value)) + if m_prop_value is None: + return False + matched_m_prop_value = m_prop_value + # TODO: How to deal with multiple matches? + # There are multiple options: + # - Allow multiple attribute-key matches: Leads to possible overwrites of variables + # - Require unique attribute-key and attribute-value matches: Very complex + # - Only allow one single attribute-key to match and run attribute-value match separately. + # Currently the latter option is implemented. + # TODO: The ROCrateEntityConverter implements a very similar behavior. + if match_counter == 0: + return False + elif match_counter > 1: + raise RuntimeError("Multiple properties match the same {} entry.".format(label)) + vardict.update(matched_m_prop.groupdict()) + vardict.update(matched_m_prop_value.groupdict()) + return True + def apply_transformers(self, values: GeneralStore, transformer_functions: dict): """ Check if transformers are defined using the "transform" keyword. 
@@ -482,10 +575,19 @@ class Converter(object, metaclass=ABCMeta): " one element with they key being the name" " of the function!") tr_func_key = list(tr_func_el.keys())[0] - tr_func_params = tr_func_el[tr_func_key] + if tr_func_key not in transformer_functions: raise RuntimeError("Unknown transformer function: {}".format(tr_func_key)) + # Do variable replacment on function parameters: + if tr_func_el[tr_func_key] is not None: + # Create a copy of the function parameters: + tr_func_params = dict(tr_func_el[tr_func_key]) + for key in tr_func_params: + tr_func_params[key] = replace_variables(tr_func_params[key], values) + else: + tr_func_params = None + # Retrieve the function from the dictionary: tr_func = transformer_functions[tr_func_key] # Call the function: @@ -619,6 +721,13 @@ class Converter(object, metaclass=ABCMeta): """ pass + def cleanup(self): + """ + This function is called when the converter runs out of scope and can be used to + clean up objects that were needed in the converter or its children. + """ + pass + class DirectoryConverter(Converter): """ @@ -660,6 +769,11 @@ class DirectoryConverter(Converter): m = re.match(self.definition["match"], element.name) if m is None: return None + if "match_newer_than_file" in self.definition: + last_modified = self._get_most_recent_change_in_dir(element) + reference = self._get_reference_file_timestamp() + if last_modified < reference: + return None return m.groupdict() @staticmethod @@ -682,6 +796,49 @@ class DirectoryConverter(Converter): return children + @staticmethod + def _get_most_recent_change_in_dir(element: Directory) -> datetime.datetime: + """Return the datetime of the most recent change of any file + or directory in the given Directory element. 
+ + """ + most_recent = os.path.getmtime(element.path) + + for root, _, files in os.walk(element.path): + mtimes = [os.path.getmtime(root)] + \ + [os.path.getmtime(os.path.join(root, fname)) for fname in files] + if max(mtimes) > most_recent: + most_recent = max(mtimes) + + return datetime.datetime.fromtimestamp(most_recent) + + def _get_reference_file_timestamp(self) -> datetime.datetime: + """Return a time stamp read from a reference file if it + exists. Otherwise return datetime.datetime.min, i.e., the + earliest datetime known to datetime. + + """ + + if "match_newer_than_file" not in self.definition: + logger.debug("No reference file specified.") + return datetime.datetime.min + + elif not os.path.isfile(self.definition["match_newer_than_file"]): + logger.debug("Reference file doesn't exist.") + return datetime.datetime.min + + with open(self.definition["match_newer_than_file"]) as ref_file: + stamp_str = ref_file.readline().strip() + try: + return datetime.datetime.fromisoformat(stamp_str) + except ValueError as e: + logger.error( + f"Reference file in {self.definition['match_newer_than_file']} " + "doesn't contain a ISO formatted datetime in its first line. " + "Match regardless of modification times." 
+ ) + raise e + class SimpleFileConverter(Converter): """Just a file, ignore the contents.""" @@ -860,7 +1017,12 @@ class DictElementConverter(Converter): # TODO: See comment on types and inheritance if not isinstance(element, DictElement): raise RuntimeError("Element must be a DictElement.") - return match_name_and_value(self.definition, element.name, element.value) + vardict = match_name_and_value(self.definition, element.name, element.value) + + if not self.match_properties(element.value, vardict): + return None + + return vardict class PropertiesFromDictConverter(DictElementConverter): diff --git a/src/caoscrawler/converters/hdf5_converter.py b/src/caoscrawler/converters/hdf5_converter.py index a4d974bd53fc4b0e22d155f01a6a47295b79e984..97dac53d053dbcb87c48f0cfb59d4f09770b9710 100644 --- a/src/caoscrawler/converters/hdf5_converter.py +++ b/src/caoscrawler/converters/hdf5_converter.py @@ -28,16 +28,16 @@ except ModuleNotFoundError: "its optional `h5-crawler` dependency?" ) -import numpy as np - from typing import Union import linkahead as db +import numpy as np -from .converters import (convert_basic_element, Converter, DictElementConverter, - match_name_and_value, SimpleFileConverter) from ..stores import GeneralStore, RecordStore -from ..structure_elements import DictElement, File, FloatElement, IntegerElement, StructureElement +from ..structure_elements import (DictElement, File, FloatElement, + IntegerElement, StructureElement) +from .converters import (Converter, DictElementConverter, SimpleFileConverter, + convert_basic_element, match_name_and_value) def convert_attributes(elt: Union[h5py.File, h5py.Group, h5py.Dataset]): diff --git a/src/caoscrawler/converters/rocrate.py b/src/caoscrawler/converters/rocrate.py new file mode 100644 index 0000000000000000000000000000000000000000..8a45af753312a2bf29c1ddb9e6bcb15458c3ebde --- /dev/null +++ b/src/caoscrawler/converters/rocrate.py @@ -0,0 +1,213 @@ +# encoding: utf-8 +# +# This file is a part of the LinkAhead 
Project. +# +# Copyright (C) 2024 Indiscale GmbH <info@indiscale.com> +# Copyright (C) 2024 Alexander Schlemmer +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <https://www.gnu.org/licenses/>. + +"""Converters take structure elements and create Records and new structure elements from them. + +This converter converts ro-crate files which may also be .eln-files. + +""" + +from __future__ import annotations + +import os +import re +import tempfile +from typing import Optional +from zipfile import ZipFile + +import rocrate +from rocrate.rocrate import ROCrate + +from ..stores import GeneralStore +from ..structure_elements import (Directory, File, ROCrateEntity, + StructureElement) +from .converters import Converter, SimpleFileConverter, convert_basic_element + + +class ROCrateConverter(SimpleFileConverter): + + """Convert ro-crate files / directories. + """ + + def setup(self): + self._tempdir = None + + def cleanup(self): + self._tempdir.cleanup() + + def typecheck(self, element: StructureElement): + """ + Check whether the current structure element can be converted using + this converter. 
+ """ + return isinstance(element, File) or isinstance(element, Directory) + + def match(self, element: StructureElement) -> Optional[dict]: + m = re.match(self.definition["match"], element.name) + if m is None: + return None + return m.groupdict() + + def create_children(self, generalStore: GeneralStore, element: StructureElement): + """ + Loads an ROCrate from an rocrate file or directory. + + Arguments: + ---------- + element must be a File or Directory (structure element). + + Returns: + -------- + A list with an ROCrateElement representing the contents of the .eln-file or None + in case of errors. + """ + + if isinstance(element, File): + self._tempdir = tempfile.TemporaryDirectory() + with ZipFile(element.path) as zipf: + zipf.extractall(self._tempdir.name) + crate_path = self._tempdir.name + crate = ROCrate(crate_path) + entity_ls = [] + for ent in crate.get_entities(): + entity_ls.append(ROCrateEntity(crate_path, ent)) + return entity_ls + elif isinstance(element, Directory): + # This would be an unzipped .eln file + # As this is possible for rocrate files, I think it is reasonable + # to support it as well. + raise NotImplementedError() + else: + raise ValueError("create_children was called with wrong type of StructureElement") + return None + + +class ELNFileConverter(ROCrateConverter): + + """Convert .eln-Files + See: https://github.com/TheELNConsortium/TheELNFileFormat + + These files are basically RO-Crates with some minor differences: + - The ro-crate metadata file is not on top-level within the .eln-zip-container, + but in a top-level subdirectory. + """ + + def create_children(self, generalStore: GeneralStore, element: StructureElement): + """ + Loads an ROCrate from an .eln-file or directory. + + This involves unzipping the .eln-file to a temporary folder and creating an ROCrate object + from its contents. + + Arguments: + ---------- + element must be a File or Directory (structure element). 
+ + Returns: + -------- + A list with an ROCrateElement representing the contents of the .eln-file or None + in case of errors. + """ + + if isinstance(element, File): + self._tempdir = tempfile.TemporaryDirectory() + with ZipFile(element.path) as zipf: + zipf.extractall(self._tempdir.name) + cratep = os.listdir(self._tempdir.name) + if len(cratep) != 1: + raise RuntimeError(".eln file must contain exactly one folder") + crate_path = os.path.join(self._tempdir.name, cratep[0]) + crate = ROCrate(crate_path) + entity_ls = [] + for ent in crate.get_entities(): + entity_ls.append(ROCrateEntity(crate_path, ent)) + return entity_ls + elif isinstance(element, Directory): + # This would be an unzipped .eln file + # As this is possible for rocrate files, I think it is reasonable + # to support it as well. + raise NotImplementedError() + else: + raise ValueError("create_children was called with wrong type of StructureElement") + return None + + +class ROCrateEntityConverter(Converter): + + def typecheck(self, element: StructureElement): + """ + Check whether the current structure element can be converted using + this converter. + """ + return isinstance(element, ROCrateEntity) + + def match(self, element: StructureElement) -> Optional[dict]: + # See https://gitlab.indiscale.com/caosdb/src/caosdb-crawler/-/issues/145 + # for a suggestion for the design of the matching algorithm. + if not isinstance(element, ROCrateEntity): + raise TypeError("Element must be an instance of ROCrateEntity.") + + # Store the result of all individual regexp variable results: + vardict = {} + + # TODO: I accidentally used "match_type" instead + # of "match_entity_type". This was completely + # unnoticed. So add it to schema and adapt tests. 
+ + if "match_entity_type" in self.definition: + entity_type = element.entity.type + if isinstance(entity_type, list): + # TODO: this seems to be a bug in kadi4mat RO-Crates + # ./ has type ['Dataset'] + # instead of type 'Dataset' + entity_type = entity_type[0] + m_type = re.match(self.definition["match_entity_type"], entity_type) + if m_type is None: + return None + vardict.update(m_type.groupdict()) + + if not self.match_properties(element.entity.properties(), vardict): + return None + + return vardict + + def create_children(self, generalStore: GeneralStore, element: StructureElement): + + children = [] + + eprops = element.entity.properties() + + # Add the properties: + for name, value in eprops.items(): + children.append(convert_basic_element(value, name)) + + # Add the files: + if isinstance(element.entity, rocrate.model.file.File): + path, name = os.path.split(eprops["@id"]) + children.append(File(name, os.path.join(element.folder, path, name))) + + # Parts of this entity are added as child entities: + if "hasPart" in eprops: + for p in eprops["hasPart"]: + children.append( + ROCrateEntity(element.folder, element.entity.crate.dereference( + p["@id"]))) + + return children diff --git a/src/caoscrawler/converters/spss.py b/src/caoscrawler/converters/spss.py index b4f03aeaed6663be98487a4780bb96237e72e27e..00742e91506245435ed0c590f68ea9ffce65717a 100644 --- a/src/caoscrawler/converters/spss.py +++ b/src/caoscrawler/converters/spss.py @@ -22,17 +22,16 @@ from __future__ import annotations # Can be removed with 3.10. import argparse from collections import OrderedDict +from typing import Any, Optional import numpy as np import pandas as pd import pyreadstat import yaml -from . import converters from ..stores import GeneralStore -from ..structure_elements import (File, StructureElement) -from typing import Optional, Any - +from ..structure_elements import File, StructureElement +from . 
import converters READSTAT_TYPES = { "double": "DOUBLE", diff --git a/src/caoscrawler/converters/xml_converter.py b/src/caoscrawler/converters/xml_converter.py index 0f25c0c0947421f0561c42318ac0abddabb447fc..60d7b49431fb011a06b7105a16471b0b3c7b2268 100644 --- a/src/caoscrawler/converters/xml_converter.py +++ b/src/caoscrawler/converters/xml_converter.py @@ -22,17 +22,16 @@ from __future__ import annotations -import lxml.etree import re - from typing import Optional -import linkahead as db +import lxml.etree -from .converters import SimpleFileConverter, ConverterValidationError, Converter -from ..stores import GeneralStore, RecordStore -from ..structure_elements import (File, StructureElement, - XMLTagElement, XMLTextNode, XMLAttributeNode) +from ..stores import GeneralStore +from ..structure_elements import (File, StructureElement, XMLAttributeNode, + XMLTagElement, XMLTextNode) +from .converters import (Converter, ConverterValidationError, + SimpleFileConverter) class XMLFileConverter(SimpleFileConverter): @@ -163,32 +162,8 @@ class XMLTagConverter(Converter): return None vardict.update(m_text.groupdict()) - if "match_attrib" in self.definition: - for attrib_def_key, attrib_def_value in self.definition["match_attrib"].items(): - match_counter = 0 - matched_m_attrib = None - matched_m_attrib_value = None - for attr_key, attr_value in element.tag.attrib.items(): - m_attrib = re.match(attrib_def_key, attr_key) - if m_attrib is not None: - match_counter += 1 - matched_m_attrib = m_attrib - m_attrib_value = re.match(attrib_def_value, attr_value) - if m_attrib_value is None: - return None - matched_m_attrib_value = m_attrib_value - # TODO: How to deal with multiple matches? - # There are multiple options: - # - Allow multiple attribute-key matches: Leads to possible overwrites of variables - # - Require unique attribute-key and attribute-value matches: Very complex - # - Only allow one single attribute-key to match and run attribute-value match separately. 
- # Currently the latter option is implemented. - if match_counter == 0: - return None - elif match_counter > 1: - raise RuntimeError("Multiple attributes match the same match_attrib entry.") - vardict.update(matched_m_attrib.groupdict()) - vardict.update(matched_m_attrib_value.groupdict()) + if not self.match_properties(element.tag.attrib, vardict, "match_attrib"): + return None return vardict diff --git a/src/caoscrawler/converters/zipfile_converter.py b/src/caoscrawler/converters/zipfile_converter.py new file mode 100644 index 0000000000000000000000000000000000000000..7073e66a266168e17eb9b6143e7dc6292b5149dc --- /dev/null +++ b/src/caoscrawler/converters/zipfile_converter.py @@ -0,0 +1,82 @@ +# encoding: utf-8 +# +# This file is a part of the LinkAhead Project. +# +# Copyright (C) 2024 Indiscale GmbH <info@indiscale.com> +# Copyright (C) 2024 Alexander Schlemmer +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <https://www.gnu.org/licenses/>. + +"""Converters take structure elements and create Records and new structure elements from them. + +This converter opens zip files, unzips them into a temporary directory and +exposes its contents as File structure elements. 
+
+"""
+
+from __future__ import annotations
+
+import os
+import tempfile
+from os.path import isdir, join
+from zipfile import ZipFile
+
+from ..stores import GeneralStore
+from ..structure_elements import Directory, File, StructureElement
+from .converters import SimpleFileConverter
+
+
+class ZipFileConverter(SimpleFileConverter):
+
+    """Convert zipfiles.
+    """
+
+    def setup(self):
+        self._tempdir = None
+
+    def cleanup(self):
+        self._tempdir.cleanup()
+
+    def create_children(self, generalStore: GeneralStore, element: StructureElement):
+        """
+        Unzip the zip file and expose its contents as child structure elements.
+
+        Arguments:
+        ----------
+        element must be a File (structure element) pointing to a zip file.
+
+        Returns:
+        --------
+        A list of File and Directory structure elements representing the
+        zip file's top-level contents.
+        """
+
+        if isinstance(element, File):
+            self._tempdir = tempfile.TemporaryDirectory()
+            unzd_path = self._tempdir.name
+            with ZipFile(element.path) as zipf:
+                zipf.extractall(unzd_path)
+
+            entity_ls = []
+            for el in os.listdir(unzd_path):
+                path = join(unzd_path, el)
+                if isdir(path):
+                    entity_ls.append(Directory(el, path))
+                else:
+                    entity_ls.append(File(el, path))
+
+            return entity_ls
+        else:
+            raise ValueError("create_children was called with wrong type of StructureElement")
+        return None
diff --git a/src/caoscrawler/crawl.py b/src/caoscrawler/crawl.py
index 89b5ba001ed446c7f5a0f261898deecd3e7a5e00..e0d243979faee8f44cdcee3b0e49c15af640c378 100644
--- a/src/caoscrawler/crawl.py
+++ b/src/caoscrawler/crawl.py
@@ -39,7 +39,6 @@ import sys
 import traceback
 import uuid
 import warnings
-
 from argparse import RawTextHelpFormatter
 from copy import deepcopy
 from datetime import datetime
@@ -52,13 +51,10 @@ from caosadvancedtools.cache import UpdateCache
 from caosadvancedtools.crawler import Crawler as OldCrawler
 from caosadvancedtools.serverside.helper import send_mail
 from caosadvancedtools.utils import create_entity_link
-from linkahead.apiutils import 
(compare_entities, - merge_entities) +from linkahead.apiutils import compare_entities, merge_entities from linkahead.cached import cache_clear, cached_get_entity_by from linkahead.common.datatype import get_list_datatype, is_reference -from linkahead.exceptions import ( - TransactionError, -) +from linkahead.exceptions import TransactionError from linkahead.utils.escape import escape_squoted_text from .config import get_config_setting @@ -99,7 +95,7 @@ in a quite complex fashion: - If one of the entities has additional parents or additional properties -> not identical - If the value of one of the properties differs -> not identical - If datatype, importance or unit are reported different for a property by compare_entities - return "not_identical" only if these attributes are set explicitely by record1. + return False only if these attributes are set explicitely by record1. Ignore the difference otherwise. - If description, name, id or path appear in list of differences -> not identical. - If file, checksum, size appear -> Only different, if explicitely set by record1. 
@@ -535,8 +531,8 @@ one with the entities that need to be updated and the other with entities to be prop.value = Crawler._get_property_id_for_datatype( rtname=prop.datatype, name=prop.value) except (db.EmptyUniqueQueryError, db.QueryNotUniqueError): - logger.error("The Property {prop.name} with datatype={prop.datatype} has the " - "value {prop.value} and there is no appropriate Entity with such " + logger.error(f"The Property {prop.name} with datatype={prop.datatype} has the " + f"value {prop.value} and there is no appropriate Entity with such " "a name.") raise else: @@ -552,8 +548,8 @@ one with the entities that need to be updated and the other with entities to be name=el)) except (db.EmptyUniqueQueryError, db.QueryNotUniqueError): logger.error( - "The Property {prop.name} with datatype={prop.datatype} has the " - "value {prop.value} and there is no appropriate Entity with such " + f"The Property {prop.name} with datatype={prop.datatype} has the " + f"value {prop.value} and there is no appropriate Entity with such " "a name.") raise else: @@ -625,7 +621,7 @@ one with the entities that need to be updated and the other with entities to be crawled_data: Optional[list[db.Record]] = None, no_insert_RTs: Optional[list[str]] = None, no_update_RTs: Optional[list[str]] = None, - path_for_authorized_run: Optional[str] = "", + path_for_authorized_run: Optional[Union[str, list[str]]] = "", ): """ This function applies several stages: @@ -647,7 +643,7 @@ one with the entities that need to be updated and the other with entities to be no_update_RTs : list[str], optional list of RecordType names. Records that have one of those RecordTypes as parent will not be updated - path_for_authorized_run : str, optional + path_for_authorized_run : str or list[str], optional only used if there are changes that need authorization before being applied. The form for rerunning the crawler with the authorization of these changes will be generated with this path. 
See @@ -665,6 +661,12 @@ one with the entities that need to be updated and the other with entities to be "use for example the Scanner to create this data.")) crawled_data = self.crawled_data + if isinstance(path_for_authorized_run, list) and self.securityMode != SecurityMode.UPDATE: + raise NotImplementedError( + "Authorization of inserts and updates is currently implemented only " + "for single paths, not for lists of paths." + ) + to_be_inserted, to_be_updated = self._split_into_inserts_and_updates( SyncGraph(crawled_data, self.identifiableAdapter)) @@ -1008,7 +1010,7 @@ def _store_dry_run_data(ins, upd): "update": updates})) -def crawler_main(crawled_directory_path: str, +def crawler_main(crawled_directory_path: Union[str, list[str]], cfood_file_name: str, identifiables_definition_file: Optional[str] = None, debug: bool = False, @@ -1020,13 +1022,14 @@ def crawler_main(crawled_directory_path: str, restricted_path: Optional[list[str]] = None, remove_prefix: Optional[str] = None, add_prefix: Optional[str] = None, + sss_max_log_level: Optional[int] = None, ): """ Parameters ---------- - crawled_directory_path : str - path to be crawled + crawled_directory_path : str or list[str] + path(s) to be crawled cfood_file_name : str filename of the cfood to be used identifiables_definition_file : str @@ -1053,6 +1056,12 @@ def crawler_main(crawled_directory_path: str, add_prefix : Optional[str] Add the given prefix to file paths. See docstring of '_fix_file_paths' for more details. + sss_max_log_level : Optional[int] + If given, set the maximum log level of the server-side + scripting log separately from the general ``debug`` option. If + None is given, the maximum sss log level will be determined + from the value of ``debug``: ``logging.INFO`` if ``debug`` is + False, ``logging.DEBUG`` if ``debug`` is True. 
Returns ------- @@ -1063,7 +1072,11 @@ def crawler_main(crawled_directory_path: str, crawler = Crawler(securityMode=securityMode) if "SHARED_DIR" in os.environ: # setup logging and reporting if serverside execution - userlog_public, htmluserlog_public, debuglog_public = configure_server_side_logging() + if sss_max_log_level is None: + sss_max_log_level = logging.DEBUG if debug else logging.INFO + userlog_public, htmluserlog_public, debuglog_public = configure_server_side_logging( + max_log_level=sss_max_log_level + ) # TODO make this optional _create_status_record( get_shared_resource_link(get_config_setting("public_host_url"), htmluserlog_public), @@ -1108,42 +1121,28 @@ def crawler_main(crawled_directory_path: str, crawler.run_id) _update_status_record(crawler.run_id, len(inserts), len(updates), status="OK") return 0 - except ForbiddenTransaction as err: - logger.debug(traceback.format_exc()) - logger.error(err) - _update_status_record(crawler.run_id, 0, 0, status="FAILED") - return 1 - except ConverterValidationError as err: - logger.debug(traceback.format_exc()) - logger.error(err) - _update_status_record(crawler.run_id, 0, 0, status="FAILED") - return 1 - except ImpossibleMergeError as err: - logger.debug(traceback.format_exc()) - logger.error( - "Encountered conflicting information when creating Records from the crawled " - f"data:\n\n{err}" - ) - _update_status_record(crawler.run_id, 0, 0, status="FAILED") - return 1 - except TransactionError as err: - logger.debug(traceback.format_exc()) - logger.error(err) - logger.error("Transaction error details:") - for suberr in err.errors: - logger.error("---") - logger.error(suberr.msg) - logger.error(suberr.entity) - return 1 except Exception as err: logger.debug(traceback.format_exc()) logger.error(err) - - if "SHARED_DIR" in os.environ: - # pylint: disable=E0601 - domain = get_config_setting("public_host_url") - logger.error("Unexpected Error: Please tell your administrator about this and provide " - f"the 
following path.\n{get_shared_resource_link(domain, debuglog_public)}")
+        # Special treatment for known error types
+        if isinstance(err, ImpossibleMergeError):
+            logger.error(
+                "Encountered conflicting information when creating Records from the crawled "
+                f"data:\n\n{err}"
+            )
+        elif isinstance(err, TransactionError):
+            logger.error("Transaction error details:")
+            for suberr in err.errors:
+                logger.error("---")
+                logger.error(suberr.msg)
+                logger.error(suberr.entity)
+        # Unknown errors get a special message
+        elif not isinstance(err, (ConverterValidationError, ForbiddenTransaction)):
+            if "SHARED_DIR" in os.environ:
+                # pylint: disable=E0601
+                domain = get_config_setting("public_host_url")
+                logger.error("Unexpected Error: Please tell your administrator about this and provide "
+                             f"the following path.\n{get_shared_resource_link(domain, debuglog_public)}")
         _update_status_record(crawler.run_id, 0, 0, status="FAILED")
         return 1
 
@@ -1167,6 +1166,7 @@ def parse_args():
                         "This file will only be generated if this option is set.")
     parser.add_argument("--debug", required=False, action="store_true",
                         help="Path name of the cfood yaml file to be used.")
+    # TODO allow to provide multiple directories to be crawled on the commandline
     parser.add_argument("crawled_directory_path",
                         help="The subtree of files below the given path will "
                         "be considered. Use '/' for everything.")
diff --git a/src/caoscrawler/debug_tree.py b/src/caoscrawler/debug_tree.py
index 0d57040f5c20aca236a3c11531e8b7c45bad89ab..c154f5b91d850476be0c0610e5bb1dfcbf9866ab 100644
--- a/src/caoscrawler/debug_tree.py
+++ b/src/caoscrawler/debug_tree.py
@@ -29,35 +29,20 @@ A structure containing debug tree information.
from __future__ import annotations -import argparse -import importlib -import logging -import os -import sys -import warnings -import yaml - -from argparse import RawTextHelpFormatter from collections import defaultdict -from copy import deepcopy -from enum import Enum -from importlib_resources import files -from jsonschema import validate -from typing import Any, Optional, Type, Union import linkahead as db - -from caosadvancedtools.cache import UpdateCache, Cache -from caosadvancedtools.crawler import Crawler as OldCrawler -from linkahead.apiutils import (compare_entities, EntityMergeConflictError, +import yaml +from importlib_resources import files +from jsonschema import validate +from linkahead.apiutils import (EntityMergeConflictError, compare_entities, merge_entities) from linkahead.common.datatype import is_reference -from .converters import Converter, DirectoryConverter, ConverterValidationError - +from .converters import Converter, ConverterValidationError, DirectoryConverter from .macros import defmacro_constructor, macro_constructor -from .stores import Store, GeneralStore, RecordStore -from .structure_elements import StructureElement, Directory, NoneElement +from .stores import GeneralStore, RecordStore, Store +from .structure_elements import Directory, NoneElement, StructureElement from .version import check_cfood_version diff --git a/src/caoscrawler/default_transformers.yml b/src/caoscrawler/default_transformers.yml index ffcb1b15bd2bad71083cc8f0ba84172ee3daf2b0..0de9a6e0585c5246fa5a21ffcbdfc37cfdc2b88d 100644 --- a/src/caoscrawler/default_transformers.yml +++ b/src/caoscrawler/default_transformers.yml @@ -15,3 +15,15 @@ date_parse: datetime_parse: package: caoscrawler.transformer_functions function: datetime_parse +cast_to_int: + package: caoscrawler.transformer_functions + function: cast_to_int +cast_to_float: + package: caoscrawler.transformer_functions + function: cast_to_float +cast_to_bool: + package: caoscrawler.transformer_functions + 
function: cast_to_bool +cast_to_str: + package: caoscrawler.transformer_functions + function: cast_to_str diff --git a/src/caoscrawler/identifiable_adapters.py b/src/caoscrawler/identifiable_adapters.py index 854ee614638712bdcf957c592ef2946dbdd43afc..592f603bef508771d734ff633f8cdb2c100742d5 100644 --- a/src/caoscrawler/identifiable_adapters.py +++ b/src/caoscrawler/identifiable_adapters.py @@ -36,12 +36,8 @@ import yaml from linkahead.cached import cached_get_entity_by, cached_query from linkahead.utils.escape import escape_squoted_text -from .exceptions import ( - InvalidIdentifiableYAML, - MissingIdentifyingProperty, - MissingRecordType, - MissingReferencingEntityError, -) +from .exceptions import (InvalidIdentifiableYAML, MissingIdentifyingProperty, + MissingRecordType, MissingReferencingEntityError) from .identifiable import Identifiable from .sync_node import SyncNode from .utils import has_parent diff --git a/src/caoscrawler/logging.py b/src/caoscrawler/logging.py index 69ec1fabb97e1d236162552540a35815e25a33fb..b57a067d8635a468df7345365fabbfae9ee0b22f 100644 --- a/src/caoscrawler/logging.py +++ b/src/caoscrawler/logging.py @@ -20,29 +20,46 @@ # along with this program. If not, see <https://www.gnu.org/licenses/>. import logging +import sys -from caosadvancedtools.webui_formatter import WebUI_Formatter from caosadvancedtools.serverside.helper import get_shared_filename -import sys +from caosadvancedtools.webui_formatter import WebUI_Formatter -def configure_server_side_logging(): +def configure_server_side_logging(max_log_level: int = logging.INFO): """ Set logging up to save one plain debugging log file, one plain info log file (for users) and a stdout stream with messages wrapped in html elements returns the path to the file with debugging output + + Parameters + ---------- + max_log_level : int, optional + The maximum log level to use for SSS-logs. Default is + ``logging.INFO``. 
+ + Returns + ------- + userlog_public, htmluserlog_public, debuglog_public: str + Public paths of the respective log files. """ adv_logger = logging.getLogger("caosadvancedtools") - adv_logger.setLevel(level=logging.DEBUG) + # The max_<level> variables will be used to set the logger levels + # to the respective maximum of intended level and max_log_level, + # effectively cutting off logging above the specified + # max_log_level. + max_info = max(logging.INFO, max_log_level) + max_debug = max(logging.DEBUG, max_log_level) + adv_logger.setLevel(level=max_debug) cr_logger = logging.getLogger("caoscrawler") - cr_logger.setLevel(level=logging.DEBUG) + cr_logger.setLevel(level=max_debug) userlog_public, userlog_internal = get_shared_filename("userlog.txt") root_logger = logging.getLogger() - root_logger.setLevel(level=logging.INFO) + root_logger.setLevel(level=max_info) # this is a log file with INFO level for the user user_file_handler = logging.FileHandler(filename=userlog_internal) diff --git a/src/caoscrawler/macros/macro_yaml_object.py b/src/caoscrawler/macros/macro_yaml_object.py index d85883011db3cf651da0dda6c110015128fbe439..5d2bc1fe0775499fa8b40a65e115fb4569892e38 100644 --- a/src/caoscrawler/macros/macro_yaml_object.py +++ b/src/caoscrawler/macros/macro_yaml_object.py @@ -26,11 +26,10 @@ # A. 
Schlemmer, 05/2022 import re -from dataclasses import dataclass -from typing import Any, Dict from copy import deepcopy +from dataclasses import dataclass from string import Template - +from typing import Any, Dict _SAFE_SUBST_PAT = re.compile(r"^\$(?P<key>\w+)$") _SAFE_SUBST_PAT_BRACES = re.compile(r"^\$\{(?P<key>\w+)}$") diff --git a/src/caoscrawler/scanner.py b/src/caoscrawler/scanner.py index eeb2bdbf8f0f0d96579598cd8842739a3d154b93..af1f4173e95827606a02979ddd6d7fcd9f133271 100644 --- a/src/caoscrawler/scanner.py +++ b/src/caoscrawler/scanner.py @@ -39,7 +39,7 @@ import logging import os import warnings from collections.abc import Callable -from typing import Any, Optional, Type, Union +from typing import Any, Optional, Union import linkahead as db import yaml @@ -400,6 +400,9 @@ def scanner(items: list[StructureElement], crawled_data, debug_tree, registered_transformer_functions) + # Clean up converter: + converter.cleanup() + if restricted_path and not path_found: raise RuntimeError("A 'restricted_path' argument was given that is not contained in " "the data tree") @@ -418,7 +421,7 @@ def scanner(items: list[StructureElement], # -------------------------------------------------------------------------------- -def scan_directory(dirname: str, crawler_definition_path: str, +def scan_directory(dirname: Union[str, list[str]], crawler_definition_path: str, restricted_path: Optional[list[str]] = None, debug_tree: Optional[DebugTree] = None): """ Crawl a single directory. @@ -431,10 +434,12 @@ def scan_directory(dirname: str, crawler_definition_path: str, Parameters ---------- + dirname: str or list[str] + directory or list of directories to be scanned restricted_path: optional, list of strings - Traverse the data tree only along the given path. When the end of the given path - is reached, traverse the full tree as normal. See docstring of 'scanner' for - more details. + Traverse the data tree only along the given path. 
When the end + of the given path is reached, traverse the full tree as + normal. See docstring of 'scanner' for more details. Returns ------- @@ -452,26 +457,31 @@ def scan_directory(dirname: str, crawler_definition_path: str, if not dirname: raise ValueError( "You have to provide a non-empty path for crawling.") - dir_structure_name = os.path.basename(dirname) - - # TODO: needs to be covered somewhere else - crawled_directory = dirname - if not dir_structure_name and dirname.endswith('/'): - if dirname == '/': - # Crawling the entire file system - dir_structure_name = "root" - else: - # dirname had a trailing '/' - dir_structure_name = os.path.basename(dirname[:-1]) - - return scan_structure_elements(Directory(dir_structure_name, - dirname), - crawler_definition, - converter_registry, - restricted_path=restricted_path, - debug_tree=debug_tree, - registered_transformer_functions=registered_transformer_functions - ) + if not isinstance(dirname, list): + dirname = [dirname] + dir_element_list = [] + for dname in dirname: + dir_structure_name = os.path.basename(dname) + + # TODO: needs to be covered somewhere else + crawled_directory = dname + if not dir_structure_name and dname.endswith(os.path.sep): + if dname == os.path.sep: + # Crawling the entire file system + dir_structure_name = "root" + else: + # dirname had a trailing '/' + dir_structure_name = os.path.basename(dname[:-1]) + dir_element_list.append(Directory(dir_structure_name, dname)) + + return scan_structure_elements( + dir_element_list, + crawler_definition, + converter_registry, + restricted_path=restricted_path, + debug_tree=debug_tree, + registered_transformer_functions=registered_transformer_functions + ) def scan_structure_elements(items: Union[list[StructureElement], StructureElement], diff --git a/src/caoscrawler/scripts/generators.py b/src/caoscrawler/scripts/generators.py index ba8e6e39cc03e9be1923d72ec5c8d699c01fa8f9..2bf8a90f5af5086e23b7e7cc35d21a50d8cd511a 100644 --- 
a/src/caoscrawler/scripts/generators.py
+++ b/src/caoscrawler/scripts/generators.py
@@ -30,7 +30,6 @@ from typing import Optional
 import pandas as pd
 import yaml
 
-
 DM_TEMPLATE = """# auto-generated data model from file "[]{infile}".
 # To insert a datamodel into LinkAhead, run:
 #
diff --git a/src/caoscrawler/structure_elements/__init__.py b/src/caoscrawler/structure_elements/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..351f1069708ec94c0dd27313b6329d89858d4330
--- /dev/null
+++ b/src/caoscrawler/structure_elements/__init__.py
@@ -0,0 +1,31 @@
+# encoding: utf-8
+#
+# This file is a part of the LinkAhead Project.
+#
+# Copyright (C) 2024 Indiscale GmbH <info@indiscale.com>
+# Copyright (C) 2024 Daniel Hornung <d.hornung@indiscale.com>
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <https://www.gnu.org/licenses/>.
+
+"""Submodule containing all default and optional structure elements."""
+
+from .. 
import utils +from .structure_elements import * + +try: + from .rocrate_structure_elements import ROCrateEntity +except ImportError as err: + ROCrateEntity: type = utils.MissingImport( + name="ROCrateEntity", hint="Try installing with the `rocrate` extra option.", + err=err) diff --git a/src/caoscrawler/structure_elements/rocrate_structure_elements.py b/src/caoscrawler/structure_elements/rocrate_structure_elements.py new file mode 100644 index 0000000000000000000000000000000000000000..66768ad800128297a27f47d672352f21310703e9 --- /dev/null +++ b/src/caoscrawler/structure_elements/rocrate_structure_elements.py @@ -0,0 +1,54 @@ +#!/usr/bin/env python3 +# encoding: utf-8 +# +# ** header v3.0 +# This file is a part of the CaosDB Project. +# +# Copyright (C) 2024 Alexander Schlemmer +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <https://www.gnu.org/licenses/>. +# +# ** end header +# + +from rocrate.model.entity import Entity + +from .structure_elements import StructureElement + + +class ROCrateEntity(StructureElement): + """ + Store entities contained in ROCrates. + """ + + def __init__(self, folder: str, entity: Entity): + """ + Initializes this ROCrateEntity. + + Arguments: + ---------- + folder: str + The folder that contains the ROCrate data. In case of a zipped ROCrate, this + is a temporary folder that the ROCrate was unzipped to. 
+ The folder is the folder containing the ro-crate-metadata.json. + + entity: Entity + The ROCrate entity that is stored in this structure element. + The entity automatically contains an attribute ".crate" + that stores the ROCrate that this entity belongs to. It can be used + e.g. to look up links to other entities (ROCrate.dereference). + """ + super().__init__(entity.properties()["@id"]) + self.folder = folder + self.entity = entity diff --git a/src/caoscrawler/structure_elements.py b/src/caoscrawler/structure_elements/structure_elements.py similarity index 99% rename from src/caoscrawler/structure_elements.py rename to src/caoscrawler/structure_elements/structure_elements.py index 67cd1056b382c92485deada2058526a03b6d8535..3b4c6e9b9d13c61a5924a12d23b11b62edff6924 100644 --- a/src/caoscrawler/structure_elements.py +++ b/src/caoscrawler/structure_elements/structure_elements.py @@ -24,6 +24,7 @@ # import warnings + import lxml.etree diff --git a/src/caoscrawler/sync_graph.py b/src/caoscrawler/sync_graph.py index 9c021a10f35e95ca56d45151b8d064ec905993ec..a05e6320892239cbe8d7f1d9fbd7949a57f9bccb 100644 --- a/src/caoscrawler/sync_graph.py +++ b/src/caoscrawler/sync_graph.py @@ -27,18 +27,17 @@ crawler. 
from __future__ import annotations import logging -from typing import Any, Optional, Union, Callable +import re +from typing import Any, Callable, Optional, Union import linkahead as db from linkahead.cached import cached_get_entity_by from linkahead.exceptions import EmptyUniqueQueryError -from .identifiable_adapters import IdentifiableAdapter from .identifiable import Identifiable +from .identifiable_adapters import IdentifiableAdapter from .sync_node import SyncNode, TempID -import re - logger = logging.getLogger(__name__) diff --git a/src/caoscrawler/sync_node.py b/src/caoscrawler/sync_node.py index d35b49c17aea4cba05ab46291ba65023007283ee..d912d6465a68270411c121f65b4c5a828c9c667e 100644 --- a/src/caoscrawler/sync_node.py +++ b/src/caoscrawler/sync_node.py @@ -22,12 +22,12 @@ from __future__ import annotations -from typing import TYPE_CHECKING, Any, Optional, Union +from typing import TYPE_CHECKING, Any, Optional +from warnings import warn import linkahead as db import yaml -from linkahead.common.models import Parent, _ParentList, _Properties -from warnings import warn +from linkahead.common.models import Parent, ParentList, PropertyList from .exceptions import ImpossibleMergeError @@ -76,8 +76,8 @@ class SyncNode(db.Entity): self.role = entity.role self.path = entity.path self.file = entity.file - self.parents = _ParentList().extend(entity.parents) - self.properties = _Properties().extend(entity.properties) + self.parents = ParentList().extend(entity.parents) + self.properties = PropertyList().extend(entity.properties) self._check_for_multiproperties() # other members self.identifiable: Optional[Identifiable] = None @@ -254,25 +254,11 @@ class SyncNode(db.Entity): ids.add(p.id) -def parent_in_list(parent: Parent, plist: _ParentList) -> bool: +def parent_in_list(parent: Parent, plist: ParentList) -> bool: """helper function that checks whether a parent with the same name or ID is in the plist""" - missing = False - if parent.name is not None: - if parent.name 
not in plist._element_by_name: - missing = True - if parent.id is not None: - if str(parent.id) not in plist._element_by_id: - missing = True - return not missing + return plist.filter(parent) -def property_in_list(prop: db.Property, plist: _Properties) -> bool: +def property_in_list(prop: db.Property, plist: PropertyList) -> bool: """helper function that checks whether a property with the same name or ID is in the plist""" - missing = False - if prop.name is not None: - if prop.name not in plist._element_by_name: - missing = True - if prop.id is not None: - if str(prop.id) not in plist._element_by_id: - missing = True - return not missing + return plist.filter(prop) diff --git a/src/caoscrawler/transformer_functions.py b/src/caoscrawler/transformer_functions.py index ce08bc6bc05caa84f342cdc25f3243c5bab0b79c..117d0b021d4ec0b0efc79c5db0d7ed397207933f 100644 --- a/src/caoscrawler/transformer_functions.py +++ b/src/caoscrawler/transformer_functions.py @@ -99,3 +99,56 @@ Parameters fmt = params.get("datetime_format", fmt_default) dt_str = datetime.datetime.strptime(in_value, fmt).strftime(fmt_default) return dt_str + + +def cast_to_int(in_value: Any, params: dict) -> int: + """ + Cast the `in_value` to int. + + Parameters + ========== + No parameters. + """ + return int(in_value) + + +def cast_to_float(in_value: Any, params: dict) -> float: + """ + Cast the `in_value` to float. + + Parameters + ========== + No parameters. + """ + return float(in_value) + + +def cast_to_bool(in_value: Any, params: dict) -> bool: + """ + Cast the `in_value` to bool. + + This is done by comparing `in_value` to "True". + Only "true", "True", "False" and "false" are accepted as possible values. + All other input values raise an error. + + Parameters + ========== + No parameters. 
+ """ + val = str(in_value).lower() + if val == "true": + return True + if val == "false": + return False + raise ValueError("Invalid value for type cast to bool: {}".format(in_value)) + + +def cast_to_str(in_value: Any, params: dict) -> str: + """ + Cast the `in_value` to str. + + Parameters + ========== + No parameters. + """ + return str(in_value) diff --git a/src/caoscrawler/utils.py b/src/caoscrawler/utils.py index d9a5af839068a2582859aad1b51fbc8b9713d5d1..5f736d5ad7550e0b29cb629b2fa140a2f38d6f5f 100644 --- a/src/caoscrawler/utils.py +++ b/src/caoscrawler/utils.py @@ -26,7 +26,6 @@ # Some utility functions, e.g. for extending pylib. import sys - from posixpath import join as posixjoin from typing import Optional from urllib.parse import urljoin diff --git a/src/caoscrawler/validator.py b/src/caoscrawler/validator.py new file mode 100644 index 0000000000000000000000000000000000000000..33e29b02db429e3382248bbd80d2d00cd7b07c6b --- /dev/null +++ b/src/caoscrawler/validator.py @@ -0,0 +1,163 @@ +#!/usr/bin/env python3 +# encoding: utf-8 +# +# This file is a part of the LinkAhead Project. +# +# Copyright (C) 2024 Indiscale GmbH <info@indiscale.com> +# Copyright (C) 2024 Alexander Schlemmer +# +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <https://www.gnu.org/licenses/>. 
+# +# ** end header +# + +""" +This module contains functions to validate the output of a scanner run with a +json schema. +""" + +import jsonschema +import linkahead as db +# from caosadvancedtools.models.parser import parse_model_from_string +from caosadvancedtools.json_schema_exporter import recordtype_to_json_schema +from caosadvancedtools.models.parser import parse_model_from_yaml +from jsonschema import ValidationError +from linkahead.high_level_api import convert_to_python_object + + +def load_json_schema_from_datamodel_yaml(filename: str) -> dict[str, dict]: + """ + Load a data model yaml file (using caosadvancedtools) and convert + all record types into a json schema using the json_schema_exporter module. + + Arguments + --------- + filename: str + The filename of the yaml file to load. + + Returns + ------- + A dict of json schema objects. The keys are the record types for which the schemas + are generated. + """ + + model = parse_model_from_yaml(filename) + + rt_schemas = {} + for el_key, el in model.items(): + if isinstance(el, db.RecordType): + rt_schemas[el_key] = recordtype_to_json_schema(el) + + return rt_schemas + + +def representer_ordereddict(dumper, data): + """ + Helper function to be able to represent the converted json schema objects correctly as yaml. + This representer essentially replaced OrderedDict objects with simple dict objects. + + Since Python 3.7 dicts are ordered by default, see e.g.: + https://softwaremaniacs.org/blog/2020/02/05/dicts-ordered/en/ + + Example how to use the representer: + ```python + yaml.add_representer(OrderedDict, caoscrawler.validator.representer_ordereddict) + ``` + """ + return dumper.represent_data(dict(data)) + + +def _apply_schema_patches(pobj: dict): + """ + Changes applied: + - properties are moved vom subitem "proeprties" to top-level. 
+ - The following keys are deleted: parents, role, name, description, metadata, properties + """ + if "properties" not in pobj: + # this is probably a file + return pobj + for prop in pobj["properties"]: + if isinstance(pobj["properties"][prop], dict): + pobj[prop] = _apply_schema_patches(pobj["properties"][prop]) + else: + pobj[prop] = pobj["properties"][prop] + + for keyd in ("parents", "role", "name", + "description", "metadata", "properties"): + if keyd in pobj: + del pobj[keyd] + + return pobj + + +def convert_record(record: db.Record): + """ + Convert a record into a form suitable for validation with jsonschema. + + Uses `high_level_api.convert_to_python_object` + Afterwards `_apply_schema_patches` is called recursively to refactor the dictionary + to match the current form of the jsonschema. + + Arguments: + ---------- + record: db.Record + The record that is supposed to be converted. + """ + pobj = convert_to_python_object(record).serialize() + return _apply_schema_patches(pobj) + + +def validate(records: list[db.Record], schemas: dict[str, dict]) -> list[tuple]: + """ + Validate a list of records against a dictionary of schemas. + The keys of the dictionary are record types and the corresponding values are json schemata + associated with that record type. The current implementation assumes that each record that is + checked has exactly one parent and raises an error if that is not the case. + The schema belonging to a record is identified using the name of the first (and only) parent + of the record. + + Arguments: + ---------- + + records: list[db.Record] + List of records that will be validated. + + schemas: dict[str, dict] + A dictionary of JSON schemas generated using `load_json_schema_from_datamodel_yaml`. + + Returns: + -------- + A list of tuples, one element for each record: + + - Index 0: A boolean that determines whether the schema belonging to the record type of the + record matched. 
+ - Index 1: A validation error if the schema did not match or None otherwise. + """ + + retval = [] + for r in records: + if len(r.parents) != 1: + raise NotImplementedError( + "Schema validation is only supported if records have exactly one parent.") + parname = r.parents[0].name + if parname not in schemas: + raise RuntimeError( + "No schema for record type {} in schema dictionary.".format(parname)) + try: + jsonschema.validate(convert_record(r), schemas[parname]) + retval.append((True, None)) + except ValidationError as ex: + retval.append((False, ex)) + return retval diff --git a/src/caoscrawler/version.py b/src/caoscrawler/version.py index 0b72dd65116fbc102a4dc2492d726698cad5a13b..4cd435486aca26e20e785bbbeb65c013d8e727cb 100644 --- a/src/caoscrawler/version.py +++ b/src/caoscrawler/version.py @@ -18,9 +18,10 @@ # along with this program. If not, see <https://www.gnu.org/licenses/>. # from importlib import metadata as importlib_metadata -from packaging.version import parse as parse_version from warnings import warn +from packaging.version import parse as parse_version + def get_caoscrawler_version(): """ Read in version of locally installed caoscrawler package""" diff --git a/src/doc/cfood.rst b/src/doc/cfood.rst index a42d593035bd37d0712986c958fb8ad7ad287968..0c7726d2017b955ecd7472d57dc259ff9a7bab53 100644 --- a/src/doc/cfood.rst +++ b/src/doc/cfood.rst @@ -207,9 +207,9 @@ following. ValueWithUnitElt: type: TextElement match_name: ^my_prop$ - match_value: "^(?P<number>\\d+\\.?\\d*)\s+(?P<unit>.+)" # Extract value and unit from a string which - # has a number followed by at least one whitespace - # character followed by a unit. + match_value: "^(?P<number>\\d+\\.?\\d*)\\s+(?P<unit>.+)" # Extract value and unit from a string which + # has a number followed by at least one whitespace + # character followed by a unit. 
records: MyRecord: MyProp: diff --git a/src/doc/conf.py b/src/doc/conf.py index c52442aa358d7aa60085c48168d7a63798c967b0..01ca66bf03c1fb0e105e97dccaadc4d1ef5d14f0 100644 --- a/src/doc/conf.py +++ b/src/doc/conf.py @@ -21,11 +21,11 @@ # import os import sys + sys.path.insert(0, os.path.abspath('..')) import sphinx_rtd_theme # noqa: E402 - # -- Project information ----------------------------------------------------- project = 'caosdb-caoscrawler' @@ -33,10 +33,10 @@ copyright = '2024, IndiScale' author = 'Alexander Schlemmer' # The short X.Y version -version = '0.9.2' +version = '0.10.2' # The full version, including alpha/beta/rc tags # release = '0.5.2-rc2' -release = '0.9.2-dev' +release = '0.10.2-dev' # -- General configuration --------------------------------------------------- diff --git a/src/doc/converters/standard_converters.rst b/src/doc/converters/standard_converters.rst index 586b84b48be78f1307298a11ad61a2448c3c3cd7..5f86abb5b324e0cc1584e42e6abb2612acc8067f 100644 --- a/src/doc/converters/standard_converters.rst +++ b/src/doc/converters/standard_converters.rst @@ -6,9 +6,17 @@ These are the standard converters that exist in a default installation. For wri Directory Converter =================== -The Directory Converter creates StructureElements for each File and Directory -inside the current Directory. You can match a regular expression against the -directory name using the 'match' key. + +The Directory Converter creates StructureElements for each File and +Directory inside the current Directory. You can match a regular +expression against the directory name using the 'match' key. + +With the optional ``match_newer_than_file`` key, a path to file +containing only an ISO-formatted datetime string can be specified. If +this is done, a directory will only match if it contains at least one +file or directory that has been modified since that datetime. 
If the +file doesn't exist or contains an invalid string, the directory will +be matched regardless of the modification times. Simple File Converter ===================== @@ -41,6 +49,41 @@ The following StructureElement types are typically created by the DictElement co Note that you may use ``TextElement`` for anything that exists in a text format that can be interpreted by the server, such as date and datetime strings in ISO-8601 format. +match_properties +---------------- + +`match_properties` is a dictionary of key-regexps and value-regexp pairs and can be used to +match direct properties of a `DictElement`. Each key matches +a property name and the corresponding value matches its property value. + +Example: +........ + +.. code-block:: json + + { + "@type": "PropertyValue", + "additionalType": "str", + "propertyID": "testextra", + "value": "hi" + } + +When applied to a dict loaded from the above json, a `DictElementConverter` with the following definition: + +.. code-block:: yaml + + Example: + type: DictElement + match_properties: + additionalType: (?P<addt>.*)$ + property(.*): (?P<propid>.*)$ + +will match and create two variables: + +- `addt = "str"` +- `propid = "testextra"` + + Scalar Value Converters ======================= `BooleanElementConverter`, `FloatElementConverter`, `TextElementConverter`, and @@ -331,3 +374,31 @@ XMLTextNodeConverter In the future, this converter can be used to match XMLTextNodes that are generated by the XMLTagConverter. + + +ZipFileConverter +================ + +This converter opens zip files, unzips them into a temporary directory and +exposes its contents as File structure elements. + +Usage Example: +-------------- + +.. code-block:: yaml + + ExampleZipFile: + type: ZipFile + match: example\.zip$ + subtree: + DirInsideZip: + type: Directory + match: experiments$ + FileInsideZip: + type: File + match: description.odt$ + +This converter will match and open files called ``example.zip``. 
If +the file contains a directory called ``experiments`` it will be +processed further by the respective converter in the subtree. The same +is true for a file called ``description.odt``. diff --git a/src/doc/converters/transform_functions.rst b/src/doc/converters/transform_functions.rst index 22df35c8521ea0d70b2ebf7b7c8bc7c52e176bd3..ecd47d2dc004c6f1382279901dfec2d96e0e4a2d 100644 --- a/src/doc/converters/transform_functions.rst +++ b/src/doc/converters/transform_functions.rst @@ -38,8 +38,33 @@ An example that splits the variable ``a`` and puts the generated list in ``b`` i Report: tags: $b -This splits the string in '$a' and stores the resulting list in '$b'. This is here used to add a -list valued property to the Report Record. +This splits the string in '$a' and stores the resulting list in +'$b'. This is here used to add a list valued property to the Report +Record. Note that from LinkAhead Crawler 0.11.0 onwards, the value of +``marker`` in the above example can also be read in from a variable in +the usual ``$`` notation: + +.. code-block:: yaml + + # ... variable ``separator`` is defined somewhere above this part, e.g., + # by reading a config file. + Experiment: + type: Dict + match: ".*" + transform: + param_split: + in: $a + out: $b + functions: + - split: + marker: $separator # Now the separator is read in from a + # variable, so we can, e.g., change from + # '|' to ';' without changing the cfood + # definition. 
+ records: + Report: + tags: $b + There are a number of transform functions that are defined by default (see diff --git a/tox.ini b/tox.ini index 41249e4277391c5ffa4ec13fc4da1a6ee1f48491..e003e26ecd16861c3b8a8d991fc789c78d203e5b 100644 --- a/tox.ini +++ b/tox.ini @@ -3,7 +3,7 @@ envlist = py38, py39, py310, py311, py312, py313 skip_missing_interpreters = true [testenv] -deps = .[h5-crawler,spss] +deps = .[h5-crawler,spss,rocrate] pytest pytest-cov # TODO: Make this f-branch sensitive diff --git a/unittests/datamodels/datamodel.yaml b/unittests/datamodels/datamodel.yaml new file mode 100644 index 0000000000000000000000000000000000000000..2759ecba7f2967062937d9b2f4805a9b501ab6c4 --- /dev/null +++ b/unittests/datamodels/datamodel.yaml @@ -0,0 +1,6 @@ +Dataset: + obligatory_properties: + keywords: + datatype: TEXT + dateModified: + datatype: DATETIME diff --git a/unittests/eln_cfood.yaml b/unittests/eln_cfood.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ab8e7108f511b0450d37c3e60162e412d4a1bf3b --- /dev/null +++ b/unittests/eln_cfood.yaml @@ -0,0 +1,36 @@ +--- +metadata: + crawler-version: 0.9.2 + macros: +--- +Converters: + ELNFile: + converter: ELNFileConverter + package: caoscrawler.converters + ROCrateEntity: + converter: ROCrateEntityConverter + package: caoscrawler.converters + +DataDir: + type: Directory + match: .* + subtree: + ELNFile: + type: ELNFile + match: ^.*\.eln$ + subtree: + RecordsExample: + type: ROCrateEntity + match_type: Dataset + match_properties: + "@id": records-example/$ + name: (?P<name>.*) + keywords: (?P<keywords>.*) + description: (?P<description>.*) + dateModified: (?P<dateModified>.*) + records: + Dataset: + name: $name + keywords: $keywords + description: $description + dateModified: $dateModified diff --git a/unittests/eln_files/PASTA.eln b/unittests/eln_files/PASTA.eln new file mode 100644 index 0000000000000000000000000000000000000000..61866e7d5f57cb32191af6663be230153092e712 Binary files /dev/null and 
b/unittests/eln_files/PASTA.eln differ diff --git a/unittests/eln_files/records-example.eln b/unittests/eln_files/records-example.eln new file mode 100644 index 0000000000000000000000000000000000000000..09ed53fc179e80a240ab773247d6f9adee71b429 Binary files /dev/null and b/unittests/eln_files/records-example.eln differ diff --git a/unittests/test_cfood_metadata.py b/unittests/test_cfood_metadata.py index 494bd383d95b4a845b5ea6f86ccff0f9a1db257f..b123f98584ba99ed4fec412732cb2bf536034a91 100644 --- a/unittests/test_cfood_metadata.py +++ b/unittests/test_cfood_metadata.py @@ -17,15 +17,13 @@ # You should have received a copy of the GNU Affero General Public License # along with this program. If not, see <https://www.gnu.org/licenses/>. # -import pytest -import yaml - from tempfile import NamedTemporaryFile from unittest.mock import patch -from unittest.mock import MagicMock, Mock -import caoscrawler +import pytest +import yaml +import caoscrawler from caoscrawler.scanner import load_definition @@ -35,7 +33,7 @@ def _temp_file_load(txt: str): definition using load_definition from Crawler. 
""" definition = None - with NamedTemporaryFile() as f: + with NamedTemporaryFile(delete=False) as f: f.write(txt.encode()) f.flush() definition = load_definition(f.name) diff --git a/unittests/test_converters.py b/unittests/test_converters.py index 5b3c34cceea4e2be2b24a869cb3fc3de747ad740..e4b442d91060c7ba98cb1a910156b1800f050be3 100644 --- a/unittests/test_converters.py +++ b/unittests/test_converters.py @@ -30,11 +30,11 @@ import json import logging import os import pytest -import sys import yaml from itertools import product from pathlib import Path +from tempfile import NamedTemporaryFile import linkahead as db @@ -43,16 +43,17 @@ from caoscrawler.converters import (Converter, ConverterValidationError, DictIntegerElementConverter, DirectoryConverter, FloatElementConverter, IntegerElementConverter, JSONFileConverter, - ListElementConverter, MarkdownFileConverter, + ListElementConverter, + MarkdownFileConverter, PropertiesFromDictConverter, - YAMLFileConverter, - handle_value, replace_variables) -from caoscrawler.converters.converters import _AbstractScalarValueElementConverter + YAMLFileConverter, handle_value, + replace_variables) +from caoscrawler.converters.converters import \ + _AbstractScalarValueElementConverter from caoscrawler.crawl import Crawler from caoscrawler.scanner import (_load_definition_from_yaml_dict, create_converter_registry, - create_transformer_registry, - load_definition, + create_transformer_registry, load_definition, scan_structure_elements) from caoscrawler.stores import GeneralStore, RecordStore from caoscrawler.structure_elements import (BooleanElement, DictElement, @@ -1022,3 +1023,109 @@ def test_properties_from_dict_nested(converter_registry): # The "old" DictConverter should have added the additional property: assert myrec.get_property("additional_from_other") is not None assert myrec.get_property("additional_from_other").value == "other" + + +def test_dict_match_properties(converter_registry): + + root_dict_element = 
DictElement("RootDict", { + "prop_a": "value", + "prop_b": "25", + "prop_c": 24 + }) + + def_dict = { + "RootElt": { + # Root dictionary + "type": "DictElement", + "match_properties": { + "prop_a": "(?P<a>.*)$", + "prop_[^ac]": "(?P<b>.*)$", + "prop_c": "(?P<c>.*)$", + }, + "records": { + # Define top-level, use below in subtrees + "MyRec": { + "prop_a": "$a", + "prop_b": "$b", + "$a": "$c" + } + }}} + records = scan_structure_elements(root_dict_element, def_dict, converter_registry) + assert len(records) == 1 + record = records[0] + assert record.get_property("prop_a").value == "value" + assert record.get_property("prop_b").value == "25" + assert record.get_property("value").value == "24" # Note the type change here + + root_dict_element = DictElement("RootDict", { + "prop_a": "value", + "prop_b": "25", + # Property missing + }) + + records = scan_structure_elements(root_dict_element, def_dict, converter_registry) + assert len(records) == 0 + + with pytest.raises(RuntimeError, match="Multiple properties match the same match_properties entry."): + root_dict_element = DictElement("RootDict", { + "prop_a": "value", + "prop_b": "25", + "prop_d": 24 # duplicate matches + }) + records = scan_structure_elements(root_dict_element, def_dict, converter_registry) + + +def test_directory_converter_change_date(caplog, converter_registry): + """Test that only directories that were modified after a certain + date are crawled. 
+ + """ + test_dir_element = Directory("test_directories", UNITTESTDIR / "test_directories") + date_of_dir_change = DirectoryConverter._get_most_recent_change_in_dir(test_dir_element) + past_date = date_of_dir_change - datetime.timedelta(days=1) + future_date = date_of_dir_change + datetime.timedelta(days=1) + + tmpfi = NamedTemporaryFile(delete=False) + + # Write down past + with open(tmpfi.name, "w") as fi: + fi.write(f"{past_date.isoformat()}\n") + + converter_def = { + "type": "Directory", + "match": "^test_directories$", + "match_newer_than_file": tmpfi.name + } + dc = DirectoryConverter(name="DC1", definition=converter_def, + converter_registry=converter_registry) + assert dc.match(test_dir_element) is not None + + # Write down future, so nothing should match + with open(tmpfi.name, "w") as fi: + fi.write(f"{future_date.isoformat()}\n") + assert dc.match(test_dir_element) is None + + # Also match in the corner case of equality: + with open(tmpfi.name, "w") as fi: + fi.write(f"{date_of_dir_change.isoformat()}\n") + assert dc.match(test_dir_element) is not None + + # Match but warn + with open(tmpfi.name, "w") as fi: + fi.write(f"This is garbage.\n") + with pytest.raises(ValueError): + dc.match(test_dir_element) + assert len(caplog.record_tuples) == 1 + assert caplog.record_tuples[0][1] == logging.ERROR + assert tmpfi.name in caplog.record_tuples[0][2] + assert "doesn't contain a ISO formatted datetime in its first line" in caplog.record_tuples[0][2] + + # Match anything since file doesn't exist, inform in debug log. + os.remove(tmpfi.name) + # Clear log and enforce debug level. + caplog.clear() + caplog.set_level(logging.DEBUG) + assert dc.match(test_dir_element) is not None + assert len(caplog.record_tuples) == 1 + assert caplog.record_tuples[0][1] == logging.DEBUG + assert "Reference file doesn't exist." 
== caplog.record_tuples[0][2] diff --git a/unittests/test_crawler.py b/unittests/test_crawler.py index aaddec9e8c6b17ad726808bc36b0784adbc3c36d..ad69c6f57cbc8d48d194507d7c1aa79c9da7521b 100644 --- a/unittests/test_crawler.py +++ b/unittests/test_crawler.py @@ -23,7 +23,6 @@ """ test the Crawler class """ -import json import logging import os import warnings @@ -33,12 +32,17 @@ from os.path import basename, dirname, join from pathlib import Path from unittest.mock import MagicMock, Mock, patch -import caoscrawler import linkahead as db import linkahead.common.models as dbmodels import pytest import yaml from caosadvancedtools.models.parser import parse_model_from_string +from linkahead.apiutils import compare_entities +from linkahead.cached import cache_clear +from linkahead.exceptions import EmptyUniqueQueryError +from pytest import raises + +import caoscrawler from caoscrawler.crawl import (Crawler, SecurityMode, _treat_deprecated_prefix, crawler_main, split_restricted_path) from caoscrawler.debug_tree import DebugTree @@ -55,10 +59,6 @@ from caoscrawler.stores import GeneralStore, RecordStore from caoscrawler.structure_elements import (DictElement, DictListElement, DictTextElement, File) from caoscrawler.sync_graph import SyncGraph -from linkahead.apiutils import compare_entities -from linkahead.cached import cache_clear -from linkahead.exceptions import EmptyUniqueQueryError -from pytest import raises UNITTESTDIR = Path(__file__).parent @@ -824,9 +824,9 @@ def test_restricted_path(create_mock): def test_split_restricted_path(): - assert ["el"] == split_restricted_path("/el") - assert ["el"] == split_restricted_path("/el/") - assert ["el", "el"] == split_restricted_path("/el/el") + assert ["el"] == split_restricted_path(os.path.sep + "el") + assert ["el"] == split_restricted_path(os.path.sep + "el" + os.path.sep) + assert ["el", "el"] == split_restricted_path(os.path.sep + "el" + os.path.sep + "el") # Filter the warning because we want to have it here and this 
way it does not hinder running diff --git a/unittests/test_entity_comparison.py b/unittests/test_entity_comparison.py index 0f62475b6c61d82feb3e550cf5ab53e91183f80a..8543732fde4d584e2022dcf6432e9572ae625eb5 100644 --- a/unittests/test_entity_comparison.py +++ b/unittests/test_entity_comparison.py @@ -3,7 +3,6 @@ # A. Schlemmer, 06/2021 import linkahead as db - import pytest from pytest import raises diff --git a/unittests/test_h5_converter.py b/unittests/test_h5_converter.py index 95060451badb0523cf91c70e5be345e35ec3964d..9c1058812c75c6d1e5ee7028c8f6fccd7081a54c 100644 --- a/unittests/test_h5_converter.py +++ b/unittests/test_h5_converter.py @@ -17,22 +17,21 @@ # You should have received a copy of the GNU Affero General Public License # along with this program. If not, see <https://www.gnu.org/licenses/>. # -import numpy as np - from functools import partial from pathlib import Path -from pytest import fixture, importorskip import linkahead as db +import numpy as np +from pytest import fixture, importorskip +from utils import dircheckstr as dircheck_base +from caoscrawler.converters.hdf5_converter import ( + H5DatasetElement, H5GroupElement, H5NdarrayElement, + convert_basic_element_with_nd_array, convert_h5_element) from caoscrawler.debug_tree import DebugTree -from caoscrawler.converters.hdf5_converter import (convert_basic_element_with_nd_array, - convert_h5_element, H5GroupElement, - H5DatasetElement, H5NdarrayElement) from caoscrawler.scanner import scan_directory from caoscrawler.structure_elements import (FloatElement, ListElement, TextElement) -from utils import dircheckstr as dircheck_base # Skip the whole module if h5py hasn't been installed h5py = importorskip("h5py") diff --git a/unittests/test_identifiable.py b/unittests/test_identifiable.py index d94d852583523a3b3f29f002eaacb9ae0b616c4f..44aac6a3edd40e0df8558f68083e22245ff58127 100644 --- a/unittests/test_identifiable.py +++ b/unittests/test_identifiable.py @@ -26,6 +26,7 @@ test identifiable module 
import linkahead as db import pytest + from caoscrawler.identifiable import Identifiable from caoscrawler.sync_node import SyncNode diff --git a/unittests/test_identifiable_adapters.py b/unittests/test_identifiable_adapters.py index d2a5c3bb5d4007348b48d513e664c39592dc4c61..bdc0ab850d1a8253e876e8b1a6bc621327802f79 100644 --- a/unittests/test_identifiable_adapters.py +++ b/unittests/test_identifiable_adapters.py @@ -27,15 +27,14 @@ test identifiable_adapters module """ -import os from datetime import datetime -from unittest.mock import MagicMock, Mock, patch from pathlib import Path +from unittest.mock import MagicMock, Mock, patch import linkahead as db import pytest -from caoscrawler.exceptions import (InvalidIdentifiableYAML, - ) + +from caoscrawler.exceptions import InvalidIdentifiableYAML from caoscrawler.identifiable import Identifiable from caoscrawler.identifiable_adapters import (CaosDBIdentifiableAdapter, IdentifiableAdapter, diff --git a/unittests/test_issues.py b/unittests/test_issues.py index 1678280555e739bae55819fa7fe42a53c938c4e5..a6de65400f42018c3fdcde7b2f29d4fd200bf62b 100644 --- a/unittests/test_issues.py +++ b/unittests/test_issues.py @@ -22,11 +22,12 @@ from pytest import mark -from caoscrawler.converters import replace_variables, CrawlerTemplate +from caoscrawler.converters import CrawlerTemplate, replace_variables from caoscrawler.crawl import Crawler -from caoscrawler.structure_elements import DictElement +from caoscrawler.scanner import (create_converter_registry, + scan_structure_elements) from caoscrawler.stores import GeneralStore -from caoscrawler.scanner import create_converter_registry, scan_structure_elements +from caoscrawler.structure_elements import DictElement def test_issue_10(): diff --git a/unittests/test_json.py b/unittests/test_json.py index be65a26ea01e11e11968bd927c80513708e73850..5d145b38fd36fa2de4e4ab754cbadda0fff6eff7 100644 --- a/unittests/test_json.py +++ b/unittests/test_json.py @@ -26,18 +26,17 @@ """ test the JSON 
converter """ -import json import os - -from pytest import raises +from pathlib import Path import linkahead as db +from pytest import raises from caoscrawler.converters import JSONFileConverter -from pathlib import Path from caoscrawler.crawl import Crawler +from caoscrawler.scanner import (create_converter_registry, load_definition, + scan_structure_elements) from caoscrawler.structure_elements import File, JSONFile -from caoscrawler.scanner import load_definition, create_converter_registry, scan_structure_elements UNITTESTDIR = Path(__file__).parent diff --git a/unittests/test_macros.py b/unittests/test_macros.py index cfa405e5041fb4324b7de98ffcb942cf4b040715..03fe0e665652bb12e204d76857771c1d064ec28a 100644 --- a/unittests/test_macros.py +++ b/unittests/test_macros.py @@ -22,15 +22,15 @@ # ** end header # -from caoscrawler.macros import defmacro_constructor, macro_constructor -from caoscrawler.macros.macro_yaml_object import macro_store -from caoscrawler.crawl import Crawler -from caoscrawler.scanner import load_definition - from tempfile import NamedTemporaryFile -import yaml import pytest +import yaml + +from caoscrawler.crawl import Crawler +from caoscrawler.macros import defmacro_constructor, macro_constructor +from caoscrawler.macros.macro_yaml_object import macro_store +from caoscrawler.scanner import load_definition @pytest.fixture @@ -50,10 +50,10 @@ def _temp_file_load(txt: str): definition using load_definition from Crawler. 
""" definition = None - with NamedTemporaryFile() as f: + with NamedTemporaryFile(delete=False) as f: f.write(txt.encode()) f.flush() - definition = load_definition(f.name) + definition = load_definition(f.name) return definition diff --git a/unittests/test_rocrate_converter.py b/unittests/test_rocrate_converter.py new file mode 100644 index 0000000000000000000000000000000000000000..dc7cef9f6d396c73a2a285d3f60fd587863237ac --- /dev/null +++ b/unittests/test_rocrate_converter.py @@ -0,0 +1,199 @@ +#!/usr/bin/env python3 +# encoding: utf-8 +# +# This file is a part of the LinkAhead Project. +# +# Copyright (C) 2024 Indiscale GmbH <info@indiscale.com> +# Copyright (C) 2024 Alexander Schlemmer <a.schlemmer@indiscale.com> +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <https://www.gnu.org/licenses/>. 
+# + +""" +test the ELN and RO-Crate converters +""" +import importlib +import os +from pathlib import Path + +import linkahead as db +import pytest +import rocrate +import yaml +from caoscrawler import scanner +from caoscrawler.converters import ELNFileConverter, ROCrateEntityConverter +from caoscrawler.stores import GeneralStore +from caoscrawler.structure_elements import (DictElement, File, ROCrateEntity, + TextElement) +from rocrate.model.entity import Entity + +UNITTESTDIR = Path(__file__).parent + + +@pytest.fixture +def converter_registry(): + converter_registry: dict[str, dict[str, str]] = { + "ELNFile": { + "converter": "ELNFileConverter", + "package": "caoscrawler.converters"}, + "ROCrateEntity": { + "converter": "ROCrateEntityConverter", + "package": "caoscrawler.converters", + } + } + + for key, value in converter_registry.items(): + module = importlib.import_module(value["package"]) + value["class"] = getattr(module, value["converter"]) + return converter_registry + + +@pytest.fixture +def basic_eln_converter(converter_registry): + return ELNFileConverter(yaml.safe_load(""" +type: ELNFile +match: .*\\.eln +"""), "TestELNConverter", converter_registry) + + +@pytest.fixture +def eln_entities(basic_eln_converter): + f_k4mat = File("records-example.eln", + os.path.join(UNITTESTDIR, "eln_files", "records-example.eln")) + store = GeneralStore() + entities = basic_eln_converter.create_children(store, f_k4mat) + return entities + + +def test_load_pasta(basic_eln_converter): + """ + Test for loading the .eln example export from PASTA. 
+ """ + f_pasta = File("PASTA.eln", os.path.join(UNITTESTDIR, "eln_files", "PASTA.eln")) + match = basic_eln_converter.match(f_pasta) + assert match is not None + entities = basic_eln_converter.create_children(GeneralStore(), f_pasta) + assert len(entities) == 20 + assert isinstance(entities[0], ROCrateEntity) + assert isinstance(entities[0].folder, str) + assert isinstance(entities[0].entity, Entity) + + +def test_load_kadi4mat(basic_eln_converter): + """ + Test for loading the .eln example export from Kadi4Mat. + """ + f_k4mat = File("records-example.eln", + os.path.join(UNITTESTDIR, "eln_files", "records-example.eln")) + match = basic_eln_converter.match(f_k4mat) + assert match is not None + entities = basic_eln_converter.create_children(GeneralStore(), f_k4mat) + assert len(entities) == 10 + assert isinstance(entities[0], ROCrateEntity) + assert isinstance(entities[0].folder, str) + assert isinstance(entities[0].entity, Entity) + + +def test_match_rocrate_entities(eln_entities, converter_registry): + ds1 = ROCrateEntityConverter(yaml.safe_load(""" +type: ROCrateEntity +match_properties: + "@id": \\./ + datePublished: (?P<datePublished>.*) +"""), "TestELNConverter", converter_registry) + + match = ds1.match(eln_entities[0]) + assert match is not None + + ds2 = ROCrateEntityConverter(yaml.safe_load(""" +type: ROCrateEntity +match_type: CreativeWork +match_properties: + "@id": ro-crate-metadata.json + dateCreated: (?P<dateCreated>.*) +"""), "TestELNConverter", converter_registry) + + match = ds2.match(eln_entities[0]) + assert match is None + match = ds1.match(eln_entities[1]) + assert match is None + + match = ds2.match(eln_entities[1]) + assert match is not None + assert match["dateCreated"] == "2024-08-21T12:07:45.115990+00:00" + + children = ds2.create_children(GeneralStore(), eln_entities[1]) + assert len(children) == 8 + assert isinstance(children[0], TextElement) + assert children[0].name == "@id" + assert children[0].value == "ro-crate-metadata.json" + assert 
isinstance(children[5], DictElement) + assert children[5].value == {'@id': 'https://kadi.iam.kit.edu'} + + +def test_file(eln_entities, converter_registry): + ds_csv = ROCrateEntityConverter(yaml.safe_load(""" +type: ROCrateEntity +match_type: File +match_properties: + "@id": .*\\.csv$ +"""), "TestELNConverter", converter_registry) + + ent_csv = eln_entities[5] + match = ds_csv.match(ent_csv) + assert match is not None + + children = ds_csv.create_children(GeneralStore(), ent_csv) + + # Number of children = number of properties + number of files: + assert len(children) == len(ent_csv.entity.properties()) + 1 + # Get the file: + f_csv = [f for f in children if isinstance(f, File)][0] + with open(f_csv.path) as f: + text = f.read() + assert "Ultrasound Transducer" in text + + +def test_has_part(eln_entities, converter_registry): + ds_parts = ROCrateEntityConverter(yaml.safe_load(""" +type: ROCrateEntity +match_type: Dataset +match_properties: + "@id": records-example/ +"""), "TestELNConverter", converter_registry) + + ent_parts = eln_entities[2] + match = ds_parts.match(ent_parts) + assert match is not None + + children = ds_parts.create_children(GeneralStore(), ent_parts) + + # Number of children = number of properties + number of parts: + assert len(children) == len(ent_parts.entity.properties()) + 4 + entity_children = [f for f in children if isinstance(f, ROCrateEntity)] + assert len(entity_children) == 4 + for f in entity_children: + assert isinstance(f.entity, rocrate.model.file.File) + + +def test_scanner(): + rlist = scanner.scan_directory(os.path.join(UNITTESTDIR, "eln_files/"), + os.path.join(UNITTESTDIR, "eln_cfood.yaml")) + assert len(rlist) == 1 + assert isinstance(rlist[0], db.Record) + assert rlist[0].name == "records-example" + assert rlist[0].description == "This is a sample record." 
+ assert rlist[0].parents[0].name == "Dataset" + assert rlist[0].get_property("keywords").value == "sample" + assert rlist[0].get_property("dateModified").value == "2024-08-21T11:43:17.626965+00:00" diff --git a/unittests/test_scalars_cfood.py b/unittests/test_scalars_cfood.py index 4375ba199d64c3a24d07b3ea1cc4d221d967954b..577fcd5f6c93bee2bc05451983d358aa2e07f798 100644 --- a/unittests/test_scalars_cfood.py +++ b/unittests/test_scalars_cfood.py @@ -2,10 +2,11 @@ # Tests for: # https://gitlab.com/caosdb/caosdb-crawler/-/issues/9 # A. Schlemmer, 06/2021 -import os from pathlib import Path import pytest +from utils import dircheckstr + # The main function that is affected by this issue: from caoscrawler.converters import handle_value from caoscrawler.crawl import Crawler @@ -14,8 +15,6 @@ from caoscrawler.scanner import scan_directory # We need the store for the above function from caoscrawler.stores import GeneralStore -from utils import dircheckstr - UNITTESTDIR = Path(__file__).parent diff --git a/unittests/test_scanner.py b/unittests/test_scanner.py index d2003be1692b30d50849b9efa82a3918a44343dc..c531f66fd38a714ba4f6f538d41c9fbaeb364d44 100644 --- a/unittests/test_scanner.py +++ b/unittests/test_scanner.py @@ -30,20 +30,20 @@ from functools import partial from pathlib import Path from tempfile import NamedTemporaryFile from unittest.mock import MagicMock, Mock, patch - +import os import linkahead as db import pytest import yaml +from pytest import raises +from utils import dircheckstr as dircheck_base + from caoscrawler.crawl import Crawler from caoscrawler.debug_tree import DebugTree -from caoscrawler.scanner import (create_converter_registry, load_definition, - scan_directory, scan_structure_elements, - _load_definition_from_yaml_dict) +from caoscrawler.scanner import (_load_definition_from_yaml_dict, + create_converter_registry, load_definition, + scan_directory, scan_structure_elements) from caoscrawler.structure_elements import (DictElement, DictListElement, 
DictTextElement, File) -from pytest import raises - -from utils import dircheckstr as dircheck_base UNITTESTDIR = Path(__file__).parent @@ -110,7 +110,7 @@ def test_record_structure_generation(): assert len(subc[1]) == 0 # The data analysis node creates one variable for the node itself: - assert subd[0]["DataAnalysis"] == "examples_article/DataAnalysis" + assert subd[0]["DataAnalysis"] == os.path.join("examples_article", "DataAnalysis") assert subc[0]["DataAnalysis"] is False subd = dbt.debug_tree[dircheckstr("DataAnalysis", "2020_climate-model-predict")] @@ -128,9 +128,10 @@ def test_record_structure_generation(): assert subd[0]["identifier"] == "climate-model-predict" assert subd[0]["Project"].__class__ == db.Record - assert subd[0]["DataAnalysis"] == "examples_article/DataAnalysis" + assert subd[0]["DataAnalysis"] == os.path.join("examples_article", "DataAnalysis") assert subc[0]["DataAnalysis"] is True - assert subd[0]["project_dir"] == "examples_article/DataAnalysis/2020_climate-model-predict" + assert subd[0]["project_dir"] == os.path.join( + "examples_article", "DataAnalysis", "2020_climate-model-predict") assert subc[0]["project_dir"] is False # Check the copy flags for the first level in the hierarchy: diff --git a/unittests/test_schema.py b/unittests/test_schema.py index ea8549b0b8dfd1f1af35784082a9e46320cfcff4..96c388ac362583eda13ca368519467c34446868e 100644 --- a/unittests/test_schema.py +++ b/unittests/test_schema.py @@ -2,17 +2,15 @@ # Tests for schema validation # A. 
Schlemmer, 06/2021 -from importlib_resources import files -import linkahead as db - -from os.path import join, dirname -from caoscrawler import Crawler +from os.path import dirname, join +import linkahead as db import pytest -from pytest import raises - +from importlib_resources import files from jsonschema.exceptions import ValidationError +from pytest import raises +from caoscrawler import Crawler from caoscrawler.scanner import load_definition diff --git a/unittests/test_scripts.py b/unittests/test_scripts.py new file mode 100644 index 0000000000000000000000000000000000000000..da03c1f24fbd3d7ca13cfa55d6f69c0cb5a6a6f1 --- /dev/null +++ b/unittests/test_scripts.py @@ -0,0 +1,37 @@ +#!/usr/bin/env python3 + +# This file is a part of the LinkAhead project. +# +# Copyright (C) 2024 IndiScale GmbH <www.indiscale.com> +# Copyright (C) 2024 Daniel Hornung <d.hornung@indiscale.com> +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <https://www.gnu.org/licenses/>. + +"""Test if the scripts work as expected. 
+""" + +from subprocess import run + +SCRIPTS = [ + "linkahead-crawler", + "caosdb-crawler", + "spss_to_datamodel", + "csv_to_datamodel", +] + + +def test_script_loading(): + """Run the scripts with "-h".""" + for script in SCRIPTS: + run([script, "-h"], check=True) diff --git a/unittests/test_spss_converter.py b/unittests/test_spss_converter.py index 7ffc18dba43a6f7cd3c9fbc9273da349b4ec3c6e..59fe723849dadcda21a699416372f08f2756f4e1 100644 --- a/unittests/test_spss_converter.py +++ b/unittests/test_spss_converter.py @@ -20,16 +20,12 @@ import datetime import importlib -import re from pathlib import Path import numpy as np import pytest -from caoscrawler.converters import ( - ConverterValidationError, - SPSSConverter, -) +from caoscrawler.converters import ConverterValidationError, SPSSConverter from caoscrawler.structure_elements import (BooleanElement, DictElement, Directory, File, FloatElement, IntegerElement, ListElement, diff --git a/unittests/test_sync_graph.py b/unittests/test_sync_graph.py index 84451790ddd02f90c2a12a3ce7280b17d8f7c73b..06f0dfb9eb3d3536d26dcfd354ca27f08ef99a02 100644 --- a/unittests/test_sync_graph.py +++ b/unittests/test_sync_graph.py @@ -21,25 +21,21 @@ import logging from functools import partial +from itertools import product from unittest.mock import MagicMock, Mock, patch import linkahead as db import pytest from test_crawler import (basic_retrieve_by_name_mock_up, - mock_cached_only_rt_allow_empty, - mock_get_entity_by, - ) + mock_cached_only_rt_allow_empty, mock_get_entity_by) from caoscrawler.exceptions import (MissingIdentifyingProperty, - MissingRecordType, - ) + MissingRecordType) from caoscrawler.identifiable import Identifiable from caoscrawler.identifiable_adapters import CaosDBIdentifiableAdapter from caoscrawler.sync_graph import SyncGraph, _set_each_scalar_value from caoscrawler.sync_node import SyncNode, parent_in_list, property_in_list -from itertools import product - @pytest.fixture def simple_adapter(): diff --git 
a/unittests/test_sync_node.py b/unittests/test_sync_node.py index bd9e1a6ccbc2ac9ec9ccace96e0ec0422ba1d95b..1f95551d34f9e06ab3e2fc196e1e7809eabfa019 100644 --- a/unittests/test_sync_node.py +++ b/unittests/test_sync_node.py @@ -18,19 +18,18 @@ # You should have received a copy of the GNU Affero General Public License # along with this program. If not, see <https://www.gnu.org/licenses/>. # -from functools import partial from unittest.mock import MagicMock, Mock, patch import linkahead as db import pytest +from test_crawler import basic_retrieve_by_name_mock_up, mock_get_entity_by + from caoscrawler.exceptions import ImpossibleMergeError from caoscrawler.identifiable import Identifiable from caoscrawler.identifiable_adapters import CaosDBIdentifiableAdapter from caoscrawler.sync_graph import SyncGraph from caoscrawler.sync_node import SyncNode, parent_in_list, property_in_list -from test_crawler import basic_retrieve_by_name_mock_up, mock_get_entity_by - def assert_parents_equal(p1, p2): """Special assertion for comparing parents.""" diff --git a/unittests/test_table_converter.py b/unittests/test_table_converter.py index 3b563fd3179968fd90b1c92b9bc5bf0db9ed0858..c606c1d3cdf9a95f00728eaae88153631b08af53 100644 --- a/unittests/test_table_converter.py +++ b/unittests/test_table_converter.py @@ -28,12 +28,13 @@ test the converters module import importlib import math -import os from os.path import basename, dirname, join from pathlib import Path import linkahead as db import pytest +from utils import dircheckstr + from caoscrawler import Crawler from caoscrawler.converters import (Converter, ConverterValidationError, CSVTableConverter, DictConverter, @@ -48,8 +49,6 @@ from caoscrawler.structure_elements import (BooleanElement, DictElement, IntegerElement, ListElement, TextElement) -from utils import dircheckstr - UNITTESTDIR = Path(__file__).parent diff --git a/unittests/test_transformers.py b/unittests/test_transformers.py index 
4ed12751d9052c839aa4db4abd586c419bed1018..a2d227adc5b0c6a8f2f96cb054e1c7670e980e10 100644 --- a/unittests/test_transformers.py +++ b/unittests/test_transformers.py @@ -29,18 +29,16 @@ See: https://gitlab.indiscale.com/caosdb/src/caosdb-crawler/-/issues/107 """ import importlib -from functools import partial from pathlib import Path -from tempfile import NamedTemporaryFile -from unittest.mock import MagicMock, Mock, patch +from unittest.mock import Mock -import linkahead as db import pytest -import yaml from caoscrawler.converters import Converter, ListElementConverter from caoscrawler.scanner import create_transformer_registry, scan_directory from caoscrawler.stores import GeneralStore -from caoscrawler.transformer_functions import replace, split +from caoscrawler.transformer_functions import (cast_to_bool, cast_to_float, + cast_to_int, cast_to_str, + replace, split) from pytest import raises UNITTESTDIR = Path(__file__).parent @@ -164,3 +162,55 @@ def test_empty_functions_list(converter_registry): conv.apply_transformers(values, transformer_functions) assert values['b'] == "16_45" + + +def test_cast_transformer_functions(): + for val in ("True", "true", "False", "false"): + assert type(cast_to_bool(val, {})) == bool + if val[1] == "r": + assert cast_to_bool(val, {}) is True + else: + assert cast_to_bool(val, {}) is False + for val_err in ("jaksdlfj", "0", 1): + with pytest.raises(ValueError): + cast_to_bool(val_err, {}) + assert cast_to_bool(False, {}) is False + assert cast_to_bool(True, {}) is True + + assert cast_to_int("24", {}) == 24 + assert cast_to_int(24.0, {}) == 24 + assert cast_to_int(24, {}) == 24 + assert cast_to_int("-24", {}) == -24 + with pytest.raises(ValueError): + cast_to_int("24dsf", {}) + with pytest.raises(ValueError): + cast_to_int("24.0", {}) == 24 + + assert cast_to_float("24", {}) == 24.0 + assert cast_to_float("24.0", {}) == 24.0 + assert cast_to_float(24.0, {}) == 24.0 + assert cast_to_float(24, {}) == 24.0 + with 
pytest.raises(ValueError): + cast_to_float("24dsf", {}) + + assert cast_to_str(24, {}) == "24" + + +def test_replace_variables(): + vals = GeneralStore() + vals["test"] = "with" + vals["a"] = "str_without_replacement" + conv = Mock() + conv.definition = {} + conv.definition["transform"] = { + "test": { + "in": "$a", + "out": "$a", + "functions": [ + {"replace": { + "remove": "without", + "insert": "$test" + }} + ]}} + Converter.apply_transformers(conv, vals, {"replace": replace}) + assert vals["a"] == "str_with_replacement" diff --git a/unittests/test_utilities.py b/unittests/test_utilities.py index 15e84a609149ac602ee80b7357f7622566563792..a9b052524957b6f8c1e0378e3153fc06f4f36806 100644 --- a/unittests/test_utilities.py +++ b/unittests/test_utilities.py @@ -20,22 +20,23 @@ # import pytest - +from os.path import sep from caoscrawler.crawl import split_restricted_path -from caoscrawler.utils import get_shared_resource_link, MissingImport +from caoscrawler.utils import MissingImport, get_shared_resource_link def test_split_restricted_path(): assert split_restricted_path("") == [] - assert split_restricted_path("/") == [] - assert split_restricted_path("test/") == ["test"] - assert split_restricted_path("/test/") == ["test"] - assert split_restricted_path("test/bla") == ["test", "bla"] - assert split_restricted_path("/test/bla") == ["test", "bla"] - assert split_restricted_path("/test1/test2/bla") == ["test1", "test2", "bla"] - assert split_restricted_path("/test//bla") == ["test", "bla"] - assert split_restricted_path("//test/bla") == ["test", "bla"] - assert split_restricted_path("///test//bla////") == ["test", "bla"] + assert split_restricted_path(f"{sep}") == [] + assert split_restricted_path(f"test{sep}") == ["test"] + assert split_restricted_path(f"{sep}test{sep}") == ["test"] + assert split_restricted_path(f"test{sep}bla") == ["test", "bla"] + assert split_restricted_path(f"{sep}test{sep}bla") == ["test", "bla"] + assert 
split_restricted_path(f"{sep}test1{sep}test2{sep}bla") == ["test1", "test2", "bla"] + assert split_restricted_path(f"{sep}test{sep}{sep}bla") == ["test", "bla"] + assert split_restricted_path(f"{sep}{sep}test{sep}bla") == ["test", "bla"] + assert split_restricted_path( + f"{sep}{sep}{sep}test{sep}{sep}bla{sep}{sep}{sep}{sep}") == ["test", "bla"] def test_dummy_class(): diff --git a/unittests/test_validation.py b/unittests/test_validation.py new file mode 100644 index 0000000000000000000000000000000000000000..a3215963f67b61241b321a0eb7345f9fe6fde1f2 --- /dev/null +++ b/unittests/test_validation.py @@ -0,0 +1,86 @@ +#!/usr/bin/env python3 +# encoding: utf-8 +# +# This file is a part of the LinkAhead Project. +# +# Copyright (C) 2024 Indiscale GmbH <info@indiscale.com> +# Copyright (C) 2024 Alexander Schlemmer <a.schlemmer@indiscale.com> +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <https://www.gnu.org/licenses/>. 
+# + +""" +test validation +""" +from os.path import join +from pathlib import Path + +import jsonschema +import linkahead as db +import pytest +from caoscrawler.validator import (convert_record, + load_json_schema_from_datamodel_yaml, + validate) +from jsonschema import ValidationError + +UNITTESTDIR = Path(__file__).parent + + +def test_create_json_schema(): + json = load_json_schema_from_datamodel_yaml(join(UNITTESTDIR, "datamodels", "datamodel.yaml")) + r = db.Record() + r.add_parent(name="Dataset") + r.add_property(name="keywords", value="jakdlfjakdf") + r.add_property(name="dateModified", value="2024-11-16") + + pobj = convert_record(r) + # print(yaml.dump(pobj)) + # print(yaml.dump(json[0])) + assert "Dataset" in json + jsonschema.validate(pobj, json["Dataset"]) + + # Failing test: + r = db.Record() + r.add_parent(name="Dataset") + r.add_property(name="keywordss", value="jakdlfjakdf") + r.add_property(name="dateModified", value="2024-11-16") + + pobj = convert_record(r) + + with pytest.raises(ValidationError, match=".*'keywords' is a required property.*"): + jsonschema.validate(pobj, json["Dataset"]) + + +def test_validation(): + """ + Test for the main validation API function `validate` + """ + json = load_json_schema_from_datamodel_yaml( + join(UNITTESTDIR, "datamodels", "datamodel.yaml")) + r1 = db.Record() + r1.add_parent(name="Dataset") + r1.add_property(name="keywords", value="jakdlfjakdf") + r1.add_property(name="dateModified", value="2024-11-16") + + r2 = db.Record() + r2.add_parent(name="Dataset") + r2.add_property(name="keywordss", value="jakdlfjakdf") + r2.add_property(name="dateModified", value="2024-11-16") + + valres = validate([r1, r2], json) + assert valres[0][0] is True + assert valres[0][1] is None + assert not valres[1][0] + assert isinstance(valres[1][1], ValidationError) + assert valres[1][1].message == "'keywords' is a required property" diff --git a/unittests/test_variable_substitutions.py b/unittests/test_variable_substitutions.py 
index 90d144b04a4e1271f74b769759e3f201007af705..c75e37956c1ec24e47ff9cbd9b03572ed4a0f80e 100644 --- a/unittests/test_variable_substitutions.py +++ b/unittests/test_variable_substitutions.py @@ -19,7 +19,6 @@ # along with this program. If not, see <https://www.gnu.org/licenses/>. # -from copy import deepcopy from functools import partial from os.path import basename, dirname, join from pathlib import Path @@ -28,6 +27,10 @@ from unittest.mock import MagicMock, Mock import linkahead as db import pytest import yaml +from linkahead.apiutils import compare_entities +from pytest import raises +from utils import dircheckstr as dircheckstr_base + from caoscrawler import Crawler from caoscrawler.debug_tree import DebugTree from caoscrawler.identifiable_adapters import (IdentifiableAdapter, @@ -35,10 +38,6 @@ from caoscrawler.identifiable_adapters import (IdentifiableAdapter, from caoscrawler.scanner import scan_directory from caoscrawler.structure_elements import (DictListElement, DictTextElement, File) -from linkahead.apiutils import compare_entities -from pytest import raises - -from utils import dircheckstr as dircheckstr_base UNITTESTDIR = Path(__file__).parent dircheckstr = partial(dircheckstr_base, UNITTESTDIR / "test_directories" / diff --git a/unittests/test_xml_converter.py b/unittests/test_xml_converter.py index 9fc9749ccececd41d460fe297edfea72cc30a5ef..e8869ef6ffad511159a583a14fd49d2fad48766b 100644 --- a/unittests/test_xml_converter.py +++ b/unittests/test_xml_converter.py @@ -24,22 +24,18 @@ test the XML converters """ import importlib -import json +from pathlib import Path + import pytest -import sys import yaml - from lxml.etree import fromstring -from pathlib import Path -from caoscrawler.converters import (XMLTagConverter, - XMLAttributeNodeConverter, +from caoscrawler.converters import (XMLAttributeNodeConverter, XMLTagConverter, XMLTextNodeConverter) from caoscrawler.scanner import load_definition from caoscrawler.stores import GeneralStore from 
caoscrawler.structure_elements import XMLTagElement - UNITTESTDIR = Path(__file__).parent diff --git a/unittests/test_zipfile_converter.py b/unittests/test_zipfile_converter.py new file mode 100644 index 0000000000000000000000000000000000000000..9bc8b8804e299387157869f0dc8b11a9c2a8c6f8 --- /dev/null +++ b/unittests/test_zipfile_converter.py @@ -0,0 +1,79 @@ +#!/usr/bin/env python3 +# encoding: utf-8 +# +# This file is a part of the LinkAhead Project. +# +# Copyright (C) 2024 Indiscale GmbH <info@indiscale.com> +# Copyright (C) 2024 Alexander Schlemmer <a.schlemmer@indiscale.com> +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <https://www.gnu.org/licenses/>. 
+# + +""" +test the zip-file converter +""" +import importlib +import os +from pathlib import Path + +import pytest +import yaml +from caoscrawler.converters import DirectoryConverter, ZipFileConverter +from caoscrawler.stores import GeneralStore +from caoscrawler.structure_elements import Directory, File + +UNITTESTDIR = Path(__file__).parent + + +@pytest.fixture +def converter_registry(): + converter_registry: dict[str, dict[str, str]] = { + "ZipFile": { + "converter": "ZipFileConverter", + "package": "caoscrawler.converters"}, + } + + for key, value in converter_registry.items(): + module = importlib.import_module(value["package"]) + value["class"] = getattr(module, value["converter"]) + return converter_registry + + +def test_zipfile_converter(converter_registry): + zipfile = File("PASTA.eln", os.path.join(UNITTESTDIR, "eln_files", "PASTA.eln")) + zip_conv = ZipFileConverter(yaml.safe_load(""" +type: ZipFile +match: .*$ +"""), "TestZipFileConverter", converter_registry) + + match = zip_conv.match(zipfile) + assert match is not None + + children = zip_conv.create_children(GeneralStore(), zipfile) + assert len(children) == 1 + assert children[0].name == "PASTA" + + dir_conv = DirectoryConverter(yaml.safe_load(""" +type: Directory +match: ^PASTA$ +"""), "TestDirectory", converter_registry) + match = dir_conv.match(children[0]) + assert match is not None + children = dir_conv.create_children(GeneralStore(), children[0]) + assert len(children) == 5 + print(children) + for i in range(2): + assert isinstance(children[i], Directory) + for i in range(2, 5): + assert isinstance(children[i], File) diff --git a/unittests/utils.py b/unittests/utils.py index a9649dea686c33dc33d0d7636d08aa51beb35412..fee80e44028667b9b3c8c8f8201b1a774c46afdf 100644 --- a/unittests/utils.py +++ b/unittests/utils.py @@ -36,5 +36,5 @@ def dircheckstr(prefix, *pathcomponents): ftype = "Directory" else: ftype = "File" - return (f"caoscrawler.structure_elements.{ftype}: " + os.path.basename( + 
return (f"caoscrawler.structure_elements.structure_elements.{ftype}: " + os.path.basename( os.path.join(*pathcomponents)) + ", " + os.path.join(prefix, *pathcomponents))