diff --git a/.docker/Dockerfile b/.docker/Dockerfile index dd4f3d258443dc1f8b2bacb8d535780e8e37e5e8..1468a17feb16940ae658d3ca6b885af7139ce3d8 100644 --- a/.docker/Dockerfile +++ b/.docker/Dockerfile @@ -1,27 +1,31 @@ -FROM debian:bullseye +FROM debian:bookworm RUN apt-get update && \ apt-get install \ curl \ git \ - openjdk-11-jdk-headless \ + openjdk-17-jdk-headless \ python3-autopep8 \ python3-pip \ python3-pytest \ python3-sphinx \ tox \ -y -RUN pip3 install recommonmark sphinx-rtd-theme +RUN pip3 install --break-system-packages \ + pylint \ + recommonmark \ + sphinx-rtd-theme \ + ; COPY .docker/wait-for-it.sh /wait-for-it.sh ARG PYLIB ADD https://gitlab.indiscale.com/api/v4/projects/97/repository/commits/${PYLIB} \ pylib_version.json RUN git clone https://gitlab.indiscale.com/caosdb/src/caosdb-pylib.git && \ - cd caosdb-pylib && git checkout ${PYLIB} && pip3 install . + cd caosdb-pylib && git checkout ${PYLIB} && pip3 install --break-system-packages . ARG ADVANCED ADD https://gitlab.indiscale.com/api/v4/projects/104/repository/commits/${ADVANCED} \ advanced_version.json RUN git clone https://gitlab.indiscale.com/caosdb/src/caosdb-advanced-user-tools.git && \ - cd caosdb-advanced-user-tools && git checkout ${ADVANCED} && pip3 install .[h5-crawler] + cd caosdb-advanced-user-tools && git checkout ${ADVANCED} && pip3 install --break-system-packages .[h5-crawler] COPY . /git # Delete .git because it is huge. @@ -30,7 +34,7 @@ RUN rm -r /git/.git # Install pycaosdb.ini for the tests RUN mv /git/.docker/tester_pycaosdb.ini /git/integrationtests/pycaosdb.ini -RUN cd /git/ && pip3 install . +RUN cd /git/ && pip3 install --break-system-packages .[h5-crawler,spss,rocrate] WORKDIR /git/integrationtests # wait for server, diff --git a/.gitignore b/.gitignore index 182ed05e1404483ecb553c8a4e469a86a77ba27c..7ad8606171fd21b94ffd8b15b972f956d4dfc1a1 100644 --- a/.gitignore +++ b/.gitignore @@ -18,3 +18,5 @@ start_caosdb_docker.sh src/doc/_apidoc /dist/ *.egg-info +venv/ +.backups \ No newline at end of file diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 8840e613f1e1eb86f30779b8b3535e2ff97ad0cc..e43223568252b2e7a1504610692fe20dc9d78348 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -113,46 +113,47 @@ info: script: - *env -unittest_py3.9: +unittest_py3.11: tags: [cached-dind] stage: test image: $CI_REGISTRY_IMAGE script: - - tox + - python3 -c "import sys; assert sys.version.startswith('3.11')" + - tox -unittest_py3.7: +unittest_py3.9: tags: [cached-dind] stage: test - image: python:3.7 + image: python:3.9 script: &python_test_script # install dependencies - pip install pytest pytest-cov # TODO: Use f-branch logic here - pip install git+https://gitlab.indiscale.com/caosdb/src/caosdb-pylib.git@dev - pip install git+https://gitlab.indiscale.com/caosdb/src/caosdb-advanced-user-tools.git@dev - - pip install . 
+ - pip install .[h5-crawler,spss,rocrate] # actual test - caosdb-crawler --help - pytest --cov=caosdb -vv ./unittests -unittest_py3.8: +unittest_py3.10: tags: [cached-dind] stage: test - image: python:3.8 + image: python:3.10 script: *python_test_script -unittest_py3.10: +unittest_py3.12: tags: [cached-dind] stage: test - image: python:3.10 + image: python:3.12 script: *python_test_script -unittest_py3.11: +unittest_py3.13: tags: [cached-dind] stage: test - image: python:3.11 + image: python:3.13 script: *python_test_script - + inttest: tags: [docker] services: @@ -279,7 +280,7 @@ cert: - cd .docker - CAOSHOSTNAME=caosdb-server ./cert.sh -style: +code-style: tags: [docker] stage: style image: $CI_REGISTRY_IMAGE @@ -287,9 +288,21 @@ style: - job: build-testenv optional: true script: - - autopep8 -r --diff --exit-code . + - autopep8 --version + - autopep8 -r --diff --exit-code . allow_failure: true +pylint: + tags: [docker] + stage: style + image: $CI_REGISTRY_IMAGE + needs: + - job: build-testenv + optional: true + allow_failure: true + script: + - pylint --unsafe-load-any-extension=y -d all -e E,F src/caoscrawler + # Build the sphinx documentation and make it ready for deployment by Gitlab Pages # Special job for serving a static website. See https://docs.gitlab.com/ee/ci/yaml/README.html#pages # Based on: https://gitlab.indiscale.com/caosdb/src/caosdb-pylib/-/ci/editor?branch_name=main diff --git a/CHANGELOG.md b/CHANGELOG.md index 8eeed54fc829649a58a14cf60fbf85a48b0ae48a..e0589ec4e056a494e79762ef048cf2e644f4f40a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,22 +8,241 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] ## ### Added ### -* 'transform' sections can be added to a CFood to apply functions to values stored in variables. + +- Validation module for checking a list of generated records against a list of json schemas + that can be generated from a yaml data model file. +- DictElementConverters can now make use of `match_properties` which + works analogous to `match_properties` in ROCrateEntityConverter and + `match_attrib` in XMLConverter. +- `match_properties` is a method of class Converter and can for + example be used by CustomConverters. +- ZipFileConverter that opens zip files and exposes their contents as + File and Directory structure elements. +- `linkahead-crawler` script as alias for `caosdb-crawler`. +- New transformers of the form `cast_to_*` which allow casting + variables to `int`, `float`, `str` and `bool`. +- Transformer function definition in the cfood support variable + substitutions now. +- `crawler_main` and `scanner.scan_directory` now support list of + directories to be crawled, too. Note that giving a list of + directories is currently incompatible with + `securityMode=SecurityMode.RETRIEVE` or + `securityMode=SecurityMode.INSERT` since the functionality to + authoriye pending inserts or updates doesn't support path lists yet + and will raise a NotImplementedError for now. +- `match_newer_than_file` option for `DirectoryConverter`: A reference + file containing (only) an ISO-formatted datetime string can be + specified here. Directories with this option won't match if all + their contents were last modified before that datetime. + +### Changed ### + +### Deprecated ### + +### Removed ### + +### Fixed ### + +- `spss_to_datamodel` script works again. +- The cfood now supports bi-directional references when defining records on the same level. 
+ (See: https://gitlab.indiscale.com/caosdb/src/caosdb-crawler/-/issues/175) + +### Security ### + +### Documentation ### + +## [0.10.1] - 2024-11-13 ## + +### Fixed ### + +* Removed optional rocrate dependency which prevented package + publication on PyPI for a violation of PEP 440 (see + https://github.com/pypi/warehouse/issues/7136). It will be + re-activated once + https://github.com/ResearchObject/ro-crate-py/issues/203 has been + resolved upstream. For now, if you want to use the ROCrate or ELN + converters, manually install the fix from + https://github.com/salexan2001/ro-crate-py.git@f-automatic-dummy-ids + ```sh + pip install git+https://github.com/salexan2001/ro-crate-py.git@f-automatic-dummy-ids + ``` + +## [0.10.0] - 2024-11-13 ## + +### Added ### + +- XMLTextNodeConverter for converting text nodes created by XMLTagConverter +- XMLAttributeNodeConverter for converting attribute nodes created by XMLTagConverter +- Units for properties. They can be specified by giving the property as a dict in the form + ```yaml + MyRecord: + my_prop: + value: 5 + unit: m + ``` +- Support for Python 3.13 +- ROCrateConverter, ELNFileConverter and ROCrateEntityConverter for crawling ROCrate and .eln files +- `max_log_level` parameter to `logging.configure_server_side_logging` + to control the server-side debuglog's verboosity, and an optional + `sss_max_log_level` parameter to `crawler_main` to control the SSS + loglevel separately from the global `debug` option. + +### Changed ### + +- Property values specified by dicts do not have to contain a + `collection_mode` key anymore. If none is given, the + `collection_mode` is determined from the `value` as it is done for + values specified by strings: + - if `value` starts with '+', collection mode is "list". + - if `value` starts with '*', collection mode is "multiproperty". + - in all other cases, collection mode is "single". +- The default server-side scrippting debug level is now controlled by + the global `debug` option by default and set to log level `INFO` in + case of `debug=False`. The previous behavior can be restored by + calling `crawler_main` with `sss_max_log_level=logging.DEBUG`. + +### Removed ### + +* Support for Python 3.8 (end of life) + +### Fixed ### + +- Added better error message for some cases of broken converter and + record definitions. +- [#108](https://gitlab.com/linkahead/linkahead-crawler/-/issues/108) + Too verbose server-side scripting logs that could lead to high disk + usage. + +### Documentation ### + +- Tutorial on crawling a simple CSV file + +## [0.9.1] - 2024-09-26 ## + +### Fixed ### + +* ImpossibleMergeErrors now correctly include the problematic property + and its values in their string representation. + +## [0.9.0] - 2024-09-05 ## + +### Added ### + +* New converters for XML documents/trees/tags: XMLFile, XMLTag, XMLTextNode + +### Changed ### + +* Moved the optional `hdf5_converter` to the `converters` + submodule. When updating from 0.8 or below, this means that you have + to adapt the converter package path in your cfood definition from + `caoscrawler.hdf5_converter` to + `caoscrawler.converters.hdf5_converter`. + +### Fixed ### + +* Use `urllib.parse.urljoin` to generate link addresses in status + mails, preventing wrong addresses, e.g., due to superfluous `/`. + +## [0.8.0] - 2024-08-23 ## + +### Added ### + +* Support for Python 3.12 and experimental support for 3.13 +* CFood macros now accept complex objects as values, not just strings. 
+* More options for the `CSVTableConverter` +* New converters: + * `DatetimeElementConverter` + * `SPSSConverter` +* New scripts: + * `spss_to_datamodel` + * `csv_to_datamodel` +* New transformer functions: + * `date_parse` + * `datetime_parse` +* New ``PropertiesFromDictConverter`` which allows to automatically + create property values from dictionary keys. + +### Changed ### + +* CFood macros do not render everything into strings now. +* Better internal handling of identifiable/reference resolving and merging of entities. This also + includes more understandable output for users. +* Better handling of missing imports, with nice messages for users. +* No longer use configuration of advancedtools to set to and from email addresses + +### Removed ### + +* Support for Python 3.7 + +### Fixed ### + +* [93](https://gitlab.com/linkahead/linkahead-crawler/-/issues/93) cfood.yaml does not allow umlaut in $expression +* [96](https://gitlab.com/linkahead/linkahead-crawler/-/issues/96) Do not fail silently on transaction errors + +### Security ### + +### Documentation ### + +* General improvement of the documentaion, in many small places. +* The API documentation should now also include documentation of the constructors. + +## [0.7.1] - 2024-03-21 ## + +### Fixed ### + +* `crawler_main` doesn't need the deprecated `debug=True` anymore to put out a + provenance file if the `provenance_file` parameter is provided. +* [indiscale#129](https://gitlab.indiscale.com/caosdb/src/caosdb-crawler/-/issues/129) + missing packaging dependency. + +## [0.7.0] - 2024-03-04 ## + +### Added ### + +* `transform` sections can be added to a CFood to apply functions to values stored in variables. * default transform functions: submatch, split and replace. +* `*` can now be used as a wildcard in the identifiables parameter file to denote + that any Record may reference the identified one. +* `crawl.TreatedRecordLookUp` class replacing the old (and slow) + `identified_cache` module. The new class now handles all records identified by + id, path, or identifiable simultaneously. See API docs for more info on how to + add to and get from the new lookup class. +* `identifiable_adapters.IdentifiableAdapter.get_identifying_referencing_entities` + and + `identifiable_adapters.IdentifiableAdapter.get_identifying_referenced_entities` + static methods to return the referencing or referenced entities belonging to a + registered identifiable, respectively. +* [#70](https://gitlab.com/linkahead/linkahead-crawler/-/issues/70): Optional + converters for HDF5 files. They require this package to be installed with its + ``h5-crawler`` dependency. ### Changed ### -- If the `parents` key is used in a cfood at a lower level for a Record that + +* If the `parents` key is used in a cfood at a lower level for a Record that already has a Parent (because it was explicitly given or the default Parent), the old Parent(s) are now overwritten with the value belonging to the `parents` key. -- If a registered identifiable states, that a reference by a Record with parent +* If a registered identifiable states, that a reference by a Record with parent RT1 is needed, then now also references from Records that have a child of RT1 as parent are accepted. +* More aggressive caching. +* The `identifiable_adapters.IdentifiableAdapter` now creates (possibly empty) + reference lists for all records in `create_reference_mapping`. 
This allows + functions like `get_identifiable` to be called only with the subset of the + referenceing entities belonging to a specific Record. +* The `identifiable_adapters.IdentifiableAdapter` uses entity ids (negative for + entities that don't exist remotely) instead of entity objects for keeping + track of references. +* Log output is either written to $SHARED_DIR/ (when this variable is set) or just to the terminal. ### Deprecated ### +* `IdentifiableAdapter.get_file` + ### Removed ### +* `identified_cache` module which was replaced by the `crawl.TreatedRecordLookUp` class. + ### Fixed ### * Empty Records can now be created (https://gitlab.com/caosdb/caosdb-crawler/-/issues/27) @@ -36,10 +255,12 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 handles cases correctly in which entities retrieved from the server have to be merged with local entities that both reference another, already existing entity - -### Security ### - -### Documentation ### +* A corner case in `split_into_inserts_and_updates` whereby two records created + in different places in the cfood definition would not be merged if both were + identified by the same LinkAhead id +* [#87](https://gitlab.com/linkahead/linkahead-crawler/-/issues/87) Handle long strings more gracefully. The crawler sometimes runs into + [linkahead-server#101](https://gitlab.com/linkahead/linkahead-server/-/issues/101), this is now mitigated. +* [indiscale#128](https://gitlab.indiscale.com/caosdb/src/caosdb-crawler/-/issues/128) Yet another corner case of referencing resolution resolved. ## [0.6.0] - 2023-06-23 ## (Florian Spreckelsen) @@ -106,6 +327,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - ``add_prefix`` and ``remove_prefix`` arguments for the command line interface and the ``crawler_main`` function for the adding/removal of path prefixes when creating file entities. +- More strict checking of `identifiables.yaml`. +- Better error messages when server does not conform to expected data model. ### Changed ### @@ -154,7 +377,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Some StructureElements changed (see "How to upgrade" in the docs): - Dict, DictElement and DictDictElement were merged into DictElement. - DictTextElement and TextElement were merged into TextElement. The "match" - keyword is now invalid for TextElements. + keyword is now invalid for TextElements. - JSONFileConverter creates another level of StructureElements (see "How to upgrade" in the docs) - create_flat_list function now collects entities in a set and also adds the entities contained in the given list directly diff --git a/CITATION.cff b/CITATION.cff index c6d41cc49d74b72056d825ca731fa79897fc537b..ed859432b26cde913f7283fb8e969a97b7b74f41 100644 --- a/CITATION.cff +++ b/CITATION.cff @@ -1,25 +1,22 @@ cff-version: 1.2.0 message: "If you use this software, please cite it as below." 
authors: - - family-names: Fitschen - given-names: Timm - orcid: https://orcid.org/0000-0002-4022-432X - - family-names: Schlemmer - given-names: Alexander - orcid: https://orcid.org/0000-0003-4124-9649 - - family-names: Hornung - given-names: Daniel - orcid: https://orcid.org/0000-0002-7846-6375 - family-names: tom Wörden given-names: Henrik orcid: https://orcid.org/0000-0002-5549-578X - - family-names: Parlitz - given-names: Ulrich - orcid: https://orcid.org/0000-0003-3058-1435 + - family-names: Spreckelsen + given-names: Florian + orcid: https://orcid.org/0000-0002-6856-2910 - family-names: Luther given-names: Stefan orcid: https://orcid.org/0000-0001-7214-8125 + - family-names: Parlitz + given-names: Ulrich + orcid: https://orcid.org/0000-0003-3058-1435 +- family-names: Schlemmer + given-names: Alexander + orcid: https://orcid.org/0000-0003-4124-9649 title: CaosDB - Crawler -version: 0.6.0 -doi: 10.3390/data4020083 -date-released: 2023-06-23 \ No newline at end of file +version: 0.10.1 +doi: 10.3390/data9020024 +date-released: 2024-11-13 \ No newline at end of file diff --git a/README_SETUP.md b/README_SETUP.md deleted file mode 120000 index d478016ecde09dab8820d398b15df325f4159380..0000000000000000000000000000000000000000 --- a/README_SETUP.md +++ /dev/null @@ -1 +0,0 @@ -src/doc/README_SETUP.md \ No newline at end of file diff --git a/README_SETUP.md b/README_SETUP.md new file mode 100644 index 0000000000000000000000000000000000000000..32f0bb89a6051bc2ec4be0bae6cf06cd1a540f8b --- /dev/null +++ b/README_SETUP.md @@ -0,0 +1,34 @@ +# Getting started with the CaosDB Crawler # + +## Installation +see INSTALL.md + +## Run Unit Tests + +1. Install additional dependencies: + - h5py +2. Run `pytest unittests`. + +## Documentation ## +We use sphinx to create the documentation. Docstrings in the code should comply +with the Googly style (see link below). + +Build documentation in `src/doc` with `make doc`. Note that for the +automatic generation of the complete API documentation, it is +necessary to first install this library with all its optional +dependencies, i.e., `pip install .[h5-crawler,spss]`. 
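For contributors, here is a minimal illustration of the Google docstring style referred to above. The function is purely hypothetical and not part of the package; it only shows the expected docstring layout.

```python
def count_lines(path: str, encoding: str = "utf-8") -> int:
    """Count the lines of a text file.

    Args:
        path: Path of the file to read.
        encoding: Text encoding used to open the file.

    Returns:
        The number of lines in the file.
    """
    with open(path, encoding=encoding) as fileobject:
        return sum(1 for _ in fileobject)
```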
+ +### Requirements ### + +- `sphinx` +- `sphinx-autoapi` +- `recommonmark` +- `sphinx-rtd-theme` + +### How to contribute ### + +- [Google Style Python Docstrings](https://sphinxcontrib-napoleon.readthedocs.io/en/latest/example_google.html) +- [Google Style Python Docstrings 2nd reference](https://github.com/google/styleguide/blob/gh-pages/pyguide.md#38-comments-and-docstrings) +- [References to other documentation](https://www.sphinx-doc.org/en/master/usage/extensions/intersphinx.html#role-external) + + diff --git a/integrationtests/basic_example/test_basic.py b/integrationtests/basic_example/test_basic.py index d3482a19f0c95912002b2ff68101623476d452ea..6fd322e5f6425e9bce25b970d6de7d99892762a5 100755 --- a/integrationtests/basic_example/test_basic.py +++ b/integrationtests/basic_example/test_basic.py @@ -32,7 +32,7 @@ import sys from argparse import RawTextHelpFormatter from pathlib import Path -import caosdb as db +import linkahead as db import pytest import yaml from caosadvancedtools.crawler import Crawler as OldCrawler @@ -42,8 +42,8 @@ from caoscrawler.debug_tree import DebugTree from caoscrawler.identifiable import Identifiable from caoscrawler.identifiable_adapters import CaosDBIdentifiableAdapter from caoscrawler.scanner import scan_directory -from caosdb import EmptyUniqueQueryError -from caosdb.utils.register_tests import clear_database, set_test_key +from linkahead import EmptyUniqueQueryError +from linkahead.utils.register_tests import clear_database, set_test_key set_test_key("10b128cf8a1372f30aa3697466bb55e76974e0c16a599bb44ace88f19c8f61e2") @@ -112,11 +112,11 @@ def crawler_extended(ident): return cr, crawled_data, debug_tree -def test_ambigious_lookup(clear_database, usemodel, crawler, ident): +def test_ambiguous_lookup(clear_database, usemodel, crawler, ident): ins, ups = crawler[0].synchronize(crawled_data=crawler[1]) proj = db.execute_query("FIND Project WITH identifier='SpeedOfLight'", unique=True) - with pytest.raises(RuntimeError, match=".*unambigiously.*"): + with pytest.raises(RuntimeError, match=".*unambiguously.*"): print(crawler[0].identifiableAdapter.retrieve_identified_record_for_identifiable( Identifiable(properties={'project': proj.id}))) diff --git a/integrationtests/test_crawler_main.py b/integrationtests/test_crawler_main.py new file mode 100644 index 0000000000000000000000000000000000000000..a2eebf4f04e195754eaf71dc5e829b6a77a4cc4b --- /dev/null +++ b/integrationtests/test_crawler_main.py @@ -0,0 +1,95 @@ +# This file is a part of the LinkAhead Project. +# +# Copyright (C) 2024 Indiscale GmbH <info@indiscale.com> +# 2024 Florian Spreckelsen <f.spreckelsen@indiscale.com> +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <https://www.gnu.org/licenses/>. 
+# + +import logging +import tempfile + +from pathlib import Path + +import linkahead as db + +from caoscrawler import crawl +from caoscrawler.crawl import (crawler_main, SecurityMode) +from linkahead.utils.register_tests import clear_database, set_test_key + +set_test_key("10b128cf8a1372f30aa3697466bb55e76974e0c16a599bb44ace88f19c8f61e2") + +INTTESTDIR = Path(__file__).parent + + +def test_list_of_paths(clear_database, monkeypatch): + + # Mock the status record + dummy_status = { + "n_calls": 0 + } + + def _mock_update_status_record(run_id, n_inserts, n_updates, status): + print("Update mocked status") + dummy_status["run_id"] = run_id + dummy_status["n_inserts"] = n_inserts + dummy_status["n_updates"] = n_updates + dummy_status["status"] = status + dummy_status["n_calls"] += 1 + monkeypatch.setattr(crawl, "_update_status_record", _mock_update_status_record) + + # mock SSS environment + monkeypatch.setenv("SHARED_DIR", tempfile.gettempdir()) + + # We need only one dummy RT + rt = db.RecordType(name="TestType").insert() + basepath = INTTESTDIR / "test_data" / "crawler_main_with_list_of_dirs" + dirlist = [basepath / "dir1", basepath / "dir2"] + crawler_main( + dirlist, + cfood_file_name=basepath / "cfood.yml", + identifiables_definition_file=basepath / "identifiable.yml" + ) + recs = db.execute_query("FIND TestType") + assert len(recs) == 2 + assert "Test1" in [r.name for r in recs] + assert "Test2" in [r.name for r in recs] + + assert dummy_status["n_inserts"] == 2 + assert dummy_status["n_updates"] == 0 + assert dummy_status["status"] == "OK" + assert dummy_status["n_calls"] == 1 + + +def test_not_implemented_list_with_authorization(caplog, clear_database): + + rt = db.RecordType(name="TestType").insert() + basepath = INTTESTDIR / "test_data" / "crawler_main_with_list_of_dirs" + dirlist = [basepath / "dir1", basepath / "dir2"] + + # This is not implemented yet, so check log for correct error. 
+ ret = crawler_main( + dirlist, + cfood_file_name=basepath / "cfood.yml", + identifiables_definition_file=basepath / "identifiable.yml", + securityMode=SecurityMode.RETRIEVE + ) + # crawler_main hides the error, but has a non-zero return code and + # errors in the log: + assert ret != 0 + err_tuples = [t for t in caplog.record_tuples if t[1] == logging.ERROR] + assert len(err_tuples) == 1 + assert "currently implemented only for single paths, not for lists of paths" in err_tuples[0][2] + # No inserts after the errors + assert len(db.execute_query("FIND TestType")) == 0 diff --git a/integrationtests/test_data/crawler_main_with_list_of_dirs/cfood.yml b/integrationtests/test_data/crawler_main_with_list_of_dirs/cfood.yml new file mode 100644 index 0000000000000000000000000000000000000000..c7f22ce07e9b401915aefde3bf7e3a78d92e2bd6 --- /dev/null +++ b/integrationtests/test_data/crawler_main_with_list_of_dirs/cfood.yml @@ -0,0 +1,10 @@ +--- +metadata: + crawler-version: 0.10.2 +--- +BaseDirElement: + type: Directory + match: ^dir(?P<dir_number>[0-9]+)$$ + records: + TestType: + name: Test$dir_number diff --git a/integrationtests/test_data/crawler_main_with_list_of_dirs/dir1/.gitkeep b/integrationtests/test_data/crawler_main_with_list_of_dirs/dir1/.gitkeep new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/integrationtests/test_data/crawler_main_with_list_of_dirs/dir2/.gitkeep b/integrationtests/test_data/crawler_main_with_list_of_dirs/dir2/.gitkeep new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/integrationtests/test_data/crawler_main_with_list_of_dirs/identifiable.yml b/integrationtests/test_data/crawler_main_with_list_of_dirs/identifiable.yml new file mode 100644 index 0000000000000000000000000000000000000000..6d608cece0ae7c2aa6461fb56025a8ac8e4faf6f --- /dev/null +++ b/integrationtests/test_data/crawler_main_with_list_of_dirs/identifiable.yml @@ -0,0 +1,2 @@ +TestType: + - name diff --git a/integrationtests/test_issues.py b/integrationtests/test_issues.py index 441edac5481585e483c94d61d864a1baaa139aa2..0506fa4db03e9b3638051e6ec4fa132bd348a988 100644 --- a/integrationtests/test_issues.py +++ b/integrationtests/test_issues.py @@ -1,4 +1,4 @@ -# This file is a part of the CaosDB Project. +# This file is a part of the LinkAhead Project. # # Copyright (C) 2022 Indiscale GmbH <info@indiscale.com> # 2022 Florian Spreckelsen <f.spreckelsen@indiscale.com> @@ -16,24 +16,28 @@ # You should have received a copy of the GNU Affero General Public License # along with this program. If not, see <https://www.gnu.org/licenses/>. 
# -from pytest import fixture, mark +import tempfile -import caosdb as db -from caosdb.cached import cache_clear +import linkahead as db +import yaml from caosadvancedtools.models.parser import parse_model_from_string - from caoscrawler.crawl import Crawler +from caoscrawler.identifiable import Identifiable from caoscrawler.identifiable_adapters import CaosDBIdentifiableAdapter +from caoscrawler.scanner import (_load_definition_from_yaml_dict, + create_converter_registry, + scan_structure_elements) from caoscrawler.structure_elements import DictElement +from linkahead.cached import cache_clear +from linkahead.utils.register_tests import clear_database, set_test_key +from pytest import fixture, mark, raises -from caoscrawler.scanner import create_converter_registry, scan_structure_elements - -from caosdb.utils.register_tests import clear_database, set_test_key set_test_key("10b128cf8a1372f30aa3697466bb55e76974e0c16a599bb44ace88f19c8f61e2") @fixture(autouse=True) def clear_cache(): + """Clear the LinkAhead cache.""" cache_clear() @@ -169,8 +173,9 @@ def test_issue_83(clear_database): name=referencing_type.name).add_property(name=referenced_type.name, value=[ref_target1]) referencing2 = db.Record(name="Referencing2").add_parent( name=referencing_type.name).add_property(name=referenced_type.name, value=[ref_target2]) - referencing3 = db.Record(name="Referencing3").add_parent(name=referencing_type.name).add_property( - name=referenced_type.name, value=[ref_target1, ref_target2]) + referencing3 = db.Record(name="Referencing3").add_parent( + name=referencing_type.name).add_property(name=referenced_type.name, value=[ref_target1, + ref_target2]) records = db.Container().extend( [ref_target1, ref_target2, referencing1, referencing2, referencing3]) @@ -266,3 +271,165 @@ Campaign: # Nothing to do for the existing ents assert len(ups) == 0 assert ins[0].name == event.name + + +def test_indiscale_87(clear_database): + """Handle long string queries gracefully. 
+ + https://gitlab.com/linkahead/linkahead-crawler/-/issues/87 + """ + + prop = db.Property(name="str", datatype=db.TEXT).insert() + rt = db.RecordType(name="RT1").add_property(prop).insert() + strings = [ + "X123456789" * 26, + "X" * 260, + "X123456789" * 25 + "9876543210", + ] + recs = [ + db.Record().add_parent(rt).add_property(name="str", value=string).insert() + for string in strings + ] + idents = [ + Identifiable(record_type="RT1", properties={"str": string}) + for string in strings + ] + adapter = CaosDBIdentifiableAdapter() + for rec, ident in zip(recs, idents): + print(f"Testing: ...{rec.get_property('str').value[-10:]}") + retrieved = adapter.retrieve_identified_record_for_identifiable(ident) + # print(rec) + # print(retrieved) + print(db.apiutils.compare_entities(rec, retrieved)) + assert db.apiutils.empty_diff(rec, retrieved) + print("---") + + # add another, harmless, property + prop2 = db.Property(name="someint", datatype=db.INTEGER).insert() + rt.add_property(prop2).update() + string = "Y123456789" * 26 + numbers = [23, 42] + recs = [ + db.Record().add_parent(rt).add_property(name="str", value=string).add_property( + name="someint", value=number).insert() + for number in numbers + ] + idents = [Identifiable(record_type="RT1", properties={"str": string})] + # Ambiguous result + with raises(RuntimeError, match=".*unambiguously.*"): + retrieved = adapter.retrieve_identified_record_for_identifiable(idents[0]) + + # Upgrade new property to be identifying + idents = [ + Identifiable(record_type="RT1", properties={"str": string, "someint": number}) + for number in numbers + ] + for rec, ident in zip(recs, idents): + print(f"Testing: someint={rec.get_property('someint').value}") + retrieved = adapter.retrieve_identified_record_for_identifiable(ident) + # print(rec) + # print(retrieved) + print(db.apiutils.compare_entities(rec, retrieved)) + assert db.apiutils.empty_diff(rec, retrieved) + print("---") + + +def test_issue_16(clear_database): + """ + This is another a test for: + https://gitlab.indiscale.com/caosdb/src/caosdb-crawler/-/issues/16 + + In addition to the two unit tests for recursive definition in `test_scanner.py` this system test + tests whether recursively defined records can be synchronized correctly using the crawler. 
+ """ + recursive_yaml = """ +FirstConverter: + type: DictElement + records: + Experiment: + subtree: + Converter: + type: DictElement + records: + Block: + name: block 1 + Experiment: $Experiment + Experiment: + name: experiment 1 + Block: $Block + """ + + crawler_definition = _load_definition_from_yaml_dict( + [yaml.load(recursive_yaml, Loader=yaml.SafeLoader)]) + converter_registry = create_converter_registry(crawler_definition) + + # Nested DictElements that match the yaml structure in recursive_yaml: + data = {"data": { + }} + records = scan_structure_elements(DictElement(name="", value=data), crawler_definition, + converter_registry) + + rt_exp = db.RecordType(name="Experiment").insert() + rt_block = db.RecordType(name="Block").insert() + + ident = CaosDBIdentifiableAdapter() + ident.load_from_yaml_object(yaml.safe_load(""" +Experiment: +- name +Block: +- name +""")) + + crawler = Crawler(identifiableAdapter=ident) + crawler.synchronize(crawled_data=records) + + exp_res = db.execute_query("FIND Experiment") + assert len(exp_res) == 1 + exp_block = db.execute_query("FIND Block") + assert len(exp_block) == 1 + + assert exp_res[0].get_property("Block").value == exp_block[0].id + assert exp_block[0].get_property("Experiment").value == exp_res[0].id + + +def test_issue_14(clear_database): + """ + Issue title: Some parent updates are required before inserts + + https://gitlab.indiscale.com/caosdb/src/caosdb-crawler/-/issues/14 + """ + + rt1 = db.RecordType(name="RT1") + rt2 = db.RecordType(name="RT2").insert() + rt1.add_property(rt2, importance=db.OBLIGATORY) + rt1.insert() + + r = db.Record() + r.add_parent(rt1) + with tempfile.NamedTemporaryFile() as tmpf: + f = db.File(name="test_parent", path="parent_test/file.txt", file=tmpf.name) + f.insert() + + # We create a clean new file object here: + f2 = db.File(name="test_parent", path="parent_test/file.txt", file=tmpf.name) + + f2.add_parent(rt2) + r.add_property(name="RT2", value=f2) + + # Current state in the database: File without parents + f_test_base = db.File(name="test_parent").retrieve() + assert len(f_test_base.parents) == 0 + assert len(db.execute_query("FIND Record")) == 0 + + ident = CaosDBIdentifiableAdapter() + ident.register_identifiable("RT1", db.RecordType().add_parent( + name="RT1").add_property(name="RT2")) + crawler = Crawler(identifiableAdapter=ident) + crawler.synchronize(crawled_data=[f2, r]) + + f_test = db.File(name="test_parent").retrieve() + assert len(f_test.parents) == 1 + assert f_test.parents[0].name == "RT2" + records = db.execute_query("FIND Record") + assert len(records) == 1 + assert records[0].get_property("RT2").value == f_test.id diff --git a/integrationtests/test_realworld_example.py b/integrationtests/test_realworld_example.py index 82644947a3cdc85a38be3403615b51fe1f4ded50..fbbf25643e1c1cf928aa9599c92d3d6e94a88974 100644 --- a/integrationtests/test_realworld_example.py +++ b/integrationtests/test_realworld_example.py @@ -24,24 +24,22 @@ """ an integration test module that runs a test against a (close to) real world example """ -from caosdb.utils.register_tests import clear_database, set_test_key -import logging import json +import logging import os +import pytest +import sys -import caosdb as db -from caosdb.cached import cache_clear +import linkahead as db +from linkahead.cached import cache_clear +from linkahead.utils.register_tests import clear_database, set_test_key +from caosadvancedtools.loadFiles import loadpath +from caosadvancedtools.models.parser import parse_model_from_json_schema, 
parse_model_from_yaml from caoscrawler.crawl import Crawler, crawler_main from caoscrawler.identifiable_adapters import CaosDBIdentifiableAdapter -from caoscrawler.structure_elements import Directory -import pytest -from caosadvancedtools.models.parser import parse_model_from_json_schema, parse_model_from_yaml -from caosadvancedtools.loadFiles import loadpath - from caoscrawler.scanner import load_definition, scan_structure_elements, create_converter_registry - -import sys +from caoscrawler.structure_elements import Directory set_test_key("10b128cf8a1372f30aa3697466bb55e76974e0c16a599bb44ace88f19c8f61e2") @@ -91,15 +89,6 @@ def usemodel(): dataset_inherits.sync_data_model(noquestion=True) -@pytest.fixture -def clear_database(): - # TODO(fspreck): Remove once the corresponding advancedtools function can - # be used. - ents = db.execute_query("FIND ENTITY WITH ID>99") - if ents: - ents.delete() - - def create_identifiable_adapter(): ident = CaosDBIdentifiableAdapter() ident.load_from_yaml_definition(os.path.join(DATADIR, "identifiables.yml")) diff --git a/integrationtests/test_use_case_simple_presentation.py b/integrationtests/test_use_case_simple_presentation.py index cf38e951b78534806c0ea76ef58051436aa22704..05b0a543deb03eb524d40d6a386876812e6b54e2 100644 --- a/integrationtests/test_use_case_simple_presentation.py +++ b/integrationtests/test_use_case_simple_presentation.py @@ -27,12 +27,12 @@ import os import pytest from subprocess import run -import caosdb as db +import linkahead as db from caosadvancedtools.loadFiles import loadpath -from caosdb.cached import cache_clear +from linkahead.cached import cache_clear from caosadvancedtools.models import parser as parser from caoscrawler.crawl import crawler_main -from caosdb.utils.register_tests import clear_database, set_test_key +from linkahead.utils.register_tests import clear_database, set_test_key set_test_key("10b128cf8a1372f30aa3697466bb55e76974e0c16a599bb44ace88f19c8f61e2") diff --git a/setup.cfg b/setup.cfg index ffb00e54eb7bf8753a335bb96d9fdf23700aaadd..9ca07edf0329d83bbe4c28ffb3203e7c342ad612 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,9 +1,9 @@ [metadata] name = caoscrawler -version = 0.6.1 +version = 0.10.2 author = Alexander Schlemmer author_email = alexander.schlemmer@ds.mpg.de -description = A new crawler for caosdb +description = A new crawler for LinkAhead long_description = file: README.md long_description_content_type = text/markdown # url @@ -17,17 +17,19 @@ classifiers = package_dir = = src packages = find: -python_requires = >=3.7 +python_requires = >=3.9 install_requires = importlib-resources caosadvancedtools >= 0.7.0 - linkahead > 0.13.2 + linkahead > 0.16.0 yaml-header-tools >= 0.2.1 - pyyaml odfpy #make optional jinja2 #make optional? + packaging pandas - importlib_metadata;python_version<'3.8' + pyarrow # Will be required by Pandas >= 3.0. 
+ pyyaml + yaml-header-tools >= 0.2.1 [options.packages.find] where = src @@ -39,4 +41,16 @@ per-file-ignores = __init__.py:F401 [options.entry_points] console_scripts = + linkahead-crawler = caoscrawler.crawl:main caosdb-crawler = caoscrawler.crawl:main + spss_to_datamodel = caoscrawler.converters.spss:spss_to_datamodel_main + csv_to_datamodel = caoscrawler.scripts.generators:csv_to_datamodel_main + +[options.extras_require] +h5-crawler = + h5py >= 3.8 + numpy +spss = + pandas[spss] +rocrate = + rocrate @ git+https://github.com/salexan2001/ro-crate-py.git@f-automatic-dummy-ids diff --git a/src/caoscrawler/__init__.py b/src/caoscrawler/__init__.py index 05bad0b54d9098c0b7f165d8295a0faa2966fa32..ba4844e15387cd13aa15db88521b2022fa52bfd6 100644 --- a/src/caoscrawler/__init__.py +++ b/src/caoscrawler/__init__.py @@ -1,3 +1,4 @@ +from . import converters, utils from .crawl import Crawler, SecurityMode from .version import CfoodRequiredVersionError, get_caoscrawler_version diff --git a/src/caoscrawler/authorize.py b/src/caoscrawler/authorize.py index 6f1011b227881d4b73186996076abe20d94d52e5..f3deed4f8c78afa85fdd4471fe9383760b8c8b12 100644 --- a/src/caoscrawler/authorize.py +++ b/src/caoscrawler/authorize.py @@ -19,10 +19,10 @@ # along with this program. If not, see <https://www.gnu.org/licenses/>. # -from caosadvancedtools.crawler import Crawler as OldCrawler - import argparse +from caosadvancedtools.crawler import Crawler as OldCrawler + def parse_args(): parser = argparse.ArgumentParser() diff --git a/src/caoscrawler/cfood-schema.yml b/src/caoscrawler/cfood-schema.yml index b0d77bbf5d7ba09df3c0c47d656fa3d22d07b6d2..d2e4cea24f0f2803499116420091b36e95b2c781 100644 --- a/src/caoscrawler/cfood-schema.yml +++ b/src/caoscrawler/cfood-schema.yml @@ -1,9 +1,44 @@ cfood: type: object + properties: + Converters: + description: Defintiion of custom converters + type: object + additionalProperties: + type: object + properties: + converter: + type: string + package: + type: string + required: + - converter + - package + macros: + description: Macro definitions + type: array + Transformers: + description: Variable transformer definition + type: object + additionalProperties: + type: object + properties: + function: + type: string + package: + type: string + required: + - package + - function additionalProperties: $ref: "#/$defs/converter" $defs: + parents: + description: Parents for this record are given here as a list of names. + type: array + items: + type: string converter: properties: type: @@ -28,9 +63,21 @@ cfood: - Definitions - Dict - Date + - Datetime - JSONFile + - YAMLFile - CSVTableConverter - XLSXTableConverter + - SPSSFile + - H5File + - H5Dataset + - H5Group + - H5Ndarray + - XMLFile + - XMLTag + - XMLTextNode + - XMLAttributeNode + - PropertiesFromDictElement description: Type of this converter node. match: description: typically a regexp which is matched to a structure element name @@ -41,15 +88,52 @@ cfood: match_value: description: a regexp that is matched to the value of a key-value pair type: string - records: - description: This field is used to define new records or to modify records which have been defined on a higher level. + match_newer_than_file: + description: | + Only relevant for Directory. A path to a file containing + an ISO-formatted datetime. Only match if the contents of the + Directory have been modified after that datetime. + type: string + record_from_dict: + description: Only relevant for PropertiesFromDictElement. 
Specify the root record which is generated from the contained dictionary. type: object + required: + - variable_name properties: - parents: - description: Parents for this record are given here as a list of names. + variable_name: + description: | + Name of the record by which it can be accessed in the + cfood definiton. Can also be the name of an existing + record in which case that record will be updated by + the PropertiesFromDictConverter. + type: string + properties_blacklist: + description: List of keys to be ignored in the automatic treatment. They will be ignored on all levels of the dictionary. type: array items: type: string + references: + description: List of keys that will be transformed into named reference properties. + type: object + additionalProperties: + type: object + properties: + parents: + $ref: + "#/$defs/parents" + name: + description: Name of this record. If none is given, variable_name is used. + type: string + parents: + $ref: + "#/$defs/parents" + records: + description: This field is used to define new records or to modify records which have been defined on a higher level. + type: object + properties: + parents: + $ref: + "#/$defs/parents" additionalProperties: oneOf: - type: object @@ -57,6 +141,9 @@ cfood: value: description: Dictionary notation for variable values. Values can be given by a variable which is indicated by an initial "$". Use "$$" for setting values actually starting with a dollar sign. type: string + unit: + description: The unit of this property. Units can be given by a variable which is indicated by an initial "$". Use "$$" for setting values actually starting with a dollar sign. + type: string collection_mode: description: The collection mode defines whether the resulting property will be a single property or whether the values of multiple structure elements will be collected either into a list or a multiproperty. enum: @@ -71,3 +158,15 @@ cfood: additionalProperties: $ref: "#/$defs/converter" + if: + properties: + type: + const: + "PropertiesFromDictElement" + then: + required: + - type + - record_from_dict + else: + required: + - type diff --git a/src/caoscrawler/converters/__init__.py b/src/caoscrawler/converters/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..edb7b3633cea2657dc3b9638379a3e57c37c87e4 --- /dev/null +++ b/src/caoscrawler/converters/__init__.py @@ -0,0 +1,47 @@ +# encoding: utf-8 +# +# This file is a part of the LinkAhead Project. +# +# Copyright (C) 2024 Indiscale GmbH <info@indiscale.com> +# Copyright (C) 2024 Daniel Hornung <d.hornung@indiscale.com> +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <https://www.gnu.org/licenses/>. + +"""Submodule containing all default and optional converters.""" + +from .. 
import utils +from .converters import * +from .xml_converter import * +from .zipfile_converter import ZipFileConverter + +try: + from .spss import SPSSConverter +except ImportError as err: + SPSSConverter: type = utils.MissingImport( + name="SPSSConverter", hint="Try installing with the `spss` extra option.", + err=err) + +try: + from .rocrate import (ELNFileConverter, ROCrateConverter, + ROCrateEntityConverter) +except ImportError as err: + ROCrateEntityConverter: type = utils.MissingImport( + name="ROCrateEntityConverter", hint="Try installing with the `rocrate` extra option.", + err=err) + ROCrateConverter: type = utils.MissingImport( + name="ROCrateConverter", hint="Try installing with the `rocrate` extra option.", + err=err) + ELNFileConverter: type = utils.MissingImport( + name="ELNFileConverter", hint="Try installing with the `rocrate` extra option.", + err=err) diff --git a/src/caoscrawler/converters.py b/src/caoscrawler/converters/converters.py similarity index 62% rename from src/caoscrawler/converters.py rename to src/caoscrawler/converters/converters.py index df8291d1f633c852ef853ca9e27d63129614dfdb..5577268b1884143ad36d067b25cafe2838031b86 100644 --- a/src/caoscrawler/converters.py +++ b/src/caoscrawler/converters/converters.py @@ -1,11 +1,11 @@ -#!/usr/bin/env python3 # encoding: utf-8 # -# ** header v3.0 -# This file is a part of the CaosDB Project. +# This file is a part of the LinkAhead Project. # +# Copyright (C) 2024 Indiscale GmbH <info@indiscale.com> # Copyright (C) 2021 Henrik tom Wörden -# 2021 Alexander Schlemmer +# Copyright (C) 2021 Alexander Schlemmer +# Copyright (C) 2024 Daniel Hornung <d.hornung@indiscale.com> # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU Affero General Public License as @@ -19,9 +19,8 @@ # # You should have received a copy of the GNU Affero General Public License # along with this program. If not, see <https://www.gnu.org/licenses/>. 
-# -# ** end header -# + +"""Converters take structure elements and create Records and new structure elements from them.""" from __future__ import annotations @@ -34,31 +33,35 @@ import warnings from abc import ABCMeta, abstractmethod from inspect import signature from string import Template -from typing import Any, List, Optional, Tuple, Union +from typing import Any, Callable, Optional, Union -import caosdb as db +import linkahead as db import pandas as pd import yaml import yaml_header_tools from jsonschema import ValidationError, validate -from .stores import GeneralStore, RecordStore -from .structure_elements import (BooleanElement, DictElement, Directory, File, - FloatElement, IntegerElement, JSONFile, - ListElement, NoneElement, StructureElement, - TextElement) -from .utils import has_parent +from ..stores import GeneralStore, RecordStore +from ..structure_elements import (BooleanElement, DictElement, Directory, File, + FloatElement, IntegerElement, JSONFile, + ListElement, NoneElement, StructureElement, + TextElement) +from ..utils import has_parent # These are special properties which are (currently) treated differently # by the converters: SPECIAL_PROPERTIES = ("description", "name", "id", "path", "file", "checksum", "size") -SINGLE_VAR_RE = re.compile(r"^\$(\{)?(?P<varname>[0-9a-zA-Z_]+)(\})?$") +ID_PATTERN = r"\D[.\w]*" +SINGLE_VAR_RE = re.compile(r"^\$(\{)?(?P<varname>" + ID_PATTERN + r")(\})?$") logger = logging.getLogger(__name__) class CrawlerTemplate(Template): - braceidpattern = r"(?a:[_a-z][_\.a-z0-9]*)" + # This also adds a dot to the default pattern. + # See: https://docs.python.org/3/library/string.html#template-strings + # Default flags is re.IGNORECASE + braceidpattern = ID_PATTERN def _only_max(children_with_keys): @@ -134,8 +137,8 @@ def replace_variables(propvalue: Any, values: GeneralStore): This function replaces variables in property values (and possibly other locations, where the crawler can replace cfood-internal variables). - If `propvalue` is a single variable name preceeded with a '$' (e.g. '$var' or '${var}'), then - the corresponding value stored in `values` is returned. + If ``propvalue`` is a single variable name preceeded by a ``$`` (e.g. ``$var`` or ``${var}``), + then the corresponding value stored in ``values`` is returned. In any other case the variable substitution is carried out as defined by string templates and a new string with the replaced variables is returned. """ @@ -160,77 +163,121 @@ def handle_value(value: Union[dict, str, list], values: GeneralStore): add as an additional property (multiproperty). Variable names (starting with a "$") are replaced by the corresponding value stored in the - `values` GeneralStore. + ``values`` GeneralStore. Parameters ---------- -value: - - if str, the value to be interpreted. E.g. "4", "hallo" or "$a" etc. - - if dict, must have keys "value" and "collection_mode". The returned tuple is directly - created from the corresponding values. - - if list, each element is checked for replacement and the resulting list will be used - as (list) value for the property +value: Union[dict, str, list] + - If *str*, the value to be interpreted. E.g. "4", "hello" or "$a" + etc. No unit is set and collection mode is determined from the + first character: + - '+' corresponds to "list" + - '*' corresponds to "multiproperty" + - everything else is "single" + - If *dict*, it must have a ``value`` key and may ``unit``, and + ``collection_mode``. 
The returned tuple is directly created from + the corresponding values if they are given; ``unit`` defaults to + None and ``collection_mode`` is determined from ``value`` as + explained for the str case above, i.e., + - if it starts with '+', collection mode is "list", + - in case of '*', collection mode is "multiproperty", + - and everything else is "single". + - If *list*, each element is checked for variable replacement and the + resulting list will be used as (list) value for the property Returns ------- out: tuple - the final value of the property; variable names contained in `values` are replaced. + - the final unit of the property; variable names contained in `values` are replaced. - the collection mode (can be single, list or multiproperty) """ # @review Florian Spreckelsen 2022-05-13 - if type(value) == dict: + propunit = None + propvalue = None + collection_mode = None + if isinstance(value, dict): if "value" not in value: # TODO: how do we handle this case? Just ignore? # or disallow? - raise NotImplementedError() + raise NotImplementedError(f"This definition has no \"value\": {value}") propvalue = value["value"] + if "unit" in value: + propunit = replace_variables(value["unit"], values) # can be "single", "list" or "multiproperty" - collection_mode = value["collection_mode"] - elif type(value) == str: - propvalue = value - collection_mode = "single" - if propvalue.startswith("+"): - collection_mode = "list" - propvalue = propvalue[1:] - elif propvalue.startswith("*"): - collection_mode = "multiproperty" - propvalue = propvalue[1:] - elif type(value) == list: - # TODO: (for review) - # This is a bit dirty right now and needed for - # being able to directly set list values. Semantics is, however, a bit - # different from the two cases above. - collection_mode = "single" - - # variables replacement: - propvalue = list() - for element in value: - # Do the element-wise replacement only, when its type is string: - if type(element) == str: - propvalue.append(replace_variables(element, values)) - else: - propvalue.append(element) - - return (propvalue, collection_mode) + if "collection_mode" in value: + collection_mode = value["collection_mode"] else: - # value is another simple type - collection_mode = "single" propvalue = value - # Return it immediately, otherwise variable substitution would be done and fail: - return (propvalue, collection_mode) + if collection_mode is None: + if isinstance(propvalue, str): + # Determine collection mode from string value + collection_mode = "single" + if propvalue.startswith("+"): + collection_mode = "list" + propvalue = propvalue[1:] + elif propvalue.startswith("*"): + collection_mode = "multiproperty" + propvalue = propvalue[1:] + elif isinstance(propvalue, list): + # TODO: (for review) + # This is a bit dirty right now and needed for + # being able to directly set list values. Semantics is, however, a bit + # different from the two cases above. 
+ collection_mode = "single" + + # variables replacement: + returnvalue = list() + for element in propvalue: + # Do the element-wise replacement only, when its type is string: + if isinstance(element, str): + returnvalue.append(replace_variables(element, values)) + else: + returnvalue.append(element) + + return (returnvalue, propunit, collection_mode) + else: + # value is another simple type + collection_mode = "single" + # Return it immediately, otherwise variable substitution would be done and fail: + return (propvalue, propunit, collection_mode) propvalue = replace_variables(propvalue, values) - return (propvalue, collection_mode) + return (propvalue, propunit, collection_mode) + +def create_records(values: GeneralStore, + records: RecordStore, + def_records: dict) -> list[tuple[str, str]]: + """ + Create records in GeneralStore `values` and RecordStore `records` as given + by the definition in `def_records`. + + This function will be called during scanning using the cfood definition. + It also should be used by CustomConverters to set records as automatic substitution + and other crawler features are applied automatically. + + Parameters + ---------- + values: GeneralStore + This GeneralStore will be used to access variables that are needed during variable substitution + in setting the properties of records and files. + Furthermore, the records that are generated in this function will be stored in this GeneralStore + **additionally to** storing them in the RecordStore given as the second argument to this function. + + records: RecordStore + The RecordStore where the generated records will be stored. + + Returns + ------- + : list[tuple[str, str]] + A list of tuples containing the record names (1st element of tuple) and respective property names + as 2nd element of the tuples. This list will be used by the scanner for creating the debug tree. -def create_records(values: GeneralStore, records: RecordStore, def_records: dict): - # list of keys to identify, which variables have been set by which paths: - # the items are tuples: - # 0: record name - # 1: property name + """ keys_modified = [] for name, record in def_records.items(): @@ -263,8 +310,22 @@ def create_records(values: GeneralStore, records: RecordStore, def_records: dict if (role == "Record" and "parents" not in record): c_record.add_parent(name) + if isinstance(record, str): + raise RuntimeError( + "dict expected, but found str: {}".format(record)) + + # We do a second run over the def_records, here. Having finished the first run + # for creating the records (in the variable and records stores) makes sure that + # records, that are defined on this level can already be accessed during variable substitution + # in the properties that will be set in the next block. + for name, record in def_records.items(): + # See above: + if record is None: + record = {} + c_record = records[name] + # Set the properties: for key, value in record.items(): if key == "parents" or key == "role": continue @@ -274,7 +335,7 @@ def create_records(values: GeneralStore, records: RecordStore, def_records: dict key = key_template.safe_substitute(**values.get_storage()) keys_modified.append((name, key)) - propvalue, collection_mode = handle_value(value, values) + propvalue, propunit, collection_mode = handle_value(value, values) if key.lower() in SPECIAL_PROPERTIES: # e.g. description, name, etc. 
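The following sketch (not part of the patch) illustrates the three-tuple now returned by `handle_value`: value, unit, and collection mode. It assumes that `GeneralStore` supports item assignment and that `handle_value` is imported directly from the `converters.converters` submodule; the expected results in the comments follow the rules described in the docstring above.

```python
from caoscrawler.converters.converters import handle_value
from caoscrawler.stores import GeneralStore

values = GeneralStore()
values["a"] = 4  # assumed: GeneralStore allows item assignment

# Plain strings keep collection mode "single" and carry no unit:
print(handle_value("4", values))          # -> ("4", None, "single")
# A leading "+" requests a list property, "*" a multiproperty;
# "$a" is substituted from the GeneralStore:
print(handle_value("+$a", values))        # -> (4, None, "list")
print(handle_value("*$a", values))        # -> (4, None, "multiproperty")
# Dict notation may add a unit; the collection mode is again derived
# from the value because no explicit "collection_mode" is given:
print(handle_value({"value": "5", "unit": "m"}, values))  # -> ("5", "m", "single")
```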
@@ -286,21 +347,29 @@ def create_records(values: GeneralStore, records: RecordStore, def_records: dict propvalue = os.path.normpath(propvalue) setattr(c_record, key.lower(), propvalue) else: - if c_record.get_property(key) is None: - if collection_mode == "list": - c_record.add_property(name=key, value=[propvalue]) + c_record.add_property(name=key, value=[propvalue], unit=propunit) elif (collection_mode == "multiproperty" or collection_mode == "single"): - c_record.add_property(name=key, value=propvalue) + c_record.add_property(name=key, value=propvalue, unit=propunit) else: if collection_mode == "list": + if (propunit and c_record.get_property(key).unit + and propunit != c_record.get_property(key).unit): + raise RuntimeError( + f"Property '{key}' has contradictory units: " + f"{propunit} and {c_record.get_property(key).unit}" + ) c_record.get_property(key).value.append(propvalue) + if propunit and not c_record.get_property(key).unit: + c_record.get_property(key).unit = propunit elif collection_mode == "multiproperty": - c_record.add_property(name=key, value=propvalue) + c_record.add_property(name=key, value=propvalue, unit=propunit) elif collection_mode == "single": c_record.get_property(key).value = propvalue + if propunit: + c_record.get_property(key).unit = propunit # no matter whether the record existed in the record store or not, # parents will be added when they aren't present in the record yet: @@ -315,17 +384,23 @@ def create_records(values: GeneralStore, records: RecordStore, def_records: dict class Converter(object, metaclass=ABCMeta): - """Converters treat StructureElements contained in the hierarchical sturcture.""" + """Converters treat StructureElements contained in the hierarchical sturcture. + + This is the abstract super class for all Converters. + """ def __init__(self, definition: dict, name: str, converter_registry: dict): """ Parameters ---------- - definition: dict, Please refer to ``src/doc/converters.rst`` to learn about the structure - that the definition dict must have. - converter_registry: dict, A dictionary that contains converter names as keys and dicts as - values. Those value dicts have the keys 'converter' and 'package'. + definition: dict + Please refer to ``src/doc/converters.rst`` to learn about the structure that the + definition dict must have. + converter_registry: dict + A dictionary that contains converter names as keys and dicts as values. Those value dicts + have the keys 'converter', 'package' and 'class'. 'converter' is the class name, + 'package' the module and 'class' the class instance of converters. """ self.definition = definition @@ -361,6 +436,8 @@ class Converter(object, metaclass=ABCMeta): self.converters.append(Converter.converter_factory( converter_definition, converter_name, converter_registry)) + self.setup() + def get_dict(self): """ Return a dictionary containing all the attributes from this structure element. @@ -370,13 +447,24 @@ class Converter(object, metaclass=ABCMeta): "type": str(type(self))[8:-2], "definition": self.definition} + def setup(self): + """ + Analogous to `cleanup`. Can be used to set up variables that are permanently + stored in this converter. + """ + pass + @staticmethod def converter_factory(definition: dict, name: str, converter_registry: dict): - """creates a Converter instance of the appropriate class. + """Create a Converter instance of the appropriate class. The `type` key in the `definition` defines the Converter class which is being used. 
""" + if definition is None: + raise RuntimeError("Definition of converter \"{}\" is " + "empty".format(name)) + if "type" not in definition: raise RuntimeError( "Type is mandatory for converter entries in CFood definition.") @@ -398,8 +486,8 @@ class Converter(object, metaclass=ABCMeta): Extract information from the structure element and store them as values in the general store. - Parameters: - ------------ + Parameters + ---------- values: GeneralStore The GeneralStore to store values in. @@ -413,13 +501,97 @@ class Converter(object, metaclass=ABCMeta): raise RuntimeError("Condition does not match.") values.update(m) + def match_properties(self, properties: dict, vardict: dict, label: str = "match_properties"): + """This method can be used to generically match 'match_properties' from the cfood definition + with the behavior described as follows: + + 'match_properties' is a dictionary of key-regexps and value-regexp pairs. Each key matches + a property name and the corresponding value matches its property value. + + What a property means in the context of the respective converter can be different, examples: + + * XMLTag: attributes of the node + * ROCrate: properties of the ROCrateEntity + * DictElement: properties of the dict + + label can be used to customize the name of the dictionary in the definition. + + This method is not called by default, but can be called from child classes. + + Typically it would be used like this from methods overwriting `match`:: + + if not self.match_properties(<properties>, vardict): + return None + + vardict will be updated in place when there are + matches. <properties> is a dictionary taken from the structure + element that contains the properties in the context of this + converter. + + + Parameters + ---------- + + properties: dict + The dictionary containing the properties to be matched. + + vardict: dict + This dictionary will be used to store the variables created during the matching. + + label: str + Default "match_properties". Can be used to change the name + of the property in the definition. E.g. the xml converter + uses "match_attrib" which makes more sense in the context + of xml trees. + + Returns + ------- + + : bool + Returns True when properties match and False + otherwise. The vardict dictionary is updated in place. + + """ + if label in self.definition: + # This matcher works analogously to the attributes matcher in the XMLConverter + for prop_def_key, prop_def_value in self.definition[label].items(): + match_counter = 0 + matched_m_prop = None + matched_m_prop_value = None + for prop_key, prop_value in properties.items(): + # print("{} = {}".format(prop_key, prop_value)) + # TODO: automatic conversion to str ok? + m_prop = re.match(prop_def_key, str(prop_key)) + if m_prop is not None: + match_counter += 1 + matched_m_prop = m_prop + # TODO: automatic conversion to str ok? + m_prop_value = re.match(prop_def_value, str(prop_value)) + if m_prop_value is None: + return False + matched_m_prop_value = m_prop_value + # TODO: How to deal with multiple matches? + # There are multiple options: + # - Allow multiple attribute-key matches: Leads to possible overwrites of variables + # - Require unique attribute-key and attribute-value matches: Very complex + # - Only allow one single attribute-key to match and run attribute-value match separately. + # Currently the latter option is implemented. + # TODO: The ROCrateEntityConverter implements a very similar behavior. 
+ if match_counter == 0: + return False + elif match_counter > 1: + raise RuntimeError("Multiple properties match the same {} entry.".format(label)) + vardict.update(matched_m_prop.groupdict()) + vardict.update(matched_m_prop_value.groupdict()) + return True + def apply_transformers(self, values: GeneralStore, transformer_functions: dict): """ Check if transformers are defined using the "transform" keyword. Then apply the transformers to the variables defined in GeneralStore "values". - Parameters: - ------------ + Parameters + ---------- values: GeneralStore The GeneralStore to store values in. @@ -433,10 +605,11 @@ class Converter(object, metaclass=ABCMeta): pass """ - if not "transform" in self.definition: + if "transform" not in self.definition: return for transformer_key, transformer in self.definition["transform"].items(): in_value = replace_variables(transformer["in"], values) + out_value = in_value for tr_func_el in transformer["functions"]: if not isinstance(tr_func_el, dict): @@ -447,10 +620,19 @@ class Converter(object, metaclass=ABCMeta): " one element with they key being the name" " of the function!") tr_func_key = list(tr_func_el.keys())[0] - tr_func_params = tr_func_el[tr_func_key] + if tr_func_key not in transformer_functions: raise RuntimeError("Unknown transformer function: {}".format(tr_func_key)) + # Do variable replacment on function parameters: + if tr_func_el[tr_func_key] is not None: + # Create a copy of the function parameters: + tr_func_params = dict(tr_func_el[tr_func_key]) + for key in tr_func_params: + tr_func_params[key] = replace_variables(tr_func_params[key], values) + else: + tr_func_params = None + # Retrieve the function from the dictionary: tr_func = transformer_functions[tr_func_key] # Call the function: @@ -469,13 +651,13 @@ class Converter(object, metaclass=ABCMeta): values[match.group('varname')] = out_value @abstractmethod - def create_children(self, values: GeneralStore, - element: StructureElement): + def create_children(self, values: GeneralStore, element: StructureElement): pass def create_records(self, values: GeneralStore, records: RecordStore, element: StructureElement): # TODO why is element passed but not used??? + # ANSWER: because it might be used by overriding child classes. if "records" not in self.definition: return [] @@ -486,7 +668,7 @@ class Converter(object, metaclass=ABCMeta): self.definition["records"]) def filter_children(self, children_with_strings: - List[Tuple[StructureElement, str]], expr: str, + list[tuple[StructureElement, str]], expr: str, group: str, rule: str): """Filter children according to regexp `expr` and `rule`.""" @@ -524,8 +706,8 @@ class Converter(object, metaclass=ABCMeta): result: Optional[dict]): """ Template for the debugging output for the match function """ msg = "\n--------" + name + "-----------\n" - for re, ma in zip(regexp, matched): - msg += "matching reg:\t" + re + "\n" + for exp, ma in zip(regexp, matched): + msg += "matching reg:\t" + exp + "\n" msg += "matching val:\t" + ma + "\n" msg += "---------\n" if result is None: @@ -584,8 +766,21 @@ class Converter(object, metaclass=ABCMeta): """ pass + def cleanup(self): + """ + This function is called when the converter runs out of scope and can be used to + clean up objects that were needed in the converter or its children. + """ + pass + class DirectoryConverter(Converter): + """ + Converter that matches and handles structure elements of type directory. + + This is one typical starting point of a crawling procedure. 
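    A hedged sketch of a definition dict for this converter; the regular expression and
    the reference file path are invented, and ``match_newer_than_file`` is optional
    (cf. ``match`` below)::

        definition = {
            "type": "Directory",
            "match": "^data_(?P<year>[0-9]{4})$",
            "match_newer_than_file": "/path/to/last_crawl_timestamp.txt",
        }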
+ """ + def create_children(self, generalStore: GeneralStore, element: StructureElement): # TODO: See comment on types and inheritance if not isinstance(element, Directory): @@ -619,6 +814,11 @@ class DirectoryConverter(Converter): m = re.match(self.definition["match"], element.name) if m is None: return None + if "match_newer_than_file" in self.definition: + last_modified = self._get_most_recent_change_in_dir(element) + reference = self._get_reference_file_timestamp() + if last_modified < reference: + return None return m.groupdict() @staticmethod @@ -629,7 +829,7 @@ class DirectoryConverter(Converter): element: A directory (of type Directory) which will be traversed. """ - children: List[StructureElement] = [] + children: list[StructureElement] = [] for name in sorted(os.listdir(element.path)): path = os.path.join(element.path, name) @@ -641,6 +841,49 @@ class DirectoryConverter(Converter): return children + @staticmethod + def _get_most_recent_change_in_dir(element: Directory) -> datetime.datetime: + """Return the datetime of the most recent change of any file + or directory in the given Directory element. + + """ + most_recent = os.path.getmtime(element.path) + + for root, _, files in os.walk(element.path): + mtimes = [os.path.getmtime(root)] + \ + [os.path.getmtime(os.path.join(root, fname)) for fname in files] + if max(mtimes) > most_recent: + most_recent = max(mtimes) + + return datetime.datetime.fromtimestamp(most_recent) + + def _get_reference_file_timestamp(self) -> datetime.datetime: + """Return a time stamp read from a reference file if it + exists. Otherwise return datetime.datetime.min, i.e., the + earliest datetime known to datetime. + + """ + + if "match_newer_than_file" not in self.definition: + logger.debug("No reference file specified.") + return datetime.datetime.min + + elif not os.path.isfile(self.definition["match_newer_than_file"]): + logger.debug("Reference file doesn't exist.") + return datetime.datetime.min + + with open(self.definition["match_newer_than_file"]) as ref_file: + stamp_str = ref_file.readline().strip() + try: + return datetime.datetime.fromisoformat(stamp_str) + except ValueError as e: + logger.error( + f"Reference file in {self.definition['match_newer_than_file']} " + "doesn't contain a ISO formatted datetime in its first line. " + "Match regardless of modification times." + ) + raise e + class SimpleFileConverter(Converter): """Just a file, ignore the contents.""" @@ -669,7 +912,7 @@ class SimpleFileConverter(Converter): class FileConverter(SimpleFileConverter): def __init__(self, *args, **kwargs): warnings.warn(DeprecationWarning( - "This class is depricated. Please use SimpleFileConverter.")) + "This class is deprecated. 
Please use SimpleFileConverter.")) super().__init__(*args, **kwargs) @@ -702,12 +945,12 @@ class MarkdownFileConverter(SimpleFileConverter): "Error during the validation (yaml header cannot be read) of the markdown file " "located at the following node in the data structure:\n" "{}\nError:\n{}".format(path, err)) - children: List[StructureElement] = [] + children: list[StructureElement] = [] for name, entry in header.items(): - if type(entry) == list: + if isinstance(entry, list): children.append(ListElement(name, entry)) - elif type(entry) == str: + elif isinstance(entry, str): children.append(TextElement(name, entry)) else: if generalStore is not None and self.name in generalStore: @@ -722,7 +965,9 @@ class MarkdownFileConverter(SimpleFileConverter): def convert_basic_element(element: Union[list, dict, bool, int, float, str, None], name=None, msg_prefix=""): """Convert basic Python objects to the corresponding StructureElements""" - if isinstance(element, list): + if isinstance(element, StructureElement): + return element + elif isinstance(element, list): return ListElement(name, element) elif isinstance(element, dict): return DictElement(name, element) @@ -774,6 +1019,12 @@ schema_resource: class DictElementConverter(Converter): + """ +**Operates on:** :py:class:`caoscrawler.structure_elements.DictElement` + +**Generates:** :py:class:`caoscrawler.structure_elements.StructureElement` + """ + def create_children(self, generalStore: GeneralStore, element: StructureElement): # TODO: See comment on types and inheritance if not isinstance(element, DictElement): @@ -811,20 +1062,199 @@ class DictElementConverter(Converter): # TODO: See comment on types and inheritance if not isinstance(element, DictElement): raise RuntimeError("Element must be a DictElement.") - return match_name_and_value(self.definition, element.name, element.value) + vardict = match_name_and_value(self.definition, element.name, element.value) + + if not self.match_properties(element.value, vardict): + return None + + return vardict + + +class PropertiesFromDictConverter(DictElementConverter): + """Extend the :py:class:`DictElementConverter` by a heuristic to set + property values from the dictionary keys. 
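    A hedged sketch of a definition for this converter, using the keys handled by the
    implementation below; the concrete names are invented, only
    ``record_from_dict.variable_name`` is required, and the ``type`` value must be the
    name under which this class is registered in the converter registry::

        definition = {
            "type": "PropertiesFromDictElement",
            "match": ".*",
            "record_from_dict": {
                "variable_name": "MyRecord",
                "parents": ["Dataset"],
                "properties_blacklist": ["internal_id"],
                "references": {
                    "author": {"parents": ["Person"]},
                },
            },
        }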
+ + """ + + def __init__(self, definition: dict, name: str, converter_registry: dict, + referenced_record_callback: Optional[callable] = None): + + super().__init__(definition, name, converter_registry) + self.referenced_record_callback = referenced_record_callback + + def _recursively_create_records(self, subdict: dict, root_record: db.Record, + root_rec_name: str, + values: GeneralStore, records: RecordStore, + referenced_record_callback: callable, + keys_modified: list = [] + ): + """Create a record form the given `subdict` and recursively create referenced records.""" + + blacklisted_keys = self.definition["record_from_dict"][ + "properties_blacklist"] if "properties_blacklist" in self.definition["record_from_dict"] else [] + special_references = self.definition["record_from_dict"]["references"] if "references" in self.definition["record_from_dict"] else [ + ] + + for key, value in subdict.items(): + + if key in blacklisted_keys: + # We ignore this in the automated property generation + continue + if isinstance(value, list): + if not any([isinstance(val, dict) for val in value]): + # no dict in list, i.e., no references, so this is simple + root_record.add_property(name=key, value=value) + else: + if not all([isinstance(val, dict) for val in value]): + # if this is not an error (most probably it is), this + # needs to be handled manually for now. + raise ValueError( + f"{key} in {subdict} contains a mixed list of references and scalars.") + ref_recs = [] + for ii, ref_dict in enumerate(value): + ref_var_name = f"{root_rec_name}.{key}.{ii+1}" + ref_rec, keys_modified = self._create_ref_rec( + ref_var_name, + key, + ref_dict, + special_references, + records, + values, + keys_modified, + referenced_record_callback + ) + ref_recs.append(ref_rec) + root_record.add_property(name=key, value=ref_recs) + + elif isinstance(value, dict): + # Treat scalar reference + ref_var_name = f"{root_rec_name}.{key}" + ref_rec, keys_modified = self._create_ref_rec( + ref_var_name, + key, + value, + special_references, + records, + values, + keys_modified, + referenced_record_callback + ) + root_record.add_property(key, ref_rec) + else: + # All that remains are scalar properties which may or + # may not be special attributes like name. + if key.lower() in SPECIAL_PROPERTIES: + setattr(root_record, key.lower(), value) + else: + root_record.add_property(name=key, value=value) + keys_modified.append((root_rec_name, key)) + + if referenced_record_callback: + root_record = referenced_record_callback(root_record, records, values) + + return keys_modified + + def _create_ref_rec( + self, + name: str, + key: str, + subdict: dict, + special_references: dict, + records: RecordStore, + values: GeneralStore, + keys_modified: list, + referenced_record_callback: callable + ): + """Create the referenced Record and forward the stores etc. to + ``_recursively_create_records``. + + Parameters: + ----------- + name : str + name of the referenced record to be created in RecordStore and Value Store. + key : str + name of the key this record's definition had in the original dict. + subdict : dict + subdict containing this record's definition from the original dict. + special_references : dict + special treatment of referenced records from the converter definition. 
+ records : RecordStore + RecordStore for entering new Records + values : GeneralStore + ValueStore for entering new Records + keys_modified : list + List for keeping track of changes + referenced_record_callback : callable + Advanced treatment of referenced records as given in the + converter initialization. + """ + ref_rec = db.Record() + if key in special_references: + for par in special_references[key]["parents"]: + ref_rec.add_parent(par) + else: + ref_rec.add_parent(key) + records[name] = ref_rec + values[name] = ref_rec + keys_modified = self._recursively_create_records( + subdict=subdict, + root_record=ref_rec, + root_rec_name=name, + values=values, + records=records, + referenced_record_callback=referenced_record_callback, + keys_modified=keys_modified + ) + return ref_rec, keys_modified + + def create_records(self, values: GeneralStore, records: RecordStore, + element: StructureElement): + + keys_modified = [] + + rfd = self.definition["record_from_dict"] + if rfd["variable_name"] not in records: + rec = db.Record() + if "name" in rfd: + rec.name = rfd["name"] + if "parents" in rfd: + for par in rfd["parents"]: + rec.add_parent(par) + else: + rec.add_parent(rfd["variable_name"]) + records[rfd["variable_name"]] = rec + values[rfd["variable_name"]] = rec + + else: + rec = records[rfd["variable_name"]] + + keys_modified = self._recursively_create_records( + subdict=element.value, + root_record=rec, + root_rec_name=rfd["variable_name"], + values=values, + records=records, + referenced_record_callback=self.referenced_record_callback, + keys_modified=keys_modified, + ) + + keys_modified.extend(super().create_records( + values=values, records=records, element=element)) + + return keys_modified class DictConverter(DictElementConverter): def __init__(self, *args, **kwargs): warnings.warn(DeprecationWarning( - "This class is depricated. Please use DictConverter.")) + "This class is deprecated. Please use DictElementConverter.")) super().__init__(*args, **kwargs) class DictDictElementConverter(DictElementConverter): def __init__(self, *args, **kwargs): warnings.warn(DeprecationWarning( - "This class is depricated. Please use DictElementConverter.")) + "This class is deprecated. Please use DictElementConverter.")) super().__init__(*args, **kwargs) @@ -889,7 +1319,7 @@ out: """ if "match_name" in definition: if "match" in definition: - raise RuntimeError(f"Do not supply both, 'match_name' and 'match'.") + raise RuntimeError("Do not supply both, 'match_name' and 'match'.") m1 = re.match(definition["match_name"], name) if m1 is None: @@ -1012,7 +1442,7 @@ class BooleanElementConverter(_AbstractScalarValueElementConverter): class DictBooleanElementConverter(BooleanElementConverter): def __init__(self, *args, **kwargs): warnings.warn(DeprecationWarning( - "This class is depricated. Please use BooleanElementConverter.")) + "This class is deprecated. Please use BooleanElementConverter.")) super().__init__(*args, **kwargs) @@ -1028,7 +1458,7 @@ class FloatElementConverter(_AbstractScalarValueElementConverter): class DictFloatElementConverter(FloatElementConverter): def __init__(self, *args, **kwargs): warnings.warn(DeprecationWarning( - "This class is depricated. Please use FloatElementConverter.")) + "This class is deprecated. 
Please use FloatElementConverter."))
        super().__init__(*args, **kwargs)


@@ -1043,7 +1473,7 @@ class TextElementConverter(_AbstractScalarValueElementConverter):
     def __init__(self, definition, *args, **kwargs):
         if "match" in definition:
             raise ValueError("""
-The 'match' key will in future be used to match a potential name of a TextElement. Please use
+The 'match' key is used to match a potential name of a TextElement. Please use
 the 'match_value' key to match the value of the TextElement and 'match_name' for matching the name.
 """)

@@ -1053,7 +1483,7 @@ the 'match_value' key to match the value of the TextElement and 'match_name' for
 class DictTextElementConverter(TextElementConverter):
     def __init__(self, *args, **kwargs):
         warnings.warn(DeprecationWarning(
-            "This class is depricated. Please use TextElementConverter."))
+            "This class is deprecated. Please use TextElementConverter."))
         super().__init__(*args, **kwargs)


@@ -1069,7 +1499,7 @@ class IntegerElementConverter(_AbstractScalarValueElementConverter):
 class DictIntegerElementConverter(IntegerElementConverter):
     def __init__(self, *args, **kwargs):
         warnings.warn(DeprecationWarning(
-            "This class is depricated. Please use IntegerElementConverter."))
+            "This class is deprecated. Please use IntegerElementConverter."))
         super().__init__(*args, **kwargs)


@@ -1079,19 +1509,16 @@ class ListElementConverter(Converter):
         # TODO: See comment on types and inheritance
         if not isinstance(element, ListElement):
             raise RuntimeError(
-                "This converter can only process DictListElements.")
+                "This converter can only process ListElements.")
         children: list[StructureElement] = []
         for index, list_element in enumerate(element.value):
-            # TODO(fspreck): Refactor this and merge with DictXXXElements maybe?
-            if isinstance(list_element, str):
-                children.append(TextElement(str(index), list_element))
-            elif isinstance(list_element, dict):
-                children.append(DictElement(str(index), list_element))
-            elif isinstance(list_element, StructureElement):
-                children.append(list_element)
-            else:
-                raise NotImplementedError(
-                    f"Unkown type {type(list_element)} in list element {list_element}.")
+            children.append(
+                convert_basic_element(
+                    list_element,
+                    name=f"{index}",
+                    msg_prefix=f"The value at index {index} in the list has an unknown type."
+                )
+            )
         return children

     def typecheck(self, element: StructureElement):
@@ -1114,7 +1541,7 @@ class ListElementConverter(Converter):
 class DictListElementConverter(ListElementConverter):
     def __init__(self, *args, **kwargs):
         warnings.warn(DeprecationWarning(
-            "This class is depricated. Please use ListElementConverter."))
+            "This class is deprecated. Please use ListElementConverter."))
         super().__init__(*args, **kwargs)


@@ -1128,15 +1555,22 @@ class TableConverter(Converter):
     The rows can be matched using a DictElementConverter.
     """
-    @abstractmethod
-    def get_options(self):
-        """
-        This method needs to be overwritten by the specific table converter to provide
-        information about the possible options.
+
+    def get_options(self) -> dict:
+        """Get specific options, e.g. from ``self.definition``.
+
+This method may be overwritten by the specific table converter to provide information about the
+possible options. Implementors may use ``TableConverter._get_options(...)`` to get (and convert)
+options from ``self.definition``.
+
+Returns
+-------
+out: dict
+  An options dict.
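A hedged sketch of how a subclass might override this method by delegating to
``_get_options`` (the option names below are examples only, cf. the XLSX and CSV
converters further down)::

    def get_options(self) -> dict:
        return self._get_options([
            ("sheet_name", str),
            ("header", int),
        ])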
""" - pass + return {} - def _get_options(self, possible_options): + def _get_options(self, possible_options: list[tuple[str, Callable]]) -> dict: option_dict = dict() for opt_name, opt_conversion in possible_options: if opt_name in self.definition: @@ -1164,8 +1598,22 @@ class TableConverter(Converter): return None return m.groupdict() + @staticmethod + def _children_from_dataframe(dataframe: pd.DataFrame): + child_elements = list() + for index, row in dataframe.iterrows(): + child_elements.append( + DictElement(str(index), row.to_dict())) + return child_elements + class XLSXTableConverter(TableConverter): + """ +**Operates on:** :py:class:`caoscrawler.structure_elements.File` + +**Generates:** :py:class:`caoscrawler.structure_elements.DictElement` + """ + def get_options(self): return self._get_options([ ("sheet_name", str), @@ -1187,11 +1635,7 @@ class XLSXTableConverter(TableConverter): if not isinstance(element, File): raise RuntimeError("Element must be a File.") table = pd.read_excel(element.path, **self.get_options()) - child_elements = list() - for index, row in table.iterrows(): - child_elements.append( - DictElement(str(index), row.to_dict())) - return child_elements + return self._children_from_dataframe(table) class CSVTableConverter(TableConverter): @@ -1216,22 +1660,19 @@ class CSVTableConverter(TableConverter): if not isinstance(element, File): raise RuntimeError("Element must be a File.") table = pd.read_csv(element.path, **self.get_options()) - child_elements = list() - for index, row in table.iterrows(): - child_elements.append( - DictElement(str(index), row.to_dict())) - return child_elements + return self._children_from_dataframe(table) class DateElementConverter(TextElementConverter): """allows to convert different text formats of dates to Python date objects. The text to be parsed must be contained in the "date" group. The format string can be supplied - under "dateformat" in the Converter definition. The library used is datetime so see its + under "date_format" in the Converter definition. The library used is datetime so see its documentation for information on how to create the format string. """ + # TODO make `date` parameter name configurable def match(self, element: StructureElement): matches = super().match(element) if matches is not None and "date" in matches: @@ -1240,3 +1681,24 @@ class DateElementConverter(TextElementConverter): self.definition["date_format"] if "date_format" in self.definition else "%Y-%m-%d" ).date()}) return matches + + +class DatetimeElementConverter(TextElementConverter): + """Convert text so that it is formatted in a way that LinkAhead can understand it. + +The text to be parsed must be in the ``val`` parameter. The format string can be supplied in the +``datetime_format`` node. 
This class uses the ``datetime`` module, so ``datetime_format`` must +follow this specificaton: +https://docs.python.org/3/library/datetime.html#strftime-and-strptime-format-codes + + """ + + # TODO make `val` parameter name configurable + def match(self, element: StructureElement): + matches = super().match(element) + if matches is not None and "val" in matches: + fmt_default = "%Y-%m-%dT%H:%M:%S" + fmt = self.definition.get("datetime_format", fmt_default) + dt_str = datetime.datetime.strptime(matches["val"], fmt).strftime(fmt_default) + matches.update({"val": dt_str}) + return matches diff --git a/src/caoscrawler/converters/hdf5_converter.py b/src/caoscrawler/converters/hdf5_converter.py new file mode 100644 index 0000000000000000000000000000000000000000..97dac53d053dbcb87c48f0cfb59d4f09770b9710 --- /dev/null +++ b/src/caoscrawler/converters/hdf5_converter.py @@ -0,0 +1,338 @@ +# +# This file is a part of the CaosDB Project. +# +# Copyright (C) 2023 IndiScale GmbH <info@indiscale.com> +# Copyright (C) 2023 Florian Spreckelsen <f.spreckelsen@indiscale.com> +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <https://www.gnu.org/licenses/>. +# + +from typing import Optional + +try: + import h5py +except ModuleNotFoundError: + raise ModuleNotFoundError( + "Couldn't find module h5py. Did you install the crawler package with " + "its optional `h5-crawler` dependency?" + ) + +from typing import Union + +import linkahead as db +import numpy as np + +from ..stores import GeneralStore, RecordStore +from ..structure_elements import (DictElement, File, FloatElement, + IntegerElement, StructureElement) +from .converters import (Converter, DictElementConverter, SimpleFileConverter, + convert_basic_element, match_name_and_value) + + +def convert_attributes(elt: Union[h5py.File, h5py.Group, h5py.Dataset]): + """Convert hdf5 attributes to a list of either basic scalar structure elements or ndarrays. + + Parameters + ---------- + elt : Union[h5py.File, h5py.Group, h5py.Dataset] + The hdf5 element the attributes of which will be converted to structure + elements. + + Returns + ------- + converted : list[StructureElement] + A list of the attributes converted to StructureElements (either basic + scalar elements or ndarray). + """ + + converted = [] + for name, value in elt.attrs.items(): + converted.append(convert_basic_element_with_nd_array( + value, name, f"The value of attribute {name} has an unknown type: {type(value)}.")) + + return converted + + +def convert_h5_element(elt: Union[h5py.Group, h5py.Dataset], name: str): + """Convert a given HDF5 element to the corresponding StructureElement. + + Parameters + ---------- + elt : Union[h5py.Group, h5py.Dataset] + The hdf5 element to be converted. + name : str + The name of the StructureElement that the hdf5 element is converted to. + + Raises + ------ + ValueError + In case of anything that is not convertible to a HDF5 structure element. 
+ + Returns + ------- + StructureElement + The converted StructureElement. + """ + + if isinstance(elt, h5py.Group): + + return H5GroupElement(name, elt) + + if isinstance(elt, h5py.Dataset): + + return H5DatasetElement(name, elt) + + raise ValueError("The given element must be either a HDF5 Group or Dataset object.") + + +def convert_basic_element_with_nd_array(value, name: Optional[str] = None, + internal_path: Optional[str] = None, msg_prefix: str = ""): + """Convert a given object either to an ndarray structure element or to a + basic scalar structure element. + + This function extends :func:`~caoscrawler.converters.convert_basic_element` + by a special treatment for certain numpy objects, most importantly + ndarrays. They are converted to a scalar in case of a size-1 array, to a + list in case of a 1-d array, and to a ``H5NdarrayElement`` in all other + cases. In addition, numpy integers and floats are also converted to + IntegerElements and FloatElements, respectively. + + Parameters + ---------- + value + The object to be converted. + name : str, optional + The name of the structure element ``value`` is being converted + to. Default is None. + internal_path : str, optional + The internal path of ``value`` within the HDF5 file. Default is None. + msg_prefix : str, optional + The prefix of the error message that will be raised. Default is ``""``. + + Returns + ------- + StructureElement + The StructureElement ``value`` was converted to. + + """ + + if isinstance(value, np.ndarray): + + if value.size == 1: + # this is a scalar stacked in a numpy array. We don't know its + # actual shape, so we reshape first, then use the actual value + # inside. + value = value.reshape((1,))[0] + + elif np.squeeze(value).ndim == 1: + # If the array is one-dimensional we can save it as a list + value = list(np.squeeze(value)) + + else: + # real multi-dimensional array + return H5NdarrayElement(name, value, internal_path) + + elif isinstance(value, np.int32) or isinstance(value, np.int64): + + return IntegerElement(name, value) + + elif isinstance(value, np.float64): + + return FloatElement(name, value) + + return convert_basic_element(value, name, msg_prefix) + + +class H5GroupElement(DictElement): + """StructureElement specific for HDF5 groups""" + + def __init__(self, name: str, value: h5py.Group): + super().__init__(name, value) + + +class H5DatasetElement(DictElement): + """StructureElement specific for HDF5 datasets.""" + + def __init__(self, name: str, value: h5py.Dataset): + super().__init__(name, value) + + +class H5NdarrayElement(DictElement): + """StructureElement specific for NDArrays within HDF5 files. + + Also store the internal path of the array within the HDF5 file in its + ``internal_path`` attribute. + + """ + + def __init__(self, name: str, value, internal_path: str): + super().__init__(name, value) + self.internal_path = internal_path + + +class H5FileConverter(SimpleFileConverter): + """Converter for HDF5 files that creates children for the contained + attributes, groups, and datasets. + + """ + + def create_children(self, generalStore: GeneralStore, element: StructureElement): + """Create children from root-level file attributes and contained hdf5 + elements. 
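        For orientation, a hedged sketch of how attribute values are converted by the
        helper ``convert_basic_element_with_nd_array`` used above (the values are
        invented)::

            np.float64(1.5)       # -> FloatElement
            np.int64(7)           # -> IntegerElement
            np.array([1.0, 2.0])  # -> ListElement (1-d arrays become lists)
            np.ones((3, 3))       # -> H5NdarrayElement (other arrays stay wrapped)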
+ + """ + + if not isinstance(element, File): + + raise ValueError("create_children should have been called with a File object.") + + ff = h5py.File(element.path, 'r') + + children = [] + + for name, value in ff.items(): + + children.append(convert_h5_element(value, name)) + + children.extend(convert_attributes(ff)) + + return children + + +class H5GroupConverter(DictElementConverter): + """Converter for HDF5 groups that creates children from the group-level + attributes and the contained subgroups and datasets. + + """ + + def typecheck(self, element: StructureElement): + + return isinstance(element, H5GroupElement) + + def create_children(self, generalStore: GeneralStore, element: StructureElement): + """Create children from group attributes and hdf5 elements contained in + this group. + + """ + + if not isinstance(element.value, h5py.Group): + + raise ValueError("create_children should have been called with a HDF5 Group object.") + + children = [] + + for name, value in element.value.items(): + + children.append(convert_h5_element(value, name)) + + children.append(convert_attributes(element.value)) + + return children + + +class H5DatasetConverter(DictElementConverter): + """Converter for HDF5 datasets that creates children from the dataset + attributes and the contained array data. + + """ + + def typecheck(self, element: StructureElement): + + return isinstance(element, H5DatasetElement) + + def create_children(self, generalStore: GeneralStore, element: StructureElement): + """Create children from the dataset attributes and append the array data + contained in this dataset. + + """ + + if not isinstance(element.value, h5py.Dataset): + + raise ValueError("create_children should have been called with a HDF5 Dataset object") + + children = convert_attributes(element.value) + + children.append( + H5NdarrayElement( + name=self.name+"_ndarray", + value=element.value, + internal_path=element.value.name + ) + ) + return children + + +class H5NdarrayConverter(Converter): + """Converter for ndarrays contained in HDF5 files. Creates the wrapper + record for this ndarray. + + """ + + def __init__(self, definition: dict, name: str, converter_registry: dict): + + # Check that a non-empty name for the record that will be created for + # the ndarray Record (within the cfood) is given + if not ("recordname" in definition and definition["recordname"]): + + raise RuntimeError(f"Converter {name} lacks the `recordname` definition.") + + super().__init__(definition, name, converter_registry) + + def create_children(self, values: GeneralStore, element: StructureElement): + """The ndarray doesn't have any further children.""" + + return [] + + def create_records(self, values: GeneralStore, records: RecordStore, element: StructureElement): + """Create a wrapper record with name ``recordname``, type + ``array_recordtype_name`` (default ``H5Ndarray``) and the internal path + stored in a property with name ``internal_path_property_name`` (default + ``internal_hdf5_path``). 
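        A hedged sketch of a definition using the keys read below; ``recordname`` is
        required, the other two keys fall back to the defaults shown, and the ``type``
        value must be the name under which this converter is registered::

            definition = {
                "type": "H5Ndarray",
                "match": ".*",
                "recordname": "my_array",
                "array_recordtype_name": "H5Ndarray",
                "internal_path_property_name": "internal_hdf5_path",
            }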
+ + """ + + rname = self.definition["recordname"] + if "array_recordtype_name" in self.definition: + rtname = self.definition["array_recordtype_name"] + else: + rtname = "H5Ndarray" + + if "internal_path_property_name" in self.definition: + propname = self.definition["internal_path_property_name"] + else: + propname = "internal_hdf5_path" + + rec = db.Record().add_parent(rtname) + records[rname] = rec + values[rname] = rec + + rec.add_property(name=propname, value=element.internal_path) + keys_modified = [(rname, propname)] + + keys_modified.extend(super().create_records(values, records, element)) + + return keys_modified + + def typecheck(self, element: StructureElement): + + return isinstance(element, H5NdarrayElement) + + @Converter.debug_matching("name") + def match(self, element: StructureElement): + + if not isinstance(element, H5NdarrayElement): + + raise RuntimeError("This converter can only be called with H5NdarrayElements.") + + return match_name_and_value(self.definition, element.name, element.value) diff --git a/src/caoscrawler/converters/rocrate.py b/src/caoscrawler/converters/rocrate.py new file mode 100644 index 0000000000000000000000000000000000000000..8a45af753312a2bf29c1ddb9e6bcb15458c3ebde --- /dev/null +++ b/src/caoscrawler/converters/rocrate.py @@ -0,0 +1,213 @@ +# encoding: utf-8 +# +# This file is a part of the LinkAhead Project. +# +# Copyright (C) 2024 Indiscale GmbH <info@indiscale.com> +# Copyright (C) 2024 Alexander Schlemmer +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <https://www.gnu.org/licenses/>. + +"""Converters take structure elements and create Records and new structure elements from them. + +This converter converts ro-crate files which may also be .eln-files. + +""" + +from __future__ import annotations + +import os +import re +import tempfile +from typing import Optional +from zipfile import ZipFile + +import rocrate +from rocrate.rocrate import ROCrate + +from ..stores import GeneralStore +from ..structure_elements import (Directory, File, ROCrateEntity, + StructureElement) +from .converters import Converter, SimpleFileConverter, convert_basic_element + + +class ROCrateConverter(SimpleFileConverter): + + """Convert ro-crate files / directories. + """ + + def setup(self): + self._tempdir = None + + def cleanup(self): + self._tempdir.cleanup() + + def typecheck(self, element: StructureElement): + """ + Check whether the current structure element can be converted using + this converter. + """ + return isinstance(element, File) or isinstance(element, Directory) + + def match(self, element: StructureElement) -> Optional[dict]: + m = re.match(self.definition["match"], element.name) + if m is None: + return None + return m.groupdict() + + def create_children(self, generalStore: GeneralStore, element: StructureElement): + """ + Loads an ROCrate from an rocrate file or directory. + + Arguments: + ---------- + element must be a File or Directory (structure element). 
+ + Returns: + -------- + A list with an ROCrateElement representing the contents of the .eln-file or None + in case of errors. + """ + + if isinstance(element, File): + self._tempdir = tempfile.TemporaryDirectory() + with ZipFile(element.path) as zipf: + zipf.extractall(self._tempdir.name) + crate_path = self._tempdir.name + crate = ROCrate(crate_path) + entity_ls = [] + for ent in crate.get_entities(): + entity_ls.append(ROCrateEntity(crate_path, ent)) + return entity_ls + elif isinstance(element, Directory): + # This would be an unzipped .eln file + # As this is possible for rocrate files, I think it is reasonable + # to support it as well. + raise NotImplementedError() + else: + raise ValueError("create_children was called with wrong type of StructureElement") + return None + + +class ELNFileConverter(ROCrateConverter): + + """Convert .eln-Files + See: https://github.com/TheELNConsortium/TheELNFileFormat + + These files are basically RO-Crates with some minor differences: + - The ro-crate metadata file is not on top-level within the .eln-zip-container, + but in a top-level subdirectory. + """ + + def create_children(self, generalStore: GeneralStore, element: StructureElement): + """ + Loads an ROCrate from an .eln-file or directory. + + This involves unzipping the .eln-file to a temporary folder and creating an ROCrate object + from its contents. + + Arguments: + ---------- + element must be a File or Directory (structure element). + + Returns: + -------- + A list with an ROCrateElement representing the contents of the .eln-file or None + in case of errors. + """ + + if isinstance(element, File): + self._tempdir = tempfile.TemporaryDirectory() + with ZipFile(element.path) as zipf: + zipf.extractall(self._tempdir.name) + cratep = os.listdir(self._tempdir.name) + if len(cratep) != 1: + raise RuntimeError(".eln file must contain exactly one folder") + crate_path = os.path.join(self._tempdir.name, cratep[0]) + crate = ROCrate(crate_path) + entity_ls = [] + for ent in crate.get_entities(): + entity_ls.append(ROCrateEntity(crate_path, ent)) + return entity_ls + elif isinstance(element, Directory): + # This would be an unzipped .eln file + # As this is possible for rocrate files, I think it is reasonable + # to support it as well. + raise NotImplementedError() + else: + raise ValueError("create_children was called with wrong type of StructureElement") + return None + + +class ROCrateEntityConverter(Converter): + + def typecheck(self, element: StructureElement): + """ + Check whether the current structure element can be converted using + this converter. + """ + return isinstance(element, ROCrateEntity) + + def match(self, element: StructureElement) -> Optional[dict]: + # See https://gitlab.indiscale.com/caosdb/src/caosdb-crawler/-/issues/145 + # for a suggestion for the design of the matching algorithm. + if not isinstance(element, ROCrateEntity): + raise TypeError("Element must be an instance of ROCrateEntity.") + + # Store the result of all individual regexp variable results: + vardict = {} + + # TODO: I accidentally used "match_type" instead + # of "match_entity_type". This was completely + # unnoticed. So add it to schema and adapt tests. 
+ + if "match_entity_type" in self.definition: + entity_type = element.entity.type + if isinstance(entity_type, list): + # TODO: this seems to be a bug in kadi4mat RO-Crates + # ./ has type ['Dataset'] + # instead of type 'Dataset' + entity_type = entity_type[0] + m_type = re.match(self.definition["match_entity_type"], entity_type) + if m_type is None: + return None + vardict.update(m_type.groupdict()) + + if not self.match_properties(element.entity.properties(), vardict): + return None + + return vardict + + def create_children(self, generalStore: GeneralStore, element: StructureElement): + + children = [] + + eprops = element.entity.properties() + + # Add the properties: + for name, value in eprops.items(): + children.append(convert_basic_element(value, name)) + + # Add the files: + if isinstance(element.entity, rocrate.model.file.File): + path, name = os.path.split(eprops["@id"]) + children.append(File(name, os.path.join(element.folder, path, name))) + + # Parts of this entity are added as child entities: + if "hasPart" in eprops: + for p in eprops["hasPart"]: + children.append( + ROCrateEntity(element.folder, element.entity.crate.dereference( + p["@id"]))) + + return children diff --git a/src/caoscrawler/converters/spss.py b/src/caoscrawler/converters/spss.py new file mode 100644 index 0000000000000000000000000000000000000000..00742e91506245435ed0c590f68ea9ffce65717a --- /dev/null +++ b/src/caoscrawler/converters/spss.py @@ -0,0 +1,302 @@ +# This file is a part of the LinkAhead Project. +# +# Copyright (C) 2024 IndiScale GmbH <info@indiscale.com> +# Copyright (C) 2024 Daniel Hornung <d.hornung@indiscale.com> +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <https://www.gnu.org/licenses/>. + +"""Converter for SAV files (stored by SPSS).""" + +from __future__ import annotations # Can be removed with 3.10. + +import argparse +from collections import OrderedDict +from typing import Any, Optional + +import numpy as np +import pandas as pd +import pyreadstat +import yaml + +from ..stores import GeneralStore +from ..structure_elements import File, StructureElement +from . import converters + +READSTAT_TYPES = { + "double": "DOUBLE", + "string": "TEXT", +} +ORIGINAL_TYPES = { + "EDATE8": "DATETIME", +} + + +class SPSSConverter(converters.TableConverter): + """Converter for SAV files (stored by SPSS).""" + + def create_children(self, values: GeneralStore, element: StructureElement) -> list: + assert isinstance(element, File) + # The default dtype backend "numpy_nullable" does not handle dates well. + # Note that pandas.ArrowDtype is considered experimental (in Pandas 2.2). 
+ df = pd.io.spss.read_spss(element.path, dtype_backend="pyarrow") + dtypes = read_column_types(element.path) + + # Fix datetime columns + for name, dtype in dtypes.items(): + if dtype != "DATETIME": + continue + col = df.loc[:, name] + col.fillna(np.nan, inplace=True) + col.replace([np.nan], [None], inplace=True) + + return self._children_from_dataframe(df) + + +def read_column_types(savfile: Optional[str] = None, meta: Optional[Any] = None) -> dict[str, str]: + """Read SAV file and return the column types. + +Optionally, take data from a previours reading. + +Parameters +---------- +savfile : Optional[str] + The SAV file to read. + +meta : Optional + The meta data result from `pyreadstat.read_sav(...)`. + +Returns +------- +out : dict[str, str] + The column names and types. + """ + if not meta: + _, meta = pyreadstat.read_sav(savfile, metadataonly=True) + elif savfile is not None: + raise ValueError("Only one of `savfile` and `meta` must be given.") + dtypes: dict[str, str] = {} + for name in meta.column_names: + datatype = ORIGINAL_TYPES.get(meta.original_variable_types[name], + READSTAT_TYPES[meta.readstat_variable_types[name]]) + dtypes[name] = datatype + return dtypes + + +def spss_to_yaml(savfile: str, yamlfile: str, cfood: Optional[str] = None) -> None: + """Parse the *.sav and create basic datamodel in ``yamlfile``. + +Parameters +---------- +cfood: str + If given, also create a cfood skeleton. + """ + _, meta = pyreadstat.read_sav(savfile, metadataonly=True) + dtypes = read_column_types(meta=meta) + + cfood_str = """ +--- +metadata: + macros: + - !defmacro + # Simple column value -> property rule + name: ColumnValue + params: + name: null + belongsto: BaseElement + type: TextElement + definition: + ${name}: + type: ${type} + match_name: ^${name}$$ + match_value: (?P<val>.*) + records: + ${belongsto}: + ${name}: $$val + - !defmacro + # column value -> reference property + name: ColumnValueReference + params: + name: null + reftype: null # RecordType of the reference + belongsto: BaseElement + type: TextElement # References are always text, right? + definition: + ${name}: + type: ${type} + match_name: ^${name}$$ + match_value: (?P<val>.*) + records: + ${reftype}: + name: $$val + ${belongsto}: + ${name}: $$${reftype} + - !defmacro + # Same as "ColumnValue", but also give name of property. 
+ name: ColumnValuePropname + params: + name: null + propname: null + belongsto: BaseElement + type: TextElement + definition: + ${name}: + type: ${type} + match_name: ^${name}$$ + match_value: (?P<val>.*) + records: + ${belongsto}: + ${propname}: $$val +--- +directory: # corresponds to the directory given to the crawler + type: Directory + match: .* # we do not care how it is named here + subtree: + # This is the file + thisfile: + type: SPSSFile + match: ".*sav" + subtree: + entry: + type: Dict + match: .* # Name is irrelevant + records: + MyParent: + subtree: !macro +""" + + enums: dict[str, list[str]] = {} + properties = OrderedDict() + + for name in meta.column_names: + prop = { + "datatype": dtypes[name], + } + desc = meta.column_names_to_labels.get(name) + if desc and desc != name: + prop["description"] = desc + # Handle categorial variables + if var_label := meta.variable_to_label.get(name): + vvl = meta.variable_value_labels[name] + # reproducible (and sensible) order + label_values = [vvl[key] for key in sorted(vvl.keys())] + if label_values not in enums.values(): + enums[var_label] = label_values + else: + var_label = [key for key, value in enums.items() if value == label_values][0] + prop["datatype"] = var_label + properties[name] = prop + + output = f"""# auto-generated data model from file "{savfile}". +# To insert a datamodel into LinkAhead, run: +# +# python3 -m caosadvancedtools.models.parser datamodel.yaml --sync + +""" + + # Actual datamodel + output += """ +######### +# Enums # +######### + +""" + for name, values in enums.items(): + output += f"""{name}: + description: + # possible values: {values}\n""" + + output += (""" +############### +# RecordTypes # +############### + +DummyRT: + description: Note: Change name and enter description. + recommended_properties: + """ + + " ".join(yaml.dump(dict(properties), # from OrderedDict to dict + allow_unicode=True, + sort_keys=False).splitlines(keepends=True))) + + # Experimental: Enum creation + output += """ +############### +# Enum values # +############### +""" + for name, values in enums.items(): + output += f"\n# ### {name} ###\n" + for value in values: + output += f""" +{value}: + role: Record + inherit_from_suggested: + - {name} +""" + + with open(yamlfile, encoding="utf-8", mode="w") as myfile: + myfile.write(output) + + if cfood: + defs_col_value: list[str] = [] + defs_col_value_ref: list[str] = [] + prefix = " " * 14 + for name, propdef in properties.items(): + def_str = prefix + f"- name: {name}\n" + dtype = None + reftype = None + defs = defs_col_value + # Which type? 
+ if propdef["datatype"] == "DOUBLE": + dtype = "FloatElement" + elif propdef["datatype"] in ("TEXT", "DATETIME"): + dtype = None + else: + reftype = propdef["datatype"] + defs = defs_col_value_ref + + # Append according to types: + if reftype: + def_str += prefix + f" reftype: {reftype}\n" + if dtype: + def_str += prefix + f" type: {dtype}\n" + + # Store result + defs.append(def_str) + del defs + + cfood_str += (prefix[2:] + "ColumnValue:\n" + "".join(defs_col_value) + + prefix[2:] + "ColumnValueReference:\n" + "".join(defs_col_value_ref) + ) + with open(cfood, encoding="utf-8", mode="w") as myfile: + myfile.write(cfood_str) + + +def _parse_arguments(): + """Parse the arguments.""" + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument('-i', '--input', help="The *.sav file.", required=True) + parser.add_argument('-o', '--outfile', help="Yaml filename to save the result", required=True) + parser.add_argument('--cfood', help="Yaml filename to create cfood output in", required=False) + + return parser.parse_args() + + +def spss_to_datamodel_main(): + """The main function of this script.""" + args = _parse_arguments() + spss_to_yaml(savfile=args.input, yamlfile=args.outfile, cfood=args.cfood) + print(f"Written datamodel to: {args.outfile}") + if args.cfood: + print(f"Written cfood to: {args.cfood}") diff --git a/src/caoscrawler/converters/xml_converter.py b/src/caoscrawler/converters/xml_converter.py new file mode 100644 index 0000000000000000000000000000000000000000..60d7b49431fb011a06b7105a16471b0b3c7b2268 --- /dev/null +++ b/src/caoscrawler/converters/xml_converter.py @@ -0,0 +1,234 @@ +# encoding: utf-8 +# +# This file is a part of the LinkAhead Project. +# +# Copyright (C) 2024 Indiscale GmbH <info@indiscale.com> +# Copyright (C) 2024 Alexander Schlemmer +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <https://www.gnu.org/licenses/>. + +"""Converters take structure elements and create Records and new structure elements from them.""" + +from __future__ import annotations + +import re +from typing import Optional + +import lxml.etree + +from ..stores import GeneralStore +from ..structure_elements import (File, StructureElement, XMLAttributeNode, + XMLTagElement, XMLTextNode) +from .converters import (Converter, ConverterValidationError, + SimpleFileConverter) + + +class XMLFileConverter(SimpleFileConverter): + + """Convert XML files. See + https://gitlab.indiscale.com/caosdb/src/caosdb-crawler/-/issues/145 + for the current suggestion for the specification. 
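    A hedged sketch of a definition for this converter; the file pattern is invented and
    the ``type`` value must be the name under which this converter is registered. A
    ``validate`` key is read by ``create_children`` below, but XML validation itself is
    not implemented yet::

        definition = {
            "type": "XMLFile",
            "match": r".*\.xml$",
        }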
+ + """ + + def create_children(self, generalStore: GeneralStore, element: StructureElement): + # TODO: See comment on types and inheritance + if not isinstance(element, File): + raise ValueError("create_children was called with wrong type of StructureElement") + with open(element.path, 'r') as xml_file: + xml = lxml.etree.parse(xml_file) + if "validate" in self.definition and self.definition["validate"]: + try: + raise NotImplementedError("XML validation not implemented yet.") + except ConverterValidationError as err: + raise ConverterValidationError( + "Error during the validation of the XML file:\n" + f"{element.path}\n" + err.message) + + return [XMLTagElement(xml.getroot())] + + +class XMLTagConverter(Converter): + def create_children(self, generalStore: GeneralStore, element: StructureElement): + """Children that are generated by this function are the + result of the xpath query given in the yaml property + ``xpath``. Its default (when not given) is ``child::*``, so the + direct children of the current xml node. The xpath expression + must be designed in a way that it returns xml tags (and no + attributes or texts). That means, that the axis ``attribute::`` + and the function ``text()`` must not be used. + + The following yaml properties can be used to generate other + types of nodes (text nodes and attribute nodes) as subtree + structure elements: + + :: + + # _*_ marks the default: + attribs_as_children: true # true / _false_ + text_as_children: true # true / _false_ + tags_as_children: true # _true_ / false + + The default is to generate the tags matched by the xpath expression only. + + - When text_as_children is set to true, text nodes will be generated that contain the text + contained in the matched tags. + - When attribs_as_children is set to true, attribute nodes will be generated from the attributes + of the matched tags. + + Notes + ----- + The default is to take the namespace map from the current node and use it in xpath queries. + Because default namespaces cannot be handled by xpath, it is possible to remap the default namespace + using the key ``default_namespace``. + The key ``nsmap`` can be used to define additional nsmap entries. + + """ + if not isinstance(element, XMLTagElement): + raise TypeError("Element must be an instance of XMLTagElement.") + + # Get the namespace map from the element: + nsmap = element.tag.nsmap + # The default name of the default namespace is "default". 
+ # You can overwrite it using the attribute "default_namespace" in the converter definition: + default_namespace = self.definition.get("default_namespace", "default") + if None in nsmap: + nsmap[default_namespace] = nsmap[None] + del nsmap[None] + + # Set additional nsmap entries from the converter definition: + if "nsmap" in self.definition: + for key, value in self.definition["nsmap"].items(): + nsmap[key] = value + + xpath = self.definition.get("xpath", "child::*") + children = element.tag.xpath(xpath, namespaces=nsmap) + el_lst = [] + for el in children: + if isinstance(el, str): + raise RuntimeError( + "Only standard xml nodes are supported as results of xpath queries.") + elif isinstance(el, lxml.etree._Element): + if self.definition.get("tags_as_children", True): + el_lst.append(XMLTagElement(el)) + if self.definition.get("attribs_as_children", False): + for attrib in el.attrib: + el_lst.append(XMLAttributeNode(el, attrib)) + if self.definition.get("text_as_children", False): + el_lst.append(XMLTextNode(el)) + else: + raise RuntimeError("Unsupported child type.") + return el_lst + + def typecheck(self, element: StructureElement): + """ + Check whether the current structure element can be converted using + this converter. + """ + return isinstance(element, XMLTagElement) + + def match(self, element: StructureElement) -> Optional[dict]: + # See https://gitlab.indiscale.com/caosdb/src/caosdb-crawler/-/issues/145 + # for a suggestion for the design of the matching algorithm. + if not isinstance(element, XMLTagElement): + raise TypeError("Element must be an instance of XMLTagElement.") + + # Store the result of all individual regexp variable results: + vardict = {} + + if "match_tag" in self.definition: + m_tag = re.match(self.definition["match_tag"], element.tag.tag) + if m_tag is None: + return None + vardict.update(m_tag.groupdict()) + + if "match_text" in self.definition: + tagtext = element.tag.text + if element.tag.text is None: + tagtext = "" + m_text = re.match(self.definition["match_text"], tagtext, re.DOTALL) + if m_text is None: + return None + vardict.update(m_text.groupdict()) + + if not self.match_properties(element.tag.attrib, vardict, "match_attrib"): + return None + + return vardict + + +class XMLTextNodeConverter(Converter): + def create_children(self, generalStore: GeneralStore, element: StructureElement): + """ + This converter does not create children. + """ + return [] + + def typecheck(self, element: StructureElement): + """ + Check whether the current structure element can be converted using + this converter. + """ + return isinstance(element, XMLTextNode) + + def match(self, element: StructureElement) -> Optional[dict]: + # See https://gitlab.indiscale.com/caosdb/src/caosdb-crawler/-/issues/145 + # for a suggestion for the design of the matching algorithm. + if not isinstance(element, XMLTextNode): + raise TypeError("Element must be an instance of XMLTextNode.") + + vardict = {} + + m_text = re.match(self.definition["match_text"], element.value, + re.DOTALL) + if m_text is None: + return None + vardict.update(m_text.groupdict()) + + return vardict + + +class XMLAttributeNodeConverter(Converter): + def create_children(self, generalStore: GeneralStore, element: StructureElement): + """ + This converter does not create children. + """ + return [] + + def typecheck(self, element: StructureElement): + """ + Check whether the current structure element can be converted using + this converter. 
+        """
+        return isinstance(element, XMLAttributeNode)
+
+    def match(self, element: StructureElement) -> Optional[dict]:
+        # See https://gitlab.indiscale.com/caosdb/src/caosdb-crawler/-/issues/145
+        # for a suggestion for the design of the matching algorithm.
+        if not isinstance(element, XMLAttributeNode):
+            raise TypeError("Element must be an instance of XMLAttributeNode.")
+
+        vardict = {}
+
+        m_name = re.match(self.definition["match_name"], element.key)
+        if m_name is None:
+            return None
+        vardict.update(m_name.groupdict())
+
+        m_value = re.match(self.definition["match_value"], element.value)
+        if m_value is None:
+            return None
+        vardict.update(m_value.groupdict())
+
+        return vardict
diff --git a/src/caoscrawler/converters/zipfile_converter.py b/src/caoscrawler/converters/zipfile_converter.py
new file mode 100644
index 0000000000000000000000000000000000000000..7073e66a266168e17eb9b6143e7dc6292b5149dc
--- /dev/null
+++ b/src/caoscrawler/converters/zipfile_converter.py
@@ -0,0 +1,82 @@
+# encoding: utf-8
+#
+# This file is a part of the LinkAhead Project.
+#
+# Copyright (C) 2024 Indiscale GmbH <info@indiscale.com>
+# Copyright (C) 2024 Alexander Schlemmer
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <https://www.gnu.org/licenses/>.
+
+"""Converters take structure elements and create Records and new structure elements from them.
+
+This converter opens zip files, unzips them into a temporary directory and
+exposes their contents as File and Directory structure elements.
+
+"""
+
+from __future__ import annotations
+
+import os
+import tempfile
+from os.path import isdir, join
+from zipfile import ZipFile
+
+from ..stores import GeneralStore
+from ..structure_elements import Directory, File, StructureElement
+from .converters import SimpleFileConverter
+
+
+class ZipFileConverter(SimpleFileConverter):
+
+    """Convert zipfiles.
+    """
+
+    def setup(self):
+        self._tempdir = None
+
+    def cleanup(self):
+        self._tempdir.cleanup()
+
+    def create_children(self, generalStore: GeneralStore, element: StructureElement):
+        """
+        Unzip the given zip file into a temporary directory and expose its
+        top-level contents as structure elements.
+
+        Arguments:
+        ----------
+        element must be a File structure element pointing to a zip file.
+
+        Returns:
+        --------
+        A list of File and Directory structure elements, one for each top-level
+        entry of the unzipped archive.
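        A small usage sketch (the archive name is a placeholder; the
        ``(definition, name, converter_registry)`` constructor arguments are
        assumed to follow the pattern of the other converters)::

            import zipfile

            from caoscrawler.converters.zipfile_converter import ZipFileConverter
            from caoscrawler.stores import GeneralStore
            from caoscrawler.structure_elements import File

            # Build a toy archive so that the example is self-contained.
            with zipfile.ZipFile("example.zip", "w") as zf:
                zf.writestr("readme.txt", "hello")
                zf.writestr("raw/data.csv", "a,b\n1,2\n")

            conv = ZipFileConverter(definition={}, name="DataZip", converter_registry={})
            conv.setup()
            children = conv.create_children(GeneralStore(), File("example.zip", "example.zip"))
            print(sorted(type(el).__name__ for el in children))  # ['Directory', 'File']
            conv.cleanup()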
+ """ + + if isinstance(element, File): + self._tempdir = tempfile.TemporaryDirectory() + unzd_path = self._tempdir.name + with ZipFile(element.path) as zipf: + zipf.extractall(unzd_path) + + entity_ls = [] + for el in os.listdir(unzd_path): + path = join(unzd_path, el) + if isdir(path): + entity_ls.append(Directory(el, path)) + else: + entity_ls.append(File(el, path)) + + return entity_ls + else: + raise ValueError("create_children was called with wrong type of StructureElement") + return None diff --git a/src/caoscrawler/crawl.py b/src/caoscrawler/crawl.py index 79fd72b0b54d132b2ea35b7047448c03fbfc390f..e0d243979faee8f44cdcee3b0e49c15af640c378 100644 --- a/src/caoscrawler/crawl.py +++ b/src/caoscrawler/crawl.py @@ -3,10 +3,11 @@ # # This file is a part of the CaosDB Project. # -# Copyright (C) 2021 Henrik tom Wörden <h.tomwoerden@indiscale.com> -# 2021-2023 Research Group Biomedical Physics, -# Max-Planck-Institute for Dynamics and Self-Organization Göttingen -# Alexander Schlemmer <alexander.schlemmer@ds.mpg.de> +# Copyright (C) 2021-2024 Henrik tom Wörden <h.tomwoerden@indiscale.com> +# Copyright (C) 2021-2023 Research Group Biomedical Physics, MPI-DS Göttingen +# Copyright (C) 2021-2023 Alexander Schlemmer <alexander.schlemmer@ds.mpg.de> +# Copyright (C) 2021-2024 Indiscale GmbH <info@indiscale.com> +# Copyright (C) 2024 Daniel Hornung <d.hornung@indiscale.com> # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU Affero General Public License as @@ -26,7 +27,7 @@ """ Crawl a file structure using a yaml cfood definition and synchronize -the acuired data with CaosDB. +the acuired data with LinkAhead. """ from __future__ import annotations @@ -42,35 +43,34 @@ from argparse import RawTextHelpFormatter from copy import deepcopy from datetime import datetime from enum import Enum -from typing import Any, Optional, Union +from typing import Any, List, Optional, Union -import caosdb as db +import linkahead as db import yaml from caosadvancedtools.cache import UpdateCache from caosadvancedtools.crawler import Crawler as OldCrawler from caosadvancedtools.serverside.helper import send_mail from caosadvancedtools.utils import create_entity_link -from caosdb.apiutils import (EntityMergeConflictError, compare_entities, - merge_entities) -from caosdb.cached import cache_clear, cached_get_entity_by -from caosdb.exceptions import EmptyUniqueQueryError +from linkahead.apiutils import compare_entities, merge_entities +from linkahead.cached import cache_clear, cached_get_entity_by from linkahead.common.datatype import get_list_datatype, is_reference -from linkahead.utils.escape import escape_quoted_text +from linkahead.exceptions import TransactionError +from linkahead.utils.escape import escape_squoted_text from .config import get_config_setting from .converters import Converter, ConverterValidationError from .debug_tree import DebugTree -from .identifiable import Identifiable +from .exceptions import ImpossibleMergeError from .identifiable_adapters import (CaosDBIdentifiableAdapter, - IdentifiableAdapter, - LocalStorageIdentifiableAdapter) -from .identified_cache import IdentifiedCache + IdentifiableAdapter) from .logging import configure_server_side_logging from .macros import defmacro_constructor, macro_constructor from .scanner import (create_converter_registry, initialize_converters, load_definition, scan_directory, scan_structure_elements) from .stores import GeneralStore from .structure_elements import StructureElement +from .sync_graph import 
SyncGraph +from .utils import get_shared_resource_link logger = logging.getLogger(__name__) @@ -95,7 +95,7 @@ in a quite complex fashion: - If one of the entities has additional parents or additional properties -> not identical - If the value of one of the properties differs -> not identical - If datatype, importance or unit are reported different for a property by compare_entities - return "not_identical" only if these attributes are set explicitely by record1. + return False only if these attributes are set explicitely by record1. Ignore the difference otherwise. - If description, name, id or path appear in list of differences -> not identical. - If file, checksum, size appear -> Only different, if explicitely set by record1. @@ -172,48 +172,6 @@ def _resolve_datatype(prop: db.Property, remote_entity: db.Entity): return prop -def _treat_merge_error_of(newrecord, record): - """ - The parameters are two entities that cannot be merged with the merge_entities function. - - # This function checks for two obvious cases where no merge will ever be possible: - # 1. Two Entities with differing IDs - # 2. Two non-Entity values which differ - - It creates a more informative logger message and raises an Exception in those cases. - """ - for this_p in newrecord.properties: - that_p = record.get_property(this_p.name) - if (isinstance(this_p.value, db.Entity) - and isinstance(that_p.value, db.Entity)): - if this_p.value.id is not None and that_p.value.id is not None: - if this_p.value.id != that_p.value.id: - logger.error("The Crawler is trying to merge two entities " - "because they should be the same object (same" - " identifiables), but they reference " - "different Entities with the same Property." - f"Problematic Property: {this_p.name}\n" - f"Referenced Entities: {this_p.value.id} and " - f"{that_p.value.id}\n" - f"{record}\n{newrecord}") - raise RuntimeError("Cannot merge Entities") - elif (not isinstance(this_p.value, db.Entity) - and not isinstance(that_p.value, db.Entity)): - if ((this_p.value != that_p.value) - # TODO can we also compare lists? - and not isinstance(this_p.value, list) - and not isinstance(that_p.value, list)): - logger.error("The Crawler is trying to merge two entities " - "because they should be the same object (same" - " identifiables), but they have " - "different values for the same Property." - f"Problematic Property: {this_p.name}\n" - f"Values: {this_p.value} and " - f"{that_p.value}\n" - f"{record}\n{newrecord}") - raise RuntimeError("Cannot merge Entities") - - class SecurityMode(Enum): RETRIEVE = 0 INSERT = 1 @@ -256,14 +214,13 @@ class Crawler(object): # The following caches store records, where we checked whether they exist on the remote # server. Since, it is important to know whether they exist or not, we store them into two # different caches. - self.remote_existing_cache = IdentifiedCache() - self.remote_missing_cache = IdentifiedCache() + # TODO does it make sense to have this as member variable? self.securityMode = securityMode # TODO does it make sense to have this as member variable(run_id)? 
self.generate_run_id() - self.identifiableAdapter: IdentifiableAdapter = LocalStorageIdentifiableAdapter() + self.identifiableAdapter: IdentifiableAdapter = CaosDBIdentifiableAdapter() if identifiableAdapter is not None: self.identifiableAdapter = identifiableAdapter @@ -340,391 +297,59 @@ class Crawler(object): self.crawled_data = data return data - def _has_reference_value_without_id(self, ident: Identifiable) -> bool: - """ - Returns True if there is at least one value in the properties and backrefs attributes of - ``ident`` which: + def _split_into_inserts_and_updates(self, st: SyncGraph): + """Classify nodes in the SyncGraph ``st`` with respect to their state on the server. - a) is a reference property AND - b) where the value is set to a - :external+caosdb-pylib:py:class:`db.Entity <caosdb.common.models.Entity>` - (instead of an ID) AND - c) where the ID of the value (the - :external+caosdb-pylib:py:class:`db.Entity <caosdb.common.models.Entity>` object in b)) - is not set (to an integer) - - Returns - ------- - bool - True if there is a value without id (see above) +This method iteratively checks whether those nodes exist on the remote server and creates two lists, +one with the entities that need to be updated and the other with entities to be inserted. - Raises - ------ - ValueError - If no Identifiable is given. - """ - if ident is None: - raise ValueError("Identifiable has to be given as argument") - for pvalue in list(ident.properties.values()) + ident.backrefs: - if isinstance(pvalue, list): - for el in pvalue: - if isinstance(el, db.Entity) and el.id is None: - return True - elif isinstance(pvalue, db.Entity) and pvalue.id is None: - return True - return False +.. todo:: - @staticmethod - def create_flat_list(ent_list: list[db.Entity], flat: Optional[list[db.Entity]] = None): - """ - Recursively adds entities and all their properties contained in ent_list to - the output list flat. + Should this be made into a public method of SyncGraph instead? At the moment, this is a + purely static method that only operates on the state of ``st``. - TODO: This function will be moved to pylib as it is also needed by the - high level API. - """ - # Note: A set would be useful here, but we do not want a random order. - if flat is None: - flat = list() - for el in ent_list: - if el not in flat: - flat.append(el) - for ent in ent_list: - for p in ent.properties: - # For lists append each element that is of type Entity to flat: - if isinstance(p.value, list): - for el in p.value: - if isinstance(el, db.Entity): - if el not in flat: - flat.append(el) - Crawler.create_flat_list([el], flat) - elif isinstance(p.value, db.Entity): - if p.value not in flat: - flat.append(p.value) - Crawler.create_flat_list([p.value], flat) - return flat - - def _has_missing_object_in_references(self, ident: Identifiable, referencing_entities: list): """ - returns False if any value in the properties attribute is a db.Entity object that - is contained in the `remote_missing_cache`. If ident has such an object in - properties, it means that it references another Entity, where we checked - whether it exists remotely and it was not found. 
- """ - if ident is None: - raise ValueError("Identifiable has to be given as argument") - for pvalue in list(ident.properties.values()) + ident.backrefs: - # Entity instead of ID and not cached locally - if (isinstance(pvalue, list)): - for el in pvalue: - if (isinstance(el, db.Entity) and self.get_from_remote_missing_cache( - self.identifiableAdapter.get_identifiable(el, referencing_entities)) is not None): - return True - if (isinstance(pvalue, db.Entity) and self.get_from_remote_missing_cache( - self.identifiableAdapter.get_identifiable(pvalue, referencing_entities)) is not None): - # might be checked when reference is resolved - return True - return False - - def replace_references_with_cached(self, record: db.Record, referencing_entities: list): - """ - Replace all references with the versions stored in the cache. + entity_was_treated = True + # st.unchecked contains Entities which could not yet be checked against the remote server + while entity_was_treated and len(st.unchecked) > 0: + entity_was_treated = False - If the cache version is not identical, raise an error. - """ - for p in record.properties: - if (isinstance(p.value, list)): - lst = [] - for el in p.value: - if (isinstance(el, db.Entity) and el.id is None): - cached = self.get_from_any_cache( - self.identifiableAdapter.get_identifiable(el, referencing_entities)) - if cached is None: - lst.append(el) - continue - if not check_identical(cached, el, True): - if isinstance(p.value, db.File): - if p.value.path != cached.path: - raise RuntimeError( - "The cached and the refernced entity are not identical.\n" - f"Cached:\n{cached}\nReferenced:\n{el}" - ) - else: - raise RuntimeError( - "The cached and the refernced entity are not identical.\n" - f"Cached:\n{cached}\nReferenced:\n{el}" - ) - lst.append(cached) - else: - lst.append(el) - p.value = lst - if (isinstance(p.value, db.Entity) and p.value.id is None): - cached = self.get_from_any_cache( - self.identifiableAdapter.get_identifiable(p.value, referencing_entities)) - if cached is None: + for se in st.unchecked: + if se.identifiable is None: # we cannot yet identify this node continue - if not check_identical(cached, p.value, True): - if isinstance(p.value, db.File): - if p.value.path != cached.path: - raise RuntimeError( - "The cached and the refernced entity are not identical.\n" - f"Cached:\n{cached}\nReferenced:\n{p.value}" - ) - else: - raise RuntimeError( - "The cached and the refernced entity are not identical.\n" - f"Cached:\n{cached}\nReferenced:\n{p.value}" - ) - p.value = cached - - def get_from_remote_missing_cache(self, identifiable: Identifiable): - """ - returns the identified record if an identifiable with the same values already exists locally - (Each identifiable that is not found on the remote server, is 'cached' locally to prevent - that the same identifiable exists twice) - """ - if identifiable is None: - raise ValueError("Identifiable has to be given as argument") - - if identifiable in self.remote_missing_cache: - return self.remote_missing_cache[identifiable] - else: - return None - - def get_from_any_cache(self, identifiable: Identifiable): - """ - returns the identifiable if an identifiable with the same values already exists locally - (Each identifiable that is not found on the remote server, is 'cached' locally to prevent - that the same identifiable exists twice) - """ - if identifiable is None: - raise ValueError("Identifiable has to be given as argument") - - if identifiable in self.remote_existing_cache: - return 
self.remote_existing_cache[identifiable] - elif identifiable in self.remote_missing_cache: - return self.remote_missing_cache[identifiable] - else: - return None - - def add_to_remote_missing_cache(self, record: db.Record, identifiable: Identifiable): - """ - stores the given Record in the remote_missing_cache. - - If identifiable is None, the Record is NOT stored. - """ - self.add_to_cache(record=record, cache=self.remote_missing_cache, - identifiable=identifiable) - - def add_to_remote_existing_cache(self, record: db.Record, identifiable: Identifiable): - """ - stores the given Record in the remote_existing_cache. - - If identifiable is None, the Record is NOT stored. - """ - self.add_to_cache(record=record, cache=self.remote_existing_cache, - identifiable=identifiable) - - def add_to_cache(self, record: db.Record, cache: IdentifiedCache, - identifiable: Identifiable) -> None: - """ - stores the given Record in the given cache. - - If identifiable is None, the Record is NOT stored. - """ - if identifiable is not None: - cache.add(identifiable=identifiable, record=record) - - @staticmethod - def bend_references_to_new_object(old, new, entities): - """ Bend references to the other object - Iterate over all entities in `entities` and check the values of all properties of - occurances of old Entity and replace them with new Entity - """ - for el in entities: - for p in el.properties: - if isinstance(p.value, list): - for index, val in enumerate(p.value): - if val is old: - p.value[index] = new - else: - if p.value is old: - p.value = new - - @staticmethod - def create_reference_mapping(flat: list[db.Entity]): - """ - Create a dictionary of dictionaries of the form: - dict[int, dict[str, list[db.Entity]]] - - - The integer index is the Python id of the value object. - - The string is the name of the first parent of the referencing object. - Each value objects is taken from the values of all properties from the list flat. + # check remote server + identified_record = ( + st.identifiableAdapter.retrieve_identified_record_for_identifiable( + se.identifiable)) + remote_id = None + if identified_record is not None: + remote_id = identified_record.id + # set id of node. if node is missing, remote_id is None and the SyncGraph marks it + # as missing + st.set_id_of_node(se, remote_id) + entity_was_treated = True + break # one or more nodes were just removed from st.unchecked -> back to start - So the returned mapping maps ids of entities to the objects which are referring - to them. - """ - # TODO we need to treat children of RecordTypes somehow. 
- references: dict[int, dict[str, list[db.Entity]]] = {} - for ent in flat: - for p in ent.properties: - val = p.value - if not isinstance(val, list): - val = [val] - for v in val: - if isinstance(v, db.Entity): - if id(v) not in references: - references[id(v)] = {} - if ent.parents[0].name not in references[id(v)]: - references[id(v)][ent.parents[0].name] = [] - references[id(v)][ent.parents[0].name].append(ent) - - return references - - def split_into_inserts_and_updates(self, ent_list: list[db.Entity]): - to_be_inserted: list[db.Entity] = [] - to_be_updated: list[db.Entity] = [] - flat = Crawler.create_flat_list(ent_list) - - # TODO: can the following be removed at some point - for ent in flat: - if ent.role == "Record" and len(ent.parents) == 0: - raise RuntimeError(f"Records must have a parent.\n{ent}") - - resolved_references = True - # flat contains Entities which could not yet be checked against the remote server - try_to_merge_later = [] - while resolved_references and len(flat) > 0: - resolved_references = False - referencing_entities = self.create_reference_mapping( - flat + to_be_updated + try_to_merge_later + to_be_inserted) - - # For each element we try to find out whether we can find it in the server or whether - # it does not yet exist. Since a Record may reference other unkown Records it might not - # be possible to answer this right away. - # The following checks are done on each Record: - # 1. Can it be identified via an ID? - # 2. Can it be identified via a path? - # 3. Is it in the cache of already checked Records? - # 4. Can it be checked on the remote server? - # 5. Does it have to be new since a needed reference is missing? - for i in reversed(range(len(flat))): - record = flat[i] - identifiable = self.identifiableAdapter.get_identifiable( - record, - referencing_entities=referencing_entities) - - # TODO remove if the exception is never raised - if record in to_be_inserted: - raise RuntimeError("This should not be reached since treated elements" - "are removed from the list") - # 1. Can it be identified via an ID? - elif record.id is not None: - to_be_updated.append(record) - self.add_to_remote_existing_cache(record, identifiable) - del flat[i] - # 2. Can it be identified via a path? - elif record.path is not None: - try: - existing = cached_get_entity_by(path=record.path) - except EmptyUniqueQueryError: - existing = None - if existing is None: - to_be_inserted.append(record) - self.add_to_remote_missing_cache(record, identifiable) - del flat[i] - else: - record.id = existing.id - # TODO check the following copying of _size and _checksum - # Copy over checksum and size too if it is a file - record._size = existing._size - record._checksum = existing._checksum - to_be_updated.append(record) - self.add_to_remote_existing_cache(record, identifiable) - del flat[i] - # 3. Is it in the cache of already checked Records? - elif self.get_from_any_cache(identifiable) is not None: - newrecord = self.get_from_any_cache(identifiable) - # Since the identifiables are the same, newrecord and record actually describe - # the same obejct. - # We merge the two in order to prevent loss of information - try: - merge_entities( - newrecord, record, merge_references_with_empty_diffs=False, merge_id_with_resolved_entity=True) - except EntityMergeConflictError: - _treat_merge_error_of(newrecord, record) - # We cannot merge but it is none of the clear case where merge is - # impossible. 
Thus we try later - try_to_merge_later.append(record) - if newrecord.id is not None: - record.id = newrecord.id - except NotImplementedError: - print(newrecord) - print(record) - raise - Crawler.bend_references_to_new_object( - old=record, new=newrecord, - entities=flat + to_be_updated + to_be_inserted + try_to_merge_later - ) - referencing_entities = self.create_reference_mapping( - flat + to_be_updated + try_to_merge_later + to_be_inserted) - - del flat[i] - resolved_references = True - - # 4. Can it be checked on the remote server? - elif not self._has_reference_value_without_id(identifiable): - identified_record = ( - self.identifiableAdapter.retrieve_identified_record_for_identifiable( - identifiable)) - if identified_record is None: - # identifiable does not exist remotely -> record needs to be inserted - to_be_inserted.append(record) - self.add_to_remote_missing_cache(record, identifiable) - del flat[i] - else: - # side effect - record.id = identified_record.id - to_be_updated.append(record) - self.add_to_remote_existing_cache(record, identifiable) - del flat[i] - resolved_references = True - - # 5. Does it have to be new since a needed reference is missing? - # (Is it impossible to check this record because an identifiable references a - # missing record?) - elif self._has_missing_object_in_references(identifiable, referencing_entities): - to_be_inserted.append(record) - self.add_to_remote_missing_cache(record, identifiable) - del flat[i] - resolved_references = True - - for record in flat: - self.replace_references_with_cached(record, referencing_entities) - - # We postponed the merge for records where it failed previously and try it again now. # This only might add properties of the postponed records to the already used ones. - for record in try_to_merge_later: - identifiable = self.identifiableAdapter.get_identifiable( - record, - referencing_entities=referencing_entities) - newrecord = self.get_from_any_cache(identifiable) - merge_entities(newrecord, record, merge_id_with_resolved_entity=True) - if len(flat) > 0: - circle = self.detect_circular_dependency(flat) - if circle is None: - logger.error("Failed, but found NO circular dependency. The data is as follows:" - + str(self.compact_entity_list_representation(flat))) - else: - logger.error("Found circular dependency (Note that this might include references " - "that are not identifying properties): " - + self.compact_entity_list_representation(circle)) + if len(st.unchecked) > 0: + # circle = st.unchecked_contains_circular_dependency() + # if circle is None: + # logger.error("Failed, but found NO circular dependency. The data is as follows:" + # + "\n".join([str(el) for el in st.unchecked]) + + # ) + # else: + # logger.error("Found circular dependency (Note that this might include references " + # "that are not identifying properties): " + # + "\n".join([str(el) for el in st.unchecked]) + # ) + raise RuntimeError( - f"Could not finish split_into_inserts_and_updates. Circular dependency: " - f"{circle is not None}") + "Could not finish _split_into_inserts_and_updates. 
" + "It might be due to a circular dependency") - return to_be_inserted, to_be_updated + return st.export_record_lists() def replace_entities_with_ids(self, rec: db.Record): for el in rec.properties: @@ -738,53 +363,36 @@ class Crawler(object): el.value[index] = val.id @staticmethod - def compact_entity_list_representation(circle): + def compact_entity_list_representation(entities, referencing_entities: List) -> str: """ a more readable representation than the standard xml representation TODO this can be removed once the yaml format representation is in pylib """ text = "\n--------\n" - for el in circle: - if el.name is not None: - text += f"{el.name}\n" - text += f"{[el.name for el in el.parents]}\n" - props = {p.name: p.value for p in el.properties} - text += f"{props}\n" - - return text + "--------\n" - @staticmethod - def detect_circular_dependency(flat: list[db.Entity]): - """ - Detects whether there are circular references in the given entity list and returns a list - where the entities are ordered according to the chain of references (and only the entities - contained in the circle are included. Returns None if no circular dependency is found. - - TODO: for the sake of detecting problems for split_into_inserts_and_updates we should only - consider references that are identifying properties. - """ - circle = [flat[0]] - closed = False - while not closed: - current = circle[-1] - added_to_circle = False - for p in current.properties: - if isinstance(p.value, list): - for pval in p.value: - if pval in flat: - if pval in circle: - closed = True - circle.append(pval) - added_to_circle = True + grouped = {"": []} + for ent in entities: + if not ent.parents: + grouped[""].append(ent) + for parent in ent.parents: + if parent.name not in grouped: + grouped[parent.name] = [] + grouped[parent.name].append(ent) + if not grouped[""]: + del grouped[""] + for parent, group in grouped.items(): + text += f"\n> Parent: {parent}\n" + for ent in group: + if ent.name is not None: + text += f"\n>> Name: {ent.name}\n" else: - if p.value in flat: - if p.value in circle: - closed = True - circle.append(p.value) - added_to_circle = True - if not added_to_circle: - return None - return circle + text += "\n>> name: # No name" + text += f"{[ent.name for ent in ent.parents]}\n" + props = {p.name: p.value for p in ent.properties} + text += f"{props}\n" + text += f"is_referenced_by:\n{referencing_entities[id(ent)]}\n" + + return text + "--------\n" @staticmethod def _merge_properties_from_remote( @@ -900,8 +508,8 @@ class Crawler(object): @staticmethod def _get_property_id_for_datatype(rtname: str, name: str): return cached_get_entity_by( - query=f"FIND Entity '{escape_quoted_text(rtname)}' " - f"with name='{escape_quoted_text(name)}'").id + query=f"FIND Entity '{escape_squoted_text(rtname)}' " + f"with name='{escape_squoted_text(name)}'").id @staticmethod def replace_name_with_referenced_entity_id(prop: db.Property): @@ -923,8 +531,8 @@ class Crawler(object): prop.value = Crawler._get_property_id_for_datatype( rtname=prop.datatype, name=prop.value) except (db.EmptyUniqueQueryError, db.QueryNotUniqueError): - logger.error("The Property {prop.name} with datatype={prop.datatype} has the " - "value {prop.value} and there is no appropriate Entity with such " + logger.error(f"The Property {prop.name} with datatype={prop.datatype} has the " + f"value {prop.value} and there is no appropriate Entity with such " "a name.") raise else: @@ -936,12 +544,12 @@ class Crawler(object): if isinstance(el, str): try: # the 
get_entity function will raise an error if not unique - propval.append(Crawler._get_property_id_for_datatype(rtname=prop.datatype, + propval.append(Crawler._get_property_id_for_datatype(rtname=dt, name=el)) except (db.EmptyUniqueQueryError, db.QueryNotUniqueError): logger.error( - "The Property {prop.name} with datatype={prop.datatype} has the " - "value {prop.value} and there is no appropriate Entity with such " + f"The Property {prop.name} with datatype={prop.datatype} has the " + f"value {prop.value} and there is no appropriate Entity with such " "a name.") raise else: @@ -986,6 +594,9 @@ class Crawler(object): unique_names=True): Crawler.set_ids_and_datatype_of_parents_and_properties(to_be_updated) logger.debug("UPDATE") + # Here, it's probably much more reasonable to show a diff of the update: + # from linkahead.apiutils import compare_entities + # [compare_entities(c, db.Record(id=c.id).retrieve()) for c in to_be_updated] logger.debug(to_be_updated) if len(to_be_updated) > 0: if securityMode.value > SecurityMode.INSERT.value: @@ -1010,7 +621,7 @@ class Crawler(object): crawled_data: Optional[list[db.Record]] = None, no_insert_RTs: Optional[list[str]] = None, no_update_RTs: Optional[list[str]] = None, - path_for_authorized_run: Optional[str] = "", + path_for_authorized_run: Optional[Union[str, list[str]]] = "", ): """ This function applies several stages: @@ -1032,7 +643,7 @@ class Crawler(object): no_update_RTs : list[str], optional list of RecordType names. Records that have one of those RecordTypes as parent will not be updated - path_for_authorized_run : str, optional + path_for_authorized_run : str or list[str], optional only used if there are changes that need authorization before being applied. The form for rerunning the crawler with the authorization of these changes will be generated with this path. See @@ -1046,12 +657,18 @@ class Crawler(object): """ if crawled_data is None: warnings.warn(DeprecationWarning( - "Calling synchronize without the data to be synchronized is depricated. Please " + "Calling synchronize without the data to be synchronized is deprecated. Please " "use for example the Scanner to create this data.")) crawled_data = self.crawled_data - to_be_inserted, to_be_updated = self.split_into_inserts_and_updates(crawled_data) - referencing_entities = self.create_reference_mapping(to_be_updated + to_be_inserted) + if isinstance(path_for_authorized_run, list) and self.securityMode != SecurityMode.UPDATE: + raise NotImplementedError( + "Authorization of inserts and updates is currently implemented only " + "for single paths, not for lists of paths." + ) + + to_be_inserted, to_be_updated = self._split_into_inserts_and_updates( + SyncGraph(crawled_data, self.identifiableAdapter)) for el in to_be_updated: # all entity objects are replaced by their IDs except for the not yet inserted ones @@ -1081,9 +698,11 @@ class Crawler(object): upd_problems = [] if len(ins_problems) > 0 or len(upd_problems) > 0: raise ForbiddenTransaction( - "One or more Records that have a parent which is excluded from inserts or updates." - f"\nRecords excluded from inserts have the following RecordTypes:\n{[el.parents[0].name for el in ins_problems]}" - f"\nRecords excluded from updates have the following RecordTypes:\n{[el.parents[0].name for el in upd_problems]}" + "One or more Records have a parent which is excluded from inserts or updates." 
+ f"\nRecords excluded from inserts have the following RecordTypes:\n" + f"{[el.parents[0].name for el in ins_problems]}" + f"\nRecords excluded from updates have the following RecordTypes:\n" + f"{[el.parents[0].name for el in upd_problems]}" ) logger.info(f"Going to insert {len(to_be_inserted)} Entities and update " @@ -1092,14 +711,14 @@ class Crawler(object): cache_clear() self.execute_parent_updates_in_list(to_be_updated, securityMode=self.securityMode, run_id=self.run_id, unique_names=unique_names) - logger.info(f"Added parent RecordTypes where necessary.") + logger.info("Added parent RecordTypes where necessary.") self.execute_inserts_in_list( to_be_inserted, self.securityMode, self.run_id, unique_names=unique_names) - logger.info(f"Executed inserts:\n" + logger.info("Executed inserts:\n" + self.create_entity_summary(to_be_inserted)) self.execute_updates_in_list( to_be_updated, self.securityMode, self.run_id, unique_names=unique_names) - logger.info(f"Executed updates:\n" + logger.info("Executed updates:\n" + self.create_entity_summary(to_be_updated)) update_cache = UpdateCache() @@ -1138,9 +757,32 @@ class Crawler(object): def inform_about_pending_changes(pending_changes, run_id, path, inserts=False): # Sending an Email with a link to a form to authorize updates is if get_config_setting("send_crawler_notifications"): - filename = OldCrawler.save_form( - [el[3] for el in pending_changes], path, run_id) - OldCrawler.send_mail([el[3] for el in pending_changes], filename) + filename = OldCrawler.save_form([el[3] for el in pending_changes], path, run_id) + link_address = get_shared_resource_link(db.configuration.get_config()[ + "Connection"]["url"], filename) + changes = "\n".join([el[3] for el in pending_changes]) + text = f"""Dear Curator, + there where changes that need your authorization. Please check the following + carefully and if the changes are ok, click on the following link: + + {link_address} + + {changes} + """ + try: + fro = get_config_setting("sendmail_from_address") + to = get_config_setting("sendmail_to_address") + except KeyError: + logger.error("Server Configuration is missing a setting for " + "sending mails. The administrator should check " + "'from_mail' and 'to_mail'.") + return + + send_mail( + from_addr=fro, + to=to, + subject="Crawler Update", + body=text) for i, el in enumerate(pending_changes): @@ -1173,7 +815,7 @@ ____________________\n""".format(i + 1, len(pending_changes)) + str(el[3])) res[converter.name]["subtree"][k[0]] = d[k[0]] return res - def save_debug_data(self, filename: str, debug_tree: DebugTree = None): + def save_debug_data(self, filename: str, debug_tree: Optional[DebugTree] = None): """ Save the information contained in a debug_tree to a file named filename. 
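        A sketch of typical use (directory and file names are placeholders;
        ``scan_directory`` is assumed to accept the ``debug_tree`` keyword as in
        ``crawler_main``)::

            from caoscrawler.crawl import Crawler
            from caoscrawler.debug_tree import DebugTree
            from caoscrawler.scanner import scan_directory

            debug_tree = DebugTree()
            records = scan_directory("/data/experiments", "cfood.yml", debug_tree=debug_tree)

            # Persist the collected provenance information for later inspection.
            Crawler().save_debug_data("provenance.yml", debug_tree=debug_tree)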
""" @@ -1232,13 +874,13 @@ def _update_status_record(run_id, n_inserts, n_updates, status): cr_rec.get_property('status').value = status (cr_rec .add_property(db.execute_query( - f"FIND Property with name='number_of_inserted_entities'", unique=True).id, + "FIND Property with name='number_of_inserted_entities'", unique=True).id, n_inserts) .add_property( - db.execute_query(f"FIND Property with name='number_of_updated_entities'", + db.execute_query("FIND Property with name='number_of_updated_entities'", unique=True).id, n_updates) .add_property( - db.execute_query(f"FIND Property with name='finished'", + db.execute_query("FIND Property with name='finished'", unique=True).id, datetime.now().isoformat())) cr_rec.update() @@ -1251,6 +893,7 @@ def _notify_about_inserts_and_updates(n_inserts, n_updates, logfile, run_id): The email contains some basic information and a link to the log and the CrawlerRun Record. """ if not get_config_setting("send_crawler_notifications"): + logger.debug("Crawler email notifications are disabled.") return if n_inserts == 0 and n_updates == 0: return @@ -1261,11 +904,11 @@ the CaosDB Crawler successfully crawled the data and """ + domain = get_config_setting("public_host_url") if get_config_setting("create_crawler_status_records"): - domain = get_config_setting("public_host_url") text += ("You can checkout the CrawlerRun Record for more information:\n" f"{domain}/Entity/?P=0L10&query=find%20crawlerrun%20with%20run_id=%27{run_id}%27\n\n") - text += (f"You can download the logfile here:\n{domain}/Shared/" + logfile) + text += (f"You can download the logfile here:\n{get_shared_resource_link(domain, logfile)}") send_mail( from_addr=get_config_setting("sendmail_from_address"), to=get_config_setting("sendmail_to_address"), @@ -1367,7 +1010,7 @@ def _store_dry_run_data(ins, upd): "update": updates})) -def crawler_main(crawled_directory_path: str, +def crawler_main(crawled_directory_path: Union[str, list[str]], cfood_file_name: str, identifiables_definition_file: Optional[str] = None, debug: bool = False, @@ -1375,25 +1018,26 @@ def crawler_main(crawled_directory_path: str, dry_run: bool = False, prefix: str = "", securityMode: SecurityMode = SecurityMode.UPDATE, - unique_names=True, + unique_names: bool = True, restricted_path: Optional[list[str]] = None, remove_prefix: Optional[str] = None, add_prefix: Optional[str] = None, + sss_max_log_level: Optional[int] = None, ): """ Parameters ---------- - crawled_directory_path : str - path to be crawled + crawled_directory_path : str or list[str] + path(s) to be crawled cfood_file_name : str filename of the cfood to be used identifiables_definition_file : str filename of an identifiable definition yaml file debug : bool - DEPRECATED, whether or not to run in debug mode + DEPRECATED, use a provenance file instead. provenance_file : str - provenance information will be stored in a file with given filename + Provenance information will be stored in a file with given filename dry_run : bool do not commit any chnages to the server prefix : str @@ -1412,6 +1056,12 @@ def crawler_main(crawled_directory_path: str, add_prefix : Optional[str] Add the given prefix to file paths. See docstring of '_fix_file_paths' for more details. + sss_max_log_level : Optional[int] + If given, set the maximum log level of the server-side + scripting log separately from the general ``debug`` option. 
If + None is given, the maximum sss log level will be determined + from the value of ``debug``: ``logging.INFO`` if ``debug`` is + False, ``logging.DEBUG`` if ``debug`` is True. Returns ------- @@ -1421,11 +1071,23 @@ def crawler_main(crawled_directory_path: str, try: crawler = Crawler(securityMode=securityMode) - # setup logging and reporting if serverside execution - if "SHARED_DIR" in os.environ: - userlog_public, htmluserlog_public, debuglog_public = configure_server_side_logging() + if "SHARED_DIR" in os.environ: # setup logging and reporting if serverside execution + if sss_max_log_level is None: + sss_max_log_level = logging.DEBUG if debug else logging.INFO + userlog_public, htmluserlog_public, debuglog_public = configure_server_side_logging( + max_log_level=sss_max_log_level + ) + # TODO make this optional _create_status_record( - get_config_setting("public_host_url") + "/Shared/" + htmluserlog_public, crawler.run_id) + get_shared_resource_link(get_config_setting("public_host_url"), htmluserlog_public), + crawler.run_id) + else: # setup stdout logging for other cases + root_logger = logging.getLogger() + root_logger.setLevel(level=(logging.DEBUG if debug else logging.INFO)) + handler = logging.StreamHandler(stream=sys.stdout) + handler.setLevel(logging.DEBUG if debug else logging.INFO) + root_logger.addHandler(handler) + logger.handlers.clear() debug_tree = DebugTree() crawled_data = scan_directory( @@ -1433,13 +1095,17 @@ def crawler_main(crawled_directory_path: str, _fix_file_paths(crawled_data, add_prefix, remove_prefix) _check_record_types(crawled_data) - if provenance_file is not None and debug: + if provenance_file is not None: crawler.save_debug_data(debug_tree=debug_tree, filename=provenance_file) if identifiables_definition_file is not None: ident = CaosDBIdentifiableAdapter() ident.load_from_yaml_definition(identifiables_definition_file) crawler.identifiableAdapter = ident + else: + # TODO + # raise ValueError("An identifiable file is needed.") + pass remove_prefix = _treat_deprecated_prefix(prefix, remove_prefix) @@ -1455,24 +1121,28 @@ def crawler_main(crawled_directory_path: str, crawler.run_id) _update_status_record(crawler.run_id, len(inserts), len(updates), status="OK") return 0 - except ForbiddenTransaction as err: - logger.debug(traceback.format_exc()) - logger.error(err) - _update_status_record(crawler.run_id, 0, 0, status="FAILED") - return 1 - except ConverterValidationError as err: - logger.debug(traceback.format_exc()) - logger.error(err) - _update_status_record(crawler.run_id, 0, 0, status="FAILED") - return 1 except Exception as err: logger.debug(traceback.format_exc()) - logger.debug(err) - - if "SHARED_DIR" in os.environ: - domain = get_config_setting("public_host_url") - logger.error("Unexpected Error: Please tell your administrator about this and provide the" - f" following path.\n{domain}/Shared/" + debuglog_public) + logger.error(err) + # Special treatment for known error types + if isinstance(err, ImpossibleMergeError): + logger.error( + "Encountered conflicting information when creating Records from the crawled " + f"data:\n\n{err}" + ) + elif isinstance(err, TransactionError): + logger.error("Transaction error details:") + for suberr in err.errors: + logger.error("---") + logger.error(suberr.msg) + logger.error(suberr.entity) + # Unkown errors get a special message + elif not isinstance(err, (ConverterValidationError, ForbiddenTransaction)): + if "SHARED_DIR" in os.environ: + # pylint: disable=E0601 + domain = get_config_setting("public_host_url") 
+ logger.error("Unexpected Error: Please tell your administrator about this and provide " + f"the following path.\n{get_shared_resource_link(domain, debuglog_public)}") _update_status_record(crawler.run_id, 0, 0, status="FAILED") return 1 @@ -1496,6 +1166,7 @@ def parse_args(): "This file will only be generated if this option is set.") parser.add_argument("--debug", required=False, action="store_true", help="Path name of the cfood yaml file to be used.") + # TODO allow to provide multiple directories to be crawled on the commandline parser.add_argument("crawled_directory_path", help="The subtree of files below the given path will " "be considered. Use '/' for everything.") diff --git a/src/caoscrawler/debug_tree.py b/src/caoscrawler/debug_tree.py index 9983981c69e3df7c58ddfda4b6977944eac54999..c154f5b91d850476be0c0610e5bb1dfcbf9866ab 100644 --- a/src/caoscrawler/debug_tree.py +++ b/src/caoscrawler/debug_tree.py @@ -29,35 +29,20 @@ A structure containing debug tree information. from __future__ import annotations -import argparse -import importlib -import logging -import os -import sys -import warnings -import yaml - -from argparse import RawTextHelpFormatter from collections import defaultdict -from copy import deepcopy -from enum import Enum + +import linkahead as db +import yaml from importlib_resources import files from jsonschema import validate -from typing import Any, Optional, Type, Union - -import caosdb as db - -from caosadvancedtools.cache import UpdateCache, Cache -from caosadvancedtools.crawler import Crawler as OldCrawler -from caosdb.apiutils import (compare_entities, EntityMergeConflictError, - merge_entities) -from caosdb.common.datatype import is_reference - -from .converters import Converter, DirectoryConverter, ConverterValidationError +from linkahead.apiutils import (EntityMergeConflictError, compare_entities, + merge_entities) +from linkahead.common.datatype import is_reference +from .converters import Converter, ConverterValidationError, DirectoryConverter from .macros import defmacro_constructor, macro_constructor -from .stores import Store, GeneralStore, RecordStore -from .structure_elements import StructureElement, Directory, NoneElement +from .stores import GeneralStore, RecordStore, Store +from .structure_elements import Directory, NoneElement, StructureElement from .version import check_cfood_version diff --git a/src/caoscrawler/default_converters.yml b/src/caoscrawler/default_converters.yml index e192ab1b3bae70a6772cf6defba4a4592a92e584..656b0ba0f1f76007266cc8b2e75f5bd7046f1206 100644 --- a/src/caoscrawler/default_converters.yml +++ b/src/caoscrawler/default_converters.yml @@ -8,9 +8,15 @@ BooleanElement: Date: converter: DateElementConverter package: caoscrawler.converters +Datetime: + converter: DatetimeElementConverter + package: caoscrawler.converters Dict: converter: DictElementConverter package: caoscrawler.converters +PropertiesFromDictElement: + converter: PropertiesFromDictConverter + package: caoscrawler.converters FloatElement: converter: FloatElementConverter package: caoscrawler.converters @@ -24,7 +30,7 @@ TextElement: converter: TextElementConverter package: caoscrawler.converters - + DictDictElement: # deprecated converter: DictElementConverter package: caoscrawler.converters @@ -60,7 +66,7 @@ File: # deprecated converter: SimpleFileConverter package: caoscrawler.converters - + SimpleFile: converter: SimpleFileConverter package: caoscrawler.converters @@ -81,6 +87,31 @@ CSVTableConverter: converter: CSVTableConverter package: 
caoscrawler.converters +SPSSFile: + converter: SPSSConverter + package: caoscrawler.converters + XLSXTableConverter: converter: XLSXTableConverter package: caoscrawler.converters + + +# ------------------------- +# XML +# ------------------------- + +XMLFile: + converter: XMLFileConverter + package: caoscrawler.converters + +XMLTag: + converter: XMLTagConverter + package: caoscrawler.converters + +XMLTextNode: + converter: XMLTextNodeConverter + package: caoscrawler.converters + +XMLAttributeNode: + converter: XMLAttributeNodeConverter + package: caoscrawler.converters diff --git a/src/caoscrawler/default_transformers.yml b/src/caoscrawler/default_transformers.yml index d0ad23912176bdfbf2446aa6e04bd7fa6b858777..0de9a6e0585c5246fa5a21ffcbdfc37cfdc2b88d 100644 --- a/src/caoscrawler/default_transformers.yml +++ b/src/caoscrawler/default_transformers.yml @@ -1,4 +1,4 @@ - +# Lookup table for matching functions and cfood yaml node names. submatch: package: caoscrawler.transformer_functions @@ -9,3 +9,21 @@ split: replace: package: caoscrawler.transformer_functions function: replace +date_parse: + package: caoscrawler.transformer_functions + function: date_parse +datetime_parse: + package: caoscrawler.transformer_functions + function: datetime_parse +cast_to_int: + package: caoscrawler.transformer_functions + function: cast_to_int +cast_to_float: + package: caoscrawler.transformer_functions + function: cast_to_float +cast_to_bool: + package: caoscrawler.transformer_functions + function: cast_to_bool +cast_to_str: + package: caoscrawler.transformer_functions + function: cast_to_str diff --git a/src/caoscrawler/exceptions.py b/src/caoscrawler/exceptions.py new file mode 100644 index 0000000000000000000000000000000000000000..b9b94e1d4f9064701e8e05e22f5a0d3c6d3291a9 --- /dev/null +++ b/src/caoscrawler/exceptions.py @@ -0,0 +1,97 @@ +#!/usr/bin/env python3 +# encoding: utf-8 +# +# This file is a part of the LinkAhead Project. +# +# Copyright (C) 2024 Indiscale GmbH <info@indiscale.com> +# Copyright (C) 2024 Henrik tom Wörden <h.tomwoerden@indiscale.com> +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <https://www.gnu.org/licenses/>. +# + +from typing import Any + + +class ForbiddenTransaction(Exception): + """Thrown if an transactions is needed that is not allowed. + For example an update of an entity if the security level is INSERT + """ + pass + + +class ImpossibleMergeError(Exception): + """Thrown if due to identifying information, two SyncNodes or two Properties of SyncNodes + should be merged, but there is conflicting information that prevents this. + + Parameters + ---------- + msg : str + A case-specific error message describing where the merger error occurred. + pname : str + The name of the property the values of which caused the merge error. + value_a, value_b : Any + The two values that couldn't be merged. 
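    A conflict might be reported roughly like this (property name and values are
    invented for the example)::

        from caoscrawler.exceptions import ImpossibleMergeError

        err = ImpossibleMergeError(
            "Conflicting information while merging two versions of the same record.",
            pname="date",
            value_a="2024-01-01",
            value_b="2024-02-01",
        )
        print(err)  # names the property 'date' and both conflicting values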
+ + Attributes + ---------- + message : str + A case-specific error message describing where the merger error occurred. + values : tuple[Any] + The two values that couldn't be merged. + pname : str + The name of the property the values of which caused the merge error. + """ + + def __init__(self, msg: str, pname: str, value_a: Any, value_b: Any): + self.pname = pname + self.values = (value_a, value_b) + self.message = msg + super().__init__(self, msg) + + def __str__(self): + return ( + f"{self.message}\n\nThe problematic property is '{self.pname}' with " + f"values '{self.values[0]}' and '{self.values[1]}'." + ) + + def __repr__(self): + return self.__str__() + + +class InvalidIdentifiableYAML(Exception): + """Thrown if the identifiable definition is invalid.""" + pass + + +class MissingIdentifyingProperty(Exception): + """Thrown if a SyncNode does not have the properties required by the corresponding registered + identifiable + """ + pass + + +class MissingRecordType(Exception): + """Thrown if an record type can not be found although it is expected that it exists on the + server. + """ + pass + + +class MissingReferencingEntityError(Exception): + """Thrown if the identifiable requires that some entity references the given entity but there + is no such reference """ + + def __init__(self, *args, rts=None, **kwargs): + self.rts = rts + super().__init__(self, *args, **kwargs) diff --git a/src/caoscrawler/identifiable.py b/src/caoscrawler/identifiable.py index 75af5be8f06a6ab95a4b7f2b92eda8cf3e321a1b..cd52effb954d66bcc69b7296de77ddaf7b2b8394 100644 --- a/src/caoscrawler/identifiable.py +++ b/src/caoscrawler/identifiable.py @@ -1,7 +1,7 @@ #!/usr/bin/env python3 # encoding: utf-8 # -# This file is a part of the CaosDB Project. +# This file is a part of the LinkAhead Project. # # Copyright (C) 2022 Henrik tom Wörden # @@ -20,23 +20,27 @@ # from __future__ import annotations -import caosdb as db -from datetime import datetime + import json -from hashlib import sha256 -from typing import Union import logging +from datetime import datetime +from hashlib import sha256 +from typing import Optional, Union + +import linkahead as db + +from .exceptions import MissingIdentifyingProperty +from .sync_node import SyncNode logger = logging.getLogger(__name__) class Identifiable(): """ - The fingerprint of a Record in CaosDB. + The fingerprint of a Record in LinkAhead. - This class contains the information that is used by the CaosDB Crawler to identify Records. - On one hand, this can be the ID or a Record or the path of a File. - On the other hand, in order to check whether a Record exits in the CaosDB Server, a query can + This class contains the information that is used by the LinkAhead Crawler to identify Records. + In order to check whether a Record exits in the LinkAhead Server, a query can be created using the information contained in the Identifiable. Parameters @@ -46,23 +50,22 @@ class Identifiable(): properties: dict, keys are names of Properties; values are Property values Note, that lists are not checked for equality but are interpreted as multiple conditions for a single Property. - path: str, In case of files: The path where the file is stored. 
backrefs: list, TODO future """ - def __init__(self, record_id: int = None, path: str = None, record_type: str = None, - name: str = None, properties: dict = None, - backrefs: list[Union[int, str]] = None): - if (record_id is None and path is None and name is None + def __init__(self, record_id: Optional[int] = None, record_type: Optional[str] = None, + name: Optional[str] = None, properties: Optional[dict] = None, + backrefs: Optional[list[Union[int, str]]] = None): + if (record_id is None and name is None and (backrefs is None or len(backrefs) == 0) and (properties is None or len(properties) == 0)): - raise ValueError("There is no identifying information. You need to add a path or " - "properties or other identifying attributes.") + raise ValueError( + "There is no identifying information. You need to add " + "properties or other identifying attributes.") if properties is not None and 'name' in [k.lower() for k in properties.keys()]: raise ValueError("Please use the separete 'name' keyword instead of the properties " "dict for name") self.record_id = record_id - self.path = path self.record_type = record_type self.name = name if name == "": @@ -81,20 +84,17 @@ class Identifiable(): def _value_representation(value) -> str: """returns the string representation of property values to be used in the hash function - The string is the path of a File Entity, the CaosDB ID or Python ID of other Entities - (Python Id only if there is no CaosDB ID) and the string representation of bool, float, int - and str. + The string is the LinkAhead ID in case of SyncNode objects (SyncNode objects must have an ID) + and the string representation of None, bool, float, int, datetime and str. """ if value is None: return "None" - elif isinstance(value, db.File): - return str(value.path) - elif isinstance(value, db.Entity): + elif isinstance(value, SyncNode): if value.id is not None: return str(value.id) else: - return "PyID=" + str(id(value)) + raise RuntimeError("Python Entity (SyncNode) without ID not allowed") elif isinstance(value, list): return "[" + ", ".join([Identifiable._value_representation(el) for el in value]) + "]" elif (isinstance(value, str) or isinstance(value, int) or isinstance(value, float) @@ -120,27 +120,20 @@ class Identifiable(): return rec_string def __eq__(self, other) -> bool: - """ - Identifiables are equal if they belong to the same Record. Since ID and path are on their - own enough to identify the Record it is sufficient if those attributes are equal. - 1. both IDs are set (not None) -> equal if IDs are equal - 2. both paths are set (not None) -> equal if paths are equal - 3. 
equal if attribute representations are equal - """ + """ Identifiables are equal if they share the same ID or if the representation is equal """ if not isinstance(other, Identifiable): raise ValueError("Identifiable can only be compared to other Identifiable objects.") - elif self.record_id is not None and other.record_id is not None: + if self.record_id is not None and other.record_id is not None: return self.record_id == other.record_id - elif self.path is not None and other.path is not None: - return self.path == other.path elif self.get_representation() == other.get_representation(): return True else: return False def __repr__(self): - pstring = json.dumps(self.properties) + """ deterministic text representation of the identifiable """ + pstring = json.dumps({k: str(v) for k, v in self.properties.items()}) return (f"{self.__class__.__name__} for RT {self.record_type}: id={self.record_id}; " - f"name={self.name}\n\tpath={self.path}\n" + f"name={self.name}\n" f"\tproperties:\n{pstring}\n" f"\tbackrefs:\n{self.backrefs}") diff --git a/src/caoscrawler/identifiable_adapters.py b/src/caoscrawler/identifiable_adapters.py index ceaa3bfe1f1f040fc0099d17e14a0c6797804ac4..592f603bef508771d734ff633f8cdb2c100742d5 100644 --- a/src/caoscrawler/identifiable_adapters.py +++ b/src/caoscrawler/identifiable_adapters.py @@ -2,7 +2,7 @@ # encoding: utf-8 # # ** header v3.0 -# This file is a part of the CaosDB Project. +# This file is a part of the LinkAhead Project. # # Copyright (C) 2021-2022 Henrik tom Wörden # 2021-2022 Alexander Schlemmer @@ -26,15 +26,20 @@ from __future__ import annotations import logging +import warnings from abc import ABCMeta, abstractmethod from datetime import datetime from typing import Any -import caosdb as db +import linkahead as db import yaml -from caosdb.cached import cached_get_entity_by +from linkahead.cached import cached_get_entity_by, cached_query +from linkahead.utils.escape import escape_squoted_text +from .exceptions import (InvalidIdentifiableYAML, MissingIdentifyingProperty, + MissingRecordType, MissingReferencingEntityError) from .identifiable import Identifiable +from .sync_node import SyncNode from .utils import has_parent logger = logging.getLogger(__name__) @@ -43,21 +48,27 @@ logger = logging.getLogger(__name__) def get_children_of_rt(rtname): """Supply the name of a recordtype. This name and the name of all children RTs are returned in a list""" - return [p.name for p in db.execute_query(f"FIND RECORDTYPE {rtname}")] + escaped = escape_squoted_text(rtname) + recordtypes = [p.name for p in cached_query(f"FIND RECORDTYPE '{escaped}'")] + if not recordtypes: + raise MissingRecordType(f"Record type could not be found on server: {rtname}") + return recordtypes -def convert_value(value: Any): - """ Returns a string representation of the value that is suitable - to be used in the query - looking for the identified record. +def convert_value(value: Any) -> str: + """Return a string representation of the value suitable for the search query. + + This is for search queries looking for the identified record. Parameters ---------- - value : Any type, the value that shall be returned and potentially converted. + value: Any + The value to be converted. Returns ------- - out : the string reprensentation of the value + out: str + the string reprensentation of the value. 
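    A few illustrative conversions (based on the rules implemented below; the
    datetime behaviour is assumed to be ISO formatting)::

        from datetime import datetime

        convert_value(datetime(2024, 5, 24, 12, 0))  # -> "2024-05-24T12:00:00" (assumed)
        convert_value(True)                          # -> "TRUE"
        convert_value(42)                            # -> "42"
        convert_value("it's a text")                 # -> the text with the single quote escaped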
""" @@ -68,8 +79,7 @@ def convert_value(value: Any): elif isinstance(value, bool): return str(value).upper() elif isinstance(value, str): - # replace single quotes, otherwise they may break the queries - return value.replace("\'", "\\'") + return escape_squoted_text(value) else: return str(value) @@ -77,72 +87,166 @@ def convert_value(value: Any): class IdentifiableAdapter(metaclass=ABCMeta): """Base class for identifiable adapters. -Some terms: + Some terms: -- Registered identifiable is the definition of an identifiable which is: - - A record type as the parent - - A list of properties - - A list of referenced by statements -- Identifiable is the concrete identifiable, e.g. the Record based on - the registered identifiable with all the values filled in. -- Identified record is the result of retrieving a record based on the - identifiable from the database. + - A *registered identifiable* defines an identifiable template, for example by specifying: + - Parent record types + - Properties + - ``is_referenced_by`` statements + - An *identifiable* belongs to a concrete record. It consists of identifying attributes which + "fill in" the *registered identifiable*. In code, it can be represented as a Record based on + the *registered identifiable* with all the values filled in. + - An *identified record* is the result of retrieving a record from the database, based on the + *identifiable* (and its values). -General question to clarify: + General question to clarify: -- Do we want to support multiple identifiables per RecordType? -- Current implementation supports only one identifiable per RecordType. + - Do we want to support multiple identifiables per RecordType? + - Current implementation supports only one identifiable per RecordType. -The list of referenced by statements is currently not implemented. + The list of referenced by statements is currently not implemented. -The IdentifiableAdapter can be used to retrieve the three above mentioned objects (registred -identifiabel, identifiable and identified record) for a Record. + The IdentifiableAdapter can be used to retrieve the three above mentioned objects (registered + identifiabel, identifiable and identified record) for a Record. """ @staticmethod - def create_query_for_identifiable(ident: Identifiable): + def create_query_for_identifiable(ident: Identifiable, startswith: bool = False): """ This function is taken from the old crawler: caosdb-advanced-user-tools/src/caosadvancedtools/crawler.py uses the properties of ident to create a query that can determine whether the required record already exists. + + If ``startswith`` is True, use ``LIKE`` for long string values to test if the strings starts + with the first 200 characters of the value. 
""" query_string = "FIND RECORD " if ident.record_type is not None: - query_string += f"'{ident.record_type}'" + escaped_rt = escape_squoted_text(ident.record_type) + query_string += f"'{escaped_rt}'" for ref in ident.backrefs: eid = ref if isinstance(ref, db.Entity): eid = ref.id - query_string += (" WHICH IS REFERENCED BY " + str(eid) + " AND") + query_string += " WHICH IS REFERENCED BY " + str(eid) + " AND" query_string += " WITH " if ident.name is not None: - query_string += "name='{}'".format(convert_value(ident.name)) + query_string += "name='{}'".format(escape_squoted_text(ident.name)) if len(ident.properties) > 0: query_string += " AND " - query_string += IdentifiableAdapter.create_property_query(ident) + query_string += IdentifiableAdapter.create_property_query( + ident, startswith=startswith + ) + + # TODO Can these cases happen at all with the current code? if query_string.endswith(" AND WITH "): - query_string = query_string[:-len(" AND WITH ")] + query_string = query_string[: -len(" AND WITH ")] if query_string.endswith(" AND "): - query_string = query_string[:-len(" AND ")] + query_string = query_string[: -len(" AND ")] return query_string + def all_identifying_properties_exist( + self, node: SyncNode, raise_exception: bool = True + ): + """checks whether all identifying properties exist and raises an error if + that's not the case. It furthermore raises an error if "name" is part of + the identifiable, but the node does not have a name. + + If raise_exception is False, the function returns False instead of raising an error. + + Backreferences are not checked. + + Returns True if all identifying properties exist. + + Last review by Alexander Schlemmer on 2024-05-24. + """ + if node.registered_identifiable is None: + if raise_exception: + parents = [p.name for p in node.parents] + parents_str = "\n".join(f"- {p}" for p in parents) + raise RuntimeError("No registered identifiable for node with these parents:\n" + + parents_str) + else: + return False + for prop in node.registered_identifiable.properties: + if prop.name.lower() == "is_referenced_by": + continue + if prop.name.lower() == "name": + if node.name is None: + if raise_exception: + i = MissingIdentifyingProperty("The node has no name.") + i.prop = "name" + raise i + else: + return False + else: + continue + + # multiple occurances are ok here. We deal with that when actually creating an + # identifiable (IDs of referenced Entities might need to get resolved first). + if ( + len( + [ + el + for el in node.properties + if el.name.lower() == prop.name.lower() + ] + ) + == 0 + ): + if raise_exception: + i = MissingIdentifyingProperty( + f"The property {prop.name} is missing." + ) + i.prop = prop.name + raise i + else: + return False + + return True + + @staticmethod + def __create_pov_snippet(pname: str, pvalue, startswith: bool = False): + """Return something like ``'name'='some value'`` or ``'name' LIKE 'some*'``. + + If ``startswith`` is True, the value of strings will be cut off at 200 characters and a ``LIKE`` + operator will be used to find entities matching at the beginning. 
+ """ + if startswith and isinstance(pvalue, str) and len(pvalue) > 200: + operator_value_str = f" LIKE '{escape_squoted_text(pvalue[:200])}*'" + else: + operator_value_str = "='" + convert_value(pvalue) + "'" + result = "'" + escape_squoted_text(pname) + "'" + operator_value_str + return result + @staticmethod - def create_property_query(entity: Identifiable): + def create_property_query(entity: Identifiable, startswith: bool = False): + """Create a POV query part with the entity's properties. + + Parameters + ---------- + + entity: Identifiable + The Identifiable whose properties shall be used. + + startswith: bool, optional + If True, check string typed properties against the first 200 characters only. Default is False. + """ query_string = "" + pov = IdentifiableAdapter.__create_pov_snippet # Shortcut for pname, pvalue in entity.properties.items(): if pvalue is None: - query_string += "'" + pname + "' IS NULL AND " + query_string += "'" + escape_squoted_text(pname) + "' IS NULL AND " elif isinstance(pvalue, list): for v in pvalue: - query_string += ("'" + pname + "'='" + - convert_value(v) + "' AND ") + query_string += pov(pname, v, startswith=startswith) + " AND " # TODO: (for review) # This code would allow for more complex identifiables with @@ -155,121 +259,147 @@ identifiabel, identifiable and identified record) for a Record. # IdentifiableAdapter.create_property_query(p.value) + # ") AND ") else: - query_string += ("'" + pname + "'='" + - convert_value(pvalue) + "' AND ") + query_string += pov(pname, pvalue, startswith=startswith) + " AND " # remove the last AND return query_string[:-4] @abstractmethod - def get_registered_identifiable(self, record: db.Record): + def get_registered_identifiable(self, record: db.Entity): """ Check whether an identifiable is registered for this record and return its definition. If there is no identifiable registered, return None. """ pass - @abstractmethod - def resolve_reference(self, record: db.Record): - pass - @abstractmethod def get_file(self, identifiable: db.File): + warnings.warn( + DeprecationWarning("This function is deprecated. Please do not use it.") + ) """ Retrieve the file object for a (File) identifiable. """ pass - def get_identifiable(self, record: db.Record, referencing_entities=None): + @staticmethod + def get_identifying_referenced_entities(record, registered_identifiable): + """Create a list of all entities that are referenced by record + and that are used as identying properties of the identifiable. + + Last review by Alexander Schlemmer on 2024-05-29. """ - retrieve the registred identifiable and fill the property values to create an - identifiable + refs = [] + for prop in registered_identifiable.properties: + pname = prop.name.lower() + if pname == "name" or pname == "is_referenced_by": + continue + if record.get_property(prop.name) is None: + logger.error(f"Record with missing identifying property:\n{record}\n" + f"This property is missing: {prop.name}\n") + raise RuntimeError("Missing identifying Property") + pval = record.get_property(prop.name).value + if not isinstance(prop.value, list): + pval = [prop.value] + for val in pval: + if isinstance(val, db.Entity): + refs.append(val) + return refs + + def get_identifiable(self, se: SyncNode, identifiable_backrefs: set[SyncNode]) -> Identifiable: + """ + Take the registered identifiable of given SyncNode ``se`` and fill the property values to + create an identifiable. Args: - record: the record for which the Identifiable shall be created. 
- referencing_entities: a dictionary (Type: dict[int, dict[str, list[db.Entity]]]), that - allows to look up entities with a certain RecordType, that reference ``record`` + se: the SyncNode for which the Identifiable shall be created. + identifiable_backrefs: a set (Type: set[SyncNode]), that contains SyncNodes + with a certain RecordType, that reference ``se`` Returns: Identifiable, the identifiable for record. - """ - registered_identifiable = self.get_registered_identifiable(record) - - if referencing_entities is None: - referencing_entities = {} + Last review by Alexander Schlemmer on 2024-05-29. + """ property_name_list_A = [] - property_name_list_B = [] identifiable_props = {} - identifiable_backrefs = [] - name_is_identifying_property = False - - if registered_identifiable is not None: - # fill the values: - for prop in registered_identifiable.properties: - if prop.name == "name": - # The name can be an identifiable, but it isn't a property - name_is_identifying_property = True - continue - # problem: what happens with multi properties? - # case A: in the registered identifiable - # case B: in the identifiable - - # TODO: similar to the Identifiable class, Registred Identifiable should be a - # separate class too - if prop.name.lower() == "is_referenced_by": - for givenrt in prop.value: - rt_and_children = get_children_of_rt(givenrt) - found = False - for rtname in rt_and_children: - if (id(record) in referencing_entities - and rtname in referencing_entities[id(record)]): - identifiable_backrefs.extend( - referencing_entities[id(record)][rtname]) - found = True - if not found: - # TODO: is this the appropriate error? - raise NotImplementedError( - f"The following record is missing an identifying property:" - f"RECORD\n{record}\nIdentifying PROPERTY\n{prop.name}" - ) - continue - - record_prop = record.get_property(prop.name) - if record_prop is None: - # TODO: how to handle missing values in identifiables - # raise an exception? - # TODO: is this the appropriate error? - raise NotImplementedError( - f"The following record is missing an identifying property:\n" - f"RECORD\n{record}\nIdentifying PROPERTY\n{prop.name}" + name = None + + if se.registered_identifiable is None: + raise ValueError("no registered_identifiable") + + # fill the values: + for prop in se.registered_identifiable.properties: + # TDOO: + # If there are multiproperties in the registered_identifiable, then only the LAST is + # taken into account (later properties overwrite previous one in the dict below). + if prop.name == "name": + name = se.name + continue + + if prop.name.lower() == "is_referenced_by": + for el in identifiable_backrefs: + if not isinstance(el, SyncNode): + raise ValueError("Elements of `identifiable_backrefs` must be SyncNodes") + if len(identifiable_backrefs) == 0: + raise MissingReferencingEntityError( + f"Could not find referencing entities of type(s): {prop.value}\n" + f"for registered identifiable:\n{se.registered_identifiable}\n" + f"There were {len(identifiable_backrefs)} referencing entities to " + "choose from.\n" + f"This error can also occur in case of merge conflicts in the referencing" + " entities." 
) - identifiable_props[record_prop.name] = record_prop.value - property_name_list_A.append(prop.name) - - # check for multi properties in the record: - for prop in property_name_list_A: - property_name_list_B.append(prop) - if (len(set(property_name_list_B)) != len(property_name_list_B) or len( - set(property_name_list_A)) != len(property_name_list_A)): - raise RuntimeError( - "Multi properties used in identifiables could cause unpredictable results and " - "are not allowed. You might want to consider a Property with a list as value.") - - # use the RecordType of the registred Identifiable if it exists + elif len([e.id for e in identifiable_backrefs if el.id is None]) > 0: + raise RuntimeError("Referencing entity has no id") + # At this point we know that there is at least one referencing SyncNode + # with an ID. We do not need to set any property value (the reference will be used + # in the backrefs argument below) and can thus continue with the next identifying + # property + continue + + options = [p.value for p in se.properties if p.name.lower() == prop.name.lower()] + if len(options) == 0: + raise MissingIdentifyingProperty( + f"The following record is missing an identifying property:\n" + f"RECORD\n{se}\nIdentifying PROPERTY\n{prop.name}" + ) + for ii, el in enumerate(options): + if isinstance(el, SyncNode): + options[ii] = el.id + if el.id is None: + raise RuntimeError( + "Reference to unchecked in identifiable:\n" + f"{prop.name}:\n{el}" + ) + else: + options[ii] = el + if not all([f == options[0] for f in options]): + raise RuntimeError("differing prop values ") + + identifiable_props[prop.name] = options[0] + property_name_list_A.append(prop.name) + + # check for multi properties in the record: + if len(set(property_name_list_A)) != len(property_name_list_A): + raise RuntimeError( + "Multi properties used in identifiables could cause unpredictable results and " + "are not allowed. You might want to consider a Property with a list as value." + ) + + # use the RecordType of the registered Identifiable if it exists # We do not use parents of Record because it might have multiple try: return Identifiable( - record_id=record.id, - record_type=(registered_identifiable.parents[0].name - if registered_identifiable else None), - name=record.name if name_is_identifying_property else None, + record_id=se.id, + record_type=se.registered_identifiable.parents[0].name, + name=name, properties=identifiable_props, - path=record.path, - backrefs=identifiable_backrefs + backrefs=[e.id for e in identifiable_backrefs], ) - except Exception: - logger.error(f"Error while creating identifiable for this record:\n{record}") + except Exception as exc: + logger.error(exc) + logger.error(f"Error while creating identifiable for this record:\n{se}") raise @abstractmethod @@ -284,23 +414,29 @@ identifiabel, identifiable and identified record) for a Record. """ pass - def retrieve_identified_record_for_record(self, record: db.Record, referencing_entities=None): - """ - This function combines all functionality of the IdentifierAdapter by - returning the identifiable after having checked for an appropriate - registered identifiable. + @staticmethod + def referencing_entity_has_appropriate_type(parents, register_identifiable): + """returns true if one of the parents is listed by the 'is_referenced_by' property - In case there was no appropriate registered identifiable or no identifiable could - be found return value is None. 
- """ - if record.path is not None: - return cached_get_entity_by(path=record.path) - if record.id is not None: - return cached_get_entity_by(eid=record.id) + This function also returns True if 'is_referenced_by' contains the wildcard '*'. - identifiable = self.get_identifiable(record, referencing_entities=referencing_entities) + Last review by Alexander Schlemmer on 2024-05-29. + """ + if register_identifiable.get_property("is_referenced_by") is None: + return False + if register_identifiable.get_property("is_referenced_by").value is None: + return False - return self.retrieve_identified_record_for_identifiable(identifiable) + appropriate_types = [] + for rt in register_identifiable.get_property("is_referenced_by").value: + appropriate_types.extend(get_children_of_rt(rt)) + appropriate_types = [el.lower() for el in appropriate_types] + if "*" in appropriate_types: + return True + for parent in parents: + if parent.name.lower() in appropriate_types: + return True + return False class LocalStorageIdentifiableAdapter(IdentifiableAdapter): @@ -309,6 +445,11 @@ class LocalStorageIdentifiableAdapter(IdentifiableAdapter): """ def __init__(self): + warnings.warn( + DeprecationWarning( + "This class is deprecated. Please use the CaosDBIdentifiableAdapter." + ) + ) self._registered_identifiables = dict() self._records = [] @@ -323,6 +464,9 @@ class LocalStorageIdentifiableAdapter(IdentifiableAdapter): Just look in records for a file with the same path. """ candidates = [] + warnings.warn( + DeprecationWarning("This function is deprecated. Please do not use it.") + ) for record in self._records: if record.role == "File" and record.path == identifiable.path: candidates.append(record) @@ -334,15 +478,18 @@ class LocalStorageIdentifiableAdapter(IdentifiableAdapter): def store_state(self, filename): with open(filename, "w") as f: - f.write(db.common.utils.xml2str( - db.Container().extend(self._records).to_xml())) + f.write( + db.common.utils.xml2str(db.Container().extend(self._records).to_xml()) + ) def restore_state(self, filename): with open(filename, "r") as f: self._records = db.Container().from_xml(f.read()) # TODO: move to super class? - def is_identifiable_for_record(self, registered_identifiable: db.RecordType, record: db.Record): + def is_identifiable_for_record( + self, registered_identifiable: db.RecordType, record: db.Record + ): """ Check whether this registered_identifiable is an identifiable for the record. @@ -353,8 +500,7 @@ class LocalStorageIdentifiableAdapter(IdentifiableAdapter): Return True in that case and False otherwise. 
""" if len(registered_identifiable.parents) != 1: - raise RuntimeError( - "Multiple parents for identifiables not supported.") + raise RuntimeError("Multiple parents for identifiables not supported.") if not has_parent(record, registered_identifiable.parents[0].name): return False @@ -364,14 +510,13 @@ class LocalStorageIdentifiableAdapter(IdentifiableAdapter): return False return True - def get_registered_identifiable(self, record: db.Record): + def get_registered_identifiable(self, record: db.Entity): identifiable_candidates = [] for _, definition in self._registered_identifiables.items(): if self.is_identifiable_for_record(definition, record): identifiable_candidates.append(definition) if len(identifiable_candidates) > 1: - raise RuntimeError( - "Multiple candidates for an identifiable found.") + raise RuntimeError("Multiple candidates for an identifiable found.") if len(identifiable_candidates) == 0: return None return identifiable_candidates[0] @@ -386,8 +531,9 @@ class LocalStorageIdentifiableAdapter(IdentifiableAdapter): record is the record from the local database to check against. identifiable is the record that was created during the crawler run. """ - if (identifiable.record_type is not None - and not has_parent(record, identifiable.record_type)): + if identifiable.record_type is not None and not has_parent( + record, identifiable.record_type + ): return False for propname, propvalue in identifiable.properties.items(): prop_record = record.get_property(propname) @@ -416,27 +562,12 @@ class LocalStorageIdentifiableAdapter(IdentifiableAdapter): candidates.append(record) if len(candidates) > 1: raise RuntimeError( - f"Identifiable was not defined unambigiously. Possible candidates are {candidates}") + f"Identifiable was not defined unambigiously. Possible candidates are {candidates}" + ) if len(candidates) == 0: return None return candidates[0] - def resolve_reference(self, value: db.Record): - if self.get_registered_identifiable(value) is None: - raise NotImplementedError("Non-identifiable references cannot" - " be used as properties in identifiables.") - # TODO: just resolve the entity - - value_identifiable = self.retrieve_identified_record_for_record(value) - if value_identifiable is None: - raise RuntimeError("The identifiable which is used as property" - " here has to be inserted first.") - - if value_identifiable.id is None: - raise RuntimeError("The entity has not been assigned an ID.") - - return value_identifiable.id - class CaosDBIdentifiableAdapter(IdentifiableAdapter): """ @@ -450,43 +581,58 @@ class CaosDBIdentifiableAdapter(IdentifiableAdapter): def load_from_yaml_definition(self, path: str): """Load identifiables defined in a yaml file""" - with open(path, 'r', encoding="utf-8") as yaml_f: + with open(path, "r", encoding="utf-8") as yaml_f: identifiable_data = yaml.safe_load(yaml_f) + self.load_from_yaml_object(identifiable_data) - for key, value in identifiable_data.items(): - rt = db.RecordType().add_parent(key) - for prop_name in value: + def load_from_yaml_object(self, identifiable_data): + """Load identifiables defined in a yaml object. 
+ """ + + for rt_name, id_list in identifiable_data.items(): + rt = db.RecordType().add_parent(rt_name) + if not isinstance(id_list, list): + raise InvalidIdentifiableYAML( + f"Identifiable contents must be lists, but this was not: {rt_name}") + for prop_name in id_list: if isinstance(prop_name, str): rt.add_property(name=prop_name) elif isinstance(prop_name, dict): for k, v in prop_name.items(): + if k == "is_referenced_by" and not isinstance(v, list): + raise InvalidIdentifiableYAML( + f"'is_referenced_by' must be a list. Found in: {rt_name}") rt.add_property(name=k, value=v) else: - NotImplementedError("YAML is not structured correctly") + raise InvalidIdentifiableYAML( + "Identifiable properties must be str or dict, but this one was not:\n" + f" {rt_name}/{prop_name}") - self.register_identifiable(key, rt) + self.register_identifiable(rt_name, rt) def register_identifiable(self, name: str, definition: db.RecordType): self._registered_identifiables[name] = definition def get_file(self, identifiable: Identifiable): + warnings.warn( + DeprecationWarning("This function is deprecated. Please do not use it.") + ) # TODO is this needed for Identifiable? # or can we get rid of this function? if isinstance(identifiable, db.Entity): return cached_get_entity_by(path=identifiable) if identifiable.path is None: raise RuntimeError("Path must not be None for File retrieval.") - candidates = db.execute_query("FIND File which is stored at '{}'".format( - identifiable.path)) + candidates = cached_get_entity_by(path=identifiable.path) if len(candidates) > 1: raise RuntimeError("Identifiable was not defined unambigiously.") if len(candidates) == 0: return None return candidates[0] - def get_registered_identifiable(self, record: db.Record): + def get_registered_identifiable(self, record: db.Entity): """ - returns the registred identifiable for the given Record + returns the registered identifiable for the given Record It is assumed, that there is exactly one identifiable for each RecordType. Only the first parent of the given Record is considered; others are ignored @@ -499,23 +645,37 @@ class CaosDBIdentifiableAdapter(IdentifiableAdapter): if definition.parents[0].name.lower() == rt_name.lower(): return definition - def resolve_reference(self, record: db.Record): - """ - Current implementation just sets the id for this record - as a value. It needs to be verified that references all contain an ID. - """ - if record.id is None: - return record - return record.id - def retrieve_identified_record_for_identifiable(self, identifiable: Identifiable): query_string = self.create_query_for_identifiable(identifiable) - candidates = db.execute_query(query_string) + try: + candidates = cached_query(query_string) + except db.exceptions.HTTPServerError: + query_string = self.create_query_for_identifiable( + identifiable, startswith=True + ) + candidates = cached_query( + query_string + ).copy() # Copy against cache poisoning + + # Test if the candidates really match all properties + for pname, pvalue in identifiable.properties.items(): + popme = [] + for i in range(len(candidates)): + this_prop = candidates[i].get_property(pname) + if this_prop is None: + popme.append(i) + continue + if not this_prop.value == pvalue: + popme.append(i) + for i in reversed(popme): + candidates.pop(i) + if len(candidates) > 1: raise RuntimeError( - f"Identifiable was not defined unambigiously.\n{query_string}\nReturned the " + f"Identifiable was not defined unambiguously.\n{query_string}\nReturned the " f"following {candidates}." 
- f"Identifiable:\n{identifiable.record_type}{identifiable.properties}") + f"Identifiable:\n{identifiable.record_type}{identifiable.properties}" + ) if len(candidates) == 0: return None return candidates[0] diff --git a/src/caoscrawler/identified_cache.py b/src/caoscrawler/identified_cache.py deleted file mode 100644 index aa2d82f8e66c738e737c62f3cc68eaf60127e28b..0000000000000000000000000000000000000000 --- a/src/caoscrawler/identified_cache.py +++ /dev/null @@ -1,65 +0,0 @@ -#!/usr/bin/env python3 -# encoding: utf-8 -# -# ** header v3.0 -# This file is a part of the CaosDB Project. -# -# Copyright (C) 2021 Indiscale GmbH <info@indiscale.com> -# Copyright (C) 2021 Henrik tom Wörden <h.tomwoerden@indiscale.com> -# -# This program is free software: you can redistribute it and/or modify -# it under the terms of the GNU Affero General Public License as -# published by the Free Software Foundation, either version 3 of the -# License, or (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU Affero General Public License for more details. -# -# You should have received a copy of the GNU Affero General Public License -# along with this program. If not, see <https://www.gnu.org/licenses/>. -# -# ** end header -# - - -""" -see class docstring -""" - -from .identifiable import Identifiable -import caosdb as db - - -class IdentifiedCache(object): - """ - This class is like a dictionary where the keys are Identifiables. When you check whether an - Identifiable exists as key this class returns True not only if that exact Python object is - used as a key, but if an Identifiable is used as key that is **equal** to the one being - considered (see __eq__ function of Identifiable). Similarly, if you do `cache[identifiable]` - you get the Record where the key is an Identifiable that is equal to the one in the rectangular - brackets. - - This class is used for Records where we checked the existence in a remote server using - identifiables. If the Record was found, this means that we identified the corresponding Record - in the remote server and the ID of the local object can be set. - To prevent querying the server again and again for the same objects, this cache allows storing - Records that were found on a remote server and those that were not (typically in separate - caches). - """ - - def __init__(self): - self._cache = {} - self._identifiables = [] - - def __contains__(self, identifiable: Identifiable): - return identifiable in self._identifiables - - def __getitem__(self, identifiable: db.Record): - index = self._identifiables.index(identifiable) - return self._cache[id(self._identifiables[index])] - - def add(self, record: db.Record, identifiable: Identifiable): - self._cache[id(identifiable)] = record - self._identifiables.append(identifiable) diff --git a/src/caoscrawler/logging.py b/src/caoscrawler/logging.py index 69ec1fabb97e1d236162552540a35815e25a33fb..b57a067d8635a468df7345365fabbfae9ee0b22f 100644 --- a/src/caoscrawler/logging.py +++ b/src/caoscrawler/logging.py @@ -20,29 +20,46 @@ # along with this program. If not, see <https://www.gnu.org/licenses/>. 
import logging +import sys -from caosadvancedtools.webui_formatter import WebUI_Formatter from caosadvancedtools.serverside.helper import get_shared_filename -import sys +from caosadvancedtools.webui_formatter import WebUI_Formatter -def configure_server_side_logging(): +def configure_server_side_logging(max_log_level: int = logging.INFO): """ Set logging up to save one plain debugging log file, one plain info log file (for users) and a stdout stream with messages wrapped in html elements returns the path to the file with debugging output + + Parameters + ---------- + max_log_level : int, optional + The maximum log level to use for SSS-logs. Default is + ``logging.INFO``. + + Returns + ------- + userlog_public, htmluserlog_public, debuglog_public: str + Public paths of the respective log files. """ adv_logger = logging.getLogger("caosadvancedtools") - adv_logger.setLevel(level=logging.DEBUG) + # The max_<level> variables will be used to set the logger levels + # to the respective maximum of intended level and max_log_level, + # effectively cutting off logging above the specified + # max_log_level. + max_info = max(logging.INFO, max_log_level) + max_debug = max(logging.DEBUG, max_log_level) + adv_logger.setLevel(level=max_debug) cr_logger = logging.getLogger("caoscrawler") - cr_logger.setLevel(level=logging.DEBUG) + cr_logger.setLevel(level=max_debug) userlog_public, userlog_internal = get_shared_filename("userlog.txt") root_logger = logging.getLogger() - root_logger.setLevel(level=logging.INFO) + root_logger.setLevel(level=max_info) # this is a log file with INFO level for the user user_file_handler = logging.FileHandler(filename=userlog_internal) diff --git a/src/caoscrawler/macros/macro_yaml_object.py b/src/caoscrawler/macros/macro_yaml_object.py index c6b5de27d7f498d9b1db6b6a90d986487340a880..5d2bc1fe0775499fa8b40a65e115fb4569892e38 100644 --- a/src/caoscrawler/macros/macro_yaml_object.py +++ b/src/caoscrawler/macros/macro_yaml_object.py @@ -25,10 +25,14 @@ # Function to expand a macro in yaml # A. Schlemmer, 05/2022 -from dataclasses import dataclass -from typing import Any, Dict +import re from copy import deepcopy +from dataclasses import dataclass from string import Template +from typing import Any, Dict + +_SAFE_SUBST_PAT = re.compile(r"^\$(?P<key>\w+)$") +_SAFE_SUBST_PAT_BRACES = re.compile(r"^\$\{(?P<key>\w+)}$") @dataclass @@ -53,6 +57,12 @@ def substitute(propvalue, values: dict): Substitution of variables in strings using the variable substitution library from python's standard library. """ + # Simple matches are simply replaced by the raw dict entry. + if match := (_SAFE_SUBST_PAT.fullmatch(propvalue) + or _SAFE_SUBST_PAT_BRACES.fullmatch(propvalue)): + key = match.group("key") + if key in values: + return values[key] propvalue_template = Template(propvalue) return propvalue_template.safe_substitute(**values) diff --git a/src/caoscrawler/scanner.py b/src/caoscrawler/scanner.py index 3104af112a39ea960b5954a61caabbf12a61cbe7..7f2dd64d07ed5a2e13b9835e1b97bbadc308ae22 100644 --- a/src/caoscrawler/scanner.py +++ b/src/caoscrawler/scanner.py @@ -25,7 +25,9 @@ # """ -This is the scanner, the original "_crawl" function from crawl.py. +This is the scanner. + +This was where formerly the ``_crawl(...)`` function from ``crawl.py`` was located. This is just the functionality that extracts data from the file system. 
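Side note on the macro change above (not part of the patch): with the new `_SAFE_SUBST_PAT` handling, a value that consists of exactly one variable reference is replaced by the raw stored object, so non-string types survive substitution; everything else still goes through `string.Template`:

```python
# Behaviour of substitute() after this change:
from caoscrawler.macros.macro_yaml_object import substitute

substitute("$count", {"count": 3})     # -> 3 (the int itself, not "3")
substitute("${count}", {"count": 3})   # -> 3
substitute("n=$count", {"count": 3})   # -> "n=3"  (normal template substitution)
substitute("$missing", {"count": 3})   # -> "$missing"  (unknown keys are left as-is)
```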
""" @@ -37,11 +39,11 @@ import logging import os import warnings from collections.abc import Callable -from typing import Any, Optional, Type, Union +from typing import Any, Optional, Union import jinja2 -import caosdb as db +import linkahead as db import yaml from importlib_resources import files from jsonschema import validate @@ -58,29 +60,45 @@ import pathlib logger = logging.getLogger(__name__) -def load_definition(crawler_definition_path: str): +def load_definition(crawler_definition_path: str) -> dict: """ Load a cfood from a crawler definition defined by crawler definition path and validate it using cfood-schema.yml. + + Arguments: + ---------- + crawler_definition_path: str + Path to the crawler definition file in yaml format. + + Returns: + -------- + dict containing the crawler definition. """ # Load the cfood from a yaml file: - with open(crawler_definition_path, "r") as f: + with open(crawler_definition_path, encoding="utf-8") as f: crawler_definitions = list(yaml.safe_load_all(f)) - crawler_definition = _load_definition_from_yaml_dict( - crawler_definitions) + crawler_definition = _load_definition_from_yaml_dict(crawler_definitions) return _resolve_validator_paths(crawler_definition, crawler_definition_path) -def _load_definition_from_yaml_dict(crawler_definitions: list[dict]): +def _load_definition_from_yaml_dict(crawler_definitions: list[dict]) -> dict: """Load crawler definitions from a list of (yaml) dicts `crawler_definitions` which contains either one or two documents. Doesn't resolve the validator paths in the cfood definition, so for internal and testing use only. + Arguments: + ---------- + crawler_definitions: list[dict] + List of one or two dicts containing (optionally) metadata and the crawler definition. + + Returns: + -------- + dict containing the crawler definition. """ if len(crawler_definitions) == 1: # Simple case, just one document: @@ -134,7 +152,8 @@ def _resolve_validator_paths(definition: dict, definition_path: str): # Validator is given by a path if not value.startswith('/'): # Not an absolute path - definition[key] = os.path.join(os.path.dirname(definition_path), value) + definition[key] = os.path.join( + os.path.dirname(definition_path), value) if not os.path.isfile(definition[key]): # TODO(henrik) capture this in `crawler_main` similar to # `ConverterValidationError`. @@ -263,39 +282,44 @@ def scanner(items: list[StructureElement], crawled_data: Optional[list[db.Record]] = None, debug_tree: Optional[DebugTree] = None, registered_transformer_functions: Optional[dict] = None, - new_debug_tree: Optional[dict] = None): + new_debug_tree: Optional[dict] = None) -> list[db.Record]: """Crawl a list of StructureElements and apply any matching converters. - Formerly known as "_crawl". - - items: structure_elements (e.g. files and folders on one level on the hierarchy) + Formerly known as ``_crawl(...)``. - converters: locally defined converters for - treating structure elements. A locally defined converter could be - one that is only valid for a specific subtree of the originally - cralwed StructureElement structure. - - general_store and record_store: This recursion of the crawl function should only operate on - copies of the global stores of the Crawler object. - - restricted_path: optional, list of strings, traverse the data tree only along the given - path. For example, when a directory contains files a, b and c and b is - given as restricted_path, a and c will be ignroed by the crawler. 
- When the end of the given path is reached, traverse the full tree as - normal. The first element of the list provided by restricted_path should - be the name of the StructureElement at this level, i.e. denoting the - respective element in the items argument. - - registered_transformer_functions: dict + Parameters + ---------- + items: list[StructureElement] + structure_elements (e.g. files and folders on one level on the hierarchy) + + converters: list[Converter] + locally defined converters for treating structure elements. A locally + defined converter could be one that is only valid for a specific subtree + of the originally cralwed StructureElement structure. + + general_store, record_store: GeneralStore, RecordStore, optional + This recursion of the crawl function should only operate on copies of + the global stores of the Crawler object. + + restricted_path : list[str], optional + traverse the data tree only along the given path. For example, when a + directory contains files a, b and c, and b is given as ``restricted_path``, a + and c will be ignored by the crawler. When the end of the given path is + reached, traverse the full tree as normal. The first element of the list + provided by ``restricted_path`` should be the name of the StructureElement + at this level, i.e. denoting the respective element in the items + argument. + + registered_transformer_functions : dict, optional A dictionary of transformer functions that can be used in the "transform" block of a converter and that allows to apply simple transformations to variables extracted either by the current converter or to other variables found in the current variable store. Each function is a dictionary: - - The key is the name of the function to be looked up in the dictionary - of registered transformer functions. - - The value is the function which needs to be of the form: + - The key is the name of the function to be looked up in the dictionary of registered + transformer functions. 
+ - The value is the function which needs to be of the form: def func(in_value: Any, in_parameters: dict) -> Any: pass @@ -328,14 +352,15 @@ def scanner(items: list[StructureElement], pass for element in items: - element_path = os.path.join(*(structure_elements_path + [str(element.get_name())])) + element_path = os.path.join( + *(structure_elements_path + [str(element.get_name())])) logger.debug(f"Dealing with {element_path}") # Store whether this element was matched by at least one converter: at_least_one_matched = False if new_debug_tree is not None: # Stores the matching converters for this element for usage in the new debug tree: matching_converters = [] - + for converter in converters: # type is something like "matches files", replace isinstance with "type_matches" @@ -362,7 +387,8 @@ def scanner(items: list[StructureElement], keys_modified = converter.create_records( general_store_copy, record_store_copy, element) - children = converter.create_children(general_store_copy, element) + children = converter.create_children( + general_store_copy, element) if debug_tree is not None: # add provenance information for each variable @@ -374,21 +400,24 @@ def scanner(items: list[StructureElement], debug_tree.debug_metadata["usage"][str(element)].add( "/".join(converters_path + [converter.name])) mod_info = debug_tree.debug_metadata["provenance"] - for record_name, prop_name in keys_modified: - # TODO: check - internal_id = record_store_copy.get_internal_id( - record_name) - record_identifier = record_name + \ - "_" + str(internal_id) - converter.metadata["usage"].add(record_identifier) - mod_info[record_identifier][prop_name] = ( - structure_elements_path + [element.get_name()], - converters_path + [converter.name]) + # TODO: actually keys_modified must not be None. create_records should + # always return a list. 
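A minimal sketch of such a transformer function; the name `strip_prefix` and its parameter are made up for illustration, only the call signature follows the description above:

```python
# Hypothetical transformer usable in a "transform" block of a converter.
def strip_prefix(in_value, in_parameters: dict):
    """Remove a configurable prefix from the incoming value, if present."""
    prefix = in_parameters.get("prefix", "")
    if isinstance(in_value, str) and in_value.startswith(prefix):
        return in_value[len(prefix):]
    return in_value

registered_transformer_functions = {"strip_prefix": strip_prefix}
```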
+ if keys_modified is not None: + for record_name, prop_name in keys_modified: + # TODO: check + internal_id = record_store_copy.get_internal_id( + record_name) + record_identifier = record_name + \ + "_" + str(internal_id) + converter.metadata["usage"].add(record_identifier) + mod_info[record_identifier][prop_name] = ( + structure_elements_path + [element.get_name()], + converters_path + [converter.name]) sub_debug_tree = None if new_debug_tree is not None: sub_debug_tree = [] - + scanner(children, converter.converters, general_store_copy, record_store_copy, structure_elements_path + [element.get_name()], @@ -425,11 +454,11 @@ def scanner(items: list[StructureElement], for varname, value in storage.items(): dict_store[varname] = {"copied": copied[varname]} dict_store[varname]["value"] = ( - convert_to_python_object(value).serialize()) - + convert_to_python_object(value).serialize()) + converter_dictionary["subtree"] = sub_debug_tree matching_converters.append(converter_dictionary) - + if new_debug_tree is not None: element_dictionary = element.get_dict() if not at_least_one_matched: @@ -439,6 +468,9 @@ def scanner(items: list[StructureElement], element_dictionary["matching_converters"] = matching_converters new_debug_tree.append(element_dictionary) + # Clean up converter: + converter.cleanup() + if restricted_path and not path_found: raise RuntimeError("A 'restricted_path' argument was given that is not contained in " "the data tree") @@ -457,7 +489,7 @@ def scanner(items: list[StructureElement], # -------------------------------------------------------------------------------- -def scan_directory(dirname: str, crawler_definition_path: str, +def scan_directory(dirname: Union[str, list[str]], crawler_definition_path: str, restricted_path: Optional[list[str]] = None, debug_tree: Optional[DebugTree] = None, new_debug_tree: Optional[dict] = None): @@ -471,10 +503,12 @@ def scan_directory(dirname: str, crawler_definition_path: str, Parameters ---------- + dirname: str or list[str] + directory or list of directories to be scanned restricted_path: optional, list of strings - Traverse the data tree only along the given path. When the end of the given path - is reached, traverse the full tree as normal. See docstring of 'scanner' for - more details. + Traverse the data tree only along the given path. When the end + of the given path is reached, traverse the full tree as + normal. See docstring of 'scanner' for more details. 
Returns ------- @@ -487,26 +521,30 @@ def scan_directory(dirname: str, crawler_definition_path: str, converter_registry = create_converter_registry(crawler_definition) # Load and register transformer functions: - registered_transformer_functions = create_transformer_registry(crawler_definition) + registered_transformer_functions = create_transformer_registry( + crawler_definition) if not dirname: raise ValueError( "You have to provide a non-empty path for crawling.") - dir_structure_name = os.path.basename(dirname) + if not isinstance(dirname, list): + dirname = [dirname] + dir_element_list = [] + for dname in dirname: + dir_structure_name = os.path.basename(dname) + + # TODO: needs to be covered somewhere else + crawled_directory = dname + if not dir_structure_name and dname.endswith(os.path.sep): + if dname == os.path.sep: + # Crawling the entire file system + dir_structure_name = "root" + else: + # dirname had a trailing '/' + dir_structure_name = os.path.basename(dname[:-1]) + dir_element_list.append(Directory(dir_structure_name, dname)) - # TODO: needs to be covered somewhere else - crawled_directory = dirname - if not dir_structure_name and dirname.endswith('/'): - if dirname == '/': - # Crawling the entire file system - dir_structure_name = "root" - else: - # dirname had a trailing '/' - dir_structure_name = os.path.basename(dirname[:-1]) - - dir_structure_element = Directory(dir_structure_name, dirname) - - return scan_structure_elements(dir_structure_element, + return scan_structure_elements(dir_element_list, crawler_definition, converter_registry, restricted_path=restricted_path, @@ -522,7 +560,8 @@ def scan_structure_elements(items: Union[list[StructureElement], StructureElemen restricted_path: Optional[list[str]] = None, debug_tree: Optional[DebugTree] = None, registered_transformer_functions: Optional[dict] = None, - new_debug_tree: Optional[dict] = None): + new_debug_tree: Optional[dict] = None) -> ( + list[db.Record]): """ Start point of the crawler recursion. @@ -536,14 +575,14 @@ def scan_structure_elements(items: Union[list[StructureElement], StructureElemen crawler_definition : dict A dictionary representing the crawler definition, possibly from a yaml file. - restricted_path: optional, list of strings - Traverse the data tree only along the given path. When the end of the given path - is reached, traverse the full tree as normal. See docstring of 'scanner' for - more details. + restricted_path: list[str], optional + Traverse the data tree only along the given path. When the end of the + given path is reached, traverse the full tree as normal. See docstring + of 'scanner' for more details. Returns ------- - crawled_data : list + crawled_data : list[db.Record] the final list with the target state of Records. """ @@ -564,7 +603,8 @@ def scan_structure_elements(items: Union[list[StructureElement], StructureElemen new_debug_tree=new_debug_tree ) -def save_debug_tree(new_debug_tree: dict, filename_html: str =None, filename_yaml: str = None): + +def save_debug_tree(new_debug_tree: dict, filename_html: str = None, filename_yaml: str = None): """ This function saves the debug tree given in 'new_debug_tree' to one or two files: - If filename_yaml is given, the tree is saved in yaml format under the given file name. 
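Illustration of the extended `scan_directory` signature (the paths and cfood file name below are placeholders): a single directory or a list of directories can now be passed.

```python
# Single directory (unchanged behaviour):
records = scan_directory("/data/2024", "cfood.yml")

# New: several directories in one crawler run:
records = scan_directory(["/data/2023", "/data/2024"], "cfood.yml")
```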
diff --git a/src/caoscrawler/scripts/__init__.py b/src/caoscrawler/scripts/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/src/caoscrawler/scripts/generators.py b/src/caoscrawler/scripts/generators.py new file mode 100644 index 0000000000000000000000000000000000000000..2bf8a90f5af5086e23b7e7cc35d21a50d8cd511a --- /dev/null +++ b/src/caoscrawler/scripts/generators.py @@ -0,0 +1,246 @@ +# This file is a part of the LinkAhead Project. +# +# Copyright (C) 2024 IndiScale GmbH <info@indiscale.com> +# Copyright (C) 2024 Daniel Hornung <d.hornung@indiscale.com> +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <https://www.gnu.org/licenses/>. + +"""Scripts and functions to generate datamodel yaml files and cfood skeletons. + +For example from actual data files. +""" + +import argparse +import csv +from collections import OrderedDict +from string import Template +from typing import Optional + +import pandas as pd +import yaml + +DM_TEMPLATE = """# auto-generated data model from file "[]{infile}". +# To insert a datamodel into LinkAhead, run: +# +# python3 -m caosadvancedtools.models.parser datamodel.yaml --sync +""" + +HEADER_RT = """ +############### +# RecordTypes # +############### + +DummyRT: + description: Note: Change name and enter description. + recommended_properties: + """ + +CFOOD_TEMPLATE = """ +--- +metadata: + macros: + - !defmacro + # Simple column value -> property rule + name: ColumnValue + params: + name: null + belongsto: BaseElement + type: TextElement + definition: + ${name}: + type: ${type} + match_name: ^${name}$$ + match_value: (?P<val>.*) + records: + ${belongsto}: + ${name}: $$val + - !defmacro + # column value -> reference property + name: ColumnValueReference + params: + name: null + reftype: null # RecordType of the reference + belongsto: BaseElement + type: TextElement # References are always text, right? + definition: + ${name}: + type: ${type} + match_name: ^${name}$$ + match_value: (?P<val>.*) + records: + ${reftype}: + name: $$val + ${belongsto}: + ${name}: $$${reftype} + - !defmacro + # Same as "ColumnValue", but also give name of property. + name: ColumnValuePropname + params: + name: null + propname: null + belongsto: BaseElement + type: TextElement + definition: + ${name}: + type: ${type} + match_name: ^${name}$$ + match_value: (?P<val>.*) + records: + ${belongsto}: + ${propname}: $$val +--- +directory: # corresponds to the directory given to the crawler + type: Directory + match: .* # we do not care how it is named here + records: + DirRecord: # One record for each directory. 
+ subtree: + # This is the file + thisfile: + type: []{file} + match: []{match} + records: + DatFileRecord: # One record for each matching file + role: File + path: $thisfile + file: $thisfile + subtree: + entry: + type: Dict + match: .* # Name is irrelevant + records: + BaseElement: # One BaseElement record for each row in the CSV/TSV file + DatFileRecord: $DatFileRecord + DirRecord: + BaseElement: +$BaseElement + subtree: !macro +""" + + +class _CustomTemplate(Template): + delimiter = "[]" # "$" is used too much by the yaml template. + + +def csv_to_datamodel(infile: str, outfile: str, cfood: Optional[str] = None): + """Parse the input csv and create basic datamodel in ``outfile``. + +Parameters +---------- +cfood: str + If given, also create a cfood skeleton. + """ + sniffer = csv.Sniffer() + with open(infile, encoding="utf-8") as f_infile: + max_sniff = 50000 + sniffed = sniffer.sniff(f_infile.read(max_sniff)) + df = pd.read_table(infile, sep=sniffed.delimiter, quotechar=sniffed.quotechar, + escapechar=sniffed.escapechar) + + properties = OrderedDict() + for colname in df.columns: + column = df[colname] + dtype: Optional[str] = "TEXT" + if pd.api.types.is_bool_dtype(column.dtype): + dtype = "BOOLEAN" + if pd.api.types.is_float_dtype(column.dtype): + dtype = "DOUBLE" + elif pd.api.types.is_integer_dtype(column.dtype): + dtype = "INTEGER" + properties[colname] = { + "datatype": dtype + } + + result = (_CustomTemplate(DM_TEMPLATE).substitute({"infile": infile}) + + HEADER_RT + + " ".join(yaml.dump(dict(properties), # from OrderedDict to dict + allow_unicode=True, + sort_keys=False).splitlines(keepends=True)) + ) + with open(outfile, encoding="utf-8", mode="w") as myfile: + myfile.write(result) + + ################# + # cfood section # + ################# + if cfood: + defs_col_value: list[str] = [] + defs_col_value_ref: list[str] = [] + prefix = " " * 14 + for name, propdef in properties.items(): + def_str = prefix + f"- name: {name}\n" + dtype = None + reftype = None + defs = defs_col_value + # Which type? 
+ if propdef["datatype"] == "BOOLEAN": + dtype = "BooleanElement" + elif propdef["datatype"] == "INTEGER": + dtype = "IntegerElement" + elif propdef["datatype"] == "DOUBLE": + dtype = "FloatElement" + elif propdef["datatype"] == "TEXT": + dtype = None + else: + reftype = propdef["datatype"] + defs = defs_col_value_ref + + # Append according to types: + if reftype: + def_str += prefix + f" reftype: {reftype}\n" + if dtype: + def_str += prefix + f" type: {dtype}\n" + + # Store result + defs.append(def_str) + del defs + + sep = repr(sniffed.delimiter) + sep = f'"{sep[1:-1]}"' + match_str = f"""'.*[ct]sv' + sep: {sep} + # "header": [int] + # "names": [str] + # "index_col": [int] + # "usecols": [int] + # "true_values": [str] + # "false_values": [str] + # "na_values": [str] + # "skiprows": [int] + # "nrows": [int] + # "keep_default_na": [bool] + """ + + cfood_str = (_CustomTemplate(CFOOD_TEMPLATE).substitute({"file": "CSVTableConverter", + "match": match_str}) + + prefix[2:] + "ColumnValue:\n" + "".join(defs_col_value) + + prefix[2:] + "ColumnValueReference:\n" + "".join(defs_col_value_ref) + ) + with open(cfood, encoding="utf-8", mode="w") as myfile: + myfile.write(cfood_str) + + +def _parse_args_csv(): + """Parse the arguments.""" + parser = argparse.ArgumentParser(description="Create datamodel and cfood from CSV files.") + parser.add_argument('-i', '--input', help="The input file.", required=True, dest="infile") + parser.add_argument('-o', '--outfile', help="Yaml filename to save the result", required=True) + parser.add_argument('--cfood', help="Yaml filename to create cfood output in", required=False) + + return parser.parse_args() + + +def csv_to_datamodel_main(): + """The main function for csv data handling.""" + args = _parse_args_csv() + csv_to_datamodel(**vars(args)) diff --git a/src/caoscrawler/structure_elements/__init__.py b/src/caoscrawler/structure_elements/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..351f1069708ec94c0dd27313b6329d89858d4330 --- /dev/null +++ b/src/caoscrawler/structure_elements/__init__.py @@ -0,0 +1,31 @@ +# encoding: utf-8 +# +# This file is a part of the LinkAhead Project. +# +# Copyright (C) 2024 Indiscale GmbH <info@indiscale.com> +# Copyright (C) 2024 Daniel Hornung <d.hornung@indiscale.com> +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <https://www.gnu.org/licenses/>. + +"""Submdule containing all default and optional converters.""" + +from .. 
import utils +from .structure_elements import * + +try: + from .rocrate_structure_elements import ROCrateEntity +except ImportError as err: + ROCrateEntity: type = utils.MissingImport( + name="ROCrateEntity", hint="Try installing with the `rocrate` extra option.", + err=err) diff --git a/src/caoscrawler/structure_elements/rocrate_structure_elements.py b/src/caoscrawler/structure_elements/rocrate_structure_elements.py new file mode 100644 index 0000000000000000000000000000000000000000..66768ad800128297a27f47d672352f21310703e9 --- /dev/null +++ b/src/caoscrawler/structure_elements/rocrate_structure_elements.py @@ -0,0 +1,54 @@ +#!/usr/bin/env python3 +# encoding: utf-8 +# +# ** header v3.0 +# This file is a part of the CaosDB Project. +# +# Copyright (C) 2024 Alexander Schlemmer +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <https://www.gnu.org/licenses/>. +# +# ** end header +# + +from rocrate.model.entity import Entity + +from .structure_elements import StructureElement + + +class ROCrateEntity(StructureElement): + """ + Store entities contained in ROCrates. + """ + + def __init__(self, folder: str, entity: Entity): + """ + Initializes this ROCrateEntity. + + Arguments: + ---------- + folder: str + The folder that contains the ROCrate data. In case of a zipped ROCrate, this + is a temporary folder that the ROCrate was unzipped to. + The folder is the folder containing the ro-crate-metadata.json. + + entity: Entity + The ROCrate entity that is stored in this structure element. + The entity automatically contains an attribute ".crate" + that stores the ROCrate that this entity belongs to. It can be used + e.g. to look up links to other entities (ROCrate.dereference). + """ + super().__init__(entity.properties()["@id"]) + self.folder = folder + self.entity = entity diff --git a/src/caoscrawler/structure_elements.py b/src/caoscrawler/structure_elements/structure_elements.py similarity index 67% rename from src/caoscrawler/structure_elements.py rename to src/caoscrawler/structure_elements/structure_elements.py index f6dd3394f8f4ce04f0f2cd730e9625ad6b3480f3..23e471f763688bf86480b42f471fee54d6538b59 100644 --- a/src/caoscrawler/structure_elements.py +++ b/src/caoscrawler/structure_elements/structure_elements.py @@ -23,16 +23,24 @@ # ** end header # -from typing import Dict as tDict import warnings +import lxml.etree + class StructureElement(object): - """ base class for elements in the hierarchical data structure """ + """Base class for elements in the hierarchical data structure. + +Parameters +---------- + +name: str + The name of the StructureElement. May be used for pattern matching by CFood rules. 
+ """ - def __init__(self, name): + def __init__(self, name: str): # Used to store usage information for debugging: - self.metadata: tDict[str, set[str]] = { + self.metadata: dict[str, set[str]] = { "usage": set() } @@ -53,6 +61,18 @@ class StructureElement(object): class FileSystemStructureElement(StructureElement): + """StructureElement representing an element of a file system, like a directory or a simple file. + +Parameters +---------- + +name: str + The name of the StructureElement. May be used for pattern matching by CFood rules. + +path: str + The path to the file or directory. + """ + def __init__(self, name: str, path: str): super().__init__(name) self.path = path @@ -77,6 +97,7 @@ class Directory(FileSystemStructureElement): class File(FileSystemStructureElement): + """StrutureElement representing a file.""" pass @@ -170,3 +191,53 @@ class DictDictElement(DictElement): def __init__(self, *args, **kwargs): warnings.warn(DeprecationWarning("This class is depricated. Please use DictElement.")) super().__init__(*args, **kwargs) + + +class XMLTagElement(StructureElement): + """ + Stores elements of an XML tree. + """ + + def __init__(self, element: lxml.etree.Element): + super().__init__(element.getroottree().getelementpath(element)) + self.tag = element + + +class XMLTextNode(StructureElement): + """ + Stores text nodes of XML trees. + """ + + def __init__(self, element: lxml.etree.Element): + """ + Initializes this XML text node. + + Please note that, although syntactically similar, it is semantically + different from TextElement: + - TextElements have a meaningful name, e.g. a key in a key-value pair. This name can + be matched using the match_name entry. + - XMLTextNodes just have a text and the name is just for identifying the structure element. + They can only be matched using the match entry in the XMLTextNodeConverter. + """ + super().__init__(element.getroottree().getelementpath(element) + "/text()") + self.tag = element + self.value = element.text + + +class XMLAttributeNode(StructureElement): + """ + Stores text nodes of XML trees. + """ + + def __init__(self, element: lxml.etree.Element, + key: str): + """ + Initializes this XML attribute node. + + element: The xml tree element containing the attribute. + key: The key which identifies the attribute in the list of attributes. + """ + super().__init__(element.getroottree().getelementpath(element) + "@" + key) + self.value = element.attrib[key] + self.key = key + self.tag = element diff --git a/src/caoscrawler/sync_graph.py b/src/caoscrawler/sync_graph.py new file mode 100644 index 0000000000000000000000000000000000000000..a05e6320892239cbe8d7f1d9fbd7949a57f9bccb --- /dev/null +++ b/src/caoscrawler/sync_graph.py @@ -0,0 +1,718 @@ +#!/usr/bin/env python3 +# encoding: utf-8 +# +# This file is a part of the LinkAhead Project. +# +# Copyright (C) 2024 Henrik tom Wörden +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <https://www.gnu.org/licenses/>. 
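A small, hedged example of the new XML structure elements defined above (the XML snippet is invented):

```python
import lxml.etree

from caoscrawler.structure_elements.structure_elements import (
    XMLAttributeNode, XMLTagElement, XMLTextNode)

root = lxml.etree.fromstring('<dataset name="demo">some text</dataset>')

tag = XMLTagElement(root)              # named after the element's path in the tree
text = XMLTextNode(root)               # .value == "some text"
attr = XMLAttributeNode(root, "name")  # .value == "demo", .key == "name"
```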
+# + +""" +A data model class for the graph of entities that shall be created during synchronization of the +crawler. +""" + +from __future__ import annotations + +import logging +import re +from typing import Any, Callable, Optional, Union + +import linkahead as db +from linkahead.cached import cached_get_entity_by +from linkahead.exceptions import EmptyUniqueQueryError + +from .identifiable import Identifiable +from .identifiable_adapters import IdentifiableAdapter +from .sync_node import SyncNode, TempID + +logger = logging.getLogger(__name__) + + +def _set_each_scalar_value( + node: SyncNode, condition: Callable[[Any], bool], value: Any +): + """helper function that conditionally replaces each value element of each property of a node + + If the property value is a list, the replacement is done for each list entry. + The replacement is only performed if the condition that + is provided is fulfilled, i.e. the callable ``condition`` returns True. The callable + ``condition`` must take the property value (or list element) as the sole argument. + + Args: + node (SyncNode): The node which provides the properties (and their values) to operate on. + condition (Callable): A function with one argument which is interpreted as a condition: + Only if it returns True for the property value, the action is + executed. + value (Callable): A function returning a new value that is set as the property value. This + function receives the old value as the single argument. + + Last review by Alexander Schlemmer on 2024-05-24. + """ + for p in node.properties: + if isinstance(p.value, list): + for ii, el in enumerate(p.value): + if condition(el): + p.value[ii] = value(el) + elif condition(p.value): + p.value = value(p.value) + + +class SyncGraph: + """ + A data model class for the graph of entities that shall be created during synchronization of + the crawler. + + The SyncGraph combines nodes in the graph based on their identity in order to create a graph of + objects that can either be inserted or updated in(to) the remote server. This combination of + SyncNodes happens during initialization and later on when the ID of SyncNodes is set. + + When the SyncGraph is initialized, the properties of given entities are scanned and used to + create multiple reference maps that track how SyncNodes reference each other. + These maps are kept up to date when SyncNodes are merged because they are identified with each + other. During initialization, SyncNodes are first merged based on their ID, path or + identifiable. + + When additional information is added to the graph by setting the ID of a node + (via `set_id_of_node`) then the graph is updated accordingly: + - if this information implies that the node is equivalent to another node (e.g. has same ID), + then they are merged + - if knowing that one node does not exist in the remote server, then this might imply that some + other node also does not exist if its identity relies on the latter. + - The new ID might make it possible to create the identifiables of connected nodes and thus + might trigger further merging of nodes based on the new identifiables. + + A SyncGraph should only be manipulated via one function: + - set_id_of_node: a positive integer means the Entity exists, None means it is missing + TODO what about String IDs + + The SyncGraph can be converted back to lists of entities which allow to perform the desired + inserts and updates. + + Usage: + - Initialize the Graph with a list of entities. Those will be converted to the SyncNodes of the + graph. 
+ - SyncNodes that can be merged are automatically merged and SyncNodes where the existence can + be determined are automatically removed from the list of unchecked SyncNodes: + graph.unchecked. + - You manipulate the graph by setting the ID of a SyncNode (either to a valid ID or to None). + For example, you can check whether a SyncNode has an identifiable and then query the remote + server and use the result to set the ID. + - After each manipulation, the graph updates accordingly (see above) + - Ideally, the unchecked list is empty after some manipulation. + - You can export a list of entities to be inserted and one of entities to be updated with + export_record_lists. + + Last review by Alexander Schlemmer on 2024-05-24. + """ + + # General implementation remark: + # There are three cases where an update of one SyncNode can affect other nodes: + # - mark existing (add identifiables) + # - mark missing (add identifiables and add (negative) IDs) + # - merge (add identifiables) + # + # We cannot get an infinite recursion where one update triggers another update and so on + # because updates are conditional: + # Setting an ID removes the node (immediately) from the unchecked list and it is only tried to + # set an ID in _mark_missing if a node is in the uncheck list. Thus, setting the ID once + # prevents future attempts to set the ID of the same node. + # Also, setting an identifiable is only done when needed, i.e. there is no identifiable. + # Note, that when ever one node is changed, we check all dependent nodes (see usage of + # `_get_nodes_whose_identity_relies_on`) whether something should be updated. Thus, we cannot + # miss a necessary update. + def __init__( + self, entities: list[db.Entity], identifiableAdapter: IdentifiableAdapter + ): + self.identifiableAdapter = identifiableAdapter + # A dictionary allowing for quick lookup of sync nodes using their (possibly negative) IDs. + # This dictionary is initially set using _mark_entities_with_path_or_id and later updated + # using set_id_of_node or during merges of nodes. + self._id_look_up: dict[Union[int, TempID, str], SyncNode] = {} + # Similar as above for looking up nodes using paths + self._path_look_up: dict[str, SyncNode] = {} + # Similar as above for looking up nodes using identifiables. This dictionary uses the text + # representation generated by get_representation method of Identifiable as keys. 
+ self._identifiable_look_up: dict[str, SyncNode] = {} + # look up for the nodes that were marked as being missing (on the remote server) + self._missing: dict[int, SyncNode] = {} + # same for existing + self._existing: dict[int, SyncNode] = {} + # entities that are missing get negative IDs to allow identifiable creation + self._remote_missing_counter = -1 + + self.nodes: list[SyncNode] = [] + self._initialize_nodes(entities) # list of all SemanticEntities + # list all SemanticEntities that have not yet been checked + self.unchecked = list(self.nodes) + + # initialize reference mappings (see _create_reference_mapping) + ( + self.forward_references, # id(node) -> full set of nodes referenced by the given node + self.backward_references, # id(node) -> full set of nodes referencing the given node + # as above, subset where the reference properties are part of identifiables + self.forward_references_id_props, + self.backward_references_id_props, + # as above, subset where references are part of identifiables due to "referenced_by" + self.forward_references_backref, + self.backward_references_backref, + ) = self._create_reference_mapping(self.nodes) + + # remove entities with path or ID from unchecked list + self._mark_entities_with_path_or_id() + + # add identifiables where possible + for node in list(self.nodes): + if self._identifiable_is_needed(node): + self._set_identifiable_of_node(node) + + # everything in unchecked neither does have an ID nor a path. + # Thus, it must be possible to create an + # identifiable which is checked using the following function: + for node in self.unchecked: + self.identifiableAdapter.all_identifying_properties_exist(node) + + def set_id_of_node(self, node: SyncNode, node_id: Optional[str] = None): + """sets the ID attribute of the given SyncNode to node_id. + + If node_id is None, a negative ID will be + given indicating that the node does not exist on the remote server. + Furthermore it will be marked as missing using _mark_missing. + + Last review by Alexander Schlemmer on 2024-05-24. + """ + if node.id is not None: + raise RuntimeError( + "Cannot update ID.\n" + f"It already is {node.id} and shall be set to {node_id}." + ) + if node_id is None: + node_id = TempID(self._get_new_id()) + node.id = node_id + if node_id in self._id_look_up: + self._merge_into(node, self._id_look_up[node.id]) + else: + self._id_look_up[node.id] = node + if isinstance(node.id, TempID): + self._mark_missing(node) + else: + self._mark_existing(node) + + def export_record_lists(self): + """exports the SyncGraph in form of db.Entities + + All nodes are converted to db.Entity objects and reference values that are SyncNodes are + replaced by their corresponding (newly created) db.Entity objects. + + Since the result is returned in form of two lists, one with Entities that have a valid ID + one with those that haven't, an error is raised if there are any SyncNodes without an + (possibly negative) ID. + + Last review by Alexander Schlemmer on 2024-05-24. 
+ """ + # TODO reactivate once the implementation is appropriate + # if len(self.unchecked) > 1: + # self.unchecked_contains_circular_dependency() + + for el in self.nodes: + if el.id is None: + raise RuntimeError("Exporting unchecked entities is not supported") + + entities = [] + node_map = {} + for el in self.nodes: + entities.append(el.export_entity()) + node_map[id(el)] = entities[-1] + + for ent in entities: + _set_each_scalar_value( + ent, + condition=lambda val: isinstance(val, SyncNode), + value=lambda val: node_map[id(val)], + ) + + missing = [el for el in entities if el.id < 0] + existing = [el for el in entities if el.id > 0] + # remove negative IDs + for el in missing: + el.id = None + + return (missing, existing) + + def _identity_relies_on_unchecked_entity(self, node: SyncNode): + """ + If a record for which it could not yet be verified whether it exists in LA or not is part + of the identifying properties, this returns True, otherwise False + + Last review by Alexander Schlemmer on 2024-05-27. + """ + + return any( + [ + id(ent) not in self._missing and id(ent) not in self._existing + for ent in self.forward_references_id_props[id(node)] + ] + + [ + id(ent) not in self._missing and id(ent) not in self._existing + for ent in self.backward_references_backref[id(node)] + ] + ) + + def unchecked_contains_circular_dependency(self): + """ + Detects whether there are circular references in the given entity list and returns a list + where the entities are ordered according to the chain of references (and only the entities + contained in the circle are included. Returns None if no circular dependency is found. + + TODO: for the sake of detecting problems for split_into_inserts_and_updates we should only + consider references that are identifying properties. + """ + raise NotImplementedError("This function is not yet properly implemented") + # TODO if the first element is not part of the circle, then + # this will not work + # We must created a better implementation (see also TODO in docstring) + circle = [self.unchecked[0]] + closed = False + while not closed: + added_to_circle = False + for referenced in self.forward_references[id(circle[-1])]: + if referenced in self.unchecked: + if referenced in circle: + closed = True + circle.append(referenced) + added_to_circle = True + if not added_to_circle: + return None + return circle + + def get_equivalent(self, entity: SyncNode) -> Optional[SyncNode]: + """ + Return an equivalent SyncNode. + + Equivalent means that ID, path or identifiable are the same. + If a new information was added to the given SyncNode (e.g. the ID), it might be possible + then to identify an equivalent node (i.e. one with the same ID in this example). + There might be more than one equivalent node in the graph. However, simply the first that + is found is being returned. (When an equivalent node is found, the given node is + typically merged, into the one that was found and after the merge the graph is again + checked for equivalent nodes.) + + Returns None if no equivalent node is found. + + Last review by Alexander Schlemmer on 2024-05-28. 
+ """ + if entity.id is not None and entity.id in self._id_look_up: + candidate = self._id_look_up[entity.id] + if candidate is not entity: + return candidate + if entity.path is not None and entity.path in self._path_look_up: + candidate = self._path_look_up[entity.path] + if candidate is not entity: + return candidate + if ( + entity.identifiable is not None + and entity.identifiable.get_representation() in self._identifiable_look_up + ): + candidate = self._identifiable_look_up[ + entity.identifiable.get_representation() + ] + if candidate is not entity: + return candidate + return None + + def _get_new_id(self): + """returns the next unused temporary ID + + Last review by Alexander Schlemmer on 2024-05-24. + """ + self._remote_missing_counter -= 1 + return self._remote_missing_counter + + def _set_identifiable_of_node( + self, node: SyncNode, identifiable: Optional[Identifiable] = None + ): + """sets the identifiable and checks whether an equivalent node can be found with that new + information. If an equivalent node is found, 'node' is merged into that node. + + if no identifiable is given, the identifiable is retrieved from the identifiable adapter + + Raises a ValueError if the equivalent node found does not have an identifiable. + Raises a RuntimeError if there is no equivalent node found and + the (unique) string representation of the identifiable of node is already contained in + the identifiable_look_up. + + Last review by Alexander Schlemmer on 2024-05-29. + """ + if identifiable is None: + self.identifiableAdapter.all_identifying_properties_exist(node) + identifiable = self.identifiableAdapter.get_identifiable( + node, self.backward_references_backref[id(node)] + ) + node.identifiable = identifiable + equivalent_se = self.get_equivalent(node) + if equivalent_se is not None: + self._merge_into(node, equivalent_se) + else: + if node.identifiable.get_representation() in self._identifiable_look_up: + raise RuntimeError("Identifiable is already in the look up") + self._identifiable_look_up[node.identifiable.get_representation()] = node + + @staticmethod + def _sanity_check(entities: list[db.Entity]): + """ + Checks whether each record in entities has at least one parent. + + Last review by Alexander Schlemmer on 2024-05-24. + """ + for ent in entities: + if ent.role == "Record" and len(ent.parents) == 0: + raise ValueError(f"Records must have a parent.\n{ent}") + if isinstance(ent.id, int) and ent.id < 0: + raise ValueError( + f"Records must not have negative integers as IDs.\n{ent}" + ) + if isinstance(ent.id, str) and re.match(r"^-\d+$", ent.id): + raise ValueError( + f"Records must not have negative integers as IDs.\n{ent}" + ) + + def _get_nodes_whose_identity_relies_on(self, node: SyncNode): + """returns a set of nodes that reference the given node as identifying property or are + referenced by the given node and the parent of the given node is listed as + "is_referenced_by" + + Last review by Alexander Schlemmer on 2024-05-24. + """ + return self.backward_references_id_props[id(node)].union( + self.forward_references_backref[id(node)] + ) + + @staticmethod + def _create_flat_list( + ent_list: list[db.Entity], flat: Optional[list[db.Entity]] = None + ): + """ + Recursively adds entities and all their properties contained in ent_list to + the output list flat. + + TODO: This function will be moved to pylib as it is also needed by the + high level API. + + Last review by Alexander Schlemmer on 2024-05-29. 
+ """ + # Note: A set would be useful here, but we do not want a random order. + if flat is None: + flat = list() + for el in ent_list: + if el not in flat: + flat.append(el) + for ent in ent_list: + for p in ent.properties: + # For lists append each element that is of type Entity to flat: + if isinstance(p.value, list): + for el in p.value: + if isinstance(el, db.Entity): + if el not in flat: + flat.append(el) + SyncGraph._create_flat_list([el], flat) + elif isinstance(p.value, db.Entity): + if p.value not in flat: + flat.append(p.value) + SyncGraph._create_flat_list([p.value], flat) + return flat + + @staticmethod + def _create_reference_mapping(flat: list[SyncNode]): + """ + Create six dictionaries that describe references among SyncNodes. All dictionaries use the + Python ID of SyncNodes as keys. + There is always one dictionary to describe the direction of the reference, i.e. + map[id(node)] -> other where other is a set of SyncNodes that are being referenced by node. + And then there is always one dictionary for the inverse direction. The two dictionaries are + named "forward_" and "backward_", respectively. + + Then there are three kinds of maps being generated: One includes all references + ("_references"), one includes references that are values of identifying properties + ("_references_id_props") and one includes references that are relevant for identifying + backreferences/"is_referenced_by" ("_references_backref"). I.e. the two latter are subesets + of the former reference map. + + Arguments: + ---------- + flat: list[SyncNode] + all SyncNodes that span the graph for which the reference map shall be created + + Last review by Alexander Schlemmer on 2024-05-29. + """ + # TODO we need to treat children of RecordTypes somehow. + forward_references: dict[int, set[SyncNode]] = {} + backward_references: dict[int, set[SyncNode]] = {} + forward_references_id_props: dict[int, set[SyncNode]] = {} + backward_references_id_props: dict[int, set[SyncNode]] = {} + forward_references_backref: dict[int, set[SyncNode]] = {} + backward_references_backref: dict[int, set[SyncNode]] = {} + + # initialize with empty lists/dict + for node in flat: + forward_references[id(node)] = set() + backward_references[id(node)] = set() + forward_references_id_props[id(node)] = set() + backward_references_id_props[id(node)] = set() + forward_references_backref[id(node)] = set() + backward_references_backref[id(node)] = set() + for node in flat: + for p in node.properties: + val = p.value + if not isinstance(val, list): + val = [val] + for v in val: + if isinstance(v, SyncNode): + forward_references[id(node)].add(v) + backward_references[id(v)].add(node) + if ( + node.registered_identifiable is not None + and len( + [ + el.name + for el in node.registered_identifiable.properties + if el.name == p.name + ] + ) + > 0 + ): + forward_references_id_props[id(node)].add(v) + backward_references_id_props[id(v)].add(node) + if ( + v.registered_identifiable is not None + and IdentifiableAdapter.referencing_entity_has_appropriate_type( + node.parents, v.registered_identifiable + ) + ): + forward_references_backref[id(node)].add(v) + backward_references_backref[id(v)].add(node) + + return ( + forward_references, + backward_references, + forward_references_id_props, + backward_references_id_props, + forward_references_backref, + backward_references_backref, + ) + + def _mark_entities_with_path_or_id(self): + """A path or an ID is sufficiently identifying. 
Thus, those entities can be marked as + checked + + When this function returns, there is only one node for each ID (i.e. no two nodes with the + same ID). The same is true for paths. + + This function also updates _id_look_up and _path_look_up + + Last review by Alexander Schlemmer on 2024-05-29. + """ + for node in list(self.nodes): + if node.id is not None: + eq_node = self.get_equivalent(node) + if eq_node is not None: + self._basic_merge_into(node, eq_node) + else: + self._id_look_up[node.id] = node + self._mark_existing(node) + + for node in list(self.nodes): + if node.path is not None: + eq_node = self.get_equivalent(node) + if eq_node is not None: + self._basic_merge_into(node, eq_node) + else: + self._path_look_up[node.path] = node + try: + existing = cached_get_entity_by(path=node.path) + except EmptyUniqueQueryError: + existing = None + remote_id = None + if existing is not None: + remote_id = existing.id + self.set_id_of_node(node, remote_id) + + def _basic_merge_into(self, source: SyncNode, target: SyncNode): + """tries to merge source into target and updates member variables + + - reference maps are updated + - self.nodes is updated + - self.unchecked is updated + - lookups are being updated + """ + # sanity checks + if source is target: + raise ValueError("source must not be target") + + target.update(source) + + # replace actual reference property values + for node in self.backward_references[id(source)]: + _set_each_scalar_value( + node, condition=lambda val: val is source, value=lambda val: target + ) + + # update reference mappings + for setA, setB in ( + (self.forward_references, self.backward_references), # ref: source -> other + (self.backward_references, self.forward_references), # ref: other -> source + (self.forward_references_id_props, self.backward_references_id_props), + (self.backward_references_id_props, self.forward_references_id_props), + (self.forward_references_backref, self.backward_references_backref), + (self.backward_references_backref, self.forward_references_backref), + ): + for node in setA.pop(id(source)): + setA[id(target)].add(node) + setB[id(node)].remove(source) + setB[id(node)].add(target) + + # remove unneeded SyncNode + self.nodes.remove(source) + if source in self.unchecked: + self.unchecked.remove(source) + # update look ups + if target.id is not None: + self._id_look_up[target.id] = target + if target.path is not None: + self._path_look_up[target.path] = target + if target.identifiable is not None: + self._identifiable_look_up[target.identifiable.get_representation()] = target + + def _merge_into(self, source: SyncNode, target: SyncNode): + """tries to merge source into target and performs the necessary updates: + - update the member variables of target using source (``target.update(source)``). + - replaces reference values to source by target + - updates the reference map + - updates lookup tables + - removes source from node lists + - marks target as missing/existing if source was marked that way + - adds an identifiable if now possible (e.g. merging based on ID might allow create an + identifiable when none of the two nodes had the sufficient properties on its own before) + - check whether dependent nodes can now get an identifiable (the merge might have set the + ID such that dependent nodes can now create an identifiable) + + Last review by Alexander Schlemmer on 2024-05-29. 
+ """ + self._basic_merge_into(source, target) + + if (id(source) in self._existing and id(target) in self._missing) or ( + id(target) in self._existing and id(source) in self._missing + ): + raise RuntimeError("Trying to merge missing and existing") + + if id(source) in self._missing and id(target) not in self._missing: + self._mark_missing(target) + elif id(source) in self._existing and id(target) not in self._existing: + self._mark_existing(target) + + # due to the merge it might now be possible to create an identifiable + if self._identifiable_is_needed(target): + self._set_identifiable_of_node(target) + # This is one of three cases that affect other nodes: + # - mark existing + # - mark missing + # - merge + self._add_identifiables_to_dependent_nodes(target) + + eq_node = self.get_equivalent(target) + if eq_node is not None: + self._merge_into(target, eq_node) + + def _identifiable_is_needed(self, node: SyncNode): + """ + This function checks: + - the identifiable of node is None + - the node has all properties that are needed for the identifiable + - there are no unchecked entities that are needed for the identifiable of the node, + neither as forward or as backward references + + Last review by Alexander Schlemmer on 2024-05-24. + """ + return ( + node.identifiable is None + and not self._identity_relies_on_unchecked_entity(node) + and self.identifiableAdapter.all_identifying_properties_exist( + node, raise_exception=False + ) + ) + + def _initialize_nodes(self, entities: list[db.Entity]): + """create initial set of SyncNodes from provided Entity list""" + self._sanity_check(entities) + entities = self._create_flat_list(entities) + se_lookup: dict[int, SyncNode] = {} # lookup: python id -> SyncNode + + # Create new sync nodes from the list of entities, their registered identifiables + # are set from the identifiable adapter. + for el in entities: + self.nodes.append( + SyncNode(el, self.identifiableAdapter.get_registered_identifiable(el)) + ) + se_lookup[id(el)] = self.nodes[-1] + + # replace db.Entity objects with SyncNodes in references: + for node in self.nodes: + _set_each_scalar_value( + node, + condition=lambda val: id(val) in se_lookup, + value=lambda val: se_lookup[id(val)], + ) + + def _add_identifiables_to_dependent_nodes(self, node): + """For each dependent node, we check whether this allows to create an identifiable + + Last review by Alexander Schlemmer on 2024-05-29. + """ + for other_node in self._get_nodes_whose_identity_relies_on(node): + if self._identifiable_is_needed(other_node): + self._set_identifiable_of_node(other_node) + + def _mark_missing(self, node: SyncNode): + """Mark a sync node as missing and remove it from the dictionary of unchecked nodes. + + Last review by Alexander Schlemmer on 2024-05-24. + """ + self._missing[id(node)] = node + self.unchecked.remove(node) + + # This is one of three cases that affect other nodes: + # - mark existing + # - mark missing + # - merge + self._add_identifiables_to_dependent_nodes(node) + # For each dependent node, we set the ID to None (missing) + # (None is the default second argument of set_id_of_node.) + for other_node in self._get_nodes_whose_identity_relies_on(node): + if other_node in self.unchecked: + self.set_id_of_node(other_node) + + def _mark_existing(self, node: SyncNode): + """Mark a sync node as existing and remove it from the dictionary of unchecked nodes. + + Last review by Alexander Schlemmer on 2024-05-24. 
+ """ + if isinstance(node.id, TempID): + raise ValueError("ID must valid existing entities, not TempID") + self._existing[id(node)] = node + self.unchecked.remove(node) + # This is one of three cases that affect other nodes: + # - mark existing + # - mark missing + # - merge + self._add_identifiables_to_dependent_nodes(node) diff --git a/src/caoscrawler/sync_node.py b/src/caoscrawler/sync_node.py new file mode 100644 index 0000000000000000000000000000000000000000..d912d6465a68270411c121f65b4c5a828c9c667e --- /dev/null +++ b/src/caoscrawler/sync_node.py @@ -0,0 +1,264 @@ +#!/usr/bin/env python3 +# encoding: utf-8 +# +# This file is a part of the LinkAhead Project. +# +# Copyright (C) 2024 Henrik tom Wörden +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <https://www.gnu.org/licenses/>. +# + + +from __future__ import annotations + +from typing import TYPE_CHECKING, Any, Optional +from warnings import warn + +import linkahead as db +import yaml +from linkahead.common.models import Parent, ParentList, PropertyList + +from .exceptions import ImpossibleMergeError + +if TYPE_CHECKING: + from .identifiable import Identifiable + + +class TempID(int): + """A special kind of int for negative temporary IDs. + + This allows to identify TempIDs in the presence of String IDs. + A string ID might look like a negative integer. + """ + pass + + +class SyncNode(db.Entity): + """represents the information of an Entity as it shall be created in LinkAhead + + The following information is taken from an db.Entity object during initialization or when the + object is updated using the `update` member function: + - id + - role + - path + - file + - name + - description + - parents + - properties + + Typically, this class is used in the following way: + 1. A SyncNode is initialized with a db.Entity object. + 2. The SyncNode object is possibly updated one or more times with other SyncNode objects. + 3. A db.Entity object is created (`export_entity`) that contains the combined information. + """ + + def __init__( + self, entity: db.Entity, registered_identifiable: Optional[db.RecordType] = None, + **kwargs + ): + super().__init__(name=entity.name, + id=entity.id, + description=entity.description, + **kwargs) + # db.Entity properties + self.role = entity.role + self.path = entity.path + self.file = entity.file + self.parents = ParentList().extend(entity.parents) + self.properties = PropertyList().extend(entity.properties) + self._check_for_multiproperties() + # other members + self.identifiable: Optional[Identifiable] = None + self.registered_identifiable = registered_identifiable + + def update(self, other: SyncNode) -> None: + """Update this node with information of given ``other`` SyncNode. + + parents are added if they are not yet in the list properties + are added in any case. This may lead to duplication of + properties. 
We allow this duplication here and remove it when
+        we create a db.Entity (export_entity function) because if
+        property values are SyncNode objects, they might not be
+        comparable (no ID, no identifiable) yet.
+
+        Raises
+        ------
+        ValueError:
+            The `other` SyncNode doesn't share identifiables with
+            `this` SyncNode, so they can't be merged.
+        ImpossibleMergeError:
+            The two SyncNodes are incompatible in their attributes
+            like "id", "role", "path", "file", "name", or
+            "description".
+
+        """
+
+        if other.identifiable is not None and self.identifiable is not None:
+            if (
+                other.identifiable.get_representation()
+                != self.identifiable.get_representation()
+            ):
+                raise ValueError(
+                    "The SyncNode that is used with update must have an equivalent"
+                    " identifiable, i.e. you cannot merge entities with differing identifiables.\n"
+                    "The identifiables were:\n"
+                    f"{self.identifiable._create_hashable_string(self.identifiable)}\n"
+                    f"and\n{other.identifiable._create_hashable_string(other.identifiable)}."
+                )
+
+        if other.identifiable:
+            self.identifiable = other.identifiable
+        for attr in ["id", "role", "path", "file", "name", "description"]:
+            if other.__getattribute__(attr) is not None:
+                if self.__getattribute__(attr) is None:
+                    self.__setattr__(attr, other.__getattribute__(attr))
+                else:
+                    if self.__getattribute__(attr) != other.__getattribute__(attr):
+                        raise ImpossibleMergeError(
+                            f"Trying to update {attr} but this would lead to an "
+                            f"override of the value '{self.__getattribute__(attr)}' "
+                            f"by the value '{other.__getattribute__(attr)}'",
+                            pname=attr,
+                            value_a=self.__getattribute__(attr),
+                            value_b=other.__getattribute__(attr)
+                        )
+        for p in other.parents:
+            if not parent_in_list(p, self.parents):
+                self.parents.append(p)
+        for p in other.properties:
+            self.properties.append(p)
+
+    def export_entity(self) -> db.Entity:
+        """Create a db.Entity object from this SyncNode.
+
+        Properties are only added once (based on id or name). If values do not match, an error is
+        raised. If values are SyncNode objects with IDs, they are considered equal if their IDs are
+        equal.
+
+        Raises
+        ------
+        RuntimeError:
+            In case of an unsupported role, so that no Entity can be created.
+        ImpossibleMergeError:
+            In case of conflicting property values in this SyncNode.
+ """ + ent = None + if self.role == "Record": + ent = db.Record() + elif self.role == "File": + ent = db.File() + else: + raise RuntimeError("Invalid role") + for attr in ["id", "role", "path", "file", "name", "description"]: + ent.__setattr__(attr, self.__getattribute__(attr)) + for p in self.parents: + ent.add_parent(p) + for p in self.properties: + entval: Any = ent.get_property(p) + if entval is None: + ent.add_property(id=p.id, name=p.name, value=p.value, description=p.description, + datatype=p.datatype, unit=p.unit) + else: + entval = entval.value + unequal = False + pval = p.value + if isinstance(entval, list) != isinstance(pval, list): + unequal = True + if not isinstance(entval, list): + entval = [entval] + if not isinstance(pval, list): + pval = [pval] + if len(entval) != len(pval): + unequal = True + else: + for e_el, p_el in zip(entval, pval): + if isinstance(e_el, SyncNode) and e_el.id is not None: + e_el = e_el.id + if isinstance(p_el, SyncNode) and p_el.id is not None: + p_el = p_el.id + if e_el != p_el: + unequal = True + + if unequal: + ime = ImpossibleMergeError( + f"The crawler is trying to create an entity \n\n{self}\n\nbut there are " + "conflicting property values.", + pname=p.name, value_a=entval, value_b=pval + ) + raise ime + return ent + + def __repr__(self) -> str: + """ somewhat concise text representation of the SyncNode """ + res = f"\n=====================================================\n{self.role}\n" + res += yaml.dump( + { + "id": self.id, + "name": self.name, + "path": self.path, + "parents": [el.name for el in self.parents], + }, + allow_unicode=True, + ) + res += "---------------------------------------------------\n" + res += "properties:\n" + d: dict[str, Any] = {} + for p in self.properties: + v = p.value + d[p.name] = [] + if not isinstance(p.value, list): + v = [v] + for el in v: + if isinstance(el, SyncNode): + d[p.name].append( + { + "id": el.id, + "name": el.name, + "path": el.path, + "parents": [e.name for e in el.parents], + } + ) + else: + d[p.name].append(el) + + return ( + res + + yaml.dump(d, allow_unicode=True) + + "=====================================================\n" + ) + + def _check_for_multiproperties(self): + """ warns if multiproperties are present """ + ids = set() + names = set() + for p in self.properties: + if p.name is not None: + if p.name in names: + warn("Multiproperties are not supported by the crawler.") + names.add(p.name) + if p.id is not None: + if p.id in ids: + warn("Multiproperties are not supported by the crawler.") + ids.add(p.id) + + +def parent_in_list(parent: Parent, plist: ParentList) -> bool: + """helper function that checks whether a parent with the same name or ID is in the plist""" + return plist.filter(parent) + + +def property_in_list(prop: db.Property, plist: PropertyList) -> bool: + """helper function that checks whether a property with the same name or ID is in the plist""" + return plist.filter(prop) diff --git a/src/caoscrawler/transformer_functions.py b/src/caoscrawler/transformer_functions.py index eda9f3c2bc98c8d2561f152f9f6ddd422daee00a..117d0b021d4ec0b0efc79c5db0d7ed397207933f 100644 --- a/src/caoscrawler/transformer_functions.py +++ b/src/caoscrawler/transformer_functions.py @@ -20,9 +20,14 @@ # You should have received a copy of the GNU Affero General Public License # along with this program. If not, see <https://www.gnu.org/licenses/>. +"""Definition of default transformer functions. 
+ +See https://docs.indiscale.com/caosdb-crawler/converters.html#transform-functions for more +information. + """ -Defnition of default transformer functions. -""" + +import datetime import re from typing import Any @@ -61,3 +66,89 @@ def replace(in_value: Any, in_parameters: dict): if not isinstance(in_value, str): raise RuntimeError("must be string") return in_value.replace(in_parameters['remove'], in_parameters['insert']) + + +def date_parse(in_value: str, params: dict) -> str: + """Transform text so that it is formatted in a way that LinkAhead can understand it. + +Parameters +========== + +- date_format: str, optional + A format string using the ``datetime`` specificaton: + https://docs.python.org/3/library/datetime.html#strftime-and-strptime-format-codes + """ + fmt_default = "%Y-%m-%d" + fmt = params.get("date_format", fmt_default) + dt_str = datetime.datetime.strptime(in_value, fmt).strftime(fmt_default) + return dt_str + + +def datetime_parse(in_value: str, params: dict) -> str: + """Transform text so that it is formatted in a way that LinkAhead can understand it. + + +Parameters +========== + +- datetime_format: str, optional + A format string using the ``datetime`` specificaton: + https://docs.python.org/3/library/datetime.html#strftime-and-strptime-format-codes + """ + fmt_default = "%Y-%m-%dT%H:%M:%S" + fmt = params.get("datetime_format", fmt_default) + dt_str = datetime.datetime.strptime(in_value, fmt).strftime(fmt_default) + return dt_str + + +def cast_to_int(in_value: Any, params: dict) -> int: + """ + Cast the `in_value` to int. + + Parameters + ========== + No parameters. + """ + return int(in_value) + + +def cast_to_float(in_value: Any, params: dict) -> float: + """ + Cast the `in_value` to float. + + Parameters + ========== + No parameters. + """ + return float(in_value) + + +def cast_to_bool(in_value: Any, params: dict) -> bool: + """ + Cast the `in_value` to bool. + + This is done by comparing `in_value` to "True". + Only "true", "True", "False" and "false" are accepted as possible values. + All other input values raise an error. + + Parameters + ========== + No parameters. + """ + val = str(in_value).lower() + if val == "true": + return True + if val == "false": + return False + raise ValueError("Invalid value for type cast to bool: {}".format(in_value)) + + +def cast_to_str(in_value: Any, params: dict) -> str: + """ + Cast the `in_value` to str. + + Parameters + ========== + No parameters. + """ + return str(in_value) diff --git a/src/caoscrawler/utils.py b/src/caoscrawler/utils.py index 61b363099d0892b74e91f257bccb6cc832c3d59f..5f736d5ad7550e0b29cb629b2fa140a2f38d6f5f 100644 --- a/src/caoscrawler/utils.py +++ b/src/caoscrawler/utils.py @@ -25,7 +25,12 @@ # Some utility functions, e.g. for extending pylib. -import caosdb as db +import sys +from posixpath import join as posixjoin +from typing import Optional +from urllib.parse import urljoin + +import linkahead as db def has_parent(entity: db.Entity, name: str): @@ -39,3 +44,45 @@ def has_parent(entity: db.Entity, name: str): if parent.name == name: return True return False + + +def MissingImport(name: str, hint: str = "", err: Optional[Exception] = None) -> type: + """Factory with dummy classes, which may be assigned to variables but never used.""" + def _error(): + error_msg = f"This class ({name}) cannot be used, because some libraries are missing." 
+ if hint: + error_msg += "\n\n" + hint + + if err: + print(error_msg, file=sys.stdout) + raise RuntimeError(error_msg) from err + raise RuntimeError(error_msg) + + class _Meta(type): + def __getattribute__(cls, *args, **kwargs): + _error() + + def __call__(cls, *args, **kwargs): + _error() + + class _DummyClass(metaclass=_Meta): + pass + + _DummyClass.__name__ = name + + return _DummyClass + + +def get_shared_resource_link(host_url, filename): + """Return a link adress which is basically {host_url}/Shared/{filename}. + + Use urllib.parse.join and os.path.join to prevent missing or extra ``/`` and the like. + + """ + + if not host_url.endswith('/'): + # Fill with trailing '/' s. that urljoin doesn't remove the context root. + host_url += '/' + # Use posixjoin to always have '/' in links, even when running on + # Windows systems. + return urljoin(host_url, posixjoin("Shared/", filename)) diff --git a/src/caoscrawler/validator.py b/src/caoscrawler/validator.py new file mode 100644 index 0000000000000000000000000000000000000000..33e29b02db429e3382248bbd80d2d00cd7b07c6b --- /dev/null +++ b/src/caoscrawler/validator.py @@ -0,0 +1,163 @@ +#!/usr/bin/env python3 +# encoding: utf-8 +# +# This file is a part of the LinkAhead Project. +# +# Copyright (C) 2024 Indiscale GmbH <info@indiscale.com> +# Copyright (C) 2024 Alexander Schlemmer +# +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <https://www.gnu.org/licenses/>. +# +# ** end header +# + +""" +This module contains functions to validate the output of a scanner run with a +json schema. +""" + +import jsonschema +import linkahead as db +# from caosadvancedtools.models.parser import parse_model_from_string +from caosadvancedtools.json_schema_exporter import recordtype_to_json_schema +from caosadvancedtools.models.parser import parse_model_from_yaml +from jsonschema import ValidationError +from linkahead.high_level_api import convert_to_python_object + + +def load_json_schema_from_datamodel_yaml(filename: str) -> dict[str, dict]: + """ + Load a data model yaml file (using caosadvancedtools) and convert + all record types into a json schema using the json_schema_exporter module. + + Arguments + --------- + filename: str + The filename of the yaml file to load. + + Returns + ------- + A dict of json schema objects. The keys are the record types for which the schemas + are generated. + """ + + model = parse_model_from_yaml(filename) + + rt_schemas = {} + for el_key, el in model.items(): + if isinstance(el, db.RecordType): + rt_schemas[el_key] = recordtype_to_json_schema(el) + + return rt_schemas + + +def representer_ordereddict(dumper, data): + """ + Helper function to be able to represent the converted json schema objects correctly as yaml. + This representer essentially replaced OrderedDict objects with simple dict objects. 
+ + Since Python 3.7 dicts are ordered by default, see e.g.: + https://softwaremaniacs.org/blog/2020/02/05/dicts-ordered/en/ + + Example how to use the representer: + ```python + yaml.add_representer(OrderedDict, caoscrawler.validator.representer_ordereddict) + ``` + """ + return dumper.represent_data(dict(data)) + + +def _apply_schema_patches(pobj: dict): + """ + Changes applied: + - properties are moved vom subitem "proeprties" to top-level. + - The following keys are deleted: parents, role, name, description, metadata, properties + """ + if "properties" not in pobj: + # this is probably a file + return pobj + for prop in pobj["properties"]: + if isinstance(pobj["properties"][prop], dict): + pobj[prop] = _apply_schema_patches(pobj["properties"][prop]) + else: + pobj[prop] = pobj["properties"][prop] + + for keyd in ("parents", "role", "name", + "description", "metadata", "properties"): + if keyd in pobj: + del pobj[keyd] + + return pobj + + +def convert_record(record: db.Record): + """ + Convert a record into a form suitable for validation with jsonschema. + + Uses `high_level_api.convert_to_python_object` + Afterwards `_apply_schema_patches` is called recursively to refactor the dictionary + to match the current form of the jsonschema. + + Arguments: + ---------- + record: db.Record + The record that is supposed to be converted. + """ + pobj = convert_to_python_object(record).serialize() + return _apply_schema_patches(pobj) + + +def validate(records: list[db.Record], schemas: dict[str, dict]) -> list[tuple]: + """ + Validate a list of records against a dictionary of schemas. + The keys of the dictionary are record types and the corresponding values are json schemata + associated with that record type. The current implementation assumes that each record that is + checked has exactly one parent and raises an error if that is not the case. + The schema belonging to a record is identified using the name of the first (and only) parent + of the record. + + Arguments: + ---------- + + records: list[db.Record] + List of records that will be validated. + + schemas: dict[str, dict] + A dictionary of JSON schemas generated using `load_json_schema_from_datamodel_yaml`. + + Returns: + -------- + A list of tuples, one element for each record: + + - Index 0: A boolean that determines whether the schema belonging to the record type of the + record matched. + - Index 1: A validation error if the schema did not match or None otherwise. + """ + + retval = [] + for r in records: + if len(r.parents) != 1: + raise NotImplementedError( + "Schema validation is only supported if records have exactly one parent.") + parname = r.parents[0].name + if parname not in schemas: + raise RuntimeError( + "No schema for record type {} in schema dictionary.".format(parname)) + try: + jsonschema.validate(convert_record(r), schemas[parname]) + retval.append((True, None)) + except ValidationError as ex: + retval.append((False, ex)) + return retval diff --git a/src/caoscrawler/version.py b/src/caoscrawler/version.py index fdc8323452cd190cc3628efa57c15992f30fabeb..4cd435486aca26e20e785bbbeb65c013d8e727cb 100644 --- a/src/caoscrawler/version.py +++ b/src/caoscrawler/version.py @@ -17,13 +17,10 @@ # You should have received a copy of the GNU Affero General Public License # along with this program. If not, see <https://www.gnu.org/licenses/>. 
# -try: - from importlib import metadata as importlib_metadata -except ImportError: # Python<3.8 dowesn"t support this so use - import importlib_metadata +from importlib import metadata as importlib_metadata +from warnings import warn from packaging.version import parse as parse_version -from warnings import warn def get_caoscrawler_version(): @@ -43,7 +40,7 @@ def check_cfood_version(metadata: dict): if not metadata or "crawler-version" not in metadata: msg = """ -No crawler version specified in cfood definition, so there is now guarantee that +No crawler version specified in cfood definition, so there is no guarantee that the cfood definition matches the installed crawler version. Specifying a version is highly recommended to ensure that the definition works diff --git a/src/doc/README_SETUP.md b/src/doc/README_SETUP.md deleted file mode 100644 index 5f5161d0d672ff3ad14db5c5b49f5c65550b06d7..0000000000000000000000000000000000000000 --- a/src/doc/README_SETUP.md +++ /dev/null @@ -1,28 +0,0 @@ -# Getting started with the CaosDB Crawler # - -## Installation -see INSTALL.md - -## Run Unit Tests -Run `pytest unittests`. - -## Documentation ## -We use sphinx to create the documentation. Docstrings in the code should comply -with the Googly style (see link below). - -Build documentation in `src/doc` with `make html`. - -### Requirements ### - -- `sphinx` -- `sphinx-autoapi` -- `recommonmark` -- `sphinx-rtd-theme` - -### How to contribute ### - -- [Google Style Python Docstrings](https://sphinxcontrib-napoleon.readthedocs.io/en/latest/example_google.html) -- [Google Style Python Docstrings 2nd reference](https://github.com/google/styleguide/blob/gh-pages/pyguide.md#38-comments-and-docstrings) -- [References to other documentation](https://www.sphinx-doc.org/en/master/usage/extensions/intersphinx.html#role-external) - - diff --git a/src/doc/cfood.rst b/src/doc/cfood.rst index c12e251d49e164a737b20e92e56e7b3e10149d4f..0c7726d2017b955ecd7472d57dc259ff9a7bab53 100644 --- a/src/doc/cfood.rst +++ b/src/doc/cfood.rst @@ -27,17 +27,17 @@ A single document with a converter tree specification: .. _example_1: .. code-block:: yaml - + extroot: type: Directory match: ^extroot$ subtree: DataAnalysis: - type: Directory - match: DataAnalysis - # (...) + type: Directory + match: DataAnalysis + # (...) + - A single document with a converter tree specification, but also including a custom converters section: .. _example_2: @@ -50,15 +50,15 @@ A single document with a converter tree specification, but also including a cust CustomConverter_2: package: mypackage.converters converter: CustomConverter2 - + extroot: type: Directory match: ^extroot$ subtree: DataAnalysis: - type: Directory - match: DataAnalysis - # (...) + type: Directory + match: DataAnalysis + # (...) @@ -78,11 +78,11 @@ two custom converters in the second document (**not recommended**, see the recom - !defmacro name: SimulationDatasetFile params: - match: null - recordtype: null - nodename: null + match: null + recordtype: null + nodename: null definition: - # (...) + # (...) --- Converters: CustomConverter_1: @@ -91,15 +91,15 @@ two custom converters in the second document (**not recommended**, see the recom CustomConverter_2: package: mypackage.converters converter: CustomConverter2 - + extroot: type: Directory match: ^extroot$ subtree: DataAnalysis: - type: Directory - match: DataAnalysis - # (...) + type: Directory + match: DataAnalysis + # (...) 
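The validator module added above (``src/caoscrawler/validator.py``) complements these cfood and data model definitions: record types from a data model YAML file can be exported as JSON schemas, and the records produced by a scanner run can then be checked against them. The following is only a rough usage sketch; the file name ``model.yml`` and the example ``Person`` record are placeholders, not part of this changeset.

.. code-block:: python

   import linkahead as db
   from caoscrawler.validator import (load_json_schema_from_datamodel_yaml,
                                      validate)

   # One JSON schema per RecordType defined in the data model YAML file.
   schemas = load_json_schema_from_datamodel_yaml("model.yml")

   # Records to be checked; in practice this would be the output of a scanner run.
   rec = db.Record(name="Ada")
   rec.add_parent(name="Person")
   records = [rec]

   # validate() returns one (passed, error) tuple per record.
   for passed, error in validate(records, schemas):
       if not passed:
           print(error.message)
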
@@ -118,27 +118,27 @@ The **recommended way** of defining metadata, custom converters, macros and the - !defmacro name: SimulationDatasetFile params: - match: null - recordtype: null - nodename: null + match: null + recordtype: null + nodename: null definition: - # (...) + # (...) Converters: CustomConverter_1: - package: mypackage.converters - converter: CustomConverter1 + package: mypackage.converters + converter: CustomConverter1 CustomConverter_2: - package: mypackage.converters - converter: CustomConverter2 + package: mypackage.converters + converter: CustomConverter2 --- extroot: type: Directory match: ^extroot$ subtree: DataAnalysis: - type: Directory - match: DataAnalysis - # (...) + type: Directory + match: DataAnalysis + # (...) List Mode @@ -148,11 +148,73 @@ Specifying values of properties can make use of two special characters, in order create lists or multi properties instead of single values: .. code-block:: yaml - - Experiment1: - Measurement: +Measurement # Element in List (list is cleared before run) - *Measurement # Multi Property (properties are removed before run) - Measurement # Overwrite + + Experiment1: + Measurement: +Measurement # Element in List (list is cleared before run) + *Measurement # Multi Property (properties are removed before run) + Measurement # Overwrite + +Values and units +---------------- + +Property values can be specified as a simple strings (as above) or as +a dictionaries that may also specify the :ref:`collection mode <List +Mode>`. Strings starting with a "$" will be replaced by a +corresponding variable if there is any. See the :doc:`tutorials +chapter<tutorials/index>` of this documentation for more elaborate +examples on how the variable replacment works exactly. A simple +example could look the following. + +.. code-block:: yaml + + ValueElt: + type: TextElement + match_name: ^my_prop$ + match_value: "(?P<value>.*)" # Anything in here is stored in the variable "value" + records: + MyRecord: + MyProp: $value # will be replace by whatever is stored in the "value" variable set above. + +If not given explicitly, the collection mode will be determined from +the first character of the property value as explained above, and the +following three definitions are all equivalent: + +.. code-block:: yaml + + MyProp: +$value + +.. code-block:: yaml + + MyProp: + value: +$value + +and + +.. code-block:: yaml + + MyProp: + value: $value + collection_mode: list + + +Units of numeric values can be set by providing a property value not +as a single string, but as a dictionary with a ``value`` and a +``unit`` key. Within a converter definition this could look the +following. + +.. code-block:: yaml + + ValueWithUnitElt: + type: TextElement + match_name: ^my_prop$ + match_value: "^(?P<number>\\d+\\.?\\d*)\\s+(?P<unit>.+)" # Extract value and unit from a string which + # has a number followed by at least one whitespace + # character followed by a unit. + records: + MyRecord: + MyProp: + value: $number + unit: $unit File Entities @@ -160,7 +222,7 @@ File Entities In order to use File Entities, you must set the appropriate ``role: File``. Additionally, the path and file keys have to be given, with values that set the -paths remotely and locally, respectively. You can use the variable +paths remotely and locally, respectively. You can use the variable ``<converter name>_path`` that is automatically created by converters that deal with file system related StructureElements. 
The file object itsself is stored in a vairable with the same name (as it is the case for other Records). @@ -169,21 +231,21 @@ in a vairable with the same name (as it is the case for other Records). .. code-block:: yaml somefile: - type: SimpleFile - match: ^params.*$ # macht any file that starts with "params" - records: - fileEntity: - role: File # necessary to create a File Entity - path: somefile.path # defines the path in CaosDB - file: somefile.path # path where the file is found locally - SomeRecord: - ParameterFile: $fileEntity # creates a reference to the file + type: SimpleFile + match: ^params.*$ # match any file that starts with "params" + records: + fileEntity: + role: File # necessary to create a File Entity + path: somefile.path # defines the path in CaosDB + file: somefile.path # path where the file is found locally + SomeRecord: + ParameterFile: $fileEntity # creates a reference to the file Transform Functions ------------------- You can use transform functions to alter variable values that the crawler consumes (e.g. a string -that was matched with a reg exp). See :doc:`Converter Documentation<converters.rst>`. +that was matched with a reg exp). See :doc:`Converter Documentation<converters/index>`. You can define your own transform functions by adding the the same way you add custom converters: diff --git a/src/doc/concepts.rst b/src/doc/concepts.rst index 0b15f9b5d9aebefc2137b234ac4a9440b84906f5..b3aa02a151a4d03c1531094ea01a5246cb02ba73 100644 --- a/src/doc/concepts.rst +++ b/src/doc/concepts.rst @@ -1,49 +1,54 @@ +======== Concepts -)))))))) +======== -The CaosDB Crawler can handle any kind of hierarchical data structure. The typical use case is +The CaosDB Crawler can handle any kind of hierarchical data structure. The typical use case is a directory tree that is traversed. We use the following terms/concepts to describe how the CaosDB Crawler works. +Basics +====== + + Structure Elements ++++++++++++++++++ -This hierarchical structure is assumed to be consituted of a tree of -StructureElements. The tree is created on the fly by so called Converters which -are defined in a yaml file. The tree of StructureElements is a model -of the existing data (For example could a tree of Python file objects -(StructureElements) represent a file tree that exists on some file server). +The crawled hierarchical structure is represented by a tree of *StructureElements*. This tree is +generated on the fly by so called Converters which are defined in a yaml file (usually called +``cfood.yml``). This generated tree of StructureElements is a model of the existing data. For +example a tree of Python *file objects* (StructureElements) could correspond to a file system tree. Relevant sources in: -- ``src/structure_elements.py`` +- :py:mod:`caoscrawler.structure_elements` + +.. _ConceptConverters: Converters ++++++++++ -Converters treat StructureElements and thereby create the StructureElement that -are the children of the treated StructureElement. Converters therefore create -the above named tree. The definition of a Converter also contains what -Converters shall be used to treat the generated child-StructureElements. The -definition is therefore a tree itself. - -See :std:doc:`converters<converters>` for details. +Converters treat a StructureElement and during this process create a number of new +StructureElements: the children of the initially treated StructureElement. Thus by treatment of +existing StructureElements, Converters create a tree of StructureElements. +.. 
image:: img/converter.png + :height: 170 +See the chapter :std:doc:`Converters<converters/index>` for details. Relevant sources in: -- ``src/converters.py`` +- :py:mod:`caoscrawler.converters` Identifiables +++++++++++++ -An Identifiable of a Record is like the fingerprint of a Record. +An *Identifiable* of a Record is like the fingerprint of a Record. -The identifiable contains the information that is used by the CaosDB Crawler to identify Records. -For example, in order to check whether a Record exits in the CaosDB Server, the CaosDB Crawler creates a query -using the information contained in the Identifiable. +The Identifiable contains the information that is used by the CaosDB Crawler to identify Records. +For example, the CaosDB Crawler may create a query using the information contained in the +Identifiable in order to check whether a Record exists in the CaosDB Server. Suppose a certain experiment is at most done once per day, then the identifiable could consist of the RecordType "SomeExperiment" (as a parent) and the Property "date" with the respective value. @@ -65,8 +70,8 @@ In the current implementation an identifiable can only use one RecordType even t Relevant sources in -- ``src/identifiable_adapters.py`` -- ``src/identifiable.py`` +- :py:mod:`caoscrawler.identifiable_adapters` +- :py:mod:`caoscrawler.identifiable` Registered Identifiables ++++++++++++++++++++++++ @@ -85,7 +90,9 @@ we can check whether a Record with the parent "Project" is referencing the "Expe Record. If that is the case, this reference is part of the identifiable for the "Experiment" Record. Note, that if there are multiple Records with the appropriate parent (e.g. multiple "Project" Records in the above example) it will be required that all of them -reference the object to be identified. +reference the object to be identified. You can also use the wildcard "*" as +RecordType name in the configuration which will only require, that ANY Record +references the Record at hand. Identified Records @@ -96,12 +103,14 @@ The Crawler +++++++++++ The crawler can be considered the main program doing the synchronization in basically two steps: + #. Based on a yaml-specification scan the file system (or other sources) and create a set of CaosDB Entities that are supposed to be inserted or updated in a CaosDB instance. + #. Compare the current state of the CaosDB instance with the set of CaosDB Entities created in step 1, taking into account the :ref:`registered identifiables<Identifiables>`. Insert or update entites accordingly. Relevant sources in: -- ``src/crawl.py`` +- :py:mod:`caoscrawler.crawl` @@ -179,8 +188,7 @@ TODO Caching +++++++ -The Crawler uses the cached library function ``cached_get_entity_by``. The -cache is cleared automatically, when the Crawler does updates, but if you would -run the same Python process indefinetely the Crawler would not see changes due -to the Cache. Thus, please make sure to clear the cache if you create long -running Python processes. +The Crawler uses the cached library function ``cached_get_entity_by``. The cache is cleared +automatically when the Crawler does updates, but if you ran the same Python process indefinitely, +the Crawler would not see changes in LinkAhead due to the cache. Thus, please make sure to clear the +cache if you create long running Python processes. 
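Regarding the caching note above: in a long running process, the cache could be cleared between crawler runs roughly as in the following sketch. It assumes that the installed pylib version exposes a ``cache_clear`` helper in ``linkahead.cached`` next to ``cached_get_entity_by``; if it does not, consult the pylib documentation for the corresponding call.

.. code-block:: python

   from linkahead.cached import cache_clear


   def crawl_forever(run_single_crawl):
       # ``run_single_crawl`` is a callable supplied by the caller, e.g. a
       # wrapper around one complete crawler invocation.
       while True:
           run_single_crawl()
           # Drop cached entities and query results so that the next iteration
           # sees changes that happened on the LinkAhead server in the meantime.
           cache_clear()
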
diff --git a/src/doc/conf.py b/src/doc/conf.py index e13d7724351df30c59974d7b8136055ec684ed73..01ca66bf03c1fb0e105e97dccaadc4d1ef5d14f0 100644 --- a/src/doc/conf.py +++ b/src/doc/conf.py @@ -21,11 +21,11 @@ # import os import sys + sys.path.insert(0, os.path.abspath('..')) import sphinx_rtd_theme # noqa: E402 - # -- Project information ----------------------------------------------------- project = 'caosdb-caoscrawler' @@ -33,10 +33,10 @@ copyright = '2024, IndiScale' author = 'Alexander Schlemmer' # The short X.Y version -version = '0.6.1' +version = '0.10.2' # The full version, including alpha/beta/rc tags # release = '0.5.2-rc2' -release = '0.6.1-dev' +release = '0.10.2-dev' # -- General configuration --------------------------------------------------- @@ -53,6 +53,7 @@ extensions = [ 'sphinx.ext.autosectionlabel', 'sphinx.ext.intersphinx', 'sphinx.ext.napoleon', # For Google style docstrings + "sphinx.ext.todo", "recommonmark", # For markdown files. "sphinx_rtd_theme", ] @@ -213,6 +214,10 @@ intersphinx_mapping = { # TODO Which options do we want? autodoc_default_options = { - 'members': None, - 'undoc-members': None, + 'members': True, + 'undoc-members': True, + 'member-order': 'bysource', + 'special-member': ["__init__"], } + +todo_include_todos = True diff --git a/src/doc/converters.rst b/src/doc/converters.rst deleted file mode 100644 index 60da52d3ed110f050a3d7aae866cc7d8b6b8dc31..0000000000000000000000000000000000000000 --- a/src/doc/converters.rst +++ /dev/null @@ -1,553 +0,0 @@ -Converters -)))))))))) - -Converters treat StructureElements and thereby create the StructureElement that -are the children of the treated StructureElement. Converters therefore create -the tree of structure elements. The definition of a Converter also contains what -Converters shall be used to treat the generated child-StructureElements. The -definition is therefore a tree itself. - -Each StructureElement in the tree has a set of data values, i.e a dictionary of -key value pairs. -Some of those values are set due to the kind of StructureElement. For example, -a file could have the file name as such a key value pair: 'filename': <sth>. -Converters may define additional functions that create further values. For -example, a regular expresion could be used to get a date from a file name. - - -A converter is defined via a yml file or part of it. The definition states -what kind of StructureElement it treats (typically one). -Also, it defines how children of the current StructureElement are -created and what Converters shall be used to treat those. - -The yaml definition looks like the following: - -TODO: outdated, see cfood-schema.yml - -.. code-block:: yaml - - <NodeName>: - type: <ConverterName> - match: ".*" - records: - Experiment1: - parents: - - Experiment - - Blablabla - date: $DATUM - (...) - Experiment2: - parents: - - Experiment - subtree: - (...) - -The **<NodeName>** is a description of what it represents (e.g. -'experiment-folder') and is used as identifier. - -**<type>** selects the converter that is going to be matched against the current structure -element. If the structure element matches (this is a combination of a typecheck and a detailed -match, see :py:class:`~caoscrawler.converters.Converter` for details) the converter is used -to generate records (see :py:meth:`~caoscrawler.converters.Converter.create_records`) and to possibly process a subtree, as defined by the function :func:`caoscrawler.converters.create_children`. 
- -**records** is a dict of definitions that define the semantic structure -(see details below). - -Subtree contains a list of Converter defnitions that look like the one -described here. - -Transform Functions -+++++++++++++++++++ -Often the situation arises, that you cannot use a value as it is found. Maybe a value should be -increased by an offset or a string should be split into a list of pieces. In order to allow such -simple conversions, transform functions can be named in the converter definition that are then -applied to the respective variables when the converter is executed. - -.. code-block:: yaml - - <NodeName>: - type: <ConverterName> - match: ".*" - transform: - <TransformNodeName>: - in: $<in_var_name> - out: $<out_var_name> - functions: - - <func_name>: # name of the function to be applied - <func_arg1>: <func_arg1_value> # key value pairs that are passed as parameters - <func_arg2>: <func_arg2_value> - # ... - -An example that splits the variable ``a`` and puts the generated list in ``b`` is the following: - -.. code-block:: yaml - - Experiment: - type: Dict - match: ".*" - transform: - param_split: - in: $a - out: $b - functions: - - split: # split is a function that is defined by default - marker: "|" # its only parameter is the marker that is used to split the string - records: - Report: - tags: $b - -This splits the string in '$a' and stores the resulting list in '$b'. This is here used to add a -list valued property to the Report Record. - - -There are a number of transform functions that are defined by default (see -``src/caoscrawler/default_transformers.yml``). You can define custom transform functions by adding -them to the cfood definition (see :doc:`CFood Documentation<cfood.rst>`). - - -Standard Converters -+++++++++++++++++++ - -Directory Converter -=================== -The Directory Converter creates StructureElements for each File and Directory -inside the current Directory. You can match a regular expression against the -directory name using the 'match' key. - -Simple File Converter -===================== -The Simple File Converter does not create any children and is usually used if -a file shall be used as it is and be inserted and referenced by other entities. - -Markdown File Converter -======================= -Reads a YAML header from Markdown files (if such a header exists) and creates -children elements according to the structure of the header. - -DictElement Converter -===================== -Creates a child StructureElement for each key in the dictionary. - -Typical Subtree converters --------------------------- -The following StructureElement are typically created: - -- BooleanElement -- FloatElement -- TextElement -- IntegerElement -- ListElement -- DictElement - -Scalar Value Converters -======================= -`BooleanElementConverter`, `FloatElementConverter`, `TextElementConverter`, and -`IntegerElementConverter` behave very similarly. - -These converters expect `match_name` and `match_value` in their definition -which allow to match the key and the value, respectively. - -Note that there are defaults for accepting other types. For example, -FloatElementConverter also accepts IntegerElements. The default -behavior can be adjusted with the fields `accept_text`, `accept_int`, -`accept_float`, and `accept_bool`. 
- -The following denotes what kind of StructureElements are accepted by default -(they are defined in `src/caoscrawler/converters.py`): - -- DictBooleanElementConverter: bool, int -- DictFloatElementConverter: int, float -- DictTextElementConverter: text, bool, int, float -- DictIntegerElementConverter: int -- DictListElementConverter: list -- DictDictElementConverter: dict - -YAMLFileConverter -================= - -A specialized Dict Converter for yaml files: Yaml files are opened and the contents are -converted into dictionaries that can be further converted using the typical subtree converters -of dict converter. - -**WARNING**: Currently unfinished implementation. - -JSONFileConverter -================= - - - - -TableConverter -============== - -A generic converter (abstract) for files containing tables. -Currently, there are two specialized implementations for xlsx-files and csv-files. - -All table converters generate a subtree that can be converted with DictDictElementConverters: -For each row in the table a DictDictElement (structure element) is generated. The key of the -element is the row number. The value of the element is a dict containing the mapping of -column names to values of the respective cell. - -Example: - -.. code-block:: yaml - - subtree: - TABLE: - type: CSVTableConverter - match: ^test_table.csv$ - records: - (...) # Records edited for the whole table file - subtree: - ROW: - type: DictDictElement - match_name: .* - match_value: .* - records: - (...) # Records edited for each row - subtree: - COLUMN: - type: DictFloatElement - match_name: measurement # Name of the column in the table file - match_value: (?P<column_value).*) - records: - (...) # Records edited for each cell - - -XLSXTableConverter -================== - -CSVTableConverter -================= - -Custom Converters -+++++++++++++++++ - -It was previously mentioned that it is possible to create custom converters. -These custom converters can be used to integrate arbitrary data extraction and ETL capabilities -into the caosdb-crawler and make these extensions available to any yaml specification. - -The basic syntax for adding a custom converter to a yaml cfood definition file is: - -.. code-block:: yaml - - Converters: - <NameOfTheConverterInYamlFile>: - package: <python>.<module>.<name> - converter: <PythonClassName> - -The Converters-section can be either put into the first or second document of the cfood yaml file. -It can be also part of a single-document yaml cfood file. Please refer to :doc:`the cfood documentation<cfood>` for more details. - -Details: - -- **<NameOfTheConverterInYamlFile>**: This is the name of the converter as it is going to be used in the present yaml file. -- **<python>.<module>.<name>**: The name of the module where the converter class resides. -- **<PythonClassName>**: Within this specified module there must be a class inheriting from base class :py:class:`caoscrawler.converters.Converter`. - -The following methods are abstract and need to be overwritten by your custom converter to make it work: - -- :py:meth:`~caoscrawler.converters.Converter.create_children` -- :py:meth:`~caoscrawler.converters.Converter.match` -- :py:meth:`~caoscrawler.converters.Converter.typecheck` - - -Example -======= - -In the following, we will explain the process of adding a custom converter to a yaml file using -a SourceResolver that is able to attach a source element to another entity. - -**Note**: This example might become a standard crawler soon, as part of the scifolder specification. 
See https://doi.org/10.3390/data5020043 for details. In this documentation example we will, therefore, add it to a package called "scifolder". - -First we will create our package and module structure, which might be: - -.. code-block:: - - scifolder_package/ - README.md - setup.cfg - setup.py - Makefile - tox.ini - src/ - scifolder/ - __init__.py - converters/ - __init__.py - sources.py # <- the actual file containing - # the converter class - doc/ - unittests/ - -Now we need to create a class called "SourceResolver" in the file "sources.py". In this - more advanced - example, we will not inherit our converter directly from :py:class:`~caoscrawler.converters.Converter`, but use :py:class:`~caoscrawler.converters.TextElementConverter`. The latter already implements :py:meth:`~caoscrawler.converters.Converter.match` and :py:meth:`~caoscrawler.converters.Converter.typecheck`, so only an implementation for :py:meth:`~caoscrawler.converters.Converter.create_children` has to be provided by us. -Furthermore we will customize the method :py:meth:`~caoscrawler.converters.Converter.create_records` that allows us to specify a more complex record generation procedure than provided in the standard implementation. One specific limitation of the standard implementation is, that only a fixed -number of records can be generated by the yaml definition. So for any applications - like here - that require an arbitrary number of records to be created, a customized implementation of :py:meth:`~caoscrawler.converters.Converter.create_records` is recommended. -In this context it is recommended to make use of the function :func:`caoscrawler.converters.create_records` that implements creation of record objects from python dictionaries of the same structure -that would be given using a yaml definition (see next section below). - -.. code-block:: python - - import re - from caoscrawler.stores import GeneralStore, RecordStore - from caoscrawler.converters import TextElementConverter, create_records - from caoscrawler.structure_elements import StructureElement, TextElement - - - class SourceResolver(TextElementConverter): - """ - This resolver uses a source list element (e.g. from the markdown readme file) - to link sources correctly. - """ - - def __init__(self, definition: dict, name: str, - converter_registry: dict): - """ - Initialize a new directory converter. 
- """ - super().__init__(definition, name, converter_registry) - - def create_children(self, generalStore: GeneralStore, - element: StructureElement): - - # The source resolver does not create children: - - return [] - - def create_records(self, values: GeneralStore, - records: RecordStore, - element: StructureElement, - file_path_prefix): - if not isinstance(element, TextElement): - raise RuntimeError() - - # This function must return a list containing tuples, each one for a modified - # property: (name_of_entity, name_of_property) - keys_modified = [] - - # This is the name of the entity where the source is going to be attached: - attach_to_scientific_activity = self.definition["scientific_activity"] - rec = records[attach_to_scientific_activity] - - # The "source" is a path to a source project, so it should have the form: - # /<Category>/<project>/<scientific_activity>/ - # obtain these information from the structure element: - val = element.value - regexp = (r'/(?P<category>(SimulationData)|(ExperimentalData)|(DataAnalysis))' - '/(?P<project_date>.*?)_(?P<project_identifier>.*)' - '/(?P<date>[0-9]{4,4}-[0-9]{2,2}-[0-9]{2,2})(_(?P<identifier>.*))?/') - - res = re.match(regexp, val) - if res is None: - raise RuntimeError("Source cannot be parsed correctly.") - - # Mapping of categories on the file system to corresponding record types in CaosDB: - cat_map = { - "SimulationData": "Simulation", - "ExperimentalData": "Experiment", - "DataAnalysis": "DataAnalysis"} - linkrt = cat_map[res.group("category")] - - keys_modified.extend(create_records(values, records, { - "Project": { - "date": res.group("project_date"), - "identifier": res.group("project_identifier"), - }, - linkrt: { - "date": res.group("date"), - "identifier": res.group("identifier"), - "project": "$Project" - }, - attach_to_scientific_activity: { - "sources": "+$" + linkrt - }}, file_path_prefix)) - - # Process the records section of the yaml definition: - keys_modified.extend( - super().create_records(values, records, element, file_path_prefix)) - - # The create_records function must return the modified keys to make it compatible - # to the crawler functions: - return keys_modified - - -If the recommended (python) package structure is used, the package containing the converter -definition can just be installed using `pip install .` or `pip install -e .` from the -`scifolder_package` directory. - -The following yaml block will register the converter in a yaml file: - -.. code-block:: yaml - - Converters: - SourceResolver: - package: scifolder.converters.sources - converter: SourceResolver - - -Using the `create_records` API function -======================================= - -The function :func:`caoscrawler.converters.create_records` was already mentioned above and it is -the recommended way to create new records from custom converters. Let's have a look at the -function signature: - -.. code-block:: python - - def create_records(values: GeneralStore, # <- pass the current variables store here - records: RecordStore, # <- pass the current store of CaosDB records here - def_records: dict): # <- This is the actual definition of new records! - - -`def_records` is the actual definition of new records according to the yaml cfood specification -(work in progress, in the docs). Essentially you can do everything here, that you could do -in the yaml document as well, but using python source code. - -Let's have a look at a few examples: - -.. 
code-block:: yaml - - DirConverter: - type: Directory - match: (?P<dir_name>.*) - records: - Experiment: - identifier: $dir_name - -This block will just create a new record with parent `Experiment` and one property -`identifier` with a value derived from the matching regular expression. - -Let's formulate that using `create_records`: - -.. code-block:: python - - dir_name = "directory name" - - record_def = { - "Experiment": { - "identifier": dir_name - } - } - - keys_modified = create_records(values, records, - record_def) - -The `dir_name` is set explicitely here, everything else is identical to the yaml statements. - - -The role of `keys_modified` -=========================== - -You probably have noticed already, that :func:`caoscrawler.converters.create_records` returns -`keys_modified` which is a list of tuples. Each element of `keys_modified` has two elements: - -- Element 0 is the name of the record that is modified (as used in the record store `records`). -- Element 1 is the name of the property that is modified. - -It is important, that the correct list of modified keys is returned by -:py:meth:`~caoscrawler.converters.Converter.create_records` to make the crawler process work. - -So, a sketch of a typical implementation within a custom converter could look like this: - - -.. code-block:: python - - def create_records(self, values: GeneralStore, - records: RecordStore, - element: StructureElement, - file_path_prefix: str): - - # Modify some records: - record_def = { - # ... - } - - keys_modified = create_records(values, records, - record_def) - - # You can of course do it multiple times: - keys_modified.extend(create_records(values, records, - record_def)) - - # You can also process the records section of the yaml definition: - keys_modified.extend( - super().create_records(values, records, element, file_path_prefix)) - # This essentially allows users of your converter to customize the creation of records - # by providing a custom "records" section additionally to the modifications provided - # in this implementation of the Converter. - - # Important: Return the list of modified keys! - return keys_modified - - -More complex example -==================== - -Let's have a look at a more complex examples, defining multiple records: - -.. code-block:: yaml - - DirConverter: - type: Directory - match: (?P<dir_name>.*) - records: - Project: - identifier: project_name - Experiment: - identifier: $dir_name - Project: $Project - ProjectGroup: - projects: +$Project - - -This block will create two new Records: - -- A project with a constant identifier -- An experiment with an identifier, derived from a regular expression and a reference to the new project. - -Furthermore a Record `ProjectGroup` will be edited (its initial definition is not given in the -yaml block): The project that was just created will be added as a list element to the property -`projects`. - -Let's formulate that using `create_records` (again, `dir_name` is constant here): - -.. code-block:: python - - dir_name = "directory name" - - record_def = { - "Project": { - "identifier": "project_name", - } - "Experiment": { - "identifier": dir_name, - "Project": "$Project", - } - "ProjectGroup": { - "projects": "+$Project", - } - - } - - keys_modified = create_records(values, records, - record_def) - -Debugging -========= - -You can add the key `debug_match` to the definition of a Converter in order to create debugging -output for the match step. The following snippet illustrates this: - -.. 
code-block:: yaml - - DirConverter: - type: Directory - match: (?P<dir_name>.*) - debug_match: True - records: - Project: - identifier: project_name - - -Whenever this Converter tries to match a StructureElement, it logs what was tried to macht against -what and what the result was. diff --git a/src/doc/converters/cfood_definition.rst b/src/doc/converters/cfood_definition.rst new file mode 100644 index 0000000000000000000000000000000000000000..ea2f14b23bec04e659aa3166f089c7d274f74811 --- /dev/null +++ b/src/doc/converters/cfood_definition.rst @@ -0,0 +1,53 @@ +CFood definition +++++++++++++++++ + +Converter application to data is specified via a tree-like yml file (called ``cfood.yml``, by +convention). The yml file specifies which Converters shall be used on which StructureElements, and +how to treat the generated *child* StructureElements. + +The yaml definition may look like this: + +.. todo:: + + This is outdated, see ``cfood-schema.yml`` for the current specification of a ``cfood.yml``. + +.. code-block:: yaml + + <NodeName>: + type: <ConverterName> + match: ".*" + records: + Experiment1: + parents: + - Experiment + - Blablabla + date: $DATUM + (...) + Experiment2: + parents: + - Experiment + subtree: + (...) + +The **<NodeName>** is a description of what the current block represents (e.g. +``experiment-folder``) and is used as an identifier. + +**<type>** selects the converter that is going to be matched against +the current structure element. If the structure element matches (this +is a combination of a typecheck and a detailed match, see the +:py:class:`~caoscrawler.converters.converters.Converter` source +documentation for details), the converter will: + +- generate records (with + :py:meth:`~caoscrawler.converters.converters.Converter.create_records`) +- possibly process a subtree (with + :py:meth:`~caoscrawler.converters.converters.Converter.create_children`) + +**match** *TODO* + +**records** is a dict of definitions that define the semantic structure +(see details below). + +**subtree** makes the yaml recursive: It contains a list of new Converter +definitions, which work on the StructureElements that are returned by the +current Converter. diff --git a/src/doc/converters/custom_converters.rst b/src/doc/converters/custom_converters.rst new file mode 100644 index 0000000000000000000000000000000000000000..2738d66c483148fdecb9b189edac45e5b9a55a8b --- /dev/null +++ b/src/doc/converters/custom_converters.rst @@ -0,0 +1,344 @@ +Custom Converters ++++++++++++++++++ + +As mentioned before it is possible to create custom converters. +These custom converters can be used to integrate arbitrary data extraction and ETL capabilities +into the LinkAhead crawler and make these extensions available to any yaml specification. + +Tell the crawler about a custom converter +========================================= + +To use a custom crawler, it must be defined in the ``Converters`` section of the CFood yaml file. +The basic syntax for adding a custom converter to a definition file is: + +.. code-block:: yaml + + Converters: + <NameOfTheConverterInYamlFile>: + package: <python>.<module>.<name> + converter: <PythonClassName> + +The Converters section can be either put into the first or the second +document of the cfood yaml file. It can be also part of a +single-document yaml cfood file. Please refer to :doc:`the cfood +documentation<../cfood>` for more details. + +Details: + +- **<NameOfTheConverterInYamlFile>**: This is the name of the converter as it is going to be used in the present yaml file. 
+- **<python>.<module>.<name>**: The name of the module where the converter class resides. +- **<PythonClassName>**: Within this specified module there must be a class inheriting from base class :py:class:`caoscrawler.converters.converters.Converter`. + +Implementing a custom converter +=============================== + +Converters inherit from the :py:class:`~caoscrawler.converters.converters.Converter` class. + +The following methods are abstract and need to be overwritten by your custom converter to make it work: + +:py:meth:`~caoscrawler.converters.converters.Converter.create_children`: + Return a list of child StructureElement objects. + +- :py:meth:`~caoscrawler.converters.converters.Converter.match` +- :py:meth:`~caoscrawler.converters.converters.Converter.typecheck` + + +Example +======= + +In the following, we will explain the process of adding a custom converter to a yaml file using +a SourceResolver that is able to attach a source element to another entity. + +**Note**: This example might become a standard crawler soon, as part of the scifolder specification. See https://doi.org/10.3390/data5020043 for details. In this documentation example we will, therefore, add it to a package called "scifolder". + +First we will create our package and module structure, which might be: + +.. code-block:: + + scifolder_package/ + README.md + setup.cfg + setup.py + Makefile + tox.ini + src/ + scifolder/ + __init__.py + converters/ + __init__.py + sources.py # <- the actual file containing + # the converter class + doc/ + unittests/ + +Now we need to create a class called "SourceResolver" in the file "sources.py". In this - more advanced - example, we will not inherit our converter directly from :py:class:`~caoscrawler.converters.converters.Converter`, but use :py:class:`~caoscrawler.converters.converters.TextElementConverter`. The latter already implements :py:meth:`~caoscrawler.converters.converters.Converter.match` and :py:meth:`~caoscrawler.converters.converters.Converter.typecheck`, so only an implementation for :py:meth:`~caoscrawler.converters.converters.Converter.create_children` has to be provided by us. +Furthermore we will customize the method :py:meth:`~caoscrawler.converters.converters.Converter.create_records` that allows us to specify a more complex record generation procedure than provided in the standard implementation. One specific limitation of the standard implementation is, that only a fixed +number of records can be generated by the yaml definition. So for any applications - like here - that require an arbitrary number of records to be created, a customized implementation of :py:meth:`~caoscrawler.converters.converters.Converter.create_records` is recommended. +In this context it is recommended to make use of the function :func:`caoscrawler.converters.converters.create_records` that implements creation of record objects from python dictionaries of the same structure +that would be given using a yaml definition (see next section below). + +.. code-block:: python + + import re + from caoscrawler.stores import GeneralStore, RecordStore + from caoscrawler.converters import TextElementConverter, create_records + from caoscrawler.structure_elements import StructureElement, TextElement + + + class SourceResolver(TextElementConverter): + """ + This resolver uses a source list element (e.g. from the markdown readme file) + to link sources correctly. + """ + + def __init__(self, definition: dict, name: str, + converter_registry: dict): + """ + Initialize a new directory converter. 
+ """ + super().__init__(definition, name, converter_registry) + + def create_children(self, generalStore: GeneralStore, + element: StructureElement): + + # The source resolver does not create children: + + return [] + + def create_records(self, values: GeneralStore, + records: RecordStore, + element: StructureElement, + file_path_prefix): + if not isinstance(element, TextElement): + raise RuntimeError() + + # This function must return a list containing tuples, each one for a modified + # property: (name_of_entity, name_of_property) + keys_modified = [] + + # This is the name of the entity where the source is going to be attached: + attach_to_scientific_activity = self.definition["scientific_activity"] + rec = records[attach_to_scientific_activity] + + # The "source" is a path to a source project, so it should have the form: + # /<Category>/<project>/<scientific_activity>/ + # obtain these information from the structure element: + val = element.value + regexp = (r'/(?P<category>(SimulationData)|(ExperimentalData)|(DataAnalysis))' + '/(?P<project_date>.*?)_(?P<project_identifier>.*)' + '/(?P<date>[0-9]{4,4}-[0-9]{2,2}-[0-9]{2,2})(_(?P<identifier>.*))?/') + + res = re.match(regexp, val) + if res is None: + raise RuntimeError("Source cannot be parsed correctly.") + + # Mapping of categories on the file system to corresponding record types in CaosDB: + cat_map = { + "SimulationData": "Simulation", + "ExperimentalData": "Experiment", + "DataAnalysis": "DataAnalysis"} + linkrt = cat_map[res.group("category")] + + keys_modified.extend(create_records(values, records, { + "Project": { + "date": res.group("project_date"), + "identifier": res.group("project_identifier"), + }, + linkrt: { + "date": res.group("date"), + "identifier": res.group("identifier"), + "project": "$Project" + }, + attach_to_scientific_activity: { + "sources": "+$" + linkrt + }}, file_path_prefix)) + + # Process the records section of the yaml definition: + keys_modified.extend( + super().create_records(values, records, element, file_path_prefix)) + + # The create_records function must return the modified keys to make it compatible + # to the crawler functions: + return keys_modified + + +If the recommended (python) package structure is used, the package containing the converter +definition can just be installed using `pip install .` or `pip install -e .` from the +`scifolder_package` directory. + +The following yaml block will register the converter in a yaml file: + +.. code-block:: yaml + + Converters: + SourceResolver: + package: scifolder.converters.sources + converter: SourceResolver + + +Using the `create_records` API function +======================================= + +The function :func:`caoscrawler.converters.converters.create_records` was already mentioned above and it is +the recommended way to create new records from custom converters. Let's have a look at the +function signature: + +.. code-block:: python + + def create_records(values: GeneralStore, # <- pass the current variables store here + records: RecordStore, # <- pass the current store of CaosDB records here + def_records: dict): # <- This is the actual definition of new records! + + +`def_records` is the actual definition of new records according to the yaml cfood specification +(work in progress, in the docs). Essentially you can do everything here, that you could do +in the yaml document as well, but using python source code. + +Let's have a look at a few examples: + +.. 
code-block:: yaml
+
+   DirConverter:
+     type: Directory
+     match: (?P<dir_name>.*)
+     records:
+       Experiment:
+         identifier: $dir_name
+
+This block will just create a new record with parent `Experiment` and one property
+`identifier` with a value derived from the matching regular expression.
+
+Let's formulate that using `create_records`:
+
+.. code-block:: python
+
+   dir_name = "directory name"
+
+   record_def = {
+       "Experiment": {
+           "identifier": dir_name
+       }
+   }
+
+   keys_modified = create_records(values, records,
+                                  record_def)
+
+The `dir_name` is set explicitly here, everything else is identical to the yaml statements.
+
+
+The role of `keys_modified`
+===========================
+
+You have probably noticed already that :func:`caoscrawler.converters.converters.create_records` returns
+`keys_modified` which is a list of tuples. Each element of `keys_modified` has two elements:
+
+- Element 0 is the name of the record that is modified (as used in the record store `records`).
+- Element 1 is the name of the property that is modified.
+
+It is important that the correct list of modified keys is returned by
+:py:meth:`~caoscrawler.converters.converters.Converter.create_records` to make the crawler process work.
+
+So, a sketch of a typical implementation within a custom converter could look like this:
+
+
+.. code-block:: python
+
+   def create_records(self, values: GeneralStore,
+                      records: RecordStore,
+                      element: StructureElement,
+                      file_path_prefix: str):
+
+       # Modify some records:
+       record_def = {
+           # ...
+       }
+
+       keys_modified = create_records(values, records,
+                                      record_def)
+
+       # You can of course do it multiple times:
+       keys_modified.extend(create_records(values, records,
+                                           record_def))
+
+       # You can also process the records section of the yaml definition:
+       keys_modified.extend(
+           super().create_records(values, records, element, file_path_prefix))
+       # This essentially allows users of your converter to customize the creation of records
+       # by providing a custom "records" section additionally to the modifications provided
+       # in this implementation of the Converter.
+
+       # Important: Return the list of modified keys!
+       return keys_modified
+
+
+More complex example
+====================
+
+Let's have a look at a more complex example, defining multiple records:
+
+.. code-block:: yaml
+
+   DirConverter:
+     type: Directory
+     match: (?P<dir_name>.*)
+     records:
+       Project:
+         identifier: project_name
+       Experiment:
+         identifier: $dir_name
+         Project: $Project
+       ProjectGroup:
+         projects: +$Project
+
+
+This block will create two new Records:
+
+- A project with a constant identifier
+- An experiment with an identifier, derived from a regular expression and a reference to the new project.
+
+Furthermore, a Record `ProjectGroup` will be edited (its initial definition is not given in the
+yaml block): The project that was just created will be added as a list element to the property
+`projects`.
+
+Let's formulate that using `create_records` (again, `dir_name` is constant here):
+
+.. code-block:: python
+
+   dir_name = "directory name"
+
+   record_def = {
+       "Project": {
+           "identifier": "project_name",
+       },
+       "Experiment": {
+           "identifier": dir_name,
+           "Project": "$Project",
+       },
+       "ProjectGroup": {
+           "projects": "+$Project",
+       },
+   }
+
+   keys_modified = create_records(values, records,
+                                  record_def)
+
+Debugging
+=========
+
+You can add the key `debug_match` to the definition of a Converter in order to create debugging
+output for the match step. The following snippet illustrates this:
+
+.. 
code-block:: yaml + + DirConverter: + type: Directory + match: (?P<dir_name>.*) + debug_match: True + records: + Project: + identifier: project_name + + +Whenever this Converter tries to match a StructureElement, it logs what was tried to macht against +what and what the result was. diff --git a/src/doc/converters/further_converters.rst b/src/doc/converters/further_converters.rst new file mode 100644 index 0000000000000000000000000000000000000000..a334c8778f440e108fd141b0fc53ec06765deb8c --- /dev/null +++ b/src/doc/converters/further_converters.rst @@ -0,0 +1,100 @@ +Further converters +++++++++++++++++++ + +More converters, together with cfood definitions and examples can be found in +the `LinkAhead Crawler Extensions Subgroup +<https://gitlab.com/linkahead/crawler-extensions>`_ on gitlab. In the following, +we list converters that are shipped with the crawler library itself but are not +part of the set of standard converters and may require this library to be +installed with additional optional dependencies. + +HDF5 Converters +=============== + +For treating `HDF5 Files +<https://docs.hdfgroup.org/hdf5/develop/_s_p_e_c.html>`_, there are in total +four individual converters corresponding to the internal structure of HDF5 +files: the :ref:`H5FileConverter` which opens the file itself and creates +further structure elements from HDF5 groups, datasets, and included +multi-dimensional arrays that are in turn treated by the +:ref:`H5GroupConverter`, the :ref:`H5DatasetConverter`, and the +:ref:`H5NdarrayConverter`, respectively. You need to install the LinkAhead +crawler with its optional ``h5-crawler`` dependency for using these converters. + +The basic idea when crawling HDF5 files is to treat them very similar to +:ref:`dictionaries <DictElement Converter>` in which the attributes on root, +group, or dataset level are essentially treated like ``BooleanElement``, +``TextElement``, ``FloatElement``, and ``IntegerElement`` in a dictionary: They +are appended as children and can be accessed via the ``subtree``. The file +itself and the groups within may contain further groups and datasets, which can +have their own attributes, subgroups, and datasets, very much like +``DictElements`` within a dictionary. The main difference to any other +dictionary type is the presence of multi-dimensional arrays within HDF5 +datasets. Since LinkAhead doesn't have any datatype corresponding to these, and +since it isn't desirable to store these arrays directly within LinkAhead for +reasons of performance and of searchability, we wrap them within a specific +Record as explained :ref:`below <H5NdarrayConverter>`, together with more +metadata and their internal path within the HDF5 file. Users can thus query for +datasets and their arrays according to their metadata within LinkAhead and then +use the internal path information to access the dataset within the file +directly. The type of this record and the property for storing the internal path +need to be reflected in the datamodel. Using the default names, you would need a +datamodel like + +.. code-block:: yaml + + H5Ndarray: + obligatory_properties: + internal_hdf5-path: + datatype: TEXT + +although the names of both property and record type can be configured within the +cfood definition. 
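+
+Before looking at the full example referenced below, the following sketch shows how such a cfood
+could be structured. The converter registration uses the package paths given in the upgrade notes
+of this repository; the ``match``/``match_name`` details and the record usage are only
+illustrative assumptions and should be checked against the unit test cfood referenced in the next
+paragraph.
+
+.. code-block:: yaml
+
+   # Sketch of a cfood skeleton for HDF5 files (details are assumptions).
+   Converters:
+     H5File:
+       converter: H5FileConverter
+       package: caoscrawler.converters.hdf5_converter
+     H5Group:
+       converter: H5GroupConverter
+       package: caoscrawler.converters.hdf5_converter
+     H5Dataset:
+       converter: H5DatasetConverter
+       package: caoscrawler.converters.hdf5_converter
+     H5Ndarray:
+       converter: H5NdarrayConverter
+       package: caoscrawler.converters.hdf5_converter
+
+   ParamFile:
+     type: H5File
+     match: .*\.hdf5$
+     subtree:
+       ParamData:
+         type: H5Dataset
+         match_name: .*
+         subtree:
+           ParamArray:
+             type: H5Ndarray
+             recordname: ndarray_record
+             records:
+               Measurement:                  # hypothetical RecordType
+                 array: $ndarray_record      # reference to the wrapper record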
+ +A simple example of a cfood definition for HDF5 files can be found in the `unit +tests +<https://gitlab.com/linkahead/linkahead-crawler/-/blob/main/unittests/h5_cfood.yml?ref_type=heads>`_ +and shows how the individual converters are used in order to crawl a `simple +example file +<https://gitlab.com/linkahead/linkahead-crawler/-/blob/main/unittests/hdf5_dummy_file.hdf5?ref_type=heads>`_ +containing groups, subgroups, and datasets, together with their respective +attributes. + +H5FileConverter +--------------- + +This is an extension of the +:py:class:`~caoscrawler.converters.converters.SimpleFileConverter` +class. It opens the HDF5 file and creates children for any contained +group or dataset. Additionally, the root-level attributes of the HDF5 +file are accessible as children. + +H5GroupConverter +---------------- + +This is an extension of the +:py:class:`~caoscrawler.converters.converters.DictElementConverter` +class. Children are created for all subgroups and datasets in this +HDF5 group. Additionally, the group-level attributes are accessible as +children. + +H5DatasetConverter +------------------ + +This is an extension of the +:py:class:`~caoscrawler.converters.converters.DictElementConverter` +class. Most importantly, it stores the array data in HDF5 dataset into +:py:class:`~caoscrawler.converters.hdf5_converter.H5NdarrayElement` +which is added to its children, as well as the dataset attributes. + +H5NdarrayConverter +------------------ + +This converter creates a wrapper record for the contained dataset. The name of +this record needs to be specified in the cfood definition of this converter via +the ``recordname`` option. The RecordType of this record can be configured with +the ``array_recordtype_name`` option and defaults to ``H5Ndarray``. Via the +given ``recordname``, this record can be used within the cfood. Most +importantly, this record stores the internal path of this array within the HDF5 +file in a text property, the name of which can be configured with the +``internal_path_property_name`` option which defaults to ``internal_hdf5_path``. diff --git a/src/doc/converters/index.rst b/src/doc/converters/index.rst new file mode 100644 index 0000000000000000000000000000000000000000..38fc11335a2640f645e9b4e093690d1ffa7cd07f --- /dev/null +++ b/src/doc/converters/index.rst @@ -0,0 +1,29 @@ +Converters +)))))))))) + +Converters treat a StructureElement and during this process create a number of new +StructureElements: the children of the initially treated StructureElement. Thus by treatment of +existing StructureElements, Converters create a tree of StructureElements. + +.. image:: ../img/converter.png + :height: 170 + :alt: Converters are Python classes that tell the crawler how to + interprete StructureElements. + +Each StructureElement in the tree has a set of properties, organized as +key-value pairs. +Some of those properties are specified by the type of StructureElement. For example, +a file could have the file name as property: ``'filename': myfile.dat``. +Converters may define additional functions that create further values. For +example, a regular expression could be used to get a date from a file name. + +.. 
toctree:: + :maxdepth: 1 + :caption: Contents: + + CFood definition<cfood_definition> + Standard converters<standard_converters> + Further converters<further_converters> + Custom converters<custom_converters> + Transform functions<transform_functions> + diff --git a/src/doc/converters/standard_converters.rst b/src/doc/converters/standard_converters.rst new file mode 100644 index 0000000000000000000000000000000000000000..5f86abb5b324e0cc1584e42e6abb2612acc8067f --- /dev/null +++ b/src/doc/converters/standard_converters.rst @@ -0,0 +1,404 @@ +Standard Converters ++++++++++++++++++++ + +These are the standard converters that exist in a default installation. For writing and applying +*custom converters*, see :ref:`below <Custom Converters>`. + +Directory Converter +=================== + +The Directory Converter creates StructureElements for each File and +Directory inside the current Directory. You can match a regular +expression against the directory name using the 'match' key. + +With the optional ``match_newer_than_file`` key, a path to file +containing only an ISO-formatted datetime string can be specified. If +this is done, a directory will only match if it contains at least one +file or directory that has been modified since that datetime. If the +file doesn't exist or contains an invalid string, the directory will +be matched regardless of the modification times. + +Simple File Converter +===================== +The Simple File Converter does not create any children and is usually used if +a file shall be used as it is and be inserted and referenced by other entities. + +Markdown File Converter +======================= +Reads a YAML header from Markdown files (if such a header exists) and creates +children elements according to the structure of the header. + +DictElement Converter +===================== + +DictElement → StructureElement + +Creates a child StructureElement for each key in the dictionary. + +Typical Subtree converters +-------------------------- +The following StructureElement types are typically created by the DictElement converter: + +- BooleanElement +- FloatElement +- TextElement +- IntegerElement +- ListElement +- DictElement + +Note that you may use ``TextElement`` for anything that exists in a text format that can be +interpreted by the server, such as date and datetime strings in ISO-8601 format. + +match_properties +---------------- + +`match_properties` is a dictionary of key-regexps and value-regexp pairs and can be used to +match direct properties of a `DictElement`. Each key matches +a property name and the corresponding value matches its property value. + +Example: +........ + +.. code-block:: json + + { + "@type": "PropertyValue", + "additionalType": "str", + "propertyID": "testextra", + "value": "hi" + } + +When applied to a dict loaded from the above json, a `DictElementConverter` with the following definition: + +.. code-block:: yaml + + Example: + type: DictElement + match_properties: + additionalType: (?P<addt>.*)$ + property(.*): (?P<propid>.*)$ + +will match and create two variables: + +- `addt = "str"` +- `propid = "testextra"` + + +Scalar Value Converters +======================= +`BooleanElementConverter`, `FloatElementConverter`, `TextElementConverter`, and +`IntegerElementConverter` behave very similarly. + +These converters expect `match_name` and `match_value` in their definition +which allow to match the key and the value, respectively. + +Note that there are defaults for accepting other types. 
For example,
+FloatElementConverter also accepts IntegerElements. The default
+behavior can be adjusted with the fields `accept_text`, `accept_int`,
+`accept_float`, and `accept_bool`.
+
+The following denotes what kind of StructureElements are accepted by default
+(they are defined in `src/caoscrawler/converters.py`):
+
+- BooleanElementConverter: bool, int
+- FloatElementConverter: int, float
+- TextElementConverter: text, bool, int, float
+- IntegerElementConverter: int
+- ListElementConverter: list
+- DictElementConverter: dict
+
+YAMLFileConverter
+=================
+
+A specialized Dict Converter for yaml files: Yaml files are opened and the contents are
+converted into dictionaries that can be further converted using the typical subtree converters
+of the dict converter.
+
+**WARNING**: Currently unfinished implementation.
+
+JSONFileConverter
+=================
+
+Analogous to the YAMLFileConverter: JSON files are opened and their contents are converted
+into dictionaries that can be further converted using the typical subtree converters of the
+dict converter.
+
+TableConverter
+==============
+
+Table → DictElement
+
+A generic converter (abstract) for files containing tables.
+Currently, there are two specialized implementations for XLSX files and CSV files.
+
+All table converters generate a subtree of dicts, which in turn can be converted with DictElementConverters:
+For each row in the table the TableConverter generates a DictElement (structure element). The key of the
+element is the row number. The value of the element is a dict containing the mapping of
+column names to values of the respective cell.
+
+Example:
+
+.. code-block:: yaml
+
+   subtree:
+     TABLE:  # Any name for the table as a whole
+       type: CSVTableConverter
+       match: ^test_table.csv$
+       records:
+         (...)  # Records edited for the whole table file
+       subtree:
+         ROW:  # Any name for a data row in the table
+           type: DictElement
+           match_name: .*
+           match_value: .*
+           records:
+             (...)  # Records edited for each row
+           subtree:
+             COLUMN:  # Any name for a specific type of column in the table
+               type: FloatElement
+               match_name: measurement  # Name of the column in the table file
+               match_value: (?P<column_value>.*)
+               records:
+                 (...)  # Records edited for each cell
+
+
+XLSXTableConverter
+==================
+
+XLSX File → DictElement
+
+CSVTableConverter
+=================
+
+CSV File → DictElement
+
+PropertiesFromDictConverter
+===========================
+
+The :py:class:`~caoscrawler.converters.converters.PropertiesFromDictConverter` is
+a specialization of the
+:py:class:`~caoscrawler.converters.converters.DictElementConverter` and offers
+all its functionality. It is meant to operate on dictionaries (e.g.,
+from reading in a json or a table file), the keys of which correspond
+closely to properties in a LinkAhead datamodel. This is especially
+handy in cases where properties may be added to the data model and
+data sources that are not yet known when writing the cfood definition.
+
+The converter definition of the
+:py:class:`~caoscrawler.converters.converters.PropertiesFromDictConverter` has an
+additional required entry ``record_from_dict`` which specifies the
+Record to which the properties extracted from the dict are attached.
+This Record is identified by its ``variable_name`` by which it can
+be referred to further down the subtree. You can also use the name of
+a Record that was specified earlier in the CFood definition in order
+to extend it by the properties extracted from a dict. Let's have a
+look at a simple example. A CFood definition
+
+.. 
code-block:: yaml + + PropertiesFromDictElement: + type: PropertiesFromDictElement + match: ".*" + record_from_dict: + variable_name: MyRec + parents: + - MyType1 + - MyType2 + +applied to a dictionary + +.. code-block:: json + + { + "name": "New name", + "a": 5, + "b": ["a", "b", "c"], + "author": { + "full_name": "Silvia Scientist" + } + } + +will create a Record ``New name`` with parents ``MyType1`` and +``MyType2``. It has a scalar property ``a`` with value 5, a list +property ``b`` with values "a", "b" and "c", and an ``author`` +property which references an ``author`` with a ``full_name`` property +with value "Silvia Scientist": + +.. image:: ../img/properties-from-dict-records-author.png + :height: 210 + :alt: A Record "New Name" and an author Record with full_name + "Silvia Scientist" are generated and filled automatically. + +Note how the different dictionary keys are handled differently +depending on their types: scalar and list values are understood +automatically, and a dictionary-valued entry like ``author`` is +translated into a reference to an ``author`` Record automatically. + +You can further specify how references are treated with an optional +``references key`` in ``record_from_dict``. Let's assume that in the +above example, we have an ``author`` **Property** with datatype +``Person`` in our data model. We could add this information by +extending the above example definition by + + +.. code-block:: yaml + + PropertiesFromDictElement: + type: PropertiesFromDictElement + match: ".*" + record_from_dict: + variable_name: MyRec + parents: + - MyType1 + - MyType2 + references: + author: + parents: + - Person + +so that now, a ``Person`` record with a ``full_name`` property with +value "Silvia Scientist" is created as the value of the ``author`` +property: + +.. image:: ../img/properties-from-dict-records-person.png + :height: 200 + :alt: A new Person Record is created which is referenced as an + author. + +For the time being, only the parents of the referenced record can be +set via this option. More complicated treatments can be implemented +via the ``referenced_record_callback`` (see below). + +Properties can be blacklisted with the ``properties_blacklist`` +keyword, i.e., all keys listed under ``properties_blacklist`` will be +excluded from automated treatment. Since the +:py:class:`~caoscrawler.converters.converters.PropertiesFromDictConverter` has +all the functionality of the +:py:class:`~caoscrawler.converters.converters.DictElementConverter`, individual +properties can still be used in a subtree. Together with +``properties_blacklist`` this can be used to add custom treatment to +specific properties by blacklisting them in ``record_from_dict`` and +then treating them in the subtree the same as you would do it in the +standard +:py:class:`~caoscrawler.converters.converters.DictElementConverter`. Note that +the blacklisted keys are excluded on **all** levels of the dictionary, +i.e., also when they occur in a referenced entity. + +For further customization, the +:py:class:`~caoscrawler.converters.converters.PropertiesFromDictConverter` +can be used as a basis for :ref:`custom converters<Custom Converters>` +which can make use of its ``referenced_record_callback`` argument. The +``referenced_record_callback`` can be a callable object which takes +exactly a Record as an argument and needs to return that Record after +doing whatever custom treatment is needed. 
Additionally, it is given +the ``RecordStore`` and the ``ValueStore`` in order to be able to +access the records and values that have already been defined from +within ``referenced_record_callback``. Such a function might look the +following: + +.. code-block:: python + + def my_callback(rec: db.Record, records: RecordStore, values: GeneralStore): + # do something with rec, possibly using other records or values from the stores... + rec.description = "This was updated in a callback" + return rec + +It is applied to all Records that are created from the dictionary and +it can be used to, e.g., transform values of some properties, or add +special treatment to all Records of a specific +type. ``referenced_record_callback`` is applied **after** the +properties from the dictionary have been applied as explained above. + +XML Converters +============== + +There are the following converters for XML content: + + +XMLFileConverter +---------------- + +This is a converter that loads an XML file and creates an XMLElement containing the +root element of the XML tree. It can be matched in the subtree using the XMLTagConverter. + +XMLTagConverter +--------------- + +The XMLTagConverter is a generic converter for XMLElements with the following main features: + +- It allows to match a combination of tag name, attribute names and text contents using the keys: + + - ``match_tag``: regexp, default empty string + - ``match_attrib``: dictionary of key-regexps and value-regexp + pairs. Each key matches an attribute name and the corresponding + value matches its attribute value. + - ``match_text``: regexp, default empty string +- It allows to traverse the tree using XPath (using Python lxml's xpath functions): + + - The key ``xpath`` is used to set the xpath expression and has a + default of ``child::*``. Its default would generate just the list of + sub nodes of the current node. The result of the xpath expression + is used to generate structure elements as children. It furthermore + uses the keys ``tags_as_children``, ``attribs_as_children`` and + ``text_as_children`` to decide which information from the found + nodes will be used as children: + - ``tags_as_children``: (default ``true``) For each xml tag element + found by the xpath expression, generate one XMLTag structure + element. Its name is the full path to the tag using the function + ``getelementpath`` from ``lxml``. + - ``attribs_as_children``: (default ``false``) For each xml tag element + found by the xpath expression, generate one XMLAttributeNode + structure element for each of its attributes. The name of the + respective attribute node has the form: ``<full path of the tag> @ + <name of the attribute>`` **Please note:** Currently, there is no + converter implemented that can match XMLAttributeNodes. + - ``text_as_children``: (default ``false``) For each xml tag element + found by the xpath expression, generate one XMLTextNode structure + element containing the text content of the tag element. Note that + in case of multiple text elements, only the first one is + added. The name of the respective attribute node has the form: + ``<full path of the tag> /text()`` to the tag using the function + ``getelementpath`` from ``lxml``. **Please note:** Currently, there is + no converter implemented that can match XMLAttributeNodes. + +Namespaces +********** + +The default is to take the namespace map from the current node and use +it in xpath queries. 
Because default namespaces cannot be handled by +xpath, it is possible to remap the default namespace using the key +``default_namespace``. The key ``nsmap`` can be used to define +additional nsmap entries. + +XMLTextNodeConverter +-------------------- + +In the future, this converter can be used to match XMLTextNodes that +are generated by the XMLTagConverter. + + +ZipFileConverter +================ + +This converter opens zip files, unzips them into a temporary directory and +exposes its contents as File structure elements. + +Usage Example: +-------------- + +.. code-block:: yaml + + ExampleZipFile: + type: ZipFile + match: example\.zip$ + subtree: + DirInsideZip: + type: Directory + match: experiments$ + FileInsideZip: + type: File + match: description.odt$ + +This converter will match and open files called ``example.zip``. If +the file contains a directory called ``experiments`` it will be +processed further by the respective converter in the subtree. The same +is true for a file called ``description.odt``. diff --git a/src/doc/converters/transform_functions.rst b/src/doc/converters/transform_functions.rst new file mode 100644 index 0000000000000000000000000000000000000000..ecd47d2dc004c6f1382279901dfec2d96e0e4a2d --- /dev/null +++ b/src/doc/converters/transform_functions.rst @@ -0,0 +1,72 @@ +Transform Functions ++++++++++++++++++++ +Often the situation arises, that you cannot use a value as it is found. Maybe a value should be +increased by an offset or a string should be split into a list of pieces. In order to allow such +simple conversions, transform functions can be named in the converter definition that are then +applied to the respective variables when the converter is executed. + +.. code-block:: yaml + + <NodeName>: + type: <ConverterName> + match: ".*" + transform: + <TransformNodeName>: + in: $<in_var_name> + out: $<out_var_name> + functions: + - <func_name>: # name of the function to be applied + <func_arg1>: <func_arg1_value> # key value pairs that are passed as parameters + <func_arg2>: <func_arg2_value> + # ... + +An example that splits the variable ``a`` and puts the generated list in ``b`` is the following: + +.. code-block:: yaml + + Experiment: + type: Dict + match: ".*" + transform: + param_split: + in: $a + out: $b + functions: + - split: # split is a function that is defined by default + marker: "|" # its only parameter is the marker that is used to split the string + records: + Report: + tags: $b + +This splits the string in '$a' and stores the resulting list in +'$b'. This is here used to add a list valued property to the Report +Record. Note that from LinkAhead Crawler 0.11.0 onwards, the value of +``marker`` in the above example can also be read in from a variable in +the usual ``$`` notation: + +.. code-block:: yaml + + # ... variable ``separator`` is defined somewhere above this part, e.g., + # by reading a config file. + Experiment: + type: Dict + match: ".*" + transform: + param_split: + in: $a + out: $b + functions: + - split: + marker: $separator # Now the separator is read in from a + # variable, so we can, e.g., change from + # '|' to ';' without changing the cfood + # definition. + records: + Report: + tags: $b + + + +There are a number of transform functions that are defined by default (see +``src/caoscrawler/default_transformers.yml``). You can define custom transform functions by adding +them to the cfood definition (see :doc:`CFood Documentation<../cfood>`). 
diff --git a/src/doc/getting_started/furtherreading.rst b/src/doc/getting_started/furtherreading.rst new file mode 100644 index 0000000000000000000000000000000000000000..8d8d3ecc4b5575f71e90e9e5a17b060a63403a07 --- /dev/null +++ b/src/doc/getting_started/furtherreading.rst @@ -0,0 +1,9 @@ +Further reading +=============== + +- A simple `documented example <https://gitlab.com/caosdb/documented-crawler-example>`_ which + demonstrates the crawler usage. +- Some useful examples can be found in the `integration tests + <https://gitlab.com/caosdb/caosdb-crawler/-/tree/main/integrationtests>`_ (and to a certain extent + in the unit tests). +- TODO: Information on caching diff --git a/src/doc/getting_started/helloworld.md b/src/doc/getting_started/helloworld.md index 723fb88d08047350d9f4bc3d3d2bd84ec9b27efb..67fdf88974391ac6209f1010bfb4f2d883e51021 100644 --- a/src/doc/getting_started/helloworld.md +++ b/src/doc/getting_started/helloworld.md @@ -33,7 +33,7 @@ Then you can do the following interactively in (I)Python. But we recommend that copy the code into a script and execute it to spare yourself typing. ```python -import caosdb as db +import linkahead as db from datetime import datetime from caoscrawler import Crawler, SecurityMode from caoscrawler.identifiable_adapters import CaosDBIdentifiableAdapter diff --git a/src/doc/getting_started/index.rst b/src/doc/getting_started/index.rst index 490c705f2feb9eeedc399e8c1d91e28abcd7fd12..86b34d069391b146d15599228067df2e9e41d642 100644 --- a/src/doc/getting_started/index.rst +++ b/src/doc/getting_started/index.rst @@ -10,6 +10,7 @@ Getting Started prerequisites helloworld optionalfeatures + furtherreading This section will help you get going! From the first installation steps to the first simple crawl. diff --git a/src/doc/getting_started/optionalfeatures.rst b/src/doc/getting_started/optionalfeatures.rst index d326d7fce6f77a0278c9f2d05a641888203a2089..7b77646501d677b7a99799b97fae752107b11d6f 100644 --- a/src/doc/getting_started/optionalfeatures.rst +++ b/src/doc/getting_started/optionalfeatures.rst @@ -30,6 +30,13 @@ to decide what tool is used for sending mails (use the upper one if you want to actually send mails. See ``sendmail`` configuration in the LinkAhead docs. +You can even supply the name of a custom CSS file that shall be used: + +.. 
code:: ini + + [advancedtools] + crawler.customcssfile = theme-research.css + Crawler Status Records ---------------------- diff --git a/src/doc/how-to-upgrade.md b/src/doc/how-to-upgrade.md index 30d23f8f3a4ad88f6b3f4fca18013e26fbcb1dc1..8af805ea30cc85cdde88d789ee3538b2bbaef7e3 100644 --- a/src/doc/how-to-upgrade.md +++ b/src/doc/how-to-upgrade.md @@ -1,5 +1,45 @@ # How to upgrade + +## 0.8.x to 0.9.0 + +If you were using the optional HDF5 converter classes, you need to +adapt the package path in your cfood definition from the **old** + +```yaml +Converters: + H5Dataset: + converter: H5DatasetConverter + package: caoscrawler.hdf5_converter + H5File: + converter: H5FileConverter + package: caoscrawler.hdf5_converter + H5Group: + converter: H5GroupConverter + package: caoscrawler.hdf5_converter + H5Ndarray: + converter: H5NdarrayConverter + package: caoscrawler.hdf5_converter +``` + +to the **new** paths: + +```yaml +Converters: + H5Dataset: + converter: H5DatasetConverter + package: caoscrawler.converters.hdf5_converter + H5File: + converter: H5FileConverter + package: caoscrawler.converters.hdf5_converter + H5Group: + converter: H5GroupConverter + package: caoscrawler.converters.hdf5_converter + H5Ndarray: + converter: H5NdarrayConverter + package: caoscrawler.converters.hdf5_converter +``` + ## 0.6.x to 0.7.0 If you added Parents to Records at multiple places in the CFood, you must now do this at a single location because this key now overwrites previously set diff --git a/src/doc/img/converter.png b/src/doc/img/converter.png new file mode 100644 index 0000000000000000000000000000000000000000..c11517a32ceb164510a7731ff0516d19db71801a Binary files /dev/null and b/src/doc/img/converter.png differ diff --git a/src/doc/img/converter.svg b/src/doc/img/converter.svg new file mode 100644 index 0000000000000000000000000000000000000000..af32ff69cdd6c25805f929458556310b3ee34f41 --- /dev/null +++ b/src/doc/img/converter.svg @@ -0,0 +1,442 @@ +<?xml version="1.0" encoding="UTF-8" standalone="no"?> +<svg + xmlns:dc="http://purl.org/dc/elements/1.1/" + xmlns:cc="http://creativecommons.org/ns#" + xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" + xmlns:svg="http://www.w3.org/2000/svg" + xmlns="http://www.w3.org/2000/svg" + xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd" + xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape" + width="72.854424mm" + height="33.470383mm" + viewBox="0 0 72.854423 33.470383" + version="1.1" + id="svg13434" + inkscape:version="1.0.2 (e86c870879, 2021-01-15)" + sodipodi:docname="converter.svg" + inkscape:export-filename="/home/daniel/indiscale/software/linkahead/caosdb-crawler/src/doc/img/converter.png" + inkscape:export-xdpi="299.83078" + inkscape:export-ydpi="299.83078"> + <defs + id="defs13428"> + <marker + style="overflow:visible;" + id="marker1559" + refX="0.0" + refY="0.0" + orient="auto" + inkscape:stockid="Arrow2Mend" + inkscape:isstock="true"> + <path + transform="scale(0.6) rotate(180) translate(0,0)" + d="M 8.7185878,4.0337352 L -2.2072895,0.016013256 L 8.7185884,-4.0017078 C 6.9730900,-1.6296469 6.9831476,1.6157441 8.7185878,4.0337352 z " + style="fill-rule:evenodd;stroke-width:0.625;stroke-linejoin:round;stroke:#000000;stroke-opacity:1;fill:#000000;fill-opacity:1" + id="path1557" /> + </marker> + <marker + style="overflow:visible" + id="marker1266" + refX="0" + refY="0" + orient="auto" + inkscape:stockid="Arrow2Mend" + inkscape:isstock="true"> + <path + transform="scale(-0.6)" + d="M 8.7185878,4.0337352 -2.2072895,0.01601326 
8.7185884,-4.0017078 c -1.7454984,2.3720609 -1.7354408,5.6174519 -6e-7,8.035443 z" + style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:0.625;stroke-linejoin:round;stroke-opacity:1" + id="path1264" /> + </marker> + <marker + style="overflow:visible" + id="marker1218" + refX="0" + refY="0" + orient="auto" + inkscape:stockid="Arrow2Mend" + inkscape:isstock="true" + inkscape:collect="always"> + <path + transform="scale(-0.6)" + d="M 8.7185878,4.0337352 -2.2072895,0.01601326 8.7185884,-4.0017078 c -1.7454984,2.3720609 -1.7354408,5.6174519 -6e-7,8.035443 z" + style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:0.625;stroke-linejoin:round;stroke-opacity:1" + id="path1216" /> + </marker> + <marker + style="overflow:visible" + id="Arrow2Mend" + refX="0" + refY="0" + orient="auto" + inkscape:stockid="Arrow2Mend" + inkscape:isstock="true" + inkscape:collect="always"> + <path + transform="scale(-0.6)" + d="M 8.7185878,4.0337352 -2.2072895,0.01601326 8.7185884,-4.0017078 c -1.7454984,2.3720609 -1.7354408,5.6174519 -6e-7,8.035443 z" + style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:0.625;stroke-linejoin:round;stroke-opacity:1" + id="path909" /> + </marker> + <marker + style="overflow:visible" + id="Arrow1Lend" + refX="0" + refY="0" + orient="auto" + inkscape:stockid="Arrow1Lend" + inkscape:isstock="true"> + <path + transform="matrix(-0.8,0,0,-0.8,-10,0)" + style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1" + d="M 0,0 5,-5 -12.5,0 5,5 Z" + id="path885" /> + </marker> + <marker + style="overflow:visible" + id="marker1559-2" + refX="0" + refY="0" + orient="auto" + inkscape:stockid="Arrow2Mend" + inkscape:isstock="true"> + <path + transform="scale(-0.6)" + d="M 8.7185878,4.0337352 -2.2072895,0.01601326 8.7185884,-4.0017078 c -1.7454984,2.3720609 -1.7354408,5.6174519 -6e-7,8.035443 z" + style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:0.625;stroke-linejoin:round;stroke-opacity:1" + id="path1557-9" /> + </marker> + </defs> + <sodipodi:namedview + id="base" + pagecolor="#ffffff" + bordercolor="#666666" + borderopacity="1.0" + inkscape:pageopacity="0.0" + inkscape:pageshadow="2" + inkscape:zoom="2.8" + inkscape:cx="120.68286" + inkscape:cy="23.831081" + inkscape:document-units="mm" + inkscape:current-layer="g1411" + inkscape:document-rotation="0" + showgrid="false" + inkscape:snap-global="false" + inkscape:window-width="1920" + inkscape:window-height="1135" + inkscape:window-x="0" + inkscape:window-y="0" + inkscape:window-maximized="1" + lock-margins="true" + fit-margin-top="2" + fit-margin-left="2" + fit-margin-right="2" + fit-margin-bottom="2" /> + <metadata + id="metadata13431"> + <rdf:RDF> + <cc:Work + rdf:about=""> + <dc:format>image/svg+xml</dc:format> + <dc:type + rdf:resource="http://purl.org/dc/dcmitype/StillImage" /> + <dc:title></dc:title> + </cc:Work> + </rdf:RDF> + </metadata> + <g + inkscape:label="Ebene 1" + inkscape:groupmode="layer" + id="layer1" + transform="translate(-8.1569115,-36.221295)"> + <g + id="g1411" + transform="translate(32.258972,-4.0381556)"> + <path + style="opacity:1;fill:none;stroke:#000000;stroke-width:0.264583;paint-order:markers fill stroke;stop-color:#000000" + d="m 26.22787,46.991961 -0.04324,7.85981" + id="path870" + sodipodi:nodetypes="cc" /> + <path + style="opacity:1;fill:none;stroke:#000000;stroke-width:0.264583;paint-order:markers fill stroke;stop-color:#000000" + d="m 27.268191,47.234524 
6.5917,7.093847" + id="path872" + sodipodi:nodetypes="cc" /> + <path + style="opacity:1;fill:none;stroke:#000000;stroke-width:0.264583;paint-order:markers fill stroke;stop-color:#000000" + d="M 17.211264,56.167197 12.543075,64.49543" + id="path874" + sodipodi:nodetypes="cc" /> + <path + style="opacity:1;fill:none;stroke:#000000;stroke-width:0.264583;paint-order:markers fill stroke;stop-color:#000000" + d="m 19.403188,56.222309 1.865426,8.356695" + id="path876" + sodipodi:nodetypes="cc" /> + <path + style="opacity:1;fill:none;stroke:#000000;stroke-width:0.265;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow2Mend);paint-order:markers fill stroke;stop-color:#000000" + d="m 34.590338,55.360048 c 1.051358,-1.820435 1.974353,-2.426981 3.317324,-2.31217 0.956924,0.08181 1.647835,1.289889 2.049783,2.024833" + id="path880" + sodipodi:nodetypes="cac" /> + <path + style="opacity:1;fill:none;stroke:#000000;stroke-width:0.264583;paint-order:markers fill stroke;stop-color:#000000" + d="m 25.076267,47.179412 -6.5917,7.093847" + id="path14001" + sodipodi:nodetypes="cc" /> + <rect + style="opacity:1;fill:#25e325;stroke:#000000;stroke-width:0.264583;paint-order:markers fill stroke;stop-color:#000000" + id="rect13997" + width="4.4276514" + height="3.9112766" + x="23.986937" + y="44.075451" /> + <rect + style="opacity:1;fill:#ff8cff;fill-opacity:1;stroke:#000000;stroke-width:0.264583;paint-order:markers fill stroke;stop-color:#000000" + id="rect13999" + width="4.4276514" + height="3.9112766" + x="15.955473" + y="53.282654" /> + <path + sodipodi:type="star" + style="opacity:1;fill:#ff8cff;fill-opacity:1;stroke:#000000;stroke-width:0.264583;paint-order:markers fill stroke;stop-color:#000000" + id="path14003" + sodipodi:sides="3" + sodipodi:cx="26.161613" + sodipodi:cy="55.658291" + sodipodi:r1="2.7924292" + sodipodi:r2="1.3962145" + sodipodi:arg1="0.52359878" + sodipodi:arg2="1.5707963" + inkscape:flatsided="true" + inkscape:rounded="0" + inkscape:randomized="0" + d="m 28.579928,57.054505 -4.836629,0 2.418314,-4.188643 z" + inkscape:transform-center-y="-0.69810795" /> + <path + style="fill:none;stroke:#000000;stroke-width:0.264583;stroke-miterlimit:4;stroke-dasharray:0.529166, 0.529166;stroke-dashoffset:0;paint-order:markers fill stroke;stop-color:#000000" + d="M 11.791704,65.225482 9.0065326,70.566411" + id="path1467" + sodipodi:nodetypes="cc" /> + <path + style="fill:none;stroke:#000000;stroke-width:0.264583;stroke-miterlimit:4;stroke-dasharray:0.529166, 0.529166;stroke-dashoffset:0;paint-order:markers fill stroke;stop-color:#000000" + d="m 13.983628,65.280594 1.865426,5.369391" + id="path1469" + sodipodi:nodetypes="cc" /> + <circle + style="opacity:1;fill:#6abfff;fill-opacity:1;stroke:#000000;stroke-width:0.264583;paint-order:markers fill stroke;stop-color:#000000" + id="path14861" + cx="12.714239" + cy="65.343147" + r="2.3446827" /> + <path + sodipodi:type="star" + style="opacity:1;fill:#ff8cff;fill-opacity:1;stroke:#000000;stroke-width:0.264583;paint-order:markers fill stroke;stop-color:#000000" + id="path14863" + sodipodi:sides="3" + sodipodi:cx="33.771244" + sodipodi:cy="55.658291" + sodipodi:r1="2.7924292" + sodipodi:r2="1.3962145" + sodipodi:arg1="0.52359878" + sodipodi:arg2="1.5707963" + inkscape:flatsided="true" + inkscape:rounded="0" + inkscape:randomized="0" + d="m 36.189559,57.054505 -4.83663,0 2.418315,-4.188643 z" + inkscape:transform-center-y="-0.69810795" /> + <path + sodipodi:type="star" + 
style="font-variation-settings:normal;opacity:1;vector-effect:none;fill:#6abfff;fill-opacity:1;stroke:#000000;stroke-width:0.264583;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;paint-order:markers fill stroke;stop-color:#000000;stop-opacity:1" + id="path14865" + sodipodi:sides="3" + sodipodi:cx="31.079979" + sodipodi:cy="69.469734" + sodipodi:r1="2.7924292" + sodipodi:r2="1.3962145" + sodipodi:arg1="0.52359878" + sodipodi:arg2="1.5707963" + inkscape:flatsided="true" + inkscape:rounded="0" + inkscape:randomized="0" + d="m 33.498294,70.865949 -4.83663,0 2.418315,-4.188644 z" + inkscape:transform-center-y="-0.69810795" /> + <circle + style="font-variation-settings:normal;opacity:1;vector-effect:none;fill:#6abfff;fill-opacity:1;stroke:#000000;stroke-width:0.264583;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;paint-order:markers fill stroke;stop-color:#000000;stop-opacity:1" + id="circle14867" + cx="21.223957" + cy="65.343147" + r="2.3446827" /> + <rect + style="font-variation-settings:normal;opacity:1;vector-effect:none;fill:#6abfff;fill-opacity:1;stroke:#000000;stroke-width:0.264583;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;paint-order:markers fill stroke;stop-color:#000000;stop-opacity:1" + id="rect14869" + width="4.4276514" + height="3.9112766" + x="36.216988" + y="66.749847" /> + <path + id="path82" + inkscape:connector-curvature="0" + d="m 41.515562,54.871846 -0.417806,1.20398 -1.168202,0.279888 -0.960641,-0.870246 -1.138707,0.832408 0.537693,1.179427 -0.621192,1.02818 -1.273991,0.03274 -0.218785,1.407938 1.203979,0.417805 0.279889,1.168202 -0.870245,0.960643 0.832407,1.138704 1.179427,-0.537691 1.028181,0.621191 0.03274,1.273992 1.407938,0.218785 0.417806,-1.20398 1.168202,-0.279888 0.96064,0.870244 1.138706,-0.832406 -0.537691,-1.179427 0.621192,-1.028182 1.273992,-0.03274 0.218784,-1.407938 -1.20398,-0.417805 -0.279888,-1.168203 0.870246,-0.96064 -0.83241,-1.138707 -1.179425,0.537693 -1.028181,-0.621192 -0.03274,-1.273992 z" + style="fill:#d0dbf5;fill-opacity:1;stroke:#0f2d59;stroke-width:0.284967;stroke-linecap:round;stroke-linejoin:round" + sodipodi:nodetypes="ccccccccccccccccccccccccccccccccc" /> + <path + style="opacity:1;fill:none;stroke:#000000;stroke-width:0.265;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#marker1218);paint-order:markers fill stroke;stop-color:#000000" + d="m 36.505382,61.212129 c -1.732593,0.460546 -2.239587,0.94846 -3.054171,1.805942 -0.855057,0.900086 -1.291029,1.914968 -1.728787,3.298907" + id="path1214" + sodipodi:nodetypes="cac" /> + <path + style="opacity:1;fill:none;stroke:#000000;stroke-width:0.265;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#marker1266);paint-order:markers fill stroke;stop-color:#000000" + d="m 39.119283,63.680684 c -0.561579,0.349977 -1.171361,1.831472 -1.388934,2.468193" + id="path1262" + sodipodi:nodetypes="cc" /> + <g + id="g1624-1" + transform="translate(-24.776227,-7.0250037)"> + <path + sodipodi:type="star" + style="font-variation-settings:normal;vector-effect:none;fill:none;fill-opacity:1;stroke:#000000;stroke-width:0.113934;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;paint-order:markers fill stroke;stop-color:#000000" + id="path14865-5-2" + sodipodi:sides="3" + sodipodi:cx="66.174721" + 
sodipodi:cy="64.759911" + sodipodi:r1="1.2024668" + sodipodi:r2="0.60123342" + sodipodi:arg1="0.52359878" + sodipodi:arg2="1.5707963" + inkscape:flatsided="true" + inkscape:rounded="0" + inkscape:randomized="0" + inkscape:transform-center-y="-0.30061479" + inkscape:transform-center-x="2.0589357e-06" + d="m 67.216088,65.361144 -2.082734,0 1.041367,-1.8037 z" /> + <path + style="font-variation-settings:normal;opacity:1;vector-effect:none;fill:none;fill-opacity:1;stroke:#000000;stroke-width:0.15;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;marker-end:url(#marker1559-2);paint-order:markers fill stroke;stop-color:#000000;stop-opacity:1" + d="M 66.173282,65.809672 V 67.26045" + id="path1555-0" /> + <g + id="g1807" + transform="translate(0.32991862)"> + <rect + style="font-variation-settings:normal;vector-effect:none;fill:none;fill-opacity:1;stroke:#000000;stroke-width:0.113934;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;paint-order:markers fill stroke;stop-color:#000000" + id="rect14869-6-7" + width="1.906621" + height="1.6842613" + x="66.594307" + y="67.911743" /> + <path + sodipodi:type="star" + style="font-variation-settings:normal;vector-effect:none;fill:none;fill-opacity:1;stroke:#000000;stroke-width:0.113934;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;paint-order:markers fill stroke;stop-color:#000000" + id="path1803" + sodipodi:sides="3" + sodipodi:cx="64.271751" + sodipodi:cy="69.082977" + sodipodi:r1="1.2024668" + sodipodi:r2="0.60123342" + sodipodi:arg1="0.52359878" + sodipodi:arg2="1.5707963" + inkscape:flatsided="true" + inkscape:rounded="0" + inkscape:randomized="0" + inkscape:transform-center-y="-0.30061479" + inkscape:transform-center-x="2.0589357e-06" + d="m 65.313118,69.684211 -2.082733,0 1.041366,-1.803701 z" /> + </g> + </g> + </g> + <g + id="g1374" + transform="translate(-49.214304,-4.5219647)"> + <circle + style="font-variation-settings:normal;vector-effect:none;fill:#6abfff;fill-opacity:1;stroke:#000000;stroke-width:0.264583;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;paint-order:markers fill stroke;stop-color:#000000" + id="circle14867-5" + cx="61.878521" + cy="49.767113" + r="2.3446827" /> + <text + xml:space="preserve" + style="font-style:normal;font-weight:normal;font-size:3.52778px;line-height:1.25;font-family:sans-serif;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.125046" + x="59.173122" + y="60.298515" + id="text14855"><tspan + sodipodi:role="line" + id="tspan14853" + x="59.173122" + y="60.298515" + style="font-size:3.52778px;stroke-width:0.125046">Converter</tspan></text> + <text + xml:space="preserve" + style="font-style:normal;font-weight:normal;font-size:3.52778px;line-height:1.25;font-family:sans-serif;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.125046" + x="59.173122" + y="45.423546" + id="text14859"><tspan + sodipodi:role="line" + id="tspan14857" + x="59.173122" + y="45.423546" + style="font-size:3.52778px;stroke-width:0.125046">StructureElement</tspan></text> + <rect + style="fill:#25e325;stroke:#000000;stroke-width:0.264583;paint-order:markers fill stroke;stop-color:#000000" + id="rect13997-6" + width="4.4276514" + height="3.9112766" + x="65.556091" + y="47.811474" 
/> + <path + sodipodi:type="star" + style="fill:#ff8cff;fill-opacity:1;stroke:#000000;stroke-width:0.264583;paint-order:markers fill stroke;stop-color:#000000" + id="path14003-7" + sodipodi:sides="3" + sodipodi:cx="73.831802" + sodipodi:cy="50.531364" + sodipodi:r1="2.7924292" + sodipodi:r2="1.3962145" + sodipodi:arg1="0.52359878" + sodipodi:arg2="1.5707963" + inkscape:flatsided="true" + inkscape:rounded="0" + inkscape:randomized="0" + inkscape:transform-center-y="-0.69810795" + d="m 76.250117,51.927579 -4.836629,0 2.418314,-4.188644 z" /> + <g + id="g1649"> + <path + id="path82-3" + inkscape:connector-curvature="0" + d="m 66.342602,61.715213 -0.417806,1.20398 -1.168202,0.279888 -0.960641,-0.870246 -1.138707,0.832408 0.537693,1.179427 -0.621192,1.02818 -1.273991,0.03274 -0.218785,1.407938 1.203979,0.417805 0.279889,1.168202 -0.870245,0.960643 0.832407,1.138704 1.179427,-0.537691 1.028181,0.621191 0.03274,1.273992 1.407938,0.218785 0.417806,-1.20398 1.168202,-0.279888 0.96064,0.870244 1.138706,-0.832406 -0.537691,-1.179427 0.621192,-1.028182 1.273992,-0.03274 0.218784,-1.407938 -1.20398,-0.417805 -0.279888,-1.168203 0.870246,-0.96064 -0.83241,-1.138707 -1.179425,0.537693 -1.028181,-0.621192 -0.03274,-1.273992 z" + style="fill:#d0dbf5;fill-opacity:1;stroke:#0f2d59;stroke-width:0.284967;stroke-linecap:round;stroke-linejoin:round" + sodipodi:nodetypes="ccccccccccccccccccccccccccccccccc" /> + <g + id="g1624" + transform="translate(-0.23034383)"> + <path + sodipodi:type="star" + style="font-variation-settings:normal;vector-effect:none;fill:none;fill-opacity:1;stroke:#000000;stroke-width:0.113934;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;paint-order:markers fill stroke;stop-color:#000000" + id="path14865-5" + sodipodi:sides="3" + sodipodi:cx="66.489288" + sodipodi:cy="64.759911" + sodipodi:r1="1.2024668" + sodipodi:r2="0.60123342" + sodipodi:arg1="0.52359878" + sodipodi:arg2="1.5707963" + inkscape:flatsided="true" + inkscape:rounded="0" + inkscape:randomized="0" + inkscape:transform-center-y="-0.30061479" + d="m 67.530655,65.361144 -2.082734,0 1.041367,-1.8037 z" + inkscape:transform-center-x="2.0589357e-06" /> + <rect + style="font-variation-settings:normal;vector-effect:none;fill:none;fill-opacity:1;stroke:#000000;stroke-width:0.113934;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;paint-order:markers fill stroke;stop-color:#000000" + id="rect14869-6" + width="1.906621" + height="1.6842613" + x="65.535973" + y="67.911743" /> + <path + style="font-variation-settings:normal;opacity:1;vector-effect:none;fill:none;fill-opacity:1;stroke:#000000;stroke-width:0.15;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;marker-end:url(#marker1559);paint-order:markers fill stroke;stop-color:#000000;stop-opacity:1" + d="M 66.487845,65.809672 V 67.26045" + id="path1555" /> + </g> + </g> + </g> + </g> +</svg> diff --git a/src/doc/img/properties-from-dict-records-author.png b/src/doc/img/properties-from-dict-records-author.png new file mode 100644 index 0000000000000000000000000000000000000000..20ee9497ab5ae577c3d515f11da6294c88601fed Binary files /dev/null and b/src/doc/img/properties-from-dict-records-author.png differ diff --git a/src/doc/img/properties-from-dict-records-person.png b/src/doc/img/properties-from-dict-records-person.png new file mode 100644 index 
0000000000000000000000000000000000000000..8b026056a42ff3ba203c6077a426640c864b24c1 Binary files /dev/null and b/src/doc/img/properties-from-dict-records-person.png differ diff --git a/src/doc/index.rst b/src/doc/index.rst index 8a02ec62e50308a28899e71b4664f626dfa0c27b..a72389b1f4b94430b2c5ff2bfee9757193327ed7 100644 --- a/src/doc/index.rst +++ b/src/doc/index.rst @@ -10,7 +10,7 @@ CaosDB-Crawler Documentation Getting started<getting_started/index> Tutorials<tutorials/index> Concepts<concepts> - Converters<converters> + Converters<converters/index> CFoods (Crawler Definitions)<cfood> Macros<macros> How to upgrade<how-to-upgrade> diff --git a/src/doc/macros.rst b/src/doc/macros.rst index 5329ca6ddde49dbef439659d4904b07ed3f2bef9..3a234973ee17791aaa2a0bd9e4b81836207a07e0 100644 --- a/src/doc/macros.rst +++ b/src/doc/macros.rst @@ -1,7 +1,11 @@ Macros ------ -Macros highly facilitate the writing of complex :doc:`CFoods<cfood>`. Consider the following prevalent example: +Introduction +============ + +Macros highly facilitate the writing of complex :doc:`CFoods<cfood>`. Consider the following common +example: .. _example_files: .. code-block:: yaml @@ -82,16 +86,46 @@ The expanded version of `ExperimentalData` will look like: This :ref:`example<example_files_2>` can also be found in the macro unit tests (see :func:`unittests.test_macros.test_documentation_example_2`). -Complex Example -=============== -The following, more complex example, demonstrates the use -of macro variable substitutions that generate crawler variable substitutions: +Mixing macros and plain definitions +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +You can also mix macros and plain definitions. Whenever a name cannot be resolved to a macro, a +plain yaml node definition is used as a fallback: + +.. code:: yaml + + --- + metadata: + macros: + - !defmacro + name: MarkdownFile + # ... Definition here ... + --- + ExperimentalData: + type: Directory + match: ExperimentalData + subtree: !macro + MarkdownFile: + - name: README + filename: ^README.md$ + OtherContent: # There is no macro named "OtherContent", so this is parsed as normal content. + type: SimpleFile + match: .*txt + records: + # ... Normal content ... + -- `$$$nodename` will lead to a macro variable substitution of variable `$nodename` during macro expansion. -- `$$` will be turned into `$` -- So in the crawler cfood, the string will appear as `$value` if variable `nodename` would be set to `value` when using the macro. +Complex example +=============== + +Let's try something more complex: what happens to multiple ``$``? This example demonstrates the use +of `macro` variable substitutions to generate `crawler` variable substitutions: +- ``$$`` will be converted into ``$``. +- ``$$$nodename`` will retain a single ``$`` and substitute ``$nodename`` during macro expansion. +- So in the cfood, if ``nodename: value``, the string ``$$$nodename`` will be converted to + ``$value``. .. _example_1: .. code-block:: yaml @@ -117,7 +151,8 @@ of macro variable substitutions that generate crawler variable substitutions: Simulation: $recordtype: +$File -The expanded version of :ref:`example<example_1>` can be seen in :ref:`example<example_1_expanded>`. +The expanded version of the :ref:`example above<example_1>` (with ``nodename: Dataset``) can be seen +:ref:`here<example_1_expanded>`: .. 
_example_1_expanded: @@ -140,11 +175,11 @@ The expanded version of :ref:`example<example_1>` can be seen in :ref:`example<e type: SimpleFile type: Directory -This :ref:`example<example_1>` can also be found in the macro unit tests (see :func:`unittests.test_macros.test_documentation_example_1`). - +This example can also be found in the macro unit tests (see +:func:`unittests.test_macros.test_documentation_example_1`). -Using Macros Multiple Times +Using macros multiple times =========================== To use the same macro multiple times in the same yaml node, lists can be used: @@ -197,11 +232,11 @@ use the same top level key. Because later versions would overwrite previous ones. Here we used ``$macro_name`` to prevent that. -Limitation -========== +Limitations +=========== -Currently it is not possible to use the same macro twice in the same yaml node, but in different -positions. Consider: +Currently it is not possible to use the same macro twice in the same yaml node, if it occurs in +different positions. Consider: .. _example_multiple_limitation: .. code-block:: yaml @@ -226,14 +261,13 @@ positions. Consider: Other_node: type: test - test_twice: # This is NOT possible as each - # dictionary element can only appear once in a yaml node. + test_twice: # This is NOT possible as each key + # can only appear once in a yaml node. - macro_name: twice # <- This is the second one, with different arguments a: 5 - {} # <- This is the third one, just using default arguments -However, this should not be a real limitation, as the crawler is designed in a way, -that the order of the nodes in the same level should not matter. +This should not be a real limitation however, as the order of nodes does not matter for the crawler. Using macros within macro definitions diff --git a/src/doc/tutorials/index.rst b/src/doc/tutorials/index.rst index b6f0fab511f3646f3ec6a7a320299e72a2c20038..412d29f01018f05b84e0fe8e43fa631b61b91d04 100644 --- a/src/doc/tutorials/index.rst +++ b/src/doc/tutorials/index.rst @@ -9,4 +9,4 @@ This chapter contains a collection of tutorials. Parameter File<parameterfile> Scientific Data Folder<scifolder> - + Single Structured File <single_file> diff --git a/src/doc/tutorials/parameterfile.rst b/src/doc/tutorials/parameterfile.rst index 9369ba8b83df8c484a4af8f240e1a1de2f4c10fb..2442969541eebf9a4e058b797b48995b39372a3e 100644 --- a/src/doc/tutorials/parameterfile.rst +++ b/src/doc/tutorials/parameterfile.rst @@ -88,6 +88,10 @@ regular expressions do: We can use the groups from the regular expressions that are used for matching. In our example, we use the "value" group to assign the "frequency" value to the "Experiment". +.. note:: + + For more information on the ``cfood.yml`` specification, read on in the chapter :ref:`Converters`. + A fully grown CFood ------------------- @@ -148,4 +152,6 @@ the CFood file is in the current working directory): caosdb-crawler -s update -i identifiables.yml cfood.yml . +.. note:: + ``caosdb-crawler`` currently only works with cfoods which have a directory as top level element. diff --git a/src/doc/tutorials/single_file.rst b/src/doc/tutorials/single_file.rst new file mode 100644 index 0000000000000000000000000000000000000000..824a658985b9375e140df7fb63a1fc9e7f6a7563 --- /dev/null +++ b/src/doc/tutorials/single_file.rst @@ -0,0 +1,222 @@ +Tutorial: Single structured file +==================================== + +In this tutorial, we will create a crawler that reads a single structured file, +such as a CSV file. 
+ +Declarations +------------ +This tutorial is based on the following simple data model: + +``model.yml`` + +.. code-block:: yaml + + Fish: + recommended_properties: + date: + datatype: DATETIME + number: + datatype: INTEGER + weight: + datatype: DOUBLE + species: + datatype: TEXT + +You can insert this model with the following command: + +.. code-block:: shell + + python -m caosadvancedtools.models.parser model.yml --sync + + +We will identify `Fish` Records in LinkAhead using the following two +attributes. + +``identifiables.yml`` + +.. code-block:: yaml + + Fish: + - date + - number + +And we will use the following crawler configuration. + +``cfood.yml`` + +.. code-block:: yaml + + --- + metadata: + crawler-version: 0.9.1 + --- + + fish_data_file: # Root file + type: CSVTableConverter + match: "^fish_data_.*.csv$" # Match CSV file with a name that starts with "fish_data_" + subtree: + table_row: # One row in the CSV file + type: DictElement + match_name: .* # we want to treat every row, so match anything + match_value: .* + records: + Fish: # Record for the current row; information from statements below + # are added to this Record + subtree: + date: # Element for the date column + type: TextElement + match_name: date # Name of the column in the table file + match_value: (?P<column_value>.*) # We match any value of the row in this + # column and assign it to the ``column_value`` + # variable + records: # Records edited for each cell + Fish: + date: $column_value + species: + type: TextElement + match_name: species + match_value: (?P<column_value>.*) + records: + Fish: + species: $column_value + number: + type: TextElement + match_name: identifier + match_value: (?P<column_value>.*) + records: + Fish: + number: $column_value + weight: + type: TextElement + match_name: weight + match_value: (?P<column_value>.*) + records: + Fish: + weight: $column_value + + + +Python code +----------- + +The following code allows us to read the csv file, create corresponding `Fish` +Records and synchronize those with LinkAhead. + +.. code-block:: python + + #!/usr/bin/env python3 + + # Copyright (C) 2023-2024 IndiScale GmbH <info@indiscale.com> + # + # This program is free software: you can redistribute it and/or modify + # it under the terms of the GNU Affero General Public License as + # published by the Free Software Foundation, either version 3 of the + # License, or (at your option) any later version. + # + # This program is distributed in the hope that it will be useful, + # but WITHOUT ANY WARRANTY; without even the implied warranty of + # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + # GNU Affero General Public License for more details. + # + # You should have received a copy of the GNU Affero General Public License + # along with this program. If not, see <https://www.gnu.org/licenses/>. + + """Crawler for fish data""" + + import os + import argparse + import sys + import logging + + from caoscrawler.scanner import load_definition, create_converter_registry, scan_structure_elements + from caoscrawler.structure_elements import File + from caoscrawler import Crawler, SecurityMode + from caoscrawler.identifiable_adapters import CaosDBIdentifiableAdapter + + + def crawl_file(filename: str, dry_run: bool = False): + """Read a CSV file into a LinkAhead container. + + Parameters + ---------- + filename : str + The name of the CSV file. + + dry_run : bool + If True, do not modify the database. 
+ """ + # setup logging + logger = logging.getLogger("caoscrawler") + logger.setLevel(logging.DEBUG) + logger.addHandler(logging.StreamHandler(stream=sys.stdout)) + + # load crawler configuration + definition = load_definition("cfood.yml") + converter_registry = create_converter_registry(definition) + + # crawl the CSV file + records = scan_structure_elements(items=File(name=os.path.basename(filename), path=filename), + crawler_definition=definition, + converter_registry=converter_registry) + logger.debug(records) + + crawler = Crawler(securityMode=SecurityMode.UPDATE) + # This defines how Records on the server are identified with the ones we have locally + ident = CaosDBIdentifiableAdapter() + ident.load_from_yaml_definition("identifiables.yml") + crawler.identifiableAdapter = ident + + # Here we synchronize the data; with --dry-run, changes are not committed to the server + inserts, updates = crawler.synchronize(commit_changes=not dry_run, unique_names=True, + crawled_data=records) + + + def _parse_arguments(): + """Parse the arguments.""" + parser = argparse.ArgumentParser(description='Crawler for fish data') + parser.add_argument('-n', '--dry-run', help="Do not modify the database.", action="store_true") + parser.add_argument('csv_file', metavar="csv file", help="The csv file to be crawled.") + return parser.parse_args() + + + def main(): + """Main function.""" + args = _parse_arguments() + crawl_file(args.csv_file, dry_run=args.dry_run) + + + if __name__ == '__main__': + main() + +Running it +---------- +This is an example of a data file that we can crawl: + +``fish_data_1.csv`` + +.. code-block:: + + identifier,date,species,weight + 1,2022-01-02,pike,3.4 + 2,2022-01-02,guppy,2.3 + 3,2022-01-02,pike,2.2 + 3,2022-01-06,pike,2.1 + + +If you have created all the files, you can run: + +.. code-block:: shell + + python3 crawl.py fish_data_1.csv + +Note that you can run the same script again and you will not see any changes +being made to the data in LinkAhead. + + +You may play around with changing data in the data table. Changes will +propagate into LinkAhead when you run the Crawler again. If you change one of +the identifying properties, the Crawler will consider the data that it reads as +new and create new `Fish` Records. diff --git a/tox.ini b/tox.ini index a7d4465ed36f0fe5e49c06721d3e3a0cdf453fa0..e003e26ecd16861c3b8a8d991fc789c78d203e5b 100644 --- a/tox.ini +++ b/tox.ini @@ -1,20 +1,23 @@ [tox] -envlist = py37, py38, py39, py310, py311 +envlist = py38, py39, py310, py311, py312, py313 skip_missing_interpreters = true [testenv] -deps = . 
+deps = .[h5-crawler,spss,rocrate] pytest pytest-cov # TODO: Make this f-branch sensitive git+https://gitlab.indiscale.com/caosdb/src/caosdb-pylib.git@dev git+https://gitlab.indiscale.com/caosdb/src/caosdb-advanced-user-tools.git@dev commands = caosdb-crawler --help - py.test --cov=caosdb -vv {posargs} + py.test --cov=caoscrawler -vv {posargs} [flake8] max-line-length = 100 +[pycodestyle] +max-line-length = 100 + [pytest] testpaths = unittests -xfail_strict = True \ No newline at end of file +xfail_strict = True diff --git a/unittests/broken_cfoods/broken_record_from_dict.yml b/unittests/broken_cfoods/broken_record_from_dict.yml new file mode 100644 index 0000000000000000000000000000000000000000..fd8ffdbd29f6ad7b8b38fc17eb43686f4170dbcb --- /dev/null +++ b/unittests/broken_cfoods/broken_record_from_dict.yml @@ -0,0 +1,7 @@ +RecordFromDictElement: + type: PropertiesFromDictElement + match: "(.*)" + subtree: + AnotherElement: + type: Text + match_name: "(.*)" diff --git a/unittests/broken_cfoods/broken_record_from_dict_2.yml b/unittests/broken_cfoods/broken_record_from_dict_2.yml new file mode 100644 index 0000000000000000000000000000000000000000..ca321373c6c4d6bcc8c104c8c4b3c7147bf71375 --- /dev/null +++ b/unittests/broken_cfoods/broken_record_from_dict_2.yml @@ -0,0 +1,11 @@ +RecordFromDictElement: + type: PropertiesFromDictElement + record_from_dict: + parents: + - MyType1 + - MyType2 + match: "(.*)" + subtree: + AnotherElement: + type: Text + match_name: "(.*)" diff --git a/unittests/datamodels/datamodel.yaml b/unittests/datamodels/datamodel.yaml new file mode 100644 index 0000000000000000000000000000000000000000..2759ecba7f2967062937d9b2f4805a9b501ab6c4 --- /dev/null +++ b/unittests/datamodels/datamodel.yaml @@ -0,0 +1,6 @@ +Dataset: + obligatory_properties: + keywords: + datatype: TEXT + dateModified: + datatype: DATETIME diff --git a/unittests/eln_cfood.yaml b/unittests/eln_cfood.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ab8e7108f511b0450d37c3e60162e412d4a1bf3b --- /dev/null +++ b/unittests/eln_cfood.yaml @@ -0,0 +1,36 @@ +--- +metadata: + crawler-version: 0.9.2 + macros: +--- +Converters: + ELNFile: + converter: ELNFileConverter + package: caoscrawler.converters + ROCrateEntity: + converter: ROCrateEntityConverter + package: caoscrawler.converters + +DataDir: + type: Directory + match: .* + subtree: + ELNFile: + type: ELNFile + match: ^.*\.eln$ + subtree: + RecordsExample: + type: ROCrateEntity + match_type: Dataset + match_properties: + "@id": records-example/$ + name: (?P<name>.*) + keywords: (?P<keywords>.*) + description: (?P<description>.*) + dateModified: (?P<dateModified>.*) + records: + Dataset: + name: $name + keywords: $keywords + description: $description + dateModified: $dateModified diff --git a/unittests/eln_files/PASTA.eln b/unittests/eln_files/PASTA.eln new file mode 100644 index 0000000000000000000000000000000000000000..61866e7d5f57cb32191af6663be230153092e712 Binary files /dev/null and b/unittests/eln_files/PASTA.eln differ diff --git a/unittests/eln_files/records-example.eln b/unittests/eln_files/records-example.eln new file mode 100644 index 0000000000000000000000000000000000000000..09ed53fc179e80a240ab773247d6f9adee71b429 Binary files /dev/null and b/unittests/eln_files/records-example.eln differ diff --git a/unittests/example_cfood.yml b/unittests/example_cfood.yml index 713bd4be0f3c816e1e8c8b7a057b30a4b400f13c..59cb601395f73bd26ed81bd6ea9c51f670798d36 100644 --- a/unittests/example_cfood.yml +++ b/unittests/example_cfood.yml @@ 
-1,6 +1,6 @@ --- metadata: - crawler-version: 0.3.1 + crawler-version: 0.9.0 --- Definitions: type: Definitions diff --git a/unittests/h5_cfood.yml b/unittests/h5_cfood.yml new file mode 100644 index 0000000000000000000000000000000000000000..dc789a85aabcbdc32388fd91460d42d477630f37 --- /dev/null +++ b/unittests/h5_cfood.yml @@ -0,0 +1,69 @@ +--- +metadata: + crawler-version: 0.9.0 +--- +Converters: + H5Dataset: + converter: H5DatasetConverter + package: caoscrawler.converters.hdf5_converter + H5File: + converter: H5FileConverter + package: caoscrawler.converters.hdf5_converter + H5Group: + converter: H5GroupConverter + package: caoscrawler.converters.hdf5_converter + H5Ndarray: + converter: H5NdarrayConverter + package: caoscrawler.converters.hdf5_converter +# Top-level, we have just the HDF5 file. +ParentDirectory: + type: Directory + match: (.*) + subtree: + H5FileElement: + type: H5File + match: (.*)\.(hdf5|h5)$ + records: + H5File: + parents: + - H5File + role: File + path: $H5FileElement + file: $H5FileElement + subtree: + # Here, we have the groups, the top-level dataset, and possible + # attributes (empty for now). + RootIntegerElement: + type: H5Dataset + match_name: ^root_integers$ + records: + H5Dataset: + parents: + - H5Dataset + H5File: + H5Dataset: +$H5Dataset + subtree: + # included NDArray in this dataset + TopLevelIntNDElement: + type: H5Ndarray + match_name: (.*) + recordname: this + records: + # this: + # ContainingFile: $H5File + H5Dataset: + Ndarray: $this + # There is one more list-valued attribute to this dataset. + TopLevelDataAttribute: + type: ListElement + match_name: ^attr_data_root$ + subtree: + AttributeListEntry: + type: FloatElement + match_name: (.*) + match_value: (?P<value>.*) + records: + H5Dataset: + attr_data_root: +$value + + diff --git a/unittests/hdf5_dummy_file.hdf5 b/unittests/hdf5_dummy_file.hdf5 new file mode 100644 index 0000000000000000000000000000000000000000..41bfb7ab3bcac19d90fd4f018cdd8118ae806eaf Binary files /dev/null and b/unittests/hdf5_dummy_file.hdf5 differ diff --git a/unittests/record_from_dict_cfood.yml b/unittests/record_from_dict_cfood.yml new file mode 100644 index 0000000000000000000000000000000000000000..1ea2159df9d63256d9a0b2e293d82a9ad694608f --- /dev/null +++ b/unittests/record_from_dict_cfood.yml @@ -0,0 +1,12 @@ +PropertiesFromDictElement: + type: PropertiesFromDictElement + match: ".*" + record_from_dict: + variable_name: MyRec + parents: + - MyType1 + - MyType2 + references: + author: + parents: + - Person diff --git a/unittests/scifolder_cfood.yml b/unittests/scifolder_cfood.yml index 9d6e8cf3ea325ad14641530f2e6cafd43f0dc1bb..f32c24e772b86ab8adf530d20ec208722b74deac 100644 --- a/unittests/scifolder_cfood.yml +++ b/unittests/scifolder_cfood.yml @@ -4,7 +4,7 @@ --- metadata: - crawler-version: 0.3.1 + crawler-version: 0.9.0 --- Definitions: type: Definitions diff --git a/unittests/test_cfood_metadata.py b/unittests/test_cfood_metadata.py index 494bd383d95b4a845b5ea6f86ccff0f9a1db257f..b123f98584ba99ed4fec412732cb2bf536034a91 100644 --- a/unittests/test_cfood_metadata.py +++ b/unittests/test_cfood_metadata.py @@ -17,15 +17,13 @@ # You should have received a copy of the GNU Affero General Public License # along with this program. If not, see <https://www.gnu.org/licenses/>. 
# -import pytest -import yaml - from tempfile import NamedTemporaryFile from unittest.mock import patch -from unittest.mock import MagicMock, Mock -import caoscrawler +import pytest +import yaml +import caoscrawler from caoscrawler.scanner import load_definition @@ -35,7 +33,7 @@ def _temp_file_load(txt: str): definition using load_definition from Crawler. """ definition = None - with NamedTemporaryFile() as f: + with NamedTemporaryFile(delete=False) as f: f.write(txt.encode()) f.flush() definition = load_definition(f.name) diff --git a/unittests/test_converters.py b/unittests/test_converters.py index 52ece13dc2269a3e3b16e6378166e91b084f4a7c..e4b442d91060c7ba98cb1a910156b1800f050be3 100644 --- a/unittests/test_converters.py +++ b/unittests/test_converters.py @@ -3,8 +3,9 @@ # # This file is a part of the CaosDB Project. # -# Copyright (C) 2021,2022 Indiscale GmbH <info@indiscale.com> +# Copyright (C) 2021-2024 Indiscale GmbH <info@indiscale.com> # Copyright (C) 2021,2022 Henrik tom Wörden <h.tomwoerden@indiscale.com> +# Copyright (C) 2024 Daniel Hornung <d.hornung@indiscale.com> # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU Affero General Public License as @@ -28,26 +29,33 @@ import importlib import json import logging import os -import sys +import pytest +import yaml + from itertools import product from pathlib import Path +from tempfile import NamedTemporaryFile + +import linkahead as db -import pytest -import yaml from caoscrawler.converters import (Converter, ConverterValidationError, DateElementConverter, DictElementConverter, DictIntegerElementConverter, DirectoryConverter, FloatElementConverter, IntegerElementConverter, JSONFileConverter, ListElementConverter, - MarkdownFileConverter, YAMLFileConverter, - _AbstractScalarValueElementConverter, - handle_value, replace_variables) + MarkdownFileConverter, + PropertiesFromDictConverter, + YAMLFileConverter, handle_value, + replace_variables) +from caoscrawler.converters.converters import \ + _AbstractScalarValueElementConverter from caoscrawler.crawl import Crawler from caoscrawler.scanner import (_load_definition_from_yaml_dict, create_converter_registry, - create_transformer_registry, load_definition) -from caoscrawler.stores import GeneralStore + create_transformer_registry, load_definition, + scan_structure_elements) +from caoscrawler.stores import GeneralStore, RecordStore from caoscrawler.structure_elements import (BooleanElement, DictElement, Directory, File, FloatElement, IntegerElement, ListElement, @@ -72,6 +80,10 @@ def converter_registry(): "DictElement": { "converter": "DictElementConverter", "package": "caoscrawler.converters"}, + "PropertiesFromDictElement": { + "converter": "PropertiesFromDictConverter", + "package": "caoscrawler.converters" + }, "TextElement": { "converter": "TextElementConverter", "package": "caoscrawler.converters"}, @@ -140,7 +152,7 @@ def test_markdown_converter(converter_registry): converter = MarkdownFileConverter({"match": "(.*)"}, "TestMarkdownFileConverter", converter_registry) - with pytest.raises(ConverterValidationError) as err: + with pytest.raises(ConverterValidationError): converter.create_children(None, File("test_tool.py", UNITTESTDIR / "test_crawler.py")) m = converter.match(test_readme) @@ -341,6 +353,8 @@ def test_variable_replacement(): values = GeneralStore() values["a"] = 4 values["b"] = "68" + values["my_unit"] = "m" + values["cm"] = "cm" # basic values stay unchanged assert replace_variables(5, values) is 5 @@ -348,28 
+362,38 @@ def test_variable_replacement(): assert replace_variables("$a", values) is 4 assert replace_variables("${b}", values) == "68" - assert handle_value("b", values) == ("b", "single") - assert handle_value("+b", values) == ("b", "list") - assert handle_value("*b", values) == ("b", "multiproperty") - assert handle_value("$b", values) == ("68", "single") - assert handle_value("+$b", values) == ("68", "list") - assert handle_value("*$b", values) == ("68", "multiproperty") + # values given as simple strings never have units + assert handle_value("b", values) == ("b", None, "single") + assert handle_value("+b", values) == ("b", None, "list") + assert handle_value("*b", values) == ("b", None, "multiproperty") + assert handle_value("$b", values) == ("68", None, "single") + assert handle_value("+$b", values) == ("68", None, "list") + assert handle_value("*$b", values) == ("68", None, "multiproperty") + # No units in dicts assert handle_value({"value": "b", - "collection_mode": "single"}, values) == ("b", "single") + "collection_mode": "single"}, values) == ("b", None, "single") assert handle_value({"value": "b", - "collection_mode": "list"}, values) == ("b", "list") + "collection_mode": "list"}, values) == ("b", None, "list") assert handle_value({"value": "b", - "collection_mode": "multiproperty"}, values) == ("b", "multiproperty") + "collection_mode": "multiproperty"}, values) == ("b", None, "multiproperty") assert handle_value({"value": "$b", - "collection_mode": "single"}, values) == ("68", "single") + "collection_mode": "single"}, values) == ("68", None, "single") assert handle_value({"value": "$b", - "collection_mode": "list"}, values) == ("68", "list") + "collection_mode": "list"}, values) == ("68", None, "list") assert handle_value({"value": "$b", - "collection_mode": "multiproperty"}, values) == ("68", "multiproperty") + "collection_mode": "multiproperty"}, values) == ("68", None, "multiproperty") + + # Unit specified in the same way as value: + assert handle_value({"value": 5, "unit": "m"}, values) == (5, "m", "single") + assert handle_value({"value": 5, "unit": "${my_unit}"}, values) == (5, "m", "single") + assert handle_value({"value": "+5", "unit": "${my_unit}"}, values) == ("5", "m", "list") + assert handle_value({"value": "*5", "unit": "${my_unit}"}, + values) == ("5", "m", "multiproperty") - assert handle_value(["a", "b"], values) == (["a", "b"], "single") - assert handle_value(["$a", "$b"], values) == ([4, "68"], "single") + assert handle_value(["a", "b"], values) == (["a", "b"], None, "single") + assert handle_value(["$a", "$b"], values) == ([4, "68"], None, "single") + assert handle_value({"value": ["$a", "$a"], "unit": "$cm"}, values) == ([4, 4], "cm", "single") def test_apply_transformers(converter_registry): @@ -496,7 +520,7 @@ MyElement: two_doc_yaml = """ --- metadata: - crawler-version: 0.3.1 + crawler-version: 0.9.0 Converters: MyNewType: converter: MyNewTypeConverter @@ -632,9 +656,10 @@ def test_load_converters(): # converter classes can be loaded from their respective packages. # Please adapt, if defaults change! - assert len(converter_registry) == 22 + assert len(converter_registry) == 29 # All of them are contained in caoscrawler.converters + # except for the xml converters: for conv_key, conv in converter_registry.items(): assert conv["package"] == "caoscrawler.converters" # ... 
and their names all end in "Converter" @@ -659,3 +684,448 @@ def test_create_path_value(converter_registry): dc.create_values(values, Directory("a", "/a")) assert "Test.path" in values assert values["Test.path"] == "/a" + + +def test_properties_from_dict_basic(converter_registry): + """Test that a record with the correct name and properties is created, and + that the children are still created correctly. + + """ + # definitions with blacklist and named references + pfdc = PropertiesFromDictConverter( + definition={ + "type": "PropertiesFromDictElement", + "match": ".*", + "record_from_dict": { + "variable_name": "MyRec", + "parents": ["DictRT1", "DictRT2"], + "properties_blacklist": ["blacklisted_int", "blacklisted_ref"], + "references": { + "authors": { + "parents": ["Person"] + } + } + } + }, + name="Test", converter_registry=converter_registry) + # Tests for Dict with scalars, dict with lists, dict with reference, + # dict with list of references, dict with reference with reference, named + # reference + values = GeneralStore() + records = RecordStore() + test_dict_element = DictElement("TestDictElement", { + "a": 5, + "b": ["a", "b", "c"], + "scalar_ref": { + "name": "Scalar Ref", + "a": 23, + "blacklisted_int": 42 + }, + "list_ref": [ + { + "c": True + }, + { + "c": False + } + ], + "ref_with_ref": { + "a": 789, + "ref_in_ref": { + "b": "something" + } + }, + "blacklisted_int": -123, + "blacklisted_ref": { + "a": 25 + }, + "authors": { + "full_name": "Some Author" + } + }) + pfdc.create_records(values=values, records=records, element=test_dict_element) + assert "MyRec" in records + my_rec = records["MyRec"] + assert isinstance(my_rec, db.Record) + assert len(my_rec.parents) == 2 + assert "DictRT1" in [par.name for par in my_rec.parents] + assert "DictRT2" in [par.name for par in my_rec.parents] + + # scalar prop + assert my_rec.get_property("a") is not None + assert my_rec.get_property("a").value == 5 + + # list prop + assert my_rec.get_property("b") is not None + assert len(my_rec.get_property("b").value) == 3 + for elt in ["a", "b", "c"]: + assert elt in my_rec.get_property("b").value + + # scalar ref + assert my_rec.get_property("scalar_ref") is not None + referenced = my_rec.get_property("scalar_ref").value + assert isinstance(referenced, db.Record) + assert referenced.name == "Scalar Ref" + assert len(referenced.parents) == 1 + assert "scalar_ref" in [par.name for par in referenced.parents] + assert referenced.get_property("a") is not None + assert referenced.get_property("a").value == 23 + # blacklisted + assert referenced.get_property("blacklisted_int") is None + + # list of ref + assert my_rec.get_property("list_ref") is not None + assert isinstance(my_rec.get_property("list_ref").value, list) + assert len(my_rec.get_property("list_ref").value) == 2 + for rec in my_rec.get_property("list_ref").value: + assert isinstance(rec, db.Record) + assert len(rec.parents) == 1 + assert "list_ref" in [par.name for par in rec.parents] + assert rec.get_property("c") is not None + assert type(rec.get_property("c").value) is bool + assert True in [rec.get_property("c").value for rec in my_rec.get_property("list_ref").value] + assert False in [rec.get_property("c").value for rec in my_rec.get_property("list_ref").value] + + # ref with ref + assert my_rec.get_property("ref_with_ref") is not None + outer_rec = my_rec.get_property("ref_with_ref").value + assert isinstance(outer_rec, db.Record) + assert len(outer_rec.parents) == 1 + assert "ref_with_ref" in [par.name for par in outer_rec.parents] 
+ assert outer_rec.get_property("a") is not None + assert outer_rec.get_property("a").value == 789 + assert outer_rec.get_property("ref_in_ref") is not None + inner_rec = outer_rec.get_property("ref_in_ref").value + assert isinstance(inner_rec, db.Record) + assert len(inner_rec.parents) == 1 + assert "ref_in_ref" in [par.name for par in inner_rec.parents] + assert inner_rec.get_property("b") is not None + assert inner_rec.get_property("b").value == "something" + + # blacklisted + assert my_rec.get_property("blacklisted_int") is None + assert my_rec.get_property("blacklisted_ref") is None + + # named reference property + assert my_rec.get_property("authors") is not None + author_rec = my_rec.get_property("authors").value + assert isinstance(author_rec, db.Record) + assert len(author_rec.parents) == 1 + assert "Person" in [par.name for par in author_rec.parents] + assert author_rec.get_property("full_name") is not None + assert author_rec.get_property("full_name").value == "Some Author" + + +def test_properties_from_dict_callable(converter_registry): + + def convert_some_values(rec: db.Record, records: RecordStore, values: GeneralStore): + """Add an URL prefix to a property value if appliccable.""" + + if rec.get_property("url") is not None: + + old_val = rec.get_property("url").value + if not (old_val is None or old_val.startswith("http")): + + # only add if there is a value that doesn't look like an URL + rec.get_property("url").value = f"https://test.com/{old_val}" + + return rec + + pdfc = PropertiesFromDictConverter( + definition={ + "record_from_dict": { + "variable_name": "MyRec", + "name": "My New Record" + } + }, + name="TestConverter", + converter_registry=converter_registry, + referenced_record_callback=convert_some_values + ) + + values = GeneralStore() + records = RecordStore() + test_dict_element = DictElement("TestDictElement", { + "url": "something", + "referenced1": { + "url": "referenced" + }, + "referenced2": { + "nourl": "something else", + "url": "https://indiscale.com" + } + }) + pdfc.create_records(values=values, records=records, element=test_dict_element) + assert "MyRec" in records + my_rec = records["MyRec"] + assert isinstance(my_rec, db.Record) + assert len(my_rec.parents) == 1 + assert "MyRec" in [par.name for par in my_rec.parents] + assert my_rec.name == "My New Record" + + # simple conversion + assert my_rec.get_property("url") is not None + assert my_rec.get_property("url").value == "https://test.com/something" + + # also works in referenced + assert my_rec.get_property("referenced1") is not None + referenced1 = my_rec.get_property("referenced1").value + assert isinstance(referenced1, db.Record) + assert referenced1.get_property("url") is not None + assert referenced1.get_property("url").value == "https://test.com/referenced" + + # ... and works as expected + assert my_rec.get_property("referenced2") is not None + referenced2 = my_rec.get_property("referenced2").value + assert isinstance(referenced2, db.Record) + assert referenced2.get_property("nourl") is not None + assert referenced2.get_property("nourl").value == "something else" + assert referenced2.get_property("url") is not None + assert referenced2.get_property("url").value == "https://indiscale.com" + + +def test_properties_from_dict_nested(converter_registry): + """Test the PropertiesFromDictConverter with a nested dict, + together with the regular DictElementConverter and Records created + and used on different subtree levels. 
+ + """ + root_dict_element = DictElement("RootDict", { + "TopLevelRec": "MyRec", + "propertiesDict": { + "a": 5, + "blacklisted": { + "bl_name": "BlackList", + "date": "2023-12-31" + } + }, + "otherDict": { + "additional_from_other": "other" + } + }) + def_dict = { + "RootElt": { + # Root dictionary + "type": "DictElement", + "match": ".*", + "records": { + # Define top-level, use below in subtrees + "MyRec": { + "parents": ["MyType"] + } + }, + "subtree": { + # Top-level text element for the Record name + "NameElt": { + "type": "TextElement", + "match_name": "^TopLevelRec$", + "match_value": "(?P<name>.*)", + "records": { + "MyRec": { + "name": "$name" + } + } + }, + "PFDElement": { + "type": "PropertiesFromDictElement", + "match_name": "^propertiesDict$", + "record_from_dict": { + "variable_name": "MyRec", + "properties_blacklist": ["blacklisted"] + }, + "subtree": { + "BLElement": { + "type": "DictElement", + "match_name": "^blacklisted$", + "records": { + "BLRec": { + "parents": ["BlackListedType"], + "MyRec": "$MyRec" + } + }, + "subtree": { + "BLNameElt": { + "type": "TextElement", + "match_name": "^bl_name$", + "match_value": "(?P<name>.*)", + "records": { + "BLRec": { + "name": "$name" + } + } + }, + "BLDateElt": { + "type": "TextElement", + "match_name": "^date$", + "match_value": "(?P<date>.*)", + "records": { + "BLRec": { + "creation_date": "$date" + } + } + } + } + } + } + }, + # Other dict which uses the DictElementConverter + "OtherDictElement": { + "type": "DictElement", + "match_name": "^otherDict$", + "subtree": { + "additionalElt": { + "type": "TextElement", + "match_name": "^additional_from_other$", + "match_value": "(?P<val>.*)", + "records": { + "MyRec": { + "additional_from_other": "$val" + } + } + } + } + } + } + } + } + + records = scan_structure_elements(root_dict_element, def_dict, converter_registry) + + # All records need to be there + assert len(records) == 2 + myrec = None + blrec = None + for rec in records: + if rec.name == "MyRec": + myrec = rec + elif rec.name == "BlackList": + blrec = rec + assert myrec is not None + assert blrec is not None + + # Parent is set from top level + assert len(myrec.parents) == 1 + assert "MyType" in [par.name for par in myrec.parents] + + # Set automatically, with blacklist + assert myrec.get_property("a") is not None + assert myrec.get_property("a").value == 5 + assert myrec.get_property("blacklisted") is None + + # Now check blacklisted record from subtree + assert len(blrec.parents) == 1 + assert "BlackListedType" in [par.name for par in blrec.parents] + assert blrec.get_property("MyRec") is not None + assert blrec.get_property("MyRec").value == myrec + assert blrec.get_property("creation_date") is not None + assert blrec.get_property("creation_date").value == "2023-12-31" + + # The "old" DictConverter should have added the additional property: + assert myrec.get_property("additional_from_other") is not None + assert myrec.get_property("additional_from_other").value == "other" + + +def test_dict_match_properties(converter_registry): + + root_dict_element = DictElement("RootDict", { + "prop_a": "value", + "prop_b": "25", + "prop_c": 24 + }) + + def_dict = { + "RootElt": { + # Root dictionary + "type": "DictElement", + "match_properties": { + "prop_a": "(?P<a>.*)$", + "prop_[^ac]": "(?P<b>.*)$", + "prop_c": "(?P<c>.*)$", + }, + "records": { + # Define top-level, use below in subtrees + "MyRec": { + "prop_a": "$a", + "prop_b": "$b", + "$a": "$c" + } + }}} + records = scan_structure_elements(root_dict_element, def_dict, 
converter_registry) + assert len(records) == 1 + record = records[0] + assert record.get_property("prop_a").value == "value" + assert record.get_property("prop_b").value == "25" + assert record.get_property("value").value == "24" # Note the type change here + + root_dict_element = DictElement("RootDict", { + "prop_a": "value", + "prop_b": "25", + # Property missing + }) + + records = scan_structure_elements(root_dict_element, def_dict, converter_registry) + assert len(records) == 0 + + with pytest.raises(RuntimeError, match="Multiple properties match the same match_properties entry."): + root_dict_element = DictElement("RootDict", { + "prop_a": "value", + "prop_b": "25", + "prop_d": 24 # duplicate matches + }) + records = scan_structure_elements(root_dict_element, def_dict, converter_registry) + + +def test_directory_converter_change_date(caplog, converter_registry): + """Test that only directories that were modified after a certain + date are crawled. + + """ + test_dir_element = Directory("test_directories", UNITTESTDIR / "test_directories") + date_of_dir_change = DirectoryConverter._get_most_recent_change_in_dir(test_dir_element) + past_date = date_of_dir_change - datetime.timedelta(days=1) + future_date = date_of_dir_change + datetime.timedelta(days=1) + + tmpfi = NamedTemporaryFile(delete=False) + + # Write down past + with open(tmpfi.name, "w") as fi: + fi.write(f"{past_date.isoformat()}\n") + + converter_def = { + "type": "Directory", + "match": "^test_directories$", + "match_newer_than_file": tmpfi.name + } + dc = DirectoryConverter(name="DC1", definition=converter_def, + converter_registry=converter_registry) + assert dc.match(test_dir_element) is not None + + # Write down future, so nothing should match + with open(tmpfi.name, "w") as fi: + fi.write(f"{future_date.isoformat()}\n") + assert dc.match(test_dir_element) is None + + # Also match in the corner case of equality: + with open(tmpfi.name, "w") as fi: + fi.write(f"{date_of_dir_change.isoformat()}\n") + assert dc.match(test_dir_element) is not None + + # Match but warn + with open(tmpfi.name, "w") as fi: + fi.write(f"This is garbage.\n") + with pytest.raises(ValueError): + dc.match(test_dir_element) + assert len(caplog.record_tuples) == 1 + assert caplog.record_tuples[0][1] == logging.ERROR + assert tmpfi.name in caplog.record_tuples[0][2] + assert "doesn't contain a ISO formatted datetime in its first line" in caplog.record_tuples[0][2] + + # Match anything since file doesn't exist, inform in debug log. + os.remove(tmpfi.name) + # Clear log and enforce debug level. + caplog.clear() + caplog.set_level(logging.DEBUG) + assert dc.match(test_dir_element) is not None + assert len(caplog.record_tuples) == 1 + assert caplog.record_tuples[0][1] == logging.DEBUG + assert "Reference file doesn't exist." 
== caplog.record_tuples[0][2] diff --git a/unittests/test_crawler.py b/unittests/test_crawler.py index fbf98346e59b0cbec88f17398eff41f26c423dee..ad69c6f57cbc8d48d194507d7c1aa79c9da7521b 100644 --- a/unittests/test_crawler.py +++ b/unittests/test_crawler.py @@ -23,7 +23,6 @@ """ test the Crawler class """ -import json import logging import os import warnings @@ -33,14 +32,23 @@ from os.path import basename, dirname, join from pathlib import Path from unittest.mock import MagicMock, Mock, patch -import caoscrawler -import caosdb as db -import caosdb.common.models as dbmodels +import linkahead as db +import linkahead.common.models as dbmodels import pytest import yaml +from caosadvancedtools.models.parser import parse_model_from_string +from linkahead.apiutils import compare_entities +from linkahead.cached import cache_clear +from linkahead.exceptions import EmptyUniqueQueryError +from pytest import raises + +import caoscrawler from caoscrawler.crawl import (Crawler, SecurityMode, _treat_deprecated_prefix, crawler_main, split_restricted_path) from caoscrawler.debug_tree import DebugTree +from caoscrawler.exceptions import (ImpossibleMergeError, + MissingIdentifyingProperty, + MissingReferencingEntityError) from caoscrawler.identifiable import Identifiable from caoscrawler.identifiable_adapters import (CaosDBIdentifiableAdapter, IdentifiableAdapter, @@ -50,10 +58,7 @@ from caoscrawler.scanner import (create_converter_registry, scan_directory, from caoscrawler.stores import GeneralStore, RecordStore from caoscrawler.structure_elements import (DictElement, DictListElement, DictTextElement, File) -from caosdb.apiutils import compare_entities -from caosdb.cached import cache_clear -from caosdb.exceptions import EmptyUniqueQueryError -from pytest import raises +from caoscrawler.sync_graph import SyncGraph UNITTESTDIR = Path(__file__).parent @@ -85,6 +90,20 @@ NEW_ELEMENT = (db.Record() .add_property(name="result", value="homogeneous")) +def reset_mocks(mocks): + for mock in mocks: + mock.reset_mock() + + +def mock_create_values(values, element): + pass + + +def mock_get_entity_by_query(query=None): + if query is not None: + return db.Record(id=1111, name='rec_name').add_parent('RT') + + def mock_get_entity_by(eid=None, name=None, path=None): if eid is not None: candidates = [el for el in EXAMPLE_SERVER_STATE if el.id == eid] @@ -108,13 +127,113 @@ def mock_get_entity_by(eid=None, name=None, path=None): raise EmptyUniqueQueryError("") +def basic_retrieve_by_name_mock_up(rec, referencing_entities=None, known=None): + """ returns a stored Record if rec.name is an existing key, None otherwise """ + if rec.name in known: + return known[rec.name] + else: + return None + + +def mock_retrieve_record(identifiable: Identifiable): + """ assumes that the identifiable is always only the date""" + + for record in EXAMPLE_SERVER_STATE: + if (record.role == "Record" and "date" in identifiable.properties + and record.get_property("date").value == identifiable.properties['date']): + return record + return None + + +def mock_cached_only_rt(query_string: str): + """Always return an empty Container""" + result = db.Container() + lo_query = query_string.lower() + if lo_query.startswith("find record ") or lo_query.startswith("find file "): + return result + model = parse_model_from_string(""" +B: + obligatory_properties: + C: + obligatory_properties: + prop_other: + datatype: INTEGER + prop_ident: + datatype: INTEGER +A: + obligatory_properties: + B: + datatype: LIST<B> + prop_ident: +""") + if query_string == "FIND 
RECORDTYPE 'A'":
+        model.get_deep("A").id = 1
+        return result + [model.get_deep("A")]
+    if query_string == "FIND RECORDTYPE 'B'":
+        model.get_deep("A").id = 2
+        return result + [model.get_deep("B")]
+    print(query_string)
+    raise NotImplementedError(f"Mock for this case is missing: {query_string}")
+
+
+def mock_cached_only_rt_allow_empty(query_string: str):
+    try:
+        result = mock_cached_only_rt(query_string)
+    except NotImplementedError:
+        result = db.Container()
+    return result
+
+
 @pytest.fixture(autouse=True)
 def clear_cache():
     cache_clear()
 
 
+@pytest.fixture
+def crawler_mocked_identifiable_retrieve():
+    crawler = Crawler()
+    # TODO use minimal setup
+    # mock retrieval of registered identifiables: return Record with just a parent
+    crawler.identifiableAdapter.get_registered_identifiable = Mock(
+        side_effect=lambda x: db.Record().add_parent(x.parents[0].name).add_property(name='name'))
+
+    # Simulate remote server content by using the names to identify records
+    # There is only a single known Record with name A
+    crawler.identifiableAdapter.retrieve_identified_record_for_identifiable = Mock(
+        side_effect=partial(
+            basic_retrieve_by_name_mock_up, known={"A": db.Record(id=1111, name="A")}))
+    return crawler
+
+
+@pytest.fixture
+def crawler_mocked_for_backref_test():
+    crawler = Crawler()
+    # mock retrieval of registered identifiables: return Record with just a parent
+
+    def get_reg_ident(x):
+        if x.parents[0].name == "C":
+            return db.Record().add_parent(x.parents[0].name).add_property(
+                "is_referenced_by", value=["BR"]).add_property("name")
+        elif x.parents[0].name == "D":
+            return db.Record().add_parent(x.parents[0].name).add_property(
+                "is_referenced_by", value=["BR", "BR2"]).add_property("name")
+        else:
+            return db.Record().add_parent(x.parents[0].name).add_property("name")
+    crawler.identifiableAdapter.get_registered_identifiable = Mock(side_effect=get_reg_ident)
+
+    # Simulate remote server content by using the names to identify records
+    # There is only a single known Record with name A
+    crawler.identifiableAdapter.retrieve_identified_record_for_identifiable = Mock(
+        side_effect=partial(
+            basic_retrieve_by_name_mock_up, known={"A":
+                                                   db.Record(id=1111, name="A").add_parent("BR")}))
+    return crawler
+
+
 @pytest.mark.filterwarnings("ignore::DeprecationWarning")
 def test_constructor():
+    # tests that appropriate DeprecationWarnings are triggered by the constructor when deprecated
+    # arguments are being passed.
     with warnings.catch_warnings(record=True) as w:
         # Cause all warnings to always be triggered.
         warnings.filterwarnings("ignore")
@@ -131,6 +250,7 @@ def test_constructor():
 
 @pytest.mark.filterwarnings("ignore::DeprecationWarning")
 def test_deprecated_functions():
+    # tests that appropriate DeprecationWarnings are triggered by deprecated methods
     with warnings.catch_warnings(record=True) as w:
         # Cause all warnings to always be triggered.
         warnings.filterwarnings("ignore")
@@ -175,106 +295,58 @@ def test_check_whether_parent_exists():
 
 def test_remove_unnecessary_updates():
     # test trvial case
-    upl = [db.Record().add_parent("A")]
-    irs = [db.Record().add_parent("A")]
-    updates = Crawler.remove_unnecessary_updates(upl, irs)
+    crawled_data = [db.Record().add_parent("A")]
+    identified_records = [db.Record().add_parent("A")]
+    updates = Crawler.remove_unnecessary_updates(crawled_data, identified_records)
     assert len(updates) == 0
 
     # test property difference case
-    # TODO this should work right?
- # upl = [db.Record().add_parent("A").add_property("a", 3)] - # irs = [db.Record().add_parent("A")] # ID should be s - # Crawler.remove_unnecessary_updates(upl, irs) - # assert len(upl) == 1 + crawled_data = [db.Record().add_parent("A").add_property("a", 3)] + identified_records = [db.Record().add_parent("A")] # ID should be s + Crawler.remove_unnecessary_updates(crawled_data, identified_records) + assert len(crawled_data) == 1 # test value difference case - upl = [db.Record().add_parent("A").add_property("a", 5)] - irs = [db.Record().add_parent("A").add_property("a")] - updates = Crawler.remove_unnecessary_updates(upl, irs) + crawled_data = [db.Record().add_parent("A").add_property("a", 5)] + identified_records = [db.Record().add_parent("A").add_property("a")] + updates = Crawler.remove_unnecessary_updates(crawled_data, identified_records) assert len(updates) == 1 - upl = [db.Record().add_parent("A").add_property("a", 5)] - irs = [db.Record().add_parent("A").add_property("a", 5)] - updates = Crawler.remove_unnecessary_updates(upl, irs) + crawled_data = [db.Record().add_parent("A").add_property("a", 5)] + identified_records = [db.Record().add_parent("A").add_property("a", 5)] + updates = Crawler.remove_unnecessary_updates(crawled_data, identified_records) assert len(updates) == 0 # test unit difference case - upl = [db.Record().add_parent("A").add_property("a", unit='cm')] - irs = [db.Record().add_parent("A").add_property("a")] - updates = Crawler.remove_unnecessary_updates(upl, irs) + crawled_data = [db.Record().add_parent("A").add_property("a", unit='cm')] + identified_records = [db.Record().add_parent("A").add_property("a")] + updates = Crawler.remove_unnecessary_updates(crawled_data, identified_records) assert len(updates) == 1 # test None difference case - upl = [db.Record().add_parent("A").add_property("a")] - irs = [db.Record().add_parent("A").add_property("a", 5)] - updates = Crawler.remove_unnecessary_updates(upl, irs) + crawled_data = [db.Record().add_parent("A").add_property("a")] + identified_records = [db.Record().add_parent("A").add_property("a", 5)] + updates = Crawler.remove_unnecessary_updates(crawled_data, identified_records) assert len(updates) == 1 def test_split_into_inserts_and_updates_trivial(): crawler = Crawler() - crawler.split_into_inserts_and_updates([]) + st = SyncGraph([], crawler.identifiableAdapter) + crawler._split_into_inserts_and_updates(st) -def basic_retrieve_by_name_mock_up(rec, referencing_entities=None, known=None): - """ returns a stored Record if rec.name is an existing key, None otherwise """ - if rec.name in known: - return known[rec.name] - else: - return None - - -@pytest.fixture -def crawler_mocked_identifiable_retrieve(): - crawler = Crawler() - # TODO use minimal setup - # mock retrieval of registered identifiabls: return Record with just a parent - crawler.identifiableAdapter.get_registered_identifiable = Mock( - side_effect=lambda x: db.Record().add_parent(x.parents[0].name).add_property(name='name')) - - # Simulate remote server content by using the names to identify records - # There is only a single known Record with name A - crawler.identifiableAdapter.retrieve_identified_record_for_record = Mock(side_effect=partial( - basic_retrieve_by_name_mock_up, known={"A": db.Record(id=1111, name="A")})) - crawler.identifiableAdapter.retrieve_identified_record_for_identifiable = Mock( - side_effect=partial( - basic_retrieve_by_name_mock_up, known={"A": db.Record(id=1111, name="A")})) - return crawler - - -def 
test_split_into_inserts_and_updates_single(crawler_mocked_identifiable_retrieve): +def test_split_into_inserts_and_updates_simple(crawler_mocked_identifiable_retrieve): + # basic test that checks whether two records are correctly sorted to update and insert based on + # whether an entity can be found using the identifiable crawler = crawler_mocked_identifiable_retrieve identlist = [Identifiable(name="A", record_type="C"), Identifiable(name="B", record_type="C")] - entlist = [db.Record(name="A").add_parent( - "C"), db.Record(name="B").add_parent("C")] - - assert crawler.get_from_any_cache(identlist[0]) is None - assert crawler.get_from_any_cache(identlist[1]) is None - assert not crawler._has_reference_value_without_id(identlist[0]) - assert not crawler._has_reference_value_without_id(identlist[1]) - assert crawler.identifiableAdapter.retrieve_identified_record_for_record( - identlist[0]).id == 1111 - assert crawler.identifiableAdapter.retrieve_identified_record_for_record( - identlist[1]) is None - - insert, update = crawler.split_into_inserts_and_updates(deepcopy(entlist)) - assert len(insert) == 1 - assert insert[0].name == "B" - assert len(update) == 1 - assert update[0].name == "A" - # if this ever fails, the mock up may be removed - crawler.identifiableAdapter.get_registered_identifiable.assert_called() - crawler.identifiableAdapter.retrieve_identified_record_for_identifiable.assert_called() + entlist = [db.Record(name="A").add_parent("C"), + db.Record(name="B").add_parent("C")] + st = SyncGraph(entlist, crawler.identifiableAdapter) + # check setup -def test_split_into_inserts_and_updates_with_duplicate(crawler_mocked_identifiable_retrieve): - crawler = crawler_mocked_identifiable_retrieve - a = db.Record(name="A").add_parent("C") - b = db.Record(name="B").add_parent("C") - b.add_property("A", a) - # This is identical to a and should be removed - c = db.Record(name="A").add_parent("C") - entlist = [a, b, c] - insert, update = crawler.split_into_inserts_and_updates(deepcopy(entlist)) + insert, update = crawler._split_into_inserts_and_updates(st) assert len(insert) == 1 assert insert[0].name == "B" assert len(update) == 1 @@ -284,31 +356,20 @@ def test_split_into_inserts_and_updates_with_duplicate(crawler_mocked_identifiab crawler.identifiableAdapter.retrieve_identified_record_for_identifiable.assert_called() -def test_split_into_inserts_and_updates_with_ref(crawler_mocked_identifiable_retrieve): +def test_split_into_inserts_and_updates_with_circ(crawler_mocked_identifiable_retrieve): + # test trying to split circular dependency crawler = crawler_mocked_identifiable_retrieve - # try it with a reference - a = db.Record(name="A").add_parent("C") - b = db.Record(name="B").add_parent("C") - b.add_property("A", a) - entlist = [a, b] - insert, update = crawler.split_into_inserts_and_updates(entlist) - assert len(insert) == 1 - assert insert[0].name == "B" - assert len(update) == 1 - assert update[0].name == "A" - # if this ever fails, the mock up may be removed - crawler.identifiableAdapter.get_registered_identifiable.assert_called() - crawler.identifiableAdapter.retrieve_identified_record_for_identifiable.assert_called() - + crawler.identifiableAdapter.get_registered_identifiable = Mock( + side_effect=lambda x: db.Record().add_parent('C').add_property(name='a') + ) + # two records that reference each other via identifying properties + a = db.Record().add_parent("C") + b = db.Record().add_parent("C").add_property(name='a', value=a) + a.add_property(name='a', value=b) -def 
test_split_into_inserts_and_updates_with_circ(): - # try circular - a = db.Record(name="A").add_parent("C") - b = db.Record(name="B").add_parent("C") - b.add_property("A", a) - a.add_property("B", b) - entlist = [a, b] - # TODO this does not seem to be complete! + st = SyncGraph([a, b], crawler.identifiableAdapter) + with pytest.raises(RuntimeError): + crawler._split_into_inserts_and_updates(st) def test_split_into_inserts_and_updates_with_complex(crawler_mocked_identifiable_retrieve): @@ -322,11 +383,12 @@ def test_split_into_inserts_and_updates_with_complex(crawler_mocked_identifiable b = db.Record(name="B").add_parent("C") g = db.Record(name="G").add_parent("C") f = db.Record(name="F").add_parent("C") - g.add_property("A", a) - b.add_property("A", f) + g.add_property("C", b) b.add_property("A", a) + b.add_property("C", f) entlist = [a, b, g] - insert, update = crawler.split_into_inserts_and_updates(entlist) + st = SyncGraph(entlist, crawler.identifiableAdapter) + insert, update = crawler._split_into_inserts_and_updates(st) assert len(insert) == 3 assert "B" in [el.name for el in insert] assert len(update) == 1 @@ -338,86 +400,187 @@ def test_split_into_inserts_and_updates_with_complex(crawler_mocked_identifiable # TODO write test where the unresoled entity is not part of the identifiable -def test_split_into_inserts_and_updates_with_copy_attr(crawler_mocked_identifiable_retrieve): - crawler = crawler_mocked_identifiable_retrieve - # assume identifiable is only the name - a = db.Record(name="A").add_parent("C") - a.add_property("foo", 1) - b = db.Record(name="A").add_parent("C") - b.add_property("bar", 2) - entlist = [a, b] - insert, update = crawler.split_into_inserts_and_updates(entlist) - - assert update[0].get_property("bar").value == 2 - assert update[0].get_property("foo").value == 1 - # if this ever fails, the mock up may be removed - crawler.identifiableAdapter.get_registered_identifiable.assert_called() - crawler.identifiableAdapter.retrieve_identified_record_for_identifiable.assert_called() +@patch("caoscrawler.crawl.cached_get_entity_by", + new=Mock(side_effect=mock_get_entity_by)) +@patch("caoscrawler.identifiable_adapters.cached_query", + new=Mock(side_effect=mock_cached_only_rt)) +def test_split_iiau_with_unmergeable_list_items(): + """Test for meaningful exception when referencing a list of unmergeable entities. 
+Datamodel +--------- +A: + B: LIST<B> + prop_ident: INTEGER -def test_has_missing_object_in_references(): - crawler = Crawler() - # Simulate remote server content by using the names to identify records - # There are only two known Records with name A and B - crawler.identifiableAdapter.get_registered_identifiable = Mock(side_effect=partial( - basic_retrieve_by_name_mock_up, known={"C": db.Record(name="C").add_parent("RTC") - .add_property("d").add_property("name"), - "D": db.Record(name="D").add_parent("RTD") - .add_property("d").add_property("e").add_property("name"), - })) - - # one reference with id -> check - assert not crawler._has_missing_object_in_references( - Identifiable(name="C", record_type="RTC", properties={'d': 123}), []) - # one ref with Entity with id -> check - assert not crawler._has_missing_object_in_references( - Identifiable(name="C", record_type="RTC", properties={'d': db.Record(id=123) - .add_parent("C")}), []) - # one ref with id one with Entity with id (mixed) -> check - assert not crawler._has_missing_object_in_references( - Identifiable(name="C", record_type="RTD", - properties={'d': 123, 'b': db.Record(id=123).add_parent("RTC")}), []) - # entity to be referenced in the following - a = db.Record(name="C").add_parent("C").add_property("d", 12311) - # one ref with id one with Entity without id (but not identifying) -> fail - assert not crawler._has_missing_object_in_references( - Identifiable(name="C", record_type="RTC", properties={'d': 123, 'e': a}), []) - - # one ref with id one with Entity without id (mixed) -> fail - assert not crawler._has_missing_object_in_references( - Identifiable(name="D", record_type="RTD", properties={'d': 123, 'e': a}), []) - - crawler.add_to_remote_missing_cache(a, Identifiable(name="C", record_type="RTC", - properties={'d': 12311})) - # one ref with id one with Entity without id but in cache -> check - assert crawler._has_missing_object_in_references( - Identifiable(name="D", record_type="RTD", properties={'d': 123, 'e': a}), []) +B: + prop_ident: + C: - # if this ever fails, the mock up may be removed - crawler.identifiableAdapter.get_registered_identifiable.assert_called() +C: + prop_other: INTEGER +Identifiables +------------- -@pytest.mark.xfail() -def test_references_entities_without_ids(): - crawler = Crawler() - assert not crawler._has_reference_value_without_id(db.Record().add_parent("Person") - .add_property('last_name', 123) - .add_property('first_name', 123)) - # id and rec with id - assert not crawler._has_reference_value_without_id(db.Record().add_parent("Person") - .add_property('first_name', 123) - .add_property('last_name', - db.Record(id=123))) - # id and rec with id and one unneeded prop - assert crawler._has_reference_value_without_id(db.Record().add_parent("Person") - .add_property('first_name', 123) - .add_property('stuff', db.Record()) - .add_property('last_name', db.Record(id=123))) - - # one identifying prop is missing - assert crawler._has_reference_value_without_id(db.Record().add_parent("Person") - .add_property('first_name', 123) - .add_property('last_name', db.Record())) +id_A: [prop_ident] +id_B: [prop_ident, "is_referenced_by: A"] +id_C: [prop_other, "is_referenced_by: B"] + +Data +---- + +c1: (23) +c2: (42) + +b1: ("same", c1) +b2: ("same", c2) + +a: ([b1, b2]) + + + +- a can be identified. 
+- bs can be identified with each other once a is identified.
+- cs depend on b(s), but cannot be put in one Entity because they have conflicting properties.
+    """
+    prop_ident = db.Property("prop_ident", datatype=db.INTEGER)
+    prop_other = db.Property("prop_other", datatype=db.INTEGER)
+    rt_c = db.RecordType("C").add_property(prop_other)
+    # Somehow it is necessary that `B` has a reference property. It is unclear
+    # whether C must have an identifiable as well.
+    rt_b = db.RecordType("B").add_property(prop_ident).add_property("C")
+    rt_a = db.RecordType("A").add_property(prop_ident).add_property("LIST<B>")
+
+    ident_a = db.RecordType().add_parent("A").add_property("prop_ident")
+    ident_b = db.RecordType().add_parent("B").add_property("prop_ident").add_property(
+        "is_referenced_by", value="A")
+    ident_c = db.RecordType().add_parent("C").add_property("prop_other").add_property(
+        "is_referenced_by", value="B")
+
+    rec_a = db.Record("a").add_parent(rt_a).add_property("prop_ident", value=1234)
+    rec_b = []
+    rec_c = []
+    for value in [23, 42]:
+        new_c = db.Record().add_parent(rt_c).add_property("prop_other", value=value)
+        rec_c.append(new_c)
+        rec_b.append(db.Record().add_parent(rt_b).add_property(
+            "prop_ident", value=2020).add_property("C", value=new_c))
+    rec_a.add_property("B", rec_b)
+
+    ident_adapter = CaosDBIdentifiableAdapter()
+    ident_adapter.register_identifiable("A", ident_a)
+    ident_adapter.register_identifiable("B", ident_b)
+    ident_adapter.register_identifiable("C", ident_c)
+
+    crawler = Crawler(identifiableAdapter=ident_adapter)
+
+    st = SyncGraph(deepcopy([rec_a, *rec_b, *rec_c]), crawler.identifiableAdapter)
+    assert st._identity_relies_on_unchecked_entity(st.nodes[0]) is False
+    assert st._identity_relies_on_unchecked_entity(st.nodes[1])
+    assert st._identity_relies_on_unchecked_entity(st.nodes[2])
+    assert st._identity_relies_on_unchecked_entity(st.nodes[3])
+    assert st._identity_relies_on_unchecked_entity(st.nodes[4])
+    assert len(st.unchecked) == 5
+
+    # The Cs cannot be merged due to different identifying properties
+    # The Bs cannot be merged due to different references to Cs
+    with raises(ImpossibleMergeError) as rte:
+        crawler._split_into_inserts_and_updates(st)
+
+    # The order of the Cs is random, so we only know that they are the
+    # last two elements, but not in which order the merge was attempted.
+    assert "The problematic property is 'C' with values " in str(rte.value)
+    assert f"'[{st.nodes[-2]}]'" in str(rte.value)
+    assert f"'[{st.nodes[-1]}]'" in str(rte.value)
+
+    # TODO
+    # assert not isinstance(rte.value, NotImplementedError), \
+    #     "Exception must not be NotImplementedError, but plain RuntimeError."
+    # assert "Could not find referencing entities" in rte.value.args[0]
+    # assert "merge conflicts in the referencing" in rte.value.args[0]
+
+
+@patch("caoscrawler.identifiable_adapters.get_children_of_rt",
+       new=Mock(side_effect=lambda x: [x]))
+def test_split_into_inserts_and_updates_backref(crawler_mocked_for_backref_test):
+    # test that backrefs are appropriately considered in the identifiable
+    crawler = crawler_mocked_for_backref_test
+    identlist = [Identifiable(name="A", record_type="BR"),
+                 Identifiable(name="B", record_type="C", backrefs=[db.Entity()])]
+    referenced = db.Record(name="B").add_parent("C")
+    entlist = [referenced, db.Record(name="A").add_parent("BR").add_property("ref", referenced), ]
+
+    # Test without referencing object
+    # currently a MissingReferencingEntityError is raised if necessary properties are missing.
+ with raises(MissingReferencingEntityError): + st = SyncGraph([db.Record(name="B").add_parent("C")], crawler.identifiableAdapter) + + # identifiables were not yet checked + st = SyncGraph(entlist, crawler.identifiableAdapter) + assert st.get_equivalent(st.nodes[1]) is None + assert st.get_equivalent(st.nodes[0]) is None + # one can be found remotely, one not + + # check the split... + insert, update = crawler._split_into_inserts_and_updates(st) + # A was found remotely and is therefore in the update list + assert len(update) == 1 + assert update[0].name == "A" + # B does not exist on the (simulated) remote server + assert len(insert) == 1 + assert insert[0].name == "B" + + +@patch("caoscrawler.identifiable_adapters.get_children_of_rt", + new=Mock(side_effect=lambda x: [x])) +def test_split_into_inserts_and_updates_mult_backref(crawler_mocked_for_backref_test): + # test whether multiple references of the same record type are correctly used + crawler = crawler_mocked_for_backref_test + referenced = db.Record(name="B").add_parent("C") + entlist = [referenced, + db.Record(id=1, name="A").add_parent("BR").add_property("ref", referenced), + db.Record(id=2, name="C").add_parent("BR").add_property("ref", referenced), + ] + + # test whether both entities are listed in the backref attribute of the identifiable + st = SyncGraph(entlist, crawler.identifiableAdapter) + + identifiable = crawler.identifiableAdapter.get_identifiable( + st.nodes[0], + st.backward_references_backref[id(st.nodes[0])]) + assert len(identifiable.backrefs) == 2 + + # check the split... + insert, update = crawler._split_into_inserts_and_updates(st) + assert len(update) == 2 + assert len(insert) == 1 + + +@patch("caoscrawler.identifiable_adapters.get_children_of_rt", + new=Mock(side_effect=lambda x: [x])) +def test_split_into_inserts_and_updates_diff_backref(crawler_mocked_for_backref_test): + # test whether multiple references of the different record types are correctly used + crawler = crawler_mocked_for_backref_test + referenced = db.Record(name="B").add_parent("D") + entlist = [referenced, + db.Record(id=1, name="A").add_parent("BR").add_property("ref", referenced), + db.Record(id=2, name="A").add_parent("BR2").add_property("ref", referenced), + ] + + # test whether both entities are listed in the backref attribute of the identifiable + st = SyncGraph(entlist, crawler.identifiableAdapter) + identifiable = crawler.identifiableAdapter.get_identifiable( + st.nodes[0], + st.backward_references_backref[id(st.nodes[0])]) + + assert len(identifiable.backrefs) == 2 + + # check the split... 
+ insert, update = crawler._split_into_inserts_and_updates(st) + assert len(update) == 2 + assert len(insert) == 1 def test_replace_entities_with_ids(): @@ -432,21 +595,6 @@ def test_replace_entities_with_ids(): assert a.get_property("C").value == [12345, 233324] -def reset_mocks(mocks): - for mock in mocks: - mock.reset_mock() - - -def mock_retrieve_record(identifiable: Identifiable): - """ assumes that the identifiable is always only the date""" - - for record in EXAMPLE_SERVER_STATE: - if (record.role == "Record" - and record.get_property("date").value == identifiable.properties['date']): - return record - return None - - @patch("caoscrawler.crawl.cached_get_entity_by", new=Mock(side_effect=mock_get_entity_by)) @patch("caoscrawler.identifiable_adapters.cached_get_entity_by", @@ -485,7 +633,6 @@ def test_synchronization_no_commit(upmock, insmock): def test_security_mode(updateCacheMock, upmock, insmock): # trivial case: nothing to do crawled_data = [r.copy() for r in EXAMPLE_SERVER_STATE if r.role == "Record"] - print(crawled_data) crawler = Crawler(securityMode=SecurityMode.RETRIEVE) crawler.synchronize(commit_changes=True, crawled_data=crawled_data) assert crawler.run_id is not None @@ -520,9 +667,6 @@ def test_security_mode(updateCacheMock, upmock, insmock): assert crawler.run_id is not None insmock.assert_not_called() upmock.assert_not_called() - # import IPython - # IPython.embed() - # print(updateCacheMock.call_args_list) assert updateCacheMock.call_count == 1 # reset counts reset_mocks([updateCacheMock, insmock, upmock]) @@ -578,64 +722,6 @@ def test_security_mode(updateCacheMock, upmock, insmock): crawled_data[-1] = EXAMPLE_SERVER_STATE[-1].copy() -def test_create_reference_mapping(): - a = db.Record().add_parent("A") - b = db.Record().add_parent("B").add_property('a', a) - ref = Crawler.create_reference_mapping([a, b]) - assert id(a) in ref - assert id(b) not in ref - assert "B" in ref[id(a)] - assert ref[id(a)]["B"] == [b] - - -def test_create_flat_list(): - a = db.Record() - b = db.Record() - a.add_property(name="a", value=a) - a.add_property(name="b", value=b) - flat = Crawler.create_flat_list([a]) - assert len(flat) == 2 - assert a in flat - assert b in flat - c = db.Record() - c.add_property(name="a", value=a) - # This would caus recursion if it is not dealt with properly. 
- a.add_property(name="c", value=c) - flat = Crawler.create_flat_list([c]) - assert len(flat) == 3 - assert a in flat - assert b in flat - assert c in flat - - -@pytest.fixture -def crawler_mocked_for_backref_test(): - crawler = Crawler() - # mock retrieval of registered identifiabls: return Record with just a parent - - def get_reg_ident(x): - if x.parents[0].name == "C": - return db.Record().add_parent(x.parents[0].name).add_property( - "is_referenced_by", value=["BR"]).add_property("name") - elif x.parents[0].name == "D": - return db.Record().add_parent(x.parents[0].name).add_property( - "is_referenced_by", value=["BR", "BR2"]).add_property("name") - else: - return db.Record().add_parent(x.parents[0].name).add_property("name") - crawler.identifiableAdapter.get_registered_identifiable = Mock(side_effect=get_reg_ident) - - # Simulate remote server content by using the names to identify records - # There is only a single known Record with name A - crawler.identifiableAdapter.retrieve_identified_record_for_record = Mock(side_effect=partial( - basic_retrieve_by_name_mock_up, known={"A": - db.Record(id=1111, name="A").add_parent("BR")})) - crawler.identifiableAdapter.retrieve_identified_record_for_identifiable = Mock( - side_effect=partial( - basic_retrieve_by_name_mock_up, known={"A": - db.Record(id=1111, name="A").add_parent("BR")})) - return crawler - - def test_validation_error_print(caplog): caplog.set_level(logging.DEBUG, logger="caoscrawler.converters") # there should be no server interaction since we only test the behavior if a validation error @@ -652,90 +738,6 @@ def test_validation_error_print(caplog): caplog.clear() -@patch("caoscrawler.identifiable_adapters.get_children_of_rt", - new=Mock(side_effect=lambda x: [x])) -def test_split_into_inserts_and_updates_backref(crawler_mocked_for_backref_test): - crawler = crawler_mocked_for_backref_test - identlist = [Identifiable(name="A", record_type="BR"), - Identifiable(name="B", record_type="C", backrefs=[db.Entity()])] - referenced = db.Record(name="B").add_parent("C") - entlist = [referenced, db.Record(name="A").add_parent("BR").add_property("ref", referenced), ] - - # Test without referencing object - # currently a NotImplementedError is raised if necessary properties are missing. - with raises(NotImplementedError): - crawler.split_into_inserts_and_updates([db.Record(name="B").add_parent("C")]) - - # identifiables were not yet checked - assert crawler.get_from_any_cache(identlist[0]) is None - assert crawler.get_from_any_cache(identlist[1]) is None - # one with reference, one without - assert not crawler._has_reference_value_without_id(identlist[0]) - assert crawler._has_reference_value_without_id(identlist[1]) - # one can be found remotely, one not - assert crawler.identifiableAdapter.retrieve_identified_record_for_record( - identlist[0]).id == 1111 - assert crawler.identifiableAdapter.retrieve_identified_record_for_record( - identlist[1]) is None - - # check the split... 
- insert, update = crawler.split_into_inserts_and_updates(deepcopy(entlist)) - # A was found remotely and is therefore in the update list - assert len(update) == 1 - assert update[0].name == "A" - # B does not exist on the (simulated) remote server - assert len(insert) == 1 - assert insert[0].name == "B" - - -@patch("caoscrawler.identifiable_adapters.get_children_of_rt", - new=Mock(side_effect=lambda x: [x])) -def test_split_into_inserts_and_updates_mult_backref(crawler_mocked_for_backref_test): - # test whether multiple references of the same record type are correctly used - crawler = crawler_mocked_for_backref_test - referenced = db.Record(name="B").add_parent("C") - entlist = [referenced, - db.Record(name="A").add_parent("BR").add_property("ref", referenced), - db.Record(name="C").add_parent("BR").add_property("ref", referenced), - ] - - # test whether both entities are listed in the backref attribute of the identifiable - referencing_entities = crawler.create_reference_mapping(entlist) - identifiable = crawler.identifiableAdapter.get_identifiable(referenced, referencing_entities) - assert len(identifiable.backrefs) == 2 - - # check the split... - insert, update = crawler.split_into_inserts_and_updates(deepcopy(entlist)) - assert len(update) == 1 - assert len(insert) == 2 - - -@patch("caoscrawler.identifiable_adapters.get_children_of_rt", - new=Mock(side_effect=lambda x: [x])) -def test_split_into_inserts_and_updates_diff_backref(crawler_mocked_for_backref_test): - # test whether multiple references of the different record types are correctly used - crawler = crawler_mocked_for_backref_test - referenced = db.Record(name="B").add_parent("D") - entlist = [referenced, - db.Record(name="A").add_parent("BR").add_property("ref", referenced), - db.Record(name="A").add_parent("BR2").add_property("ref", referenced), - ] - - # test whether both entities are listed in the backref attribute of the identifiable - referencing_entities = crawler.create_reference_mapping(entlist) - identifiable = crawler.identifiableAdapter.get_identifiable(referenced, referencing_entities) - assert len(identifiable.backrefs) == 2 - - # check the split... 
- insert, update = crawler.split_into_inserts_and_updates(deepcopy(entlist)) - assert len(update) == 2 - assert len(insert) == 1 - - -def mock_create_values(values, element): - pass - - @patch("caoscrawler.converters.IntegerElementConverter.create_values") def test_restricted_path(create_mock): """ @@ -822,9 +824,9 @@ def test_restricted_path(create_mock): def test_split_restricted_path(): - assert ["el"] == split_restricted_path("/el") - assert ["el"] == split_restricted_path("/el/") - assert ["el", "el"] == split_restricted_path("/el/el") + assert ["el"] == split_restricted_path(os.path.sep + "el") + assert ["el"] == split_restricted_path(os.path.sep + "el" + os.path.sep) + assert ["el", "el"] == split_restricted_path(os.path.sep + "el" + os.path.sep + "el") # Filter the warning because we want to have it here and this way it does not hinder running @@ -867,34 +869,6 @@ def test_create_entity_summary(): assert "<a href='/Entity/4'>a</a>, <a href='/Entity/6'>b</a>" in text -def test_detect_circular_dependency(crawler_mocked_identifiable_retrieve, caplog): - crawler = crawler_mocked_identifiable_retrieve - crawler.identifiableAdapter.get_registered_identifiable = Mock( - side_effect=lambda x: db.Record().add_parent('C').add_property(name='C')) - a = db.Record(name='a').add_parent("C") - b = db.Record(name='b').add_parent("C").add_property(name="C", value=a) - c = db.Record(name='c').add_parent("C").add_property(name='D', value='e' - ).add_property(name="C", value=b) - d = db.Record(name='c').add_parent("C") - a.add_property(name="C", value=c) - flat = [a, b, c] - circle = Crawler.detect_circular_dependency(flat) - assert [id(el) for el in circle] == [id(el) for el in [a, c, b, a]] - - assert Crawler.detect_circular_dependency([d]) is None - with raises(RuntimeError): - _, _ = crawler.split_into_inserts_and_updates(flat) - caplog.set_level(logging.ERROR, logger="caoscrawler.converters") - assert "Found circular dependency" in caplog.text - assert "-------\na\n['C" in caplog.text - caplog.clear() - - -def mock_get_entity_by_query(query=None): - if query is not None: - return db.Record(id=1111, name='rec_name').add_parent('RT') - - @patch("caoscrawler.crawl.cached_get_entity_by", new=Mock(side_effect=mock_get_entity_by_query)) def test_replace_name_with_referenced_entity(): diff --git a/unittests/test_data/invalid_identifiable/identifiable_content_no_list.yaml b/unittests/test_data/invalid_identifiable/identifiable_content_no_list.yaml new file mode 100644 index 0000000000000000000000000000000000000000..aee572a190bd7f439f638ef7c9a5d94a831aca81 --- /dev/null +++ b/unittests/test_data/invalid_identifiable/identifiable_content_no_list.yaml @@ -0,0 +1,4 @@ +Experiment: + date: + - 1 + - 2 diff --git a/unittests/test_data/invalid_identifiable/identifiable_no_str_or_dict.yaml b/unittests/test_data/invalid_identifiable/identifiable_no_str_or_dict.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a33c4ace9f8709a9b4a77c5fd8f38514acbe1e9c --- /dev/null +++ b/unittests/test_data/invalid_identifiable/identifiable_no_str_or_dict.yaml @@ -0,0 +1,3 @@ +Experiment: +- date +- 23 diff --git a/unittests/test_data/invalid_identifiable/identifiable_referenced_no_list.yaml b/unittests/test_data/invalid_identifiable/identifiable_referenced_no_list.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a504eab748d4891c3e1088ee785afcf6347fbbab --- /dev/null +++ b/unittests/test_data/invalid_identifiable/identifiable_referenced_no_list.yaml @@ -0,0 +1,5 @@ +Experiment: +- 
date +Event: +- is_referenced_by: Experiment +- event_id diff --git a/unittests/test_directories/examples_json/invalidjson.json b/unittests/test_directories/examples_json/invalidjson.json index 9c012bf062264014278fc2df7be6cf33b65c7469..49a00fc6df33fe8d82ec2735e39c400a2342f0bf 100644 --- a/unittests/test_directories/examples_json/invalidjson.json +++ b/unittests/test_directories/examples_json/invalidjson.json @@ -1,13 +1,13 @@ { - "projectId": 10002, - "archived": false, - "coordinator": { - "firstname": "Miri", - "lastname": "Mueller", - "email": "miri.mueller@science.de" - }, - "start_date": "2022-03-01", - "candidates": ["Mouse", "Penguine"], - "rvalue": 0.4444, - "url": "https://site.de/index.php/" + "projectId": 10002, + "archived": false, + "coordinator": { + "firstname": "Miri", + "lastname": "Mueller", + "email": "miri.mueller@science.de" + }, + "start_date": "2022-03-01", + "candidates": ["Mouse", "Penguine"], + "rvalue": 0.4444, + "url": "https://site.de/index.php/" } diff --git a/unittests/test_directories/examples_json/testjson.json b/unittests/test_directories/examples_json/testjson.json index d37ea2defc21d767e4e13ad3b39d6682b3c452ef..29d59780f4824d9c2edbc8fe1da3a6b380def57b 100644 --- a/unittests/test_directories/examples_json/testjson.json +++ b/unittests/test_directories/examples_json/testjson.json @@ -1,22 +1,21 @@ { - "name": "DEMO", - "projectId": 10002, - "archived": false, - "Person": [ - { - "firstname": "Miri", - "lastname": "Mueller", - "other": null, - "email": "miri.mueller@science.de" - }, + "name": "DEMO", + "projectId": 10002, + "archived": false, + "Person": [{ + "firstname": "Miri", + "lastname": "Mueller", + "other": null, + "email": "miri.mueller@science.de" + }, { "firstname": "Mara", "lastname": "Mueller", - "email": "mara.mueller@science.de" + "email": "mara.mueller@science.de" } ], - "start_date": "2022-03-01", - "candidates": ["Mouse", "Penguine"], - "rvalue": 0.4444, - "url": "https://site.de/index.php/" + "start_date": "2022-03-01", + "candidates": ["Mouse", "Penguine"], + "rvalue": 0.4444, + "url": "https://site.de/index.php/" } diff --git a/unittests/test_entity_comparison.py b/unittests/test_entity_comparison.py index 549bc4f42a59765d25446d44fbb845e49ca4d9b9..8543732fde4d584e2022dcf6432e9572ae625eb5 100644 --- a/unittests/test_entity_comparison.py +++ b/unittests/test_entity_comparison.py @@ -2,8 +2,7 @@ # Tests for entity comparison # A. Schlemmer, 06/2021 -import caosdb as db - +import linkahead as db import pytest from pytest import raises diff --git a/unittests/test_file_identifiables.py b/unittests/test_file_identifiables.py deleted file mode 100644 index 4ec02aa3fc497f8dc35adc709533ef5b35066f3a..0000000000000000000000000000000000000000 --- a/unittests/test_file_identifiables.py +++ /dev/null @@ -1,69 +0,0 @@ -#!/bin/python -# Tests for file identifiables -# A. 
Schlemmer, 06/2021 - -from unittest.mock import Mock, patch - -import caosdb as db -import pytest -from caoscrawler.identifiable import Identifiable -from caoscrawler.identifiable_adapters import LocalStorageIdentifiableAdapter -from caosdb.cached import cache_clear -from caosdb.exceptions import EmptyUniqueQueryError -from pytest import raises - -from test_crawler import mock_get_entity_by - - -@pytest.fixture(autouse=True) -def clear_cache(): - cache_clear() - - -@patch("caoscrawler.identifiable_adapters.get_children_of_rt", - new=Mock(side_effect=id)) -@patch("caoscrawler.identifiable_adapters.cached_get_entity_by", - new=Mock(side_effect=mock_get_entity_by)) -def test_file_identifiable(): - ident = LocalStorageIdentifiableAdapter() - - # Without a path there is no identifying information - with raises(ValueError): - ident.get_identifiable(db.File(), []) - - fp = "/test/bla/bla.txt" - file_obj = db.File(path=fp) - identifiable = ident.get_identifiable(file_obj) - - # the path is copied to the identifiable - assert fp == identifiable.path - assert isinstance(identifiable, Identifiable) - - # __eq__ function is only defined for Identifiable objects - with raises(ValueError): - file_obj != identifiable - - # since the path does not exist in the data in ident, the follwoing functions return None - with raises(EmptyUniqueQueryError): - ident.retrieve_identified_record_for_record(file_obj) - assert ident.get_file(identifiable) is None - - # Try again with actual files in the store: - records = ident.get_records() - test_record_wrong_path = db.File(path="/bla/bla/test.txt") - test_record_correct_path = db.File(path="/test/bla/bla.txt") - test_record_alsocorrect_path = db.File(path="/test/bla/bla.txt") - records.append(test_record_wrong_path) - # Now, there is a file, but still wrong path -> result is still None - identified_file = ident.get_file(file_obj) - assert identified_file is None - - records.append(test_record_correct_path) - # now there is a match - identified_file = ident.get_file(file_obj) - assert identified_file is not None - assert identified_file.path == file_obj.path - - with raises(RuntimeError, match=".*unambigiously.*"): - records.append(test_record_alsocorrect_path) - identified_file = ident.get_file(file_obj) diff --git a/unittests/test_h5_converter.py b/unittests/test_h5_converter.py new file mode 100644 index 0000000000000000000000000000000000000000..9c1058812c75c6d1e5ee7028c8f6fccd7081a54c --- /dev/null +++ b/unittests/test_h5_converter.py @@ -0,0 +1,134 @@ +# +# This file is a part of the CaosDB Project. +# +# Copyright (C) 2023 IndiScale GmbH <info@indiscale.com> +# Copyright (C) 2023 Florian Spreckelsen <f.spreckelsen@indiscale.com> +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <https://www.gnu.org/licenses/>. 
+#
+from functools import partial
+from pathlib import Path
+
+import linkahead as db
+import numpy as np
+from pytest import fixture, importorskip
+from utils import dircheckstr as dircheck_base
+
+from caoscrawler.converters.hdf5_converter import (
+    H5DatasetElement, H5GroupElement, H5NdarrayElement,
+    convert_basic_element_with_nd_array, convert_h5_element)
+from caoscrawler.debug_tree import DebugTree
+from caoscrawler.scanner import scan_directory
+from caoscrawler.structure_elements import (FloatElement, ListElement,
+                                            TextElement)
+
+# Skip the whole module if h5py hasn't been installed
+h5py = importorskip("h5py")
+
+
+UNITTESTDIR = Path(__file__).parent
+
+# always add the path here
+dircheckstr = partial(dircheck_base, UNITTESTDIR)
+
+
+@fixture
+def h5_dummy_file():
+
+    path = UNITTESTDIR / "hdf5_dummy_file.hdf5"
+
+    return h5py.File(path, 'r')
+
+
+def test_h5_elements(h5_dummy_file):
+
+    elt = convert_h5_element(h5_dummy_file["group_level1_a"], "test")
+    assert isinstance(elt, H5GroupElement)
+
+    elt = convert_h5_element(h5_dummy_file["root_integers"], "test")
+    assert isinstance(elt, H5DatasetElement)
+
+
+def test_nd_array_conversion():
+
+    # Only test array handling here; `convert_basic_element` is tested
+    # elsewhere.
+    arr = np.array([[["something"]]])
+    elt = convert_basic_element_with_nd_array(arr)
+    assert isinstance(elt, TextElement)
+    assert elt.value == "something"
+
+    arr = np.zeros((1, 1))
+    elt = convert_basic_element_with_nd_array(arr)
+    assert isinstance(elt, FloatElement)
+    assert elt.value == 0
+
+    arr = np.zeros((1, 3, 1))
+    elt = convert_basic_element_with_nd_array(arr)
+    assert isinstance(elt, ListElement)
+    assert elt.value == [0, 0, 0]
+
+    arr = np.array([[1, 2, 3], [4, 5, 6]])
+    elt = convert_basic_element_with_nd_array(arr, internal_path="some/path")
+    assert isinstance(elt, H5NdarrayElement)
+    assert elt.internal_path == "some/path"
+
+    # Non-arrays should be forwarded correctly
+    elt = convert_basic_element_with_nd_array("something")
+    assert isinstance(elt, TextElement)
+    assert elt.value == "something"
+
+    elt = convert_basic_element_with_nd_array([0, 0, 0])
+    assert isinstance(elt, ListElement)
+    assert elt.value == [0, 0, 0]
+
+
+def test_record_creation():
+
+    dbt = DebugTree()
+    records = scan_directory(UNITTESTDIR, UNITTESTDIR / "h5_cfood.yml", debug_tree=dbt)
+
+    # In total 3 records: The file, the Dataset, and its ndarray
+    assert len(records) == 3
+    file_rec = [rec for rec in records if isinstance(rec, db.File)]
+    # exactly one file
+    assert len(file_rec) == 1
+
+    subd = dbt.debug_tree[dircheckstr("hdf5_dummy_file.hdf5")]
+    # At this level, we have 5 variables (directories and paths, plus H5File
+    # record), and one record.
+    assert len(subd[0]) == 5
+    assert len(subd[1]) == 1
+    file_rec = subd[1]["H5File"]
+    assert file_rec.get_property("H5Dataset") is not None
+    assert file_rec.get_property("H5Dataset").value is not None
+    # Reference properties currently need to be integration tested (especially
+    # with the circular dependency between H5File and NDArray).
+ + # top level integers + subd = dbt.debug_tree["root_integers"] + # Two additional variables (RootIntegerElement + Dataset record), one + # additional record + assert len(subd[0]) == 7 + assert len(subd[1]) == 2 + ds_rec = subd[1]["H5Dataset"] + assert isinstance(ds_rec, db.Record) + assert len(ds_rec.parents) == 1 + assert ds_rec.parents[0].name == "H5Dataset" + assert ds_rec.get_property("Ndarray") is not None + assert ds_rec.get_property("Ndarray").value is not None + assert ds_rec.get_property("attr_data_root") is not None + assert isinstance(ds_rec.get_property("attr_data_root").value, list) + for number in [-2., -4., -8., -10.12345]: + assert number in [float(val) for val in ds_rec.get_property("attr_data_root").value] diff --git a/unittests/test_identifiable.py b/unittests/test_identifiable.py index 3f3c606b163df4dc238be9a669fd31eb630a582d..44aac6a3edd40e0df8558f68083e22245ff58127 100644 --- a/unittests/test_identifiable.py +++ b/unittests/test_identifiable.py @@ -24,10 +24,11 @@ test identifiable module """ +import linkahead as db import pytest -import caosdb as db + from caoscrawler.identifiable import Identifiable -from caoscrawler.identified_cache import IdentifiedCache +from caoscrawler.sync_node import SyncNode def test_create_hashable_string(): @@ -43,25 +44,20 @@ def test_create_hashable_string(): assert ( Identifiable._create_hashable_string( Identifiable(name="A", record_type="B", - properties={'a': db.Record(id=12)}) + properties={'a': SyncNode(db.Record(id=12))}) ) == "P<B>N<A>R<[]>a:12") a = Identifiable._create_hashable_string( - Identifiable(name="A", record_type="B", properties={'a': [db.Record(id=12)]})) + Identifiable(name="A", record_type="B", properties={'a': [SyncNode(db.Record(id=12))]})) assert (a == "P<B>N<A>R<[]>a:[12]") assert (Identifiable._create_hashable_string( Identifiable(name="A", record_type="B", properties={'a': [12]})) == "P<B>N<A>R<[]>a:[12]") assert ( Identifiable._create_hashable_string( Identifiable(name="A", record_type="B", properties={ - 'a': [db.Record(id=12), 11]}) + 'a': [SyncNode(db.Record(id=12)), 11]}) ) == "P<B>N<A>R<[]>a:[12, 11]") - assert ( - Identifiable._create_hashable_string( - Identifiable(record_type="B", properties={'a': [db.Record()]}) - ) != Identifiable._create_hashable_string( - Identifiable(record_type="B", properties={'a': [db.Record()]}))) assert Identifiable._create_hashable_string( - Identifiable(name="A", record_type="B", backrefs=[123, db.Entity(id=124)], + Identifiable(name="A", record_type="B", backrefs=[123, SyncNode(db.Record(id=124))], properties={'a': 5})) == "P<B>N<A>R<['123', '124']>a:5" @@ -74,9 +70,9 @@ def test_repr(): # only test that something meaningful is returned assert 'properties' in str(Identifiable(name="A", record_type="B")) assert str(Identifiable(name="A", record_type="B", properties={'a': 0})).split( - "properties:\n")[1].split('\n')[0] == '{"a": 0}' + "properties:\n")[1].split('\n')[0] == '{"a": "0"}' assert str(Identifiable(name="A", record_type="B", properties={'a': 0, 'b': "test"})).split( - "properties:\n")[1].split('\n')[0] == '{"a": 0, "b": "test"}' + "properties:\n")[1].split('\n')[0] == '{"a": "0", "b": "test"}' # TODO(henrik): Add a test using backrefs once that's implemented. 
@@ -88,13 +84,5 @@ def test_equality(): record_id=12, properties={"a": 0}) != Identifiable(record_id=13, properties={"a": 0}) assert Identifiable( record_id=12, properties={"a": 0}) == Identifiable(properties={"a": 0}) - assert Identifiable( - path="a", properties={"a": 0}) != Identifiable(path="b", properties={"a": 0}) - assert Identifiable( - path="a", properties={"a": 0}) == Identifiable(path="a", properties={"a": 1}) - assert Identifiable( - path="a", properties={"a": 0}) == Identifiable(properties={"a": 0}) - assert Identifiable(properties={"a": 0}) == Identifiable( - properties={"a": 0}) - assert Identifiable(properties={"a": 0}) != Identifiable( - properties={"a": 1}) + assert Identifiable(properties={"a": 0}) == Identifiable(properties={"a": 0}) + assert Identifiable(properties={"a": 0}) != Identifiable(properties={"a": 1}) diff --git a/unittests/test_identifiable_adapters.py b/unittests/test_identifiable_adapters.py index 268b9800ddf1ef1688386f394ec1c6c7eb3e3912..bdc0ab850d1a8253e876e8b1a6bc621327802f79 100644 --- a/unittests/test_identifiable_adapters.py +++ b/unittests/test_identifiable_adapters.py @@ -27,16 +27,19 @@ test identifiable_adapters module """ -import os from datetime import datetime from pathlib import Path +from unittest.mock import MagicMock, Mock, patch -import caosdb as db +import linkahead as db import pytest + +from caoscrawler.exceptions import InvalidIdentifiableYAML from caoscrawler.identifiable import Identifiable from caoscrawler.identifiable_adapters import (CaosDBIdentifiableAdapter, IdentifiableAdapter, convert_value) +from caoscrawler.sync_graph import SyncNode UNITTESTDIR = Path(__file__).parent @@ -120,19 +123,49 @@ def test_load_from_yaml_file(): assert project_i.get_property("title") is not None +def test_invalid_yaml(): + ident = CaosDBIdentifiableAdapter() + invalid_dir = UNITTESTDIR / "test_data" / "invalid_identifiable" + with pytest.raises(InvalidIdentifiableYAML) as exc: + ident.load_from_yaml_definition(invalid_dir / "identifiable_content_no_list.yaml") + assert str(exc.value) == "Identifiable contents must be lists, but this was not: Experiment" + + with pytest.raises(InvalidIdentifiableYAML) as exc: + ident.load_from_yaml_definition(invalid_dir / "identifiable_referenced_no_list.yaml") + assert str(exc.value) == "'is_referenced_by' must be a list. 
Found in: Event" + + with pytest.raises(InvalidIdentifiableYAML) as exc: + ident.load_from_yaml_definition(invalid_dir / "identifiable_no_str_or_dict.yaml") + assert str(exc.value) == ("Identifiable properties must be str or dict, but this one was not:\n" + " Experiment/23") + + def test_non_default_name(): ident = CaosDBIdentifiableAdapter() - ident.register_identifiable( - "Person", db.RecordType() - .add_parent(name="Person") - .add_property(name="last_name")) - identifiable = ident.get_identifiable(db.Record(name="don't touch it") - .add_parent("Person") - .add_property(name="last_name", value='Tom') - ) + identifiable = ident.get_identifiable(SyncNode(db.Record(name="don't touch it") + .add_parent("Person") + .add_property(name="last_name", value='Tom'), + db.RecordType() + .add_parent(name="Person") + .add_property(name="last_name")), []) assert identifiable.name is None +def test_wildcard_ref(): + ident = CaosDBIdentifiableAdapter() + rec = (db.Record(name="don't touch it").add_parent("Person") + .add_property(name="last_name", value='Tom')) + dummy = SyncNode(db.Record(), None) + dummy.id = 1 + identifiable = ident.get_identifiable(SyncNode(rec, db.RecordType() + .add_parent(name="Person") + .add_property(name="is_referenced_by", + value=["*"])), + [dummy] + ) + assert identifiable.backrefs[0] == 1 + + def test_convert_value(): # test that string representation of objects stay unchanged. No stripping or so. class A(): @@ -143,23 +176,61 @@ def test_convert_value(): def test_get_identifiable(): - # TODO modify this such that it becomes a test that acutally tests (sufficiently) the - # get_identifable function - ident = CaosDBIdentifiableAdapter() ident.load_from_yaml_definition(UNITTESTDIR / "example_identifiables.yml") - r_cur = (db.Record(id=5) - .add_parent(name="Experiment", id=3) - .add_property(name="date", value="2022-02-01") - .add_property(name="result", value="FAIL")) - id_r0 = ident.get_identifiable(r_cur) - assert r_cur.parents[0].name == id_r0.record_type - assert r_cur.get_property( - "date").value == id_r0.properties["date"] - assert len(r_cur.parents) == 1 - assert len(r_cur.properties) == 2 + rec = (db.Record(id=5) + .add_parent(name="Experiment", id=3) + .add_property(name="date", value="2022-02-01") + .add_property(name="result", value="FAIL")) + se = SyncNode(rec, + ident.get_registered_identifiable(rec)) + id_r0 = ident.get_identifiable(se, []) + assert rec.parents[0].name == id_r0.record_type + assert rec.get_property("date").value == id_r0.properties["date"] + assert len(rec.parents) == 1 + assert len(rec.properties) == 2 assert len(id_r0.properties) == 1 + ident = CaosDBIdentifiableAdapter() + ident_a = db.RecordType(name="A").add_parent("A").add_property("name").add_property("a") + ident.register_identifiable("A", ident_a) + rec = (db.Record(id=5) + .add_parent(name="A", id=3) + .add_property(name="a", value="2022-02-01") + .add_property(name="result", value="FAIL")) + se = SyncNode(rec, ident.get_registered_identifiable(rec)) + for el in [ + db.Record() + .add_parent(name="A", id=3) + .add_property(name="a", value="2022-02-01") + .add_property(name="result", value="FAIL"), + db.Record(name='a') + .add_parent(name="A", id=3) + .add_property(name="a", value="2022-02-01") + .add_property(name="result", value="FAIL"), + ]: + se.update(SyncNode(el)) + + id_r0 = ident.get_identifiable(se, []) + assert "A" == id_r0.record_type + assert "2022-02-01" == id_r0.properties["a"] + assert 'a' == id_r0.name + assert len(id_r0.properties) == 1 + + rec = 
(db.Record(name='a') + .add_parent(name="A") + .add_property(name="a", value="2") + ) + se = SyncNode(rec, ident.get_registered_identifiable(rec)) + se.update(SyncNode( + db.Record(name='a') + .add_parent(name="A") + .add_property(name="a", value="3") + )) + + with pytest.raises(RuntimeError): + id_r0 = ident.get_identifiable(se, []) + @pytest.mark.xfail def test_retrieve_identified_record_for_identifiable(): @@ -175,7 +246,7 @@ def test_retrieve_identified_record_for_identifiable(): r_cur = r break - id_r1 = ident.get_identifiable(r_cur) + id_r1 = ident.get_identifiable(r_cur, []) assert r_cur.parents[0].name == id_r1.record_type assert r_cur.get_property( "identifier").value == id_r1.properties["identifier"] @@ -196,3 +267,19 @@ def test_retrieve_identified_record_for_identifiable(): assert r_cur.get_property( "responsible").value == idr_r1.get_property("responsible").value assert r_cur.description == idr_r1.description + + +@patch("caoscrawler.identifiable_adapters.get_children_of_rt", + new=Mock(side_effect=lambda x: [x])) +def test_referencing_entity_has_appropriate_type(): + dummy = db.Record().add_parent("A") + registered_identifiable = db.RecordType() + rft = IdentifiableAdapter.referencing_entity_has_appropriate_type + assert not rft([], registered_identifiable) + assert not rft(dummy.parents, registered_identifiable) + registered_identifiable.add_property("is_referenced_by", "B") + assert not rft(dummy.parents, registered_identifiable) + registered_identifiable.properties[0].value = ["B", "A"] + assert rft(dummy.parents, registered_identifiable) + registered_identifiable.properties[0].value = ["B", "*"] + assert rft(dummy.parents, registered_identifiable) diff --git a/unittests/test_identified_cache.py b/unittests/test_identified_cache.py deleted file mode 100644 index 4ed7c55c7326415308917e20e9f391b17b07ad87..0000000000000000000000000000000000000000 --- a/unittests/test_identified_cache.py +++ /dev/null @@ -1,44 +0,0 @@ -#!/usr/bin/env python3 -# encoding: utf-8 -# -# ** header v3.0 -# This file is a part of the CaosDB Project. -# -# Copyright (C) 2021 Indiscale GmbH <info@indiscale.com> -# Copyright (C) 2021 Henrik tom Wörden <h.tomwoerden@indiscale.com> -# -# This program is free software: you can redistribute it and/or modify -# it under the terms of the GNU Affero General Public License as -# published by the Free Software Foundation, either version 3 of the -# License, or (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU Affero General Public License for more details. -# -# You should have received a copy of the GNU Affero General Public License -# along with this program. If not, see <https://www.gnu.org/licenses/>. 
-# -# ** end header -# - -""" -test identified_cache module -""" - -import caosdb as db -from caoscrawler.identifiable import Identifiable -from caoscrawler.identified_cache import IdentifiedCache - - -def test_IdentifiedCache(): - ident = Identifiable(name="A", record_type="B") - record = db.Record("A").add_parent("B").add_property('b', 5) - cache = IdentifiedCache() - assert ident not in cache - cache.add(record=record, identifiable=ident) - assert ident in cache - assert cache[ident] is record - assert Identifiable(name="A", record_type="C") != Identifiable(name="A", record_type="B") - assert Identifiable(name="A", record_type="C") not in cache diff --git a/unittests/test_issues.py b/unittests/test_issues.py index cbbe9cabcfd17daaf07165757351f00dc051eeab..a6de65400f42018c3fdcde7b2f29d4fd200bf62b 100644 --- a/unittests/test_issues.py +++ b/unittests/test_issues.py @@ -22,15 +22,13 @@ from pytest import mark -import caosdb as db - +from caoscrawler.converters import CrawlerTemplate, replace_variables from caoscrawler.crawl import Crawler -from caoscrawler.identifiable import Identifiable -from caoscrawler.identifiable_adapters import CaosDBIdentifiableAdapter +from caoscrawler.scanner import (create_converter_registry, + scan_structure_elements) +from caoscrawler.stores import GeneralStore from caoscrawler.structure_elements import DictElement -from caoscrawler.scanner import create_converter_registry, scan_structure_elements - def test_issue_10(): """Test integer-to-float conversion in dictionaries""" @@ -110,3 +108,43 @@ def test_list_datatypes(): assert isinstance(records[0].get_property("Subject").value, list) assert records[0].get_property("Subject").datatype is not None assert records[0].get_property("Subject").datatype.startswith("LIST") + + +def test_issue_93(): + """https://gitlab.com/linkahead/linkahead-crawler/-/issues/93 + + cfood.yaml does not allow umlaut in $expression""" + values = GeneralStore() + expressions = [ + "foo", + "foo.bär", + "_1", + "Ä", + "ųøîµ", + ] + for exp in expressions: + values[exp] = f"This is {exp}" + # ## Test preliminary check + # With braces + for exp in expressions: + assert replace_variables(f"${{{exp}}}", values) == f"This is {exp}" + # Without braces + for exp in expressions: + assert replace_variables(f"${exp}", values) == f"This is {exp}" + + # ## Test actual replacement + for exp in expressions: + # as-is + propvalue = f"${{{exp}}}" + propvalue_template = CrawlerTemplate(propvalue) + # from IPython import embed + # embed() + + assert propvalue_template.safe_substitute(**values.get_storage()) == f"This is {exp}" + + # String embedded into context + propvalue = f"some text before >> ${{{exp}}} << some text after" + print(propvalue) + propvalue_template = CrawlerTemplate(propvalue) + assert (propvalue_template.safe_substitute(**values.get_storage()) + == f"some text before >> This is {exp} << some text after") diff --git a/unittests/test_json.py b/unittests/test_json.py index fdb332df60d73dce3356a563e09ae0d02cf845b7..5d145b38fd36fa2de4e4ab754cbadda0fff6eff7 100644 --- a/unittests/test_json.py +++ b/unittests/test_json.py @@ -26,18 +26,17 @@ """ test the JSON converter """ -import json import os +from pathlib import Path +import linkahead as db from pytest import raises -import caosdb as db - from caoscrawler.converters import JSONFileConverter -from pathlib import Path from caoscrawler.crawl import Crawler +from caoscrawler.scanner import (create_converter_registry, load_definition, + scan_structure_elements) from 
caoscrawler.structure_elements import File, JSONFile -from caoscrawler.scanner import load_definition, create_converter_registry, scan_structure_elements UNITTESTDIR = Path(__file__).parent diff --git a/unittests/test_macros.py b/unittests/test_macros.py index 53837e920e93f2cc318d62549145a0e8ac757372..03fe0e665652bb12e204d76857771c1d064ec28a 100644 --- a/unittests/test_macros.py +++ b/unittests/test_macros.py @@ -22,15 +22,15 @@ # ** end header # -from caoscrawler.macros import defmacro_constructor, macro_constructor -from caoscrawler.macros.macro_yaml_object import macro_store -from caoscrawler.crawl import Crawler -from caoscrawler.scanner import load_definition - from tempfile import NamedTemporaryFile -import yaml import pytest +import yaml + +from caoscrawler.crawl import Crawler +from caoscrawler.macros import defmacro_constructor, macro_constructor +from caoscrawler.macros.macro_yaml_object import macro_store +from caoscrawler.scanner import load_definition @pytest.fixture @@ -50,16 +50,16 @@ def _temp_file_load(txt: str): definition using load_definition from Crawler. """ definition = None - with NamedTemporaryFile() as f: + with NamedTemporaryFile(delete=False) as f: f.write(txt.encode()) f.flush() - definition = load_definition(f.name) + definition = load_definition(f.name) return definition def test_macros(register_macros, macro_store_reset): dat = yaml.load(""" -defs: +macros: - !defmacro name: test params: @@ -85,7 +85,7 @@ testnode: def test_macro_list_replacment(register_macros, macro_store_reset): dat = yaml.load(""" -defs: +macros: - !defmacro name: test params: @@ -112,7 +112,7 @@ testnode: def test_multi_macros(register_macros, macro_store_reset): dat = yaml.load(""" -defs: +macros: - !defmacro name: test_one params: {} @@ -142,7 +142,7 @@ def test_multi_macros_toplevel(register_macros, macro_store_reset): dat_loader = list(yaml.safe_load_all(""" --- metadata: - crawler-version: 0.5.1 + crawler-version: 0.9.0 macros: - !defmacro name: test_one @@ -171,7 +171,7 @@ def test_load_definition(register_macros, macro_store_reset): txt = """ --- metadata: - crawler-version: 0.5.1 + crawler-version: 0.9.0 --- extroot: type: Directory @@ -188,12 +188,13 @@ extroot: cfood = _temp_file_load(""" --- metadata: - crawler-version: 0.5.1 + crawler-version: 0.9.0 macros: - !defmacro name: test_one params: {} definition: + type: TextElement replaced1: ok - !defmacro name: test_two @@ -213,6 +214,7 @@ extroot: extroot2: !macro # test top level macro test_one: extroot3: + type: Directory subtree: SimulationData: !macro test_two: @@ -223,38 +225,44 @@ extroot3: assert cfood["extroot3"]["subtree"]["SimulationData"]["match"] == "SimulationData" -@pytest.mark.xfail def test_replace_arbitrary_objects(register_macros, macro_store_reset): """ See: https://gitlab.indiscale.com/caosdb/src/caosdb-crawler/-/issues/24 """ dat = yaml.load(""" -defs: +macros: - !defmacro name: test params: b: 25 + testvar_list_empty: [] testvar_list: - a - $b + testvar_dict_empty: {} testvar_dict: t1: a t2: $b definition: replaced1: $b: ok - c: $testvar_dict - d: $testvar_list + dict_empty: $testvar_dict_empty + dict: $testvar_dict + list_empty: $testvar_list_empty + list: ${testvar_list} testnode: obl: !macro test: """, Loader=yaml.SafeLoader) print(yaml.dump(dat)) - assert dat["testnode"]["obl"]["replaced1"]["c"]["t1"] == "a" - assert dat["testnode"]["obl"]["replaced1"]["c"]["t2"] == "25" - assert dat["testnode"]["obl"]["replaced1"]["d"][0] == "a" - assert dat["testnode"]["obl"]["replaced1"]["d"][1] == "25" + replaced = 
dat["testnode"]["obl"]["replaced1"] + assert replaced["dict_empty"] == {} + assert replaced["dict"]["t1"] == "a" + assert replaced["dict"]["t2"] == 25 + assert replaced["list_empty"] == [] + assert replaced["list"][0] == "a" + assert replaced["list"][1] == 25 def test_macros_in_macros(register_macros, macro_store_reset): @@ -264,13 +272,14 @@ def test_macros_in_macros(register_macros, macro_store_reset): cfood = _temp_file_load(""" --- metadata: - crawler-version: 0.5.1 + crawler-version: 0.9.0 macros: - !defmacro name: one_macro params: a: 25 definition: + type: DictElement macro_sub_$a: b: $a another_param: 3 @@ -278,6 +287,7 @@ metadata: name: test_macrodef params: {} definition: + type: DictElement macro_top: !macro one_macro: - a: 17 @@ -293,11 +303,11 @@ extroot: !macro assert "test_macro" not in cfood["extroot"] assert cfood["extroot"]["macro_top"]["not_macro"]["a"] == 26 d = cfood["extroot"]["macro_top"] - assert d["macro_sub_17"]["b"] == "17" + assert d["macro_sub_17"]["b"] == 17 assert d["macro_sub_17"]["another_param"] == 3 - assert d["macro_sub_25"]["b"] == "25" + assert d["macro_sub_25"]["b"] == 25 assert d["macro_sub_25"]["another_param"] == 3 - assert d["macro_sub_98"]["b"] == "98" + assert d["macro_sub_98"]["b"] == 98 assert d["macro_sub_98"]["another_param"] == 3 @@ -309,7 +319,7 @@ def test_silent_overwrite(register_macros, macro_store_reset): cfood = _temp_file_load(""" --- metadata: - crawler-version: 0.5.1 + crawler-version: 0.9.0 macros: - !defmacro name: one_macro @@ -340,12 +350,13 @@ def test_circular_macro_definition(register_macros, macro_store_reset): cfood = _temp_file_load(""" --- metadata: - crawler-version: 0.5.1 + crawler-version: 0.9.0 macros: - !defmacro name: test_one params: {} definition: !macro + type: TextElement test_two: - !defmacro name: test_two @@ -361,6 +372,7 @@ metadata: name: test_four params: {} definition: !macro + type: TextElement test_four: --- extroot: !macro @@ -389,7 +401,7 @@ def test_use_macro_twice(): cfood = _temp_file_load(""" --- metadata: - crawler-version: 0.5.1 + crawler-version: 0.9.0 macros: - !defmacro name: test_twice @@ -397,6 +409,7 @@ metadata: macro_name: default_name a: 4 definition: + type: DictElement $macro_name: something: a: $a @@ -410,9 +423,9 @@ extroot: !macro """) for name in ["once", "twice", "default_name"]: assert name in cfood["extroot"] - assert cfood["extroot"]["once"]["something"]["a"] == "4" - assert cfood["extroot"]["twice"]["something"]["a"] == "5" - assert cfood["extroot"]["default_name"]["something"]["a"] == "4" + assert cfood["extroot"]["once"]["something"]["a"] == 4 + assert cfood["extroot"]["twice"]["something"]["a"] == 5 + assert cfood["extroot"]["default_name"]["something"]["a"] == 4 # Code sample to generate the expanded macro: # with open("expanded_test_macro.yaml", "w") as f: # f.write(yaml.dump(cfood)) @@ -423,7 +436,7 @@ def test_documentation_example_2(): cfood = _temp_file_load(""" --- metadata: - crawler-version: 0.5.1 + crawler-version: 0.9.0 macros: - !defmacro name: MarkdownFile @@ -461,7 +474,7 @@ def test_documentation_example_1(): cfood = _temp_file_load(""" --- metadata: - crawler-version: 0.5.1 + crawler-version: 0.9.0 macros: - !defmacro name: SimulationDatasetFile @@ -510,7 +523,7 @@ def test_def_replacements(): cfood = _temp_file_load(""" --- metadata: - crawler-version: 0.5.1 + crawler-version: 0.9.0 macros: - !defmacro name: test_def_replacements @@ -549,7 +562,7 @@ extroot: !macro def test_list_macro_application(register_macros, macro_store_reset): dat = yaml.load(""" 
-defs: +macros: - !defmacro name: test params: @@ -573,14 +586,14 @@ testnode: test2: a: 4 """, Loader=yaml.SafeLoader) - assert dat["testnode"]["obl"]["expanded_4"]["param"] == "4" - assert dat["testnode"]["obl"]["expanded_2"]["param"] == "2" - assert dat["testnode"]["obl"]["expanded_4_test2"]["param"] == "4" + assert dat["testnode"]["obl"]["expanded_4"]["param"] == 4 + assert dat["testnode"]["obl"]["expanded_2"]["param"] == 2 + assert dat["testnode"]["obl"]["expanded_4_test2"]["param"] == 4 def test_variable_in_macro_definition(register_macros, macro_store_reset): dat = yaml.load(""" -defs: +macros: - !defmacro name: test params: @@ -598,7 +611,7 @@ testnode: - a: 2 b: 4 """, Loader=yaml.SafeLoader) - assert dat["testnode"]["obl"]["expanded_4"]["param"] == "4" - assert dat["testnode"]["obl"]["expanded_4"]["param_b"] == "4" - assert dat["testnode"]["obl"]["expanded_2"]["param"] == "2" - assert dat["testnode"]["obl"]["expanded_2"]["param_b"] == "4" + assert dat["testnode"]["obl"]["expanded_4"]["param"] == 4 + assert dat["testnode"]["obl"]["expanded_4"]["param_b"] == 4 + assert dat["testnode"]["obl"]["expanded_2"]["param"] == 2 + assert dat["testnode"]["obl"]["expanded_2"]["param_b"] == 4 diff --git a/unittests/test_parent_cfood.yml b/unittests/test_parent_cfood.yml index b8d0eaf597641d311cb70017dc2bc75c7c3434f3..21b49a2db8ac44f806c77718b2fa49fbc7488828 100644 --- a/unittests/test_parent_cfood.yml +++ b/unittests/test_parent_cfood.yml @@ -1,6 +1,6 @@ --- metadata: - crawler-version: 0.6.1 + crawler-version: 0.9.0 --- Definitions: type: Definitions diff --git a/unittests/test_rocrate_converter.py b/unittests/test_rocrate_converter.py new file mode 100644 index 0000000000000000000000000000000000000000..dc7cef9f6d396c73a2a285d3f60fd587863237ac --- /dev/null +++ b/unittests/test_rocrate_converter.py @@ -0,0 +1,199 @@ +#!/usr/bin/env python3 +# encoding: utf-8 +# +# This file is a part of the LinkAhead Project. +# +# Copyright (C) 2024 Indiscale GmbH <info@indiscale.com> +# Copyright (C) 2024 Alexander Schlemmer <a.schlemmer@indiscale.com> +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <https://www.gnu.org/licenses/>. 
+# + +""" +test the XML converters +""" +import importlib +import os +from pathlib import Path + +import linkahead as db +import pytest +import rocrate +import yaml +from caoscrawler import scanner +from caoscrawler.converters import ELNFileConverter, ROCrateEntityConverter +from caoscrawler.stores import GeneralStore +from caoscrawler.structure_elements import (DictElement, File, ROCrateEntity, + TextElement) +from rocrate.model.entity import Entity + +UNITTESTDIR = Path(__file__).parent + + +@pytest.fixture +def converter_registry(): + converter_registry: dict[str, dict[str, str]] = { + "ELNFile": { + "converter": "ELNFileConverter", + "package": "caoscrawler.converters"}, + "ROCrateEntity": { + "converter": "ROCrateEntityConverter", + "package": "caoscrawler.converters", + } + } + + for key, value in converter_registry.items(): + module = importlib.import_module(value["package"]) + value["class"] = getattr(module, value["converter"]) + return converter_registry + + +@pytest.fixture +def basic_eln_converter(converter_registry): + return ELNFileConverter(yaml.safe_load(""" +type: ELNFile +match: .*\\.eln +"""), "TestELNConverter", converter_registry) + + +@pytest.fixture +def eln_entities(basic_eln_converter): + f_k4mat = File("records-example.eln", + os.path.join(UNITTESTDIR, "eln_files", "records-example.eln")) + store = GeneralStore() + entities = basic_eln_converter.create_children(store, f_k4mat) + return entities + + +def test_load_pasta(basic_eln_converter): + """ + Test for loading the .eln example export from PASTA. + """ + f_pasta = File("PASTA.eln", os.path.join(UNITTESTDIR, "eln_files", "PASTA.eln")) + match = basic_eln_converter.match(f_pasta) + assert match is not None + entities = basic_eln_converter.create_children(GeneralStore(), f_pasta) + assert len(entities) == 20 + assert isinstance(entities[0], ROCrateEntity) + assert isinstance(entities[0].folder, str) + assert isinstance(entities[0].entity, Entity) + + +def test_load_kadi4mat(basic_eln_converter): + """ + Test for loading the .eln example export from PASTA. 
+ """ + f_k4mat = File("records-example.eln", + os.path.join(UNITTESTDIR, "eln_files", "records-example.eln")) + match = basic_eln_converter.match(f_k4mat) + assert match is not None + entities = basic_eln_converter.create_children(GeneralStore(), f_k4mat) + assert len(entities) == 10 + assert isinstance(entities[0], ROCrateEntity) + assert isinstance(entities[0].folder, str) + assert isinstance(entities[0].entity, Entity) + + +def test_match_rocrate_entities(eln_entities): + ds1 = ROCrateEntityConverter(yaml.safe_load(""" +type: ROCrateEntity +match_properties: + "@id": \\./ + datePublished: (?P<datePublished>.*) +"""), "TestELNConverter", converter_registry) + + match = ds1.match(eln_entities[0]) + assert match is not None + + ds2 = ROCrateEntityConverter(yaml.safe_load(""" +type: ROCrateEntity +match_type: CreativeWork +match_properties: + "@id": ro-crate-metadata.json + dateCreated: (?P<dateCreated>.*) +"""), "TestELNConverter", converter_registry) + + match = ds2.match(eln_entities[0]) + assert match is None + match = ds1.match(eln_entities[1]) + assert match is None + + match = ds2.match(eln_entities[1]) + assert match is not None + assert match["dateCreated"] == "2024-08-21T12:07:45.115990+00:00" + + children = ds2.create_children(GeneralStore(), eln_entities[1]) + assert len(children) == 8 + assert isinstance(children[0], TextElement) + assert children[0].name == "@id" + assert children[0].value == "ro-crate-metadata.json" + assert isinstance(children[5], DictElement) + assert children[5].value == {'@id': 'https://kadi.iam.kit.edu'} + + +def test_file(eln_entities): + ds_csv = ROCrateEntityConverter(yaml.safe_load(""" +type: ROCrateEntity +match_type: File +match_properties: + "@id": .*\.csv$ +"""), "TestELNConverter", converter_registry) + + ent_csv = eln_entities[5] + match = ds_csv.match(ent_csv) + assert match is not None + + children = ds_csv.create_children(GeneralStore(), ent_csv) + + # Number of children = number of properties + number of files: + assert len(children) == len(ent_csv.entity.properties()) + 1 + # Get the file: + f_csv = [f for f in children if isinstance(f, File)][0] + with open(f_csv.path) as f: + text = f.read() + assert "Ultrasound Transducer" in text + + +def test_has_part(eln_entities): + ds_parts = ROCrateEntityConverter(yaml.safe_load(""" +type: ROCrateEntity +match_type: Dataset +match_properties: + "@id": records-example/ +"""), "TestELNConverter", converter_registry) + + ent_parts = eln_entities[2] + match = ds_parts.match(ent_parts) + assert match is not None + + children = ds_parts.create_children(GeneralStore(), ent_parts) + + # Number of children = number of properties + number of parts: + assert len(children) == len(ent_parts.entity.properties()) + 4 + entity_children = [f for f in children if isinstance(f, ROCrateEntity)] + assert len(entity_children) == 4 + for f in entity_children: + assert isinstance(f.entity, rocrate.model.file.File) + + +def test_scanner(): + rlist = scanner.scan_directory(os.path.join(UNITTESTDIR, "eln_files/"), + os.path.join(UNITTESTDIR, "eln_cfood.yaml")) + assert len(rlist) == 1 + assert isinstance(rlist[0], db.Record) + assert rlist[0].name == "records-example" + assert rlist[0].description == "This is a sample record." 
+ assert rlist[0].parents[0].name == "Dataset" + assert rlist[0].get_property("keywords").value == "sample" + assert rlist[0].get_property("dateModified").value == "2024-08-21T11:43:17.626965+00:00" diff --git a/unittests/test_scalars_cfood.py b/unittests/test_scalars_cfood.py index ba604fe4f5b695506bf8df9dab79fc23232c546a..577fcd5f6c93bee2bc05451983d358aa2e07f798 100644 --- a/unittests/test_scalars_cfood.py +++ b/unittests/test_scalars_cfood.py @@ -2,10 +2,11 @@ # Tests for: # https://gitlab.com/caosdb/caosdb-crawler/-/issues/9 # A. Schlemmer, 06/2021 -import os from pathlib import Path import pytest +from utils import dircheckstr + # The main function that is affected by this issue: from caoscrawler.converters import handle_value from caoscrawler.crawl import Crawler @@ -14,8 +15,6 @@ from caoscrawler.scanner import scan_directory # We need the store for the above function from caoscrawler.stores import GeneralStore -from utils import dircheckstr - UNITTESTDIR = Path(__file__).parent @@ -24,15 +23,15 @@ def test_handle_value(): store = GeneralStore() # This one should work: - assert handle_value("bla", store) == ("bla", "single") + assert handle_value("bla", store) == ("bla", None, "single") # These failed: - assert handle_value(4, store) == (4, "single") - assert handle_value(4.2, store) == (4.2, "single") - assert handle_value(True, store) == (True, "single") + assert handle_value(4, store) == (4, None, "single") + assert handle_value(4.2, store) == (4.2, None, "single") + assert handle_value(True, store) == (True, None, "single") # List test: - assert handle_value([4, 3, 2], store) == ([4, 3, 2], "single") + assert handle_value([4, 3, 2], store) == ([4, 3, 2], None, "single") def test_record_structure_generation(): diff --git a/unittests/test_scanner.py b/unittests/test_scanner.py index 863e79766fdc384ffa0a85821aa13719ade882fb..7adb44dfee21a59694b12757779d9a2b9fd7cf5b 100644 --- a/unittests/test_scanner.py +++ b/unittests/test_scanner.py @@ -26,17 +26,19 @@ Unit test functions for the scanner. 
""" +import os from functools import partial from pathlib import Path from tempfile import NamedTemporaryFile from unittest.mock import MagicMock, Mock, patch -import caosdb as db +import linkahead as db import pytest import yaml from caoscrawler.crawl import Crawler from caoscrawler.debug_tree import DebugTree -from caoscrawler.scanner import (create_converter_registry, load_definition, +from caoscrawler.scanner import (_load_definition_from_yaml_dict, + create_converter_registry, load_definition, scan_directory, scan_structure_elements) from caoscrawler.structure_elements import (DictElement, DictListElement, DictTextElement, File) @@ -109,7 +111,7 @@ def test_record_structure_generation(): assert len(subc[1]) == 0 # The data analysis node creates one variable for the node itself: - assert subd[0]["DataAnalysis"] == "examples_article/DataAnalysis" + assert subd[0]["DataAnalysis"] == os.path.join("examples_article", "DataAnalysis") assert subc[0]["DataAnalysis"] is False subd = dbt.debug_tree[dircheckstr("DataAnalysis", "2020_climate-model-predict")] @@ -127,9 +129,10 @@ def test_record_structure_generation(): assert subd[0]["identifier"] == "climate-model-predict" assert subd[0]["Project"].__class__ == db.Record - assert subd[0]["DataAnalysis"] == "examples_article/DataAnalysis" + assert subd[0]["DataAnalysis"] == os.path.join("examples_article", "DataAnalysis") assert subc[0]["DataAnalysis"] is True - assert subd[0]["project_dir"] == "examples_article/DataAnalysis/2020_climate-model-predict" + assert subd[0]["project_dir"] == os.path.join( + "examples_article", "DataAnalysis", "2020_climate-model-predict") assert subc[0]["project_dir"] is False # Check the copy flags for the first level in the hierarchy: @@ -317,6 +320,7 @@ def test_record_parents(): # lower levels assert len(rec.parents) == 1 + def test_new_debug_tree(): new_debug_tree = [] scan_directory(UNITTESTDIR / "test_directories" / "examples_article", @@ -326,8 +330,185 @@ def test_new_debug_tree(): assert len(new_debug_tree) == 1 # 1 structure element (SE) # Check structure of first node: assert new_debug_tree[0]["name"] == "examples_article" - assert new_debug_tree[0]["type"] == "caoscrawler.structure_elements.Directory" + assert new_debug_tree[0]["type"] == "caoscrawler.structure_elements.structure_elements.Directory" assert new_debug_tree[0]["path"].endswith( "unittests/test_directories/examples_article") assert len(new_debug_tree[0]["matching_converters"]) == 1 # exactly one matches assert len(new_debug_tree[0]["matching_converters"][0]["subtree"]) == 4 + + +def test_error_messages(): + data = { + 'Experiments': {} + } + + broken_yaml = """ +EmptyConverter: + """ + broken_definition = _load_definition_from_yaml_dict( + [yaml.load(broken_yaml, Loader=yaml.SafeLoader)]) + + converter_registry = create_converter_registry(broken_definition) + + with pytest.raises(RuntimeError, match="Definition of converter \"EmptyConverter\" is empty"): + scan_structure_elements(DictElement(name="", value=data), + broken_definition, converter_registry) + + broken_yaml = """ +Converter: + type: DictElement + records: + TestRecord: "42" + """ + + broken_definition = _load_definition_from_yaml_dict( + [yaml.load(broken_yaml, Loader=yaml.SafeLoader)]) + + converter_registry = create_converter_registry(broken_definition) + + with pytest.raises(RuntimeError, match="dict expected, but found str: 42"): + scan_structure_elements(DictElement(name="", value=data), + broken_definition, converter_registry) + + +def test_units(): + """Test the correct 
setting of units.""" + crawler_definition = load_definition(UNITTESTDIR / "test_unit_cfood.yml") + converter_registry = create_converter_registry(crawler_definition) + + data = { + "value_with_unit": "1.1 m", + "array_with_units": [ + "1.1 cm", + "2.2 cm" + ] + } + records = scan_structure_elements(DictElement(name="", value=data), crawler_definition, + converter_registry) + assert len(records) == 1 + rec = records[0] + # This is hard-coded in cfood: + assert rec.get_property("may_be_overwritten") is not None + assert rec.get_property("may_be_overwritten").value == "12" + assert rec.get_property("may_be_overwritten").unit == "K" + # Those are set from data + assert rec.get_property("value_with_unit") is not None + assert rec.get_property("value_with_unit").value == "1.1" + assert rec.get_property("value_with_unit").unit == "m" + assert rec.get_property("list_with_unit") is not None + assert rec.get_property("list_with_unit").value == ["1.1", "2.2"] + assert rec.get_property("list_with_unit").unit == "cm" + + # Contradictory units + data = { + "array_with_units": [ + "1.1 K", + "45 W" + ] + } + with raises(RuntimeError) as rte: + records = scan_structure_elements(DictElement(name="", value=data), crawler_definition, + converter_registry) + assert "Property 'list_with_unit' has contradictory units" in str(rte.value) + + # Overwrite value and unit + data = { + "may_be_overwritten": "400 °C" + } + records = scan_structure_elements(DictElement(name="", value=data), crawler_definition, + converter_registry) + assert len(records) == 1 + rec = records[0] + # Now set from data + assert rec.get_property("may_be_overwritten") is not None + assert rec.get_property("may_be_overwritten").value == "400" + assert rec.get_property("may_be_overwritten").unit == "°C" + + +def test_recursive_definition(): + """ + This is basically a test for: + https://gitlab.indiscale.com/caosdb/src/caosdb-crawler/-/issues/16 + """ + + recursive_yaml = """ +Converter: + type: DictElement + records: + Block: + Experiment: $Experiment + Experiment: + Block: $Block + """ + + crawler_definition = _load_definition_from_yaml_dict( + [yaml.load(recursive_yaml, Loader=yaml.SafeLoader)]) + converter_registry = create_converter_registry(crawler_definition) + + data = { + "value_with_unit": "1.1 m", + "array_with_units": [ + "1.1 cm", + "2.2 cm" + ] + } + records = scan_structure_elements(DictElement(name="", value=data), crawler_definition, + converter_registry) + + assert len(records) == 2 + assert len(records[0].parents) == 1 + assert records[0].parents[0].name == "Block" + assert len(records[1].parents) == 1 + assert records[1].parents[0].name == "Experiment" + + assert records[0].get_property("Experiment").value == records[1] + assert records[1].get_property("Block").value == records[0] + + +def test_recursive_definition_2(): + """ + This is another a test for: + https://gitlab.indiscale.com/caosdb/src/caosdb-crawler/-/issues/16 + + It defines Experiment on a different level, therefore allowing the recursive definition. + This is, however, no workaround for test_recursive_definition as a bidirectional link on the + same level is still not achieved. 
+ """ + + recursive_yaml = """ +FirstConverter: + type: DictElement + records: + Experiment: + subtree: + Converter: + type: DictElement + records: + Block: + Experiment: $Experiment + Experiment: + Block: $Block + """ + + crawler_definition = _load_definition_from_yaml_dict( + [yaml.load(recursive_yaml, Loader=yaml.SafeLoader)]) + converter_registry = create_converter_registry(crawler_definition) + + data = {"data": { + "value_with_unit": "1.1 m", + "array_with_units": [ + "1.1 cm", + "2.2 cm" + ] + }} + records = scan_structure_elements(DictElement(name="", value=data), crawler_definition, + converter_registry) + + assert len(records) == 2 + assert len(records[0].parents) == 1 + assert records[0].parents[0].name == "Block" + assert len(records[1].parents) == 1 + assert records[1].parents[0].name == "Experiment" + + assert records[0].get_property("Experiment").value == records[1] + assert records[1].get_property("Block").value == records[0] diff --git a/unittests/test_schema.py b/unittests/test_schema.py index 0d5bebce98fbc8c789c1080bcf3919f128bdbf54..96c388ac362583eda13ca368519467c34446868e 100644 --- a/unittests/test_schema.py +++ b/unittests/test_schema.py @@ -2,17 +2,15 @@ # Tests for schema validation # A. Schlemmer, 06/2021 -from importlib_resources import files -import caosdb as db - -from os.path import join, dirname -from caoscrawler import Crawler +from os.path import dirname, join +import linkahead as db import pytest -from pytest import raises - +from importlib_resources import files from jsonschema.exceptions import ValidationError +from pytest import raises +from caoscrawler import Crawler from caoscrawler.scanner import load_definition @@ -27,6 +25,13 @@ def rfp(*pathcomponents): def test_schema_validation(): load_definition(rfp("scifolder_cfood.yml")) load_definition(rfp("scifolder_extended.yml")) + load_definition(rfp("record_from_dict_cfood.yml")) with raises(ValidationError, match=".*enum.*"): load_definition(rfp("broken_cfoods", "broken1.yml")) + + with raises(ValidationError, match=".*required.*"): + load_definition(rfp("broken_cfoods", "broken_record_from_dict.yml")) + + with raises(ValidationError, match=".*required.*"): + load_definition(rfp("broken_cfoods", "broken_record_from_dict_2.yml")) diff --git a/unittests/test_scripts.py b/unittests/test_scripts.py new file mode 100644 index 0000000000000000000000000000000000000000..da03c1f24fbd3d7ca13cfa55d6f69c0cb5a6a6f1 --- /dev/null +++ b/unittests/test_scripts.py @@ -0,0 +1,37 @@ +#!/usr/bin/env python3 + +# This file is a part of the LinkAhead project. +# +# Copyright (C) 2024 IndiScale GmbH <www.indiscale.com> +# Copyright (C) 2024 Daniel Hornung <d.hornung@indiscale.com> +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <https://www.gnu.org/licenses/>. + +"""Test if the scripts work as expected. 
+""" + +from subprocess import run + +SCRIPTS = [ + "linkahead-crawler", + "caosdb-crawler", + "spss_to_datamodel", + "csv_to_datamodel", +] + + +def test_script_loading(): + """Run the scripts with "-h".""" + for script in SCRIPTS: + run([script, "-h"], check=True) diff --git a/unittests/test_spss_converter.py b/unittests/test_spss_converter.py new file mode 100644 index 0000000000000000000000000000000000000000..59fe723849dadcda21a699416372f08f2756f4e1 --- /dev/null +++ b/unittests/test_spss_converter.py @@ -0,0 +1,79 @@ +# This file is a part of the LinkAhead Project. +# +# Copyright (C) 2024 IndiScale GmbH <info@indiscale.com> +# Copyright (C) 2024 Daniel Hornung <d.hornung@indiscale.com> +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <https://www.gnu.org/licenses/>. + +"""Testing converter for SPSS files.""" + +import datetime +import importlib +from pathlib import Path + +import numpy as np +import pytest + +from caoscrawler.converters import ConverterValidationError, SPSSConverter +from caoscrawler.structure_elements import (BooleanElement, DictElement, + Directory, File, FloatElement, + IntegerElement, ListElement, + TextElement) + +UNITTESTDIR = Path(__file__).parent + + +@pytest.fixture +def converter_registry(): + converter_registry: dict[str, dict[str, str]] = { + "Directory": { + "converter": "DirectoryConverter", + "package": "caoscrawler.converters"}, + } + + for key, value in converter_registry.items(): + module = importlib.import_module(value["package"]) + value["class"] = getattr(module, value["converter"]) + return converter_registry + + +def test_spss_converter(converter_registry): + converter = SPSSConverter({ + "match": ("sample.sav") + }, + "ThisConverterNameIsIrrelevant", converter_registry + ) + + spss_dir = UNITTESTDIR / "test_tables" / "spss" + for sav_file, length, thistype in [ + (File("sample.sav", spss_dir / "sample.sav"), 5, str), + (File("sample.sav", spss_dir / "sample_large.sav"), 485, int), + ]: + m = converter.match(sav_file) + assert m is not None + assert len(m) == 0 + + children = converter.create_children(None, sav_file) + assert len(children) == length + + for ii, child in enumerate(children): + assert child.__class__ == DictElement + assert child.name == str(ii) + my_dict = child.value + assert isinstance(my_dict["mychar"], str) + assert isinstance(my_dict["mydate"], datetime.date) or np.isnan(my_dict["mydate"]) + assert isinstance(my_dict["dtime"], datetime.datetime) or np.isnan(my_dict["dtime"]) + assert isinstance(my_dict["mytime"], datetime.time) or np.isnan(my_dict["mytime"]) + assert isinstance(my_dict["mylabl"], thistype), f"{type(my_dict['mylabl'])}" + assert isinstance(my_dict["myord"], thistype), f"{type(my_dict['myord'])}" diff --git a/unittests/test_sync_graph.py b/unittests/test_sync_graph.py new file mode 100644 index 0000000000000000000000000000000000000000..06f0dfb9eb3d3536d26dcfd354ca27f08ef99a02 --- /dev/null +++ b/unittests/test_sync_graph.py @@ -0,0 
+1,680 @@ +#!/usr/bin/env python3 +# encoding: utf-8 +# +# This file is a part of the LinkAhead Project. +# +# Copyright (C) 2024 Indiscale GmbH <info@indiscale.com> +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <https://www.gnu.org/licenses/>. +# + +import logging +from functools import partial +from itertools import product +from unittest.mock import MagicMock, Mock, patch + +import linkahead as db +import pytest +from test_crawler import (basic_retrieve_by_name_mock_up, + mock_cached_only_rt_allow_empty, mock_get_entity_by) + +from caoscrawler.exceptions import (MissingIdentifyingProperty, + MissingRecordType) +from caoscrawler.identifiable import Identifiable +from caoscrawler.identifiable_adapters import CaosDBIdentifiableAdapter +from caoscrawler.sync_graph import SyncGraph, _set_each_scalar_value +from caoscrawler.sync_node import SyncNode, parent_in_list, property_in_list + + +@pytest.fixture +def simple_adapter(): + # different RTs with different registered identifiables to allow to test various behavior + ident_adapter = CaosDBIdentifiableAdapter() + ident_adapter.register_identifiable( + "RT1", + db.RecordType().add_parent("RT1").add_property("RT2")) + ident_adapter.register_identifiable( + "RT2", + db.RecordType().add_parent("RT2").add_property("is_referenced_by", ["RT1", "RT3"])) + ident_adapter.register_identifiable( + "RT3", + db.RecordType().add_parent("RT3").add_property("a")) + ident_adapter.register_identifiable( + "RT4", + db.RecordType().add_parent("RT4").add_property("RT3")) + ident_adapter.register_identifiable( + "RT5", + db.RecordType().add_parent("RT5").add_property("name")) + return ident_adapter + + +def test_create_flat_list(): + a = db.Record() + b = db.Record() + a.add_property(name="a", value=a) + a.add_property(name="b", value=b) + flat = SyncGraph._create_flat_list([a]) + assert len(flat) == 2 + assert a in flat + assert b in flat + c = db.Record() + c.add_property(name="a", value=a) + # This would cause a recursion error if it is not dealt with properly. + a.add_property(name="c", value=c) + flat = SyncGraph._create_flat_list([c]) + assert len(flat) == 3 + assert a in flat + assert b in flat + assert c in flat + + # Test for lists: + a = db.Record() + b = db.Record() + d = db.Record() + a.add_property(name="a", value=a) + a.add_property(name="list", value=[b, d]) + flat = SyncGraph._create_flat_list([a]) + assert len(flat) == 3 + assert a in flat + assert b in flat + assert d in flat + + c = db.Record() + c.add_property(name="a", value=a) + # This would cause a recursion error if it is not dealt with properly. 
+ a.add_property(name="second_list", value=[b, d, c]) + flat = SyncGraph._create_flat_list([c]) + assert len(flat) == 4 + assert a in flat + assert b in flat + assert c in flat + assert d in flat + + +@patch("caoscrawler.identifiable_adapters.get_children_of_rt", + new=Mock(side_effect=lambda x: [x])) +def test_create_reference_mapping(): + a = SyncNode(db.Record().add_parent("RT1"), + db.RecordType().add_property("is_referenced_by", ["RT2"])) + b = SyncNode(db.Record(id=132).add_parent("RT2").add_property('a', a), + db.RecordType().add_property("a")) + ses = [a, b] + + mappings = SyncGraph._create_reference_mapping(ses) + # test initialization + for index, mapping in product((0, 1), mappings): + assert id(ses[index]) in mapping + + (forward_references, backward_references, forward_references_id_props, + backward_references_id_props, forward_references_backref, + backward_references_backref) = mappings + + # a has no ref + assert len(forward_references[id(a)]) == 0 + assert backward_references[id(a)] == set([b]) + # b does + assert forward_references[id(b)] == set([a]) + assert backward_references[id(b)] == set() + # a has no identifying reference + assert forward_references_id_props[id(a)] == set() + assert backward_references_id_props[id(a)] == set([b]) + # b has an identifying reference + assert forward_references_id_props[id(b)] == set([a]) + assert backward_references_id_props[id(b)] == set() + # a has an identifying back reference + assert forward_references_backref[id(a)] == set() + assert backward_references_backref[id(a)] == set([b]) + # b does not + assert forward_references_backref[id(b)] == set([a]) + assert backward_references_backref[id(b)] == set() + + +@patch("caoscrawler.sync_graph.cached_get_entity_by", + new=Mock(side_effect=mock_get_entity_by)) +def test_SyncGraph_init(): + # trivial case + a = db.Record(id=101).add_parent("A") + ident_a = db.RecordType().add_parent("A").add_property("prop_ident") + ident_adapter = CaosDBIdentifiableAdapter() + ident_adapter.register_identifiable("A", ident_a) + SyncGraph([a], ident_adapter) + SyncGraph([], ident_adapter) # should not fail either... 
+ # test whether missing identifying properties cause an exception + with pytest.raises(MissingIdentifyingProperty): + SyncGraph([db.Record().add_parent("A")], ident_adapter) + + entlist = [ + db.Record(id=101).add_parent("A"), + db.Record(id=102).add_parent("A"), + db.File(path='a').add_parent("A"), + db.File(path='b').add_parent("A"), + db.Record(id=103).add_parent("A"), + db.Record(id=104).add_parent("A").add_property(name='prop_ident', value="MERGEME"), + db.Record().add_parent("A").add_property(name='prop_ident', value="MERGEME"), + db.File(path='a', file='b').add_parent("A"), + db.Record(id=101).add_parent("A"), + db.Record().add_parent("A").add_property(name='prop_ident', value="other"), + db.Record().add_parent("A").add_property(name='prop_ident', + value=db.Record().add_parent("A") + .add_property(name='prop_ident', value="other")), + db.File(path='a', file='b').add_parent("A"), + db.Record(id=101).add_parent("A"), + ] + st = SyncGraph(entlist, ident_adapter) + # all nodes with ID=101 have been merged + assert len([el for el in st.nodes if el.id == 101]) == 1 + # all nodes with path='a' have been merged + assert len([el for el in st.nodes if el.path == 'a']) == 1 + # all nodes with ID or path were removed from unchecked + for el in st.nodes: + if el.id is not None or el.path is not None: + assert el not in st.unchecked + # all nodes with ID are in the ID lookup + for el in st.nodes: + if el.id is not None: + assert st._id_look_up[el.id] is el + # all nodes with path are in the path lookup + for el in st.nodes: + if el.path is not None: + assert st._path_look_up[el.path] is el + # all nodes with identifiable are in the identifiable lookup + for el in st.nodes: + if el.identifiable is not None: + assert st._identifiable_look_up[el.identifiable.get_representation()] is el + # The node, which has no ID but has an identifiable, was merged with another node with ID (due + # to the shared identifiable) + new_one = [el for el in st.nodes if len(el.properties) > 0 + and el.properties[0].value == "MERGEME"] + assert len(new_one) == 1 + assert new_one[0].id == 104 + # every node that does not rely on something unchecked has an identifiable or an ID + for el in st.nodes: + if not st._identity_relies_on_unchecked_entity(el): + assert el.identifiable is not None or el.id is not None + + +@patch("caoscrawler.identifiable_adapters.get_children_of_rt", + new=Mock(side_effect=lambda x: [x])) +def test_merge_into_trivial(simple_adapter): + # simplest case: a -> c + # b + # (a reference c; b does not reference anything; a & b have the same target + # record) + c = db.Record(name='c').add_parent("RT2") + a = db.Record(name='a').add_parent("RT1").add_property('RT2', c) + b = db.Record(id=101).add_parent("RT1") + + st = SyncGraph([a, b], simple_adapter) + se_a, se_b, se_c = st.nodes + assert se_a.name == 'a' + assert se_b.id == 101 + assert se_c.name == 'c' + + # CHECK REFERENCE MAP (before merge): + # c is referenced by a + assert len(st.forward_references[id(se_a)]) == 1 + assert se_c in st.forward_references[id(se_a)] + assert len(st.forward_references[id(se_b)]) == 0 + assert len(st.forward_references[id(se_c)]) == 0 + assert len(st.backward_references[id(se_a)]) == 0 + assert len(st.backward_references[id(se_b)]) == 0 + assert len(st.backward_references[id(se_c)]) == 1 + assert se_a in st.backward_references[id(se_c)] + + assert len(st.forward_references_id_props[id(se_a)]) == 1 + assert se_c in st.forward_references_id_props[id(se_a)] + assert len(st.forward_references_id_props[id(se_b)]) == 0 + 
assert len(st.forward_references_id_props[id(se_c)]) == 0 + assert len(st.backward_references_id_props[id(se_a)]) == 0 + assert len(st.backward_references_id_props[id(se_b)]) == 0 + assert len(st.backward_references_id_props[id(se_c)]) == 1 + assert se_a in st.backward_references_id_props[id(se_c)] + + assert len(st.forward_references_backref[id(se_a)]) == 1 + assert se_c in st.forward_references_backref[id(se_a)] + assert len(st.forward_references_backref[id(se_b)]) == 0 + assert len(st.forward_references_backref[id(se_c)]) == 0 + assert len(st.backward_references_backref[id(se_a)]) == 0 + assert len(st.backward_references_backref[id(se_b)]) == 0 + assert len(st.backward_references_backref[id(se_c)]) == 1 + assert se_a in st.backward_references_backref[id(se_c)] + + st.set_id_of_node(se_a, 101) + + # CHECK REFERENCE MAP (after merge): + # c is now referenced by b + assert id(se_a) not in st.forward_references + assert len(st.forward_references[id(se_b)]) == 1 + assert se_c in st.forward_references[id(se_b)] + assert len(st.forward_references[id(se_c)]) == 0 + assert id(se_a) not in st.backward_references + assert len(st.backward_references[id(se_b)]) == 0 + assert len(st.backward_references[id(se_c)]) == 1 + assert se_b in st.backward_references[id(se_c)] + + assert id(se_a) not in st.forward_references_id_props + assert len(st.forward_references_id_props[id(se_b)]) == 1 + assert se_c in st.forward_references_id_props[id(se_b)] + assert len(st.forward_references_id_props[id(se_c)]) == 0 + assert id(se_a) not in st.backward_references_id_props + assert len(st.backward_references_id_props[id(se_b)]) == 0 + assert len(st.backward_references_id_props[id(se_c)]) == 1 + assert se_b in st.backward_references_id_props[id(se_c)] + + assert id(se_a) not in st.forward_references_backref + assert len(st.forward_references_backref[id(se_b)]) == 1 + assert se_c in st.forward_references_backref[id(se_b)] + assert len(st.forward_references_backref[id(se_c)]) == 0 + assert id(se_a) not in st.backward_references_backref + assert len(st.backward_references_backref[id(se_b)]) == 0 + assert len(st.backward_references_backref[id(se_c)]) == 1 + assert se_b in st.backward_references_backref[id(se_c)] + + +@patch("caoscrawler.identifiable_adapters.get_children_of_rt", + new=Mock(side_effect=lambda x: [x])) +def test_merge_into_simple(simple_adapter): + # simple case: a -> c <- b (a & b reference c; a & b have the same target record) + c = db.Record(name='c').add_parent("RT2") + a = db.Record().add_parent("RT1").add_property('RT2', c) + b = db.Record().add_parent("RT1").add_property('RT2', c) + + st = SyncGraph([a, b], simple_adapter) + se_a = st.nodes[0] + se_b = st.nodes[1] + se_c = st.nodes[2] + + # CHECK REFERENCE MAP: + # c is referenced by a & b + assert len(st.forward_references[id(se_a)]) == 1 + se_c in st.forward_references[id(se_a)] + assert len(st.forward_references[id(se_b)]) == 1 + se_c in st.forward_references[id(se_b)] + assert len(st.forward_references[id(se_c)]) == 0 + assert len(st.backward_references[id(se_a)]) == 0 + assert len(st.backward_references[id(se_b)]) == 0 + assert len(st.backward_references[id(se_c)]) == 2 + se_a in st.backward_references[id(se_c)] + se_b in st.backward_references[id(se_c)] + + assert len(st.forward_references_id_props[id(se_a)]) == 1 + se_c in st.forward_references_id_props[id(se_a)] + assert len(st.forward_references_id_props[id(se_b)]) == 1 + se_c in st.forward_references_id_props[id(se_b)] + assert len(st.forward_references_id_props[id(se_c)]) == 0 + assert 
len(st.backward_references_id_props[id(se_a)]) == 0 + assert len(st.backward_references_id_props[id(se_b)]) == 0 + assert len(st.backward_references_id_props[id(se_c)]) == 2 + se_a in st.backward_references_id_props[id(se_c)] + se_b in st.backward_references_id_props[id(se_c)] + + assert len(st.forward_references_backref[id(se_a)]) == 1 + se_c in st.forward_references_backref[id(se_a)] + assert len(st.forward_references_backref[id(se_b)]) == 1 + se_c in st.forward_references_backref[id(se_b)] + assert len(st.forward_references_backref[id(se_c)]) == 0 + assert len(st.backward_references_backref[id(se_a)]) == 0 + assert len(st.backward_references_backref[id(se_b)]) == 0 + assert len(st.backward_references_backref[id(se_c)]) == 2 + se_a in st.backward_references_backref[id(se_c)] + se_b in st.backward_references_backref[id(se_c)] + + st._merge_into(se_a, se_b) + + # CHECK REFERENCE MAP (after merge): + # c is now referenced by b + # (same situation as above) + assert id(se_a) not in st.forward_references + assert len(st.forward_references[id(se_b)]) == 1 + se_c in st.forward_references[id(se_b)] + assert len(st.forward_references[id(se_c)]) == 0 + assert id(se_a) not in st.backward_references + assert len(st.backward_references[id(se_b)]) == 0 + assert len(st.backward_references[id(se_c)]) == 1 + se_b in st.backward_references[id(se_c)] + + assert id(se_a) not in st.forward_references_id_props + assert len(st.forward_references_id_props[id(se_b)]) == 1 + se_c in st.forward_references_id_props[id(se_b)] + assert len(st.forward_references_id_props[id(se_c)]) == 0 + assert id(se_a) not in st.backward_references_id_props + assert len(st.backward_references_id_props[id(se_b)]) == 0 + assert len(st.backward_references_id_props[id(se_c)]) == 1 + se_b in st.backward_references_id_props[id(se_c)] + + assert id(se_a) not in st.forward_references_backref + assert len(st.forward_references_backref[id(se_b)]) == 1 + se_c in st.forward_references_backref[id(se_b)] + assert len(st.forward_references_backref[id(se_c)]) == 0 + assert id(se_a) not in st.backward_references_backref + assert len(st.backward_references_backref[id(se_b)]) == 0 + assert len(st.backward_references_backref[id(se_c)]) == 1 + se_b in st.backward_references_backref[id(se_c)] + + +@patch("caoscrawler.identifiable_adapters.get_children_of_rt", + new=Mock(side_effect=lambda x: [x])) +def test_backward_references_backref(): + # We use the reference as identifying reference in both directions. 
Thus the map is the same + # for all three categories: references, id_references and id_referenced_by + ident_a = db.RecordType().add_parent("BR").add_property("name") + ident_b = db.RecordType().add_parent("C").add_property("is_referenced_by", ["BR"]) + ident_adapter = CaosDBIdentifiableAdapter() + ident_adapter.register_identifiable("BR", ident_a) + ident_adapter.register_identifiable("C", ident_b) + + referenced = db.Record(name="B").add_parent("C") + ent_list = [referenced, db.Record(name="A").add_parent("BR").add_property("ref", referenced), ] + + st = SyncGraph(ent_list, ident_adapter) + assert st.nodes[1] in st.backward_references_backref[id(st.nodes[0])] + + +@patch("caoscrawler.identifiable_adapters.get_children_of_rt", + new=Mock(side_effect=lambda x: [x])) +def test_set_id_of_node(simple_adapter): + # setting the id should lead to the node being marked as existing + ent_list = [db.Record(name='a').add_parent("RT5")] + st = SyncGraph(ent_list, simple_adapter) + assert len(st.nodes) == 1 + assert len(st.unchecked) == 1 + st.set_id_of_node(st.unchecked[0], 101) + assert len(st.nodes) == 1 + assert len(st.unchecked) == 0 + assert id(st.nodes[0]) in st._existing + + # setting the id with None should lead to the node being marked as missing + ent_list = [db.Record().add_parent("RT1").add_property(name="RT2", value=1)] + st = SyncGraph(ent_list, simple_adapter) + assert len(st.nodes) == 1 + assert len(st.unchecked) == 1 + # is automatically set in during initialization of graph + assert st.nodes[0].identifiable is not None + st.set_id_of_node(st.unchecked[0]) + assert len(st.nodes) == 1 + assert len(st.unchecked) == 0 + assert id(st.nodes[0]) in st._missing + + # setting the id to one that already exists should lead to a merge + ent_list = [ + db.Record(id=101).add_parent("RT5"), + db.Record(name='a').add_parent("RT5").add_property(name="RT2", value=1)] + st = SyncGraph(ent_list, simple_adapter) + assert len(st.nodes) == 2 + assert len(st.unchecked) == 1 + st.set_id_of_node(st.unchecked[0], 101) + assert len(st.nodes) == 1 + assert len(st.unchecked) == 0 + assert st.nodes[0].properties[0].name == "RT2" + + # setting the id to None should lead to depending nodes marked as missing + ent_list = [ + db.Record().add_parent("RT3").add_property(name="a", value=1).add_property( + name="RT2", value=db.Record().add_parent("RT2")), + ] + st = SyncGraph(ent_list, simple_adapter) + assert len(st.nodes) == 2 + assert len(st.unchecked) == 2 + st.set_id_of_node(st.unchecked[0]) + assert len(st.nodes) == 2 + assert len(st.unchecked) == 0 + assert id(st.nodes[0]) in st._missing + assert id(st.nodes[1]) in st._missing + + # same as above but with backref + ent_list = [ + db.Record() + .add_parent("RT4") + .add_property(name="RT3", + value=db.Record().add_parent("RT3").add_property(name="a", value=1)), + ] + st = SyncGraph(ent_list, simple_adapter) + assert len(st.nodes) == 2 + assert len(st.unchecked) == 2 + assert st.unchecked[1].identifiable is not None + st.set_id_of_node(st.unchecked[1]) + assert len(st.nodes) == 2 + assert len(st.unchecked) == 0 + assert id(st.nodes[0]) in st._missing + assert id(st.nodes[1]) in st._missing + + # setting an id might allow to check another node that depends on the former + ent_list = [ + db.Record() + .add_parent("RT4") + .add_property(name="RT3", + value=db.Record().add_parent("RT3").add_property(name="a", value=1)), + ] + st = SyncGraph(ent_list, simple_adapter) + assert st.nodes[0].identifiable is None + assert st.nodes[1].identifiable is not None + 
st.set_id_of_node(st.unchecked[1], 111)
+    assert st.nodes[0].identifiable is not None
+    assert st.nodes[1].identifiable is not None
+
+    # same as above but going one step further: the new identifiable allows to merge that node
+    ent_list = [
+        (db.Record()
+         .add_parent("RT4")
+         .add_property(name="RT3",
+                       value=db.Record().add_parent("RT3").add_property(name="a", value=1))),
+
+        (db.Record()
+         .add_parent("RT4")
+         .add_property(name="RT3", value=111))
+    ]
+    st = SyncGraph(ent_list, simple_adapter)
+    assert st.nodes[0].identifiable is None
+    assert st.nodes[1].identifiable is not None
+    assert st.nodes[2].identifiable is not None
+    assert len(st.nodes) == 3
+    st.set_id_of_node(st.unchecked[2], 111)
+    assert st.nodes[0].identifiable is not None
+    assert len(st.nodes) == 2
+
+
+@patch("caoscrawler.sync_graph.cached_get_entity_by",
+       new=Mock(side_effect=mock_get_entity_by))
+def test_merging(simple_adapter):
+    # identifying information can be given at various locations in the hierarchical tree
+    # test whether an object is correctly combined for all cases
+    ident_adapter = CaosDBIdentifiableAdapter()
+    ident_a = db.RecordType().add_parent("A").add_property("name").add_property("a")
+    ident_adapter.register_identifiable("A", ident_a)
+    ident_adapter.retrieve_identified_record_for_identifiable = Mock(
+        side_effect=partial(
+            basic_retrieve_by_name_mock_up, known={"A": db.Record(id=1111, name="A")}))
+
+    # merging based on id
+    ent_list = [
+        db.Record(id=101).add_parent("A"),
+        db.Record(id=101).add_parent("A")]
+    st = SyncGraph(ent_list, ident_adapter)
+    assert len(st.nodes) == 1
+    assert len(st.unchecked) == 0
+    assert 101 == st.nodes[0].id
+    assert "A" == st.nodes[0].parents[0].name
+
+    # merging based on path
+    ent_list = [
+        db.File(path='101').add_parent("A"),
+        db.File(path='101').add_parent("A")]
+    st = SyncGraph(ent_list, ident_adapter)
+    assert len(st.nodes) == 1
+    assert len(st.unchecked) == 0
+    assert '101' == st.nodes[0].path
+    assert "A" == st.nodes[0].parents[0].name
+
+    # merging based on identifiable (non-identifying properties are ignored)
+    ent_list = [
+        db.File(name='101').add_parent("A").add_property('a', value=1).add_property('b', value=1),
+        db.File(name='101').add_parent("A").add_property('a', value=1).add_property('b', value=2)]
+    st = SyncGraph(ent_list, ident_adapter)
+    assert len(st.nodes) == 1
+    assert st.nodes[0].id is None
+    assert '101' == st.nodes[0].name
+    assert "A" == st.nodes[0].parents[0].name
+    assert 1 == st.nodes[0].properties[0].value
+    assert "a" == st.nodes[0].properties[0].name
+
+    # Merging a mix. One Record needs the identifiable to be merged. But the identifying
+    # information is scattered in the other case.
+ ent_list = [ + db.Record(id=101).add_parent("A"), + db.Record(id=101, name='a').add_parent("A"), + db.Record(id=101).add_parent("A").add_property('a', value=1), + db.Record(name='a').add_parent("A").add_property('a', value=1)] + + st = SyncGraph(ent_list, ident_adapter) + assert len(st.nodes) == 1 + assert len(st.unchecked) == 0 + assert 'a' == st.nodes[0].name + assert "A" == st.nodes[0].parents[0].name + assert 1 == st.nodes[0].properties[0].value + assert "a" == st.nodes[0].properties[0].name + assert 101 == st.nodes[0].id + + # test that adding an ID can lead to a cascade of merges + # This also tests whether setting something to missing allows to create an identifiable + # and thus allows a merge + subtree = db.Record(name='a').add_parent("A").add_property('a', value=db.Record( + name='b').add_parent("A").add_property('a', value=db.Record( + name='c').add_parent("A").add_property('a', value="missing"))) + ent_list = [ + db.Record(id=101).add_parent("A"), + db.Record(id=101, name='z').add_parent("A"), + db.Record(id=101).add_parent("A").add_property('a', value=subtree), + db.Record(name='z').add_parent("A").add_property('a', value=subtree), + ] + + st = SyncGraph(ent_list, ident_adapter) + assert len(st.nodes) == 5 + assert len(st.unchecked) == 4 + missing_one = [el for el in st.nodes if el.name == 'c'][0] + st.set_id_of_node(missing_one) + # setting c to missing means that b cannot exist which means that a cannot exist, this allows + # to merge the two z nodes + assert len(st.nodes) == 4 + assert len(st.unchecked) == 0 + + +def test_update_of_reference_values(simple_adapter): + # multiple nodes are merged including one that is referenced + # assure that this still leads to the value of the property of the referencing node to be + # updated, when the id is set. 
(Value object is replaced appropriately)
+    a = db.Record().add_parent("RT3").add_property('a', value=1)
+    ent_list = [
+        a,
+        db.Record().add_parent("RT3").add_property('a', value=1),
+        db.Record().add_parent("RT3").add_property('a', value=1),
+        db.Record().add_parent("RT3").add_property('a', value=1),
+        db.Record().add_parent("RT3").add_property('a', value=1),
+        db.Record().add_parent("RT4").add_property('RT3', value=a),
+        db.Record().add_parent("RT3").add_property('a', value=1),
+        db.Record().add_parent("RT3").add_property('a', value=1)]
+    st = SyncGraph(ent_list, simple_adapter)
+    assert len(st.nodes) == 2
+    assert len(st.unchecked) == 2
+    assert 'RT4' == st.nodes[1].parents[0].name
+    st.set_id_of_node(st.nodes[0], 101)
+    b_prop = st.nodes[1].properties[0].value
+    assert b_prop.id == 101
+
+
+def test_ignoring_irrelevant_references(simple_adapter):
+    # make sure that a circle of references is no problem if one reference is not identifying
+    b = db.Record(name='b').add_parent("RT5")
+    a = db.Record().add_parent("RT3").add_property('a', value=b)
+    b.add_property('a', value=a)
+    ent_list = [a, b]
+    st = SyncGraph(ent_list, simple_adapter)
+    assert len(st.nodes) == 2
+    assert len(st.unchecked) == 2
+    assert st.nodes[1].name == 'b'
+
+    # a relies on b
+    assert st._identity_relies_on_unchecked_entity(st.nodes[0])
+    # b relies on nothing
+    assert not st._identity_relies_on_unchecked_entity(st.nodes[1])
+    # set ID of b
+    st.set_id_of_node(st.nodes[1], 101)
+    assert len(st.unchecked) == 1
+    # now a no longer relies on unchecked
+    assert not st._identity_relies_on_unchecked_entity(st.nodes[0])
+
+# xfail: implementation currently insufficient
+
+
+@pytest.mark.xfail()
+def test_detect_circular_dependency(crawler_mocked_identifiable_retrieve, caplog):
+    crawler = crawler_mocked_identifiable_retrieve
+    crawler.identifiableAdapter.get_registered_identifiable = Mock(
+        side_effect=lambda x: db.Record().add_parent('C').add_property(name='C'))
+    a = db.Record(name='a').add_parent("C")
+    b = db.Record(name='b').add_parent("C").add_property(name="C", value=a)
+    c = db.Record(name='c').add_parent("C").add_property(name='D', value='e'
+                                                          ).add_property(name="C", value=b)
+    d = db.Record(name='c').add_parent("C")
+    a.add_property(name="C", value=c)
+    flat = [a, b, c]
+    circle = Crawler.detect_circular_dependency(flat)
+    assert [id(el) for el in circle] == [id(el) for el in [a, c, b, a]]
+
+    assert Crawler.detect_circular_dependency([d]) is None
+    st = SyncGraph(flat, crawler.identifiableAdapter)
+    with pytest.raises(RuntimeError):
+        _, _ = crawler._split_into_inserts_and_updates(st)
+    caplog.set_level(logging.ERROR, logger="caoscrawler.converters")
+    assert "Found circular dependency" in caplog.text
+    assert "\n--------\n\n> Parent: C\n\n>> Name: a\n[\'C\']" in caplog.text
+    caplog.clear()
+
+
+def test_set_each_scalar_value():
+    """Test whether properties with None as value are treated appropriately."""
+    a = SyncNode(db.Record().add_parent("RT1").add_property(name="bla"),
+                 db.RecordType().add_property("is_referenced_by", ["RT2"]))
+    _set_each_scalar_value(a, lambda x: False, None)
+    _set_each_scalar_value(a, lambda x: isinstance(x, SyncNode), None)
+    _set_each_scalar_value(a, lambda x: x is None, lambda x: 42)
+    assert a.properties[0].value == 42
+    _set_each_scalar_value(a, lambda x: x == 42, lambda x: None)
+    assert a.properties[0].value is None
+
+
+@patch("caoscrawler.identifiable_adapters.cached_query",
+       new=Mock(side_effect=mock_cached_only_rt_allow_empty))
+def test_merge_referenced_by():
+    """Merging two
entities that are referenced by a third entity with nonexistent RecordType. + + See also https://gitlab.com/linkahead/linkahead-crawler/-/issues/95 + """ + ident = CaosDBIdentifiableAdapter() + ident.load_from_yaml_object({ + "RT_A": ["name"], + "RT_B": [{"is_referenced_by": ["RT_A"]}, "my_id"] + }) + crawled_data: list = [] + references: list = [] + for ii in [0, 1]: + rec = db.Record().add_parent("RT_B").add_property("my_id", value=ii) + references.append(rec) + crawled_data.append(rec) + rec_a = db.Record(name="Rec_A").add_parent("RT_A") + rec_a.add_property("my_ref", value=references) + crawled_data.append(rec_a) + + with pytest.raises(MissingRecordType) as mrt: + SyncGraph(crawled_data, ident) + assert str(mrt.value).endswith("Record type could not be found on server: RT_A") diff --git a/unittests/test_sync_node.py b/unittests/test_sync_node.py new file mode 100644 index 0000000000000000000000000000000000000000..1f95551d34f9e06ab3e2fc196e1e7809eabfa019 --- /dev/null +++ b/unittests/test_sync_node.py @@ -0,0 +1,376 @@ +#!/usr/bin/env python3 +# encoding: utf-8 +# +# This file is a part of the LinkAhead Project. +# +# Copyright (C) 2024 Indiscale GmbH <info@indiscale.com> +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <https://www.gnu.org/licenses/>. +# +from unittest.mock import MagicMock, Mock, patch + +import linkahead as db +import pytest +from test_crawler import basic_retrieve_by_name_mock_up, mock_get_entity_by + +from caoscrawler.exceptions import ImpossibleMergeError +from caoscrawler.identifiable import Identifiable +from caoscrawler.identifiable_adapters import CaosDBIdentifiableAdapter +from caoscrawler.sync_graph import SyncGraph +from caoscrawler.sync_node import SyncNode, parent_in_list, property_in_list + + +def assert_parents_equal(p1, p2): + """Special assertion for comparing parents.""" + for a, b in zip(p1, p2): + assert a.id == b.id + assert a.name == b.name + + +def assert_properties_equal(p1, p2): + """Special assertion for comparing properties.""" + for a, b in zip(p1, p2): + assert a.id == b.id + assert a.name == b.name + assert a.value == b.value + assert a.datatype == b.datatype + + +def test_sync_node(): + # initialization + rec = (db.Record(id=101, name='101') + .add_parent("A") + .add_parent("B") + .add_parent(id=102) + .add_property(name="a", value='a') + .add_property(id=103, value='b')) + rec.description = "hallo" + sna = SyncNode(rec) + # check information stored in initialized SyncNode + assert "Record" in str(sna) + assert sna.id == rec.id + assert sna.role == rec.role + assert sna.name == rec.name + assert sna.description == rec.description + assert_parents_equal(sna.parents, rec.parents) + assert_properties_equal(sna.properties, rec.properties) + # ... 
special case File (path and file attributes) + fi = db.File(id=101, name='101', path='/a/') + snb = SyncNode(fi) + assert snb.role == fi.role + assert snb.name == fi.name + assert snb.id == fi.id + assert snb.path == fi.path + assert snb.file == fi.file + + # check information in exported db.Entity + export = sna.export_entity() + assert export.id == rec.id + assert export.role == rec.role + assert export.name == rec.name + assert export.description == rec.description + assert_parents_equal(export.parents, rec.parents) + assert_properties_equal(export.properties, rec.properties) + export = snb.export_entity() + assert export.role == fi.role + assert export.name == fi.name + assert export.id == fi.id + assert export.path == fi.path + assert export.file == fi.file + + # merge no common information + # --------------------------- + rec_a = (db.Record(name='101') + .add_parent("A") + .add_parent(id=102) + .add_property(name="a", value='a') + .add_property(id=103, value='b')) + + rec_b = (db.Record(id=101) + .add_parent("B") + .add_parent(id=103) + .add_property(name="a", value='a') + .add_property(id=103, value='b')) + rec_b.description = "tja" + + sn_a = SyncNode(rec_a) + sn_b = SyncNode(rec_b) + sn_a.update(sn_b) + # test information in updated node + assert sn_a.id == rec_b.id + assert sn_a.role == rec_a.role + assert sn_a.name == rec_a.name + assert sn_a.description == rec_b.description + for p in rec_a.parents + rec_b.parents: + assert p in sn_a.parents + for p in rec_a.properties + rec_b.properties: + assert p in sn_a.properties + # Check for duplicated property: + ps = [p for p in sn_a.properties if p.name == "a"] + assert len(ps) == 2 + assert ps[0].value == "a" + assert ps[1].value == "a" + + # test information in exported entity + export = sn_a.export_entity() + assert export.id == rec_b.id + assert export.name == rec_a.name + for p in rec_a.parents + rec_b.parents: + assert parent_in_list(p, export.parents) + for p in rec_a.properties + rec_b.properties: + if p.name is not None: + assert p.name in [el.name for el in export.properties] + if p.id is not None: + assert p.id in [el.id for el in export.properties] + assert len(export.properties) == 2 + assert export.get_property('a').value == 'a' + assert export.get_property(103).value == 'b' + assert export.description == rec_b.description + assert export.role == rec_a.role + + # merge with common information + # ----------------------------- + rec_a = (db.Record(id=101, name='101') + .add_parent("A") + .add_parent(id=102) + .add_property(name="a", value='a')) + + rec_b = (db.Record(id=101, name='101') + .add_parent("A") + .add_parent(id=102) + .add_property(name="a", value='a')) + + sn_a = SyncNode(rec_a) + sn_b = SyncNode(rec_b) + sn_a.update(sn_b) + assert sn_a.id == rec_b.id + assert sn_a.name == rec_a.name + for p in rec_a.parents + rec_b.parents: + assert parent_in_list(p, sn_a.parents) + for p in rec_a.properties + rec_b.properties: + assert property_in_list(p, sn_a.properties) + assert sn_a.description == rec_b.description + assert sn_a.role == rec_a.role + + # merge with conflicting information + # ---------------------------------- + # ID mismatch + sn_a = SyncNode(db.Record(id=102)) + with pytest.raises(ImpossibleMergeError, match="Trying to update"): + sn_a.update(SyncNode(db.Record(id=101))) + + # name mismatch + sn_a = SyncNode(db.Record(name='102')) + with pytest.raises(ImpossibleMergeError, match="Trying to update"): + sn_a.update(SyncNode(db.Record(name='101'))) + + # type mismatch + sn_a = 
SyncNode(db.Record(name='102')) + with pytest.raises(ImpossibleMergeError, match="Trying to update"): + sn_a.update(SyncNode(db.File(name='102'))) + + # description mismatch + sn_a = SyncNode(db.Record(description='102')) + with pytest.raises(ImpossibleMergeError, match="Trying to update"): + sn_a.update(SyncNode(db.Record(description='101'))) + + # path mismatch + sn_a = SyncNode(db.File(path='102')) + with pytest.raises(ImpossibleMergeError, match="Trying to update"): + sn_a.update(SyncNode(db.File(path='101'))) + + # identifiable mismatch + sn_a = SyncNode(db.File(path='102')) + sn_a.identifiable = Identifiable(name='a') + sn_b = SyncNode(db.File(path='101')) + sn_b.identifiable = Identifiable(name='b') + with pytest.raises(ValueError, match="identifiable"): + sn_a.update(sn_b) + + +def test_export_node(): + rec_a = (db.Record(id=101) + .add_parent("B") + .add_parent(id=103) + .add_property(name="a", value=[SyncNode(db.Record())]) + .add_property(name='b', id=103, value='b')) + + sn_a = SyncNode(rec_a) + exp = sn_a.export_entity() + assert exp.id == rec_a.id + assert exp.name == rec_a.name + for p in rec_a.parents: + assert len([el for el in exp.parents if p.name == el.name]) == 1 + for p in rec_a.properties: + assert p.value == exp.get_property(p.name).value + if isinstance(p.value, list): + assert len(p.value) == len(exp.get_property(p.name).value) + assert len(exp.properties) == len(rec_a.properties) + assert len(exp.parents) == len(rec_a.parents) + + # --------------------------------------------------------------------------------------------- + # NOTE: in the following we create a SyncNode object with twice the same Property as a short + # hand for a SyncNode that was created from one Entity with such a Property and then updating + # it with another SyncNode that also has the Property + # --------------------------------------------------------------------------------------------- + + # same property name, different values + rec_a = (db.Record(id=101) + .add_parent("B") + .add_property(name="a", value='b') + .add_property(name="a", value='a')) + + # there should be a warning when multiproperties are used + with pytest.warns(UserWarning) as caught: + SyncNode(rec_a) + messages = {str(w.message) for w in caught} + assert ("Multiproperties are not supported by the crawler.") in messages + + with pytest.raises(ImpossibleMergeError) as ime: + exp = SyncNode(rec_a).export_entity() + assert "The problematic property is 'a' with values '['b']' and '['a']'" in str(ime.value) + + # SyncNodes with same ID are considered equal + rec_a = (db.Record(id=101) + .add_parent("B") + .add_property(name="a", value=SyncNode(db.Record(id=1))) + .add_property(name="a", value=SyncNode(db.Record(id=1)))) + + exp = SyncNode(rec_a).export_entity() + assert exp.get_property('a').value.id == 1 + # SyncNodes convert multi properties into single properties + assert len([p for p in exp.properties if p.name == "a"]) == 1 + + # same SyncNode object is obviously equal + sn = SyncNode(db.Record(id=1)) + rec_a = (db.Record(id=101) + .add_parent("B") + .add_property(name="a", value=sn) + .add_property(name="a", value=sn)) + + exp = SyncNode(rec_a).export_entity() + assert exp.get_property('a').value.id == 1 + assert len([p for p in exp.properties if p.name == "a"]) == 1 + + # different SyncNode Objects (without an ID) are not equal + rec_a = (db.Record(id=101) + .add_parent("B") + .add_property(name="a", value=SyncNode(db.Record())) + .add_property(name="a", value=SyncNode(db.Record()))) + + with 
pytest.raises(ImpossibleMergeError) as ime: + exp = SyncNode(rec_a).export_entity() + + msg = (f"The problematic property is 'a' with values '[{SyncNode(db.Record())}]' " + f"and '[{SyncNode(db.Record())}]'") + assert msg in str(ime.value) + + # different SyncNode Objects with differing ID are not equal + rec_a = (db.Record(id=101) + .add_parent("B") + .add_property(name="a", value=SyncNode(db.Record(id=1))) + .add_property(name="a", value=SyncNode(db.Record(id=2)))) + + with pytest.raises(ImpossibleMergeError) as ime: + exp = SyncNode(rec_a).export_entity() + + msg = (f"The problematic property is 'a' with values '[{SyncNode(db.Record(id=1))}]' " + f"and '[{SyncNode(db.Record(id=2))}]'") + assert msg in str(ime.value) + + # SyncNodes with same ID are considered equal (list) + rec_a = (db.Record(id=101) + .add_parent("B") + .add_property(name="a", value=[SyncNode(db.Record(id=1)), SyncNode(db.Record(id=2))]) + .add_property(name="a", value=[SyncNode(db.Record(id=1)), SyncNode(db.Record(id=2))])) + + exp = SyncNode(rec_a).export_entity() + assert exp.get_property('a').value[0].id == 1 + assert len([p for p in exp.properties if p.name == "a"]) == 1 + + # SyncNodes with same ID are not equal when in different order (list) + rec_a = (db.Record(id=101) + .add_parent("B") + .add_property(name="a", value=[SyncNode(db.Record(id=1)), SyncNode(db.Record(id=2))]) + .add_property(name="a", value=[SyncNode(db.Record(id=2)), SyncNode(db.Record(id=1))])) + + with pytest.raises(ImpossibleMergeError) as ime: + exp = SyncNode(rec_a).export_entity() + + msg = ("The problematic property is 'a' with values " + f"'{[SyncNode(db.Record(id=1)), SyncNode(db.Record(id=2))]}' " + f"and '{[SyncNode(db.Record(id=2)), SyncNode(db.Record(id=1))]}'") + assert msg in str(ime.value) + + # same SyncNode object is obviously equal (list) + sn = SyncNode(db.Record(id=1)) + rec_a = (db.Record(id=101) + .add_parent("B") + .add_property(name="a", value=[sn]) + .add_property(name="a", value=[sn])) + + exp = SyncNode(rec_a).export_entity() + assert exp.get_property('a').value[0].id == 1 + + # different SyncNode Objects are not equal (list) + rec_a = (db.Record(id=101) + .add_parent("B") + .add_property(name="a", value=[SyncNode(db.Record())]) + .add_property(name="a", value=[SyncNode(db.Record())])) + + with pytest.raises(ImpossibleMergeError) as ime: + exp = SyncNode(rec_a).export_entity() + + msg = ("The problematic property is 'a' with values " + f"'{[SyncNode(db.Record())]}' and '{[SyncNode(db.Record())]}'") + assert msg in str(ime.value) + + # different SyncNode Objects with differing are not equal (list) + rec_a = (db.Record(id=101) + .add_parent("B") + .add_property(name="a", value=[SyncNode(db.Record(id=1))]) + .add_property(name="a", value=[SyncNode(db.Record(id=2))])) + + with pytest.raises(ImpossibleMergeError) as ime: + exp = SyncNode(rec_a).export_entity() + + msg = ("The problematic property is 'a' with values " + f"'{[SyncNode(db.Record(id=1))]}' and '{[SyncNode(db.Record(id=2))]}'") + assert msg in str(ime.value) + + # list vs no list + rec_a = (db.Record(id=101) + .add_parent("B") + .add_property(name="a", value=SyncNode(db.Record(id=1))) + .add_property(name="a", value=[SyncNode(db.Record(id=1))])) + + with pytest.raises(ImpossibleMergeError) as ime: + exp = SyncNode(rec_a).export_entity() + msg = ("The problematic property is 'a' with values " + f"'[{SyncNode(db.Record(id=1))}]' and '{[SyncNode(db.Record(id=1))]}'") + assert msg in str(ime.value) + + # different list sizes + rec_a = (db.Record(id=101) + 
.add_parent("B") + .add_property(name="a", value=[SyncNode(db.Record(id=1))]) + .add_property(name="a", value=[SyncNode(db.Record(id=1)), SyncNode(db.Record(id=1))])) + + with pytest.raises(ImpossibleMergeError) as ime: + exp = SyncNode(rec_a).export_entity() + + msg = ("The problematic property is 'a' with values " + f"'{[SyncNode(db.Record(id=1))]}' and " + f"'{[SyncNode(db.Record(id=1)), SyncNode(db.Record(id=1))]}'") + assert msg in str(ime.value) diff --git a/unittests/test_table_converter.py b/unittests/test_table_converter.py index 178393d9345bd8a6846b66e362ce4f7edac382ee..c606c1d3cdf9a95f00728eaae88153631b08af53 100644 --- a/unittests/test_table_converter.py +++ b/unittests/test_table_converter.py @@ -28,12 +28,13 @@ test the converters module import importlib import math -import os from os.path import basename, dirname, join from pathlib import Path -import caosdb as db +import linkahead as db import pytest +from utils import dircheckstr + from caoscrawler import Crawler from caoscrawler.converters import (Converter, ConverterValidationError, CSVTableConverter, DictConverter, @@ -48,8 +49,6 @@ from caoscrawler.structure_elements import (BooleanElement, DictElement, IntegerElement, ListElement, TextElement) -from utils import dircheckstr - UNITTESTDIR = Path(__file__).parent diff --git a/unittests/test_tables/spss/CITATION.cff b/unittests/test_tables/spss/CITATION.cff new file mode 100644 index 0000000000000000000000000000000000000000..140fcc071bf2d5f5709cf31bf11bd9676b81ca5f --- /dev/null +++ b/unittests/test_tables/spss/CITATION.cff @@ -0,0 +1,11 @@ +cff-version: 1.2.0 +message: "If you use this software, please cite it as below." +authors: +- family-names: "Fajardo" + given-names: "Otto" + orcid: "https://orcid.org/0000-0002-3363-9287" +title: "Pyreadstat" +version: 1.2.7 +doi: 10.5281/zenodo.6612282 +date-released: 2018-09-24 +url: "https://github.com/Roche/pyreadstat" diff --git a/unittests/test_tables/spss/LICENSE b/unittests/test_tables/spss/LICENSE new file mode 100644 index 0000000000000000000000000000000000000000..a2f94b1a2a5a4255fc8ef6d0beb94cce89f545e8 --- /dev/null +++ b/unittests/test_tables/spss/LICENSE @@ -0,0 +1,210 @@ +Test data files were copied from [pyreadstat](https://github.com/Roche/pyreadstat), they are +licensed under the Apache License, cited below. + +Copyright (C) 2018-2024 Otto Fajardo +Copyright (C) 2024 Indiscale GmbH <info@indiscale.com> +Copyright (C) 2024 Daniel Hornung <d.hornung@indiscale.com> + +pyreadstat liscence: +--------------------------------------------------------------------------- + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. 
+ + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. 
If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. 
Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
\ No newline at end of file diff --git a/unittests/test_tables/spss/sample.sav b/unittests/test_tables/spss/sample.sav new file mode 100644 index 0000000000000000000000000000000000000000..20d0c5ce6689a60adfa329a17b4347274e9a863b Binary files /dev/null and b/unittests/test_tables/spss/sample.sav differ diff --git a/unittests/test_tables/spss/sample_large.sav b/unittests/test_tables/spss/sample_large.sav new file mode 100644 index 0000000000000000000000000000000000000000..b0c16c1390a15a4f62a859ade76aa17b89c6ae40 Binary files /dev/null and b/unittests/test_tables/spss/sample_large.sav differ diff --git a/unittests/test_transformers.py b/unittests/test_transformers.py index 02d932d13cc3fad52048b08e2b9fe56f11db2ae7..a2d227adc5b0c6a8f2f96cb054e1c7670e980e10 100644 --- a/unittests/test_transformers.py +++ b/unittests/test_transformers.py @@ -29,23 +29,53 @@ See: https://gitlab.indiscale.com/caosdb/src/caosdb-crawler/-/issues/107 """ import importlib -from functools import partial from pathlib import Path -from tempfile import NamedTemporaryFile -from unittest.mock import MagicMock, Mock, patch +from unittest.mock import Mock -import caosdb as db import pytest -import yaml from caoscrawler.converters import Converter, ListElementConverter from caoscrawler.scanner import create_transformer_registry, scan_directory from caoscrawler.stores import GeneralStore -from caoscrawler.transformer_functions import replace, split +from caoscrawler.transformer_functions import (cast_to_bool, cast_to_float, + cast_to_int, cast_to_str, + replace, split) from pytest import raises UNITTESTDIR = Path(__file__).parent +@pytest.fixture +def converter_registry(): + converter_registry: dict[str, dict[str, str]] = { + "Directory": { + "converter": "DirectoryConverter", + "package": "caoscrawler.converters"}, + "MarkdownFile": { + "converter": "MarkdownFileConverter", + "package": "caoscrawler.converters"}, + "Date": { + "converter": "DateElementConverter", + "package": "caoscrawler.converters"}, + "DictElement": { + "converter": "DictElementConverter", + "package": "caoscrawler.converters"}, + "TextElement": { + "converter": "TextElementConverter", + "package": "caoscrawler.converters"}, + "ListElement": { + "converter": "ListElementConverter", + "package": "caoscrawler.converters"}, + "JSONFile": { + "converter": "JSONFileConverter", + "package": "caoscrawler.converters"}, + } + + for key, value in converter_registry.items(): + module = importlib.import_module(value["package"]) + value["class"] = getattr(module, value["converter"]) + return converter_registry + + def test_simple_transformer(): """ Test the correct list of returned records by the scanner using the @@ -82,38 +112,6 @@ def test_simple_transformer(): assert False -@pytest.fixture -def converter_registry(): - converter_registry: dict[str, dict[str, str]] = { - "Directory": { - "converter": "DirectoryConverter", - "package": "caoscrawler.converters"}, - "MarkdownFile": { - "converter": "MarkdownFileConverter", - "package": "caoscrawler.converters"}, - "Date": { - "converter": "DateElementConverter", - "package": "caoscrawler.converters"}, - "DictElement": { - "converter": "DictElementConverter", - "package": "caoscrawler.converters"}, - "TextElement": { - "converter": "TextElementConverter", - "package": "caoscrawler.converters"}, - "ListElement": { - "converter": "ListElementConverter", - "package": "caoscrawler.converters"}, - "JSONFile": { - "converter": "JSONFileConverter", - "package": "caoscrawler.converters"}, - } - - for key, value in 
converter_registry.items(): - module = importlib.import_module(value["package"]) - value["class"] = getattr(module, value["converter"]) - return converter_registry - - def test_apply_replace(converter_registry): cfood_def = {"type": 'ListElement', "match_name": ".*", 'transform': {'test': {'in': '$a', 'out': '$b', 'functions': [{ @@ -146,3 +144,73 @@ def test_apply_replace_from_def(converter_registry): conv.apply_transformers(values, transformer_functions) assert values['b'] == "16:45" + + +def test_empty_functions_list(converter_registry): + cfood_def = {"type": 'ListElement', + "match_name": ".*", + 'transform': {'test': {'in': '$a', 'out': '$b', + 'functions': []}}} + values = GeneralStore() + values["a"] = "16_45" + + # transformer_functions = create_transformer_registry(crawler_definition) + transformer_functions = {"replace": replace} + + conv = ListElementConverter(definition=cfood_def, name='test', + converter_registry=converter_registry) + + conv.apply_transformers(values, transformer_functions) + assert values['b'] == "16_45" + + +def test_cast_transformer_functions(): + for val in ("True", "true", "False", "false"): + assert type(cast_to_bool(val, {})) == bool + if val[1] == "r": + assert cast_to_bool(val, {}) is True + else: + assert cast_to_bool(val, {}) is False + for val_err in ("jaksdlfj", "0", 1): + with pytest.raises(ValueError): + cast_to_bool(val_err, {}) + assert cast_to_bool(False, {}) is False + assert cast_to_bool(True, {}) is True + + assert cast_to_int("24", {}) == 24 + assert cast_to_int(24.0, {}) == 24 + assert cast_to_int(24, {}) == 24 + assert cast_to_int("-24", {}) == -24 + with pytest.raises(ValueError): + cast_to_int("24dsf", {}) + with pytest.raises(ValueError): + cast_to_int("24.0", {}) == 24 + + assert cast_to_float("24", {}) == 24.0 + assert cast_to_float("24.0", {}) == 24.0 + assert cast_to_float(24.0, {}) == 24.0 + assert cast_to_float(24, {}) == 24.0 + with pytest.raises(ValueError): + cast_to_float("24dsf", {}) + + assert cast_to_str(24, {}) == "24" + + +def test_replace_variables(): + vals = GeneralStore() + vals["test"] = "with" + vals["a"] = "str_without_replacement" + conv = Mock() + conv.definition = {} + conv.definition["transform"] = { + "test": { + "in": "$a", + "out": "$a", + "functions": [ + {"replace": { + "remove": "without", + "insert": "$test" + }} + ]}} + Converter.apply_transformers(conv, vals, {"replace": replace}) + assert vals["a"] == "str_with_replacement" diff --git a/unittests/test_unit_cfood.yml b/unittests/test_unit_cfood.yml new file mode 100644 index 0000000000000000000000000000000000000000..214aa49adceedce49a162f380ec453fb8597f215 --- /dev/null +++ b/unittests/test_unit_cfood.yml @@ -0,0 +1,43 @@ +--- +metadata: + crawler-version: 0.9.0 +--- +data: + type: Dict + match_name: '.*' + records: + MyRec: + may_be_overwritten: + value: "12" + unit: K + subtree: + ValueWithUnit: + type: TextElement + match_name: ^value_with_unit$ + match_value: "^(?P<number>\\d+\\.?\\d*)\\s+(?P<unit>.+)" + records: + MyRec: + value_with_unit: + value: $number + unit: $unit + MayBeOverwritten: + type: TextElement + match_name: ^may_be_overwritten$ + match_value: "^(?P<number>\\d+\\.?\\d*)\\s+(?P<unit>.+)" + records: + MyRec: + may_be_overwritten: + value: $number + unit: $unit + ListOfValues: + type: ListElement + match_name: ^array_with_units$ + subtree: + SingleValueWithUnit: + type: TextElement + match_value: "^(?P<number>\\d+\\.?\\d*)\\s+(?P<unit>.+)" + records: + MyRec: + list_with_unit: + value: +$number + unit: $unit diff --git 
a/unittests/test_utilities.py b/unittests/test_utilities.py index 5a80ab9b230db4540d741bf8fa4f9d11b5158aab..a9b052524957b6f8c1e0378e3153fc06f4f36806 100644 --- a/unittests/test_utilities.py +++ b/unittests/test_utilities.py @@ -19,17 +19,65 @@ # along with this program. If not, see <https://www.gnu.org/licenses/>. # +import pytest +from os.path import sep from caoscrawler.crawl import split_restricted_path +from caoscrawler.utils import MissingImport, get_shared_resource_link def test_split_restricted_path(): assert split_restricted_path("") == [] - assert split_restricted_path("/") == [] - assert split_restricted_path("test/") == ["test"] - assert split_restricted_path("/test/") == ["test"] - assert split_restricted_path("test/bla") == ["test", "bla"] - assert split_restricted_path("/test/bla") == ["test", "bla"] - assert split_restricted_path("/test1/test2/bla") == ["test1", "test2", "bla"] - assert split_restricted_path("/test//bla") == ["test", "bla"] - assert split_restricted_path("//test/bla") == ["test", "bla"] - assert split_restricted_path("///test//bla////") == ["test", "bla"] + assert split_restricted_path(f"{sep}") == [] + assert split_restricted_path(f"test{sep}") == ["test"] + assert split_restricted_path(f"{sep}test{sep}") == ["test"] + assert split_restricted_path(f"test{sep}bla") == ["test", "bla"] + assert split_restricted_path(f"{sep}test{sep}bla") == ["test", "bla"] + assert split_restricted_path(f"{sep}test1{sep}test2{sep}bla") == ["test1", "test2", "bla"] + assert split_restricted_path(f"{sep}test{sep}{sep}bla") == ["test", "bla"] + assert split_restricted_path(f"{sep}{sep}test{sep}bla") == ["test", "bla"] + assert split_restricted_path( + f"{sep}{sep}{sep}test{sep}{sep}bla{sep}{sep}{sep}{sep}") == ["test", "bla"] + + +def test_dummy_class(): + Missing = MissingImport(name="Not Important", hint="Do the thing instead.") + with pytest.raises(RuntimeError) as err_info_1: + print(Missing.__name__) + with pytest.raises(RuntimeError) as err_info_2: + Missing() + with pytest.raises(RuntimeError) as err_info_3: + print(Missing.foo) + + for err_info in (err_info_1, err_info_2, err_info_3): + msg = str(err_info.value) + assert "(Not Important)" in msg + assert msg.endswith("Do the thing instead.") + + MissingErr = MissingImport(name="Not Important", hint="Do the thing instead.", + err=ImportError("Old error")) + with pytest.raises(RuntimeError) as err_info_1: + print(MissingErr.__name__) + with pytest.raises(RuntimeError) as err_info_2: + MissingErr() + with pytest.raises(RuntimeError) as err_info_3: + print(MissingErr.foo) + + for err_info in (err_info_1, err_info_2, err_info_3): + msg = str(err_info.value) + assert "(Not Important)" in msg + orig_msg = str(err_info.value.__cause__) + assert orig_msg == "Old error" + + +def test_shared_resource_link(): + + assert get_shared_resource_link( + "https://example.com/", "file.txt") == "https://example.com/Shared/file.txt" + assert get_shared_resource_link( + "https://example.com", "file.txt") == "https://example.com/Shared/file.txt" + assert get_shared_resource_link( + "https://example.com", "path/to/file.txt") == "https://example.com/Shared/path/to/file.txt" + assert get_shared_resource_link( + "https://example.com/context-root", "path/to/file.txt") == "https://example.com/context-root/Shared/path/to/file.txt" + assert get_shared_resource_link( + "https://example.com/context-root/", "path/to/file.txt") == "https://example.com/context-root/Shared/path/to/file.txt" diff --git a/unittests/test_validation.py 
b/unittests/test_validation.py new file mode 100644 index 0000000000000000000000000000000000000000..a3215963f67b61241b321a0eb7345f9fe6fde1f2 --- /dev/null +++ b/unittests/test_validation.py @@ -0,0 +1,86 @@ +#!/usr/bin/env python3 +# encoding: utf-8 +# +# This file is a part of the LinkAhead Project. +# +# Copyright (C) 2024 Indiscale GmbH <info@indiscale.com> +# Copyright (C) 2024 Alexander Schlemmer <a.schlemmer@indiscale.com> +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <https://www.gnu.org/licenses/>. +# + +""" +test validation +""" +from os.path import join +from pathlib import Path + +import jsonschema +import linkahead as db +import pytest +from caoscrawler.validator import (convert_record, + load_json_schema_from_datamodel_yaml, + validate) +from jsonschema import ValidationError + +UNITTESTDIR = Path(__file__).parent + + +def test_create_json_schema(): + json = load_json_schema_from_datamodel_yaml(join(UNITTESTDIR, "datamodels", "datamodel.yaml")) + r = db.Record() + r.add_parent(name="Dataset") + r.add_property(name="keywords", value="jakdlfjakdf") + r.add_property(name="dateModified", value="2024-11-16") + + pobj = convert_record(r) + # print(yaml.dump(pobj)) + # print(yaml.dump(json[0])) + assert "Dataset" in json + jsonschema.validate(pobj, json["Dataset"]) + + # Failing test: + r = db.Record() + r.add_parent(name="Dataset") + r.add_property(name="keywordss", value="jakdlfjakdf") + r.add_property(name="dateModified", value="2024-11-16") + + pobj = convert_record(r) + + with pytest.raises(ValidationError, match=".*'keywords' is a required property.*"): + jsonschema.validate(pobj, json["Dataset"]) + + +def test_validation(): + """ + Test for the main validation API function `validate` + """ + json = load_json_schema_from_datamodel_yaml( + join(UNITTESTDIR, "datamodels", "datamodel.yaml")) + r1 = db.Record() + r1.add_parent(name="Dataset") + r1.add_property(name="keywords", value="jakdlfjakdf") + r1.add_property(name="dateModified", value="2024-11-16") + + r2 = db.Record() + r2.add_parent(name="Dataset") + r2.add_property(name="keywordss", value="jakdlfjakdf") + r2.add_property(name="dateModified", value="2024-11-16") + + valres = validate([r1, r2], json) + assert valres[0][0] is True + assert valres[0][1] is None + assert not valres[1][0] + assert isinstance(valres[1][1], ValidationError) + assert valres[1][1].message == "'keywords' is a required property" diff --git a/unittests/test_variable_substitutions.py b/unittests/test_variable_substitutions.py index 09f78df661d82970e7264996102eff8881ee19ec..c75e37956c1ec24e47ff9cbd9b03572ed4a0f80e 100644 --- a/unittests/test_variable_substitutions.py +++ b/unittests/test_variable_substitutions.py @@ -19,15 +19,18 @@ # along with this program. If not, see <https://www.gnu.org/licenses/>. 
# -from copy import deepcopy from functools import partial from os.path import basename, dirname, join from pathlib import Path from unittest.mock import MagicMock, Mock -import caosdb as db +import linkahead as db import pytest import yaml +from linkahead.apiutils import compare_entities +from pytest import raises +from utils import dircheckstr as dircheckstr_base + from caoscrawler import Crawler from caoscrawler.debug_tree import DebugTree from caoscrawler.identifiable_adapters import (IdentifiableAdapter, @@ -35,10 +38,6 @@ from caoscrawler.identifiable_adapters import (IdentifiableAdapter, from caoscrawler.scanner import scan_directory from caoscrawler.structure_elements import (DictListElement, DictTextElement, File) -from caosdb.apiutils import compare_entities -from pytest import raises - -from utils import dircheckstr as dircheckstr_base UNITTESTDIR = Path(__file__).parent dircheckstr = partial(dircheckstr_base, UNITTESTDIR / "test_directories" / diff --git a/unittests/test_xml_converter.py b/unittests/test_xml_converter.py new file mode 100644 index 0000000000000000000000000000000000000000..e8869ef6ffad511159a583a14fd49d2fad48766b --- /dev/null +++ b/unittests/test_xml_converter.py @@ -0,0 +1,379 @@ +#!/usr/bin/env python3 +# encoding: utf-8 +# +# This file is a part of the LinkAhead Project. +# +# Copyright (C) 2024 Indiscale GmbH <info@indiscale.com> +# Copyright (C) 2024 Alexander Schlemmer <a.schlemmer@indiscale.com> +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <https://www.gnu.org/licenses/>. 
+# + +""" +test the XML converters +""" +import importlib +from pathlib import Path + +import pytest +import yaml +from lxml.etree import fromstring + +from caoscrawler.converters import (XMLAttributeNodeConverter, XMLTagConverter, + XMLTextNodeConverter) +from caoscrawler.scanner import load_definition +from caoscrawler.stores import GeneralStore +from caoscrawler.structure_elements import XMLTagElement + +UNITTESTDIR = Path(__file__).parent + + +@pytest.fixture +def converter_registry(): + converter_registry: dict[str, dict[str, str]] = { + "XMLTag": { + "converter": "XMLTagConverter", + "package": "caoscrawler.converters"}, + + "XMLTextNode": { + "converter": "XMLTextNodeConverter", + "package": "caoscrawler.converters"}, + "XMLAttributeNode": { + "converter": "XMLAttributeNodeConverter", + "package": "caoscrawler.converters"}, + } + + for key, value in converter_registry.items(): + module = importlib.import_module(value["package"]) + value["class"] = getattr(module, value["converter"]) + return converter_registry + + +@pytest.fixture +def basic_xmltag_converter(converter_registry): + return XMLTagConverter(yaml.safe_load(""" +type: XMLTag +match_tag: a +match_attrib: # default is the empty dictionary + "(?P<ref>(href|url))": "test(?P<number>[0-9])" # either the "href" or the "url" attribute must be set + alt: (.+) # this attribute must be present and contain at least one character +match_text: \\s*(?P<node_text>.+)\\s* + +subtree: + img: + type: XMLTag + match_name: img + match_attrib: + src: test2 +"""), "TestXMLTagConverter", converter_registry) + + +@pytest.fixture +def basic_xpath_xmltag_converter(converter_registry): + return XMLTagConverter(yaml.safe_load(""" +type: XMLTag +match_tag: a +match_attrib: # default is the empty dictionary + "(?P<ref>(href|url))": "test(?P<number>[0-9])" # either the "href" or the "url" attribute must be set + alt: (.+) # this attribute must be present and contain at least one character +match_text: \\s*(?P<node_text>.+)\\s* +xpath: child::*/* + +subtree: + img: + type: XMLTag + match_name: img + match_attrib: + src: test2 + testnode: + type: XMLTag + match_name: testnode +"""), "TestXMLTagConverter", converter_registry) + + +def test_simple_xml(basic_xmltag_converter): + """ + Test for basic xml conversion functionality. + """ + xml_text = """ + <a href="test1" alt="no link"> + test <img src="test2"/> + </a> + """ + + xml = fromstring(xml_text) + tag = XMLTagElement(xml) + assert tag.name == "." 
+ + m = basic_xmltag_converter.match(tag) + + assert m is not None + assert m["ref"] == "href" + assert m["number"] == "1" + assert m["node_text"] == "test " + + +def test_not_matching(basic_xmltag_converter): + m = basic_xmltag_converter.match(XMLTagElement(fromstring(""" + <a href="test1"> + test <img src="test2"/> + </a> + """))) + + assert m is None # alt-attribute was missing + + m = basic_xmltag_converter.match(XMLTagElement(fromstring(""" + <a href="test" alt="no link"> + test <img src="test2"/> + </a> + """))) + + assert m is None # href attribute did not match + + m = basic_xmltag_converter.match(XMLTagElement(fromstring(""" + <a href="test1" url="http" alt="no link"> + test <img src="test2"/> + </a> + """))) + + assert m is None # href and url must not be present simultaneously + + m = basic_xmltag_converter.match(XMLTagElement(fromstring(""" + <a href="test1" alt="no link"><img src="test2"/></a> + """))) + + assert m is None # text node is empty + + m = basic_xmltag_converter.match(XMLTagElement(fromstring(""" + <a href="test1" alt="no link"/> + """))) + + assert m is None # text node is empty + + # TODO: adapt converter -> empty (==None) text node is equivalent to empty string text node + # TODO: adapt tests + # TODO: how to match " ajskdlfjaldsf ajsdklfjadkl " without the whitespaces in regexp correctly? + + +def test_nested_simple_xml(basic_xmltag_converter, basic_xpath_xmltag_converter): + """ + Test for xml conversion including children. + """ + xml_text = """ + <a href="test1" alt="no link"> + test <img src="test2"/> + </a> + """ + + tag = XMLTagElement(fromstring(xml_text)) + m = basic_xmltag_converter.match(tag) + assert m is not None + + general_store = GeneralStore() + children = basic_xmltag_converter.create_children(general_store, tag) + + assert len(children) == 1 + assert isinstance(children[0], XMLTagElement) + assert children[0].name == "img" + + xml_text = """ + <a href="test1" alt="no link"> + test <img src="test2"> + <testnode/> </img> + </a> + """ + + tag = XMLTagElement(fromstring(xml_text)) + m = basic_xpath_xmltag_converter.match(tag) + assert m is not None + + general_store = GeneralStore() + children = basic_xpath_xmltag_converter.create_children(general_store, tag) + + assert len(children) == 1 + assert isinstance(children[0], XMLTagElement) + assert children[0].name == "img/testnode" + + +def test_namespace_xml(converter_registry): + """ + Test for xml conversion including children. + Nodes have namespaces. 
+ """ + + xml_text = """ + <root xmlns="default-namespace" xmlns:test="alternative-namespace"> + <node1 active="true"> + Bla + </node1> + <node1 active="true" size="45"> + text + <node2 xmlns="sub-namespace"> + <node3> + ok + </node3> + </node2> + <test:node2> + sep + </test:node2> + </node1> + </root> +""" + + # Test unsupported xpath (containing text()): + converter = XMLTagConverter(yaml.safe_load(""" +type: XMLTag +match_tag: "{default-namespace}root" +xpath: "default:node1/text()" +default_namespace: default +"""), "TestXMLTagConverter", converter_registry) + + tag = XMLTagElement(fromstring(xml_text)) + m = converter.match(tag) + assert m is not None + + with pytest.raises(RuntimeError, match="Only standard xml nodes.*"): + converter.create_children(GeneralStore(), tag) + + # Test complex xml using namespaces and text nodes: + converter = XMLTagConverter(yaml.safe_load(""" +type: XMLTag +match_tag: "{default-namespace}root" +xpath: "default:node1" +default_namespace: default +attribs_as_children: false +text_as_children: true +tags_as_children: false +"""), "TestXMLTagConverter", converter_registry) + children = converter.create_children(GeneralStore(), tag) + assert len(children) == 2 + assert children[0].name == "{default-namespace}node1[1]/text()" + assert children[0].value.strip() == "Bla" + assert children[1].name == "{default-namespace}node1[2]/text()" + assert children[1].value.strip() == "text" + + # Check child generation of attributes: + converter = XMLTagConverter(yaml.safe_load(""" +type: XMLTag +match_tag: "{default-namespace}root" +xpath: "default:node1" +default_namespace: default +attribs_as_children: true +text_as_children: false +tags_as_children: false +"""), "TestXMLTagConverter", converter_registry) + children = converter.create_children(GeneralStore(), tag) + + assert len(children) == 3 + assert children[0].name == "{default-namespace}node1[1]@active" + assert children[0].value.strip() == "true" + assert children[1].name == "{default-namespace}node1[2]@active" + assert children[1].value.strip() == "true" + assert children[2].name == "{default-namespace}node1[2]@size" + assert children[2].value.strip() == "45" + + # Test setting nsmap entries: + converter = XMLTagConverter(yaml.safe_load(""" +type: XMLTag +match_tag: "{default-namespace}root" +xpath: "//s:node2" +default_namespace: default +nsmap: + s: sub-namespace +"""), "TestXMLTagConverter", converter_registry) + children = converter.create_children(GeneralStore(), tag) + assert len(children) == 1 + assert children[0].name == "{default-namespace}node1[2]/{sub-namespace}node2" + + +def test_attrib_nodes(converter_registry): + """ + Test attribute node converters. + """ + + xml_text = """ + <node1 active="true" size="45"> + Bla + </node1> +""" + + converter = XMLTagConverter(yaml.safe_load(""" +type: XMLTag +match_tag: "node1" +xpath: . 
+tags_as_children: false
+attribs_as_children: true
+"""), "TestXMLTagConverter", converter_registry)
+
+    tag = XMLTagElement(fromstring(xml_text))
+    m = converter.match(tag)
+    assert m is not None
+    children = converter.create_children(GeneralStore(), tag)
+    assert len(children) == 2
+
+    attrib_converter = XMLAttributeNodeConverter(yaml.safe_load("""
+type: XMLAttributeNode
+match_name: active
+match_value: (?P<val>.*)
+"""), "TestXMLAttributeNodeConverter", converter_registry)
+    m = attrib_converter.match(children[1])
+    assert m is None
+    m = attrib_converter.match(children[0])
+    assert m is not None
+    assert m["val"] == "true"
+
+    attrib_converter = XMLAttributeNodeConverter(yaml.safe_load("""
+type: XMLAttributeNode
+match_name: size
+match_value: (?P<val>.*)
+"""), "TestXMLAttributeNodeConverter", converter_registry)
+    m = attrib_converter.match(children[0])
+    assert m is None
+    m = attrib_converter.match(children[1])
+    assert m is not None
+    assert m["val"] == "45"
+
+
+def test_text_nodes(converter_registry):
+    """
+    Test text node converters.
+    """
+
+    xml_text = """
+    <node1 active="true" size="45">
+      Bla
+    </node1>
+"""
+
+    converter = XMLTagConverter(yaml.safe_load("""
+type: XMLTag
+match_tag: "node1"
+xpath: .
+tags_as_children: false
+text_as_children: true
+"""), "TestXMLTagConverter", converter_registry)
+
+    tag = XMLTagElement(fromstring(xml_text))
+    m = converter.match(tag)
+    assert m is not None
+    children = converter.create_children(GeneralStore(), tag)
+    assert len(children) == 1
+
+    text_converter = XMLTextNodeConverter(yaml.safe_load(r"""
+type: XMLTextNode
+match_text: \s*(?P<val>\w*)\s*
+"""), "TestXMLTextNodeConverter", converter_registry)
+    m = text_converter.match(children[0])
+    assert m is not None
+    assert m["val"] == "Bla"
diff --git a/unittests/test_zipfile_converter.py b/unittests/test_zipfile_converter.py
new file mode 100644
index 0000000000000000000000000000000000000000..9bc8b8804e299387157869f0dc8b11a9c2a8c6f8
--- /dev/null
+++ b/unittests/test_zipfile_converter.py
@@ -0,0 +1,79 @@
+#!/usr/bin/env python3
+# encoding: utf-8
+#
+# This file is a part of the LinkAhead Project.
+#
+# Copyright (C) 2024 Indiscale GmbH <info@indiscale.com>
+# Copyright (C) 2024 Alexander Schlemmer <a.schlemmer@indiscale.com>
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <https://www.gnu.org/licenses/>.
+# + +""" +test the zip-file converter +""" +import importlib +import os +from pathlib import Path + +import pytest +import yaml +from caoscrawler.converters import DirectoryConverter, ZipFileConverter +from caoscrawler.stores import GeneralStore +from caoscrawler.structure_elements import Directory, File + +UNITTESTDIR = Path(__file__).parent + + +@pytest.fixture +def converter_registry(): + converter_registry: dict[str, dict[str, str]] = { + "ZipFile": { + "converter": "ZipFileConverter", + "package": "caoscrawler.converters"}, + } + + for key, value in converter_registry.items(): + module = importlib.import_module(value["package"]) + value["class"] = getattr(module, value["converter"]) + return converter_registry + + +def test_zipfile_converter(converter_registry): + zipfile = File("PASTA.eln", os.path.join(UNITTESTDIR, "eln_files", "PASTA.eln")) + zip_conv = ZipFileConverter(yaml.safe_load(""" +type: ZipFile +match: .*$ +"""), "TestZipFileConverter", converter_registry) + + match = zip_conv.match(zipfile) + assert match is not None + + children = zip_conv.create_children(GeneralStore(), zipfile) + assert len(children) == 1 + assert children[0].name == "PASTA" + + dir_conv = DirectoryConverter(yaml.safe_load(""" +type: Directory +match: ^PASTA$ +"""), "TestDirectory", converter_registry) + match = dir_conv.match(children[0]) + assert match is not None + children = dir_conv.create_children(GeneralStore(), children[0]) + assert len(children) == 5 + print(children) + for i in range(2): + assert isinstance(children[i], Directory) + for i in range(2, 5): + assert isinstance(children[i], File) diff --git a/unittests/utils.py b/unittests/utils.py index a9649dea686c33dc33d0d7636d08aa51beb35412..fee80e44028667b9b3c8c8f8201b1a774c46afdf 100644 --- a/unittests/utils.py +++ b/unittests/utils.py @@ -36,5 +36,5 @@ def dircheckstr(prefix, *pathcomponents): ftype = "Directory" else: ftype = "File" - return (f"caoscrawler.structure_elements.{ftype}: " + os.path.basename( + return (f"caoscrawler.structure_elements.structure_elements.{ftype}: " + os.path.basename( os.path.join(*pathcomponents)) + ", " + os.path.join(prefix, *pathcomponents))