diff --git a/.docker/Dockerfile b/.docker/Dockerfile index dd4f3d258443dc1f8b2bacb8d535780e8e37e5e8..539ac0d4e70bfbde2f630d4254cacc7419105611 100644 --- a/.docker/Dockerfile +++ b/.docker/Dockerfile @@ -10,7 +10,7 @@ RUN apt-get update && \ python3-sphinx \ tox \ -y -RUN pip3 install recommonmark sphinx-rtd-theme +RUN pip3 install pylint recommonmark sphinx-rtd-theme COPY .docker/wait-for-it.sh /wait-for-it.sh ARG PYLIB ADD https://gitlab.indiscale.com/api/v4/projects/97/repository/commits/${PYLIB} \ diff --git a/.gitignore b/.gitignore index 182ed05e1404483ecb553c8a4e469a86a77ba27c..7ad8606171fd21b94ffd8b15b972f956d4dfc1a1 100644 --- a/.gitignore +++ b/.gitignore @@ -18,3 +18,5 @@ start_caosdb_docker.sh src/doc/_apidoc /dist/ *.egg-info +venv/ +.backups \ No newline at end of file diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 8840e613f1e1eb86f30779b8b3535e2ff97ad0cc..879291320a7a715c10113f850a9f43f9465a7196 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -130,7 +130,7 @@ unittest_py3.7: # TODO: Use f-branch logic here - pip install git+https://gitlab.indiscale.com/caosdb/src/caosdb-pylib.git@dev - pip install git+https://gitlab.indiscale.com/caosdb/src/caosdb-advanced-user-tools.git@dev - - pip install . + - pip install .[h5-crawler] # actual test - caosdb-crawler --help - pytest --cov=caosdb -vv ./unittests @@ -279,7 +279,7 @@ cert: - cd .docker - CAOSHOSTNAME=caosdb-server ./cert.sh -style: +code-style: tags: [docker] stage: style image: $CI_REGISTRY_IMAGE @@ -290,6 +290,17 @@ style: - autopep8 -r --diff --exit-code . allow_failure: true +pylint: + tags: [docker] + stage: style + image: $CI_REGISTRY_IMAGE + needs: + - job: build-testenv + optional: true + allow_failure: true + script: + - pylint --unsafe-load-any-extension=y -d all -e E,F src/caoscrawler + # Build the sphinx documentation and make it ready for deployment by Gitlab Pages # Special job for serving a static website. See https://docs.gitlab.com/ee/ci/yaml/README.html#pages # Based on: https://gitlab.indiscale.com/caosdb/src/caosdb-pylib/-/ci/editor?branch_name=main diff --git a/CHANGELOG.md b/CHANGELOG.md index 25f8d2a144941c42ad4b7818dfb4a4c97c564701..240e1aff64dc555c0316606dce19589fea02a863 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,26 +10,82 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Added ### ### Changed ### -- If the `parents` key is used in a cfood at a lower level for a Record that + +### Deprecated ### + +### Removed ### + +### Fixed ### + +### Security ### + +### Documentation ### + +## [0.7.0] - 2024-03-04 ## + +### Added ### + +* `transform` sections can be added to a CFood to apply functions to values stored in variables. +* default transform functions: submatch, split and replace. +* `*` can now be used as a wildcard in the identifiables parameter file to denote + that any Record may reference the identified one. +* `crawl.TreatedRecordLookUp` class replacing the old (and slow) + `identified_cache` module. The new class now handles all records identified by + id, path, or identifiable simultaneously. See API docs for more info on how to + add to and get from the new lookup class. +* `identifiable_adapters.IdentifiableAdapter.get_identifying_referencing_entities` + and + `identifiable_adapters.IdentifiableAdapter.get_identifying_referenced_entities` + static methods to return the referencing or referenced entities belonging to a + registered identifiable, respectively. 
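For illustration, a minimal cfood fragment using the new `transform` keyword might look as follows. This is a sketch only: the converter name, the matched variable and the `marker` parameter of `split` are assumptions; the required structure (`in`, `out`, and a list of single-key function dicts) is the one validated by `Converter.apply_transformers` in `converters.py` further below.

```python
# Illustrative cfood fragment (parsed from YAML) using the new "transform"
# keyword. Each transformer needs "in", "out" and a list of single-key
# function dicts; "split" is one of the new default transformer functions,
# and its "marker" parameter is an assumption made for this sketch.
import yaml

cfood_fragment = yaml.safe_load(r"""
DataFile:                      # hypothetical converter name
  type: SimpleFile
  match: (?P<filename>.*)\.dat
  transform:
    get_sample_code:
      in: $filename            # variable filled by the match above
      out: $sample_code        # must be a single $variable name
      functions:
        - split:
            marker: "_"
""")
assert "transform" in cfood_fragment["DataFile"]
```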
+* [#70](https://gitlab.com/linkahead/linkahead-crawler/-/issues/70): Optional + converters for HDF5 files. They require this package to be installed with its + ``h5-crawler`` dependency. + +### Changed ### + +* If the `parents` key is used in a cfood at a lower level for a Record that already has a Parent (because it was explicitly given or the default Parent), the old Parent(s) are now overwritten with the value belonging to the `parents` key. -- If a registered identifiable states, that a reference by a Record with parent +* If a registered identifiable states that a reference by a Record with parent RT1 is needed, then now also references from Records that have a child of RT1 as parent are accepted. +* More aggressive caching. +* The `identifiable_adapters.IdentifiableAdapter` now creates (possibly empty) + reference lists for all records in `create_reference_mapping`. This allows + functions like `get_identifiable` to be called only with the subset of the + referencing entities belonging to a specific Record. +* The `identifiable_adapters.IdentifiableAdapter` uses entity ids (negative for + entities that don't exist remotely) instead of entity objects for keeping + track of references. ### Deprecated ### +* `IdentifiableAdapter.get_file` + ### Removed ### +* `identified_cache` module which was replaced by the `crawl.TreatedRecordLookUp` class. + ### Fixed ### -- Empty Records can now be created (https://gitlab.com/caosdb/caosdb-crawler/-/issues/27) +* Empty Records can now be created (https://gitlab.com/caosdb/caosdb-crawler/-/issues/27) * [#58](https://gitlab.com/caosdb/caosdb-crawler/-/issues/58) Documentation builds API docs in pipeline now. - -### Security ### - -### Documentation ### +* [#117](https://gitlab.indiscale.com/caosdb/src/caosdb-crawler/-/issues/117) + `replace_variable` no longer unnecessarily changes the type. Values stored + in variables in a CFood can now have other types. +* [indiscale#113](https://gitlab.indiscale.com/caosdb/src/caosdb-crawler/-/issues/113) + Resolving referenced entities failed in some corner cases. The crawler now + handles cases correctly in which entities retrieved from the server have to be + merged with local entities that both reference another, already existing + entity. +* A corner case in `split_into_inserts_and_updates` whereby two records created + in different places in the cfood definition would not be merged if both were + identified by the same LinkAhead id. +* [#87](https://gitlab.com/linkahead/linkahead-crawler/-/issues/87) Handle long strings more gracefully. The crawler sometimes runs into + [linkahead-server#101](https://gitlab.com/linkahead/linkahead-server/-/issues/101); this is now mitigated. +* [indiscale#128](https://gitlab.indiscale.com/caosdb/src/caosdb-crawler/-/issues/128) Yet another corner case of reference resolution resolved. ## [0.6.0] - 2023-06-23 ## (Florian Spreckelsen) @@ -42,7 +98,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Creation of CrawlerRun Records that contain status information about data integration of the crawler if the `pycaosdb.ini` contains a `[caoscrawler]` with `create_crawler_status_records=True`. -- The Crawler `synchronize` function now takes list of RecordType names. +- The Crawler `synchronize` function now takes a list of RecordType names.
Records that have the given names as parents are excluded from inserts or updates - `Crawler.synchronize` now takes an optional `path_for_authorized_run` argument diff --git a/CITATION.cff b/CITATION.cff index c6d41cc49d74b72056d825ca731fa79897fc537b..de05ce11bc6667e508e3504629ba517a90642aef 100644 --- a/CITATION.cff +++ b/CITATION.cff @@ -1,25 +1,22 @@ cff-version: 1.2.0 message: "If you use this software, please cite it as below." authors: - - family-names: Fitschen - given-names: Timm - orcid: https://orcid.org/0000-0002-4022-432X - - family-names: Schlemmer - given-names: Alexander - orcid: https://orcid.org/0000-0003-4124-9649 - - family-names: Hornung - given-names: Daniel - orcid: https://orcid.org/0000-0002-7846-6375 - family-names: tom Wörden given-names: Henrik orcid: https://orcid.org/0000-0002-5549-578X - - family-names: Parlitz - given-names: Ulrich - orcid: https://orcid.org/0000-0003-3058-1435 + - family-names: Spreckelsen + given-names: Florian + orcid: https://orcid.org/0000-0002-6856-2910 - family-names: Luther given-names: Stefan orcid: https://orcid.org/0000-0001-7214-8125 + - family-names: Parlitz + given-names: Ulrich + orcid: https://orcid.org/0000-0003-3058-1435 + - family-names: Schlemmer + given-names: Alexander + orcid: https://orcid.org/0000-0003-4124-9649 title: CaosDB - Crawler -version: 0.6.0 -doi: 10.3390/data4020083 -date-released: 2023-06-23 \ No newline at end of file +version: 0.7.0 +doi: 10.3390/data9020024 +date-released: 2024-03-04 \ No newline at end of file diff --git a/integrationtests/basic_example/test_basic.py b/integrationtests/basic_example/test_basic.py index d3482a19f0c95912002b2ff68101623476d452ea..c906a81d86af56669f7c522169bceb3b5fcb3e01 100755 --- a/integrationtests/basic_example/test_basic.py +++ b/integrationtests/basic_example/test_basic.py @@ -112,11 +112,11 @@ def crawler_extended(ident): return cr, crawled_data, debug_tree -def test_ambigious_lookup(clear_database, usemodel, crawler, ident): +def test_ambiguous_lookup(clear_database, usemodel, crawler, ident): ins, ups = crawler[0].synchronize(crawled_data=crawler[1]) proj = db.execute_query("FIND Project WITH identifier='SpeedOfLight'", unique=True) - with pytest.raises(RuntimeError, match=".*unambigiously.*"): + with pytest.raises(RuntimeError, match=".*unambiguously.*"): print(crawler[0].identifiableAdapter.retrieve_identified_record_for_identifiable( Identifiable(properties={'project': proj.id}))) diff --git a/integrationtests/test_issues.py b/integrationtests/test_issues.py index 3bdac745f392cf747d4c4a46378047b76b04e2b4..814e82ad75512ec8fe217294e1a9e86c6aa01ab3 100644 --- a/integrationtests/test_issues.py +++ b/integrationtests/test_issues.py @@ -16,23 +16,26 @@ # You should have received a copy of the GNU Affero General Public License # along with this program. If not, see <https://www.gnu.org/licenses/>.
# -from pytest import fixture, mark +from pytest import fixture, mark, raises -import caosdb as db -from caosdb.cached import cache_clear +import linkahead as db +from linkahead.cached import cache_clear +from caosadvancedtools.models.parser import parse_model_from_string from caoscrawler.crawl import Crawler +from caoscrawler.identifiable import Identifiable from caoscrawler.identifiable_adapters import CaosDBIdentifiableAdapter from caoscrawler.structure_elements import DictElement from caoscrawler.scanner import create_converter_registry, scan_structure_elements -from caosdb.utils.register_tests import clear_database, set_test_key +from linkahead.utils.register_tests import clear_database, set_test_key set_test_key("10b128cf8a1372f30aa3697466bb55e76974e0c16a599bb44ace88f19c8f61e2") @fixture(autouse=True) def clear_cache(): + """Clear the LinkAhead cache.""" cache_clear() @@ -208,3 +211,121 @@ def test_issue_83(clear_database): assert len(retrieved_referencing3.get_property(referenced_type.name).value) == 2 assert retrieved_target1.id in retrieved_referencing3.get_property(referenced_type.name).value assert retrieved_target2.id in retrieved_referencing3.get_property(referenced_type.name).value + + +def test_indiscale_113(clear_database): + """Somewhat mysterious failures to resolve references in + split_into_inserts_and_updates, see + https://gitlab.indiscale.com/caosdb/src/caosdb-crawler/-/issues/113 + + """ + + # Create and insert minimal datamodel + datamodel_str = """ +Event: + recommended_properties: + Basis: + Campaign: +Basis: +Campaign: + recommended_properties: + Basis: +""" + model = parse_model_from_string(datamodel_str) + model.sync_data_model(noquestion=True) + + # Register identifiables, everything is identified by name + ident = CaosDBIdentifiableAdapter() + ident.register_identifiable("Event", db.RecordType().add_parent( + name="Event").add_property(name="name")) + ident.register_identifiable("Basis", db.RecordType().add_parent( + name="Basis").add_property(name="name")) + ident.register_identifiable("Campaign", db.RecordType().add_parent( + name="Campaign").add_property(name="name")) + + crawler = Crawler(identifiableAdapter=ident) + + # Add records: event references basis and campaign, campaign references + # basis. + basis = db.Record(name="Poseidon").add_parent(name="Basis") + campaign = db.Record(name="POS386").add_parent( + name="Campaign").add_property(name="Basis", value=basis) + event = db.Record(name="GeoB13952").add_parent(name="Event") + event.add_property(name="Basis", value=basis) + event.add_property(name="Campaign", value=campaign) + + # basis and campaign already exist in the db + db.Container().extend([basis, campaign]).insert() + # redefine to trigger resolving + basis = db.Record(name="Poseidon").add_parent(name="Basis") + campaign = db.Record(name="POS386").add_parent( + name="Campaign").add_property(name="Basis", value=basis) + recs = [event, basis, campaign] + + ins, ups = crawler.synchronize(crawled_data=recs, unique_names=False) + # There is only one event to be inserted + assert len(ins) == 1 + # Nothing to do for the existing ents + assert len(ups) == 0 + assert ins[0].name == event.name + + +def test_indiscale_87(clear_database): + """Handle long string queries gracefully. 
+ + https://gitlab.com/linkahead/linkahead-crawler/-/issues/87 + """ + + prop = db.Property(name="str", datatype=db.TEXT).insert() + rt = db.RecordType(name="RT1").add_property(prop).insert() + strings = [ + "X123456789" * 26, + "X" * 260, + "X123456789" * 25 + "9876543210", + ] + recs = [ + db.Record().add_parent(rt).add_property(name="str", value=string).insert() + for string in strings + ] + idents = [ + Identifiable(record_type="RT1", properties={"str": string}) + for string in strings + ] + adapter = CaosDBIdentifiableAdapter() + for rec, ident in zip(recs, idents): + print(f"Testing: ...{rec.get_property('str').value[-10:]}") + retrieved = adapter.retrieve_identified_record_for_identifiable(ident) + # print(rec) + # print(retrieved) + print(db.apiutils.compare_entities(rec, retrieved)) + assert db.apiutils.empty_diff(rec, retrieved) + print("---") + + # add another, harmless, property + prop2 = db.Property(name="someint", datatype=db.INTEGER).insert() + rt.add_property(prop2).update() + string = "Y123456789" * 26 + numbers = [23, 42] + recs = [ + db.Record().add_parent(rt).add_property(name="str", value=string).add_property( + name="someint", value=number).insert() + for number in numbers + ] + idents = [Identifiable(record_type="RT1", properties={"str": string})] + # Ambiguous result + with raises(RuntimeError, match=".*unambiguously.*"): + retrieved = adapter.retrieve_identified_record_for_identifiable(idents[0]) + + # Upgrade new property to be identifying + idents = [ + Identifiable(record_type="RT1", properties={"str": string, "someint": number}) + for number in numbers + ] + for rec, ident in zip(recs, idents): + print(f"Testing: someint={rec.get_property('someint').value}") + retrieved = adapter.retrieve_identified_record_for_identifiable(ident) + # print(rec) + # print(retrieved) + print(db.apiutils.compare_entities(rec, retrieved)) + assert db.apiutils.empty_diff(rec, retrieved) + print("---") diff --git a/integrationtests/test_realworld_example.py b/integrationtests/test_realworld_example.py index 82644947a3cdc85a38be3403615b51fe1f4ded50..fbbf25643e1c1cf928aa9599c92d3d6e94a88974 100644 --- a/integrationtests/test_realworld_example.py +++ b/integrationtests/test_realworld_example.py @@ -24,24 +24,22 @@ """ an integration test module that runs a test against a (close to) real world example """ -from caosdb.utils.register_tests import clear_database, set_test_key -import logging import json +import logging import os +import pytest +import sys -import caosdb as db -from caosdb.cached import cache_clear +import linkahead as db +from linkahead.cached import cache_clear +from linkahead.utils.register_tests import clear_database, set_test_key +from caosadvancedtools.loadFiles import loadpath +from caosadvancedtools.models.parser import parse_model_from_json_schema, parse_model_from_yaml from caoscrawler.crawl import Crawler, crawler_main from caoscrawler.identifiable_adapters import CaosDBIdentifiableAdapter -from caoscrawler.structure_elements import Directory -import pytest -from caosadvancedtools.models.parser import parse_model_from_json_schema, parse_model_from_yaml -from caosadvancedtools.loadFiles import loadpath - from caoscrawler.scanner import load_definition, scan_structure_elements, create_converter_registry - -import sys +from caoscrawler.structure_elements import Directory set_test_key("10b128cf8a1372f30aa3697466bb55e76974e0c16a599bb44ace88f19c8f61e2") @@ -91,15 +89,6 @@ def usemodel(): dataset_inherits.sync_data_model(noquestion=True) -@pytest.fixture -def 
clear_database(): - # TODO(fspreck): Remove once the corresponding advancedtools function can - # be used. - ents = db.execute_query("FIND ENTITY WITH ID>99") - if ents: - ents.delete() - - def create_identifiable_adapter(): ident = CaosDBIdentifiableAdapter() ident.load_from_yaml_definition(os.path.join(DATADIR, "identifiables.yml")) diff --git a/setup.cfg b/setup.cfg index 32edcde630172cb991ea28898ae0c5e9f5770f90..1e8abb8eeee2ba4b861a71880c96d943f813c812 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,6 +1,6 @@ [metadata] name = caoscrawler -version = 0.6.1 +version = 0.7.1 author = Alexander Schlemmer author_email = alexander.schlemmer@ds.mpg.de description = A new crawler for caosdb @@ -19,9 +19,9 @@ package_dir = packages = find: python_requires = >=3.7 install_requires = - importlib-resources - caosadvancedtools >= 0.7.0 - linkahead >= 0.13.1 + importlib-resources + caosadvancedtools >= 0.7.0 + linkahead > 0.13.2 yaml-header-tools >= 0.2.1 pyyaml odfpy #make optional @@ -39,3 +39,8 @@ per-file-ignores = __init__.py:F401 [options.entry_points] console_scripts = caosdb-crawler = caoscrawler.crawl:main + +[options.extras_require] +h5-crawler = + h5py >= 3.8 + numpy diff --git a/src/caoscrawler/cfood-schema.yml b/src/caoscrawler/cfood-schema.yml index b0d77bbf5d7ba09df3c0c47d656fa3d22d07b6d2..5a6e1e50345382ca6e5a1e6ef3a8fbeafb806b84 100644 --- a/src/caoscrawler/cfood-schema.yml +++ b/src/caoscrawler/cfood-schema.yml @@ -31,6 +31,10 @@ cfood: - JSONFile - CSVTableConverter - XLSXTableConverter + - H5File + - H5Dataset + - H5Group + - H5Ndarray description: Type of this converter node. match: description: typically a regexp which is matched to a structure element name diff --git a/src/caoscrawler/converters.py b/src/caoscrawler/converters.py index b6fa8433fc3aab4668d86bb5c7920b21377ba87d..98c2a7362f38d8a96ab1b35cb0b02518173d9982 100644 --- a/src/caoscrawler/converters.py +++ b/src/caoscrawler/converters.py @@ -32,10 +32,11 @@ import os import re import warnings from abc import ABCMeta, abstractmethod +from inspect import signature from string import Template -from typing import List, Optional, Tuple, Union +from typing import Any, List, Optional, Tuple, Union -import caosdb as db +import linkahead as db import pandas as pd import yaml import yaml_header_tools @@ -52,7 +53,7 @@ from .utils import has_parent # by the converters: SPECIAL_PROPERTIES = ("description", "name", "id", "path", "file", "checksum", "size") - +SINGLE_VAR_RE = re.compile(r"^\$(\{)?(?P<varname>[0-9a-zA-Z_]+)(\})?$") logger = logging.getLogger(__name__) @@ -128,35 +129,34 @@ def create_path_value(func): return inner -def replace_variables(propvalue, values: GeneralStore): +def replace_variables(propvalue: Any, values: GeneralStore): """ This function replaces variables in property values (and possibly other locations, where the crawler can replace cfood-internal variables). - This function checks whether the value that is to be replaced is of type db.Entity. - In this case the entity is returned (note that this is of course only possible, if the - occurrence of the variable is directly at the beginning of the value and e.g. no string - concatenation is attempted. - - In any other case the variable substitution is carried out and a new string with the - replaced variables is returned. + If `propvalue` is a single variable name preceded by a '$' (e.g. '$var' or '${var}'), then + the corresponding value stored in `values` is returned.
+ In any other case the variable substitution is carried out as defined by string templates + and a new string with the replaced variables is returned. """ + # We only replace string variable names. If it is not a string the value stays unchanged + if not isinstance(propvalue, str): + return propvalue + # Check if the replacement is a single variable containing a record: - match = re.match(r"^\$(\{)?(?P<varname>[0-9a-zA-Z_]+)(\})?$", propvalue) + match = SINGLE_VAR_RE.match(propvalue) if match is not None: varname = match.group("varname") if varname in values: - if values[varname] is None: - return None - if isinstance(values[varname], db.Entity): - return values[varname] + return values[varname] propvalue_template = CrawlerTemplate(propvalue) return propvalue_template.safe_substitute(**values.get_storage()) def handle_value(value: Union[dict, str, list], values: GeneralStore): - """Determine whether the given value needs to set a property, be added to an existing value (create a list) or + """Determine whether the given value needs to set a property, + be added to an existing value (create a list) or add as an additional property (multiproperty). Variable names (starting with a "$") are replaced by the corresponding value stored in the @@ -204,7 +204,6 @@ out: tuple # being able to directly set list values. Semantics is, however, a bit # different from the two cases above. collection_mode = "single" - propvalue = value # variables replacement: propvalue = list() @@ -319,6 +318,16 @@ class Converter(object, metaclass=ABCMeta): """Converters treat StructureElements contained in the hierarchical sturcture.""" def __init__(self, definition: dict, name: str, converter_registry: dict): + """ + + Parameters + ---------- + definition: dict, Please refer to ``src/doc/converters.rst`` to learn about the structure + that the definition dict must have. + converter_registry: dict, A dictionary that contains converter names as keys and dicts as + values. Those value dicts have the keys 'converter' and 'package'. + """ + self.definition = definition self.name = name @@ -328,6 +337,23 @@ class Converter(object, metaclass=ABCMeta): } self.converters = [] + if "transform" in self.definition: + if not isinstance(self.definition["transform"], dict): + raise RuntimeError("The value corresponding to the 'transform' key in the " + "converter definition must be a dict") + for transformer_key, transformer in self.definition["transform"].items(): + if "in" not in transformer: + raise RuntimeError("In-variable not defined!") + if "out" not in transformer: + raise RuntimeError("Out-variable not defined!") + if "functions" not in transformer: + raise RuntimeError("No functions given for transformer!") + if not isinstance(transformer["functions"], list): + raise RuntimeError("The value corresponding to the 'functions' key in the " + "transform section must be a list") + + if not isinstance(transformer["in"], str): + raise RuntimeError("You should provide the variable name as string") if "subtree" in definition: for converter_name in definition['subtree']: @@ -363,8 +389,14 @@ class Converter(object, metaclass=ABCMeta): Extract information from the structure element and store them as values in the general store. - values: The GeneralStore to store values in. - element: The StructureElement to extract values from. + Parameters: + ------------ + + values: GeneralStore + The GeneralStore to store values in. + + element: StructureElement + The StructureElement to extract values from. 
""" m = self.match(element) if m is None: @@ -372,6 +404,61 @@ class Converter(object, metaclass=ABCMeta): raise RuntimeError("Condition does not match.") values.update(m) + def apply_transformers(self, values: GeneralStore, transformer_functions: dict): + """ + Check if transformers are defined using the "transform" keyword. + Then apply the transformers to the variables defined in GeneralStore "values". + + Parameters: + ------------ + + values: GeneralStore + The GeneralStore to store values in. + + transformer_functions: dict + A dictionary of registered functions that can be used within this transformer block. + The keys of the dict are the function keys and the values the callable functions of the + form: + + def func(in_value: Any, in_parameters: dict) -> Any: + pass + """ + + if not "transform" in self.definition: + return + for transformer_key, transformer in self.definition["transform"].items(): + in_value = replace_variables(transformer["in"], values) + + for tr_func_el in transformer["functions"]: + if not isinstance(tr_func_el, dict): + raise RuntimeError("Elements of the list of the functions key " + "must be dictonaries!") + if len(tr_func_el) != 1: + raise RuntimeError("List element dictionaries must have exactly" + " one element with they key being the name" + " of the function!") + tr_func_key = list(tr_func_el.keys())[0] + tr_func_params = tr_func_el[tr_func_key] + if tr_func_key not in transformer_functions: + raise RuntimeError("Unknown transformer function: {}".format(tr_func_key)) + + # Retrieve the function from the dictionary: + tr_func = transformer_functions[tr_func_key] + # Call the function: + sig = signature(tr_func) + if len(sig.parameters) == 1 and len(tr_func_params) == 0: + out_value = tr_func(in_value) + else: + out_value = tr_func(in_value, tr_func_params) + # The next in_value is the current out_value: + in_value = out_value + # If everything succeeded, store the final value in the general store: + match = SINGLE_VAR_RE.match(transformer["out"]) + if match is None: + raise RuntimeError("'out' of the transformer definition must specify a single" + f" variable name. 
It was {transformer['out']}") + values[match.group('varname')] = out_value + @abstractmethod def create_children(self, values: GeneralStore, element: StructureElement): @@ -429,14 +516,14 @@ class Converter(object, metaclass=ABCMeta): """ Template for the debugging output for the match function """ msg = "\n--------" + name + "-----------\n" for re, ma in zip(regexp, matched): - msg += "matching against:\n" + re + "\n" - msg += "matching:\n" + ma + "\n" + msg += "matching reg:\t" + re + "\n" + msg += "matching val:\t" + ma + "\n" msg += "---------\n" if result is None: - msg += "No match" + msg += "No match\n" else: msg += "Matched groups:\n" - msg += str(result) + msg += str(result)+'\n' msg += "----------------------------------------\n" logger.debug(msg) @@ -597,6 +684,15 @@ class MarkdownFileConverter(SimpleFileConverter): "Error during the validation (yaml header cannot be read) of the markdown file " "located at the following node in the data structure:\n" f"{path}") + except yaml_header_tools.ParseErrorsInHeader as err: + if generalStore is not None and self.name in generalStore: + path = generalStore[self.name] + else: + path = "<path not set>" + raise ConverterValidationError( + "Error during the validation (yaml header cannot be read) of the markdown file " + "located at the following node in the data structure:\n" + "{}\nError:\n{}".format(path, err)) children: List[StructureElement] = [] for name, entry in header.items(): @@ -605,8 +701,12 @@ class MarkdownFileConverter(SimpleFileConverter): elif type(entry) == str: children.append(TextElement(name, entry)) else: + if generalStore is not None and self.name in generalStore: + path = generalStore[self.name] + else: + path = "<path not set>" raise RuntimeError( - "Header entry {} has incompatible type.".format(name)) + "Header entry {} has incompatible type.\nFilename: {}".format(name, path)) return children @@ -985,16 +1085,13 @@ class ListElementConverter(Converter): "This converter can only process DictListElements.") children: list[StructureElement] = [] for index, list_element in enumerate(element.value): - # TODO(fspreck): Refactor this and merge with DictXXXElements maybe? - if isinstance(list_element, str): - children.append(TextElement(str(index), list_element)) - elif isinstance(list_element, dict): - children.append(DictElement(str(index), list_element)) - elif isinstance(list_element, StructureElement): - children.append(list_element) - else: - raise NotImplementedError( - f"Unkown type {type(list_element)} in list element {list_element}.") + children.append( + convert_basic_element( + list_element, + name=f"{index}", + msg_prefix=f"The value at index {index} in the list has an unknown type." + ) + ) return children def typecheck(self, element: StructureElement): diff --git a/src/caoscrawler/crawl.py b/src/caoscrawler/crawl.py index cf5d134139edfc46ac2b97318503e21abd0ce7c3..d21e6e2521578dc407e445d8220506677be84e26 100644 --- a/src/caoscrawler/crawl.py +++ b/src/caoscrawler/crawl.py @@ -3,10 +3,11 @@ # # This file is a part of the CaosDB Project.
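To make the two modes of the rewritten `replace_variables` (above in `converters.py`) concrete: a value that is exactly `$var` or `${var}` is replaced by the stored object itself, keeping its type; any other string goes through template substitution. A simplified, hedged sketch of that contract, with a plain dict standing in for `GeneralStore` and `string.Template` for the crawler's `CrawlerTemplate`:

```python
# Simplified model of replace_variables' contract (illustrative only).
import re
from string import Template

SINGLE_VAR_RE = re.compile(r"^\$(\{)?(?P<varname>[0-9a-zA-Z_]+)(\})?$")

def replace_variables_sketch(propvalue, values: dict):
    if not isinstance(propvalue, str):
        return propvalue                       # non-strings pass through
    match = SINGLE_VAR_RE.match(propvalue)
    if match is not None and match.group("varname") in values:
        return values[match.group("varname")]  # original type preserved
    return Template(propvalue).safe_substitute(**values)

store = {"count": 42, "name": "sample1"}
assert replace_variables_sketch("$count", store) == 42   # stays an int
assert replace_variables_sketch(3.14, store) == 3.14     # untouched
assert replace_variables_sketch("id_${name}", store) == "id_sample1"
```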
# -# Copyright (C) 2021 Henrik tom Wörden <h.tomwoerden@indiscale.com> -# 2021-2023 Research Group Biomedical Physics, -# Max-Planck-Institute for Dynamics and Self-Organization Göttingen -# Alexander Schlemmer <alexander.schlemmer@ds.mpg.de> +# Copyright (C) 2021-2024 Henrik tom Wörden <h.tomwoerden@indiscale.com> +# Copyright (C) 2021-2023 Research Group Biomedical Physics, MPI-DS Göttingen +# Copyright (C) 2021-2023 Alexander Schlemmer <alexander.schlemmer@ds.mpg.de> +# Copyright (C) 2021-2024 Indiscale GmbH <info@indiscale.com> +# Copyright (C) 2024 Daniel Hornung <d.hornung@indiscale.com> # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU Affero General Public License as @@ -26,13 +27,12 @@ """ Crawl a file structure using a yaml cfood definition and synchronize -the acuired data with CaosDB. +the acuired data with LinkAhead. """ from __future__ import annotations import argparse -import importlib import logging import os import sys @@ -40,41 +40,37 @@ import traceback import uuid import warnings from argparse import RawTextHelpFormatter -from collections import defaultdict from copy import deepcopy from datetime import datetime from enum import Enum -from typing import Any, Optional, Type, Union +from typing import Any, List, Optional, Union -import caosdb as db +import linkahead as db import yaml -from caosadvancedtools.cache import Cache, UpdateCache +from caosadvancedtools.cache import UpdateCache from caosadvancedtools.crawler import Crawler as OldCrawler from caosadvancedtools.serverside.helper import send_mail from caosadvancedtools.utils import create_entity_link -from caosdb.apiutils import (EntityMergeConflictError, compare_entities, - merge_entities) -from caosdb.cached import cache_clear, cached_get_entity_by -from caosdb.common.datatype import is_reference -from caosdb.exceptions import EmptyUniqueQueryError -from importlib_resources import files -from jsonschema import validate +from linkahead.apiutils import (EntityMergeConflictError, compare_entities, + merge_entities) +from linkahead.cached import cache_clear, cached_get_entity_by +from linkahead.common.datatype import get_list_datatype, is_reference +from linkahead.exceptions import EmptyUniqueQueryError +from linkahead.utils.escape import escape_squoted_text from .config import get_config_setting -from .converters import Converter, ConverterValidationError, DirectoryConverter +from .converters import Converter, ConverterValidationError from .debug_tree import DebugTree from .identifiable import Identifiable from .identifiable_adapters import (CaosDBIdentifiableAdapter, IdentifiableAdapter, LocalStorageIdentifiableAdapter) -from .identified_cache import IdentifiedCache from .logging import configure_server_side_logging from .macros import defmacro_constructor, macro_constructor from .scanner import (create_converter_registry, initialize_converters, load_definition, scan_directory, scan_structure_elements) -from .stores import GeneralStore, RecordStore -from .structure_elements import Directory, NoneElement, StructureElement -from .version import check_cfood_version +from .stores import GeneralStore +from .structure_elements import StructureElement logger = logging.getLogger(__name__) @@ -176,12 +172,163 @@ def _resolve_datatype(prop: db.Property, remote_entity: db.Entity): return prop +def _treat_merge_error_of(newrecord, record): + """ + The parameters are two entities that cannot be merged with the merge_entities function. 
+ + # This function checks for two obvious cases where no merge will ever be possible: + # 1. Two Entities with differing IDs + # 2. Two non-Entity values which differ + + It creates a more informative logger message and raises an Exception in those cases. + """ + for this_p in newrecord.properties: + that_p = record.get_property(this_p.name) + + if that_p is None: + logger.debug(f"Property {this_p.name} does not exist in the second entity. Note that " + "this should not be the reason for the merge conflict.") + continue + + if (isinstance(this_p.value, db.Entity) + and isinstance(that_p.value, db.Entity)): + if this_p.value.id is not None and that_p.value.id is not None: + if this_p.value.id != that_p.value.id: + logger.error("The Crawler is trying to merge two entities " + "because they should be the same object (same" + " identifiables), but they reference " + "different Entities with the same Property." + f"Problematic Property: {this_p.name}\n" + f"Referenced Entities: {this_p.value.id} and " + f"{that_p.value.id}\n" + f"{record}\n{newrecord}") + raise RuntimeError("Cannot merge Entities") + elif (not isinstance(this_p.value, db.Entity) + and not isinstance(that_p.value, db.Entity)): + if ((this_p.value != that_p.value) + # TODO can we also compare lists? + and not isinstance(this_p.value, list) + and not isinstance(that_p.value, list)): + logger.error( + "The Crawler is trying to merge two entities because they should be the same " + "object (same identifiables), but they have different values for the same " + "Property.\n" + f"Problematic Property: {this_p.name}\n" + f"Values: {this_p.value} and {that_p.value}\n" + f"{record}\n{newrecord}") + raise RuntimeError("Cannot merge Entities") + + class SecurityMode(Enum): RETRIEVE = 0 INSERT = 1 UPDATE = 2 +class TreatedRecordLookUp(): + """tracks Records and Identifiables for which it was checked whether they exist in the remote + server + + For a given Record it can be checked, whether it exists in the remote sever if + - it has a (valid) ID + - it has a (valid) path (FILEs only) + - an identifiable can be created for the Record. + + Records are added by calling the `add` function and they are then added to the internal + existing or missing list depending on whether the Record has a valid ID. + Additionally, the Record is added to three look up dicts. The keys of those are paths, IDs and + the representation of the identifiables. + + The extreme case, that one could imagine, would be that the same Record occurs three times as + different Python objects: one that only has an ID, one with only a path and one without ID and + path but with identifying properties. During `split_into_inserts_and_updates` all three + must be identified with each other (and must be merged). Since we require, that treated + entities have a valid ID if they exist in the remote server, all three objects would be + identified with each other simply using the IDs. + + In the case that the Record is not yet in the remote server, there cannot be a Python object + with an ID. Thus we might have one with a path and one with an identifiable. If that Record + does not yet exist, it is necessary that both Python objects have at least either the path or + the identifiable in common. 
+ """ + + def __init__(self): + self._id_look_up: dict[int, db.Entity] = {} + self._path_look_up: dict[str, db.Entity] = {} + self._identifiable_look_up: dict[str, db.Entity] = {} + self.remote_missing_counter = -1 + self._missing: dict[int, db.Entity] = {} + self._existing: dict[int, db.Entity] = {} + + def add(self, record: db.Entity, identifiable: Optional[Identifiable] = None): + """ + Add a Record that was treated, such that it is contained in the internal look up dicts + + This Record MUST have an ID if it was found in the remote server. + """ + if record.id is None: + if record.path is None and identifiable is None: + raise RuntimeError("Record must have ID or path or an identifiable must be given." + f"Record is\n{record}") + record.id = self.remote_missing_counter + self.remote_missing_counter -= 1 + self._add_any(record, self._missing, identifiable) + else: + self._add_any(record, self._existing, identifiable) + + def get_any(self, record: db.Entity, identifiable: Optional[Identifiable] = None): + """ + Check whether this Record was already added. Identity is based on ID, path or Identifiable + represenation + """ + if record.id is not None and record.id in self._id_look_up: + return self._id_look_up[record.id] + if record.path is not None and record.path in self._path_look_up: + return self._path_look_up[record.path] + if (identifiable is not None and identifiable.get_representation() in + self._identifiable_look_up): + return self._identifiable_look_up[identifiable.get_representation()] + + def get_existing(self, record: db.Entity, identifiable: Optional[Identifiable] = None): + """ Check whether this Record exists on the remote server + + Returns: The stored Record + """ + rec = self.get_any(record, identifiable) + if id(rec) in self._existing: + return rec + else: + return None + + def get_missing(self, record: db.Entity, identifiable: Optional[Identifiable] = None): + """ Check whether this Record is missing on the remote server + + Returns: The stored Record + """ + rec = self.get_any(record, identifiable) + if id(rec) in self._missing: + return rec + else: + return None + + def get_missing_list(self): + """ Return all Records that are missing in the remote server """ + return list(self._missing.values()) + + def get_existing_list(self): + """ Return all Records that exist in the remote server """ + return list(self._existing.values()) + + def _add_any(self, record: db.Entity, lookup, identifiable: Optional[Identifiable] = None): + if record.id is not None: + self._id_look_up[record.id] = record + if record.path is not None: + self._path_look_up[record.path] = record + if identifiable is not None: + self._identifiable_look_up[identifiable.get_representation()] = record + lookup[id(record)] = record + + class Crawler(object): """ Crawler class that encapsulates crawling functions. @@ -218,8 +365,8 @@ class Crawler(object): # The following caches store records, where we checked whether they exist on the remote # server. Since, it is important to know whether they exist or not, we store them into two # different caches. - self.remote_existing_cache = IdentifiedCache() - self.remote_missing_cache = IdentifiedCache() + self.treated_records_lookup = TreatedRecordLookUp() + # TODO does it make sense to have this as member variable? self.securityMode = securityMode # TODO does it make sense to have this as member variable(run_id)? 
@@ -304,7 +451,8 @@ class Crawler(object): def _has_reference_value_without_id(self, ident: Identifiable) -> bool: """ - Returns True if there is at least one value in the properties attribute of ``ident`` which: + Returns True if there is at least one value in the properties and backrefs attributes of + ``ident`` which: a) is a reference property AND b) where the value is set to a @@ -365,7 +513,7 @@ class Crawler(object): Crawler.create_flat_list([p.value], flat) return flat - def _has_missing_object_in_references(self, ident: Identifiable, referencing_entities: list): + def _has_missing_object_in_references(self, ident: Identifiable, referencing_entities: dict): """ returns False if any value in the properties attribute is a db.Entity object that is contained in the `remote_missing_cache`. If ident has such an object in @@ -378,16 +526,21 @@ class Crawler(object): # Entity instead of ID and not cached locally if (isinstance(pvalue, list)): for el in pvalue: - if (isinstance(el, db.Entity) and self.get_from_remote_missing_cache( - self.identifiableAdapter.get_identifiable(el, referencing_entities)) is not None): + elident = self.identifiableAdapter.get_identifiable( + el, referencing_entities[id(el)]) + if (isinstance(el, db.Entity) + and self.treated_records_lookup.get_missing(el, elident) is not None): return True - if (isinstance(pvalue, db.Entity) and self.get_from_remote_missing_cache( - self.identifiableAdapter.get_identifiable(pvalue, referencing_entities)) is not None): + if (isinstance(pvalue, db.Entity) and self.treated_records_lookup.get_missing( + pvalue, + self.identifiableAdapter.get_identifiable(pvalue, + referencing_entities[id(pvalue)]) + ) is not None): # might be checked when reference is resolved return True return False - def replace_references_with_cached(self, record: db.Record, referencing_entities: list): + def replace_references_with_cached(self, record: db.Record, referencing_entities: dict): """ Replace all references with the versions stored in the cache. 
@@ -398,8 +551,10 @@ class Crawler(object): lst = [] for el in p.value: if (isinstance(el, db.Entity) and el.id is None): - cached = self.get_from_any_cache( - self.identifiableAdapter.get_identifiable(el, referencing_entities)) + cached = self.treated_records_lookup.get_any( + el, + self.identifiableAdapter.get_identifiable( + el, referencing_entities[id(el)])) if cached is None: lst.append(el) continue @@ -407,12 +562,12 @@ class Crawler(object): if isinstance(p.value, db.File): if p.value.path != cached.path: raise RuntimeError( - "The cached and the refernced entity are not identical.\n" + "The cached and the referenced entity are not identical.\n" f"Cached:\n{cached}\nReferenced:\n{el}" ) else: raise RuntimeError( - "The cached and the refernced entity are not identical.\n" + "The cached and the referenced entity are not identical.\n" f"Cached:\n{cached}\nReferenced:\n{el}" ) lst.append(cached) @@ -420,82 +575,25 @@ class Crawler(object): lst.append(el) p.value = lst if (isinstance(p.value, db.Entity) and p.value.id is None): - cached = self.get_from_any_cache( - self.identifiableAdapter.get_identifiable(p.value, referencing_entities)) + cached = self.treated_records_lookup.get_any( + p.value, self.identifiableAdapter.get_identifiable( + p.value, referencing_entities[id(p.value)])) if cached is None: continue if not check_identical(cached, p.value, True): if isinstance(p.value, db.File): if p.value.path != cached.path: raise RuntimeError( - "The cached and the refernced entity are not identical.\n" + "The cached and the referenced entity are not identical.\n" f"Cached:\n{cached}\nReferenced:\n{p.value}" ) else: raise RuntimeError( - "The cached and the refernced entity are not identical.\n" + "The cached and the referenced entity are not identical.\n" f"Cached:\n{cached}\nReferenced:\n{p.value}" ) p.value = cached - def get_from_remote_missing_cache(self, identifiable: Identifiable): - """ - returns the identified record if an identifiable with the same values already exists locally - (Each identifiable that is not found on the remote server, is 'cached' locally to prevent - that the same identifiable exists twice) - """ - if identifiable is None: - raise ValueError("Identifiable has to be given as argument") - - if identifiable in self.remote_missing_cache: - return self.remote_missing_cache[identifiable] - else: - return None - - def get_from_any_cache(self, identifiable: Identifiable): - """ - returns the identifiable if an identifiable with the same values already exists locally - (Each identifiable that is not found on the remote server, is 'cached' locally to prevent - that the same identifiable exists twice) - """ - if identifiable is None: - raise ValueError("Identifiable has to be given as argument") - - if identifiable in self.remote_existing_cache: - return self.remote_existing_cache[identifiable] - elif identifiable in self.remote_missing_cache: - return self.remote_missing_cache[identifiable] - else: - return None - - def add_to_remote_missing_cache(self, record: db.Record, identifiable: Identifiable): - """ - stores the given Record in the remote_missing_cache. - - If identifiable is None, the Record is NOT stored. - """ - self.add_to_cache(record=record, cache=self.remote_missing_cache, - identifiable=identifiable) - - def add_to_remote_existing_cache(self, record: db.Record, identifiable: Identifiable): - """ - stores the given Record in the remote_existing_cache. - - If identifiable is None, the Record is NOT stored. 
- """ - self.add_to_cache(record=record, cache=self.remote_existing_cache, - identifiable=identifiable) - - def add_to_cache(self, record: db.Record, cache: IdentifiedCache, - identifiable: Identifiable) -> None: - """ - stores the given Record in the given cache. - - If identifiable is None, the Record is NOT stored. - """ - if identifiable is not None: - cache.add(identifiable=identifiable, record=record) - @staticmethod def bend_references_to_new_object(old, new, entities): """ Bend references to the other object @@ -512,23 +610,73 @@ class Crawler(object): if p.value is old: p.value = new + def _merge_identified(self, newrecord, record, try_to_merge_later, all_records): + """ tries to merge record into newrecord + + If it fails, record is added to the try_to_merge_later list. + In any case, references are bent to the newrecord object. + + """ + try: + merge_entities( + newrecord, record, merge_references_with_empty_diffs=False, + merge_id_with_resolved_entity=True) + except EntityMergeConflictError: + _treat_merge_error_of(newrecord, record) + # We cannot merge but it is none of the clear case where merge is + # impossible. Thus we try later + try_to_merge_later.append(record) + if newrecord.id is not None: + record.id = newrecord.id + except NotImplementedError: + print(newrecord) + print(record) + raise + Crawler.bend_references_to_new_object( + old=record, new=newrecord, + entities=all_records + ) + + def _identity_relies_on_unchecked_entities(self, record: db.Record, referencing_entities): + """ + If a record for which it could not yet be verified whether it exists in LA or not is part + of the identifying properties, this returns True, otherwise False + """ + + registered_identifiable = self.identifiableAdapter.get_registered_identifiable(record) + if registered_identifiable is None: + return False + refs = self.identifiableAdapter.get_identifying_referencing_entities(referencing_entities, + registered_identifiable) + if any(el is None for el in refs): + return True + + refs = self.identifiableAdapter.get_identifying_referenced_entities( + record, registered_identifiable) + if any([self.treated_records_lookup.get_any(el) is None for el in refs]): + return True + + return False + @staticmethod def create_reference_mapping(flat: list[db.Entity]): """ Create a dictionary of dictionaries of the form: - dict[int, dict[str, list[db.Entity]]] + dict[int, dict[str, list[Union[int,None]]]] - The integer index is the Python id of the value object. - The string is the name of the first parent of the referencing object. Each value objects is taken from the values of all properties from the list flat. - So the returned mapping maps ids of entities to the objects which are referring + So the returned mapping maps ids of entities to the ids of objects which are referring to them. """ # TODO we need to treat children of RecordTypes somehow. 
- references: dict[int, dict[str, list[db.Entity]]] = {} + references: dict[int, dict[str, list[Union[int, None]]]] = {} for ent in flat: + if id(ent) not in references: + references[id(ent)] = {} for p in ent.properties: val = p.value if not isinstance(val, list): @@ -539,128 +687,158 @@ class Crawler(object): references[id(v)] = {} if ent.parents[0].name not in references[id(v)]: references[id(v)][ent.parents[0].name] = [] - references[id(v)][ent.parents[0].name].append(ent) + references[id(v)][ent.parents[0].name].append(ent.id) return references def split_into_inserts_and_updates(self, ent_list: list[db.Entity]): - to_be_inserted: list[db.Entity] = [] - to_be_updated: list[db.Entity] = [] flat = Crawler.create_flat_list(ent_list) + all_records = list(flat) # TODO: can the following be removed at some point for ent in flat: if ent.role == "Record" and len(ent.parents) == 0: raise RuntimeError(f"Records must have a parent.\n{ent}") - resolved_references = True + try_to_merge_later = [] + + # Check whether Records can be identified without identifiable + for i in reversed(range(len(flat))): + record = flat[i] + # 1. Can it be identified via an ID? + if record.id is not None: + treated_record = self.treated_records_lookup.get_existing(record) + if treated_record is not None: + self._merge_identified(treated_record, record, try_to_merge_later, all_records) + all_records.remove(record) + referencing_entities = self.create_reference_mapping(all_records) + else: + self.treated_records_lookup.add(record, None) + assert record.id + del flat[i] + # 2. Can it be identified via a path? + elif record.path is not None: + try: + existing = cached_get_entity_by(path=record.path) + except EmptyUniqueQueryError: + existing = None + if existing is not None: + record.id = existing.id + # TODO check the following copying of _size and _checksum + # Copy over checksum and size too if it is a file + record._size = existing._size + record._checksum = existing._checksum + treated_record = self.treated_records_lookup.get_any(record) + if treated_record is not None: + self._merge_identified(treated_record, record, try_to_merge_later, all_records) + all_records.remove(record) + referencing_entities = self.create_reference_mapping(all_records) + else: + # TODO add identifiable if possible + self.treated_records_lookup.add(record, None) + assert record.id + del flat[i] + + entity_was_treated = True # flat contains Entities which could not yet be checked against the remote server - while resolved_references and len(flat) > 0: - resolved_references = False - referencing_entities = self.create_reference_mapping( - flat + to_be_updated + to_be_inserted) + while entity_was_treated and len(flat) > 0: + entity_was_treated = False + referencing_entities = self.create_reference_mapping(all_records) # For each element we try to find out whether we can find it in the server or whether # it does not yet exist. Since a Record may reference other unkown Records it might not # be possible to answer this right away. # The following checks are done on each Record: - # 1. Can it be identified via an ID? - # 2. Can it be identified via a path? - # 3. Is it in the cache of already checked Records? - # 4. Can it be checked on the remote server? - # 5. Does it have to be new since a needed reference is missing? + # 1. Is it in the cache of already checked Records? + # 2. Can it be checked on the remote server? + # 3. Does it have to be new since a needed reference is missing? 
                for i in reversed(range(len(flat))):
                    record = flat[i]
+
+                    if self._identity_relies_on_unchecked_entities(record,
+                                                                   referencing_entities[id(record)]):
+                        continue
+
                     identifiable = self.identifiableAdapter.get_identifiable(
                         record,
-                        referencing_entities=referencing_entities)
-
-                    # TODO remove if the exception is never raised
-                    if record in to_be_inserted:
-                        raise RuntimeError("This should not be reached since treated elements"
-                                           "are removed from the list")
-                    # 1. Can it be identified via an ID?
-                    elif record.id is not None:
-                        to_be_updated.append(record)
-                        self.add_to_remote_existing_cache(record, identifiable)
-                        del flat[i]
-                    # 2. Can it be identified via a path?
-                    elif record.path is not None:
-                        try:
-                            existing = cached_get_entity_by(path=record.path)
-                        except EmptyUniqueQueryError:
-                            existing = None
-                        if existing is None:
-                            to_be_inserted.append(record)
-                            self.add_to_remote_missing_cache(record, identifiable)
-                            del flat[i]
-                        else:
-                            record.id = existing.id
-                            # TODO check the following copying of _size and _checksum
-                            # Copy over checksum and size too if it is a file
-                            record._size = existing._size
-                            record._checksum = existing._checksum
-                            to_be_updated.append(record)
-                            self.add_to_remote_existing_cache(record, identifiable)
-                            del flat[i]
-                    # 3. Is it in the cache of already checked Records?
-                    elif self.get_from_any_cache(identifiable) is not None:
-                        # We merge the two in order to prevent loss of information
-                        newrecord = self.get_from_any_cache(identifiable)
-                        try:
-                            merge_entities(newrecord, record)
-                        except EntityMergeConflictError:
-                            continue
-                        Crawler.bend_references_to_new_object(
-                            old=record, new=newrecord, entities=flat + to_be_updated + to_be_inserted)
+                        referencing_entities=referencing_entities[id(record)])
+
+                    # 1. Is it in the cache of already checked Records?
+                    if self.treated_records_lookup.get_any(record, identifiable) is not None:
+                        treated_record = self.treated_records_lookup.get_any(record, identifiable)
+                        # Since the identifiables are the same, treated_record and record actually
+                        # describe the same object.
+                        # We merge record into treated_record in order to prevent loss of information
+                        self._merge_identified(treated_record, record, try_to_merge_later, all_records)
+                        all_records.remove(record)
+                        referencing_entities = self.create_reference_mapping(all_records)
                         del flat[i]
-                        resolved_references = True
+                        entity_was_treated = True

-                    # 4. Can it be checked on the remote server?
+                    # 2. Can it be checked on the remote server?
                     elif not self._has_reference_value_without_id(identifiable):
                         identified_record = (
                             self.identifiableAdapter.retrieve_identified_record_for_identifiable(
                                 identifiable))
                         if identified_record is None:
                             # identifiable does not exist remotely -> record needs to be inserted
-                            to_be_inserted.append(record)
-                            self.add_to_remote_missing_cache(record, identifiable)
-                            del flat[i]
+                            self.treated_records_lookup.add(record, identifiable)
                         else:
                             # side effect
                             record.id = identified_record.id
-                            to_be_updated.append(record)
-                            self.add_to_remote_existing_cache(record, identifiable)
-                            del flat[i]
-                            resolved_references = True
+                            record.path = identified_record.path
+                            self.treated_records_lookup.add(record, identifiable)
+                        assert record.id
+                        del flat[i]
+                        entity_was_treated = True

-                    # 5. Does it have to be new since a needed reference is missing?
+                    # 3. Does it have to be new since a needed reference is missing?
                     # (Is it impossible to check this record because an identifiable references a
                     # missing record?)
                    elif self._has_missing_object_in_references(identifiable, referencing_entities):
-                        to_be_inserted.append(record)
-                        self.add_to_remote_missing_cache(record, identifiable)
+                        self.treated_records_lookup.add(record, identifiable)
+                        assert record.id
                         del flat[i]
-                        resolved_references = True
+                        entity_was_treated = True

         for record in flat:
             self.replace_references_with_cached(record, referencing_entities)

+        # We postponed the merge for records where it failed previously and try it again now.
+        # This might only add properties of the postponed records to the already treated ones.
+        for record in try_to_merge_later:
+            identifiable = self.identifiableAdapter.get_identifiable(
+                record,
+                referencing_entities=referencing_entities[id(record)])
+            newrecord = self.treated_records_lookup.get_any(record, identifiable)
+            merge_entities(newrecord, record, merge_id_with_resolved_entity=True)
         if len(flat) > 0:
             circle = self.detect_circular_dependency(flat)
             if circle is None:
                 logger.error("Failed, but found NO circular dependency. The data is as follows:"
-                             + str(self.compact_entity_list_representation(flat)))
+                             + str(self.compact_entity_list_representation(flat,
+                                                                           referencing_entities)))
             else:
                 logger.error("Found circular dependency (Note that this might include references "
                              "that are not identifying properties): "
-                             + self.compact_entity_list_representation(circle))
+                             + self.compact_entity_list_representation(circle,
+                                                                       referencing_entities))
+
             raise RuntimeError(
                 f"Could not finish split_into_inserts_and_updates. Circular dependency: "
                 f"{circle is not None}")

-        return to_be_inserted, to_be_updated
+        # remove negative IDs
+        missing = self.treated_records_lookup.get_missing_list()
+        for el in missing:
+            if el.id is None:
+                raise RuntimeError("This should not happen")  # TODO remove
+            if el.id >= 0:
+                raise RuntimeError("This should not happen")  # TODO remove
+            el.id = None
+
+        return (missing, self.treated_records_lookup.get_existing_list())

     def replace_entities_with_ids(self, rec: db.Record):
         for el in rec.properties:
@@ -673,23 +851,39 @@ class Crawler(object):
                         if val.id is not None:
                             el.value[index] = val.id

-    @staticmethod
-    def compact_entity_list_representation(circle):
+    @ staticmethod
+    def compact_entity_list_representation(entities, referencing_entities: List) -> str:
         """ a more readable representation than the standard xml representation

         TODO this can be removed once the yaml format representation is in pylib
         """
         text = "\n--------\n"
-        for el in circle:
-            if el.name is not None:
-                text += f"{el.name}\n"
-            text += f"{[el.name for el in el.parents]}\n"
-            props = {p.name: p.value for p in el.properties}
-            text += f"{props}\n"
+
+        grouped = {"": []}
+        for ent in entities:
+            if not ent.parents:
+                grouped[""].append(ent)
+            for parent in ent.parents:
+                if parent.name not in grouped:
+                    grouped[parent.name] = []
+                grouped[parent.name].append(ent)
+        if not grouped[""]:
+            del grouped[""]
+        for parent, group in grouped.items():
+            text += f"\n> Parent: {parent}\n"
+            for ent in group:
+                if ent.name is not None:
+                    text += f"\n>> Name: {ent.name}\n"
+                else:
+                    text += "\n>> Name: # No name\n"
+                text += f"{[ent.name for ent in ent.parents]}\n"
+                props = {p.name: p.value for p in ent.properties}
+                text += f"{props}\n"
+                text += f"is_referenced_by:\n{referencing_entities[id(ent)]}\n"

         return text + "--------\n"

-    @staticmethod
+    @ staticmethod
     def detect_circular_dependency(flat: list[db.Entity]):
         """
         Detects whether there are circular references in the given entity list and returns a list
@@ -722,7 +916,7 @@ class Crawler(object):
                 return None
         return circle

-    @staticmethod
+    @ staticmethod
     def _merge_properties_from_remote(
             crawled_data: list[db.Record],
             identified_records: list[db.Record]
@@ -764,7 +958,7 @@ class Crawler(object):

         return to_be_updated

-    @staticmethod
+    @ staticmethod
     def remove_unnecessary_updates(
             crawled_data: list[db.Record],
             identified_records: list[db.Record]
@@ -790,7 +984,7 @@ class Crawler(object):

         return actual_updates

-    @staticmethod
+    @ staticmethod
     def execute_parent_updates_in_list(to_be_updated, securityMode, run_id, unique_names):
         """
         Execute the updates of changed parents.
@@ -833,14 +1027,68 @@ class Crawler(object):
                            "mode. This might lead to a failure of inserts that follow.")
             logger.info(parent_updates)

-    @staticmethod
+    @ staticmethod
+    def _get_property_id_for_datatype(rtname: str, name: str):
+        return cached_get_entity_by(
+            query=f"FIND Entity '{escape_squoted_text(rtname)}' "
+                  f"with name='{escape_squoted_text(name)}'").id
+
+    @ staticmethod
+    def replace_name_with_referenced_entity_id(prop: db.Property):
+        """changes the given property in place if it is a reference property that has a name as
+        value
+
+        If the Property has a List datatype, each element is treated separately.
+        If the datatype is generic, i.e. FILE or REFERENCE, values stay unchanged.
+        If the value is not a string, the value stays unchanged.
+        If the query using the datatype and the string value does not uniquely identify an Entity,
+        the value stays unchanged.
+        If an Entity is identified, then the string value is replaced by the ID.
+        """
+        if get_list_datatype(prop.datatype) is None:  # not a list
+            if (isinstance(prop.value, str) and is_reference(prop.datatype) and
+                    prop.datatype != db.FILE and prop.datatype != db.REFERENCE):  # datatype is a non-generic reference and value is a string
+                try:
+                    # the get_entity function will raise an error if not unique
+                    prop.value = Crawler._get_property_id_for_datatype(
+                        rtname=prop.datatype, name=prop.value)
+                except (db.EmptyUniqueQueryError, db.QueryNotUniqueError):
+                    logger.error(f"The Property {prop.name} with datatype={prop.datatype} has the "
+                                 f"value {prop.value} and there is no appropriate Entity with such "
+                                 "a name.")
+                    raise
+        else:
+            dt = get_list_datatype(prop.datatype)
+            if not (is_reference(dt) and dt != db.FILE and dt != db.REFERENCE):
+                return
+            propval = []
+            for el in prop.value:
+                if isinstance(el, str):
+                    try:
+                        # the get_entity function will raise an error if not unique
+                        propval.append(Crawler._get_property_id_for_datatype(rtname=dt,
+                                                                             name=el))
+                    except (db.EmptyUniqueQueryError, db.QueryNotUniqueError):
+                        logger.error(
+                            f"The Property {prop.name} with datatype={prop.datatype} has the "
+                            f"value {prop.value} and there is no appropriate Entity with such "
+                            "a name.")
+                        raise
+                else:
+                    propval.append(el)
+            prop.value = propval
+
+    @ staticmethod
     def execute_inserts_in_list(to_be_inserted, securityMode,
                                 run_id: Optional[uuid.UUID] = None,
                                 unique_names=True):
         for record in to_be_inserted:
             for prop in record.properties:
+                if prop.name == "name":
+                    raise Exception('Cannot search for the property with name "name"')
                 entity = cached_get_entity_by(name=prop.name)
                 _resolve_datatype(prop, entity)
+                Crawler.replace_name_with_referenced_entity_id(prop)
         logger.debug("INSERT")
         logger.debug(to_be_inserted)
         if len(to_be_inserted) > 0:
@@ -850,7 +1098,7 @@ class Crawler(object):
         update_cache = UpdateCache()
         update_cache.insert(to_be_inserted, run_id, insert=True)

-    @staticmethod
+    @ staticmethod
     def set_ids_and_datatype_of_parents_and_properties(rec_list):
         for record in rec_list:
             for parent in record.parents:
@@ -862,7 +1110,7 @@ class Crawler(object):
                     prop.id = entity.id
                     _resolve_datatype(prop, entity)

-    @staticmethod
+    @ staticmethod
     def execute_updates_in_list(to_be_updated, securityMode,
                                 run_id: Optional[uuid.UUID] = None,
                                 unique_names=True):
@@ -876,7 +1124,7 @@ class Crawler(object):
         update_cache = UpdateCache()
         update_cache.insert(to_be_updated, run_id)

-    @staticmethod
+    @ staticmethod
     def check_whether_parent_exists(records: list[db.Entity], parents: list[str]):
         """ returns a list of all records in `records` that have a parent that is in `parents`"""
         problems = []
@@ -928,12 +1176,11 @@ class Crawler(object):
         """
         if crawled_data is None:
             warnings.warn(DeprecationWarning(
-                "Calling synchronize without the data to be synchronized is depricated. Please "
+                "Calling synchronize without the data to be synchronized is deprecated. Please "
                 "use for example the Scanner to create this data."))
             crawled_data = self.crawled_data

         to_be_inserted, to_be_updated = self.split_into_inserts_and_updates(crawled_data)
-        referencing_entities = self.create_reference_mapping(to_be_updated + to_be_inserted)

         for el in to_be_updated:
             # all entity objects are replaced by their IDs except for the not yet inserted ones
@@ -997,7 +1244,7 @@ class Crawler(object):

         return (to_be_inserted, to_be_updated)

-    @staticmethod
+    @ staticmethod
     def create_entity_summary(entities: list[db.Entity]):
         """ Creates a summary string reprensentation of a list of entities."""
         parents = {}
@@ -1016,7 +1263,7 @@ class Crawler(object):
             output = output[:-2] + "\n"
         return output

-    @staticmethod
+    @ staticmethod
     def inform_about_pending_changes(pending_changes, run_id, path, inserts=False):
         # Sending an Email with a link to a form to authorize updates is
         if get_config_setting("send_crawler_notifications"):
@@ -1037,7 +1284,7 @@ ____________________\n""".format(i + 1, len(pending_changes)) + str(el[3]))
                        " by invoking the crawler"
                        " with the run id: {rid}\n".format(rid=run_id))

-    @staticmethod
+    @ staticmethod
     def debug_build_usage_tree(converter: Converter):
         res: dict[str, dict[str, Any]] = {
             converter.name: {
@@ -1352,6 +1599,7 @@ def crawler_main(crawled_directory_path: str,
             logger.debug(err)

         if "SHARED_DIR" in os.environ:
+            # pylint: disable=E0601
             domain = get_config_setting("public_host_url")
             logger.error("Unexpected Error: Please tell your administrator about this and provide the"
                          f" following path.\n{domain}/Shared/" + debuglog_public)
diff --git a/src/caoscrawler/debug_tree.py b/src/caoscrawler/debug_tree.py
index 9983981c69e3df7c58ddfda4b6977944eac54999..0d57040f5c20aca236a3c11531e8b7c45bad89ab 100644
--- a/src/caoscrawler/debug_tree.py
+++ b/src/caoscrawler/debug_tree.py
@@ -45,13 +45,13 @@ from importlib_resources import files
 from jsonschema import validate
 from typing import Any, Optional, Type, Union

-import caosdb as db
+import linkahead as db

 from caosadvancedtools.cache import UpdateCache, Cache
 from caosadvancedtools.crawler import Crawler as OldCrawler
-from caosdb.apiutils import (compare_entities, EntityMergeConflictError,
-                             merge_entities)
-from caosdb.common.datatype import is_reference
+from linkahead.apiutils import (compare_entities, EntityMergeConflictError,
+                                merge_entities)
+from linkahead.common.datatype import is_reference

 from .converters import Converter, DirectoryConverter, ConverterValidationError

diff --git a/src/caoscrawler/default_transformers.yml b/src/caoscrawler/default_transformers.yml
new file mode 100644
index 0000000000000000000000000000000000000000..d0ad23912176bdfbf2446aa6e04bd7fa6b858777
--- /dev/null
+++ b/src/caoscrawler/default_transformers.yml
@@ -0,0 +1,11 @@
+
+
+submatch:
+  package: caoscrawler.transformer_functions
+  function: submatch
+split:
+  package: caoscrawler.transformer_functions
+  function: split
+replace:
+  package: caoscrawler.transformer_functions
+  function: replace
diff --git a/src/caoscrawler/hdf5_converter.py b/src/caoscrawler/hdf5_converter.py
new file mode 100644
index 0000000000000000000000000000000000000000..5b1ff5775fb74919c989507c449636fd822db7f0
--- /dev/null
+++ b/src/caoscrawler/hdf5_converter.py
@@ -0,0 +1,336 @@
+#
+# This file is a part of the CaosDB Project.
+#
+# Copyright (C) 2023 IndiScale GmbH <info@indiscale.com>
+# Copyright (C) 2023 Florian Spreckelsen <f.spreckelsen@indiscale.com>
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <https://www.gnu.org/licenses/>.
+#
+
+try:
+    import h5py
+except ModuleNotFoundError:
+    raise ModuleNotFoundError(
+        "Couldn't find module h5py. Did you install the crawler package with "
+        "its optional `h5-crawler` dependency?"
+    )
+
+import numpy as np
+
+from typing import Union
+
+import linkahead as db
+
+from .converters import (convert_basic_element, Converter, DictElementConverter,
+                         match_name_and_value, SimpleFileConverter)
+from .stores import GeneralStore, RecordStore
+from .structure_elements import DictElement, File, FloatElement, IntegerElement, StructureElement
+
+
+def convert_attributes(elt: Union[h5py.File, h5py.Group, h5py.Dataset]):
+    """Convert hdf5 attributes to a list of either basic scalar structure elements or ndarrays.
+
+    Parameters
+    ----------
+    elt : Union[h5py.File, h5py.Group, h5py.Dataset]
+        The hdf5 element the attributes of which will be converted to structure
+        elements.
+
+    Returns
+    -------
+    converted : list[StructureElement]
+        A list of the attributes converted to StructureElements (either basic
+        scalar elements or ndarray).
+    """
+
+    converted = []
+    for name, value in elt.attrs.items():
+        converted.append(convert_basic_element_with_nd_array(
+            value, name,
+            msg_prefix=f"The value of attribute {name} has an unknown type: {type(value)}."))
+
+    return converted
+
+
+def convert_h5_element(elt: Union[h5py.Group, h5py.Dataset], name: str):
+    """Convert a given HDF5 element to the corresponding StructureElement.
+
+    Parameters
+    ----------
+    elt : Union[h5py.Group, h5py.Dataset]
+        The hdf5 element to be converted.
+    name : str
+        The name of the StructureElement that the hdf5 element is converted to.
+
+    Raises
+    ------
+    ValueError
+        In case of anything that is not convertible to a HDF5 structure element.
+
+    Returns
+    -------
+    StructureElement
+        The converted StructureElement.
+    """
+
+    if isinstance(elt, h5py.Group):
+
+        return H5GroupElement(name, elt)
+
+    if isinstance(elt, h5py.Dataset):
+
+        return H5DatasetElement(name, elt)
+
+    raise ValueError("The given element must be either a HDF5 Group or Dataset object.")
+
+
+def convert_basic_element_with_nd_array(value, name: str = None,
+                                        internal_path: str = None, msg_prefix: str = ""):
+    """Convert a given object either to an ndarray structure element or to a
+    basic scalar structure element.
+
+    This function extends :func:`~caoscrawler.converters.convert_basic_element`
+    by a special treatment for certain numpy objects, most importantly
+    ndarrays. They are converted to a scalar in case of a size-1 array, to a
+    list in case of a 1-d array, and to a ``H5NdarrayElement`` in all other
+    cases. In addition, numpy integers and floats are also converted to
+    IntegerElements and FloatElements, respectively.
+
+    Parameters
+    ----------
+    value
+        The object to be converted.
+    name : str, optional
+        The name of the structure element ``value`` is being converted
+        to. Default is None.
+    internal_path : str, optional
+        The internal path of ``value`` within the HDF5 file. Default is None.
+    msg_prefix : str, optional
+        The prefix of the error message that will be raised. Default is ``""``.
+
+    Returns
+    -------
+    StructureElement
+        The StructureElement ``value`` was converted to.
+
+    """
+
+    if isinstance(value, np.ndarray):
+
+        if value.size == 1:
+            # this is a scalar stacked in a numpy array. We don't know its
+            # actual shape, so we reshape first, then use the actual value
+            # inside.
+            value = value.reshape((1,))[0]
+
+        elif np.squeeze(value).ndim == 1:
+            # If the array is one-dimensional we can save it as a list
+            value = list(np.squeeze(value))
+
+        else:
+            # real multi-dimensional array
+            return H5NdarrayElement(name, value, internal_path)
+
+    elif isinstance(value, np.int32) or isinstance(value, np.int64):
+
+        return IntegerElement(name, value)
+
+    elif isinstance(value, np.float64):
+
+        return FloatElement(name, value)
+
+    return convert_basic_element(value, name, msg_prefix)
+
+
+class H5GroupElement(DictElement):
+    """StructureElement specific for HDF5 groups"""
+
+    def __init__(self, name: str, value: h5py.Group):
+        super().__init__(name, value)
+
+
+class H5DatasetElement(DictElement):
+    """StructureElement specific for HDF5 datasets."""
+
+    def __init__(self, name: str, value: h5py.Dataset):
+        super().__init__(name, value)
+
+
+class H5NdarrayElement(DictElement):
+    """StructureElement specific for NDArrays within HDF5 files.
+
+    Also store the internal path of the array within the HDF5 file in its
+    ``internal_path`` attribute.
+
+    """
+
+    def __init__(self, name: str, value, internal_path: str):
+        super().__init__(name, value)
+        self.internal_path = internal_path
+
+
+class H5FileConverter(SimpleFileConverter):
+    """Converter for HDF5 files that creates children for the contained
+    attributes, groups, and datasets.
+
+    """
+
+    def create_children(self, generalStore: GeneralStore, element: StructureElement):
+        """Create children from root-level file attributes and contained hdf5
+        elements.
+
+        """
+
+        if not isinstance(element, File):
+
+            raise ValueError("create_children should have been called with a File object.")
+
+        ff = h5py.File(element.path, 'r')
+
+        children = []
+
+        for name, value in ff.items():
+
+            children.append(convert_h5_element(value, name))
+
+        children.extend(convert_attributes(ff))
+
+        return children
+
+
+class H5GroupConverter(DictElementConverter):
+    """Converter for HDF5 groups that creates children from the group-level
+    attributes and the contained subgroups and datasets.
+
+    """
+
+    def typecheck(self, element: StructureElement):
+
+        return isinstance(element, H5GroupElement)
+
+    def create_children(self, generalStore: GeneralStore, element: StructureElement):
+        """Create children from group attributes and hdf5 elements contained in
+        this group.
+
+        """
+
+        if not isinstance(element.value, h5py.Group):
+
+            raise ValueError("create_children should have been called with a HDF5 Group object.")
+
+        children = []
+
+        for name, value in element.value.items():
+
+            children.append(convert_h5_element(value, name))
+
+        children.extend(convert_attributes(element.value))
+
+        return children
+
+
+class H5DatasetConverter(DictElementConverter):
+    """Converter for HDF5 datasets that creates children from the dataset
+    attributes and the contained array data.
+
+    """
+
+    def typecheck(self, element: StructureElement):
+
+        return isinstance(element, H5DatasetElement)
+
+    def create_children(self, generalStore: GeneralStore, element: StructureElement):
+        """Create children from the dataset attributes and append the array data
+        contained in this dataset.
+
+        """
+
+        if not isinstance(element.value, h5py.Dataset):
+
+            raise ValueError("create_children should have been called with a HDF5 Dataset object.")
+
+        children = convert_attributes(element.value)
+
+        children.append(
+            H5NdarrayElement(
+                name=self.name+"_ndarray",
+                value=element.value,
+                internal_path=element.value.name
+            )
+        )
+        return children
+
+
+class H5NdarrayConverter(Converter):
+    """Converter for ndarrays contained in HDF5 files. Creates the wrapper
+    record for this ndarray.
+
+    """
+
+    def __init__(self, definition: dict, name: str, converter_registry: dict):
+
+        # Check that a non-empty name for the record that will be created for
+        # the ndarray Record (within the cfood) is given
+        if not ("recordname" in definition and definition["recordname"]):
+
+            raise RuntimeError(f"Converter {name} lacks the `recordname` definition.")
+
+        super().__init__(definition, name, converter_registry)
+
+    def create_children(self, values: GeneralStore, element: StructureElement):
+        """The ndarray doesn't have any further children."""
+
+        return []
+
+    def create_records(self, values: GeneralStore, records: RecordStore, element: StructureElement):
+        """Create a wrapper record with name ``recordname``, type
+        ``array_recordtype_name`` (default ``H5Ndarray``) and the internal path
+        stored in a property with name ``internal_path_property_name`` (default
+        ``internal_hdf5_path``).
+ + """ + + rname = self.definition["recordname"] + if "array_recordtype_name" in self.definition: + rtname = self.definition["array_recordtype_name"] + else: + rtname = "H5Ndarray" + + if "internal_path_property_name" in self.definition: + propname = self.definition["internal_path_property_name"] + else: + propname = "internal_hdf5_path" + + rec = db.Record().add_parent(rtname) + records[rname] = rec + values[rname] = rec + + rec.add_property(name=propname, value=element.internal_path) + keys_modified = [(rname, propname)] + + keys_modified.extend(super().create_records(values, records, element)) + + return keys_modified + + def typecheck(self, element: StructureElement): + + return isinstance(element, H5NdarrayElement) + + @Converter.debug_matching("name") + def match(self, element: StructureElement): + + if not isinstance(element, H5NdarrayElement): + + raise RuntimeError("This converter can only be called with H5NdarrayElements.") + + return match_name_and_value(self.definition, element.name, element.value) diff --git a/src/caoscrawler/identifiable.py b/src/caoscrawler/identifiable.py index 75af5be8f06a6ab95a4b7f2b92eda8cf3e321a1b..cefdf4a0f42b1f610e0712fdefebc2dc3b78d69f 100644 --- a/src/caoscrawler/identifiable.py +++ b/src/caoscrawler/identifiable.py @@ -20,7 +20,7 @@ # from __future__ import annotations -import caosdb as db +import linkahead as db from datetime import datetime import json from hashlib import sha256 diff --git a/src/caoscrawler/identifiable_adapters.py b/src/caoscrawler/identifiable_adapters.py index d9c9c00b22443121b4989bf988a40308b143dbf1..9be539fe0842e3ce24b68060fa2288cdc4c531b2 100644 --- a/src/caoscrawler/identifiable_adapters.py +++ b/src/caoscrawler/identifiable_adapters.py @@ -26,13 +26,16 @@ from __future__ import annotations import logging +import warnings from abc import ABCMeta, abstractmethod from datetime import datetime +from functools import lru_cache from typing import Any -import caosdb as db +import linkahead as db import yaml -from caosdb.cached import cached_get_entity_by +from linkahead.cached import cached_get_entity_by, cached_query +from linkahead.utils.escape import escape_squoted_text from .identifiable import Identifiable from .utils import has_parent @@ -43,21 +46,24 @@ logger = logging.getLogger(__name__) def get_children_of_rt(rtname): """Supply the name of a recordtype. This name and the name of all children RTs are returned in a list""" - return [p.name for p in db.execute_query(f"FIND RECORDTYPE {rtname}")] + escaped = escape_squoted_text(rtname) + return [p.name for p in cached_query(f"FIND RECORDTYPE '{escaped}'")] -def convert_value(value: Any): - """ Returns a string representation of the value that is suitable - to be used in the query - looking for the identified record. +def convert_value(value: Any) -> str: + """ Return a string representation of the value suitable for the search query. + + This is for search queries looking for the identified record. Parameters ---------- - value : Any type, the value that shall be returned and potentially converted. + value: Any + The value to be converted. Returns ------- - out : the string reprensentation of the value + out: str + the string reprensentation of the value. 
""" @@ -68,8 +74,7 @@ def convert_value(value: Any): elif isinstance(value, bool): return str(value).upper() elif isinstance(value, str): - # replace single quotes, otherwise they may break the queries - return value.replace("\'", "\\'") + return escape_squoted_text(value) else: return str(value) @@ -79,14 +84,15 @@ class IdentifiableAdapter(metaclass=ABCMeta): Some terms: -- Registered identifiable is the definition of an identifiable which is: - - A record type as the parent - - A list of properties - - A list of referenced by statements -- Identifiable is the concrete identifiable, e.g. the Record based on - the registered identifiable with all the values filled in. -- Identified record is the result of retrieving a record based on the - identifiable from the database. +- A *registered identifiable* defines an identifiable template, for example by specifying: + - Parent record types + - Properties + - ``is_referenced_by`` statements +- An *identifiable* belongs to a concrete record. It consists of identifying attributes which "fill + in" the *registered identifiable*. In code, it can be represented as a Record based on the + *registered identifiable* with all the values filled in. +- An *identified record* is the result of retrieving a record from the database, based on the + *identifiable* (and its values). General question to clarify: @@ -95,24 +101,28 @@ General question to clarify: The list of referenced by statements is currently not implemented. -The IdentifiableAdapter can be used to retrieve the three above mentioned objects (registred +The IdentifiableAdapter can be used to retrieve the three above mentioned objects (registered identifiabel, identifiable and identified record) for a Record. """ @staticmethod - def create_query_for_identifiable(ident: Identifiable): + def create_query_for_identifiable(ident: Identifiable, startswith: bool = False): """ This function is taken from the old crawler: caosdb-advanced-user-tools/src/caosadvancedtools/crawler.py uses the properties of ident to create a query that can determine whether the required record already exists. + + If ``startswith`` is True, use ``LIKE`` for long string values to test if the strings starts + with the first 200 characters of the value. """ query_string = "FIND RECORD " if ident.record_type is not None: - query_string += f"'{ident.record_type}'" + escaped_rt = escape_squoted_text(ident.record_type) + query_string += f"'{escaped_rt}'" for ref in ident.backrefs: eid = ref if isinstance(ref, db.Entity): @@ -122,11 +132,13 @@ identifiabel, identifiable and identified record) for a Record. query_string += " WITH " if ident.name is not None: - query_string += "name='{}'".format(convert_value(ident.name)) + query_string += "name='{}'".format(escape_squoted_text(ident.name)) if len(ident.properties) > 0: query_string += " AND " - query_string += IdentifiableAdapter.create_property_query(ident) + query_string += IdentifiableAdapter.create_property_query(ident, startswith=startswith) + + # TODO Can these cases happen at all with the current code? if query_string.endswith(" AND WITH "): query_string = query_string[:-len(" AND WITH ")] if query_string.endswith(" AND "): @@ -134,15 +146,40 @@ identifiabel, identifiable and identified record) for a Record. return query_string @staticmethod - def create_property_query(entity: Identifiable): + def __create_pov_snippet(pname: str, pvalue, startswith: bool = False): + """Return something like ``'name'='some value'`` or ``'name' LIKE 'some*'``. 
+ +If ``startswith`` is True, the value of strings will be cut off at 200 characters and a ``LIKE`` +operator will be used to find entities matching at the beginning. +""" + if startswith and isinstance(pvalue, str) and len(pvalue) > 200: + operator_value_str = f" LIKE '{escape_squoted_text(pvalue[:200])}*'" + else: + operator_value_str = "='" + convert_value(pvalue) + "'" + result = "'" + escape_squoted_text(pname) + "'" + operator_value_str + return result + + @staticmethod + def create_property_query(entity: Identifiable, startswith: bool = False): + """Create a POV query part with the entity's properties. + +Parameters +---------- + +entity: Identifiable + The Identifiable whose properties shall be used. + +startswith: bool, optional + If True, check string typed properties against the first 200 characters only. Default is False. + """ query_string = "" + pov = IdentifiableAdapter.__create_pov_snippet # Shortcut for pname, pvalue in entity.properties.items(): if pvalue is None: - query_string += "'" + pname + "' IS NULL AND " + query_string += "'" + escape_squoted_text(pname) + "' IS NULL AND " elif isinstance(pvalue, list): for v in pvalue: - query_string += ("'" + pname + "'='" + - convert_value(v) + "' AND ") + query_string += pov(pname, v, startswith=startswith) + " AND " # TODO: (for review) # This code would allow for more complex identifiables with @@ -155,8 +192,7 @@ identifiabel, identifiable and identified record) for a Record. # IdentifiableAdapter.create_property_query(p.value) + # ") AND ") else: - query_string += ("'" + pname + "'='" + - convert_value(pvalue) + "' AND ") + query_string += pov(pname, pvalue, startswith=startswith) + " AND " # remove the last AND return query_string[:-4] @@ -174,19 +210,64 @@ identifiabel, identifiable and identified record) for a Record. @abstractmethod def get_file(self, identifiable: db.File): + warnings.warn(DeprecationWarning("This function is deprecated. Please do not use it.")) """ Retrieve the file object for a (File) identifiable. """ pass + @staticmethod + def get_identifying_referencing_entities(referencing_entities, registered_identifiable): + refs = [] + for prop in registered_identifiable.properties: + if prop.name.lower() != "is_referenced_by": + continue + for looking_for_rt in prop.value: + found = False + if looking_for_rt == "*": + for val in referencing_entities.values(): + if len(val) > 0: + found = True + refs.extend(val) + else: + rt_and_children = get_children_of_rt(looking_for_rt) + for rtname in rt_and_children: + if (rtname in referencing_entities): + refs.extend(referencing_entities[rtname]) + found = True + if not found: + raise NotImplementedError( + f"Could not find referencing entities of type(s): {prop.value}\n" + f"for registered identifiable:\n{registered_identifiable}\n" + f"There were {len(referencing_entities)} referencing entities to choose from." 
+ ) + return refs + + @staticmethod + def get_identifying_referenced_entities(record, registered_identifiable): + refs = [] + for prop in registered_identifiable.properties: + pname = prop.name.lower() + if pname == "name" or pname == "is_referenced_by": + continue + if record.get_property(prop.name) is None: + raise RuntimeError("Missing identifying Property") + pval = record.get_property(prop.name).value + if not isinstance(prop.value, list): + pval = [prop.value] + for val in pval: + if isinstance(val, db.Entity): + refs.append(val) + return refs + def get_identifiable(self, record: db.Record, referencing_entities=None): """ - retrieve the registred identifiable and fill the property values to create an - identifiable + Retrieve the registered identifiable and fill the property values to create an + identifiable. Args: record: the record for which the Identifiable shall be created. - referencing_entities: a dictionary (Type: dict[int, dict[str, list[db.Entity]]]), that + referencing_entities: a dictionary (Type: dict[str, list[db.Entity]]), that allows to look up entities with a certain RecordType, that reference ``record`` Returns: @@ -205,6 +286,8 @@ identifiabel, identifiable and identified record) for a Record. name_is_identifying_property = False if registered_identifiable is not None: + identifiable_backrefs = self.get_identifying_referencing_entities( + referencing_entities, registered_identifiable) # fill the values: for prop in registered_identifiable.properties: if prop.name == "name": @@ -215,24 +298,8 @@ identifiabel, identifiable and identified record) for a Record. # case A: in the registered identifiable # case B: in the identifiable - # TODO: similar to the Identifiable class, Registred Identifiable should be a - # separate class too + # treated above if prop.name.lower() == "is_referenced_by": - for givenrt in prop.value: - rt_and_children = get_children_of_rt(givenrt) - found = False - for rtname in rt_and_children: - if (id(record) in referencing_entities - and rtname in referencing_entities[id(record)]): - identifiable_backrefs.extend( - referencing_entities[id(record)][rtname]) - found = True - if not found: - # TODO: is this the appropriate error? - raise NotImplementedError( - f"The following record is missing an identifying property:" - f"RECORD\n{record}\nIdentifying PROPERTY\n{prop.name}" - ) continue record_prop = record.get_property(prop.name) @@ -241,7 +308,7 @@ identifiabel, identifiable and identified record) for a Record. # raise an exception? # TODO: is this the appropriate error? raise NotImplementedError( - f"The following record is missing an identifying property:" + f"The following record is missing an identifying property:\n" f"RECORD\n{record}\nIdentifying PROPERTY\n{prop.name}" ) identifiable_props[record_prop.name] = record_prop.value @@ -256,17 +323,21 @@ identifiabel, identifiable and identified record) for a Record. "Multi properties used in identifiables could cause unpredictable results and " "are not allowed. 
You might want to consider a Property with a list as value.") - # use the RecordType of the registred Identifiable if it exists + # use the RecordType of the registered Identifiable if it exists # We do not use parents of Record because it might have multiple - return Identifiable( - record_id=record.id, - record_type=(registered_identifiable.parents[0].name - if registered_identifiable else None), - name=record.name if name_is_identifying_property else None, - properties=identifiable_props, - path=record.path, - backrefs=identifiable_backrefs - ) + try: + return Identifiable( + record_id=record.id, + record_type=(registered_identifiable.parents[0].name + if registered_identifiable else None), + name=record.name if name_is_identifying_property else None, + properties=identifiable_props, + path=record.path, + backrefs=identifiable_backrefs + ) + except Exception: + logger.error(f"Error while creating identifiable for this record:\n{record}") + raise @abstractmethod def retrieve_identified_record_for_identifiable(self, identifiable: Identifiable): @@ -305,6 +376,8 @@ class LocalStorageIdentifiableAdapter(IdentifiableAdapter): """ def __init__(self): + warnings.warn(DeprecationWarning( + "This class is deprecated. Please use the CaosDBIdentifiableAdapter.")) self._registered_identifiables = dict() self._records = [] @@ -319,6 +392,7 @@ class LocalStorageIdentifiableAdapter(IdentifiableAdapter): Just look in records for a file with the same path. """ candidates = [] + warnings.warn(DeprecationWarning("This function is deprecated. Please do not use it.")) for record in self._records: if record.role == "File" and record.path == identifiable.path: candidates.append(record) @@ -466,14 +540,14 @@ class CaosDBIdentifiableAdapter(IdentifiableAdapter): self._registered_identifiables[name] = definition def get_file(self, identifiable: Identifiable): + warnings.warn(DeprecationWarning("This function is deprecated. Please do not use it.")) # TODO is this needed for Identifiable? # or can we get rid of this function? if isinstance(identifiable, db.Entity): return cached_get_entity_by(path=identifiable) if identifiable.path is None: raise RuntimeError("Path must not be None for File retrieval.") - candidates = db.execute_query("FIND File which is stored at '{}'".format( - identifiable.path)) + candidates = cached_get_entity_by(path=identifiable.path) if len(candidates) > 1: raise RuntimeError("Identifiable was not defined unambigiously.") if len(candidates) == 0: @@ -482,7 +556,7 @@ class CaosDBIdentifiableAdapter(IdentifiableAdapter): def get_registered_identifiable(self, record: db.Record): """ - returns the registred identifiable for the given Record + returns the registered identifiable for the given Record It is assumed, that there is exactly one identifiable for each RecordType. 
Only the first parent of the given Record is considered; others are ignored @@ -506,10 +580,28 @@ class CaosDBIdentifiableAdapter(IdentifiableAdapter): def retrieve_identified_record_for_identifiable(self, identifiable: Identifiable): query_string = self.create_query_for_identifiable(identifiable) - candidates = db.execute_query(query_string) + try: + candidates = cached_query(query_string) + except db.exceptions.HTTPServerError as err: + query_string = self.create_query_for_identifiable(identifiable, startswith=True) + candidates = cached_query(query_string).copy() # Copy against cache poisoning + + # Test if the candidates really match all properties + for pname, pvalue in identifiable.properties.items(): + popme = [] + for i in range(len(candidates)): + this_prop = candidates[i].get_property(pname) + if this_prop is None: + popme.append(i) + continue + if not this_prop.value == pvalue: + popme.append(i) + for i in reversed(popme): + candidates.pop(i) + if len(candidates) > 1: raise RuntimeError( - f"Identifiable was not defined unambigiously.\n{query_string}\nReturned the " + f"Identifiable was not defined unambiguously.\n{query_string}\nReturned the " f"following {candidates}." f"Identifiable:\n{identifiable.record_type}{identifiable.properties}") if len(candidates) == 0: diff --git a/src/caoscrawler/identified_cache.py b/src/caoscrawler/identified_cache.py deleted file mode 100644 index aa2d82f8e66c738e737c62f3cc68eaf60127e28b..0000000000000000000000000000000000000000 --- a/src/caoscrawler/identified_cache.py +++ /dev/null @@ -1,65 +0,0 @@ -#!/usr/bin/env python3 -# encoding: utf-8 -# -# ** header v3.0 -# This file is a part of the CaosDB Project. -# -# Copyright (C) 2021 Indiscale GmbH <info@indiscale.com> -# Copyright (C) 2021 Henrik tom Wörden <h.tomwoerden@indiscale.com> -# -# This program is free software: you can redistribute it and/or modify -# it under the terms of the GNU Affero General Public License as -# published by the Free Software Foundation, either version 3 of the -# License, or (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU Affero General Public License for more details. -# -# You should have received a copy of the GNU Affero General Public License -# along with this program. If not, see <https://www.gnu.org/licenses/>. -# -# ** end header -# - - -""" -see class docstring -""" - -from .identifiable import Identifiable -import caosdb as db - - -class IdentifiedCache(object): - """ - This class is like a dictionary where the keys are Identifiables. When you check whether an - Identifiable exists as key this class returns True not only if that exact Python object is - used as a key, but if an Identifiable is used as key that is **equal** to the one being - considered (see __eq__ function of Identifiable). Similarly, if you do `cache[identifiable]` - you get the Record where the key is an Identifiable that is equal to the one in the rectangular - brackets. - - This class is used for Records where we checked the existence in a remote server using - identifiables. If the Record was found, this means that we identified the corresponding Record - in the remote server and the ID of the local object can be set. 
-    To prevent querying the server again and again for the same objects, this cache allows storing
-    Records that were found on a remote server and those that were not (typically in separate
-    caches).
-    """
-
-    def __init__(self):
-        self._cache = {}
-        self._identifiables = []
-
-    def __contains__(self, identifiable: Identifiable):
-        return identifiable in self._identifiables
-
-    def __getitem__(self, identifiable: db.Record):
-        index = self._identifiables.index(identifiable)
-        return self._cache[id(self._identifiables[index])]
-
-    def add(self, record: db.Record, identifiable: Identifiable):
-        self._cache[id(identifiable)] = record
-        self._identifiables.append(identifiable)
diff --git a/src/caoscrawler/scanner.py b/src/caoscrawler/scanner.py
index fdc9462e9bef6756d3a9e1a1af971b1628f3a623..e01ca767e79ad433c2f29ae37547349eea695c74 100644
--- a/src/caoscrawler/scanner.py
+++ b/src/caoscrawler/scanner.py
@@ -38,21 +38,20 @@ import importlib
 import logging
 import os
 import warnings
-import yaml
+from collections.abc import Callable
+from typing import Any, Optional, Type, Union

+import linkahead as db
+import yaml
 from importlib_resources import files
 from jsonschema import validate
-from typing import Any, Optional, Type, Union

-import caosdb as db

 from .converters import Converter
-
+from .debug_tree import DebugTree
 from .stores import GeneralStore, RecordStore
-from .structure_elements import StructureElement, Directory
+from .structure_elements import Directory, StructureElement
 from .version import check_cfood_version

-from .debug_tree import DebugTree
-
 logger = logging.getLogger(__name__)


@@ -112,6 +111,7 @@ def _load_definition_from_yaml_dict(crawler_definitions: list[dict]):
             for key in metadata["Converters"]:
                 schema["cfood"]["$defs"]["converter"]["properties"]["type"]["enum"].append(
                     key)
+        # TODO: We need a similar thing for "Transformers".

     # Validate the cfood schema:
     validate(instance=crawler_definition, schema=schema["cfood"])
@@ -184,6 +184,42 @@ def create_converter_registry(definition: dict):
     return converter_registry


+def create_transformer_registry(definition: dict[str, dict[str, str]]):
+    """
+    Currently the transformer registry is a dictionary containing for each transformer:
+    - the key is the short name under which the transformer can be referenced in a cfood
+    - ``package`` is the name of the module to be imported which must be installed
+    - ``function`` is the name of the transformer function to load and associate with this entry
+
+    All other information for the transformer needs to be included in the transformer plugin
+    directory:
+      schema.yml file
+      README.md documentation
+
+    Please refer to the docstring of function "scanner" for more information about the
+    detailed structure of the transformer functions.
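+
+    An illustrative cfood entry consumed by this function (mirroring the example
+    in the documentation; the names ``transform_foo``, ``some.package`` and
+    ``some_foo`` are placeholders) looks like this:
+
+    .. code-block:: yaml
+
+        Transformers:
+          transform_foo:
+            package: some.package
+            function: some_foo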
+ """ + + # Defaults for the transformer registry: + with open(str(files('caoscrawler').joinpath('default_transformers.yml')), "r") as f: + transformer_def: dict[str, dict[str, str]] = yaml.safe_load(f) + + registry: dict[str, Callable[[Any, dict], Any]] = {} + # More transformers from definition file: + if "Transformers" in definition: + for key, entry in definition["Transformers"].items(): + transformer_def[key] = { + "function": entry["function"], + "package": entry["package"] + } + + # Load modules and associate classes: + for key, value in transformer_def.items(): + module = importlib.import_module(value["package"]) + registry[key] = getattr(module, value["function"]) + return registry + + def initialize_converters(crawler_definition: dict, converter_registry: dict): """ takes the cfood as dict (`crawler_definition`) and creates the converter objects that @@ -202,6 +238,8 @@ def initialize_converters(crawler_definition: dict, converter_registry: dict): continue elif key == "Converters": continue + elif key == "Transformers": + continue converters.append(Converter.converter_factory( value, key, converter_registry)) @@ -220,32 +258,46 @@ def scanner(items: list[StructureElement], converters_path: Optional[list[str]] = None, restricted_path: Optional[list[str]] = None, crawled_data: Optional[list[db.Record]] = None, - debug_tree: Optional[DebugTree] = None) -> list[db.Record]: + debug_tree: Optional[DebugTree] = None, + registered_transformer_functions: Optional[dict] = None) -> list[db.Record]: """Crawl a list of StructureElements and apply any matching converters. -Formerly known as ``_crawl(...)``. + Formerly known as ``_crawl(...)``. -Parameters ----------- + Parameters + ---------- items: list[StructureElement] - structure_elements (e.g. files and folders on one level on the hierarchy) + structure_elements (e.g. files and folders on one level on the hierarchy) converters: list[Converter] - locally defined converters for - treating structure elements. A locally defined converter could be - one that is only valid for a specific subtree of the originally - cralwed StructureElement structure. + locally defined converters for treating structure elements. A locally + defined converter could be one that is only valid for a specific subtree + of the originally cralwed StructureElement structure. general_store, record_store: GeneralStore, RecordStore, optional - This recursion of the crawl function should only operate on - copies of the global stores of the Crawler object. - - restricted_path: list[str], optional - traverse the data tree only along the given path. For example, when a directory contains files - a, b and c, and b is given as ``restricted_path``, a and c will be ignored by the crawler. - When the end of the given path is reached, traverse the full tree as normal. The first element - of the list provided by ``restricted_path`` should be the name of the StructureElement at this - level, i.e. denoting the respective element in the items argument. + This recursion of the crawl function should only operate on copies of + the global stores of the Crawler object. + + restricted_path : list[str], optional + traverse the data tree only along the given path. For example, when a + directory contains files a, b and c, and b is given as ``restricted_path``, a + and c will be ignored by the crawler. When the end of the given path is + reached, traverse the full tree as normal. 
The first element of the list + provided by ``restricted_path`` should be the name of the StructureElement + at this level, i.e. denoting the respective element in the items + argument. + + registered_transformer_functions : dict, optional + A dictionary of transformer functions that can be used in the "transform" block + of a converter and that allows to apply simple transformations to variables extracted + either by the current converter or to other variables found in the current variable store. + + Each function is a dictionary: + + - The key is the name of the function to be looked up in the dictionary of registered transformer functions. + - The value is the function which needs to be of the form: + def func(in_value: Any, in_parameters: dict) -> Any: + pass """ # This path_found variable stores wether the path given by restricted_path was found in the @@ -270,7 +322,7 @@ Parameters converters_path = [] for element in items: - element_path = os.path.join(*(structure_elements_path + [element.get_name()])) + element_path = os.path.join(*(structure_elements_path + [str(element.get_name())])) logger.debug(f"Dealing with {element_path}") for converter in converters: @@ -287,8 +339,13 @@ Parameters general_store_copy[converter.name] = element_path # extracts values from structure element and stores them in the - # variable store + # variable store. converter.create_values(general_store_copy, element) + + # Apply transformers if there are any: + converter.apply_transformers(general_store_copy, + registered_transformer_functions) + keys_modified = converter.create_records( general_store_copy, record_store_copy, element) @@ -320,7 +377,8 @@ Parameters structure_elements_path + [element.get_name()], converters_path + [converter.name], restricted_path[1:] if restricted_path is not None else None, - crawled_data, debug_tree) + crawled_data, debug_tree, + registered_transformer_functions) if restricted_path and not path_found: raise RuntimeError("A 'restricted_path' argument was given that is not contained in " @@ -368,6 +426,9 @@ def scan_directory(dirname: str, crawler_definition_path: str, # Load and register converter packages: converter_registry = create_converter_registry(crawler_definition) + # Load and register transformer functions: + registered_transformer_functions = create_transformer_registry(crawler_definition) + if not dirname: raise ValueError( "You have to provide a non-empty path for crawling.") @@ -388,7 +449,8 @@ def scan_directory(dirname: str, crawler_definition_path: str, crawler_definition, converter_registry, restricted_path=restricted_path, - debug_tree=debug_tree + debug_tree=debug_tree, + registered_transformer_functions=registered_transformer_functions ) @@ -396,7 +458,9 @@ def scan_structure_elements(items: Union[list[StructureElement], StructureElemen crawler_definition: dict, converter_registry: dict, restricted_path: Optional[list[str]] = None, - debug_tree: Optional[DebugTree] = None) -> list[db.Record]: + debug_tree: Optional[DebugTree] = None, + registered_transformer_functions: Optional[dict] = None) -> ( + list[db.Record]): """ Start point of the crawler recursion. @@ -411,9 +475,9 @@ def scan_structure_elements(items: Union[list[StructureElement], StructureElemen A dictionary representing the crawler definition, possibly from a yaml file. restricted_path: list[str], optional - Traverse the data tree only along the given path. When the end of the given path - is reached, traverse the full tree as normal. See docstring of 'scanner' for - more details. 
+ Traverse the data tree only along the given path. When the end of the + given path is reached, traverse the full tree as normal. See docstring + of 'scanner' for more details. Returns ------- @@ -433,5 +497,6 @@ def scan_structure_elements(items: Union[list[StructureElement], StructureElemen items=items, converters=converters, restricted_path=restricted_path, - debug_tree=debug_tree + debug_tree=debug_tree, + registered_transformer_functions=registered_transformer_functions ) diff --git a/src/caoscrawler/transformer_functions.py b/src/caoscrawler/transformer_functions.py new file mode 100644 index 0000000000000000000000000000000000000000..eda9f3c2bc98c8d2561f152f9f6ddd422daee00a --- /dev/null +++ b/src/caoscrawler/transformer_functions.py @@ -0,0 +1,63 @@ +#!/usr/bin/env python3 +# encoding: utf-8 +# +# This file is a part of the LinkAhead Project. +# +# Copyright (C) 2024 Indiscale GmbH <info@indiscale.com> +# Copyright (C) 2024 Henrik tom Wörden <h.tomwoerden@indiscale.com> +# Copyright (C) 2023 Alexander Schlemmer <alexander.schlemmer@ds.mpg.de> +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <https://www.gnu.org/licenses/>. + +""" +Defnition of default transformer functions. +""" +import re +from typing import Any + + +def submatch(in_value: Any, in_parameters: dict): + """ + Substitute the variable if it matches the regexp stored in "match". + + Returns the "in" value if it does NOT match the reg exp of 'match'. + Otherwise (if it matches) the value of 'then' stored in the second argument is returned. + """ + if "match" not in in_parameters or "then" not in in_parameters: + raise RuntimeError("Mandatory parameters missing.") + if re.match(in_parameters["match"], in_value) is not None: + return in_parameters["then"] + return in_value + + +def split(in_value: Any, in_parameters: dict): + """calls the string 'split' function on the first argument and uses the value of the key + 'marker' stored in the second argument + """ + if "marker" not in in_parameters: + raise RuntimeError("Mandatory parameter missing.") + if not isinstance(in_value, str): + raise RuntimeError("must be string") + return in_value.split(in_parameters['marker']) + + +def replace(in_value: Any, in_parameters: dict): + """calls the string 'replace' function on the first argument and uses the value of the keys + 'remove' and 'insert' stored in the second argument + """ + if "remove" not in in_parameters or "insert" not in in_parameters: + raise RuntimeError("Mandatory parameter missing.") + if not isinstance(in_value, str): + raise RuntimeError("must be string") + return in_value.replace(in_parameters['remove'], in_parameters['insert']) diff --git a/src/caoscrawler/utils.py b/src/caoscrawler/utils.py index 61b363099d0892b74e91f257bccb6cc832c3d59f..c62f44eeaa75ca42579aa3d6ead437e901cd38ff 100644 --- a/src/caoscrawler/utils.py +++ b/src/caoscrawler/utils.py @@ -25,7 +25,7 @@ # Some utility functions, e.g. for extending pylib. 
-import caosdb as db
+import linkahead as db


 def has_parent(entity: db.Entity, name: str):
diff --git a/src/doc/README_SETUP.md b/src/doc/README_SETUP.md
index 5f5161d0d672ff3ad14db5c5b49f5c65550b06d7..a75193783d861707adf3b3d45311c392e22626f4 100644
--- a/src/doc/README_SETUP.md
+++ b/src/doc/README_SETUP.md
@@ -4,7 +4,10 @@ see INSTALL.md

 ## Run Unit Tests
-Run `pytest unittests`.
+
+1. Install additional dependencies:
+   - h5py
+2. Run `pytest unittests`.

 ## Documentation ##
 We use sphinx to create the documentation. Docstrings in the code should comply
diff --git a/src/doc/cfood.rst b/src/doc/cfood.rst
index f761493146bebb932f804c70152bbc09acf9d880..07431af0a9fb26e569be5d47f79d6a4f120df269 100644
--- a/src/doc/cfood.rst
+++ b/src/doc/cfood.rst
@@ -179,6 +179,23 @@ in a vairable with the same name (as it is the case for other Records).
     SomeRecord:
       ParameterFile: $fileEntity # creates a reference to the file

+
+Transform Functions
+-------------------
+You can use transform functions to alter variable values that the crawler consumes (e.g. a string
+that was matched with a regular expression). See :doc:`Converter Documentation<converters>`.
+
+You can define your own transform functions by adding them the same way you add custom converters:
+
+.. code-block:: yaml
+
+  Transformers:
+    transform_foo:
+      package: some.package
+      function: some_foo
+
+
+
 Automatically generated keys
 ++++++++++++++++++++++++++++

diff --git a/src/doc/concepts.rst b/src/doc/concepts.rst
index 8fcc27d077fef8dffb375b83698efb2db5979ea9..7100bcd1790edb3e040a1a90663a32a09b7c8eaf 100644
--- a/src/doc/concepts.rst
+++ b/src/doc/concepts.rst
@@ -17,6 +17,8 @@ Relevant sources in:
 - ``src/structure_elements.py``

+.. _ConceptConverters:
+
 Converters
 ++++++++++

@@ -83,7 +85,9 @@ we can check whether a Record with the parent "Project" is referencing the "Expe
 Record. If that is the case, this reference is part of the identifiable for the "Experiment"
 Record. Note, that if there are multiple Records with the appropriate parent (e.g.
 multiple "Project" Records in the above example) it will be required that all of them
-reference the object to be identified.
+reference the object to be identified. You can also use the wildcard "*" as
+RecordType name in the configuration, which will only require that ANY Record
+references the Record at hand.


 Identified Records
diff --git a/src/doc/conf.py b/src/doc/conf.py
index a45097e8849329314b6a9529de7a7bf5e205912c..ac5fc7f5d95c2399f4402121ba9445bc0dbc6aaa 100644
--- a/src/doc/conf.py
+++ b/src/doc/conf.py
@@ -29,14 +29,14 @@ import sphinx_rtd_theme  # noqa: E402

 # -- Project information -----------------------------------------------------

 project = 'caosdb-caoscrawler'
-copyright = '2021, MPIDS'
+copyright = '2024, IndiScale'
 author = 'Alexander Schlemmer'

 # The short X.Y version
-version = '0.6.1'
+version = '0.7.1'
 # The full version, including alpha/beta/rc tags
 # release = '0.5.2-rc2'
-release = '0.6.1-dev'
+release = '0.7.1-dev'


 # -- General configuration ---------------------------------------------------
diff --git a/src/doc/converters.rst b/src/doc/converters.rst
index 0dde61d56229d269682704f1bf1df70d056a11ef..b621ca58c7b95456d8d1ec19e2fd20e5c777137f 100644
--- a/src/doc/converters.rst
+++ b/src/doc/converters.rst
@@ -30,7 +30,7 @@ The yaml definition may look like this:
 TODO: outdated, see cfood-schema.yml

 .. code-block:: yaml
-
+
     <NodeName>:
         type: <ConverterName>
         match: ".*"
@@ -46,7 +46,7 @@ TODO: outdated, see cfood-schema.yml
           - Experiment
     subtree:
         (...)
-
+
 The **<NodeName>** is a description of what it represents (e.g.
 'experiment-folder') and is used as identifier.

@@ -61,6 +61,54 @@ to generate records (see :py:meth:`~caoscrawler.converters.Converter.create_reco
 **subtree** makes the yaml recursive: It contains a list of new Converter
 definitions, which work on the StructureElements that are returned by the
 current Converter.

+Transform Functions
++++++++++++++++++++
+Often the situation arises that you cannot use a value as it is found. Maybe a value should be
+increased by an offset or a string should be split into a list of pieces. In order to allow such
+simple conversions, transform functions can be named in the converter definition that are then
+applied to the respective variables when the converter is executed.
+
+.. code-block:: yaml
+
+    <NodeName>:
+        type: <ConverterName>
+        match: ".*"
+        transform:
+          <TransformNodeName>:
+            in: $<in_var_name>
+            out: $<out_var_name>
+            functions:
+            - <func_name>:                      # name of the function to be applied
+                <func_arg1>: <func_arg1_value>  # key value pairs that are passed as parameters
+                <func_arg2>: <func_arg2_value>
+                # ...
+
+An example that splits the variable ``a`` and puts the generated list in ``b`` is the following:
+
+.. code-block:: yaml
+
+    Experiment:
+        type: Dict
+        match: ".*"
+        transform:
+          param_split:
+            in: $a
+            out: $b
+            functions:
+            - split:         # split is a function that is defined by default
+                marker: "|"  # its only parameter is the marker that is used to split the string
+        records:
+          Report:
+            tags: $b
+
+This splits the string in '$a' and stores the resulting list in '$b'. Here it is used to add a
+list-valued property to the Report Record.
+
+
+There are a number of transform functions that are defined by default (see
+``src/caoscrawler/default_transformers.yml``). You can define custom transform functions by adding
+them to the cfood definition (see :doc:`CFood Documentation<cfood>`).
+

 Standard Converters
 +++++++++++++++++++
@@ -156,7 +204,7 @@ column names to values of the respective cell.

 Example:

 .. code-block:: yaml
-
+
    subtree:
      TABLE:  # Any name for the table as a whole
        type: CSVTableConverter
@@ -189,6 +237,105 @@ CSVTableConverter

 CSV File → DictElement

+Further converters
+++++++++++++++++++
+
+More converters, together with cfood definitions and examples can be found in
+the `LinkAhead Crawler Extensions Subgroup
+<https://gitlab.com/linkahead/crawler-extensions>`_ on gitlab. In the following,
+we list converters that are shipped with the crawler library itself but are not
+part of the set of standard converters and may require this library to be
+installed with additional optional dependencies.
+
+HDF5 Converters
+===============
+
+For treating `HDF5 Files
+<https://docs.hdfgroup.org/hdf5/develop/_s_p_e_c.html>`_, there are in total
+four individual converters corresponding to the internal structure of HDF5 files:
+the :ref:`H5FileConverter` which opens the file itself and creates further
+structure elements from HDF5 groups, datasets, and included multi-dimensional
+arrays that are in turn treated by the :ref:`H5GroupConverter`, the
+:ref:`H5DatasetConverter`, and the :ref:`H5NdarrayConverter`, respectively. You
+need to install the LinkAhead crawler with its optional ``h5-crawler`` dependency
+for using these converters.
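+
+A minimal sketch of how these converters could be registered and nested in a
+cfood (all node names and converter type keys below are illustrative, not
+prescribed; the ``Converters`` registration follows the same pattern as for
+custom converters):
+
+.. code-block:: yaml
+
+   Converters:
+     H5File:
+       package: caoscrawler.hdf5_converter
+       converter: H5FileConverter
+     H5Group:
+       package: caoscrawler.hdf5_converter
+       converter: H5GroupConverter
+     H5Dataset:
+       package: caoscrawler.hdf5_converter
+       converter: H5DatasetConverter
+     H5Ndarray:
+       package: caoscrawler.hdf5_converter
+       converter: H5NdarrayConverter
+
+   ExampleH5File:
+     type: H5File
+     match: .*\.(h5|hdf5)$
+     subtree:
+       ExampleGroup:
+         type: H5Group
+         match: .*
+         subtree:
+           ExampleDataset:
+             type: H5Dataset
+             match: .*
+             subtree:
+               ExampleNdarray:
+                 type: H5Ndarray
+                 match: .*
+                 recordname: this_ndarray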
+
+The basic idea when crawling HDF5 files is to treat them very similarly to
+:ref:`dictionaries <DictElement Converter>` in which the attributes on root,
+group, or dataset level are essentially treated like ``BooleanElement``,
+``TextElement``, ``FloatElement``, and ``IntegerElement`` in a dictionary: They
+are appended as children and can be accessed via the ``subtree``. The file
+itself and the groups within may contain further groups and datasets, which can
+have their own attributes, subgroups, and datasets, very much like
+``DictElements`` within a dictionary. The main difference to any other
+dictionary type is the presence of multi-dimensional arrays within HDF5
+datasets. Since LinkAhead doesn't have any datatype corresponding to these, and
+since it isn't desirable to store these arrays directly within LinkAhead for
+reasons of performance and of searchability, we wrap them within a specific
+Record as explained :ref:`below <H5NdarrayConverter>`, together with more
+metadata and their internal path within the HDF5 file. Users can thus query for
+datasets and their arrays according to their metadata within LinkAhead and then
+use the internal path information to access the dataset within the file
+directly. The type of this record and the property for storing the internal path
+need to be reflected in the datamodel. Using the default names, you would need a
+datamodel like
+
+.. code-block:: yaml
+
+   H5Ndarray:
+     obligatory_properties:
+       internal_hdf5_path:
+         datatype: TEXT
+
+although the names of both property and record type can be configured within the
+cfood definition.
+
+A simple example of a cfood definition for HDF5 files can be found in the `unit
+tests
+<https://gitlab.com/linkahead/linkahead-crawler/-/blob/main/unittests/h5_cfood.yml?ref_type=heads>`_
+and shows how the individual converters are used in order to crawl a `simple
+example file
+<https://gitlab.com/linkahead/linkahead-crawler/-/blob/main/unittests/hdf5_dummy_file.hdf5?ref_type=heads>`_
+containing groups, subgroups, and datasets, together with their respective
+attributes.
+
+H5FileConverter
+---------------
+
+This is an extension of the
+:py:class:`~caoscrawler.converters.SimpleFileConverter` class. It opens the HDF5
+file and creates children for any contained group or dataset. Additionally, the
+root-level attributes of the HDF5 file are accessible as children.
+
+H5GroupConverter
+----------------
+
+This is an extension of the
+:py:class:`~caoscrawler.converters.DictElementConverter` class. Children are
+created for all subgroups and datasets in this HDF5 group. Additionally, the
+group-level attributes are accessible as children.
+
+H5DatasetConverter
+------------------
+
+This is an extension of the
+:py:class:`~caoscrawler.converters.DictElementConverter` class. Most
+importantly, it stores the array data contained in the HDF5 dataset into
+:py:class:`~caoscrawler.hdf5_converter.H5NdarrayElement` which is added to its
+children, as well as the dataset attributes.
+
+H5NdarrayConverter
+------------------
+
+This converter creates a wrapper record for the contained dataset. The name of
+this record needs to be specified in the cfood definition of this converter via
+the ``recordname`` option. The RecordType of this record can be configured with
+the ``array_recordtype_name`` option and defaults to ``H5Ndarray``. Via the
+given ``recordname``, this record can be used within the cfood.
+Most importantly, this record stores the internal path of this array within the
+HDF5 file in a text property, the name of which can be configured with the
+``internal_path_property_name`` option, which defaults to ``internal_hdf5_path``.
+
 Custom Converters
 +++++++++++++++++
@@ -231,10 +378,10 @@ The following methods are abstract and need to be overwritten by your custom con
  - :py:meth:`~caoscrawler.converters.Converter.match`
  - :py:meth:`~caoscrawler.converters.Converter.typecheck`
-
+
 Example
 =======
-
+
 In the following, we will explain the process of adding a custom converter to a yaml file using
 a SourceResolver that is able to attach a source element to another entity.
@@ -265,50 +412,50 @@ Furthermore we will customize the method :py:meth:`~caoscrawler.converters.Conve
 number of records can be generated by the yaml definition. So for any applications - like here -
 that require an arbitrary number of records to be created, a customized implementation of
 :py:meth:`~caoscrawler.converters.Converter.create_records` is recommended.
 In this context it is recommended to make use of the function
 :func:`caoscrawler.converters.create_records` that implements the creation of record objects
 from python dictionaries of the same structure
 that would be given using a yaml definition (see next section below).
-
+
 .. code-block:: python
 
     import re
     from caoscrawler.stores import GeneralStore, RecordStore
     from caoscrawler.converters import TextElementConverter, create_records
     from caoscrawler.structure_elements import StructureElement, TextElement
-
+
     class SourceResolver(TextElementConverter):
         """
        This resolver uses a source list element (e.g. from the markdown readme file)
        to link sources correctly.
        """
-
+
         def __init__(self, definition: dict, name: str,
                      converter_registry: dict):
             """
            Initialize a new SourceResolver converter.
            """
             super().__init__(definition, name, converter_registry)
-
+
         def create_children(self, generalStore: GeneralStore,
                             element: StructureElement):
-
+
             # The source resolver does not create children:
-
+
             return []
-
+
         def create_records(self, values: GeneralStore,
                            records: RecordStore,
                            element: StructureElement,
                            file_path_prefix):
             if not isinstance(element, TextElement):
                 raise RuntimeError()
-
+
             # This function must return a list containing tuples, each one for a modified
             # property: (name_of_entity, name_of_property)
             keys_modified = []
-
+
             # This is the name of the entity where the source is going to be attached:
             attach_to_scientific_activity = self.definition["scientific_activity"]
             rec = records[attach_to_scientific_activity]
-
+
             # The "source" is a path to a source project, so it should have the form:
             # /<Category>/<project>/<scientific_activity>/
             # Obtain this information from the structure element:
@@ -316,18 +463,18 @@ that would be given using a yaml definition (see next section below).
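+            # For illustration (hypothetical values, not part of the original
+            # example): a source string such as
+            #   /ExperimentalData/2020_SpeedOfLight/2020-01-01_TimeOfFlight/
+            # would yield category='ExperimentalData', project_date='2020',
+            # project_identifier='SpeedOfLight', date='2020-01-01' and
+            # identifier='TimeOfFlight' from the regular expression below.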
            regexp = (r'/(?P<category>(SimulationData)|(ExperimentalData)|(DataAnalysis))'
                      '/(?P<project_date>.*?)_(?P<project_identifier>.*)'
                      '/(?P<date>[0-9]{4,4}-[0-9]{2,2}-[0-9]{2,2})(_(?P<identifier>.*))?/')
-
+
            res = re.match(regexp, val)
            if res is None:
                raise RuntimeError("Source cannot be parsed correctly.")
-
+
            # Mapping of categories on the file system to corresponding record types in CaosDB:
            cat_map = {
                "SimulationData": "Simulation",
                "ExperimentalData": "Experiment",
                "DataAnalysis": "DataAnalysis"}
            linkrt = cat_map[res.group("category")]
-
+
            keys_modified.extend(create_records(values, records, {
                "Project": {
                    "date": res.group("project_date"),
@@ -341,7 +488,7 @@ that would be given using a yaml definition (see next section below).
                attach_to_scientific_activity: {
                    "sources": "+$" + linkrt
                }}, file_path_prefix))
-
+
            # Process the records section of the yaml definition:
            keys_modified.extend(
                super().create_records(values, records, element, file_path_prefix))
@@ -354,7 +501,7 @@ that would be given using a yaml definition (see next section below).
 If the recommended (python) package structure is used, the package containing the converter
 definition can just be installed using `pip install .` or `pip install -e .` from the
 `scifolder_package` directory.
-
+
 The following yaml block will register the converter in a yaml file:
 
 .. code-block:: yaml
@@ -364,7 +511,7 @@ The following yaml block will register the converter in a yaml file:
       package: scifolder.converters.sources
       converter: SourceResolver
-
+
 Using the `create_records` API function
 =======================================
@@ -402,7 +549,7 @@ Let's formulate that using `create_records`:
 
 .. code-block:: python
 
   dir_name = "directory name"
-
+
   record_def = {
     "Experiment": {
       "identifier": dir_name
@@ -478,7 +625,7 @@ Let's have a look at a more complex example, defining multiple records:
             Project: $Project
           ProjectGroup:
             projects: +$Project
-
+
 This block will create two new Records:
@@ -494,7 +641,7 @@ Let's formulate that using `create_records` (again, `dir_name` is constant here)
 
 .. code-block:: python
 
   dir_name = "directory name"
-
+
   record_def = {
     "Project": {
       "identifier": "project_name",
@@ -506,7 +653,7 @@ Let's formulate that using `create_records` (again, `dir_name` is constant here)
     "ProjectGroup": {
       "projects": "+$Project",
     }
-
+
   }
 
   keys_modified = create_records(values,
                                  records,
diff --git a/src/doc/index.rst b/src/doc/index.rst
index 20f335f7885971b65caf91dfe723f867e46b8595..8a02ec62e50308a28899e71b4664f626dfa0c27b 100644
--- a/src/doc/index.rst
+++ b/src/doc/index.rst
@@ -15,6 +15,9 @@ CaosDB-Crawler Documentation
    Macros<macros>
    How to upgrade<how-to-upgrade>
    API documentation<_apidoc/modules>
+   Related Projects<related_projects/index>
+
+   Back to Overview <https://docs.indiscale.com/>
 
diff --git a/src/doc/macros.rst b/src/doc/macros.rst
index 0bac6c7c561ae88c49c686ff61d97a5a15abe666..d093d9b69f5d2c14b5bfbb2fe292545fc7943ca7 100644
--- a/src/doc/macros.rst
+++ b/src/doc/macros.rst
@@ -193,6 +193,9 @@ The example will be expanded to:
       a: '5'
 
+Note that you need to make sure that subsequent macro calls do not use the
+same top-level key, since later expansions would overwrite previous ones.
+Here, we used ``$macro_name`` to prevent that.
 
 Limitation
@@ -232,3 +235,43 @@ positions. Consider:
 
 However, this should not be a real limitation, as the crawler is designed in a
 way that the order of the nodes on the same level should not matter.
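+
+Relating to the note on unique top-level keys above, a parameter can be
+incorporated into the top-level key of the macro definition, just like
+``macro_sub_$a`` in the next section, so that every call expands to a distinct
+key. A sketch (not taken from the unit tests; the names are invented for
+illustration):
+
+.. code-block:: yaml
+
+   - !defmacro
+     name: day_macro
+     params:
+       day: Mon
+     definition:
+       folder_$day:       # the parameter makes the top-level key unique per call
+         weekday: $day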
+
+
+Using macros within macro definitions
+=====================================
+
+It is possible to use other macros in macro definitions. Again, examples can be found in
+the macro unit tests (see e.g. :func:`unittests.test_macros.test_macros_in_macros`):
+
+.. _example_macros_in_macros:
+.. code-block:: yaml
+
+   ---
+   metadata:
+     crawler-version: 0.3.1
+     macros:
+     - !defmacro
+       name: one_macro
+       params:
+         a: 25
+       definition:
+         macro_sub_$a:
+           b: $a
+           another_param: 3
+     - !defmacro
+       name: test_macrodef
+       params: {}
+       definition:
+         macro_top: !macro
+           one_macro:
+           - a: 17
+           - {}
+           - a: 98
+           not_macro:
+             a: 26
+   ---
+   extroot: !macro
+     test_macrodef:
+
+TODO:
+to be continued
diff --git a/src/doc/related_projects/index.rst b/src/doc/related_projects/index.rst
new file mode 100644
index 0000000000000000000000000000000000000000..1dc8f41d6b81cae202bd46e7945ca05c682e479f
--- /dev/null
+++ b/src/doc/related_projects/index.rst
@@ -0,0 +1,25 @@
+Related Projects
+++++++++++++++++
+
+.. toctree::
+   :maxdepth: 2
+   :caption: Contents:
+   :hidden:
+
+.. container:: projects
+
+   For in-depth documentation for users, administrators and developers, you may want to visit the subproject-specific documentation pages for:
+
+   :`Server <https://docs.indiscale.com/caosdb-server>`_: The Java part of the LinkAhead server.
+
+   :`MySQL backend <https://docs.indiscale.com/caosdb-mysqlbackend>`_: The MySQL/MariaDB components of the LinkAhead server.
+
+   :`WebUI <https://docs.indiscale.com/caosdb-webui>`_: The default web frontend for the LinkAhead server.
+
+   :`PyLinkAhead <https://docs.indiscale.com/caosdb-pylib>`_: The LinkAhead Python library.
+
+   :`Advanced user tools <https://docs.indiscale.com/caosdb-advanced-user-tools>`_: The advanced Python tools for LinkAhead.
+
+   :`LinkAhead <https://docs.indiscale.com/caosdb-deploy>`_: Your all-inclusive LinkAhead software package.
+
+   :`Back to Overview <https://docs.indiscale.com/>`_: LinkAhead Documentation.
diff --git a/tox.ini b/tox.ini
index a7d4465ed36f0fe5e49c06721d3e3a0cdf453fa0..03e02ebeff196430129e10c4c0d853ca77c47302 100644
--- a/tox.ini
+++ b/tox.ini
@@ -6,6 +6,7 @@ skip_missing_interpreters = true
 deps = .
     pytest
     pytest-cov
+    h5py
     # TODO: Make this f-branch sensitive
     git+https://gitlab.indiscale.com/caosdb/src/caosdb-pylib.git@dev
     git+https://gitlab.indiscale.com/caosdb/src/caosdb-advanced-user-tools.git@dev
diff --git a/unittests/h5_cfood.yml b/unittests/h5_cfood.yml
new file mode 100644
index 0000000000000000000000000000000000000000..f688de6a2171da6533626449b030bcd95a43b37b
--- /dev/null
+++ b/unittests/h5_cfood.yml
@@ -0,0 +1,69 @@
+---
+metadata:
+  crawler-version: 0.6.1
+---
+Converters:
+  H5Dataset:
+    converter: H5DatasetConverter
+    package: caoscrawler.hdf5_converter
+  H5File:
+    converter: H5FileConverter
+    package: caoscrawler.hdf5_converter
+  H5Group:
+    converter: H5GroupConverter
+    package: caoscrawler.hdf5_converter
+  H5Ndarray:
+    converter: H5NdarrayConverter
+    package: caoscrawler.hdf5_converter
+# Top-level, we have just the HDF5 file.
+ParentDirectory:
+  type: Directory
+  match: (.*)
+  subtree:
+    H5FileElement:
+      type: H5File
+      match: (.*)\.(hdf5|h5)$
+      records:
+        H5File:
+          parents:
+          - H5File
+          role: File
+          path: $H5FileElement
+          file: $H5FileElement
+      subtree:
+        # Here, we have the groups, the top-level dataset, and possible
+        # attributes (empty for now).
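+        # Comments below added for clarity: the dataset is handled by the
+        # H5Dataset converter registered above; subgroups could be matched
+        # analogously with the registered H5Group converter.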
+ RootIntegerElement: + type: H5Dataset + match_name: ^root_integers$ + records: + H5Dataset: + parents: + - H5Dataset + H5File: + H5Dataset: +$H5Dataset + subtree: + # included NDArray in this dataset + TopLevelIntNDElement: + type: H5Ndarray + match_name: (.*) + recordname: this + records: + # this: + # ContainingFile: $H5File + H5Dataset: + Ndarray: $this + # There is one more list-valued attribute to this dataset. + TopLevelDataAttribute: + type: ListElement + match_name: ^attr_data_root$ + subtree: + AttributeListEntry: + type: FloatElement + match_name: (.*) + match_value: (?P<value>.*) + records: + H5Dataset: + attr_data_root: +$value + + diff --git a/unittests/hdf5_dummy_file.hdf5 b/unittests/hdf5_dummy_file.hdf5 new file mode 100644 index 0000000000000000000000000000000000000000..41bfb7ab3bcac19d90fd4f018cdd8118ae806eaf Binary files /dev/null and b/unittests/hdf5_dummy_file.hdf5 differ diff --git a/unittests/test_converters.py b/unittests/test_converters.py index ab5710feaaf14babc3fed65f10598250e53ffd9b..52ece13dc2269a3e3b16e6378166e91b084f4a7c 100644 --- a/unittests/test_converters.py +++ b/unittests/test_converters.py @@ -39,17 +39,20 @@ from caoscrawler.converters import (Converter, ConverterValidationError, DictIntegerElementConverter, DirectoryConverter, FloatElementConverter, IntegerElementConverter, JSONFileConverter, + ListElementConverter, MarkdownFileConverter, YAMLFileConverter, _AbstractScalarValueElementConverter, - handle_value) + handle_value, replace_variables) from caoscrawler.crawl import Crawler from caoscrawler.scanner import (_load_definition_from_yaml_dict, - create_converter_registry, load_definition) + create_converter_registry, + create_transformer_registry, load_definition) from caoscrawler.stores import GeneralStore from caoscrawler.structure_elements import (BooleanElement, DictElement, Directory, File, FloatElement, IntegerElement, ListElement, TextElement) +from caoscrawler.transformer_functions import replace, split UNITTESTDIR = Path(__file__).parent @@ -166,7 +169,7 @@ def test_markdown_converter(converter_registry): test_readme2 = File( "README.md", - UNITTESTDIR/"test_directories" / "examples_article" / + UNITTESTDIR / "test_directories" / "examples_article" / "ExperimentalData" / "2020_SpeedOfLight" / "2020-01-01_TimeOfFlight" / "README.md" ) @@ -244,7 +247,7 @@ def test_json_converter(converter_registry): invalid_json = File( "invalidjson.json", - UNITTESTDIR/"test_directories" / "examples_json" / "invalidjson.json" + UNITTESTDIR / "test_directories" / "examples_json" / "invalidjson.json" ) # Doesn't validate because of missing required 'name' property with pytest.raises(ConverterValidationError) as err: @@ -253,7 +256,7 @@ def test_json_converter(converter_registry): broken_json = File( "brokenjson.json", - UNITTESTDIR/"test_directories" / "examples_json" / "brokenjson.json" + UNITTESTDIR / "test_directories" / "examples_json" / "brokenjson.json" ) with pytest.raises(json.decoder.JSONDecodeError) as err: jsonconverter.create_children(None, broken_json) @@ -318,7 +321,7 @@ def test_yaml_converter(converter_registry): invalid_yaml = File( "invalidyaml.yml", - UNITTESTDIR/"test_directories" / "test_yamls" / "invalidyaml.yml" + UNITTESTDIR / "test_directories" / "test_yamls" / "invalidyaml.yml" ) # Doesn't validate because of missing required 'name' property @@ -328,7 +331,7 @@ def test_yaml_converter(converter_registry): broken_yaml = File( "brokenyaml.yml", - UNITTESTDIR/"test_directories" / "test_yamls" / "brokenyaml.yml" + UNITTESTDIR / 
"test_directories" / "test_yamls" / "brokenyaml.yml" ) with pytest.raises(yaml.parser.ParserError) as err: yamlconverter.create_children(None, broken_yaml) @@ -339,6 +342,12 @@ def test_variable_replacement(): values["a"] = 4 values["b"] = "68" + # basic values stay unchanged + assert replace_variables(5, values) is 5 + assert replace_variables(True, values) is True + assert replace_variables("$a", values) is 4 + assert replace_variables("${b}", values) == "68" + assert handle_value("b", values) == ("b", "single") assert handle_value("+b", values) == ("b", "list") assert handle_value("*b", values) == ("b", "multiproperty") @@ -360,7 +369,36 @@ def test_variable_replacement(): "collection_mode": "multiproperty"}, values) == ("68", "multiproperty") assert handle_value(["a", "b"], values) == (["a", "b"], "single") - assert handle_value(["$a", "$b"], values) == (["4", "68"], "single") + assert handle_value(["$a", "$b"], values) == ([4, "68"], "single") + + +def test_apply_transformers(converter_registry): + cfood_def = {"type": 'ListElement', "debug_match": True, "match_name": ".*", + 'transform': {'test': {'in': '$a', 'out': '$b', 'functions': [{ + 'split': {'marker': '|'}}]}}} + values = GeneralStore() + values["a"] = "a|b|c" + + # transformer_functions = create_transformer_registry(crawler_definition) + transformer_functions = {"split": split} + + conv = ListElementConverter(definition=cfood_def, name='test', + converter_registry=converter_registry) + + assert values['a'] is "a|b|c" + conv.apply_transformers(values, transformer_functions) + assert values['a'] is "a|b|c" + assert values['b'] == ["a", "b", "c"] + + # Check replacing of existing variable + cfood_def = {"type": 'ListElement', "debug_match": True, "match_name": ".*", + 'transform': {'test': {'in': '$a', 'out': '$a', 'functions': [{ + 'split': {'marker': '|'}}]}}} + conv = ListElementConverter(definition=cfood_def, name='test', + converter_registry=converter_registry) + + conv.apply_transformers(values, transformer_functions) + assert values['a'] == ["a", "b", "c"] def test_filter_children_of_directory(converter_registry, capsys): diff --git a/unittests/test_crawler.py b/unittests/test_crawler.py index 91e0e86a6d6cf2967ab3567a2ef93b7ccde56e64..e026899201ae0dfa937053d315854e5fbb8a351a 100644 --- a/unittests/test_crawler.py +++ b/unittests/test_crawler.py @@ -33,12 +33,15 @@ from os.path import basename, dirname, join from pathlib import Path from unittest.mock import MagicMock, Mock, patch -import caosdb as db -import caosdb.common.models as dbmodels +import caoscrawler +import linkahead as db +import linkahead.common.models as dbmodels import pytest import yaml -from caoscrawler.crawl import (Crawler, SecurityMode, _treat_deprecated_prefix, - crawler_main, split_restricted_path) +from caosadvancedtools.models.parser import parse_model_from_string +from caoscrawler.crawl import (Crawler, SecurityMode, TreatedRecordLookUp, + _treat_deprecated_prefix, crawler_main, + split_restricted_path) from caoscrawler.debug_tree import DebugTree from caoscrawler.identifiable import Identifiable from caoscrawler.identifiable_adapters import (CaosDBIdentifiableAdapter, @@ -49,9 +52,9 @@ from caoscrawler.scanner import (create_converter_registry, scan_directory, from caoscrawler.stores import GeneralStore, RecordStore from caoscrawler.structure_elements import (DictElement, DictListElement, DictTextElement, File) -from caosdb.apiutils import compare_entities -from caosdb.cached import cache_clear -from caosdb.exceptions import 
EmptyUniqueQueryError +from linkahead.apiutils import compare_entities +from linkahead.cached import cache_clear +from linkahead.exceptions import EmptyUniqueQueryError from pytest import raises UNITTESTDIR = Path(__file__).parent @@ -107,6 +110,47 @@ def mock_get_entity_by(eid=None, name=None, path=None): raise EmptyUniqueQueryError("") +def mock_retrieve_record(identifiable: Identifiable): + """ assumes that the identifiable is always only the date""" + + for record in EXAMPLE_SERVER_STATE: + if (record.role == "Record" and "date" in identifiable.properties + and record.get_property("date").value == identifiable.properties['date']): + return record + return None + + +def mock_cached_only_rt(query_string: str): + """Always return an empty Container""" + result = db.Container() + lo_query = query_string.lower() + if lo_query.startswith("find record ") or lo_query.startswith("find file "): + return result + model = parse_model_from_string(""" +B: + obligatory_properties: + C: + obligatory_properties: + prop_other: + datatype: INTEGER + prop_ident: + datatype: INTEGER +A: + obligatory_properties: + B: + datatype: LIST<B> + prop_ident: +""") + if query_string == "FIND RECORDTYPE 'A'": + model.get_deep("A").id = 1 + return result + [model.get_deep("A")] + if query_string == "FIND RECORDTYPE 'B'": + model.get_deep("A").id = 2 + return result + [model.get_deep("B")] + print(query_string) + raise NotImplementedError("Mock for this case is missing") + + @pytest.fixture(autouse=True) def clear_cache(): cache_clear() @@ -214,6 +258,13 @@ def test_split_into_inserts_and_updates_trivial(): crawler.split_into_inserts_and_updates([]) +def test_split_into_inserts_and_updates_unidentified(): + crawler = Crawler() + with raises(ValueError) as err: + crawler.split_into_inserts_and_updates([db.Record(name="recname").add_parent("someparent")]) + assert str(err.value).startswith("There is no identifying information.") + + def basic_retrieve_by_name_mock_up(rec, referencing_entities=None, known=None): """ returns a stored Record if rec.name is an existing key, None otherwise """ if rec.name in known: @@ -246,8 +297,8 @@ def test_split_into_inserts_and_updates_single(crawler_mocked_identifiable_retri entlist = [db.Record(name="A").add_parent( "C"), db.Record(name="B").add_parent("C")] - assert crawler.get_from_any_cache(identlist[0]) is None - assert crawler.get_from_any_cache(identlist[1]) is None + assert crawler.treated_records_lookup.get_any(entlist[0], identlist[0]) is None + assert crawler.treated_records_lookup.get_any(entlist[0], identlist[0]) is None assert not crawler._has_reference_value_without_id(identlist[0]) assert not crawler._has_reference_value_without_id(identlist[1]) assert crawler.identifiableAdapter.retrieve_identified_record_for_record( @@ -354,6 +405,84 @@ def test_split_into_inserts_and_updates_with_copy_attr(crawler_mocked_identifiab crawler.identifiableAdapter.retrieve_identified_record_for_identifiable.assert_called() +@pytest.mark.xfail(reason="https://gitlab.com/linkahead/linkahead-crawler/-/issues/88") +@patch("caoscrawler.identifiable_adapters.cached_query", + new=Mock(side_effect=mock_cached_only_rt)) +def test_split_iiau_with_unmergeable_list_items(): + """Test for meaningful exception when referencing a list of unmergeable entities. 
+ +Datamodel +--------- +A: + B: LIST<B> + prop_ident: INTEGER + +B: + prop_ident: + C: + +C: + prop_other: INTEGER + +Identifiables +------------- + +id_A: [prop_ident] +id_B: [prop_ident, "is_referenced_by: A"] + +Data +---- + +b1: ("same", 23) +b2: ("same", 42) + +a: ([b1, b2]) + """ + prop_ident = db.Property("prop_ident", datatype=db.INTEGER) + prop_other = db.Property("prop_ident", datatype=db.INTEGER) + rt_c = db.RecordType("C").add_property(prop_other) + # Somehow it is necessary that `B` has a reference property. Dunno if C must have an + # identifiable as well. + rt_b = db.RecordType("B").add_property(prop_ident).add_property("C") + rt_a = db.RecordType("A").add_property(prop_ident).add_property("LIST<B>") + + ident_a = db.RecordType().add_parent("A").add_property("prop_ident") + ident_b = db.RecordType().add_parent("B").add_property("prop_ident").add_property( + "is_referenced_by", value="A") + ident_c = db.RecordType().add_parent("C").add_property("prop_other").add_property( + "is_referenced_by", value="B") + + rec_a = db.Record("a").add_parent(rt_a).add_property("prop_ident", value=1234) + rec_b = [] + rec_c = [] + for value in [23, 42]: + new_c = db.Record().add_parent(rt_c).add_property("prop_other", value=value) + rec_c.append(new_c) + rec_b.append(db.Record().add_parent(rt_b).add_property( + "prop_ident", value=2020).add_property("C", value=new_c)) + rec_a.add_property("B", rec_b) + + ident_adapter = CaosDBIdentifiableAdapter() + ident_adapter.register_identifiable("A", ident_a) + ident_adapter.register_identifiable("B", ident_b) + ident_adapter.register_identifiable("C", ident_c) + + crawler = Crawler(identifiableAdapter=ident_adapter) + + # This should give a merge conflict, and not + # "Could not find referencing entities of type(s): A" + + # from IPython import embed; embed() + with raises(RuntimeError) as rte: + crawler.synchronize(commit_changes=False, + crawled_data=[rec_a, *rec_b, *rec_c]) + assert not isinstance(rte.value, NotImplementedError), \ + "Exception must not be NotImplementedError, but plain RuntimeError." 
+ assert "Could not find referencing entities" not in rte.value.args[0] + assert "merging impossible" in rte.something + # crawler.split_into_inserts_and_updates(ent_list=[rec_a, *rec_b, *rec_c]) + + def test_has_missing_object_in_references(): crawler = Crawler() # Simulate remote server content by using the names to identify records @@ -367,36 +496,40 @@ def test_has_missing_object_in_references(): # one reference with id -> check assert not crawler._has_missing_object_in_references( - Identifiable(name="C", record_type="RTC", properties={'d': 123}), []) + Identifiable(name="C", record_type="RTC", properties={'d': 123}), {}) # one ref with Entity with id -> check + rec = db.Record(id=123).add_parent("C") assert not crawler._has_missing_object_in_references( - Identifiable(name="C", record_type="RTC", properties={'d': db.Record(id=123) - .add_parent("C")}), []) + Identifiable(name="C", record_type="RTC", properties={'d': rec}), {id(rec): {'C': [None]}}) # one ref with id one with Entity with id (mixed) -> check + rec = db.Record(id=123).add_parent("RTC") assert not crawler._has_missing_object_in_references( Identifiable(name="C", record_type="RTD", - properties={'d': 123, 'b': db.Record(id=123).add_parent("RTC")}), []) + properties={'d': 123, 'b': rec}), {id(rec): {'C': [None]}}) # entity to be referenced in the following a = db.Record(name="C").add_parent("C").add_property("d", 12311) # one ref with id one with Entity without id (but not identifying) -> fail assert not crawler._has_missing_object_in_references( - Identifiable(name="C", record_type="RTC", properties={'d': 123, 'e': a}), []) + Identifiable(name="C", record_type="RTC", properties={'d': 123, 'e': a}), + {id(a): {'C': [None]}}) # one ref with id one with Entity without id (mixed) -> fail assert not crawler._has_missing_object_in_references( - Identifiable(name="D", record_type="RTD", properties={'d': 123, 'e': a}), []) + Identifiable(name="D", record_type="RTD", properties={'d': 123, 'e': a}), + {id(a): {'C': [None]}}) - crawler.add_to_remote_missing_cache(a, Identifiable(name="C", record_type="RTC", - properties={'d': 12311})) + crawler.treated_records_lookup.add(a, Identifiable(name="C", record_type="RTC", + properties={'d': 12311})) # one ref with id one with Entity without id but in cache -> check assert crawler._has_missing_object_in_references( - Identifiable(name="D", record_type="RTD", properties={'d': 123, 'e': a}), []) + Identifiable(name="D", record_type="RTD", properties={'d': 123, 'e': a}), + {id(a): {'C': [None]}}) # if this ever fails, the mock up may be removed crawler.identifiableAdapter.get_registered_identifiable.assert_called() -@pytest.mark.xfail() +@ pytest.mark.xfail() def test_references_entities_without_ids(): crawler = Crawler() assert not crawler._has_reference_value_without_id(db.Record().add_parent("Person") @@ -436,25 +569,15 @@ def reset_mocks(mocks): mock.reset_mock() -def mock_retrieve_record(identifiable: Identifiable): - """ assumes that the identifiable is always only the date""" - - for record in EXAMPLE_SERVER_STATE: - if (record.role == "Record" - and record.get_property("date").value == identifiable.properties['date']): - return record - return None - - -@patch("caoscrawler.crawl.cached_get_entity_by", - new=Mock(side_effect=mock_get_entity_by)) -@patch("caoscrawler.identifiable_adapters.cached_get_entity_by", - new=Mock(side_effect=mock_get_entity_by)) -@patch("caoscrawler.identifiable_adapters.CaosDBIdentifiableAdapter." 
- "retrieve_identified_record_for_identifiable", - new=Mock(side_effect=mock_retrieve_record)) -@patch("caoscrawler.crawl.db.Container.insert") -@patch("caoscrawler.crawl.db.Container.update") +@ patch("caoscrawler.crawl.cached_get_entity_by", + new=Mock(side_effect=mock_get_entity_by)) +@ patch("caoscrawler.identifiable_adapters.cached_get_entity_by", + new=Mock(side_effect=mock_get_entity_by)) +@ patch("caoscrawler.identifiable_adapters.CaosDBIdentifiableAdapter." + "retrieve_identified_record_for_identifiable", + new=Mock(side_effect=mock_retrieve_record)) +@ patch("caoscrawler.crawl.db.Container.insert") +@ patch("caoscrawler.crawl.db.Container.update") def test_synchronization_no_commit(upmock, insmock): crawled_data = [r.copy() for r in EXAMPLE_SERVER_STATE if r.role == "Record"] # change one; add one @@ -471,16 +594,16 @@ def test_synchronization_no_commit(upmock, insmock): assert len(ups) == 1 -@patch("caoscrawler.crawl.cached_get_entity_by", - new=Mock(side_effect=mock_get_entity_by)) -@patch("caoscrawler.identifiable_adapters.cached_get_entity_by", - new=Mock(side_effect=mock_get_entity_by)) -@patch("caoscrawler.identifiable_adapters.CaosDBIdentifiableAdapter." - "retrieve_identified_record_for_identifiable", - new=Mock(side_effect=mock_retrieve_record)) -@patch("caoscrawler.crawl.db.Container.insert") -@patch("caoscrawler.crawl.db.Container.update") -@patch("caoscrawler.crawl.UpdateCache.insert") +@ patch("caoscrawler.crawl.cached_get_entity_by", + new=Mock(side_effect=mock_get_entity_by)) +@ patch("caoscrawler.identifiable_adapters.cached_get_entity_by", + new=Mock(side_effect=mock_get_entity_by)) +@ patch("caoscrawler.identifiable_adapters.CaosDBIdentifiableAdapter." + "retrieve_identified_record_for_identifiable", + new=Mock(side_effect=mock_retrieve_record)) +@ patch("caoscrawler.crawl.db.Container.insert") +@ patch("caoscrawler.crawl.db.Container.update") +@ patch("caoscrawler.crawl.UpdateCache.insert") def test_security_mode(updateCacheMock, upmock, insmock): # trivial case: nothing to do crawled_data = [r.copy() for r in EXAMPLE_SERVER_STATE if r.role == "Record"] @@ -579,12 +702,13 @@ def test_security_mode(updateCacheMock, upmock, insmock): def test_create_reference_mapping(): a = db.Record().add_parent("A") - b = db.Record().add_parent("B").add_property('a', a) + b = db.Record(id=132).add_parent("B").add_property('a', a) ref = Crawler.create_reference_mapping([a, b]) assert id(a) in ref - assert id(b) not in ref + assert id(b) in ref assert "B" in ref[id(a)] - assert ref[id(a)]["B"] == [b] + assert {} == ref[id(b)] + assert ref[id(a)]["B"] == [132] def test_create_flat_list(): @@ -607,7 +731,7 @@ def test_create_flat_list(): assert c in flat -@pytest.fixture +@ pytest.fixture def crawler_mocked_for_backref_test(): crawler = Crawler() # mock retrieval of registered identifiabls: return Record with just a parent @@ -651,8 +775,8 @@ def test_validation_error_print(caplog): caplog.clear() -@patch("caoscrawler.identifiable_adapters.get_children_of_rt", - new=Mock(side_effect=lambda x: [x])) +@ patch("caoscrawler.identifiable_adapters.get_children_of_rt", + new=Mock(side_effect=lambda x: [x])) def test_split_into_inserts_and_updates_backref(crawler_mocked_for_backref_test): crawler = crawler_mocked_for_backref_test identlist = [Identifiable(name="A", record_type="BR"), @@ -666,8 +790,8 @@ def test_split_into_inserts_and_updates_backref(crawler_mocked_for_backref_test) crawler.split_into_inserts_and_updates([db.Record(name="B").add_parent("C")]) # identifiables were not yet 
checked - assert crawler.get_from_any_cache(identlist[0]) is None - assert crawler.get_from_any_cache(identlist[1]) is None + assert crawler.treated_records_lookup.get_any(entlist[1], identlist[0]) is None + assert crawler.treated_records_lookup.get_any(entlist[0], identlist[1]) is None # one with reference, one without assert not crawler._has_reference_value_without_id(identlist[0]) assert crawler._has_reference_value_without_id(identlist[1]) @@ -687,8 +811,8 @@ def test_split_into_inserts_and_updates_backref(crawler_mocked_for_backref_test) assert insert[0].name == "B" -@patch("caoscrawler.identifiable_adapters.get_children_of_rt", - new=Mock(side_effect=lambda x: [x])) +@ patch("caoscrawler.identifiable_adapters.get_children_of_rt", + new=Mock(side_effect=lambda x: [x])) def test_split_into_inserts_and_updates_mult_backref(crawler_mocked_for_backref_test): # test whether multiple references of the same record type are correctly used crawler = crawler_mocked_for_backref_test @@ -700,7 +824,9 @@ def test_split_into_inserts_and_updates_mult_backref(crawler_mocked_for_backref_ # test whether both entities are listed in the backref attribute of the identifiable referencing_entities = crawler.create_reference_mapping(entlist) - identifiable = crawler.identifiableAdapter.get_identifiable(referenced, referencing_entities) + identifiable = crawler.identifiableAdapter.get_identifiable( + referenced, + referencing_entities[id(referenced)]) assert len(identifiable.backrefs) == 2 # check the split... @@ -709,8 +835,8 @@ def test_split_into_inserts_and_updates_mult_backref(crawler_mocked_for_backref_ assert len(insert) == 2 -@patch("caoscrawler.identifiable_adapters.get_children_of_rt", - new=Mock(side_effect=lambda x: [x])) +@ patch("caoscrawler.identifiable_adapters.get_children_of_rt", + new=Mock(side_effect=lambda x: [x])) def test_split_into_inserts_and_updates_diff_backref(crawler_mocked_for_backref_test): # test whether multiple references of the different record types are correctly used crawler = crawler_mocked_for_backref_test @@ -722,7 +848,10 @@ def test_split_into_inserts_and_updates_diff_backref(crawler_mocked_for_backref_ # test whether both entities are listed in the backref attribute of the identifiable referencing_entities = crawler.create_reference_mapping(entlist) - identifiable = crawler.identifiableAdapter.get_identifiable(referenced, referencing_entities) + identifiable = crawler.identifiableAdapter.get_identifiable( + referenced, + referencing_entities[id(referenced)]) + assert len(identifiable.backrefs) == 2 # check the split... @@ -735,7 +864,7 @@ def mock_create_values(values, element): pass -@patch("caoscrawler.converters.IntegerElementConverter.create_values") +@ patch("caoscrawler.converters.IntegerElementConverter.create_values") def test_restricted_path(create_mock): """ The restricted_path argument allows to ignroe part of the crawled data structure. Here, we make @@ -828,7 +957,7 @@ def test_split_restricted_path(): # Filter the warning because we want to have it here and this way it does not hinder running # tests with -Werror. -@pytest.mark.filterwarnings("ignore:The prefix:DeprecationWarning") +@ pytest.mark.filterwarnings("ignore:The prefix:DeprecationWarning") def test_deprecated_prefix_option(): """Test that calling the crawler's main function with the deprecated `prefix` option raises the correct errors and warnings. 
@@ -885,5 +1014,150 @@ def test_detect_circular_dependency(crawler_mocked_identifiable_retrieve, caplog _, _ = crawler.split_into_inserts_and_updates(flat) caplog.set_level(logging.ERROR, logger="caoscrawler.converters") assert "Found circular dependency" in caplog.text - assert "-------\na\n['C" in caplog.text + assert "\n--------\n\n> Parent: C\n\n>> Name: a\n[\'C\']" in caplog.text caplog.clear() + + +def mock_get_entity_by_query(query=None): + if query is not None: + return db.Record(id=1111, name='rec_name').add_parent('RT') + + +@ patch("caoscrawler.crawl.cached_get_entity_by", + new=Mock(side_effect=mock_get_entity_by_query)) +def test_replace_name_with_referenced_entity(): + test_text = 'lkajsdf' + test_int = 134343 + test_id = 1111 + test_name = 'rec_name' + + # do not touch Properties with non-ref datatype + prop = db.Property(name='a', datatype=db.TEXT, value=test_text) + Crawler.replace_name_with_referenced_entity_id(prop) + assert prop.value is test_text + + # do not touch Properties with generic-ref datatype + prop = db.Property(name='a', datatype=db.REFERENCE, value=test_text) + Crawler.replace_name_with_referenced_entity_id(prop) + assert prop.value is test_text + + # do not touch Properties with file-ref datatype + prop = db.Property(name='a', datatype=db.FILE, value=test_text) + Crawler.replace_name_with_referenced_entity_id(prop) + assert prop.value is test_text + + # do not touch Properties with non-str values + prop = db.Property(name='a', datatype="RT", value=test_int) + Crawler.replace_name_with_referenced_entity_id(prop) + assert prop.value is test_int + + # no LinkAhead acccess until here + assert caoscrawler.crawl.cached_get_entity_by.call_count == 0 + + # change Properties with custom dt and str value + prop = db.Property(name='a', datatype="RT", value=test_name) + Crawler.replace_name_with_referenced_entity_id(prop) + assert isinstance(prop.value, int) + assert prop.value == test_id + assert caoscrawler.crawl.cached_get_entity_by.call_count == 1 + + # do not touch Properties with non-ref datatype (LIST) + prop = db.Property(name='a', datatype=db.LIST(db.TEXT), value=[test_text]) + Crawler.replace_name_with_referenced_entity_id(prop) + assert prop.value[0] is test_text + + # do not touch Properties with generic-ref datatype (LIST) + prop = db.Property(name='a', datatype=db.LIST(db.REFERENCE), value=[test_text]) + Crawler.replace_name_with_referenced_entity_id(prop) + assert prop.value[0] is test_text + + # do not touch Properties with file-ref datatype (LIST) + prop = db.Property(name='a', datatype=db.LIST(db.FILE), value=[test_text]) + Crawler.replace_name_with_referenced_entity_id(prop) + assert prop.value[0] is test_text + + # do not touch Properties with non-str values (LIST) + prop = db.Property(name='a', datatype=db.LIST("RT"), value=[test_int]) + Crawler.replace_name_with_referenced_entity_id(prop) + assert prop.value[0] is test_int + + # change Properties with custom dt and str value + prop = db.Property(name='a', datatype=db.LIST("RT"), value=[test_name, db.Record(name='hi'), + test_name]) + Crawler.replace_name_with_referenced_entity_id(prop) + assert isinstance(prop.value[0], int) + assert prop.value[0] == test_id + assert isinstance(prop.value[1], db.Entity) + assert prop.value[1].name == "hi" + assert isinstance(prop.value[2], int) + assert prop.value[2] == test_id + assert caoscrawler.crawl.cached_get_entity_by.call_count == 3 + + +def test_treated_record_lookup(): + trlu = TreatedRecordLookUp() + exist = db.Record(id=1) + trlu.add(exist) + assert 
len(trlu._existing) == 1 + # was added to existing + assert trlu._existing[id(exist)] is exist + # is in ID lookup + assert trlu._id_look_up[exist.id] is exist + # can be accessed via get_existing + assert trlu.get_existing(db.Record(id=1)) is exist + + miss = db.Record() + # exception when identifiable is missing + with raises(RuntimeError): + trlu.add(miss) + ident = Identifiable(name='a') + trlu.add(miss, ident) + # was added to missing + assert trlu._missing[id(miss)] is miss + # is in ident lookup + assert trlu._identifiable_look_up[ident.get_representation()] is miss + # can be accessed via get_missing + assert trlu.get_missing(db.Record(), Identifiable(name='a')) is miss + + fi = db.File(path='a', id=2) + trlu.add(fi) + assert len(trlu._existing) == 2 + # was added to existing + assert trlu._existing[id(fi)] is fi + # is in ID lookup + assert trlu._id_look_up[fi.id] is fi + # is in path lookup + assert trlu._path_look_up[fi.path] is fi + # can be accessed via get_existing + assert trlu.get_existing(fi) is fi + + all_exi = trlu.get_existing_list() + assert fi in all_exi + assert exist in all_exi + all_mi = trlu.get_missing_list() + assert miss in all_mi + + # If a Record was added using the ID, the ID must be used to identify it even though later an + # identifiable may be passed as well + assert trlu.get_any(exist, Identifiable(name='b')) is exist + + fi2 = db.File(path='b') + trlu.add(fi2) + assert trlu.get_any(db.File(path='b'), Identifiable(name='c')) is fi2 + + +def test_merge_entity_with_identifying_reference(crawler_mocked_identifiable_retrieve): + # When one python object representing a record is merged into another python object + # representing the same record, the former object can be forgotten and references from it to + # other records must not play a role + crawler = crawler_mocked_identifiable_retrieve + crawler.identifiableAdapter.get_registered_identifiable = Mock( + side_effect=lambda x: db.Record().add_parent('C').add_property(name='name') if + x.parents[0].name == "C" else + db.Record().add_parent('D').add_property(name='is_referenced_by', value="*") + ) + a = db.Record(name='a').add_parent("D") + b = db.Record(name='b').add_parent("C") + c = db.Record(name='b').add_parent("C").add_property(name="C", value=a) + flat = [a, c, b] + _, _ = crawler.split_into_inserts_and_updates(flat) diff --git a/unittests/test_directories/test_transformers/Day_Mon/README.md b/unittests/test_directories/test_transformers/Day_Mon/README.md new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/unittests/test_directories/test_transformers/Day_Tue/README.md b/unittests/test_directories/test_transformers/Day_Tue/README.md new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/unittests/test_directories/test_transformers/Day_Unk/README.md b/unittests/test_directories/test_transformers/Day_Unk/README.md new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/unittests/test_directories/test_transformers/cfood.yml b/unittests/test_directories/test_transformers/cfood.yml new file mode 100644 index 0000000000000000000000000000000000000000..ec79eaf00203b5df1a436dfe50fbf17fa7b764db --- /dev/null +++ b/unittests/test_directories/test_transformers/cfood.yml @@ -0,0 +1,49 @@ + +# See: https://gitlab.indiscale.com/caosdb/src/caosdb-crawler/-/issues/107 +# +Transformers: + ceil: + function: quote + 
package: shlex + +RootDir: + type: Directory + match: ^.*$ + subtree: + DateDir: + type: Directory + match: ^Day_(?P<day_short>.*)$ # Example: Day_Mon + transform: + MakeDayLong: + in: $day_short + out: $day_long + functions: + - submatch: # name of the function + match: Mon # match is one specific argument + then: Monday # then another one + - submatch: # next function + match: Tue + then: Tuesday + TestSplit: + in: $day_short + out: $day_split + functions: + - split: + marker: o + records: + DayFolder: + Day: $day_long + DayShort: $day_short # just for checking, whether this variable remains + DaySplit: $day_split # just for checking, whether this variable remains + Testfi: + type: File + match: ^(?P<no>(\d+ )*)$ + transform: + up: + in: $no + out: $no + functions: + - ceil: {} + records: + Number: + num: $no diff --git a/unittests/test_h5_converter.py b/unittests/test_h5_converter.py new file mode 100644 index 0000000000000000000000000000000000000000..2f7fae5d8d32bb7e5c90a535b63158c33df55daa --- /dev/null +++ b/unittests/test_h5_converter.py @@ -0,0 +1,135 @@ +# +# This file is a part of the CaosDB Project. +# +# Copyright (C) 2023 IndiScale GmbH <info@indiscale.com> +# Copyright (C) 2023 Florian Spreckelsen <f.spreckelsen@indiscale.com> +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <https://www.gnu.org/licenses/>. +# +import numpy as np + +from functools import partial +from pathlib import Path +from pytest import fixture, importorskip + +import caosdb as db + +from caoscrawler.debug_tree import DebugTree +from caoscrawler.hdf5_converter import (convert_basic_element_with_nd_array, + convert_h5_element, H5GroupElement, + H5DatasetElement, H5NdarrayElement) +from caoscrawler.scanner import scan_directory +from caoscrawler.structure_elements import (FloatElement, ListElement, + TextElement) +from utils import dircheckstr as dircheck_base + +# Skip the whole module if h5py hasn't been installed +h5py = importorskip("h5py") + + +UNITTESTDIR = Path(__file__).parent + +# always add the path here +dircheckstr = partial(dircheck_base, UNITTESTDIR) + + +@fixture +def h5_dummy_file(): + + path = UNITTESTDIR / "hdf5_dummy_file.hdf5" + + return h5py.File(path, 'r') + + +def test_h5_elements(h5_dummy_file): + + elt = convert_h5_element(h5_dummy_file["group_level1_a"], "test") + assert isinstance(elt, H5GroupElement) + + elt = convert_h5_element(h5_dummy_file["root_integers"], "test") + assert isinstance(elt, H5DatasetElement) + + +def test_nd_array_conversion(): + + # Only test array handling here, `convert_basic_element` is tested + # elsewhere. 
+    arr = np.array([[["something"]]])
+    elt = convert_basic_element_with_nd_array(arr)
+    assert isinstance(elt, TextElement)
+    assert elt.value == "something"
+
+    arr = np.zeros((1, 1))
+    elt = convert_basic_element_with_nd_array(arr)
+    assert isinstance(elt, FloatElement)
+    assert elt.value == 0
+
+    arr = np.zeros((1, 3, 1))
+    elt = convert_basic_element_with_nd_array(arr)
+    assert isinstance(elt, ListElement)
+    assert elt.value == [0, 0, 0]
+
+    arr = np.array([[1, 2, 3], [4, 5, 6]])
+    elt = convert_basic_element_with_nd_array(arr, internal_path="some/path")
+    assert isinstance(elt, H5NdarrayElement)
+    assert elt.internal_path == "some/path"
+
+    # Non-arrays should be forwarded correctly
+    elt = convert_basic_element_with_nd_array("something")
+    assert isinstance(elt, TextElement)
+    assert elt.value == "something"
+
+    elt = convert_basic_element_with_nd_array([0, 0, 0])
+    assert isinstance(elt, ListElement)
+    assert elt.value == [0, 0, 0]
+
+
+def test_record_creation():
+
+    dbt = DebugTree()
+    records = scan_directory(UNITTESTDIR, UNITTESTDIR / "h5_cfood.yml", debug_tree=dbt)
+
+    # In total 3 records: The file, the Dataset, and its ndarray
+    assert len(records) == 3
+    file_rec = [rec for rec in records if isinstance(rec, db.File)]
+    # exactly one file
+    assert len(file_rec) == 1
+
+    subd = dbt.debug_tree[dircheckstr("hdf5_dummy_file.hdf5")]
+    # At this level, we have 5 variables (directories and paths, plus the
+    # H5File record), and one record.
+    assert len(subd[0]) == 5
+    assert len(subd[1]) == 1
+    file_rec = subd[1]["H5File"]
+    assert file_rec.get_property("H5Dataset") is not None
+    assert file_rec.get_property("H5Dataset").value is not None
+    # Reference properties currently need to be integration tested (especially
+    # with the circular dependency between H5File and NDArray).
+ + # top level integers + subd = dbt.debug_tree["root_integers"] + # Two additional variables (RootIntegerElement + Dataset record), one + # additional record + assert len(subd[0]) == 7 + assert len(subd[1]) == 2 + ds_rec = subd[1]["H5Dataset"] + assert isinstance(ds_rec, db.Record) + assert len(ds_rec.parents) == 1 + assert ds_rec.parents[0].name == "H5Dataset" + assert ds_rec.get_property("Ndarray") is not None + assert ds_rec.get_property("Ndarray").value is not None + assert ds_rec.get_property("attr_data_root") is not None + assert isinstance(ds_rec.get_property("attr_data_root").value, list) + for number in [-2., -4., -8., -10.12345]: + assert number in [float(val) for val in ds_rec.get_property("attr_data_root").value] diff --git a/unittests/test_identifiable.py b/unittests/test_identifiable.py index 3f3c606b163df4dc238be9a669fd31eb630a582d..28bdb7a2ad75d5b9389b47ca3f0ec2b2e2a1404b 100644 --- a/unittests/test_identifiable.py +++ b/unittests/test_identifiable.py @@ -24,10 +24,9 @@ test identifiable module """ -import pytest import caosdb as db +import pytest from caoscrawler.identifiable import Identifiable -from caoscrawler.identified_cache import IdentifiedCache def test_create_hashable_string(): diff --git a/unittests/test_identifiable_adapters.py b/unittests/test_identifiable_adapters.py index 268b9800ddf1ef1688386f394ec1c6c7eb3e3912..ee0e0d6cd7c791f78e7cd2307dc6f34698326b4a 100644 --- a/unittests/test_identifiable_adapters.py +++ b/unittests/test_identifiable_adapters.py @@ -133,6 +133,21 @@ def test_non_default_name(): assert identifiable.name is None +def test_wildcard_ref(): + ident = CaosDBIdentifiableAdapter() + ident.register_identifiable( + "Person", db.RecordType() + .add_parent(name="Person") + .add_property(name="is_referenced_by", value=["*"])) + rec = (db.Record(name="don't touch it").add_parent("Person") + .add_property(name="last_name", value='Tom')) + identifiable = ident.get_identifiable(rec, + referencing_entities={ + 'A': [1]} + ) + assert identifiable.backrefs[0] == 1 + + def test_convert_value(): # test that string representation of objects stay unchanged. No stripping or so. class A(): diff --git a/unittests/test_identified_cache.py b/unittests/test_identified_cache.py deleted file mode 100644 index 4ed7c55c7326415308917e20e9f391b17b07ad87..0000000000000000000000000000000000000000 --- a/unittests/test_identified_cache.py +++ /dev/null @@ -1,44 +0,0 @@ -#!/usr/bin/env python3 -# encoding: utf-8 -# -# ** header v3.0 -# This file is a part of the CaosDB Project. -# -# Copyright (C) 2021 Indiscale GmbH <info@indiscale.com> -# Copyright (C) 2021 Henrik tom Wörden <h.tomwoerden@indiscale.com> -# -# This program is free software: you can redistribute it and/or modify -# it under the terms of the GNU Affero General Public License as -# published by the Free Software Foundation, either version 3 of the -# License, or (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU Affero General Public License for more details. -# -# You should have received a copy of the GNU Affero General Public License -# along with this program. If not, see <https://www.gnu.org/licenses/>. 
-# -# ** end header -# - -""" -test identified_cache module -""" - -import caosdb as db -from caoscrawler.identifiable import Identifiable -from caoscrawler.identified_cache import IdentifiedCache - - -def test_IdentifiedCache(): - ident = Identifiable(name="A", record_type="B") - record = db.Record("A").add_parent("B").add_property('b', 5) - cache = IdentifiedCache() - assert ident not in cache - cache.add(record=record, identifiable=ident) - assert ident in cache - assert cache[ident] is record - assert Identifiable(name="A", record_type="C") != Identifiable(name="A", record_type="B") - assert Identifiable(name="A", record_type="C") not in cache diff --git a/unittests/test_scalars_cfood.py b/unittests/test_scalars_cfood.py index 57a8083c2aace830115c410e0425a8af4da17a7b..ba604fe4f5b695506bf8df9dab79fc23232c546a 100644 --- a/unittests/test_scalars_cfood.py +++ b/unittests/test_scalars_cfood.py @@ -37,11 +37,11 @@ def test_handle_value(): def test_record_structure_generation(): dbt = DebugTree() - scan_directory(UNITTESTDIR/"test_directories" / "examples_article", - UNITTESTDIR/"cfoods_scalar.yml", + scan_directory(UNITTESTDIR / "test_directories" / "examples_article", + UNITTESTDIR / "cfoods_scalar.yml", debug_tree=dbt) subd = dbt.debug_tree[dircheckstr( - UNITTESTDIR/"test_directories" / "examples_article", "DataAnalysis")] + UNITTESTDIR / "test_directories" / "examples_article", "DataAnalysis")] assert len(subd) == 2 # variables store on Data Analysis node of debug tree if "Data" in subd[0]: diff --git a/unittests/test_scanner.py b/unittests/test_scanner.py index 21b7da10ef860378a4779f34f8f1b26c6a54f359..c0ce736fc4bed18f371f1626b6bc451ee103db49 100644 --- a/unittests/test_scanner.py +++ b/unittests/test_scanner.py @@ -1,8 +1,10 @@ +#!/usr/bin/env python3 # encoding: utf-8 # -# This file is a part of the CaosDB Project. +# This file is a part of the LinkAhead Project. # -# Copyright (C) 2021 Henrik tom Wörden <h.tomwoerden@indiscale.com> +# Copyright (C) 2023,2024 Indiscale GmbH <info@indiscale.com> +# Copyright (C) 2023,2024 Henrik tom Wörden <h.tomwoerden@indiscale.com> # 2021-2023 Research Group Biomedical Physics, # Max-Planck-Institute for Dynamics and Self-Organization Göttingen # Alexander Schlemmer <alexander.schlemmer@ds.mpg.de> @@ -20,37 +22,24 @@ # You should have received a copy of the GNU Affero General Public License # along with this program. If not, see <https://www.gnu.org/licenses/>. # +""" +Unit test functions for the scanner. 
+""" -import json -import logging -import os -import warnings -from copy import deepcopy from functools import partial -from os.path import basename, dirname, join from pathlib import Path from tempfile import NamedTemporaryFile from unittest.mock import MagicMock, Mock, patch import caosdb as db -import caosdb.common.models as dbmodels import pytest import yaml -from caoscrawler.crawl import (Crawler, SecurityMode, _treat_deprecated_prefix, - crawler_main, split_restricted_path) +from caoscrawler.crawl import Crawler from caoscrawler.debug_tree import DebugTree -from caoscrawler.identifiable import Identifiable -from caoscrawler.identifiable_adapters import (CaosDBIdentifiableAdapter, - IdentifiableAdapter, - LocalStorageIdentifiableAdapter) from caoscrawler.scanner import (create_converter_registry, load_definition, scan_directory, scan_structure_elements) -from caoscrawler.stores import GeneralStore, RecordStore from caoscrawler.structure_elements import (DictElement, DictListElement, DictTextElement, File) -from caosdb.apiutils import compare_entities -from caosdb.cached import cache_clear -from caosdb.exceptions import EmptyUniqueQueryError from pytest import raises from utils import dircheckstr as dircheck_base @@ -210,7 +199,6 @@ def test_record_generation(): # Try each record to check for i, check_prop in enumerate(check_props): matches = True - # breakpoint() # Verify that all props are in the record and have the right value for pr in check_prop: if rec.get_property(pr) is None: diff --git a/unittests/test_transformers.py b/unittests/test_transformers.py new file mode 100644 index 0000000000000000000000000000000000000000..02d932d13cc3fad52048b08e2b9fe56f11db2ae7 --- /dev/null +++ b/unittests/test_transformers.py @@ -0,0 +1,148 @@ +#!/usr/bin/env python3 +# encoding: utf-8 +# +# This file is a part of the CaosDB Project. +# +# Copyright (C) 2023 Research Group Biomedical Physics, +# Max-Planck-Institute for Dynamics and Self-Organization Göttingen +# Alexander Schlemmer <alexander.schlemmer@ds.mpg.de> +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <https://www.gnu.org/licenses/>. +# + +""" +Unit test functions for the transformer feature of the scanner. + +Currently, this is under development. 
+See: https://gitlab.indiscale.com/caosdb/src/caosdb-crawler/-/issues/107 +""" + +import importlib +from functools import partial +from pathlib import Path +from tempfile import NamedTemporaryFile +from unittest.mock import MagicMock, Mock, patch + +import caosdb as db +import pytest +import yaml +from caoscrawler.converters import Converter, ListElementConverter +from caoscrawler.scanner import create_transformer_registry, scan_directory +from caoscrawler.stores import GeneralStore +from caoscrawler.transformer_functions import replace, split +from pytest import raises + +UNITTESTDIR = Path(__file__).parent + + +def test_simple_transformer(): + """ + Test the correct list of returned records by the scanner using the + scifolder example from the article. + """ + + records = scan_directory(UNITTESTDIR / "test_directories" / "test_transformers", + UNITTESTDIR / "test_directories" / "test_transformers" / + "cfood.yml") + + for r in records: + if r.parents[0].name == "DayFolder": + assert r.get_property("Day") is not None + assert r.get_property("DayShort") is not None + assert r.get_property("DayShort").value != "$day_short" + if r.get_property("DayShort").value == "Unk": + # This unkown folder should not lead to a replacement + assert r.get_property("Day").value == "Unk" + assert r.get_property("DaySplit").value == ["Unk"] + elif r.get_property("DayShort").value == "Mon": + assert r.get_property("Day").value == "Monday" + assert r.get_property("DaySplit").value == ["M", "n"] + elif r.get_property("DayShort").value == "Tue": + assert r.get_property("Day").value == "Tuesday" + assert r.get_property("DaySplit").value == ["Tue"] + else: + # unexpected occurence of a short form, something wrong with test directories + assert False + elif r.parents[0].name == "Number": + assert r.get_property("num") is not None + assert r.get_property("num").value == "'12345 5 '" + else: + # unkown error, something wrong with test directories + assert False + + +@pytest.fixture +def converter_registry(): + converter_registry: dict[str, dict[str, str]] = { + "Directory": { + "converter": "DirectoryConverter", + "package": "caoscrawler.converters"}, + "MarkdownFile": { + "converter": "MarkdownFileConverter", + "package": "caoscrawler.converters"}, + "Date": { + "converter": "DateElementConverter", + "package": "caoscrawler.converters"}, + "DictElement": { + "converter": "DictElementConverter", + "package": "caoscrawler.converters"}, + "TextElement": { + "converter": "TextElementConverter", + "package": "caoscrawler.converters"}, + "ListElement": { + "converter": "ListElementConverter", + "package": "caoscrawler.converters"}, + "JSONFile": { + "converter": "JSONFileConverter", + "package": "caoscrawler.converters"}, + } + + for key, value in converter_registry.items(): + module = importlib.import_module(value["package"]) + value["class"] = getattr(module, value["converter"]) + return converter_registry + + +def test_apply_replace(converter_registry): + cfood_def = {"type": 'ListElement', "match_name": ".*", + 'transform': {'test': {'in': '$a', 'out': '$b', 'functions': [{ + 'replace': {'insert': ':', "remove": "_"}}]}}} + values = GeneralStore() + values["a"] = "16_45" + + # transformer_functions = create_transformer_registry(crawler_definition) + transformer_functions = {"replace": replace} + + conv = ListElementConverter(definition=cfood_def, name='test', + converter_registry=converter_registry) + + conv.apply_transformers(values, transformer_functions) + assert values['b'] == "16:45" + + +def 
test_apply_replace_from_def(converter_registry): + cfood_def = {"type": 'ListElement', "match_name": ".*", + 'transform': {'test': {'in': '$a', 'out': '$b', 'functions': [{ + 'replace': {'insert': ':', "remove": "_"}}]}}} + values = GeneralStore() + values["a"] = "16_45" + + transformer_functions = create_transformer_registry({}) + # transformer_functions = {"replace": replace} + + conv = ListElementConverter(definition=cfood_def, name='test', + converter_registry=converter_registry) + + conv.apply_transformers(values, transformer_functions) + assert values['b'] == "16:45" diff --git a/unittests/test_variable_substitutions.py b/unittests/test_variable_substitutions.py index 9d1981c773e481e78eb4f82e785e27ee8f8d00d6..09f78df661d82970e7264996102eff8881ee19ec 100644 --- a/unittests/test_variable_substitutions.py +++ b/unittests/test_variable_substitutions.py @@ -41,7 +41,7 @@ from pytest import raises from utils import dircheckstr as dircheckstr_base UNITTESTDIR = Path(__file__).parent -dircheckstr = partial(dircheckstr_base, UNITTESTDIR/"test_directories" / +dircheckstr = partial(dircheckstr_base, UNITTESTDIR / "test_directories" / "example_substitutions")