diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 8840e613f1e1eb86f30779b8b3535e2ff97ad0cc..b2ecd97cbded32dea3be9046cfdaac516bfb0ec2 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -130,7 +130,7 @@ unittest_py3.7:
     # TODO: Use f-branch logic here
     - pip install git+https://gitlab.indiscale.com/caosdb/src/caosdb-pylib.git@dev
     - pip install git+https://gitlab.indiscale.com/caosdb/src/caosdb-advanced-user-tools.git@dev
-    - pip install .
+    - pip install .[h5-crawler]
     # actual test
     - caosdb-crawler --help
    - pytest --cov=caosdb -vv ./unittests
diff --git a/CHANGELOG.md b/CHANGELOG.md
index ba0603be4ef7a66be5377e315e8dce0befae5d3c..2d049e0f3c5e8b30b7a7dc2206664e6fd7964246 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -8,23 +8,51 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 ## [Unreleased] ##

 ### Added ###
-* 'transform' sections can be added to a CFood to apply functions to values stored in variables.
+
+* `transform` sections can be added to a CFood to apply functions to values stored in variables.
 * default transform functions: submatch, split and replace.
 * New command line option "--new-debug-tree" that allows saving a full tree of debug information for crawler runs in yaml and HTML format.
+* `*` can now be used as a wildcard in the identifiables parameter file to denote
+  that any Record may reference the identified one.
+* `crawl.TreatedRecordLookUp` class replacing the old (and slow)
+  `identified_cache` module. The new class now handles all records identified by
+  id, path, or identifiable simultaneously. See API docs for more info on how to
+  add to and get from the new lookup class.
+* `identifiable_adapters.IdentifiableAdapter.get_identifying_referencing_entities`
+  and
+  `identifiable_adapters.IdentifiableAdapter.get_identifying_referenced_entities`
+  static methods to return the referencing or referenced entities belonging to a
+  registered identifiable, respectively.
+* [#70](https://gitlab.com/linkahead/linkahead-crawler/-/issues/70): Optional
+  converters for HDF5 files. They require this package to be installed with its
+  ``h5-crawler`` dependency.

 ### Changed ###
-- If the `parents` key is used in a cfood at a lower level for a Record that
+
+* If the `parents` key is used in a cfood at a lower level for a Record that
   already has a Parent (because it was explicitly given or the default Parent),
   the old Parent(s) are now overwritten with the value belonging to the
   `parents` key.
-- If a registered identifiable states, that a reference by a Record with parent
+* If a registered identifiable states that a reference by a Record with parent
   RT1 is needed, then now also references from Records that have a child of RT1
   as parent are accepted.
+* More aggressive caching.
+* The `identifiable_adapters.IdentifiableAdapter` now creates (possibly empty)
+  reference lists for all records in `create_reference_mapping`. This allows
+  functions like `get_identifiable` to be called only with the subset of the
+  referencing entities belonging to a specific Record.
+* The `identifiable_adapters.IdentifiableAdapter` uses entity ids (negative for
+  entities that don't exist remotely) instead of entity objects for keeping
+  track of references.

 ### Deprecated ###
+* `IdentifiableAdapter.get_file`

 ### Removed ###
+* `identified_cache` module which was replaced by the `crawl.TreatedRecordLookUp` class.
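For orientation, here is a minimal usage sketch of the replacement class. The `Experiment` record type, the `date` property and all values are made up for illustration; the calls follow the `TreatedRecordLookUp` API added to `src/caoscrawler/crawl.py` further down in this diff:

```python
import linkahead as db

from caoscrawler.crawl import TreatedRecordLookUp
from caoscrawler.identifiable import Identifiable

lookup = TreatedRecordLookUp()

# A record with a valid server-side ID is tracked as "existing".
existing_rec = db.Record(id=1234).add_parent("Experiment")
lookup.add(existing_rec)

# A record without an ID needs a path or an identifiable; it is tracked as
# "missing" and receives a negative placeholder ID.
new_rec = db.Record().add_parent("Experiment")
ident = Identifiable(record_type="Experiment", properties={"date": "2024-01-01"})
lookup.add(new_rec, ident)

# Look-ups work by ID, path, or identifiable representation.
assert lookup.get_existing(existing_rec) is existing_rec
assert lookup.get_missing(new_rec, ident) is new_rec
assert len(lookup.get_missing_list()) == 1
```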
+
 ### Fixed ###
 * Empty Records can now be created (https://gitlab.com/caosdb/caosdb-crawler/-/issues/27)
@@ -37,6 +65,12 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
   handles cases correctly in which entities retrieved from the server have to
   be merged with local entities that both reference another, already existing
   entity
+* A corner case in `split_into_inserts_and_updates` whereby two records created
+  in different places in the cfood definition would not be merged if both were
+  identified by the same LinkAhead id
+* [#87](https://gitlab.com/linkahead/linkahead-crawler/-/issues/87) Handle long strings more gracefully. The crawler sometimes runs into
+  [linkahead-server#101](https://gitlab.com/linkahead/linkahead-server/-/issues/101); this is now mitigated.
+* [indiscale#128](https://gitlab.indiscale.com/caosdb/src/caosdb-crawler/-/issues/128) Yet another corner case of referencing resolution resolved.

 ### Security ###

diff --git a/CITATION.cff b/CITATION.cff
index c6d41cc49d74b72056d825ca731fa79897fc537b..a41a45add0addbe657e6a95bd4e74f8c2365c742 100644
--- a/CITATION.cff
+++ b/CITATION.cff
@@ -1,25 +1,22 @@
 cff-version: 1.2.0
 message: "If you use this software, please cite it as below."
 authors:
-  - family-names: Fitschen
-    given-names: Timm
-    orcid: https://orcid.org/0000-0002-4022-432X
-  - family-names: Schlemmer
-    given-names: Alexander
-    orcid: https://orcid.org/0000-0003-4124-9649
-  - family-names: Hornung
-    given-names: Daniel
-    orcid: https://orcid.org/0000-0002-7846-6375
   - family-names: tom Wörden
     given-names: Henrik
     orcid: https://orcid.org/0000-0002-5549-578X
-  - family-names: Parlitz
-    given-names: Ulrich
-    orcid: https://orcid.org/0000-0003-3058-1435
+  - family-names: Spreckelsen
+    given-names: Florian
+    orcid: https://orcid.org/0000-0002-6856-2910
   - family-names: Luther
     given-names: Stefan
     orcid: https://orcid.org/0000-0001-7214-8125
+  - family-names: Parlitz
+    given-names: Ulrich
+    orcid: https://orcid.org/0000-0003-3058-1435
+  - family-names: Schlemmer
+    given-names: Alexander
+    orcid: https://orcid.org/0000-0003-4124-9649
 title: CaosDB - Crawler
 version: 0.6.0
-doi: 10.3390/data4020083
+doi: 10.3390/data9020024
 date-released: 2023-06-23
\ No newline at end of file
diff --git a/integrationtests/basic_example/test_basic.py b/integrationtests/basic_example/test_basic.py
index d3482a19f0c95912002b2ff68101623476d452ea..c906a81d86af56669f7c522169bceb3b5fcb3e01 100755
--- a/integrationtests/basic_example/test_basic.py
+++ b/integrationtests/basic_example/test_basic.py
@@ -112,11 +112,11 @@ def crawler_extended(ident):
     return cr, crawled_data, debug_tree


-def test_ambigious_lookup(clear_database, usemodel, crawler, ident):
+def test_ambiguous_lookup(clear_database, usemodel, crawler, ident):
     ins, ups = crawler[0].synchronize(crawled_data=crawler[1])

     proj = db.execute_query("FIND Project WITH identifier='SpeedOfLight'", unique=True)
-    with pytest.raises(RuntimeError, match=".*unambigiously.*"):
+    with pytest.raises(RuntimeError, match=".*unambiguously.*"):
         print(crawler[0].identifiableAdapter.retrieve_identified_record_for_identifiable(
             Identifiable(properties={'project': proj.id})))

diff --git a/integrationtests/test_issues.py b/integrationtests/test_issues.py
index 441edac5481585e483c94d61d864a1baaa139aa2..814e82ad75512ec8fe217294e1a9e86c6aa01ab3 100644
--- a/integrationtests/test_issues.py
+++ b/integrationtests/test_issues.py
@@ -16,24 +16,26 @@
 # You should have received a copy of the GNU Affero General Public License
 # along with this
program. If not, see <https://www.gnu.org/licenses/>. # -from pytest import fixture, mark +from pytest import fixture, mark, raises -import caosdb as db -from caosdb.cached import cache_clear +import linkahead as db +from linkahead.cached import cache_clear from caosadvancedtools.models.parser import parse_model_from_string from caoscrawler.crawl import Crawler +from caoscrawler.identifiable import Identifiable from caoscrawler.identifiable_adapters import CaosDBIdentifiableAdapter from caoscrawler.structure_elements import DictElement from caoscrawler.scanner import create_converter_registry, scan_structure_elements -from caosdb.utils.register_tests import clear_database, set_test_key +from linkahead.utils.register_tests import clear_database, set_test_key set_test_key("10b128cf8a1372f30aa3697466bb55e76974e0c16a599bb44ace88f19c8f61e2") @fixture(autouse=True) def clear_cache(): + """Clear the LinkAhead cache.""" cache_clear() @@ -266,3 +268,64 @@ Campaign: # Nothing to do for the existing ents assert len(ups) == 0 assert ins[0].name == event.name + + +def test_indiscale_87(clear_database): + """Handle long string queries gracefully. + + https://gitlab.com/linkahead/linkahead-crawler/-/issues/87 + """ + + prop = db.Property(name="str", datatype=db.TEXT).insert() + rt = db.RecordType(name="RT1").add_property(prop).insert() + strings = [ + "X123456789" * 26, + "X" * 260, + "X123456789" * 25 + "9876543210", + ] + recs = [ + db.Record().add_parent(rt).add_property(name="str", value=string).insert() + for string in strings + ] + idents = [ + Identifiable(record_type="RT1", properties={"str": string}) + for string in strings + ] + adapter = CaosDBIdentifiableAdapter() + for rec, ident in zip(recs, idents): + print(f"Testing: ...{rec.get_property('str').value[-10:]}") + retrieved = adapter.retrieve_identified_record_for_identifiable(ident) + # print(rec) + # print(retrieved) + print(db.apiutils.compare_entities(rec, retrieved)) + assert db.apiutils.empty_diff(rec, retrieved) + print("---") + + # add another, harmless, property + prop2 = db.Property(name="someint", datatype=db.INTEGER).insert() + rt.add_property(prop2).update() + string = "Y123456789" * 26 + numbers = [23, 42] + recs = [ + db.Record().add_parent(rt).add_property(name="str", value=string).add_property( + name="someint", value=number).insert() + for number in numbers + ] + idents = [Identifiable(record_type="RT1", properties={"str": string})] + # Ambiguous result + with raises(RuntimeError, match=".*unambiguously.*"): + retrieved = adapter.retrieve_identified_record_for_identifiable(idents[0]) + + # Upgrade new property to be identifying + idents = [ + Identifiable(record_type="RT1", properties={"str": string, "someint": number}) + for number in numbers + ] + for rec, ident in zip(recs, idents): + print(f"Testing: someint={rec.get_property('someint').value}") + retrieved = adapter.retrieve_identified_record_for_identifiable(ident) + # print(rec) + # print(retrieved) + print(db.apiutils.compare_entities(rec, retrieved)) + assert db.apiutils.empty_diff(rec, retrieved) + print("---") diff --git a/setup.cfg b/setup.cfg index ffb00e54eb7bf8753a335bb96d9fdf23700aaadd..0f6afe5d2538228ee49608bb8b3751139cba4e00 100644 --- a/setup.cfg +++ b/setup.cfg @@ -40,3 +40,8 @@ per-file-ignores = __init__.py:F401 [options.entry_points] console_scripts = caosdb-crawler = caoscrawler.crawl:main + +[options.extras_require] +h5-crawler = + h5py >= 3.8 + numpy diff --git a/src/caoscrawler/cfood-schema.yml b/src/caoscrawler/cfood-schema.yml index 
b0d77bbf5d7ba09df3c0c47d656fa3d22d07b6d2..5a6e1e50345382ca6e5a1e6ef3a8fbeafb806b84 100644
--- a/src/caoscrawler/cfood-schema.yml
+++ b/src/caoscrawler/cfood-schema.yml
@@ -31,6 +31,10 @@ cfood:
           - JSONFile
           - CSVTableConverter
           - XLSXTableConverter
+          - H5File
+          - H5Dataset
+          - H5Group
+          - H5Ndarray
         description: Type of this converter node.
       match:
         description: typically a regexp which is matched to a structure element name
diff --git a/src/caoscrawler/converters.py b/src/caoscrawler/converters.py
index bbc34e3795f707b55d2147d05753fa6166a20dc5..477d114c8e29047278df90ef226371f20ad89e58 100644
--- a/src/caoscrawler/converters.py
+++ b/src/caoscrawler/converters.py
@@ -36,7 +36,7 @@ from inspect import signature
 from string import Template
 from typing import Any, List, Optional, Tuple, Union

-import caosdb as db
+import linkahead as db
 import pandas as pd
 import yaml
 import yaml_header_tools
@@ -1082,16 +1082,13 @@ class ListElementConverter(Converter):
                 "This converter can only process DictListElements.")
         children: list[StructureElement] = []
         for index, list_element in enumerate(element.value):
-            # TODO(fspreck): Refactor this and merge with DictXXXElements maybe?
-            if isinstance(list_element, str):
-                children.append(TextElement(str(index), list_element))
-            elif isinstance(list_element, dict):
-                children.append(DictElement(str(index), list_element))
-            elif isinstance(list_element, StructureElement):
-                children.append(list_element)
-            else:
-                raise NotImplementedError(
-                    f"Unkown type {type(list_element)} in list element {list_element}.")
+            children.append(
+                convert_basic_element(
+                    list_element,
+                    name=f"{index}",
+                    msg_prefix=f"The value at index {index} in the list has an unknown type."
+                )
+            )
         return children

     def typecheck(self, element: StructureElement):
diff --git a/src/caoscrawler/crawl.py b/src/caoscrawler/crawl.py
index a4649ff0856dc917105fc773333b5b81ded3b85a..a542b366d42ef5de1d2edabb60326adb466ac121 100644
--- a/src/caoscrawler/crawl.py
+++ b/src/caoscrawler/crawl.py
@@ -3,10 +3,11 @@
 #
 # This file is a part of the CaosDB Project.
 #
-# Copyright (C) 2021 Henrik tom Wörden <h.tomwoerden@indiscale.com>
-#               2021-2023 Research Group Biomedical Physics,
-#               Max-Planck-Institute for Dynamics and Self-Organization Göttingen
-#               Alexander Schlemmer <alexander.schlemmer@ds.mpg.de>
+# Copyright (C) 2021-2024 Henrik tom Wörden <h.tomwoerden@indiscale.com>
+# Copyright (C) 2021-2023 Research Group Biomedical Physics, MPI-DS Göttingen
+# Copyright (C) 2021-2023 Alexander Schlemmer <alexander.schlemmer@ds.mpg.de>
+# Copyright (C) 2021-2024 Indiscale GmbH <info@indiscale.com>
+# Copyright (C) 2024 Daniel Hornung <d.hornung@indiscale.com>
 #
 # This program is free software: you can redistribute it and/or modify
 # it under the terms of the GNU Affero General Public License as
@@ -26,7 +27,7 @@
 """
 Crawl a file structure using a yaml cfood definition and synchronize
-the acuired data with CaosDB.
+the acquired data with LinkAhead.
""" from __future__ import annotations @@ -42,20 +43,20 @@ from argparse import RawTextHelpFormatter from copy import deepcopy from datetime import datetime from enum import Enum -from typing import Any, Optional, Union +from typing import Any, List, Optional, Union -import caosdb as db +import linkahead as db import yaml from caosadvancedtools.cache import UpdateCache from caosadvancedtools.crawler import Crawler as OldCrawler from caosadvancedtools.serverside.helper import send_mail from caosadvancedtools.utils import create_entity_link -from caosdb.apiutils import (EntityMergeConflictError, compare_entities, - merge_entities) -from caosdb.cached import cache_clear, cached_get_entity_by -from caosdb.exceptions import EmptyUniqueQueryError +from linkahead.apiutils import (EntityMergeConflictError, compare_entities, + merge_entities) +from linkahead.cached import cache_clear, cached_get_entity_by from linkahead.common.datatype import get_list_datatype, is_reference -from linkahead.utils.escape import escape_quoted_text +from linkahead.exceptions import EmptyUniqueQueryError +from linkahead.utils.escape import escape_squoted_text from .config import get_config_setting from .converters import Converter, ConverterValidationError @@ -64,7 +65,6 @@ from .identifiable import Identifiable from .identifiable_adapters import (CaosDBIdentifiableAdapter, IdentifiableAdapter, LocalStorageIdentifiableAdapter) -from .identified_cache import IdentifiedCache from .logging import configure_server_side_logging from .macros import defmacro_constructor, macro_constructor from .scanner import (create_converter_registry, initialize_converters, @@ -185,6 +185,11 @@ def _treat_merge_error_of(newrecord, record): """ for this_p in newrecord.properties: that_p = record.get_property(this_p.name) + if that_p is None: + logger.debug(f"Property {this_p.name} does not exist in the second entity. Note that " + "this should not be the reason for the merge conflict.") + continue + if (isinstance(this_p.value, db.Entity) and isinstance(that_p.value, db.Entity)): if this_p.value.id is not None and that_p.value.id is not None: @@ -204,14 +209,13 @@ def _treat_merge_error_of(newrecord, record): # TODO can we also compare lists? and not isinstance(this_p.value, list) and not isinstance(that_p.value, list)): - logger.error("The Crawler is trying to merge two entities " - "because they should be the same object (same" - " identifiables), but they have " - "different values for the same Property." - f"Problematic Property: {this_p.name}\n" - f"Values: {this_p.value} and " - f"{that_p.value}\n" - f"{record}\n{newrecord}") + logger.error( + "The Crawler is trying to merge two entities because they should be the same " + "object (same identifiables), but they have different values for the same " + "Property.\n" + f"Problematic Property: {this_p.name}\n" + f"Values: {this_p.value} and {that_p.value}\n" + f"{record}\n{newrecord}") raise RuntimeError("Cannot merge Entities") @@ -221,6 +225,110 @@ class SecurityMode(Enum): UPDATE = 2 +class TreatedRecordLookUp(): + """tracks Records and Identifiables for which it was checked whether they exist in the remote + server + + For a given Record it can be checked, whether it exists in the remote sever if + - it has a (valid) ID + - it has a (valid) path (FILEs only) + - an identifiable can be created for the Record. + + Records are added by calling the `add` function and they are then added to the internal + existing or missing list depending on whether the Record has a valid ID. 
+    Additionally, the Record is added to three look-up dicts. The keys of those are paths, IDs
+    and the representation of the identifiables.
+
+    The extreme case one could imagine would be that the same Record occurs three times as
+    different Python objects: one that only has an ID, one with only a path, and one without ID
+    and path but with identifying properties. During `split_into_inserts_and_updates` all three
+    must be identified with each other (and must be merged). Since we require that treated
+    entities have a valid ID if they exist in the remote server, all three objects would be
+    identified with each other simply using the IDs.
+
+    In the case that the Record is not yet in the remote server, there cannot be a Python object
+    with an ID. Thus we might have one with a path and one with an identifiable. If that Record
+    does not yet exist, it is necessary that both Python objects have at least either the path or
+    the identifiable in common.
+    """
+
+    def __init__(self):
+        self._id_look_up: dict[int, db.Entity] = {}
+        self._path_look_up: dict[str, db.Entity] = {}
+        self._identifiable_look_up: dict[str, db.Entity] = {}
+        self.remote_missing_counter = -1
+        self._missing: dict[int, db.Entity] = {}
+        self._existing: dict[int, db.Entity] = {}
+
+    def add(self, record: db.Entity, identifiable: Optional[Identifiable] = None):
+        """
+        Add a Record that was treated, such that it is contained in the internal look-up dicts
+
+        This Record MUST have an ID if it was found in the remote server.
+        """
+        if record.id is None:
+            if record.path is None and identifiable is None:
+                raise RuntimeError("Record must have ID or path or an identifiable must be given."
+                                   f" Record is\n{record}")
+            record.id = self.remote_missing_counter
+            self.remote_missing_counter -= 1
+            self._add_any(record, self._missing, identifiable)
+        else:
+            self._add_any(record, self._existing, identifiable)
+
+    def get_any(self, record: db.Entity, identifiable: Optional[Identifiable] = None):
+        """
+        Check whether this Record was already added.
+        Identity is based on ID, path or Identifiable
+        representation
+        """
+        if record.id is not None and record.id in self._id_look_up:
+            return self._id_look_up[record.id]
+        if record.path is not None and record.path in self._path_look_up:
+            return self._path_look_up[record.path]
+        if (identifiable is not None and identifiable.get_representation() in
+                self._identifiable_look_up):
+            return self._identifiable_look_up[identifiable.get_representation()]
+
+    def get_existing(self, record: db.Entity, identifiable: Optional[Identifiable] = None):
+        """ Check whether this Record exists on the remote server
+
+        Returns: The stored Record
+        """
+        rec = self.get_any(record, identifiable)
+        if id(rec) in self._existing:
+            return rec
+        else:
+            return None
+
+    def get_missing(self, record: db.Entity, identifiable: Optional[Identifiable] = None):
+        """ Check whether this Record is missing on the remote server
+
+        Returns: The stored Record
+        """
+        rec = self.get_any(record, identifiable)
+        if id(rec) in self._missing:
+            return rec
+        else:
+            return None
+
+    def get_missing_list(self):
+        """ Return all Records that are missing in the remote server """
+        return list(self._missing.values())
+
+    def get_existing_list(self):
+        """ Return all Records that exist in the remote server """
+        return list(self._existing.values())
+
+    def _add_any(self, record: db.Entity, lookup, identifiable: Optional[Identifiable] = None):
+        if record.id is not None:
+            self._id_look_up[record.id] = record
+        if record.path is not None:
+            self._path_look_up[record.path] = record
+        if identifiable is not None:
+            self._identifiable_look_up[identifiable.get_representation()] = record
+        lookup[id(record)] = record
+
+
 class Crawler(object):
     """
     Crawler class that encapsulates crawling functions.
@@ -257,8 +365,8 @@ class Crawler(object):
         # The following caches store records, where we checked whether they exist on the remote
         # server. Since, it is important to know whether they exist or not, we store them into two
         # different caches.
-        self.remote_existing_cache = IdentifiedCache()
-        self.remote_missing_cache = IdentifiedCache()
+        self.treated_records_lookup = TreatedRecordLookUp()
+
         # TODO does it make sense to have this as member variable?
         self.securityMode = securityMode
         # TODO does it make sense to have this as member variable(run_id)?
@@ -405,7 +513,7 @@ class Crawler(object):
                     Crawler.create_flat_list([p.value], flat)
         return flat

-    def _has_missing_object_in_references(self, ident: Identifiable, referencing_entities: list):
+    def _has_missing_object_in_references(self, ident: Identifiable, referencing_entities: dict):
         """
         returns False if any value in the properties attribute is a db.Entity object
         that is contained in the `remote_missing_cache`.
If ident has such an object in @@ -418,16 +526,21 @@ class Crawler(object): # Entity instead of ID and not cached locally if (isinstance(pvalue, list)): for el in pvalue: - if (isinstance(el, db.Entity) and self.get_from_remote_missing_cache( - self.identifiableAdapter.get_identifiable(el, referencing_entities)) is not None): + elident = self.identifiableAdapter.get_identifiable( + el, referencing_entities[id(el)]) + if (isinstance(el, db.Entity) + and self.treated_records_lookup.get_missing(el, elident) is not None): return True - if (isinstance(pvalue, db.Entity) and self.get_from_remote_missing_cache( - self.identifiableAdapter.get_identifiable(pvalue, referencing_entities)) is not None): + if (isinstance(pvalue, db.Entity) and self.treated_records_lookup.get_missing( + pvalue, + self.identifiableAdapter.get_identifiable(pvalue, + referencing_entities[id(pvalue)]) + ) is not None): # might be checked when reference is resolved return True return False - def replace_references_with_cached(self, record: db.Record, referencing_entities: list): + def replace_references_with_cached(self, record: db.Record, referencing_entities: dict): """ Replace all references with the versions stored in the cache. @@ -438,8 +551,10 @@ class Crawler(object): lst = [] for el in p.value: if (isinstance(el, db.Entity) and el.id is None): - cached = self.get_from_any_cache( - self.identifiableAdapter.get_identifiable(el, referencing_entities)) + cached = self.treated_records_lookup.get_any( + el, + self.identifiableAdapter.get_identifiable( + el, referencing_entities[id(el)])) if cached is None: lst.append(el) continue @@ -447,12 +562,12 @@ class Crawler(object): if isinstance(p.value, db.File): if p.value.path != cached.path: raise RuntimeError( - "The cached and the refernced entity are not identical.\n" + "The cached and the referenced entity are not identical.\n" f"Cached:\n{cached}\nReferenced:\n{el}" ) else: raise RuntimeError( - "The cached and the refernced entity are not identical.\n" + "The cached and the referenced entity are not identical.\n" f"Cached:\n{cached}\nReferenced:\n{el}" ) lst.append(cached) @@ -460,82 +575,25 @@ class Crawler(object): lst.append(el) p.value = lst if (isinstance(p.value, db.Entity) and p.value.id is None): - cached = self.get_from_any_cache( - self.identifiableAdapter.get_identifiable(p.value, referencing_entities)) + cached = self.treated_records_lookup.get_any( + p.value, self.identifiableAdapter.get_identifiable( + p.value, referencing_entities[id(p.value)])) if cached is None: continue if not check_identical(cached, p.value, True): if isinstance(p.value, db.File): if p.value.path != cached.path: raise RuntimeError( - "The cached and the refernced entity are not identical.\n" + "The cached and the referenced entity are not identical.\n" f"Cached:\n{cached}\nReferenced:\n{p.value}" ) else: raise RuntimeError( - "The cached and the refernced entity are not identical.\n" + "The cached and the referenced entity are not identical.\n" f"Cached:\n{cached}\nReferenced:\n{p.value}" ) p.value = cached - def get_from_remote_missing_cache(self, identifiable: Identifiable): - """ - returns the identified record if an identifiable with the same values already exists locally - (Each identifiable that is not found on the remote server, is 'cached' locally to prevent - that the same identifiable exists twice) - """ - if identifiable is None: - raise ValueError("Identifiable has to be given as argument") - - if identifiable in self.remote_missing_cache: - return 
self.remote_missing_cache[identifiable]
-        else:
-            return None
-
-    def get_from_any_cache(self, identifiable: Identifiable):
-        """
-        returns the identifiable if an identifiable with the same values already exists locally
-        (Each identifiable that is not found on the remote server, is 'cached' locally to prevent
-        that the same identifiable exists twice)
-        """
-        if identifiable is None:
-            raise ValueError("Identifiable has to be given as argument")
-
-        if identifiable in self.remote_existing_cache:
-            return self.remote_existing_cache[identifiable]
-        elif identifiable in self.remote_missing_cache:
-            return self.remote_missing_cache[identifiable]
-        else:
-            return None
-
-    def add_to_remote_missing_cache(self, record: db.Record, identifiable: Identifiable):
-        """
-        stores the given Record in the remote_missing_cache.
-
-        If identifiable is None, the Record is NOT stored.
-        """
-        self.add_to_cache(record=record, cache=self.remote_missing_cache,
-                          identifiable=identifiable)
-
-    def add_to_remote_existing_cache(self, record: db.Record, identifiable: Identifiable):
-        """
-        stores the given Record in the remote_existing_cache.
-
-        If identifiable is None, the Record is NOT stored.
-        """
-        self.add_to_cache(record=record, cache=self.remote_existing_cache,
-                          identifiable=identifiable)
-
-    def add_to_cache(self, record: db.Record, cache: IdentifiedCache,
-                     identifiable: Identifiable) -> None:
-        """
-        stores the given Record in the given cache.
-
-        If identifiable is None, the Record is NOT stored.
-        """
-        if identifiable is not None:
-            cache.add(identifiable=identifiable, record=record)
-
     @staticmethod
     def bend_references_to_new_object(old, new, entities):
         """ Bend references to the other object
@@ -552,23 +610,73 @@ class Crawler(object):
             if p.value is old:
                 p.value = new

+    def _merge_identified(self, newrecord, record, try_to_merge_later, all_records):
+        """ tries to merge record into newrecord
+
+        If it fails, record is added to the try_to_merge_later list.
+        In any case, references are bent to the newrecord object.
+
+        """
+        try:
+            merge_entities(
+                newrecord, record, merge_references_with_empty_diffs=False,
+                merge_id_with_resolved_entity=True)
+        except EntityMergeConflictError:
+            _treat_merge_error_of(newrecord, record)
+            # We cannot merge, but it is none of the clear cases where merging is
+            # impossible. Thus, we try again later.
+            try_to_merge_later.append(record)
+            if newrecord.id is not None:
+                record.id = newrecord.id
+        except NotImplementedError:
+            print(newrecord)
+            print(record)
+            raise
+        Crawler.bend_references_to_new_object(
+            old=record, new=newrecord,
+            entities=all_records
+        )
+
+    def _identity_relies_on_unchecked_entities(self, record: db.Record, referencing_entities):
+        """
+        Return True if the identity of ``record`` depends on an entity (referencing or
+        referenced) for which it could not yet be verified whether it exists in LinkAhead;
+        otherwise return False.
+        """
+
+        registered_identifiable = self.identifiableAdapter.get_registered_identifiable(record)
+        if registered_identifiable is None:
+            return False
+        refs = self.identifiableAdapter.get_identifying_referencing_entities(referencing_entities,
+                                                                             registered_identifiable)
+        if any(el is None for el in refs):
+            return True
+
+        refs = self.identifiableAdapter.get_identifying_referenced_entities(
+            record, registered_identifiable)
+        if any([self.treated_records_lookup.get_any(el) is None for el in refs]):
+            return True
+
+        return False
+
     @staticmethod
     def create_reference_mapping(flat: list[db.Entity]):
         """
         Create a dictionary of dictionaries of the form:
-        dict[int, dict[str, list[db.Entity]]]
+        dict[int, dict[str, list[Union[int,None]]]]

         - The integer index is the Python id of the value object.
         - The string is the name of the first parent of the referencing object.

         Each value objects is taken from the values of all properties from
         the list flat.

-        So the returned mapping maps ids of entities to the objects which are referring
+        So the returned mapping maps ids of entities to the ids of objects which are referring
         to them.
         """
         # TODO we need to treat children of RecordTypes somehow.
-        references: dict[int, dict[str, list[db.Entity]]] = {}
+        references: dict[int, dict[str, list[Union[int, None]]]] = {}
         for ent in flat:
+            if id(ent) not in references:
+                references[id(ent)] = {}
             for p in ent.properties:
                 val = p.value
                 if not isinstance(val, list):
@@ -579,127 +687,119 @@ class Crawler(object):
                     references[id(v)] = {}
                 if ent.parents[0].name not in references[id(v)]:
                     references[id(v)][ent.parents[0].name] = []
-                references[id(v)][ent.parents[0].name].append(ent)
+                references[id(v)][ent.parents[0].name].append(ent.id)

         return references

     def split_into_inserts_and_updates(self, ent_list: list[db.Entity]):
-        to_be_inserted: list[db.Entity] = []
-        to_be_updated: list[db.Entity] = []
         flat = Crawler.create_flat_list(ent_list)
+        all_records = list(flat)

         # TODO: can the following be removed at some point
         for ent in flat:
             if ent.role == "Record" and len(ent.parents) == 0:
                 raise RuntimeError(f"Records must have a parent.\n{ent}")

-        resolved_references = True
+        # Check whether Records can be identified without identifiable
+        for i in reversed(range(len(flat))):
+            record = flat[i]
+            # 1. Can it be identified via an ID?
+            if record.id is not None:
+                treated_record = self.treated_records_lookup.get_existing(record)
+                if treated_record is not None:
+                    self._merge_identified(treated_record, record, try_to_merge_later, all_records)
+                    all_records.remove(record)
+                    referencing_entities = self.create_reference_mapping(all_records)
+                else:
+                    self.treated_records_lookup.add(record, None)
+                assert record.id
+                del flat[i]
+            # 2. Can it be identified via a path?
+ elif record.path is not None: + try: + existing = cached_get_entity_by(path=record.path) + except EmptyUniqueQueryError: + existing = None + if existing is not None: + record.id = existing.id + # TODO check the following copying of _size and _checksum + # Copy over checksum and size too if it is a file + record._size = existing._size + record._checksum = existing._checksum + treated_record = self.treated_records_lookup.get_any(record) + if treated_record is not None: + self._merge_identified(treated_record, record, try_to_merge_later, all_records) + all_records.remove(record) + referencing_entities = self.create_reference_mapping(all_records) + else: + # TODO add identifiable if possible + self.treated_records_lookup.add(record, None) + assert record.id + del flat[i] + + entity_was_treated = True # flat contains Entities which could not yet be checked against the remote server try_to_merge_later = [] - while resolved_references and len(flat) > 0: - resolved_references = False - referencing_entities = self.create_reference_mapping( - flat + to_be_updated + try_to_merge_later + to_be_inserted) + while entity_was_treated and len(flat) > 0: + entity_was_treated = False + referencing_entities = self.create_reference_mapping(all_records) # For each element we try to find out whether we can find it in the server or whether # it does not yet exist. Since a Record may reference other unkown Records it might not # be possible to answer this right away. # The following checks are done on each Record: - # 1. Can it be identified via an ID? - # 2. Can it be identified via a path? - # 3. Is it in the cache of already checked Records? - # 4. Can it be checked on the remote server? - # 5. Does it have to be new since a needed reference is missing? + # 1. Is it in the cache of already checked Records? + # 2. Can it be checked on the remote server? + # 3. Does it have to be new since a needed reference is missing? for i in reversed(range(len(flat))): record = flat[i] + + if self._identity_relies_on_unchecked_entities(record, + referencing_entities[id(record)]): + continue + identifiable = self.identifiableAdapter.get_identifiable( record, - referencing_entities=referencing_entities) - - # TODO remove if the exception is never raised - if record in to_be_inserted: - raise RuntimeError("This should not be reached since treated elements" - "are removed from the list") - # 1. Can it be identified via an ID? - elif record.id is not None: - to_be_updated.append(record) - self.add_to_remote_existing_cache(record, identifiable) - del flat[i] - # 2. Can it be identified via a path? - elif record.path is not None: - try: - existing = cached_get_entity_by(path=record.path) - except EmptyUniqueQueryError: - existing = None - if existing is None: - to_be_inserted.append(record) - self.add_to_remote_missing_cache(record, identifiable) - del flat[i] - else: - record.id = existing.id - # TODO check the following copying of _size and _checksum - # Copy over checksum and size too if it is a file - record._size = existing._size - record._checksum = existing._checksum - to_be_updated.append(record) - self.add_to_remote_existing_cache(record, identifiable) - del flat[i] - # 3. Is it in the cache of already checked Records? - elif self.get_from_any_cache(identifiable) is not None: - newrecord = self.get_from_any_cache(identifiable) - # Since the identifiables are the same, newrecord and record actually describe - # the same obejct. 
- # We merge the two in order to prevent loss of information - try: - merge_entities( - newrecord, record, merge_references_with_empty_diffs=False, merge_id_with_resolved_entity=True) - except EntityMergeConflictError: - _treat_merge_error_of(newrecord, record) - # We cannot merge but it is none of the clear case where merge is - # impossible. Thus we try later - try_to_merge_later.append(record) - if newrecord.id is not None: - record.id = newrecord.id - except NotImplementedError: - print(newrecord) - print(record) - raise - Crawler.bend_references_to_new_object( - old=record, new=newrecord, - entities=flat + to_be_updated + to_be_inserted + try_to_merge_later - ) - referencing_entities = self.create_reference_mapping( - flat + to_be_updated + try_to_merge_later + to_be_inserted) + referencing_entities=referencing_entities[id(record)]) + + # 1. Is it in the cache of already checked Records? + if self.treated_records_lookup.get_any(record, identifiable) is not None: + treated_record = self.treated_records_lookup.get_any(record, identifiable) + # Since the identifiables are the same, treated_record and record actually + # describe the same object. + # We merge record into treated_record in order to prevent loss of information + self._merge_identified(treated_record, record, try_to_merge_later, all_records) + all_records.remove(record) + referencing_entities = self.create_reference_mapping(all_records) del flat[i] - resolved_references = True + entity_was_treated = True - # 4. Can it be checked on the remote server? + # 2. Can it be checked on the remote server? elif not self._has_reference_value_without_id(identifiable): identified_record = ( self.identifiableAdapter.retrieve_identified_record_for_identifiable( identifiable)) if identified_record is None: # identifiable does not exist remotely -> record needs to be inserted - to_be_inserted.append(record) - self.add_to_remote_missing_cache(record, identifiable) - del flat[i] + self.treated_records_lookup.add(record, identifiable) else: # side effect record.id = identified_record.id - to_be_updated.append(record) - self.add_to_remote_existing_cache(record, identifiable) - del flat[i] - resolved_references = True + record.path = identified_record.path + self.treated_records_lookup.add(record, identifiable) + assert record.id + del flat[i] + entity_was_treated = True - # 5. Does it have to be new since a needed reference is missing? + # 3. Does it have to be new since a needed reference is missing? # (Is it impossible to check this record because an identifiable references a # missing record?) 
elif self._has_missing_object_in_references(identifiable, referencing_entities): - to_be_inserted.append(record) - self.add_to_remote_missing_cache(record, identifiable) + self.treated_records_lookup.add(record, identifiable) + assert record.id del flat[i] - resolved_references = True + entity_was_treated = True for record in flat: self.replace_references_with_cached(record, referencing_entities) @@ -709,23 +809,35 @@ class Crawler(object): for record in try_to_merge_later: identifiable = self.identifiableAdapter.get_identifiable( record, - referencing_entities=referencing_entities) - newrecord = self.get_from_any_cache(identifiable) + referencing_entities=referencing_entities[id(record)]) + newrecord = self.treated_records_lookup.get_any(record, identifiable) merge_entities(newrecord, record, merge_id_with_resolved_entity=True) if len(flat) > 0: circle = self.detect_circular_dependency(flat) if circle is None: logger.error("Failed, but found NO circular dependency. The data is as follows:" - + str(self.compact_entity_list_representation(flat))) + + str(self.compact_entity_list_representation(flat, + referencing_entities))) else: logger.error("Found circular dependency (Note that this might include references " "that are not identifying properties): " - + self.compact_entity_list_representation(circle)) + + self.compact_entity_list_representation(circle, + referencing_entities)) + raise RuntimeError( f"Could not finish split_into_inserts_and_updates. Circular dependency: " f"{circle is not None}") - return to_be_inserted, to_be_updated + # remove negative IDs + missing = self.treated_records_lookup.get_missing_list() + for el in missing: + if el.id is None: + raise RuntimeError("This should not happen") # TODO remove + if el.id >= 0: + raise RuntimeError("This should not happen") # TODO remove + el.id = None + + return (missing, self.treated_records_lookup.get_existing_list()) def replace_entities_with_ids(self, rec: db.Record): for el in rec.properties: @@ -738,23 +850,39 @@ class Crawler(object): if val.id is not None: el.value[index] = val.id - @staticmethod - def compact_entity_list_representation(circle): + @ staticmethod + def compact_entity_list_representation(entities, referencing_entities: List) -> str: """ a more readable representation than the standard xml representation TODO this can be removed once the yaml format representation is in pylib """ text = "\n--------\n" - for el in circle: - if el.name is not None: - text += f"{el.name}\n" - text += f"{[el.name for el in el.parents]}\n" - props = {p.name: p.value for p in el.properties} - text += f"{props}\n" + + grouped = {"": []} + for ent in entities: + if not ent.parents: + grouped[""].append(ent) + for parent in ent.parents: + if parent.name not in grouped: + grouped[parent.name] = [] + grouped[parent.name].append(ent) + if not grouped[""]: + del grouped[""] + for parent, group in grouped.items(): + text += f"\n> Parent: {parent}\n" + for ent in group: + if ent.name is not None: + text += f"\n>> Name: {ent.name}\n" + else: + text += "\n>> name: # No name" + text += f"{[ent.name for ent in ent.parents]}\n" + props = {p.name: p.value for p in ent.properties} + text += f"{props}\n" + text += f"is_referenced_by:\n{referencing_entities[id(ent)]}\n" return text + "--------\n" - @staticmethod + @ staticmethod def detect_circular_dependency(flat: list[db.Entity]): """ Detects whether there are circular references in the given entity list and returns a list @@ -787,7 +915,7 @@ class Crawler(object): return None return circle - 
@staticmethod + @ staticmethod def _merge_properties_from_remote( crawled_data: list[db.Record], identified_records: list[db.Record] @@ -829,7 +957,7 @@ class Crawler(object): return to_be_updated - @staticmethod + @ staticmethod def remove_unnecessary_updates( crawled_data: list[db.Record], identified_records: list[db.Record] @@ -855,7 +983,7 @@ class Crawler(object): return actual_updates - @staticmethod + @ staticmethod def execute_parent_updates_in_list(to_be_updated, securityMode, run_id, unique_names): """ Execute the updates of changed parents. @@ -898,13 +1026,13 @@ class Crawler(object): "mode. This might lead to a failure of inserts that follow.") logger.info(parent_updates) - @staticmethod + @ staticmethod def _get_property_id_for_datatype(rtname: str, name: str): return cached_get_entity_by( - query=f"FIND Entity '{escape_quoted_text(rtname)}' " - f"with name='{escape_quoted_text(name)}'").id + query=f"FIND Entity '{escape_squoted_text(rtname)}' " + f"with name='{escape_squoted_text(name)}'").id - @staticmethod + @ staticmethod def replace_name_with_referenced_entity_id(prop: db.Property): """changes the given property in place if it is a reference property that has a name as value @@ -937,7 +1065,7 @@ class Crawler(object): if isinstance(el, str): try: # the get_entity function will raise an error if not unique - propval.append(Crawler._get_property_id_for_datatype(rtname=prop.datatype, + propval.append(Crawler._get_property_id_for_datatype(rtname=dt, name=el)) except (db.EmptyUniqueQueryError, db.QueryNotUniqueError): logger.error( @@ -949,7 +1077,7 @@ class Crawler(object): propval.append(el) prop.value = propval - @staticmethod + @ staticmethod def execute_inserts_in_list(to_be_inserted, securityMode, run_id: Optional[uuid.UUID] = None, unique_names=True): @@ -969,7 +1097,7 @@ class Crawler(object): update_cache = UpdateCache() update_cache.insert(to_be_inserted, run_id, insert=True) - @staticmethod + @ staticmethod def set_ids_and_datatype_of_parents_and_properties(rec_list): for record in rec_list: for parent in record.parents: @@ -981,7 +1109,7 @@ class Crawler(object): prop.id = entity.id _resolve_datatype(prop, entity) - @staticmethod + @ staticmethod def execute_updates_in_list(to_be_updated, securityMode, run_id: Optional[uuid.UUID] = None, unique_names=True): @@ -995,7 +1123,7 @@ class Crawler(object): update_cache = UpdateCache() update_cache.insert(to_be_updated, run_id) - @staticmethod + @ staticmethod def check_whether_parent_exists(records: list[db.Entity], parents: list[str]): """ returns a list of all records in `records` that have a parent that is in `parents`""" problems = [] @@ -1047,12 +1175,11 @@ class Crawler(object): """ if crawled_data is None: warnings.warn(DeprecationWarning( - "Calling synchronize without the data to be synchronized is depricated. Please " + "Calling synchronize without the data to be synchronized is deprecated. 
Please " "use for example the Scanner to create this data.")) crawled_data = self.crawled_data to_be_inserted, to_be_updated = self.split_into_inserts_and_updates(crawled_data) - referencing_entities = self.create_reference_mapping(to_be_updated + to_be_inserted) for el in to_be_updated: # all entity objects are replaced by their IDs except for the not yet inserted ones @@ -1116,7 +1243,7 @@ class Crawler(object): return (to_be_inserted, to_be_updated) - @staticmethod + @ staticmethod def create_entity_summary(entities: list[db.Entity]): """ Creates a summary string reprensentation of a list of entities.""" parents = {} @@ -1135,7 +1262,7 @@ class Crawler(object): output = output[:-2] + "\n" return output - @staticmethod + @ staticmethod def inform_about_pending_changes(pending_changes, run_id, path, inserts=False): # Sending an Email with a link to a form to authorize updates is if get_config_setting("send_crawler_notifications"): @@ -1156,7 +1283,7 @@ ____________________\n""".format(i + 1, len(pending_changes)) + str(el[3])) + " by invoking the crawler" " with the run id: {rid}\n".format(rid=run_id)) - @staticmethod + @ staticmethod def debug_build_usage_tree(converter: Converter): res: dict[str, dict[str, Any]] = { converter.name: { diff --git a/src/caoscrawler/debug_tree.py b/src/caoscrawler/debug_tree.py index 9983981c69e3df7c58ddfda4b6977944eac54999..0d57040f5c20aca236a3c11531e8b7c45bad89ab 100644 --- a/src/caoscrawler/debug_tree.py +++ b/src/caoscrawler/debug_tree.py @@ -45,13 +45,13 @@ from importlib_resources import files from jsonschema import validate from typing import Any, Optional, Type, Union -import caosdb as db +import linkahead as db from caosadvancedtools.cache import UpdateCache, Cache from caosadvancedtools.crawler import Crawler as OldCrawler -from caosdb.apiutils import (compare_entities, EntityMergeConflictError, - merge_entities) -from caosdb.common.datatype import is_reference +from linkahead.apiutils import (compare_entities, EntityMergeConflictError, + merge_entities) +from linkahead.common.datatype import is_reference from .converters import Converter, DirectoryConverter, ConverterValidationError diff --git a/src/caoscrawler/hdf5_converter.py b/src/caoscrawler/hdf5_converter.py new file mode 100644 index 0000000000000000000000000000000000000000..5b1ff5775fb74919c989507c449636fd822db7f0 --- /dev/null +++ b/src/caoscrawler/hdf5_converter.py @@ -0,0 +1,336 @@ +# +# This file is a part of the CaosDB Project. +# +# Copyright (C) 2023 IndiScale GmbH <info@indiscale.com> +# Copyright (C) 2023 Florian Spreckelsen <f.spreckelsen@indiscale.com> +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <https://www.gnu.org/licenses/>. +# + +try: + import h5py +except ModuleNotFoundError: + raise ModuleNotFoundError( + "Couldn't find module h5py. Did you install the crawler package with " + "its optional `h5-crawler` dependency?" 
+    )
+
+import numpy as np
+
+from typing import Union
+
+import linkahead as db
+
+from .converters import (convert_basic_element, Converter, DictElementConverter,
+                         match_name_and_value, SimpleFileConverter)
+from .stores import GeneralStore, RecordStore
+from .structure_elements import DictElement, File, FloatElement, IntegerElement, StructureElement
+
+
+def convert_attributes(elt: Union[h5py.File, h5py.Group, h5py.Dataset]):
+    """Convert hdf5 attributes to a list of either basic scalar structure elements or ndarrays.
+
+    Parameters
+    ----------
+    elt : Union[h5py.File, h5py.Group, h5py.Dataset]
+        The hdf5 element the attributes of which will be converted to structure
+        elements.
+
+    Returns
+    -------
+    converted : list[StructureElement]
+        A list of the attributes converted to StructureElements (either basic
+        scalar elements or ndarray).
+    """
+
+    converted = []
+    for name, value in elt.attrs.items():
+        converted.append(convert_basic_element_with_nd_array(
+            value, name,
+            msg_prefix=f"The value of attribute {name} has an unknown type: {type(value)}."))
+
+    return converted
+
+
+def convert_h5_element(elt: Union[h5py.Group, h5py.Dataset], name: str):
+    """Convert a given HDF5 element to the corresponding StructureElement.
+
+    Parameters
+    ----------
+    elt : Union[h5py.Group, h5py.Dataset]
+        The hdf5 element to be converted.
+    name : str
+        The name of the StructureElement that the hdf5 element is converted to.
+
+    Raises
+    ------
+    ValueError
+        In case of anything that is not convertible to a HDF5 structure element.
+
+    Returns
+    -------
+    StructureElement
+        The converted StructureElement.
+    """
+
+    if isinstance(elt, h5py.Group):
+
+        return H5GroupElement(name, elt)
+
+    if isinstance(elt, h5py.Dataset):
+
+        return H5DatasetElement(name, elt)
+
+    raise ValueError("The given element must be either a HDF5 Group or Dataset object.")
+
+
+def convert_basic_element_with_nd_array(value, name: str = None,
+                                        internal_path: str = None, msg_prefix: str = ""):
+    """Convert a given object either to an ndarray structure element or to a
+    basic scalar structure element.
+
+    This function extends :func:`~caoscrawler.converters.convert_basic_element`
+    by a special treatment for certain numpy objects, most importantly
+    ndarrays. They are converted to a scalar in case of a size-1 array, to a
+    list in case of a 1-d array, and to a ``H5NdarrayElement`` in all other
+    cases. In addition, numpy integers and floats are also converted to
+    IntegerElements and FloatElements, respectively.
+
+    Parameters
+    ----------
+    value
+        The object to be converted.
+    name : str, optional
+        The name of the structure element ``value`` is being converted
+        to. Default is None.
+    internal_path : str, optional
+        The internal path of ``value`` within the HDF5 file. Default is None.
+    msg_prefix : str, optional
+        The prefix of the error message that will be raised. Default is ``""``.
+
+    Returns
+    -------
+    StructureElement
+        The StructureElement ``value`` was converted to.
+
+    """
+
+    if isinstance(value, np.ndarray):
+
+        if value.size == 1:
+            # this is a scalar stacked in a numpy array. We don't know its
+            # actual shape, so we reshape first, then use the actual value
+            # inside.
+            value = value.reshape((1,))[0]
+
+        elif np.squeeze(value).ndim == 1:
+            # If the array is one-dimensional we can save it as a list
+            value = list(np.squeeze(value))
+
+        else:
+            # real multi-dimensional array
+            return H5NdarrayElement(name, value, internal_path)
+
+    elif isinstance(value, np.int32) or isinstance(value, np.int64):
+
+        return IntegerElement(name, value)
+
+    elif isinstance(value, np.float64):
+
+        return FloatElement(name, value)
+
+    return convert_basic_element(value, name, msg_prefix)
+
+
+class H5GroupElement(DictElement):
+    """StructureElement specific for HDF5 groups"""
+
+    def __init__(self, name: str, value: h5py.Group):
+        super().__init__(name, value)
+
+
+class H5DatasetElement(DictElement):
+    """StructureElement specific for HDF5 datasets."""
+
+    def __init__(self, name: str, value: h5py.Dataset):
+        super().__init__(name, value)
+
+
+class H5NdarrayElement(DictElement):
+    """StructureElement specific for NDArrays within HDF5 files.
+
+    Also store the internal path of the array within the HDF5 file in its
+    ``internal_path`` attribute.
+
+    """
+
+    def __init__(self, name: str, value, internal_path: str):
+        super().__init__(name, value)
+        self.internal_path = internal_path
+
+
+class H5FileConverter(SimpleFileConverter):
+    """Converter for HDF5 files that creates children for the contained
+    attributes, groups, and datasets.
+
+    """
+
+    def create_children(self, generalStore: GeneralStore, element: StructureElement):
+        """Create children from root-level file attributes and contained hdf5
+        elements.
+
+        """
+
+        if not isinstance(element, File):
+
+            raise ValueError("create_children should have been called with a File object.")
+
+        ff = h5py.File(element.path, 'r')
+
+        children = []
+
+        for name, value in ff.items():
+
+            children.append(convert_h5_element(value, name))
+
+        children.extend(convert_attributes(ff))
+
+        return children
+
+
+class H5GroupConverter(DictElementConverter):
+    """Converter for HDF5 groups that creates children from the group-level
+    attributes and the contained subgroups and datasets.
+
+    """
+
+    def typecheck(self, element: StructureElement):
+
+        return isinstance(element, H5GroupElement)
+
+    def create_children(self, generalStore: GeneralStore, element: StructureElement):
+        """Create children from group attributes and hdf5 elements contained in
+        this group.
+
+        """
+
+        if not isinstance(element.value, h5py.Group):
+
+            raise ValueError("create_children should have been called with a HDF5 Group object.")
+
+        children = []
+
+        for name, value in element.value.items():
+
+            children.append(convert_h5_element(value, name))
+
+        children.extend(convert_attributes(element.value))
+
+        return children
+
+
+class H5DatasetConverter(DictElementConverter):
+    """Converter for HDF5 datasets that creates children from the dataset
+    attributes and the contained array data.
+
+    """
+
+    def typecheck(self, element: StructureElement):
+
+        return isinstance(element, H5DatasetElement)
+
+    def create_children(self, generalStore: GeneralStore, element: StructureElement):
+        """Create children from the dataset attributes and append the array data
+        contained in this dataset.
+ + """ + + if not isinstance(element.value, h5py.Dataset): + + raise ValueError("create_children should have been called with a HDF5 Dataset object") + + children = convert_attributes(element.value) + + children.append( + H5NdarrayElement( + name=self.name+"_ndarray", + value=element.value, + internal_path=element.value.name + ) + ) + return children + + +class H5NdarrayConverter(Converter): + """Converter for ndarrays contained in HDF5 files. Creates the wrapper + record for this ndarray. + + """ + + def __init__(self, definition: dict, name: str, converter_registry: dict): + + # Check that a non-empty name for the record that will be created for + # the ndarray Record (within the cfood) is given + if not ("recordname" in definition and definition["recordname"]): + + raise RuntimeError(f"Converter {name} lacks the `recordname` definition.") + + super().__init__(definition, name, converter_registry) + + def create_children(self, values: GeneralStore, element: StructureElement): + """The ndarray doesn't have any further children.""" + + return [] + + def create_records(self, values: GeneralStore, records: RecordStore, element: StructureElement): + """Create a wrapper record with name ``recordname``, type + ``array_recordtype_name`` (default ``H5Ndarray``) and the internal path + stored in a property with name ``internal_path_property_name`` (default + ``internal_hdf5_path``). + + """ + + rname = self.definition["recordname"] + if "array_recordtype_name" in self.definition: + rtname = self.definition["array_recordtype_name"] + else: + rtname = "H5Ndarray" + + if "internal_path_property_name" in self.definition: + propname = self.definition["internal_path_property_name"] + else: + propname = "internal_hdf5_path" + + rec = db.Record().add_parent(rtname) + records[rname] = rec + values[rname] = rec + + rec.add_property(name=propname, value=element.internal_path) + keys_modified = [(rname, propname)] + + keys_modified.extend(super().create_records(values, records, element)) + + return keys_modified + + def typecheck(self, element: StructureElement): + + return isinstance(element, H5NdarrayElement) + + @Converter.debug_matching("name") + def match(self, element: StructureElement): + + if not isinstance(element, H5NdarrayElement): + + raise RuntimeError("This converter can only be called with H5NdarrayElements.") + + return match_name_and_value(self.definition, element.name, element.value) diff --git a/src/caoscrawler/identifiable.py b/src/caoscrawler/identifiable.py index 75af5be8f06a6ab95a4b7f2b92eda8cf3e321a1b..cefdf4a0f42b1f610e0712fdefebc2dc3b78d69f 100644 --- a/src/caoscrawler/identifiable.py +++ b/src/caoscrawler/identifiable.py @@ -20,7 +20,7 @@ # from __future__ import annotations -import caosdb as db +import linkahead as db from datetime import datetime import json from hashlib import sha256 diff --git a/src/caoscrawler/identifiable_adapters.py b/src/caoscrawler/identifiable_adapters.py index ceaa3bfe1f1f040fc0099d17e14a0c6797804ac4..9be539fe0842e3ce24b68060fa2288cdc4c531b2 100644 --- a/src/caoscrawler/identifiable_adapters.py +++ b/src/caoscrawler/identifiable_adapters.py @@ -26,13 +26,16 @@ from __future__ import annotations import logging +import warnings from abc import ABCMeta, abstractmethod from datetime import datetime +from functools import lru_cache from typing import Any -import caosdb as db +import linkahead as db import yaml -from caosdb.cached import cached_get_entity_by +from linkahead.cached import cached_get_entity_by, cached_query +from linkahead.utils.escape import 
escape_squoted_text

 from .identifiable import Identifiable
 from .utils import has_parent
@@ -43,21 +46,24 @@ logger = logging.getLogger(__name__)

 def get_children_of_rt(rtname):
     """Supply the name of a recordtype. This name and the name of all children RTs are returned in a
     list"""
-    return [p.name for p in db.execute_query(f"FIND RECORDTYPE {rtname}")]
+    escaped = escape_squoted_text(rtname)
+    return [p.name for p in cached_query(f"FIND RECORDTYPE '{escaped}'")]


-def convert_value(value: Any):
-    """ Returns a string representation of the value that is suitable
-    to be used in the query
-    looking for the identified record.
+def convert_value(value: Any) -> str:
+    """ Return a string representation of the value suitable for the search query.
+
+    This is for search queries looking for the identified record.

     Parameters
     ----------
-    value : Any type, the value that shall be returned and potentially converted.
+    value: Any
+      The value to be converted.

     Returns
     -------
-    out : the string reprensentation of the value
+    out: str
+      the string representation of the value.

     """
@@ -68,8 +74,7 @@ def convert_value(value: Any):
     elif isinstance(value, bool):
         return str(value).upper()
     elif isinstance(value, str):
-        # replace single quotes, otherwise they may break the queries
-        return value.replace("\'", "\\'")
+        return escape_squoted_text(value)
     else:
         return str(value)

@@ -79,14 +84,15 @@ class IdentifiableAdapter(metaclass=ABCMeta):

 Some terms:

-- Registered identifiable is the definition of an identifiable which is:
-  - A record type as the parent
-  - A list of properties
-  - A list of referenced by statements
-- Identifiable is the concrete identifiable, e.g. the Record based on
-  the registered identifiable with all the values filled in.
-- Identified record is the result of retrieving a record based on the
-  identifiable from the database.
+- A *registered identifiable* defines an identifiable template, for example by specifying:
+  - Parent record types
+  - Properties
+  - ``is_referenced_by`` statements
+- An *identifiable* belongs to a concrete record. It consists of identifying attributes which "fill
+  in" the *registered identifiable*. In code, it can be represented as a Record based on the
+  *registered identifiable* with all the values filled in.
+- An *identified record* is the result of retrieving a record from the database, based on the
+  *identifiable* (and its values).

 General question to clarify:

@@ -95,24 +101,28 @@ General question to clarify:

 The list of referenced by statements is currently not implemented.

-The IdentifiableAdapter can be used to retrieve the three above mentioned objects (registred
+The IdentifiableAdapter can be used to retrieve the three above mentioned objects (registered
 identifiabel, identifiable and identified record) for a Record.
 """

     @staticmethod
-    def create_query_for_identifiable(ident: Identifiable):
+    def create_query_for_identifiable(ident: Identifiable, startswith: bool = False):
         """
         This function is taken from the old crawler:
         caosdb-advanced-user-tools/src/caosadvancedtools/crawler.py

         uses the properties of ident to create a query that can determine
         whether the required record already exists.
+
+        If ``startswith`` is True, use ``LIKE`` for long string values to test whether the string
+        starts with the first 200 characters of the value.
""" query_string = "FIND RECORD " if ident.record_type is not None: - query_string += f"'{ident.record_type}'" + escaped_rt = escape_squoted_text(ident.record_type) + query_string += f"'{escaped_rt}'" for ref in ident.backrefs: eid = ref if isinstance(ref, db.Entity): @@ -122,11 +132,13 @@ identifiabel, identifiable and identified record) for a Record. query_string += " WITH " if ident.name is not None: - query_string += "name='{}'".format(convert_value(ident.name)) + query_string += "name='{}'".format(escape_squoted_text(ident.name)) if len(ident.properties) > 0: query_string += " AND " - query_string += IdentifiableAdapter.create_property_query(ident) + query_string += IdentifiableAdapter.create_property_query(ident, startswith=startswith) + + # TODO Can these cases happen at all with the current code? if query_string.endswith(" AND WITH "): query_string = query_string[:-len(" AND WITH ")] if query_string.endswith(" AND "): @@ -134,15 +146,40 @@ identifiabel, identifiable and identified record) for a Record. return query_string @staticmethod - def create_property_query(entity: Identifiable): + def __create_pov_snippet(pname: str, pvalue, startswith: bool = False): + """Return something like ``'name'='some value'`` or ``'name' LIKE 'some*'``. + +If ``startswith`` is True, the value of strings will be cut off at 200 characters and a ``LIKE`` +operator will be used to find entities matching at the beginning. +""" + if startswith and isinstance(pvalue, str) and len(pvalue) > 200: + operator_value_str = f" LIKE '{escape_squoted_text(pvalue[:200])}*'" + else: + operator_value_str = "='" + convert_value(pvalue) + "'" + result = "'" + escape_squoted_text(pname) + "'" + operator_value_str + return result + + @staticmethod + def create_property_query(entity: Identifiable, startswith: bool = False): + """Create a POV query part with the entity's properties. + +Parameters +---------- + +entity: Identifiable + The Identifiable whose properties shall be used. + +startswith: bool, optional + If True, check string typed properties against the first 200 characters only. Default is False. + """ query_string = "" + pov = IdentifiableAdapter.__create_pov_snippet # Shortcut for pname, pvalue in entity.properties.items(): if pvalue is None: - query_string += "'" + pname + "' IS NULL AND " + query_string += "'" + escape_squoted_text(pname) + "' IS NULL AND " elif isinstance(pvalue, list): for v in pvalue: - query_string += ("'" + pname + "'='" + - convert_value(v) + "' AND ") + query_string += pov(pname, v, startswith=startswith) + " AND " # TODO: (for review) # This code would allow for more complex identifiables with @@ -155,8 +192,7 @@ identifiabel, identifiable and identified record) for a Record. # IdentifiableAdapter.create_property_query(p.value) + # ") AND ") else: - query_string += ("'" + pname + "'='" + - convert_value(pvalue) + "' AND ") + query_string += pov(pname, pvalue, startswith=startswith) + " AND " # remove the last AND return query_string[:-4] @@ -174,19 +210,64 @@ identifiabel, identifiable and identified record) for a Record. @abstractmethod def get_file(self, identifiable: db.File): + warnings.warn(DeprecationWarning("This function is deprecated. Please do not use it.")) """ Retrieve the file object for a (File) identifiable. 
""" pass + @staticmethod + def get_identifying_referencing_entities(referencing_entities, registered_identifiable): + refs = [] + for prop in registered_identifiable.properties: + if prop.name.lower() != "is_referenced_by": + continue + for looking_for_rt in prop.value: + found = False + if looking_for_rt == "*": + for val in referencing_entities.values(): + if len(val) > 0: + found = True + refs.extend(val) + else: + rt_and_children = get_children_of_rt(looking_for_rt) + for rtname in rt_and_children: + if (rtname in referencing_entities): + refs.extend(referencing_entities[rtname]) + found = True + if not found: + raise NotImplementedError( + f"Could not find referencing entities of type(s): {prop.value}\n" + f"for registered identifiable:\n{registered_identifiable}\n" + f"There were {len(referencing_entities)} referencing entities to choose from." + ) + return refs + + @staticmethod + def get_identifying_referenced_entities(record, registered_identifiable): + refs = [] + for prop in registered_identifiable.properties: + pname = prop.name.lower() + if pname == "name" or pname == "is_referenced_by": + continue + if record.get_property(prop.name) is None: + raise RuntimeError("Missing identifying Property") + pval = record.get_property(prop.name).value + if not isinstance(prop.value, list): + pval = [prop.value] + for val in pval: + if isinstance(val, db.Entity): + refs.append(val) + return refs + def get_identifiable(self, record: db.Record, referencing_entities=None): """ - retrieve the registred identifiable and fill the property values to create an - identifiable + Retrieve the registered identifiable and fill the property values to create an + identifiable. Args: record: the record for which the Identifiable shall be created. - referencing_entities: a dictionary (Type: dict[int, dict[str, list[db.Entity]]]), that + referencing_entities: a dictionary (Type: dict[str, list[db.Entity]]), that allows to look up entities with a certain RecordType, that reference ``record`` Returns: @@ -205,6 +286,8 @@ identifiabel, identifiable and identified record) for a Record. name_is_identifying_property = False if registered_identifiable is not None: + identifiable_backrefs = self.get_identifying_referencing_entities( + referencing_entities, registered_identifiable) # fill the values: for prop in registered_identifiable.properties: if prop.name == "name": @@ -215,24 +298,8 @@ identifiabel, identifiable and identified record) for a Record. # case A: in the registered identifiable # case B: in the identifiable - # TODO: similar to the Identifiable class, Registred Identifiable should be a - # separate class too + # treated above if prop.name.lower() == "is_referenced_by": - for givenrt in prop.value: - rt_and_children = get_children_of_rt(givenrt) - found = False - for rtname in rt_and_children: - if (id(record) in referencing_entities - and rtname in referencing_entities[id(record)]): - identifiable_backrefs.extend( - referencing_entities[id(record)][rtname]) - found = True - if not found: - # TODO: is this the appropriate error? - raise NotImplementedError( - f"The following record is missing an identifying property:" - f"RECORD\n{record}\nIdentifying PROPERTY\n{prop.name}" - ) continue record_prop = record.get_property(prop.name) @@ -256,7 +323,7 @@ identifiabel, identifiable and identified record) for a Record. "Multi properties used in identifiables could cause unpredictable results and " "are not allowed. 
You might want to consider a Property with a list as value.") - # use the RecordType of the registred Identifiable if it exists + # use the RecordType of the registered Identifiable if it exists # We do not use parents of Record because it might have multiple try: return Identifiable( @@ -309,6 +376,8 @@ class LocalStorageIdentifiableAdapter(IdentifiableAdapter): """ def __init__(self): + warnings.warn(DeprecationWarning( + "This class is deprecated. Please use the CaosDBIdentifiableAdapter.")) self._registered_identifiables = dict() self._records = [] @@ -323,6 +392,7 @@ class LocalStorageIdentifiableAdapter(IdentifiableAdapter): Just look in records for a file with the same path. """ candidates = [] + warnings.warn(DeprecationWarning("This function is deprecated. Please do not use it.")) for record in self._records: if record.role == "File" and record.path == identifiable.path: candidates.append(record) @@ -470,14 +540,14 @@ class CaosDBIdentifiableAdapter(IdentifiableAdapter): self._registered_identifiables[name] = definition def get_file(self, identifiable: Identifiable): + warnings.warn(DeprecationWarning("This function is deprecated. Please do not use it.")) # TODO is this needed for Identifiable? # or can we get rid of this function? if isinstance(identifiable, db.Entity): return cached_get_entity_by(path=identifiable) if identifiable.path is None: raise RuntimeError("Path must not be None for File retrieval.") - candidates = db.execute_query("FIND File which is stored at '{}'".format( - identifiable.path)) + candidates = cached_get_entity_by(path=identifiable.path) if len(candidates) > 1: raise RuntimeError("Identifiable was not defined unambiguously.") if len(candidates) == 0: @@ -486,7 +556,7 @@ class CaosDBIdentifiableAdapter(IdentifiableAdapter): def get_registered_identifiable(self, record: db.Record): """ - returns the registred identifiable for the given Record + returns the registered identifiable for the given Record It is assumed that there is exactly one identifiable for each RecordType. Only the first parent of the given Record is considered; others are ignored @@ -510,10 +580,28 @@ class CaosDBIdentifiableAdapter(IdentifiableAdapter): def retrieve_identified_record_for_identifiable(self, identifiable: Identifiable): query_string = self.create_query_for_identifiable(identifiable) - candidates = db.execute_query(query_string) + try: + candidates = cached_query(query_string) + except db.exceptions.HTTPServerError: + query_string = self.create_query_for_identifiable(identifiable, startswith=True) + candidates = cached_query(query_string).copy() # Copy against cache poisoning + + # Test if the candidates really match all properties + for pname, pvalue in identifiable.properties.items(): + popme = [] + for i in range(len(candidates)): + this_prop = candidates[i].get_property(pname) + if this_prop is None: + popme.append(i) + continue + if not this_prop.value == pvalue: + popme.append(i) + for i in reversed(popme): + candidates.pop(i) + if len(candidates) > 1: raise RuntimeError( - f"Identifiable was not defined unambigiously.\n{query_string}\nReturned the " + f"Identifiable was not defined unambiguously.\n{query_string}\nReturned the " f"following {candidates}."
f"Identifiable:\n{identifiable.record_type}{identifiable.properties}") if len(candidates) == 0: diff --git a/src/caoscrawler/identified_cache.py b/src/caoscrawler/identified_cache.py deleted file mode 100644 index aa2d82f8e66c738e737c62f3cc68eaf60127e28b..0000000000000000000000000000000000000000 --- a/src/caoscrawler/identified_cache.py +++ /dev/null @@ -1,65 +0,0 @@ -#!/usr/bin/env python3 -# encoding: utf-8 -# -# ** header v3.0 -# This file is a part of the CaosDB Project. -# -# Copyright (C) 2021 Indiscale GmbH <info@indiscale.com> -# Copyright (C) 2021 Henrik tom Wörden <h.tomwoerden@indiscale.com> -# -# This program is free software: you can redistribute it and/or modify -# it under the terms of the GNU Affero General Public License as -# published by the Free Software Foundation, either version 3 of the -# License, or (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU Affero General Public License for more details. -# -# You should have received a copy of the GNU Affero General Public License -# along with this program. If not, see <https://www.gnu.org/licenses/>. -# -# ** end header -# - - -""" -see class docstring -""" - -from .identifiable import Identifiable -import caosdb as db - - -class IdentifiedCache(object): - """ - This class is like a dictionary where the keys are Identifiables. When you check whether an - Identifiable exists as key this class returns True not only if that exact Python object is - used as a key, but if an Identifiable is used as key that is **equal** to the one being - considered (see __eq__ function of Identifiable). Similarly, if you do `cache[identifiable]` - you get the Record where the key is an Identifiable that is equal to the one in the rectangular - brackets. - - This class is used for Records where we checked the existence in a remote server using - identifiables. If the Record was found, this means that we identified the corresponding Record - in the remote server and the ID of the local object can be set. - To prevent querying the server again and again for the same objects, this cache allows storing - Records that were found on a remote server and those that were not (typically in separate - caches). - """ - - def __init__(self): - self._cache = {} - self._identifiables = [] - - def __contains__(self, identifiable: Identifiable): - return identifiable in self._identifiables - - def __getitem__(self, identifiable: db.Record): - index = self._identifiables.index(identifiable) - return self._cache[id(self._identifiables[index])] - - def add(self, record: db.Record, identifiable: Identifiable): - self._cache[id(identifiable)] = record - self._identifiables.append(identifiable) diff --git a/src/caoscrawler/scanner.py b/src/caoscrawler/scanner.py index 17ec54b0a2131468b11a99cba247c7e21fe317b3..ebb8c985bad8908b756ac4f3db53bdec6b93b0f4 100644 --- a/src/caoscrawler/scanner.py +++ b/src/caoscrawler/scanner.py @@ -41,7 +41,7 @@ from typing import Any, Optional, Type, Union import jinja2 -import caosdb as db +import linkahead as db import yaml from importlib_resources import files from jsonschema import validate @@ -268,34 +268,38 @@ def scanner(items: list[StructureElement], Formerly known as "_crawl". - items: structure_elements (e.g. files and folders on one level on the hierarchy) - - converters: locally defined converters for - treating structure elements. 
A locally defined converter could be - one that is only valid for a specific subtree of the originally - cralwed StructureElement structure. - - general_store and record_store: This recursion of the crawl function should only operate on - copies of the global stores of the Crawler object. - - restricted_path: optional, list of strings, traverse the data tree only along the given - path. For example, when a directory contains files a, b and c and b is - given as restricted_path, a and c will be ignroed by the crawler. - When the end of the given path is reached, traverse the full tree as - normal. The first element of the list provided by restricted_path should - be the name of the StructureElement at this level, i.e. denoting the - respective element in the items argument. - - registered_transformer_functions: dict + Parameters + ---------- + items: + structure_elements (e.g. files and folders on one level of the hierarchy) + + converters: + locally defined converters for treating structure elements. A locally + defined converter could be one that is only valid for a specific subtree + of the originally crawled StructureElement structure. + + general_store, record_store: + This recursion of the crawl function should only operate on copies of + the global stores of the Crawler object. + + restricted_path : list of strings, optional + traverse the data tree only along the given path. For example, when a + directory contains files a, b and c and b is given as restricted_path, a + and c will be ignored by the crawler. When the end of the given path is + reached, traverse the full tree as normal. The first element of the list + provided by restricted_path should be the name of the StructureElement + at this level, i.e. denoting the respective element in the items + argument. + + registered_transformer_functions : dict, optional A dictionary of transformer functions that can be used in the "transform" block of a converter and that allows to apply simple transformations to variables extracted either by the current converter or to other variables found in the current variable store. Each function is a dictionary: - - The key is the name of the function to be looked up in the dictionary - of registered transformer functions. - - The value is the function which needs to be of the form: + - The key is the name of the function to be looked up in the dictionary of registered transformer functions. + - The value is the function which needs to be of the form: def func(in_value: Any, in_parameters: dict) -> Any: pass @@ -539,9 +543,9 @@ def scan_structure_elements(items: Union[list[StructureElement], StructureElemen A dictionary representing the crawler definition, possibly from a yaml file. restricted_path: optional, list of strings - Traverse the data tree only along the given path. When the end of the given path - is reached, traverse the full tree as normal. See docstring of 'scanner' for - more details. + Traverse the data tree only along the given path. When the end of the + given path is reached, traverse the full tree as normal. See docstring + of 'scanner' for more details. Returns ------- diff --git a/src/caoscrawler/utils.py b/src/caoscrawler/utils.py index 61b363099d0892b74e91f257bccb6cc832c3d59f..c62f44eeaa75ca42579aa3d6ead437e901cd38ff 100644 --- a/src/caoscrawler/utils.py +++ b/src/caoscrawler/utils.py @@ -25,7 +25,7 @@ # Some utility functions, e.g. for extending pylib.
-import caosdb as db +import linkahead as db def has_parent(entity: db.Entity, name: str): diff --git a/src/doc/README_SETUP.md b/src/doc/README_SETUP.md index 5f5161d0d672ff3ad14db5c5b49f5c65550b06d7..a75193783d861707adf3b3d45311c392e22626f4 100644 --- a/src/doc/README_SETUP.md +++ b/src/doc/README_SETUP.md @@ -4,7 +4,10 @@ see INSTALL.md ## Run Unit Tests -Run `pytest unittests`. + +1. Install additional dependencies: + - h5py +2. Run `pytest unittests`. ## Documentation ## We use sphinx to create the documentation. Docstrings in the code should comply diff --git a/src/doc/cfood.rst b/src/doc/cfood.rst index c12e251d49e164a737b20e92e56e7b3e10149d4f..07431af0a9fb26e569be5d47f79d6a4f120df269 100644 --- a/src/doc/cfood.rst +++ b/src/doc/cfood.rst @@ -183,7 +183,7 @@ in a variable with the same name (as is the case for other Records). Transform Functions ------------------- You can use transform functions to alter variable values that the crawler consumes (e.g. a string -that was matched with a reg exp). See :doc:`Converter Documentation<converters.rst>`. +that was matched with a reg exp). See :doc:`Converter Documentation<converters>`. You can define your own transform functions by adding them the same way you add custom converters: diff --git a/src/doc/concepts.rst b/src/doc/concepts.rst index 0b15f9b5d9aebefc2137b234ac4a9440b84906f5..32176b9edb895074021b3ed4eabe270ad48ae632 100644 --- a/src/doc/concepts.rst +++ b/src/doc/concepts.rst @@ -18,6 +18,8 @@ Relevant sources in: - ``src/structure_elements.py`` +.. _ConceptConverters: + Converters ++++++++++ @@ -85,7 +87,9 @@ we can check whether a Record with the parent "Project" is referencing the "Expe Record. If that is the case, this reference is part of the identifiable for the "Experiment" Record. Note that if there are multiple Records with the appropriate parent (e.g. multiple "Project" Records in the above example) it will be required that all of them -reference the object to be identified. +reference the object to be identified. You can also use the wildcard "*" as +RecordType name in the configuration, which only requires that ANY Record +references the Record at hand. Identified Records diff --git a/src/doc/converters.rst b/src/doc/converters.rst index 60da52d3ed110f050a3d7aae866cc7d8b6b8dc31..44988fbd497cdb57023b5a696f83d55e7eb5113a 100644 --- a/src/doc/converters.rst +++ b/src/doc/converters.rst @@ -25,7 +25,7 @@ The yaml definition looks like the following: TODO: outdated, see cfood-schema.yml .. code-block:: yaml - + <NodeName>: type: <ConverterName> match: ".*" @@ -41,7 +41,7 @@ TODO: outdated, see cfood-schema.yml - Experiment subtree: (...) - + The **<NodeName>** is a description of what it represents (e.g. 'experiment-folder') and is used as identifier. @@ -58,13 +58,13 @@ described here. Transform Functions +++++++++++++++++++ -Often the situation arises, that you cannot use a value as it is found. Maybe a value should be +Often the situation arises that you cannot use a value as it is found. Maybe a value should be increased by an offset or a string should be split into a list of pieces. In order to allow such simple conversions, transform functions can be named in the converter definition that are then applied to the respective variables when the converter is executed. .. code-block:: yaml - + <NodeName>: type: <ConverterName> match: ".*" @@ -102,7 +102,7 @@ list valued property to the Report Record.
There are a number of transform functions that are defined by default (see ``src/caoscrawler/default_transformers.yml``). You can define custom transform functions by adding -them to the cfood definition (see :doc:`CFood Documentation<cfood.rst>`). +them to the cfood definition (see :doc:`CFood Documentation<cfood>`). Standard Converters +++++++++++++++++++ @@ -191,7 +191,7 @@ column names to values of the respective cell. Example: .. code-block:: yaml - + subtree: TABLE: type: CSVTableConverter @@ -220,6 +220,105 @@ XLSXTableConverter CSVTableConverter ================= +Further converters +++++++++++++++++++ + +More converters, together with cfood definitions and examples can be found in +the `LinkAhead Crawler Extensions Subgroup +<https://gitlab.com/linkahead/crawler-extensions>`_ on gitlab. In the following, +we list converters that are shipped with the crawler library itself but are not +part of the set of standard converters and may require this library to be +installed with additional optional dependencies. + +HDF5 Converters +=============== + +For treating `HDF5 Files +<https://docs.hdfgroup.org/hdf5/develop/_s_p_e_c.html>`_, there are in total +four individual converters corresponding to the internal structure of HDF5 files: +the :ref:`H5FileConverter` which opens the file itself and creates further +structure elements from HDF5 groups, datasets, and included multi-dimensional +arrays that are in turn treated by the :ref:`H5GroupConverter`, the +:ref:`H5DatasetConverter`, and the :ref:`H5NdarrayConverter`, respectively. You +need to install the LinkAhead crawler with its optional ``h5-crawler`` dependency +to use these converters. + +The basic idea when crawling HDF5 files is to treat them very similarly to +:ref:`dictionaries <DictElement Converter>` in which the attributes on root, +group, or dataset level are essentially treated like ``BooleanElement``, +``TextElement``, ``FloatElement``, and ``IntegerElement`` in a dictionary: They +are appended as children and can be accessed via the ``subtree``. The file +itself and the groups within may contain further groups and datasets, which can +have their own attributes, subgroups, and datasets, very much like +``DictElements`` within a dictionary. The main difference to any other +dictionary type is the presence of multi-dimensional arrays within HDF5 +datasets. Since LinkAhead doesn't have any datatype corresponding to these, and +since it isn't desirable to store these arrays directly within LinkAhead for +reasons of performance and searchability, we wrap them within a specific +Record as explained :ref:`below <H5NdarrayConverter>`, together with more +metadata and their internal path within the HDF5 file. Users can thus query for +datasets and their arrays according to their metadata within LinkAhead and then +use the internal path information to access the dataset within the file +directly. The type of this record and the property for storing the internal path +need to be reflected in the datamodel. Using the default names, you would need a +datamodel like + +.. code-block:: yaml + + H5Ndarray: + obligatory_properties: + internal_hdf5_path: + datatype: TEXT + +although the names of both property and record type can be configured within the +cfood definition.
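The round trip that this enables could look roughly as follows. This is a minimal sketch, assuming the default names from the datamodel above and the dummy file from the unit tests; the query string and file name are illustrative placeholders, not part of the library:

.. code-block:: python

   import h5py
   import linkahead as db

   # Find a wrapper record in LinkAhead and read the stored internal path.
   ndarray_rec = db.execute_query("FIND RECORD H5Ndarray", unique=True)
   internal_path = ndarray_rec.get_property("internal_hdf5_path").value

   # Use that path to access the wrapped dataset in the file directly.
   with h5py.File("hdf5_dummy_file.hdf5", "r") as f:
       data = f[internal_path][()]  # read the full array into memory
       print(data.shape, data.dtype)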
+ +A simple example of a cfood definition for HDF5 files can be found in the `unit +tests +<https://gitlab.com/linkahead/linkahead-crawler/-/blob/main/unittests/h5_cfood.yml?ref_type=heads>`_ +and shows how the individual converters are used in order to crawl a `simple +example file +<https://gitlab.com/linkahead/linkahead-crawler/-/blob/main/unittests/hdf5_dummy_file.hdf5?ref_type=heads>`_ +containing groups, subgroups, and datasets, together with their respective +attributes. + +H5FileConverter +--------------- + +This is an extension of the +:py:class:`~caoscrawler.converters.SimpleFileConverter` class. It opens the HDF5 +file and creates children for any contained group or dataset. Additionally, the +root-level attributes of the HDF5 file are accessible as children. + +H5GroupConverter +---------------- + +This is an extension of the +:py:class:`~caoscrawler.converters.DictElementConverter` class. Children are +created for all subgroups and datasets in this HDF5 group. Additionally, the +group-level attributes are accessible as children. + +H5DatasetConverter +------------------ + +This is an extension of the +:py:class:`~caoscrawler.converters.DictElementConverter` class. Most +importantly, it stores the array data contained in the HDF5 dataset in an +:py:class:`~caoscrawler.hdf5_converter.H5NdarrayElement`, which is added to its +children together with the dataset attributes. + +H5NdarrayConverter +------------------ + +This converter creates a wrapper record for the contained dataset. The name of +this record needs to be specified in the cfood definition of this converter via +the ``recordname`` option. The RecordType of this record can be configured with +the ``array_recordtype_name`` option and defaults to ``H5Ndarray``. Via the +given ``recordname``, this record can be used within the cfood. Most +importantly, this record stores the internal path of this array within the HDF5 +file in a text property, the name of which can be configured with the +``internal_path_property_name`` option, which defaults to ``internal_hdf5_path``. + Custom Converters +++++++++++++++++ @@ -251,10 +350,10 @@ The following methods are abstract and need to be overwritten by your custom con - :py:meth:`~caoscrawler.converters.Converter.match` - :py:meth:`~caoscrawler.converters.Converter.typecheck` - + Example ======= - + In the following, we will explain the process of adding a custom converter to a yaml file using a SourceResolver that is able to attach a source element to another entity. @@ -285,50 +384,50 @@ Furthermore we will customize the method :py:meth:`~caoscrawler.converters.Conve number of records can be generated by the yaml definition. So for any applications - like here - that require an arbitrary number of records to be created, a customized implementation of :py:meth:`~caoscrawler.converters.Converter.create_records` is recommended. In this context it is helpful to make use of the function :func:`caoscrawler.converters.create_records` that implements creation of record objects from python dictionaries of the same structure that would be given using a yaml definition (see next section below). - + .. code-block:: python import re from caoscrawler.stores import GeneralStore, RecordStore from caoscrawler.converters import TextElementConverter, create_records from caoscrawler.structure_elements import StructureElement, TextElement - + class SourceResolver(TextElementConverter): """ This resolver uses a source list element (e.g. from the markdown readme file) to link sources correctly.
""" - + def __init__(self, definition: dict, name: str, converter_registry: dict): """ Initialize a new directory converter. """ super().__init__(definition, name, converter_registry) - + def create_children(self, generalStore: GeneralStore, element: StructureElement): - + # The source resolver does not create children: - + return [] - + def create_records(self, values: GeneralStore, records: RecordStore, element: StructureElement, file_path_prefix): if not isinstance(element, TextElement): raise RuntimeError() - + # This function must return a list containing tuples, each one for a modified # property: (name_of_entity, name_of_property) keys_modified = [] - + # This is the name of the entity where the source is going to be attached: attach_to_scientific_activity = self.definition["scientific_activity"] rec = records[attach_to_scientific_activity] - + # The "source" is a path to a source project, so it should have the form: # /<Category>/<project>/<scientific_activity>/ # obtain these information from the structure element: @@ -336,18 +435,18 @@ that would be given using a yaml definition (see next section below). regexp = (r'/(?P<category>(SimulationData)|(ExperimentalData)|(DataAnalysis))' '/(?P<project_date>.*?)_(?P<project_identifier>.*)' '/(?P<date>[0-9]{4,4}-[0-9]{2,2}-[0-9]{2,2})(_(?P<identifier>.*))?/') - + res = re.match(regexp, val) if res is None: raise RuntimeError("Source cannot be parsed correctly.") - + # Mapping of categories on the file system to corresponding record types in CaosDB: cat_map = { "SimulationData": "Simulation", "ExperimentalData": "Experiment", "DataAnalysis": "DataAnalysis"} linkrt = cat_map[res.group("category")] - + keys_modified.extend(create_records(values, records, { "Project": { "date": res.group("project_date"), @@ -361,7 +460,7 @@ that would be given using a yaml definition (see next section below). attach_to_scientific_activity: { "sources": "+$" + linkrt }}, file_path_prefix)) - + # Process the records section of the yaml definition: keys_modified.extend( super().create_records(values, records, element, file_path_prefix)) @@ -374,7 +473,7 @@ that would be given using a yaml definition (see next section below). If the recommended (python) package structure is used, the package containing the converter definition can just be installed using `pip install .` or `pip install -e .` from the `scifolder_package` directory. - + The following yaml block will register the converter in a yaml file: .. code-block:: yaml @@ -384,7 +483,7 @@ The following yaml block will register the converter in a yaml file: package: scifolder.converters.sources converter: SourceResolver - + Using the `create_records` API function ======================================= @@ -422,7 +521,7 @@ Let's formulate that using `create_records`: .. code-block:: python dir_name = "directory name" - + record_def = { "Experiment": { "identifier": dir_name @@ -498,7 +597,7 @@ Let's have a look at a more complex examples, defining multiple records: Project: $Project ProjectGroup: projects: +$Project - + This block will create two new Records: @@ -514,7 +613,7 @@ Let's formulate that using `create_records` (again, `dir_name` is constant here) .. 
code-block:: python dir_name = "directory name" - + record_def = { "Project": { "identifier": "project_name", @@ -526,7 +625,7 @@ Let's formulate that using `create_records` (again, `dir_name` is constant here) "ProjectGroup": { "projects": "+$Project", } - + } keys_modified = create_records(values, records, diff --git a/tox.ini b/tox.ini index a7d4465ed36f0fe5e49c06721d3e3a0cdf453fa0..03e02ebeff196430129e10c4c0d853ca77c47302 100644 --- a/tox.ini +++ b/tox.ini @@ -6,6 +6,7 @@ skip_missing_interpreters = true deps = . pytest pytest-cov + h5py # TODO: Make this f-branch sensitive git+https://gitlab.indiscale.com/caosdb/src/caosdb-pylib.git@dev git+https://gitlab.indiscale.com/caosdb/src/caosdb-advanced-user-tools.git@dev diff --git a/unittests/h5_cfood.yml b/unittests/h5_cfood.yml new file mode 100644 index 0000000000000000000000000000000000000000..f688de6a2171da6533626449b030bcd95a43b37b --- /dev/null +++ b/unittests/h5_cfood.yml @@ -0,0 +1,69 @@ +--- +metadata: + crawler-version: 0.6.1 +--- +Converters: + H5Dataset: + converter: H5DatasetConverter + package: caoscrawler.hdf5_converter + H5File: + converter: H5FileConverter + package: caoscrawler.hdf5_converter + H5Group: + converter: H5GroupConverter + package: caoscrawler.hdf5_converter + H5Ndarray: + converter: H5NdarrayConverter + package: caoscrawler.hdf5_converter +# Top-level, we have just the HDF5 file. +ParentDirectory: + type: Directory + match: (.*) + subtree: + H5FileElement: + type: H5File + match: (.*)\.(hdf5|h5)$ + records: + H5File: + parents: + - H5File + role: File + path: $H5FileElement + file: $H5FileElement + subtree: + # Here, we have the groups, the top-level dataset, and possible + # attributes (empty for now). + RootIntegerElement: + type: H5Dataset + match_name: ^root_integers$ + records: + H5Dataset: + parents: + - H5Dataset + H5File: + H5Dataset: +$H5Dataset + subtree: + # included NDArray in this dataset + TopLevelIntNDElement: + type: H5Ndarray + match_name: (.*) + recordname: this + records: + # this: + # ContainingFile: $H5File + H5Dataset: + Ndarray: $this + # There is one more list-valued attribute to this dataset. 
+ TopLevelDataAttribute: + type: ListElement + match_name: ^attr_data_root$ + subtree: + AttributeListEntry: + type: FloatElement + match_name: (.*) + match_value: (?P<value>.*) + records: + H5Dataset: + attr_data_root: +$value + + diff --git a/unittests/hdf5_dummy_file.hdf5 b/unittests/hdf5_dummy_file.hdf5 new file mode 100644 index 0000000000000000000000000000000000000000..41bfb7ab3bcac19d90fd4f018cdd8118ae806eaf Binary files /dev/null and b/unittests/hdf5_dummy_file.hdf5 differ diff --git a/unittests/test_crawler.py b/unittests/test_crawler.py index fbf98346e59b0cbec88f17398eff41f26c423dee..e026899201ae0dfa937053d315854e5fbb8a351a 100644 --- a/unittests/test_crawler.py +++ b/unittests/test_crawler.py @@ -34,12 +34,14 @@ from pathlib import Path from unittest.mock import MagicMock, Mock, patch import caoscrawler -import caosdb as db -import caosdb.common.models as dbmodels +import linkahead as db +import linkahead.common.models as dbmodels import pytest import yaml -from caoscrawler.crawl import (Crawler, SecurityMode, _treat_deprecated_prefix, - crawler_main, split_restricted_path) +from caosadvancedtools.models.parser import parse_model_from_string +from caoscrawler.crawl import (Crawler, SecurityMode, TreatedRecordLookUp, + _treat_deprecated_prefix, crawler_main, + split_restricted_path) from caoscrawler.debug_tree import DebugTree from caoscrawler.identifiable import Identifiable from caoscrawler.identifiable_adapters import (CaosDBIdentifiableAdapter, @@ -50,9 +52,9 @@ from caoscrawler.scanner import (create_converter_registry, scan_directory, from caoscrawler.stores import GeneralStore, RecordStore from caoscrawler.structure_elements import (DictElement, DictListElement, DictTextElement, File) -from caosdb.apiutils import compare_entities -from caosdb.cached import cache_clear -from caosdb.exceptions import EmptyUniqueQueryError +from linkahead.apiutils import compare_entities +from linkahead.cached import cache_clear +from linkahead.exceptions import EmptyUniqueQueryError from pytest import raises UNITTESTDIR = Path(__file__).parent @@ -108,6 +110,47 @@ def mock_get_entity_by(eid=None, name=None, path=None): raise EmptyUniqueQueryError("") +def mock_retrieve_record(identifiable: Identifiable): + """ assumes that the identifiable is always only the date""" + + for record in EXAMPLE_SERVER_STATE: + if (record.role == "Record" and "date" in identifiable.properties + and record.get_property("date").value == identifiable.properties['date']): + return record + return None + + +def mock_cached_only_rt(query_string: str): + """Return an empty Container for Record and File queries; return mocked RecordTypes for RecordType queries.""" + result = db.Container() + lo_query = query_string.lower() + if lo_query.startswith("find record ") or lo_query.startswith("find file "): + return result + model = parse_model_from_string(""" +B: + obligatory_properties: + C: + obligatory_properties: + prop_other: + datatype: INTEGER + prop_ident: + datatype: INTEGER +A: + obligatory_properties: + B: + datatype: LIST<B> + prop_ident: +""") + if query_string == "FIND RECORDTYPE 'A'": + model.get_deep("A").id = 1 + return result + [model.get_deep("A")] + if query_string == "FIND RECORDTYPE 'B'": + model.get_deep("B").id = 2 + return result + [model.get_deep("B")] + print(query_string) + raise NotImplementedError("Mock for this case is missing") + + @pytest.fixture(autouse=True) def clear_cache(): cache_clear() @@ -215,6 +258,13 @@ def test_split_into_inserts_and_updates_trivial(): crawler.split_into_inserts_and_updates([]) +def test_split_into_inserts_and_updates_unidentified(): +
crawler = Crawler() + with raises(ValueError) as err: + crawler.split_into_inserts_and_updates([db.Record(name="recname").add_parent("someparent")]) + assert str(err.value).startswith("There is no identifying information.") + + def basic_retrieve_by_name_mock_up(rec, referencing_entities=None, known=None): """ returns a stored Record if rec.name is an existing key, None otherwise """ if rec.name in known: @@ -247,8 +297,8 @@ def test_split_into_inserts_and_updates_single(crawler_mocked_identifiable_retri entlist = [db.Record(name="A").add_parent( "C"), db.Record(name="B").add_parent("C")] - assert crawler.get_from_any_cache(identlist[0]) is None - assert crawler.get_from_any_cache(identlist[1]) is None + assert crawler.treated_records_lookup.get_any(entlist[0], identlist[0]) is None + assert crawler.treated_records_lookup.get_any(entlist[1], identlist[1]) is None assert not crawler._has_reference_value_without_id(identlist[0]) assert not crawler._has_reference_value_without_id(identlist[1]) assert crawler.identifiableAdapter.retrieve_identified_record_for_record( @@ -355,6 +405,84 @@ def test_split_into_inserts_and_updates_with_copy_attr(crawler_mocked_identifiab crawler.identifiableAdapter.retrieve_identified_record_for_identifiable.assert_called() +@pytest.mark.xfail(reason="https://gitlab.com/linkahead/linkahead-crawler/-/issues/88") +@patch("caoscrawler.identifiable_adapters.cached_query", + new=Mock(side_effect=mock_cached_only_rt)) +def test_split_iiau_with_unmergeable_list_items(): + """Test for meaningful exception when referencing a list of unmergeable entities. + +Datamodel +--------- +A: + B: LIST<B> + prop_ident: INTEGER + +B: + prop_ident: + C: + +C: + prop_other: INTEGER + +Identifiables +------------- + +id_A: [prop_ident] +id_B: [prop_ident, "is_referenced_by: A"] + +Data +---- + +b1: ("same", 23) +b2: ("same", 42) + +a: ([b1, b2]) + """ + prop_ident = db.Property("prop_ident", datatype=db.INTEGER) + prop_other = db.Property("prop_other", datatype=db.INTEGER) + rt_c = db.RecordType("C").add_property(prop_other) + # Somehow it is necessary that `B` has a reference property. Unclear whether C must have an + # identifiable as well.
+ rt_b = db.RecordType("B").add_property(prop_ident).add_property("C") + rt_a = db.RecordType("A").add_property(prop_ident).add_property("LIST<B>") + + ident_a = db.RecordType().add_parent("A").add_property("prop_ident") + ident_b = db.RecordType().add_parent("B").add_property("prop_ident").add_property( + "is_referenced_by", value="A") + ident_c = db.RecordType().add_parent("C").add_property("prop_other").add_property( + "is_referenced_by", value="B") + + rec_a = db.Record("a").add_parent(rt_a).add_property("prop_ident", value=1234) + rec_b = [] + rec_c = [] + for value in [23, 42]: + new_c = db.Record().add_parent(rt_c).add_property("prop_other", value=value) + rec_c.append(new_c) + rec_b.append(db.Record().add_parent(rt_b).add_property( + "prop_ident", value=2020).add_property("C", value=new_c)) + rec_a.add_property("B", rec_b) + + ident_adapter = CaosDBIdentifiableAdapter() + ident_adapter.register_identifiable("A", ident_a) + ident_adapter.register_identifiable("B", ident_b) + ident_adapter.register_identifiable("C", ident_c) + + crawler = Crawler(identifiableAdapter=ident_adapter) + + # This should give a merge conflict, and not + # "Could not find referencing entities of type(s): A" + + # from IPython import embed; embed() + with raises(RuntimeError) as rte: + crawler.synchronize(commit_changes=False, + crawled_data=[rec_a, *rec_b, *rec_c]) + assert not isinstance(rte.value, NotImplementedError), \ + "Exception must not be NotImplementedError, but plain RuntimeError." + assert "Could not find referencing entities" not in rte.value.args[0] + assert "merging impossible" in rte.something + # crawler.split_into_inserts_and_updates(ent_list=[rec_a, *rec_b, *rec_c]) + + def test_has_missing_object_in_references(): crawler = Crawler() # Simulate remote server content by using the names to identify records @@ -368,36 +496,40 @@ def test_has_missing_object_in_references(): # one reference with id -> check assert not crawler._has_missing_object_in_references( - Identifiable(name="C", record_type="RTC", properties={'d': 123}), []) + Identifiable(name="C", record_type="RTC", properties={'d': 123}), {}) # one ref with Entity with id -> check + rec = db.Record(id=123).add_parent("C") assert not crawler._has_missing_object_in_references( - Identifiable(name="C", record_type="RTC", properties={'d': db.Record(id=123) - .add_parent("C")}), []) + Identifiable(name="C", record_type="RTC", properties={'d': rec}), {id(rec): {'C': [None]}}) # one ref with id one with Entity with id (mixed) -> check + rec = db.Record(id=123).add_parent("RTC") assert not crawler._has_missing_object_in_references( Identifiable(name="C", record_type="RTD", - properties={'d': 123, 'b': db.Record(id=123).add_parent("RTC")}), []) + properties={'d': 123, 'b': rec}), {id(rec): {'C': [None]}}) # entity to be referenced in the following a = db.Record(name="C").add_parent("C").add_property("d", 12311) # one ref with id one with Entity without id (but not identifying) -> fail assert not crawler._has_missing_object_in_references( - Identifiable(name="C", record_type="RTC", properties={'d': 123, 'e': a}), []) + Identifiable(name="C", record_type="RTC", properties={'d': 123, 'e': a}), + {id(a): {'C': [None]}}) # one ref with id one with Entity without id (mixed) -> fail assert not crawler._has_missing_object_in_references( - Identifiable(name="D", record_type="RTD", properties={'d': 123, 'e': a}), []) + Identifiable(name="D", record_type="RTD", properties={'d': 123, 'e': a}), + {id(a): {'C': [None]}}) - 
crawler.add_to_remote_missing_cache(a, Identifiable(name="C", record_type="RTC", - properties={'d': 12311})) + crawler.treated_records_lookup.add(a, Identifiable(name="C", record_type="RTC", + properties={'d': 12311})) # one ref with id one with Entity without id but in cache -> check assert crawler._has_missing_object_in_references( - Identifiable(name="D", record_type="RTD", properties={'d': 123, 'e': a}), []) + Identifiable(name="D", record_type="RTD", properties={'d': 123, 'e': a}), + {id(a): {'C': [None]}}) # if this ever fails, the mock up may be removed crawler.identifiableAdapter.get_registered_identifiable.assert_called() -@pytest.mark.xfail() +@ pytest.mark.xfail() def test_references_entities_without_ids(): crawler = Crawler() assert not crawler._has_reference_value_without_id(db.Record().add_parent("Person") @@ -437,25 +569,15 @@ def reset_mocks(mocks): mock.reset_mock() -def mock_retrieve_record(identifiable: Identifiable): - """ assumes that the identifiable is always only the date""" - - for record in EXAMPLE_SERVER_STATE: - if (record.role == "Record" - and record.get_property("date").value == identifiable.properties['date']): - return record - return None - - -@patch("caoscrawler.crawl.cached_get_entity_by", - new=Mock(side_effect=mock_get_entity_by)) -@patch("caoscrawler.identifiable_adapters.cached_get_entity_by", - new=Mock(side_effect=mock_get_entity_by)) -@patch("caoscrawler.identifiable_adapters.CaosDBIdentifiableAdapter." - "retrieve_identified_record_for_identifiable", - new=Mock(side_effect=mock_retrieve_record)) -@patch("caoscrawler.crawl.db.Container.insert") -@patch("caoscrawler.crawl.db.Container.update") +@ patch("caoscrawler.crawl.cached_get_entity_by", + new=Mock(side_effect=mock_get_entity_by)) +@ patch("caoscrawler.identifiable_adapters.cached_get_entity_by", + new=Mock(side_effect=mock_get_entity_by)) +@ patch("caoscrawler.identifiable_adapters.CaosDBIdentifiableAdapter." + "retrieve_identified_record_for_identifiable", + new=Mock(side_effect=mock_retrieve_record)) +@ patch("caoscrawler.crawl.db.Container.insert") +@ patch("caoscrawler.crawl.db.Container.update") def test_synchronization_no_commit(upmock, insmock): crawled_data = [r.copy() for r in EXAMPLE_SERVER_STATE if r.role == "Record"] # change one; add one @@ -472,16 +594,16 @@ def test_synchronization_no_commit(upmock, insmock): assert len(ups) == 1 -@patch("caoscrawler.crawl.cached_get_entity_by", - new=Mock(side_effect=mock_get_entity_by)) -@patch("caoscrawler.identifiable_adapters.cached_get_entity_by", - new=Mock(side_effect=mock_get_entity_by)) -@patch("caoscrawler.identifiable_adapters.CaosDBIdentifiableAdapter." - "retrieve_identified_record_for_identifiable", - new=Mock(side_effect=mock_retrieve_record)) -@patch("caoscrawler.crawl.db.Container.insert") -@patch("caoscrawler.crawl.db.Container.update") -@patch("caoscrawler.crawl.UpdateCache.insert") +@ patch("caoscrawler.crawl.cached_get_entity_by", + new=Mock(side_effect=mock_get_entity_by)) +@ patch("caoscrawler.identifiable_adapters.cached_get_entity_by", + new=Mock(side_effect=mock_get_entity_by)) +@ patch("caoscrawler.identifiable_adapters.CaosDBIdentifiableAdapter." 
+ "retrieve_identified_record_for_identifiable", + new=Mock(side_effect=mock_retrieve_record)) +@ patch("caoscrawler.crawl.db.Container.insert") +@ patch("caoscrawler.crawl.db.Container.update") +@ patch("caoscrawler.crawl.UpdateCache.insert") def test_security_mode(updateCacheMock, upmock, insmock): # trivial case: nothing to do crawled_data = [r.copy() for r in EXAMPLE_SERVER_STATE if r.role == "Record"] @@ -580,12 +702,13 @@ def test_security_mode(updateCacheMock, upmock, insmock): def test_create_reference_mapping(): a = db.Record().add_parent("A") - b = db.Record().add_parent("B").add_property('a', a) + b = db.Record(id=132).add_parent("B").add_property('a', a) ref = Crawler.create_reference_mapping([a, b]) assert id(a) in ref - assert id(b) not in ref + assert id(b) in ref assert "B" in ref[id(a)] - assert ref[id(a)]["B"] == [b] + assert {} == ref[id(b)] + assert ref[id(a)]["B"] == [132] def test_create_flat_list(): @@ -608,7 +731,7 @@ def test_create_flat_list(): assert c in flat -@pytest.fixture +@ pytest.fixture def crawler_mocked_for_backref_test(): crawler = Crawler() # mock retrieval of registered identifiabls: return Record with just a parent @@ -652,8 +775,8 @@ def test_validation_error_print(caplog): caplog.clear() -@patch("caoscrawler.identifiable_adapters.get_children_of_rt", - new=Mock(side_effect=lambda x: [x])) +@ patch("caoscrawler.identifiable_adapters.get_children_of_rt", + new=Mock(side_effect=lambda x: [x])) def test_split_into_inserts_and_updates_backref(crawler_mocked_for_backref_test): crawler = crawler_mocked_for_backref_test identlist = [Identifiable(name="A", record_type="BR"), @@ -667,8 +790,8 @@ def test_split_into_inserts_and_updates_backref(crawler_mocked_for_backref_test) crawler.split_into_inserts_and_updates([db.Record(name="B").add_parent("C")]) # identifiables were not yet checked - assert crawler.get_from_any_cache(identlist[0]) is None - assert crawler.get_from_any_cache(identlist[1]) is None + assert crawler.treated_records_lookup.get_any(entlist[1], identlist[0]) is None + assert crawler.treated_records_lookup.get_any(entlist[0], identlist[1]) is None # one with reference, one without assert not crawler._has_reference_value_without_id(identlist[0]) assert crawler._has_reference_value_without_id(identlist[1]) @@ -688,8 +811,8 @@ def test_split_into_inserts_and_updates_backref(crawler_mocked_for_backref_test) assert insert[0].name == "B" -@patch("caoscrawler.identifiable_adapters.get_children_of_rt", - new=Mock(side_effect=lambda x: [x])) +@ patch("caoscrawler.identifiable_adapters.get_children_of_rt", + new=Mock(side_effect=lambda x: [x])) def test_split_into_inserts_and_updates_mult_backref(crawler_mocked_for_backref_test): # test whether multiple references of the same record type are correctly used crawler = crawler_mocked_for_backref_test @@ -701,7 +824,9 @@ def test_split_into_inserts_and_updates_mult_backref(crawler_mocked_for_backref_ # test whether both entities are listed in the backref attribute of the identifiable referencing_entities = crawler.create_reference_mapping(entlist) - identifiable = crawler.identifiableAdapter.get_identifiable(referenced, referencing_entities) + identifiable = crawler.identifiableAdapter.get_identifiable( + referenced, + referencing_entities[id(referenced)]) assert len(identifiable.backrefs) == 2 # check the split... 
@@ -710,8 +835,8 @@ def test_split_into_inserts_and_updates_mult_backref(crawler_mocked_for_backref_ assert len(insert) == 2 -@patch("caoscrawler.identifiable_adapters.get_children_of_rt", - new=Mock(side_effect=lambda x: [x])) +@ patch("caoscrawler.identifiable_adapters.get_children_of_rt", + new=Mock(side_effect=lambda x: [x])) def test_split_into_inserts_and_updates_diff_backref(crawler_mocked_for_backref_test): # test whether multiple references of different record types are correctly used crawler = crawler_mocked_for_backref_test @@ -723,7 +848,10 @@ def test_split_into_inserts_and_updates_diff_backref(crawler_mocked_for_backref_ # test whether both entities are listed in the backref attribute of the identifiable referencing_entities = crawler.create_reference_mapping(entlist) - identifiable = crawler.identifiableAdapter.get_identifiable(referenced, referencing_entities) + identifiable = crawler.identifiableAdapter.get_identifiable( + referenced, + referencing_entities[id(referenced)]) + assert len(identifiable.backrefs) == 2 # check the split... @@ -736,7 +864,7 @@ def mock_create_values(values, element): pass -@patch("caoscrawler.converters.IntegerElementConverter.create_values") +@ patch("caoscrawler.converters.IntegerElementConverter.create_values") def test_restricted_path(create_mock): """ The restricted_path argument allows ignoring part of the crawled data structure. Here, we make @@ -829,7 +957,7 @@ def test_split_restricted_path(): # Filter the warning because we want to have it here and this way it does not hinder running # tests with -Werror. -@pytest.mark.filterwarnings("ignore:The prefix:DeprecationWarning") +@ pytest.mark.filterwarnings("ignore:The prefix:DeprecationWarning") def test_deprecated_prefix_option(): """Test that calling the crawler's main function with the deprecated `prefix` option raises the correct errors and warnings.
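For readers skimming these tests: `restricted_path` limits traversal as described in the scanner docstring earlier in this diff. A hedged usage sketch follows; the directory name, cfood file, and path elements are made-up examples, and the exact elements depend on your tree:

.. code-block:: python

   from caoscrawler.scanner import scan_directory

   # Only the branch below "b" is traversed; siblings "a" and "c" are
   # ignored until the end of the given path is reached, after which the
   # full tree is scanned as normal. The path elements name
   # StructureElements, starting at the current level.
   records = scan_directory("data", "cfood.yml",
                            restricted_path=["data", "b"])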
@@ -886,7 +1014,7 @@ def test_detect_circular_dependency(crawler_mocked_identifiable_retrieve, caplog _, _ = crawler.split_into_inserts_and_updates(flat) caplog.set_level(logging.ERROR, logger="caoscrawler.converters") assert "Found circular dependency" in caplog.text - assert "-------\na\n['C" in caplog.text + assert "\n--------\n\n> Parent: C\n\n>> Name: a\n[\'C\']" in caplog.text caplog.clear() @@ -895,8 +1023,8 @@ def mock_get_entity_by_query(query=None): return db.Record(id=1111, name='rec_name').add_parent('RT') -@patch("caoscrawler.crawl.cached_get_entity_by", - new=Mock(side_effect=mock_get_entity_by_query)) +@ patch("caoscrawler.crawl.cached_get_entity_by", + new=Mock(side_effect=mock_get_entity_by_query)) def test_replace_name_with_referenced_entity(): test_text = 'lkajsdf' test_int = 134343 @@ -964,3 +1092,72 @@ def test_replace_name_with_referenced_entity(): assert isinstance(prop.value[2], int) assert prop.value[2] == test_id assert caoscrawler.crawl.cached_get_entity_by.call_count == 3 + + +def test_treated_record_lookup(): + trlu = TreatedRecordLookUp() + exist = db.Record(id=1) + trlu.add(exist) + assert len(trlu._existing) == 1 + # was added to existing + assert trlu._existing[id(exist)] is exist + # is in ID lookup + assert trlu._id_look_up[exist.id] is exist + # can be accessed via get_existing + assert trlu.get_existing(db.Record(id=1)) is exist + + miss = db.Record() + # exception when identifiable is missing + with raises(RuntimeError): + trlu.add(miss) + ident = Identifiable(name='a') + trlu.add(miss, ident) + # was added to missing + assert trlu._missing[id(miss)] is miss + # is in ident lookup + assert trlu._identifiable_look_up[ident.get_representation()] is miss + # can be accessed via get_missing + assert trlu.get_missing(db.Record(), Identifiable(name='a')) is miss + + fi = db.File(path='a', id=2) + trlu.add(fi) + assert len(trlu._existing) == 2 + # was added to existing + assert trlu._existing[id(fi)] is fi + # is in ID lookup + assert trlu._id_look_up[fi.id] is fi + # is in path lookup + assert trlu._path_look_up[fi.path] is fi + # can be accessed via get_existing + assert trlu.get_existing(fi) is fi + + all_exi = trlu.get_existing_list() + assert fi in all_exi + assert exist in all_exi + all_mi = trlu.get_missing_list() + assert miss in all_mi + + # If a Record was added using the ID, the ID must be used to identify it even though later an + # identifiable may be passed as well + assert trlu.get_any(exist, Identifiable(name='b')) is exist + + fi2 = db.File(path='b') + trlu.add(fi2) + assert trlu.get_any(db.File(path='b'), Identifiable(name='c')) is fi2 + + +def test_merge_entity_with_identifying_reference(crawler_mocked_identifiable_retrieve): + # When one python object representing a record is merged into another python object + # representing the same record, the former object can be forgotten and references from it to + # other records must not play a role + crawler = crawler_mocked_identifiable_retrieve + crawler.identifiableAdapter.get_registered_identifiable = Mock( + side_effect=lambda x: db.Record().add_parent('C').add_property(name='name') if + x.parents[0].name == "C" else + db.Record().add_parent('D').add_property(name='is_referenced_by', value="*") + ) + a = db.Record(name='a').add_parent("D") + b = db.Record(name='b').add_parent("C") + c = db.Record(name='b').add_parent("C").add_property(name="C", value=a) + flat = [a, c, b] + _, _ = crawler.split_into_inserts_and_updates(flat) diff --git a/unittests/test_h5_converter.py 
b/unittests/test_h5_converter.py new file mode 100644 index 0000000000000000000000000000000000000000..2f7fae5d8d32bb7e5c90a535b63158c33df55daa --- /dev/null +++ b/unittests/test_h5_converter.py @@ -0,0 +1,135 @@ +# +# This file is a part of the CaosDB Project. +# +# Copyright (C) 2023 IndiScale GmbH <info@indiscale.com> +# Copyright (C) 2023 Florian Spreckelsen <f.spreckelsen@indiscale.com> +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <https://www.gnu.org/licenses/>. +# +import numpy as np + +from functools import partial +from pathlib import Path +from pytest import fixture, importorskip + +import caosdb as db + +from caoscrawler.debug_tree import DebugTree +from caoscrawler.hdf5_converter import (convert_basic_element_with_nd_array, + convert_h5_element, H5GroupElement, + H5DatasetElement, H5NdarrayElement) +from caoscrawler.scanner import scan_directory +from caoscrawler.structure_elements import (FloatElement, ListElement, + TextElement) +from utils import dircheckstr as dircheck_base + +# Skip the whole module if h5py hasn't been installed +h5py = importorskip("h5py") + + +UNITTESTDIR = Path(__file__).parent + +# always add the path here +dircheckstr = partial(dircheck_base, UNITTESTDIR) + + +@fixture +def h5_dummy_file(): + + path = UNITTESTDIR / "hdf5_dummy_file.hdf5" + + return h5py.File(path, 'r') + + +def test_h5_elements(h5_dummy_file): + + elt = convert_h5_element(h5_dummy_file["group_level1_a"], "test") + assert isinstance(elt, H5GroupElement) + + elt = convert_h5_element(h5_dummy_file["root_integers"], "test") + assert isinstance(elt, H5DatasetElement) + + +def test_nd_array_conversion(): + + # Only test array handling here, `convert_basic_element` is tested + # elsewhere. 
+ arr = np.array([[["something"]]]) + elt = convert_basic_element_with_nd_array(arr) + assert isinstance(elt, TextElement) + assert elt.value == "something" + + arr = np.zeros((1, 1)) + elt = convert_basic_element_with_nd_array(arr) + assert isinstance(elt, FloatElement) + assert elt.value == 0 + + arr = np.zeros((1, 3, 1)) + elt = convert_basic_element_with_nd_array(arr) + assert isinstance(elt, ListElement) + assert elt.value == [0, 0, 0] + + arr = np.array([[1, 2, 3], [4, 5, 6]]) + elt = convert_basic_element_with_nd_array(arr, internal_path="some/path") + assert isinstance(elt, H5NdarrayElement) + assert elt.internal_path == "some/path" + + # Non-arrays should be forwarded correctly + elt = convert_basic_element_with_nd_array("something") + assert isinstance(elt, TextElement) + assert elt.value == "something" + + elt = convert_basic_element_with_nd_array([0, 0, 0]) + assert isinstance(elt, ListElement) + assert elt.value == [0, 0, 0] + + +def test_record_creation(): + + dbt = DebugTree() + records = scan_directory(UNITTESTDIR, UNITTESTDIR / "h5_cfood.yml", debug_tree=dbt) + + # In total 3 records: The file, the Dataset, and its ndarray + assert len(records) == 3 + file_rec = [rec for rec in records if isinstance(rec, db.File)] + # exactly one file + assert len(file_rec) == 1 + + subd = dbt.debug_tree[dircheckstr("hdf5_dummy_file.hdf5")] + # At this level, we have 5 variables (directories and paths, plus H5File + # record), and one record. + assert len(subd[0]) == 5 + assert len(subd[1]) == 1 + file_rec = subd[1]["H5File"] + assert file_rec.get_property("H5Dataset") is not None + assert file_rec.get_property("H5Dataset").value is not None + # Reference properties currently need to be integration tested (especially + # with the circular dependency between H5File and NDArray).
+
+    # top-level integers
+    subd = dbt.debug_tree["root_integers"]
+    # Two additional variables (RootIntegerElement + Dataset record), one
+    # additional record
+    assert len(subd[0]) == 7
+    assert len(subd[1]) == 2
+    ds_rec = subd[1]["H5Dataset"]
+    assert isinstance(ds_rec, db.Record)
+    assert len(ds_rec.parents) == 1
+    assert ds_rec.parents[0].name == "H5Dataset"
+    assert ds_rec.get_property("Ndarray") is not None
+    assert ds_rec.get_property("Ndarray").value is not None
+    assert ds_rec.get_property("attr_data_root") is not None
+    assert isinstance(ds_rec.get_property("attr_data_root").value, list)
+    for number in [-2., -4., -8., -10.12345]:
+        assert number in [float(val) for val in ds_rec.get_property("attr_data_root").value]
diff --git a/unittests/test_identifiable.py b/unittests/test_identifiable.py
index 3f3c606b163df4dc238be9a669fd31eb630a582d..28bdb7a2ad75d5b9389b47ca3f0ec2b2e2a1404b 100644
--- a/unittests/test_identifiable.py
+++ b/unittests/test_identifiable.py
@@ -24,10 +24,9 @@
 test identifiable module
 """

-import pytest
 import caosdb as db
+import pytest
 from caoscrawler.identifiable import Identifiable
-from caoscrawler.identified_cache import IdentifiedCache


 def test_create_hashable_string():
diff --git a/unittests/test_identifiable_adapters.py b/unittests/test_identifiable_adapters.py
index 268b9800ddf1ef1688386f394ec1c6c7eb3e3912..ee0e0d6cd7c791f78e7cd2307dc6f34698326b4a 100644
--- a/unittests/test_identifiable_adapters.py
+++ b/unittests/test_identifiable_adapters.py
@@ -133,6 +133,21 @@ def test_non_default_name():
     assert identifiable.name is None


+def test_wildcard_ref():
+    ident = CaosDBIdentifiableAdapter()
+    ident.register_identifiable(
+        "Person", db.RecordType()
+        .add_parent(name="Person")
+        .add_property(name="is_referenced_by", value=["*"]))
+    rec = (db.Record(name="don't touch it").add_parent("Person")
+           .add_property(name="last_name", value='Tom'))
+    identifiable = ident.get_identifiable(rec,
+                                          referencing_entities={
+                                              'A': [1]}
+                                          )
+    assert identifiable.backrefs[0] == 1
+
+
 def test_convert_value():
     # test that string representation of objects stay unchanged. No stripping or so.
     class A():
diff --git a/unittests/test_identified_cache.py b/unittests/test_identified_cache.py
deleted file mode 100644
index 4ed7c55c7326415308917e20e9f391b17b07ad87..0000000000000000000000000000000000000000
--- a/unittests/test_identified_cache.py
+++ /dev/null
@@ -1,44 +0,0 @@
-#!/usr/bin/env python3
-# encoding: utf-8
-#
-# ** header v3.0
-# This file is a part of the CaosDB Project.
-#
-# Copyright (C) 2021 Indiscale GmbH <info@indiscale.com>
-# Copyright (C) 2021 Henrik tom Wörden <h.tomwoerden@indiscale.com>
-#
-# This program is free software: you can redistribute it and/or modify
-# it under the terms of the GNU Affero General Public License as
-# published by the Free Software Foundation, either version 3 of the
-# License, or (at your option) any later version.
-#
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-# GNU Affero General Public License for more details.
-#
-# You should have received a copy of the GNU Affero General Public License
-# along with this program. If not, see <https://www.gnu.org/licenses/>.
-#
-# ** end header
-#
-
-"""
-test identified_cache module
-"""
-
-import caosdb as db
-from caoscrawler.identifiable import Identifiable
-from caoscrawler.identified_cache import IdentifiedCache
-
-
-def test_IdentifiedCache():
-    ident = Identifiable(name="A", record_type="B")
-    record = db.Record("A").add_parent("B").add_property('b', 5)
-    cache = IdentifiedCache()
-    assert ident not in cache
-    cache.add(record=record, identifiable=ident)
-    assert ident in cache
-    assert cache[ident] is record
-    assert Identifiable(name="A", record_type="C") != Identifiable(name="A", record_type="B")
-    assert Identifiable(name="A", record_type="C") not in cache
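The `test_nd_array_conversion` cases above pin down a purely shape-based dispatch: arrays that squeeze down to a single entry become basic scalar elements, effectively one-dimensional arrays become list elements, genuinely multi-dimensional arrays are kept as HDF5 ndarray elements, and non-arrays are forwarded to the basic-element conversion. The following self-contained sketch mirrors that dispatch as inferred from those assertions; it is illustrative only. The function name `classify_value` and its returned labels are stand-ins, not the library API; the real implementation is `caoscrawler.hdf5_converter.convert_basic_element_with_nd_array`, which returns the structure-element instances asserted above.

import numpy as np


def classify_value(value):
    """Classify a value the way the tests above expect it to be converted.

    Returns a stand-in label instead of constructing caoscrawler's
    structure-element classes.
    """
    if not isinstance(value, np.ndarray):
        # Non-arrays are forwarded to the basic conversion
        # (str -> TextElement, list -> ListElement, ...).
        return "basic"
    squeezed = np.squeeze(value)  # drop singleton axes, e.g. (1, 3, 1) -> (3,)
    if squeezed.ndim == 0:
        # Single-entry arrays collapse to a scalar element
        # (TextElement for strings, FloatElement for numbers).
        return "scalar"
    if squeezed.ndim == 1:
        # Effectively one-dimensional data becomes a ListElement.
        return "list"
    # Anything genuinely multi-dimensional stays an H5NdarrayElement,
    # which also records its internal_path within the HDF5 file.
    return "ndarray"


# The same inputs as in test_nd_array_conversion:
assert classify_value(np.array([[["something"]]])) == "scalar"
assert classify_value(np.zeros((1, 1))) == "scalar"
assert classify_value(np.zeros((1, 3, 1))) == "list"
assert classify_value(np.array([[1, 2, 3], [4, 5, 6]])) == "ndarray"
assert classify_value("something") == "basic"
assert classify_value([0, 0, 0]) == "basic"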