diff --git a/setup.cfg b/setup.cfg
index dd2961d13934c88b13535a0cc1c17b6d7dbd74e3..205fc60074589ac11c745ade435525602af6b434 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -19,8 +19,8 @@ package_dir =
 packages = find:
 python_requires = >=3.7
 install_requires =
-    importlib-resources
-    caosadvancedtools >= 0.7.0
+    importlib-resources
+    caosadvancedtools >= 0.7.0
     linkahead > 0.13.2
     yaml-header-tools >= 0.2.1
     pyyaml
diff --git a/src/caoscrawler/converters.py b/src/caoscrawler/converters.py
index dddb83a7c57d90a987de74f337668f3bae73ee1b..772f690b2ce7f8a3822d0d833f13c84cdd9d5e35 100644
--- a/src/caoscrawler/converters.py
+++ b/src/caoscrawler/converters.py
@@ -36,7 +36,7 @@ from inspect import signature
 from string import Template
 from typing import Any, List, Optional, Tuple, Union
 
-import caosdb as db
+import linkahead as db
 import pandas as pd
 import yaml
 import yaml_header_tools
diff --git a/src/caoscrawler/crawl.py b/src/caoscrawler/crawl.py
index 4990aecc025c290bed26e05d73833b756f904b46..698a154f49d84e15dbe41f91da67469184151316 100644
--- a/src/caoscrawler/crawl.py
+++ b/src/caoscrawler/crawl.py
@@ -207,14 +207,13 @@ def _treat_merge_error_of(newrecord, record):
                     # TODO can we also compare lists?
                     and not isinstance(this_p.value, list)
                     and not isinstance(that_p.value, list)):
-                logger.error("The Crawler is trying to merge two entities "
-                             "because they should be the same object (same"
-                             " identifiables), but they have "
-                             "different values for the same Property."
-                             f"Problematic Property: {this_p.name}\n"
-                             f"Values: {this_p.value} and "
-                             f"{that_p.value}\n"
-                             f"{record}\n{newrecord}")
+                logger.error(
+                    "The Crawler is trying to merge two entities because they should be the same "
+                    "object (same identifiables), but they have different values for the same "
+                    "Property.\n"
+                    f"Problematic Property: {this_p.name}\n"
+                    f"Values: {this_p.value} and {that_p.value}\n"
+                    f"{record}\n{newrecord}")
                 raise RuntimeError("Cannot merge Entities")
 
 
@@ -761,8 +760,8 @@ class Crawler(object):
             # 1. Is it in the cache of already checked Records?
             if self.treated_records_lookup.get_any(record, identifiable) is not None:
                 treated_record = self.treated_records_lookup.get_any(record, identifiable)
-                # Since the identifiables are the same, treated_record and record actually describe
-                # the same obejct.
+                # Since the identifiables are the same, treated_record and record actually
+                # describe the same object.
                 # We merge record into treated_record in order to prevent loss of information
                 self._merge_identified(treated_record, record, try_to_merge_later, all_records)
                 referencing_entities = self.create_reference_mapping(all_records)
@@ -1149,7 +1148,7 @@ class Crawler(object):
         """
         if crawled_data is None:
             warnings.warn(DeprecationWarning(
-                "Calling synchronize without the data to be synchronized is depricated. Please "
+                "Calling synchronize without the data to be synchronized is deprecated. Please "
Please " "use for example the Scanner to create this data.")) crawled_data = self.crawled_data diff --git a/src/caoscrawler/hdf5_converter.py b/src/caoscrawler/hdf5_converter.py index 506c7b3942cc2518ffa47762c4bed742b9f09b83..5b1ff5775fb74919c989507c449636fd822db7f0 100644 --- a/src/caoscrawler/hdf5_converter.py +++ b/src/caoscrawler/hdf5_converter.py @@ -30,7 +30,7 @@ import numpy as np from typing import Union -import caosdb as db +import linkahead as db from .converters import (convert_basic_element, Converter, DictElementConverter, match_name_and_value, SimpleFileConverter) diff --git a/src/caoscrawler/identifiable.py b/src/caoscrawler/identifiable.py index 75af5be8f06a6ab95a4b7f2b92eda8cf3e321a1b..cefdf4a0f42b1f610e0712fdefebc2dc3b78d69f 100644 --- a/src/caoscrawler/identifiable.py +++ b/src/caoscrawler/identifiable.py @@ -20,7 +20,7 @@ # from __future__ import annotations -import caosdb as db +import linkahead as db from datetime import datetime import json from hashlib import sha256 diff --git a/src/caoscrawler/identifiable_adapters.py b/src/caoscrawler/identifiable_adapters.py index d94b6fca211fc26c2702b8f9f5ad4bc8993bd920..9be539fe0842e3ce24b68060fa2288cdc4c531b2 100644 --- a/src/caoscrawler/identifiable_adapters.py +++ b/src/caoscrawler/identifiable_adapters.py @@ -84,14 +84,15 @@ class IdentifiableAdapter(metaclass=ABCMeta): Some terms: -- Registered identifiable is the definition of an identifiable which is: - - A record type as the parent - - A list of properties - - A list of referenced by statements -- Identifiable is the concrete identifiable, e.g. the Record based on - the registered identifiable with all the values filled in. -- Identified record is the result of retrieving a record based on the - identifiable from the database. +- A *registered identifiable* defines an identifiable template, for example by specifying: + - Parent record types + - Properties + - ``is_referenced_by`` statements +- An *identifiable* belongs to a concrete record. It consists of identifying attributes which "fill + in" the *registered identifiable*. In code, it can be represented as a Record based on the + *registered identifiable* with all the values filled in. +- An *identified record* is the result of retrieving a record from the database, based on the + *identifiable* (and its values). General question to clarify: @@ -221,23 +222,24 @@ startswith: bool, optional for prop in registered_identifiable.properties: if prop.name.lower() != "is_referenced_by": continue - for givenrt in prop.value: + for looking_for_rt in prop.value: found = False - if givenrt == "*": + if looking_for_rt == "*": for val in referencing_entities.values(): if len(val) > 0: found = True refs.extend(val) else: - rt_and_children = get_children_of_rt(givenrt) + rt_and_children = get_children_of_rt(looking_for_rt) for rtname in rt_and_children: if (rtname in referencing_entities): refs.extend(referencing_entities[rtname]) found = True if not found: raise NotImplementedError( - f"An identifying property:\n" - f"\nIdentifying PROPERTY\n{prop.name}" + f"Could not find referencing entities of type(s): {prop.value}\n" + f"for registered identifiable:\n{registered_identifiable}\n" + f"There were {len(referencing_entities)} referencing entities to choose from." 
                     )
         return refs
 
@@ -260,8 +262,8 @@ startswith: bool, optional
 
     def get_identifiable(self, record: db.Record, referencing_entities=None):
         """
-        retrieve the registered identifiable and fill the property values to create an
-        identifiable
+        Retrieve the registered identifiable and fill the property values to create an
+        identifiable.
 
         Args:
             record: the record for which the Identifiable shall be created.
diff --git a/src/caoscrawler/scanner.py b/src/caoscrawler/scanner.py
index c4a8063cae5e74222e92f28a3898656bf4a97f6a..5d92401a50d3e4aa9cc258e792be41df18ef3976 100644
--- a/src/caoscrawler/scanner.py
+++ b/src/caoscrawler/scanner.py
@@ -39,7 +39,7 @@ import warnings
 from collections.abc import Callable
 from typing import Any, Optional, Type, Union
 
-import caosdb as db
+import linkahead as db
 import yaml
 from importlib_resources import files
 from jsonschema import validate
diff --git a/src/caoscrawler/utils.py b/src/caoscrawler/utils.py
index 61b363099d0892b74e91f257bccb6cc832c3d59f..c62f44eeaa75ca42579aa3d6ead437e901cd38ff 100644
--- a/src/caoscrawler/utils.py
+++ b/src/caoscrawler/utils.py
@@ -25,7 +25,7 @@
 
 # Some utility functions, e.g. for extending pylib.
 
-import caosdb as db
+import linkahead as db
 
 
 def has_parent(entity: db.Entity, name: str):
diff --git a/src/doc/README_SETUP.md b/src/doc/README_SETUP.md
index 5f5161d0d672ff3ad14db5c5b49f5c65550b06d7..a75193783d861707adf3b3d45311c392e22626f4 100644
--- a/src/doc/README_SETUP.md
+++ b/src/doc/README_SETUP.md
@@ -4,7 +4,10 @@ see INSTALL.md
 
 ## Run Unit Tests
 
-Run `pytest unittests`.
+
+1. Install additional dependencies:
+   - h5py
+2. Run `pytest unittests`.
 
 ## Documentation ##
 We use sphinx to create the documentation. Docstrings in the code should comply
diff --git a/unittests/test_crawler.py b/unittests/test_crawler.py
index 26aa9e73c42e4d3923d01d83a47c8dfa7007f0a6..920728c418479cd594e894e990d5f15693be589d 100644
--- a/unittests/test_crawler.py
+++ b/unittests/test_crawler.py
@@ -51,9 +51,10 @@ from caoscrawler.scanner import (create_converter_registry, scan_directory,
 from caoscrawler.stores import GeneralStore, RecordStore
 from caoscrawler.structure_elements import (DictElement, DictListElement,
                                             DictTextElement, File)
-from caosdb.apiutils import compare_entities
-from caosdb.cached import cache_clear
-from caosdb.exceptions import EmptyUniqueQueryError
+from linkahead.apiutils import compare_entities
+from caosadvancedtools.models.parser import parse_model_from_string
+from linkahead.cached import cache_clear
+from linkahead.exceptions import EmptyUniqueQueryError
 from pytest import raises
 
 UNITTESTDIR = Path(__file__).parent
@@ -109,6 +110,47 @@ def mock_get_entity_by(eid=None, name=None, path=None):
     raise EmptyUniqueQueryError("")
 
 
+def mock_retrieve_record(identifiable: Identifiable):
+    """ assumes that the identifiable is always only the date"""
+
+    for record in EXAMPLE_SERVER_STATE:
+        if (record.role == "Record" and "date" in identifiable.properties
+                and record.get_property("date").value == identifiable.properties['date']):
+            return record
+    return None
+
+
+def mock_cached_only_rt(query_string: str):
+    """Return an empty Container for Record/File queries, mocked RecordTypes for RT queries."""
+    result = db.Container()
+    lo_query = query_string.lower()
+    if lo_query.startswith("find record ") or lo_query.startswith("find file "):
+        return result
+    model = parse_model_from_string("""
+B:
+  obligatory_properties:
+    C:
+      obligatory_properties:
+        prop_other:
+          datatype: INTEGER
+    prop_ident:
+      datatype: INTEGER
+A:
+  obligatory_properties:
+    B:
+      datatype: LIST<B>
+    prop_ident:
+""")
+    if query_string == "FIND RECORDTYPE 'A'":
+        model.get_deep("A").id = 1
+        return result + [model.get_deep("A")]
+    if query_string == "FIND RECORDTYPE 'B'":
+        model.get_deep("B").id = 2
+        return result + [model.get_deep("B")]
+    print(query_string)
+    raise NotImplementedError("Mock for this case is missing")
+
+
 @pytest.fixture(autouse=True)
 def clear_cache():
     cache_clear()
@@ -363,6 +405,84 @@ def test_split_into_inserts_and_updates_with_copy_attr(crawler_mocked_identifiab
     crawler.identifiableAdapter.retrieve_identified_record_for_identifiable.assert_called()
 
 
+@pytest.mark.xfail(reason="https://gitlab.com/linkahead/linkahead-crawler/-/issues/88")
+@patch("caoscrawler.identifiable_adapters.cached_query",
+       new=Mock(side_effect=mock_cached_only_rt))
+def test_split_iiau_with_unmergeable_list_items():
+    """Test for meaningful exception when referencing a list of unmergeable entities.
+
+Datamodel
+---------
+A:
+  B: LIST<B>
+  prop_ident: INTEGER
+
+B:
+  prop_ident:
+  C:
+
+C:
+  prop_other: INTEGER
+
+Identifiables
+-------------
+
+id_A: [prop_ident]
+id_B: [prop_ident, "is_referenced_by: A"]
+
+Data
+----
+
+b1: ("same", 23)
+b2: ("same", 42)
+
+a: ([b1, b2])
+    """
+    prop_ident = db.Property("prop_ident", datatype=db.INTEGER)
+    prop_other = db.Property("prop_other", datatype=db.INTEGER)
+    rt_c = db.RecordType("C").add_property(prop_other)
+    # Somehow it is necessary that `B` has a reference property. Dunno if C must have an
+    # identifiable as well.
+    rt_b = db.RecordType("B").add_property(prop_ident).add_property("C")
+    rt_a = db.RecordType("A").add_property(prop_ident).add_property("LIST<B>")
+
+    ident_a = db.RecordType().add_parent("A").add_property("prop_ident")
+    ident_b = db.RecordType().add_parent("B").add_property("prop_ident").add_property(
+        "is_referenced_by", value="A")
+    ident_c = db.RecordType().add_parent("C").add_property("prop_other").add_property(
+        "is_referenced_by", value="B")
+
+    rec_a = db.Record("a").add_parent(rt_a).add_property("prop_ident", value=1234)
+    rec_b = []
+    rec_c = []
+    for value in [23, 42]:
+        new_c = db.Record().add_parent(rt_c).add_property("prop_other", value=value)
+        rec_c.append(new_c)
+        rec_b.append(db.Record().add_parent(rt_b).add_property(
+            "prop_ident", value=2020).add_property("C", value=new_c))
+    rec_a.add_property("B", rec_b)
+
+    ident_adapter = CaosDBIdentifiableAdapter()
+    ident_adapter.register_identifiable("A", ident_a)
+    ident_adapter.register_identifiable("B", ident_b)
+    ident_adapter.register_identifiable("C", ident_c)
+
+    crawler = Crawler(identifiableAdapter=ident_adapter)
+
+    # This should give a merge conflict, and not
+    # "Could not find referencing entities of type(s): A"
+
+    # from IPython import embed; embed()
+    with raises(RuntimeError) as rte:
+        crawler.synchronize(commit_changes=False,
+                            crawled_data=[rec_a, *rec_b, *rec_c])
+    assert not isinstance(rte.value, NotImplementedError), \
+        "Exception must not be NotImplementedError, but plain RuntimeError."
+ assert "Could not find referencing entities" not in rte.value.args[0] + assert "merging impossible" in rte.something + # crawler.split_into_inserts_and_updates(ent_list=[rec_a, *rec_b, *rec_c]) + + def test_has_missing_object_in_references(): crawler = Crawler() # Simulate remote server content by using the names to identify records @@ -449,16 +569,6 @@ def reset_mocks(mocks): mock.reset_mock() -def mock_retrieve_record(identifiable: Identifiable): - """ assumes that the identifiable is always only the date""" - - for record in EXAMPLE_SERVER_STATE: - if (record.role == "Record" - and record.get_property("date").value == identifiable.properties['date']): - return record - return None - - @ patch("caoscrawler.crawl.cached_get_entity_by", new=Mock(side_effect=mock_get_entity_by)) @ patch("caoscrawler.identifiable_adapters.cached_get_entity_by",