diff --git a/CHANGELOG.md b/CHANGELOG.md index 15a35a01473f02a55ef5d9f04aac6f2e13af4ca6..7cbda476773ae1b64bb45cdd1dca1be0e7894748 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -170,6 +170,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - ``add_prefix`` and ``remove_prefix`` arguments for the command line interface and the ``crawler_main`` function for the adding/removal of path prefixes when creating file entities. +- More strict checking of `identifiables.yaml`. +- Better error messages when server does not conform to expected data model. ### Changed ### diff --git a/src/caoscrawler/exceptions.py b/src/caoscrawler/exceptions.py index 6d08cf76fc177407154e38f0eb6aaa47bc863866..e7c61c34e2abbebef4790bde42f50d4b5b29f957 100644 --- a/src/caoscrawler/exceptions.py +++ b/src/caoscrawler/exceptions.py @@ -27,15 +27,6 @@ class ForbiddenTransaction(Exception): pass -class MissingReferencingEntityError(Exception): - """Thrown if the identifiable requires that some entity references the given entity but there - is no such reference """ - - def __init__(self, *args, rts=None, **kwargs): - self.rts = rts - super().__init__(self, *args, **kwargs) - - class ImpossibleMergeError(Exception): """Thrown if due to identifying information, two SyncNodes or two Properties of SyncNodes should be merged, but there is conflicting information that prevents this. @@ -47,8 +38,29 @@ class ImpossibleMergeError(Exception): super().__init__(self, *args, **kwargs) +class InvalidIdentifiableYAML(Exception): + """Thrown if the identifiable definition is invalid.""" + pass + + class MissingIdentifyingProperty(Exception): """Thrown if a SyncNode does not have the properties required by the corresponding registered identifiable """ pass + + +class MissingRecordType(Exception): + """Thrown if an record type can not be found although it is expected that it exists on the + server. + """ + pass + + +class MissingReferencingEntityError(Exception): + """Thrown if the identifiable requires that some entity references the given entity but there + is no such reference """ + + def __init__(self, *args, rts=None, **kwargs): + self.rts = rts + super().__init__(self, *args, **kwargs) diff --git a/src/caoscrawler/identifiable_adapters.py b/src/caoscrawler/identifiable_adapters.py index 3aae9353cb4c0cf4d6c264616d770837d87e801e..a22ad488f444999cab735bf37828805d96d4d449 100644 --- a/src/caoscrawler/identifiable_adapters.py +++ b/src/caoscrawler/identifiable_adapters.py @@ -36,7 +36,12 @@ import yaml from linkahead.cached import cached_get_entity_by, cached_query from linkahead.utils.escape import escape_squoted_text -from .exceptions import MissingIdentifyingProperty, MissingReferencingEntityError +from .exceptions import ( + InvalidIdentifiableYAML, + MissingIdentifyingProperty, + MissingRecordType, + MissingReferencingEntityError, +) from .identifiable import Identifiable from .sync_node import SyncNode from .utils import has_parent @@ -48,7 +53,10 @@ def get_children_of_rt(rtname): """Supply the name of a recordtype. This name and the name of all children RTs are returned in a list""" escaped = escape_squoted_text(rtname) - return [p.name for p in cached_query(f"FIND RECORDTYPE '{escaped}'")] + recordtypes = [p.name for p in cached_query(f"FIND RECORDTYPE '{escaped}'")] + if not recordtypes: + raise MissingRecordType(f"Record type could not be found on server: {rtname}") + return recordtypes def convert_value(value: Any) -> str: @@ -576,19 +584,32 @@ class CaosDBIdentifiableAdapter(IdentifiableAdapter): """Load identifiables defined in a yaml file""" with open(path, "r", encoding="utf-8") as yaml_f: identifiable_data = yaml.safe_load(yaml_f) + self.load_from_yaml_object(identifiable_data) - for key, value in identifiable_data.items(): - rt = db.RecordType().add_parent(key) - for prop_name in value: + def load_from_yaml_object(self, identifiable_data): + """Load identifiables defined in a yaml object. + """ + + for rt_name, id_list in identifiable_data.items(): + rt = db.RecordType().add_parent(rt_name) + if not isinstance(id_list, list): + raise InvalidIdentifiableYAML( + f"Identifiable contents must be lists, but this was not: {rt_name}") + for prop_name in id_list: if isinstance(prop_name, str): rt.add_property(name=prop_name) elif isinstance(prop_name, dict): for k, v in prop_name.items(): + if k == "is_referenced_by" and not isinstance(v, list): + raise InvalidIdentifiableYAML( + f"'is_referenced_by' must be a list. Found in: {rt_name}") rt.add_property(name=k, value=v) else: - NotImplementedError("YAML is not structured correctly") + raise InvalidIdentifiableYAML( + "Identifiable properties must be str or dict, but this one was not:\n" + f" {rt_name}/{prop_name}") - self.register_identifiable(key, rt) + self.register_identifiable(rt_name, rt) def register_identifiable(self, name: str, definition: db.RecordType): self._registered_identifiables[name] = definition diff --git a/unittests/test_crawler.py b/unittests/test_crawler.py index e7a03e3322da0d937bf3c1330f21b90768b478d8..0a6aee44a1892f1c950a80b936adf184616fd612 100644 --- a/unittests/test_crawler.py +++ b/unittests/test_crawler.py @@ -173,7 +173,15 @@ A: model.get_deep("A").id = 2 return result + [model.get_deep("B")] print(query_string) - raise NotImplementedError("Mock for this case is missing") + raise NotImplementedError(f"Mock for this case is missing: {query_string}") + + +def mock_cached_only_rt_allow_empty(query_string: str): + try: + result = mock_cached_only_rt(query_string) + except NotImplementedError: + result = db.Container() + return result @pytest.fixture(autouse=True) diff --git a/unittests/test_data/invalid_identifiable/identifiable_content_no_list.yaml b/unittests/test_data/invalid_identifiable/identifiable_content_no_list.yaml new file mode 100644 index 0000000000000000000000000000000000000000..aee572a190bd7f439f638ef7c9a5d94a831aca81 --- /dev/null +++ b/unittests/test_data/invalid_identifiable/identifiable_content_no_list.yaml @@ -0,0 +1,4 @@ +Experiment: + date: + - 1 + - 2 diff --git a/unittests/test_data/invalid_identifiable/identifiable_no_str_or_dict.yaml b/unittests/test_data/invalid_identifiable/identifiable_no_str_or_dict.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a33c4ace9f8709a9b4a77c5fd8f38514acbe1e9c --- /dev/null +++ b/unittests/test_data/invalid_identifiable/identifiable_no_str_or_dict.yaml @@ -0,0 +1,3 @@ +Experiment: +- date +- 23 diff --git a/unittests/test_data/invalid_identifiable/identifiable_referenced_no_list.yaml b/unittests/test_data/invalid_identifiable/identifiable_referenced_no_list.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a504eab748d4891c3e1088ee785afcf6347fbbab --- /dev/null +++ b/unittests/test_data/invalid_identifiable/identifiable_referenced_no_list.yaml @@ -0,0 +1,5 @@ +Experiment: +- date +Event: +- is_referenced_by: Experiment +- event_id diff --git a/unittests/test_identifiable_adapters.py b/unittests/test_identifiable_adapters.py index e37c1ad4953880f988bb1efc3f6804766805b4ee..94685c27523f2f2cebd9d0723078f2da9ff56f78 100644 --- a/unittests/test_identifiable_adapters.py +++ b/unittests/test_identifiable_adapters.py @@ -34,6 +34,8 @@ from pathlib import Path import caosdb as db import pytest +from caoscrawler.exceptions import (InvalidIdentifiableYAML, + ) from caoscrawler.identifiable import Identifiable from caoscrawler.identifiable_adapters import (CaosDBIdentifiableAdapter, IdentifiableAdapter, @@ -122,6 +124,23 @@ def test_load_from_yaml_file(): assert project_i.get_property("title") is not None +def test_invalid_yaml(): + ident = CaosDBIdentifiableAdapter() + invalid_dir = UNITTESTDIR / "test_data" / "invalid_identifiable" + with pytest.raises(InvalidIdentifiableYAML) as exc: + ident.load_from_yaml_definition(invalid_dir / "identifiable_content_no_list.yaml") + assert str(exc.value) == "Identifiable contents must be lists, but this was not: Experiment" + + with pytest.raises(InvalidIdentifiableYAML) as exc: + ident.load_from_yaml_definition(invalid_dir / "identifiable_referenced_no_list.yaml") + assert str(exc.value) == "'is_referenced_by' must be a list. Found in: Event" + + with pytest.raises(InvalidIdentifiableYAML) as exc: + ident.load_from_yaml_definition(invalid_dir / "identifiable_no_str_or_dict.yaml") + assert str(exc.value) == ("Identifiable properties must be str or dict, but this one was not:\n" + " Experiment/23") + + def test_non_default_name(): ident = CaosDBIdentifiableAdapter() identifiable = ident.get_identifiable(SyncNode(db.Record(name="don't touch it") @@ -141,8 +160,8 @@ def test_wildcard_ref(): dummy.id = 1 identifiable = ident.get_identifiable(SyncNode(rec, db.RecordType() .add_parent(name="Person") - .add_property(name="is_referenced_by", value=["*"])), - + .add_property(name="is_referenced_by", + value=["*"])), [dummy] ) assert identifiable.backrefs[0] == 1 diff --git a/unittests/test_sync_graph.py b/unittests/test_sync_graph.py index a7c1539118a4cd87d8c46bf6e18b07b90a90361a..9015e74be69c60c43ece80a2f742d6e9b7badda6 100644 --- a/unittests/test_sync_graph.py +++ b/unittests/test_sync_graph.py @@ -25,10 +25,15 @@ from unittest.mock import MagicMock, Mock, patch import linkahead as db import pytest -from test_crawler import basic_retrieve_by_name_mock_up, mock_get_entity_by +from test_crawler import (basic_retrieve_by_name_mock_up, + mock_cached_only_rt_allow_empty, + mock_get_entity_by, + ) from caoscrawler.exceptions import (ImpossibleMergeError, - MissingIdentifyingProperty) + MissingIdentifyingProperty, + MissingRecordType, + ) from caoscrawler.identifiable import Identifiable from caoscrawler.identifiable_adapters import CaosDBIdentifiableAdapter from caoscrawler.sync_graph import SyncGraph, _set_each_scalar_value @@ -651,3 +656,30 @@ def test_set_each_scalar_value(): assert a.properties[0].value == 42 _set_each_scalar_value(a, lambda x: x == 42, lambda x: None) assert a.properties[0].value is None + + +@patch("caoscrawler.identifiable_adapters.cached_query", + new=Mock(side_effect=mock_cached_only_rt_allow_empty)) +def test_merge_referenced_by(): + """Merging two entities that are referenced by a third entity with nonexistent RecordType. + + See also https://gitlab.com/linkahead/linkahead-crawler/-/issues/95 + """ + ident = CaosDBIdentifiableAdapter() + ident.load_from_yaml_object({ + "RT_A": ["name"], + "RT_B": [{"is_referenced_by": ["RT_A"]}, "my_id"] + }) + crawled_data: list = [] + references: list = [] + for ii in [0, 1]: + rec = db.Record().add_parent("RT_B").add_property("my_id", value=ii) + references.append(rec) + crawled_data.append(rec) + rec_a = db.Record(name="Rec_A").add_parent("RT_A") + rec_a.add_property("my_ref", value=references) + crawled_data.append(rec_a) + + with pytest.raises(MissingRecordType) as mrt: + SyncGraph(crawled_data, ident) + assert str(mrt.value).endswith("Record type could not be found on server: RT_A")