diff --git a/src/caoscrawler/crawl.py b/src/caoscrawler/crawl.py index 0a8c156f603587e894a19b1f356cce2e1dcc5774..d5210a189a52d538e9bee5a8669695136182e5b3 100644 --- a/src/caoscrawler/crawl.py +++ b/src/caoscrawler/crawl.py @@ -61,6 +61,7 @@ from linkahead.utils.escape import escape_squoted_text from .config import get_config_setting from .converters import Converter, ConverterValidationError from .debug_tree import DebugTree +from .exceptions import MissingIdentifyingProperty from .identifiable import Identifiable from .identifiable_adapters import (CaosDBIdentifiableAdapter, IdentifiableAdapter) @@ -297,60 +298,6 @@ class Crawler(object): self.crawled_data = data return data -# def replace_references_with_cached(self, record: db.Record, referencing_entities: dict): -# """ -# Replace all references with the versions stored in the cache. -# -# If the cache version is not identical, raise an error. -# """ -# for p in record.properties: -# if (isinstance(p.value, list)): -# lst = [] -# for el in p.value: -# if (isinstance(el, db.Entity) and el.id is None): -# cached = self.treated_records_lookup.get_any( -# el, -# self.identifiableAdapter.get_identifiable( -# el, referencing_entities[id(el)])) -# if cached is None: -# lst.append(el) -# continue -# if not check_identical(cached, el, True): -# if isinstance(p.value, db.File): -# if p.value.path != cached.path: -# raise RuntimeError( -# "The cached and the referenced entity are not identical.\n" -# f"Cached:\n{cached}\nReferenced:\n{el}" -# ) -# else: -# raise RuntimeError( -# "The cached and the referenced entity are not identical.\n" -# f"Cached:\n{cached}\nReferenced:\n{el}" -# ) -# lst.append(cached) -# else: -# lst.append(el) -# p.value = lst -# if (isinstance(p.value, db.Entity) and p.value.id is None): -# cached = self.treated_records_lookup.get_any( -# p.value, self.identifiableAdapter.get_identifiable( -# p.value, referencing_entities[id(p.value)])) -# if cached is None: -# continue -# if not 
check_identical(cached, p.value, True): -# if isinstance(p.value, db.File): -# if p.value.path != cached.path: -# raise RuntimeError( -# "The cached and the referenced entity are not identical.\n" -# f"Cached:\n{cached}\nReferenced:\n{p.value}" -# ) -# else: -# raise RuntimeError( -# "The cached and the referenced entity are not identical.\n" -# f"Cached:\n{cached}\nReferenced:\n{p.value}" -# ) -# p.value = cached - def split_into_inserts_and_updates(self, st: SyncGraph): entity_was_treated = True # st.entities contains Entities which could not yet be checked against the remote server @@ -367,11 +314,10 @@ class Crawler(object): for se in list(st.unchecked): if se not in st.unchecked: continue + if st.identity_relies_on_unchecked_entity(se): - print(st.nodes.index(se), "relies on unchecked") continue - print(se.identifiable) if se.identifiable is None: st.set_identifiable_of_node(se, st.identifiableAdapter.get_identifiable( se, st.backward_id_referenced_by[se.uuid])) @@ -379,19 +325,6 @@ class Crawler(object): if se not in st.unchecked: continue - # if (equivalent_se.identifiable is None and not - # self.identity_relies_on_unchecked_entity(equivalent_se)): - # try: - # equivalent_se.identifiable = self.identifiableAdapter.get_identifiable( - # equivalent_se, self.backward_id_referenced_by[equivalent_se.uuid]) - # if equivalent_se not in self.unchecked: - # self._identifiable_look_up[ - # equivalent_se.identifiable.get_representation() - # ] = equivalent_se - # except Exception as es: - # print(es) - # pass - identified_record = ( st.identifiableAdapter.retrieve_identified_record_for_identifiable( se.identifiable)) @@ -411,14 +344,13 @@ class Crawler(object): circle = st.unchecked_contains_circular_dependency() if circle is None: logger.error("Failed, but found NO circular dependency. 
The data is as follows:" - # + str(self.compact_entity_list_representation(st.entities, - # referencing_entities)) + + "\n".join([str(el) for el in st.unchecked]) + ) else: logger.error("Found circular dependency (Note that this might include references " "that are not identifying properties): " - # + self.compact_entity_list_representation(circle, - # referencing_entities) + + "\n".join([str(el) for el in circle]) ) raise RuntimeError( @@ -745,8 +677,6 @@ class Crawler(object): for record in to_be_updated: if record.id is not None: # TODO: use cache here? - print(record.id) - print(record) identified_records.append(cached_get_entity_by(eid=record.id)) else: raise Exception("Please report a bug: At this stage all records to be updated" diff --git a/src/caoscrawler/exceptions.py b/src/caoscrawler/exceptions.py new file mode 100644 index 0000000000000000000000000000000000000000..b271cf5bc77b9d709b11ef3e46b95755f71ec41c --- /dev/null +++ b/src/caoscrawler/exceptions.py @@ -0,0 +1,38 @@ +#!/usr/bin/env python3 +# encoding: utf-8 +# +# This file is a part of the LinkAhead Project. +# +# Copyright (C) 2024 Indiscale GmbH <info@indiscale.com> +# Copyright (C) 2024 Henrik tom Wörden <h.tomwoerden@indiscale.com> +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <https://www.gnu.org/licenses/>. 
+# + +class ForbiddenTransaction(Exception): + pass + + +class MissingReferencingEntityError(Exception): + def __init__(self, *args, rts=None, **kwargs): + self.rts = rts + super().__init__(*args, **kwargs) + + +class ImpossibleMergeError(Exception): + pass + + +class MissingIdentifyingProperty(Exception): + pass diff --git a/src/caoscrawler/identifiable_adapters.py b/src/caoscrawler/identifiable_adapters.py index 2405e454f7a17f8a74617349cd95e5b61fe66541..ef27515f09f1a9f9fa763e934b20e19c5a25abb6 100644 --- a/src/caoscrawler/identifiable_adapters.py +++ b/src/caoscrawler/identifiable_adapters.py @@ -37,6 +37,8 @@ import yaml from linkahead.cached import cached_get_entity_by, cached_query from linkahead.utils.escape import escape_squoted_text +from .exceptions import (MissingIdentifyingProperty, + MissingReferencingEntityError) from .identifiable import Identifiable from .sync_node import SyncNode from .utils import has_parent @@ -274,7 +276,7 @@ startswith: bool, optional for el in identifiable_backrefs: assert isinstance(el, SyncNode) if len(identifiable_backrefs) == 0: - raise RuntimeError( + raise MissingReferencingEntityError( f"Could not find referencing entities of type(s): {prop.value}\n" f"for registered identifiable:\n{registered_identifiable}\n" f"There were {len(identifiable_backrefs)} referencing entities to choose from.\n" @@ -288,7 +290,7 @@ startswith: bool, optional options = [p.value for p in se.properties if p.name == prop.name] if len(options) == 0: - raise NotImplementedError( + raise MissingIdentifyingProperty( f"The following record is missing an identifying property:\n" f"RECORD\n{se}\nIdentifying PROPERTY\n{prop.name}" ) @@ -327,7 +329,8 @@ startswith: bool, optional properties=identifiable_props, backrefs=[e.id for e in identifiable_backrefs] ) - except Exception: + except Exception as exc: + logger.error(exc) logger.error(f"Error while creating identifiable for this record:\n{se}") raise @@ -365,6 +368,8 @@ startswith: bool, optional 
def referencing_entity_has_appropriate_type(parents, register_identifiable): if register_identifiable.get_property("is_referenced_by") is None: return False + if register_identifiable.get_property("is_referenced_by").value is None: + return False appropriate_types = [el.lower() for el in register_identifiable.get_property("is_referenced_by").value] if "*" in appropriate_types: diff --git a/src/caoscrawler/scanner.py b/src/caoscrawler/scanner.py index 9d1f538732858ff2fbf949d45c359ebb16fe3480..f6fc4d8f81d68277d6ad405ab04f50999d85525d 100644 --- a/src/caoscrawler/scanner.py +++ b/src/caoscrawler/scanner.py @@ -362,16 +362,17 @@ def scanner(items: list[StructureElement], debug_tree.debug_metadata["usage"][str(element)].add( "/".join(converters_path + [converter.name])) mod_info = debug_tree.debug_metadata["provenance"] - for record_name, prop_name in keys_modified: - # TODO: check - internal_id = record_store_copy.get_internal_id( - record_name) - record_identifier = record_name + \ - "_" + str(internal_id) - converter.metadata["usage"].add(record_identifier) - mod_info[record_identifier][prop_name] = ( - structure_elements_path + [element.get_name()], - converters_path + [converter.name]) + if keys_modified is not None: + for record_name, prop_name in keys_modified: + # TODO: check + internal_id = record_store_copy.get_internal_id( + record_name) + record_identifier = record_name + \ + "_" + str(internal_id) + converter.metadata["usage"].add(record_identifier) + mod_info[record_identifier][prop_name] = ( + structure_elements_path + [element.get_name()], + converters_path + [converter.name]) scanner(children, converter.converters, general_store_copy, record_store_copy, diff --git a/src/caoscrawler/sync_graph.py b/src/caoscrawler/sync_graph.py index ff9b7c6fa05eb4fc8b026bb160e81bc2151bc99e..37d46d15767237b7e877072f5304b4b5d66be0eb 100644 --- a/src/caoscrawler/sync_graph.py +++ b/src/caoscrawler/sync_graph.py @@ -25,6 +25,7 @@ A data model class for the semantic data 
that shall be created by synchronizatio from __future__ import annotations +import logging from typing import Any, Dict, List, Optional, Union import linkahead as db @@ -33,9 +34,36 @@ from linkahead.apiutils import (EntityMergeConflictError, compare_entities, from linkahead.cached import cache_clear, cached_get_entity_by from linkahead.exceptions import EmptyUniqueQueryError +from .exceptions import ImpossibleMergeError, MissingReferencingEntityError from .identifiable_adapters import IdentifiableAdapter from .sync_node import SyncNode +logger = logging.getLogger(__name__) + + +def _for_each_scalar_value(node, condition, kind, value=None): + for p in node.properties: + if isinstance(p.value, list): + for ii, el in enumerate(p.value): + if condition(el): + if kind == "remove": + p.value.remove(el) + elif kind == "set": + p.value[ii] = value(el) + elif condition(p.value): + if kind == "remove": + node.properties.remove(p) + elif kind == "set": + p.value = value(p.value) + + +def _remove_each_scalar_value(node, condition): + _for_each_scalar_value(node, condition, "remove") + + +def _set_each_scalar_value(node, condition, value): + _for_each_scalar_value(node, condition, "set", value=value) + class SyncGraph(): """ combines nodes in the graph based on their identity in order to create a graph of objects @@ -106,8 +134,6 @@ class SyncGraph(): if node_id is None: node_id = self._get_new_id() node.id = node_id - for el in node.other: - el.id = node_id if node_id in self._id_look_up: self._merge_into(node, self._id_look_up[node.id]) else: @@ -137,17 +163,13 @@ class SyncGraph(): for el in self.nodes: entities.append(el.export_entity()) node_map[id(el)] = entities[-1] - for oel in el.other: - node_map[id(oel)] = entities[-1] - for ent in entities: - for p in ent.properties: - if isinstance(p.value, list): - for ii, el in enumerate(p.value): - if isinstance(el, SyncNode): - p.value[ii] = node_map[id(el)] - elif isinstance(p.value, SyncNode): - p.value = 
node_map[id(p.value)] + if len(self.unchecked) > 1: + self.unchecked_contains_circular_dependency() + for ent in entities: + _set_each_scalar_value(ent, + condition=lambda val: isinstance(val, SyncNode), + value=lambda val: node_map[id(val)]) missing = [el for el in entities if el.id < 0] existing = [el for el in entities if el.id > 0] @@ -189,7 +211,7 @@ class SyncGraph(): if referenced in self.unchecked: if referenced in circle: closed = True - circle.append(pval) # FIXME + circle.append(referenced) added_to_circle = True if not added_to_circle: return None @@ -273,53 +295,6 @@ class SyncGraph(): SyncGraph._create_flat_list([p.value], flat) return flat - @staticmethod - def _treat_merge_error_of(newrecord, record): - """ - The parameters are two entities that cannot be merged with the merge_entities function. - - # This function checks for two obvious cases where no merge will ever be possible: - # 1. Two Entities with differing IDs - # 2. Two non-Entity values which differ - - It creates a more informative logger message and raises an Exception in those cases. - """ - for this_p in newrecord.properties: - that_p = record.get_property(this_p.name) - - if that_p is None: - logger.debug(f"Property {this_p.name} does not exist in the second entity. Note that " - "this should not be the reason for the merge conflict.") - continue - - if (isinstance(this_p.value, db.Entity) - and isinstance(that_p.value, db.Entity)): - if this_p.value.id is not None and that_p.value.id is not None: - if this_p.value.id != that_p.value.id: - logger.error("The Crawler is trying to merge two entities " - "because they should be the same object (same" - " identifiables), but they reference " - "different Entities with the same Property." 
- f"Problematic Property: {this_p.name}\n" - f"Referenced Entities: {this_p.value.id} and " - f"{that_p.value.id}\n" - f"{record}\n{newrecord}") - raise RuntimeError("Cannot merge Entities") - elif (not isinstance(this_p.value, db.Entity) - and not isinstance(that_p.value, db.Entity)): - if ((this_p.value != that_p.value) - # TODO can we also compare lists? - and not isinstance(this_p.value, list) - and not isinstance(that_p.value, list)): - logger.error( - "The Crawler is trying to merge two entities because they should be the same " - "object (same identifiables), but they have different values for the same " - "Property.\n" - f"Problematic Property: {this_p.name}\n" - f"Values: {this_p.value} and {that_p.value}\n" - f"{record}\n{newrecord}") - raise RuntimeError("Cannot merge Entities") - @staticmethod def _create_reference_mapping(flat: List[SyncNode]): """ @@ -421,10 +396,6 @@ class SyncGraph(): """ assert source is not target target.update(source) - target.other.append(source) - target.other.extend(source.other) - for el in target.other: - el.id = target.id # update reference mappings for node in self.forward_references.pop(source.uuid): @@ -432,6 +403,10 @@ class SyncGraph(): self.backward_references[node.uuid].remove(source) self.backward_references[node.uuid].add(target) for node in self.backward_references.pop(source.uuid): + # replace actual reference property values + _set_each_scalar_value(node, + condition=lambda val: val is source, + value=lambda val: target) self.backward_references[target.uuid].add(node) self.forward_references[node.uuid].remove(source) self.forward_references[node.uuid].add(target) @@ -473,7 +448,6 @@ class SyncGraph(): target, self.backward_id_referenced_by[target.uuid]) self.set_identifiable_of_node(target, identifiable) except Exception as es: - print(es) pass if id(source) in self._missing and id(target) not in self._missing: self._treat_missing(target) @@ -491,14 +465,9 @@ class SyncGraph(): 
self.identifiableAdapter.get_registered_identifiable(el))) se_lookup[id(el)] = self.nodes[-1] for node in self.nodes: - for p in node.properties: - if isinstance(p.value, list): - for index, val in enumerate(p.value): - if id(val) in se_lookup: - p.value[index] = se_lookup[id(val)] - else: - if id(p.value) in se_lookup: - p.value = se_lookup[id(p.value)] + _set_each_scalar_value(node, + condition=lambda val: id(val) in se_lookup, + value=lambda val: se_lookup[id(val)]) def _treat_missing(self, node): self._missing[id(node)] = node @@ -511,7 +480,6 @@ class SyncGraph(): other_node, self.backward_id_referenced_by[other_node.uuid]) self.set_identifiable_of_node(other_node, identifiable) except Exception as es: - print(es) pass if other_node in self.unchecked: self.set_id_of_node(other_node) @@ -527,7 +495,6 @@ class SyncGraph(): other_node, self.backward_id_referenced_by[other_node.uuid]) self.set_identifiable_of_node(other_node, identifiable) except Exception as es: - print(es) pass def __repr__(self): diff --git a/src/caoscrawler/sync_node.py b/src/caoscrawler/sync_node.py index 4dfc61ceeb29d157ae36cccb23d3273b4868861c..492839a9b9106ef591f8219f3610aca4997866c3 100644 --- a/src/caoscrawler/sync_node.py +++ b/src/caoscrawler/sync_node.py @@ -22,12 +22,18 @@ from __future__ import annotations +import logging from typing import Any, Dict, List, Optional, Union from uuid import uuid4 as uuid import linkahead as db +import yaml from linkahead.common.models import _ParentList, _Properties +from .exceptions import ImpossibleMergeError + +logger = logging.getLogger(__name__) + class SyncNode(): """ represents the information related to an Entity as it shall be created in LinkAhead @@ -102,12 +108,50 @@ class SyncNode(): pval = pval.id if entval != pval: - raise db.apiutils.EntityMergeConflictError(f"Differing values were set for Property {p.name}:\n" - f"{ent.get_property(p).value}\n{p.value}") + logger.error("The Crawler is trying to create an entity," + " but there are 
have conflicting property values." + f"Problematic Property: {p.name}\n" + f"First value:\n{ent.get_property(p).value}\n" + f"Second value:\n{p.value}\n" + f"{self}" + ) + ime = ImpossibleMergeError("Cannot merge Entities") + ime.pname = p.name + ime.values = (ent.get_property(p).value, p.value) + raise ime else: ent.add_property(id=p.id, name=p.name, value=p.value) return ent + def __repr__(self): + res = f"\n=====================================================\n{self.role}\n" + if hasattr(self, "_metadata"): + res += f"user: {self._metadata['user']}\n" + res += f"json: {self._metadata['json']}\n" + res += "---------------------------------------------------\n" + res += yaml.dump({"uuid": self.uuid.hex, "id": self.id, "name": self.name, + "parents": [el.name for el in self.parents]}, allow_unicode=True) + res += "---------------------------------------------------\n" + res += "properties:\n" + d = {} + for p in self.properties: + v = p.value + d[p.name] = [] + if not isinstance(p.value, list): + v = [v] + for el in v: + if isinstance(el, SyncNode): + d[p.name].append({"id": el.id, "name": el.name, "parents": [e.name for e in + el.parents]}) + else: + d[p.name].append(el) + + return (res + yaml.dump(d, allow_unicode=True) + + "=====================================================\n") + + def is_unidentifiable(self): + return self.registered_identifiable.get_property("no-ident") is not None + def parent_in_list(parent, plist): missing = False diff --git a/unittests/test_crawler.py b/unittests/test_crawler.py index bc6b5d43e79aff5974c9745a489716b7d9b4763b..90726f7756333abd8c71ae90c91018cf25fc00f1 100644 --- a/unittests/test_crawler.py +++ b/unittests/test_crawler.py @@ -42,6 +42,9 @@ from caosadvancedtools.models.parser import parse_model_from_string from caoscrawler.crawl import (Crawler, SecurityMode, _treat_deprecated_prefix, crawler_main, split_restricted_path) from caoscrawler.debug_tree import DebugTree +from caoscrawler.exceptions import 
(ImpossibleMergeError, + MissingIdentifyingProperty, + MissingReferencingEntityError) from caoscrawler.identifiable import Identifiable from caoscrawler.identifiable_adapters import (CaosDBIdentifiableAdapter, IdentifiableAdapter, @@ -281,9 +284,9 @@ def test_split_into_inserts_and_updates_unidentified(crawler_mocked_identifiable crawler = crawler_mocked_identifiable_retrieve st = SyncGraph([db.Record().add_parent("someparent")], crawler.identifiableAdapter) - with raises(ValueError) as err: + with raises(MissingIdentifyingProperty) as err: crawler.split_into_inserts_and_updates(st) - assert str(err.value).startswith("There is no identifying information.") + assert str(err.value).startswith("The node has no name.") def basic_retrieve_by_name_mock_up(rec, referencing_entities=None, known=None): @@ -493,7 +496,7 @@ a: ([b1, b2]) # The Cs cannot be merged due to different identifying properties # The Bs cannot be merged due to differeng references to Cs - with raises(db.apiutils.EntityMergeConflictError) as rte: + with raises(ImpossibleMergeError) as rte: crawler.split_into_inserts_and_updates(st) # assert not isinstance(rte.value, NotImplementedError), \ # "Exception must not be NotImplementedError, but plain RuntimeError." @@ -704,7 +707,7 @@ def test_split_into_inserts_and_updates_backref(crawler_mocked_for_backref_test) st = SyncGraph([db.Record(name="B").add_parent("C")], crawler.identifiableAdapter) # Test without referencing object # currently a RuntimeError is raised if necessary properties are missing. 
- with raises(RuntimeError): + with raises(MissingReferencingEntityError): crawler.split_into_inserts_and_updates(st) # identifiables were not yet checked diff --git a/unittests/test_sync_graph.py b/unittests/test_sync_graph.py index 2b39d42e225d39d7bcbecbd8d04cba07bddef976..1d053c54aa5096a61ae0c8404511099ce753ba07 100644 --- a/unittests/test_sync_graph.py +++ b/unittests/test_sync_graph.py @@ -23,6 +23,7 @@ from unittest.mock import MagicMock, Mock, patch import linkahead as db import pytest +from caoscrawler.exceptions import ImpossibleMergeError from caoscrawler.identifiable import Identifiable from caoscrawler.identifiable_adapters import CaosDBIdentifiableAdapter from caoscrawler.sync_graph import SyncGraph @@ -286,16 +287,16 @@ def test_backward_id_referenced_by(): ident_adapter.register_identifiable("C", ident_b) referenced = db.Record(name="B").add_parent("C") - entlist = [referenced, db.Record(name="A").add_parent("BR").add_property("ref", referenced), ] + ent_list = [referenced, db.Record(name="A").add_parent("BR").add_property("ref", referenced), ] - st = SyncGraph(entlist, ident_adapter) + st = SyncGraph(ent_list, ident_adapter) assert st.nodes[1] in st.backward_id_referenced_by[st.nodes[0].uuid] def test_set_id_of_node(simple_adapter): # setting the id should lead to the node being marked as existing - entlist = [db.Record().add_parent("RT1")] - st = SyncGraph(entlist, simple_adapter) + ent_list = [db.Record().add_parent("RT1")] + st = SyncGraph(ent_list, simple_adapter) assert len(st.nodes) == 1 assert len(st.unchecked) == 1 st.set_id_of_node(st.unchecked[0], 101) @@ -304,8 +305,8 @@ def test_set_id_of_node(simple_adapter): assert id(st.nodes[0]) in st._existing # setting the id with None should lead to the node being marked as missing - entlist = [db.Record().add_parent("RT1").add_property(name="RT2", value=1)] - st = SyncGraph(entlist, simple_adapter) + ent_list = [db.Record().add_parent("RT1").add_property(name="RT2", value=1)] + st = 
SyncGraph(ent_list, simple_adapter) assert len(st.nodes) == 1 assert len(st.unchecked) == 1 # is automatically set in during initialization of graph @@ -316,10 +317,10 @@ def test_set_id_of_node(simple_adapter): assert id(st.nodes[0]) in st._missing # setting the id to one that already exists should lead to a merge - entlist = [ + ent_list = [ db.Record(id=101).add_parent("RT1"), db.Record().add_parent("RT1").add_property(name="a", value=1)] - st = SyncGraph(entlist, simple_adapter) + st = SyncGraph(ent_list, simple_adapter) assert len(st.nodes) == 2 assert len(st.unchecked) == 1 st.set_id_of_node(st.unchecked[0], 101) @@ -328,11 +329,11 @@ def test_set_id_of_node(simple_adapter): assert st.nodes[0].properties[0].name == "a" # setting the id to None should lead to depending nodes marked as missing - entlist = [ + ent_list = [ db.Record().add_parent("RT3").add_property(name="a", value=1).add_property( name="RT2", value=db.Record().add_parent("RT2")), ] - st = SyncGraph(entlist, simple_adapter) + st = SyncGraph(ent_list, simple_adapter) assert len(st.nodes) == 2 assert len(st.unchecked) == 2 st.set_id_of_node(st.unchecked[0]) @@ -342,13 +343,13 @@ def test_set_id_of_node(simple_adapter): assert id(st.nodes[1]) in st._missing # same as above but with backref - entlist = [ + ent_list = [ db.Record() .add_parent("RT4") .add_property(name="RT3", value=db.Record().add_parent("RT3").add_property(name="a", value=1)), ] - st = SyncGraph(entlist, simple_adapter) + st = SyncGraph(ent_list, simple_adapter) assert len(st.nodes) == 2 assert len(st.unchecked) == 2 assert st.unchecked[1].identifiable is not None @@ -359,13 +360,13 @@ def test_set_id_of_node(simple_adapter): assert id(st.nodes[1]) in st._missing # setting an id might allow to check another node that depends on the former - entlist = [ + ent_list = [ db.Record() .add_parent("RT4") .add_property(name="RT3", value=db.Record().add_parent("RT3").add_property(name="a", value=1)), ] - st = SyncGraph(entlist, 
simple_adapter) + st = SyncGraph(ent_list, simple_adapter) assert st.nodes[0].identifiable is None assert st.nodes[1].identifiable is not None st.set_id_of_node(st.unchecked[1], 111) @@ -373,7 +374,7 @@ def test_set_id_of_node(simple_adapter): assert st.nodes[1].identifiable is not None # same as above but going one step further: the new identifiable allows to merge that node - entlist = [ + ent_list = [ (db.Record() .add_parent("RT4") .add_property(name="RT3", @@ -383,7 +384,7 @@ def test_set_id_of_node(simple_adapter): .add_parent("RT4") .add_property(name="RT3", value=111)) ] - st = SyncGraph(entlist, simple_adapter) + st = SyncGraph(ent_list, simple_adapter) assert st.nodes[0].identifiable is None assert st.nodes[1].identifiable is not None assert st.nodes[2].identifiable is not None @@ -392,65 +393,44 @@ def test_set_id_of_node(simple_adapter): assert st.nodes[0].identifiable is not None assert len(st.nodes) == 2 - # Test for meaningful exception when referencing a list of unmergeable entities. - # - # Datamodel - # --------- - # A: - # B: LIST<B> - # prop_ident: INTEGER - # - # B: - # prop_ident: - # - # - # Identifiables - # ------------- - # - # id_A: [prop_ident] - # id_B: [prop_ident, "is_referenced_by: A"] - # - # Data - # ---- - # - # - # b1: ("same", c1) - # b2: ("same", c2) - # - # a: ([b1, b2]) - - prop_ident = db.Property("prop_ident", datatype=db.INTEGER) - prop_other = db.Property("prop_ident", datatype=db.INTEGER) - # Somehow it is necessary that `B` has a reference property. Dunno if C must have an - # identifiable as well. - rt_b = db.RecordType("B").add_property(prop_ident).add_property("C") - rt_a = db.RecordType("A").add_property(prop_ident).add_property("LIST<B>") - - ident_a = db.RecordType().add_parent("A").add_property("prop_ident") - ident_b = db.RecordType().add_parent("B").add_property("prop_ident").add_property( - "is_referenced_by", value="A") + # Test for meaningful exception when having unmergeable properties. 
+ ent_list = [ + db.Record().add_parent("RT3").add_property('a', value=1).add_property('b', value=1), + db.Record().add_parent("RT3").add_property('a', value=1).add_property('b', value=2), + ] - rec_a = db.Record("a").add_parent(rt_a).add_property("prop_ident", value=1234) - rec_b = [] - for value in [23, 42]: - rec_b.append(db.Record().add_parent(rt_b).add_property("prop_ident", value=2020)) - rec_a.add_property("B", rec_b) + st = SyncGraph(ent_list, simple_adapter) + with pytest.raises(ImpossibleMergeError): + st.export_record_lists() + ent_list = [ + db.Record().add_parent("RT3").add_property('a', value=1) + .add_property('b', value=db.Record().add_parent("RT5")), + db.Record().add_parent("RT3").add_property('a', value=1) + .add_property('b', value=db.Record().add_parent("RT5")), + ] - ident_adapter = CaosDBIdentifiableAdapter() - ident_adapter.register_identifiable("A", ident_a) - ident_adapter.register_identifiable("B", ident_b) + st = SyncGraph(ent_list, simple_adapter) + with pytest.raises(ImpossibleMergeError): + st.export_record_lists() + ent_list = [ + db.Record(id=101).add_parent("RT3") + .add_property('b', value=db.Record().add_parent("RT5")), + db.Record().add_parent("RT3") + .add_property('b', value=db.Record().add_parent("RT5")), + ] - st = SyncGraph([rec_a, *rec_b], ident_adapter) - for node in st.nodes: - print(node.id, node.parents) - assert st.identity_relies_on_unchecked_entity(st.nodes[0]) is False - assert st.identity_relies_on_unchecked_entity(st.nodes[1]) - assert st.identity_relies_on_unchecked_entity(st.nodes[2]) + st = SyncGraph(ent_list, simple_adapter) + assert st.nodes[2].is_unidentifiable() + assert st.nodes[3].is_unidentifiable() + assert len(st.nodes) == 4 + assert len(st.unchecked) == 1 + st.set_id_of_node(st.nodes[1], 101) assert len(st.nodes) == 3 - assert len(st.unchecked) == 3 - st.set_id_of_node(st.nodes[0]) - assert len(st.nodes) == 2 assert len(st.unchecked) == 0 + # until implementation of it ... 
+ with pytest.raises(NotImplementedError): + # with pytest.raises(ImpossibleMergeError): + st.export_record_lists() @patch("caoscrawler.sync_graph.cached_get_entity_by", @@ -466,30 +446,30 @@ def test_merging(simple_adapter): basic_retrieve_by_name_mock_up, known={"A": db.Record(id=1111, name="A")})) # merging based on id - entlist = [ + ent_list = [ db.Record(id=101).add_parent("A"), db.Record(id=101).add_parent("A")] - st = SyncGraph(entlist, ident_adapter) + st = SyncGraph(ent_list, ident_adapter) assert len(st.nodes) == 1 assert len(st.unchecked) == 0 assert 101 == st.nodes[0].id assert "A" == st.nodes[0].parents[0].name # merging based on path - entlist = [ + ent_list = [ db.File(path='101').add_parent("A"), db.File(path='101').add_parent("A")] - st = SyncGraph(entlist, ident_adapter) + st = SyncGraph(ent_list, ident_adapter) assert len(st.nodes) == 1 assert len(st.unchecked) == 0 assert '101' == st.nodes[0].path assert "A" == st.nodes[0].parents[0].name # merging based on identifiable - entlist = [ + ent_list = [ db.File(name='101').add_parent("A").add_property('a', value=1), db.File(name='101').add_parent("A").add_property('a', value=1)] - st = SyncGraph(entlist, ident_adapter) + st = SyncGraph(ent_list, ident_adapter) assert len(st.nodes) == 1 assert st.nodes[0].id is None assert '101' == st.nodes[0].name @@ -499,13 +479,13 @@ def test_merging(simple_adapter): # Merging a mix. One Record needs the identifiable to be merged. But the identifying # information is scattered in the other case. 
- entlist = [ + ent_list = [ db.Record(id=101).add_parent("A"), db.Record(id=101, name='a').add_parent("A"), db.Record(id=101).add_parent("A").add_property('a', value=1), db.Record(name='a').add_parent("A").add_property('a', value=1)] - st = SyncGraph(entlist, ident_adapter) + st = SyncGraph(ent_list, ident_adapter) assert len(st.nodes) == 1 assert len(st.unchecked) == 0 assert 'a' == st.nodes[0].name @@ -517,7 +497,7 @@ def test_merging(simple_adapter): def test_something(simple_adapter): a = db.Record().add_parent("RT3").add_property('a', value=1) - entlist = [ + ent_list = [ a, db.Record().add_parent("RT3").add_property('a', value=1), db.Record().add_parent("RT3").add_property('a', value=1), @@ -526,7 +506,7 @@ def test_something(simple_adapter): db.Record().add_parent("RT4").add_property('RT3', value=a), db.Record().add_parent("RT3").add_property('a', value=1), db.Record().add_parent("RT3").add_property('a', value=1)] - st = SyncGraph(entlist, simple_adapter) + st = SyncGraph(ent_list, simple_adapter) assert len(st.nodes) == 2 assert len(st.unchecked) == 2 assert 'RT4' == st.nodes[1].parents[0].name @@ -543,6 +523,7 @@ def test_sync_node(): .add_property(name="a", value='a') .add_property(id=103, value='b')) sn = SyncNode(rec) + assert "Record" in str(sn) assert sn.id == rec.id assert sn.name == rec.name assert sn.parents == rec.parents @@ -676,3 +657,26 @@ def test_export_node(): assert len(p.value) == len(exp.get_property(p.name).value) assert len(exp.properties) == len(rec_a.properties) assert len(exp.parents) == len(rec_a.parents) + + +def test_remove_merged(simple_adapter): + # We reference an entity that is merged into another node and then remove the merged node + # This should result in the reference being removed + b = db.Record().add_parent("RT3").add_property('a', value=1) + ent_list = [ + db.Record().add_parent("RT3").add_property('a', value=1), + b, + db.Record().add_parent("RT3").add_property('a', value=3).add_property('RT3', value=b), + ] + + st 
= SyncGraph(ent_list, simple_adapter) + se_a = st.nodes[0] + se_c = st.nodes[1] + for node in st.nodes: + print(node) + assert len(st.nodes) == 2 + assert len(st.unchecked) == 2 + st.remove_failed(se_a) + assert len(st.nodes) == 1 + assert len(st.unchecked) == 1 + assert "RT3" not in [p.name for p in se_c.properties]