diff --git a/src/caoscrawler/crawl.py b/src/caoscrawler/crawl.py index ce4a617853b629295079ca8ac76bfa0eb6e93635..0a8c156f603587e894a19b1f356cce2e1dcc5774 100644 --- a/src/caoscrawler/crawl.py +++ b/src/caoscrawler/crawl.py @@ -352,7 +352,6 @@ class Crawler(object): # p.value = cached def split_into_inserts_and_updates(self, st: SyncGraph): - entity_was_treated = True # st.entities contains Entities which could not yet be checked against the remote server while entity_was_treated and len(st.unchecked) > 0: @@ -366,6 +365,8 @@ class Crawler(object): # 2. Does it have to be new since a needed reference is missing? # 3. Can it be checked on the remote server? for se in list(st.unchecked): + if se not in st.unchecked: + continue if st.identity_relies_on_unchecked_entity(se): print(st.nodes.index(se), "relies on unchecked") continue @@ -375,8 +376,8 @@ class Crawler(object): st.set_identifiable_of_node(se, st.identifiableAdapter.get_identifiable( se, st.backward_id_referenced_by[se.uuid])) # entity was merged with another due to the new identifiable - if se not in st.unchecked: - continue + if se not in st.unchecked: + continue # if (equivalent_se.identifiable is None and not # self.identity_relies_on_unchecked_entity(equivalent_se)): diff --git a/src/caoscrawler/identifiable_adapters.py b/src/caoscrawler/identifiable_adapters.py index 7ecb1c55d8bb3d3a162811da474553e2ecb90a27..2405e454f7a17f8a74617349cd95e5b61fe66541 100644 --- a/src/caoscrawler/identifiable_adapters.py +++ b/src/caoscrawler/identifiable_adapters.py @@ -263,6 +263,7 @@ startswith: bool, optional # fill the values: for prop in registered_identifiable.properties: if prop.name == "name": + name = se.name continue # problem: what happens with multi properties? # case A: in the registered identifiable @@ -270,6 +271,8 @@ startswith: bool, optional # treated elsewhere if prop.name.lower() == "is_referenced_by": + for el in identifiable_backrefs: + assert isinstance(el, SyncNode) if len(identifiable_backrefs) == 0: raise RuntimeError( f"Could not find referencing entities of type(s): {prop.value}\n" @@ -277,6 +280,10 @@ startswith: bool, optional f"There were {len(identifiable_backrefs)} referencing entities to choose from.\n" f"This error can also occur in case of merge conflicts in the referencing entities." ) + elif len([e.id for e in identifiable_backrefs if el.id is None]) > 0: + raise RuntimeError( + f"Referencing entity has no id" + ) continue options = [p.value for p in se.properties if p.name == prop.name] @@ -316,7 +323,7 @@ startswith: bool, optional path=se.path, record_type=(registered_identifiable.parents[0].name if registered_identifiable else None), - name=se.name, + name=name, properties=identifiable_props, backrefs=[e.id for e in identifiable_backrefs] ) diff --git a/src/caoscrawler/sync_graph.py b/src/caoscrawler/sync_graph.py index 64058d374c8443a531176adf71d314d1974b45bc..ac484df26d0c43ad4e8fb831b4e1bb1b80a3819f 100644 --- a/src/caoscrawler/sync_graph.py +++ b/src/caoscrawler/sync_graph.py @@ -101,7 +101,8 @@ class SyncGraph(): """sets the ID attribute of the given SyncNode. If node_id is None, a negative Id will be given indicating that the node does not exist on the remote server""" if node.id is not None: - raise RuntimeError('cannot update id') + raise RuntimeError('Cannot update ID.\n' + f'It already is {node.id} and shall be set to {node_id}.') if node_id is None: node_id = self._get_new_id() node.id = node_id @@ -113,15 +114,6 @@ class SyncGraph(): self._treat_missing(node) else: self._treat_existing(node) - for other_node in self.backward_id_references[node.uuid].union( - self.forward_id_referenced_by[node.uuid]): - try: - identifiable = self.identifiableAdapter.get_identifiable( - other_node, self.backward_id_referenced_by[other_node.uuid]) - self.set_identifiable_of_node(other_node, identifiable) - except Exception as es: - print(es) - pass def _get_new_id(self): self._remote_missing_counter -= 1 @@ -145,7 +137,6 @@ class SyncGraph(): node_map[id(el)] = entities[-1] for ent in entities: for p in ent.properties: - # For lists append each element that is of type Entity to flat: if isinstance(p.value, list): for ii, el in enumerate(p.value): if isinstance(el, SyncNode): @@ -207,12 +198,18 @@ class SyncGraph(): Equivalent means that ID, path or identifiable are the same. """ if entity.id is not None and entity.id in self._id_look_up: - return self._id_look_up[entity.id] + candidate = self._id_look_up[entity.id] + if candidate is not entity: + return candidate if entity.path is not None and entity.path in self._path_look_up: - return self._path_look_up[entity.path] + candidate = self._path_look_up[entity.path] + if candidate is not entity: + return candidate if (entity.identifiable is not None and entity.identifiable.get_representation() in self._identifiable_look_up): - return self._identifiable_look_up[entity.identifiable.get_representation()] + candidate = self._identifiable_look_up[entity.identifiable.get_representation()] + if candidate is not entity: + return candidate # def get_equivalent_existing(self, se): # """ Check whether this Record exists on the remote server @@ -492,15 +489,31 @@ class SyncGraph(): self._missing[id(node)] = node self.unchecked.remove(node) - for other_missing in (self.backward_id_references[node.uuid].union( - self.forward_id_referenced_by[node.uuid])): - if other_missing in self.unchecked: - self.set_id_of_node(other_missing) + for other_node in (self.backward_id_references[node.uuid].union( + self.forward_id_referenced_by[node.uuid])): + try: + identifiable = self.identifiableAdapter.get_identifiable( + other_node, self.backward_id_referenced_by[other_node.uuid]) + self.set_identifiable_of_node(other_node, identifiable) + except Exception as es: + print(es) + pass + if other_node in self.unchecked: + self.set_id_of_node(other_node) def _treat_existing(self, node): assert node.id > 0 self._existing[id(node)] = node self.unchecked.remove(node) + for other_node in (self.backward_id_references[node.uuid].union( + self.forward_id_referenced_by[node.uuid])): + try: + identifiable = self.identifiableAdapter.get_identifiable( + other_node, self.backward_id_referenced_by[other_node.uuid]) + self.set_identifiable_of_node(other_node, identifiable) + except Exception as es: + print(es) + pass def __repr__(self): return f"SyncNode with ID={self.id}" diff --git a/src/caoscrawler/sync_node.py b/src/caoscrawler/sync_node.py index d75ce1d4f56e238f80decf8a8329d3ef08ea7a2f..0c6c32b1922656c6c6d6cecb9ba1ca0c6b551c61 100644 --- a/src/caoscrawler/sync_node.py +++ b/src/caoscrawler/sync_node.py @@ -93,7 +93,8 @@ class SyncNode(): for p in self.properties: if ent.get_property(p) is not None: if ent.get_property(p).value != p.value: - raise Exception() + raise db.apiutils.EntityMergeConflictError(f"Differing values were set for Property {p.name}:\n" + f"{ent.get_property(p).value}\n{p.value}") else: ent.add_property(p) return ent diff --git a/unittests/test_crawler.py b/unittests/test_crawler.py index 14167968b9816bc3f5c55b6dac8bec669e6ded8f..bc6b5d43e79aff5974c9745a489716b7d9b4763b 100644 --- a/unittests/test_crawler.py +++ b/unittests/test_crawler.py @@ -301,10 +301,6 @@ def test_split_into_inserts_and_updates_single(crawler_mocked_identifiable_retri db.Record(name="B").add_parent("C")] st = SyncGraph(entlist, crawler.identifiableAdapter) - assert st.get_equivalent(st.nodes[0]) is None - assert st.get_equivalent(st.nodes[0]) is None - assert not st.identity_relies_on_unchecked_entity(st.nodes[0]) - assert not st.identity_relies_on_unchecked_entity(st.nodes[1]) assert crawler.identifiableAdapter.retrieve_identified_record_for_record( identlist[0]).id == 1111 assert crawler.identifiableAdapter.retrieve_identified_record_for_record( @@ -378,9 +374,9 @@ def test_split_into_inserts_and_updates_with_complex(crawler_mocked_identifiable b = db.Record(name="B").add_parent("C") g = db.Record(name="G").add_parent("C") f = db.Record(name="F").add_parent("C") - g.add_property("A", a) - b.add_property("A", f) + g.add_property("C", b) b.add_property("A", a) + b.add_property("C", f) entlist = [a, b, g] st = SyncGraph(entlist, crawler.identifiableAdapter) insert, update = crawler.split_into_inserts_and_updates(st) @@ -483,18 +479,22 @@ a: ([b1, b2]) crawler = Crawler(identifiableAdapter=ident_adapter) st = SyncGraph(deepcopy([rec_a, *rec_b, *rec_c]), crawler.identifiableAdapter) + for node in st.nodes: + print(node.id, node.parents) assert st.identity_relies_on_unchecked_entity(st.nodes[0]) is False assert st.identity_relies_on_unchecked_entity(st.nodes[1]) assert st.identity_relies_on_unchecked_entity(st.nodes[2]) assert st.identity_relies_on_unchecked_entity(st.nodes[3]) assert st.identity_relies_on_unchecked_entity(st.nodes[4]) - st.nodes[0].identifiable = Identifiable(path='a') # dummy identifiable - st.set_missing(st.nodes[0]) - assert st.identity_relies_on_unchecked_entity(st.nodes[1]) is False + print("SEEEEEEEEEEEEEEEET") + print(st.nodes[1].properties) + print(st.nodes[2].properties) + assert len(st.unchecked) == 5 + # The Cs cannot be merged due to different identifying properties + # The Bs cannot be merged due to differeng references to Cs with raises(db.apiutils.EntityMergeConflictError) as rte: - crawler.synchronize(commit_changes=False, - crawled_data=[rec_a, *rec_b, *rec_c]) + crawler.split_into_inserts_and_updates(st) # assert not isinstance(rte.value, NotImplementedError), \ # "Exception must not be NotImplementedError, but plain RuntimeError." # assert "Could not find referencing entities" in rte.value.args[0] @@ -734,8 +734,8 @@ def test_split_into_inserts_and_updates_mult_backref(crawler_mocked_for_backref_ crawler = crawler_mocked_for_backref_test referenced = db.Record(name="B").add_parent("C") entlist = [referenced, - db.Record(name="A").add_parent("BR").add_property("ref", referenced), - db.Record(name="C").add_parent("BR").add_property("ref", referenced), + db.Record(id=1, name="A").add_parent("BR").add_property("ref", referenced), + db.Record(id=2, name="C").add_parent("BR").add_property("ref", referenced), ] # test whether both entities are listed in the backref attribute of the identifiable @@ -748,8 +748,8 @@ def test_split_into_inserts_and_updates_mult_backref(crawler_mocked_for_backref_ # check the split... insert, update = crawler.split_into_inserts_and_updates(st) - assert len(update) == 1 - assert len(insert) == 2 + assert len(update) == 2 + assert len(insert) == 1 @ patch("caoscrawler.identifiable_adapters.get_children_of_rt", @@ -759,8 +759,8 @@ def test_split_into_inserts_and_updates_diff_backref(crawler_mocked_for_backref_ crawler = crawler_mocked_for_backref_test referenced = db.Record(name="B").add_parent("D") entlist = [referenced, - db.Record(name="A").add_parent("BR").add_property("ref", referenced), - db.Record(name="A").add_parent("BR2").add_property("ref", referenced), + db.Record(id=1, name="A").add_parent("BR").add_property("ref", referenced), + db.Record(id=2, name="A").add_parent("BR2").add_property("ref", referenced), ] # test whether both entities are listed in the backref attribute of the identifiable @@ -1017,47 +1017,42 @@ def test_treated_record_lookup(): ident_adapter = CaosDBIdentifiableAdapter() trlu = SyncGraph([db.Record().add_parent( 'A'), db.Record().add_parent('A'), db.File()], ident_adapter) - exist = trlu.se[0] - miss = trlu.se[1] - fi = trlu.se[2] - exist.id = 1 - trlu.set_existing(exist) + exist = trlu.nodes[0] + miss = trlu.nodes[1] + fi = trlu.nodes[2] + trlu.set_id_of_node(exist, 1) assert len(trlu._existing) == 1 # was added to existing assert trlu._existing[id(exist)] is exist # is in ID lookup assert trlu._id_look_up[exist.id] is exist - # exception when identifiable is missing - with raises(RuntimeError): - trlu.set_missing(miss) - miss.identifiable = Identifiable(name='a') - trlu.set_missing(miss) + trlu.set_identifiable_of_node(miss, Identifiable(name='a')) + trlu.set_id_of_node(miss) # was added to missing assert trlu._missing[id(miss)] is miss # is in ident lookup assert trlu._identifiable_look_up[miss.identifiable.get_representation()] is miss - fi.path = 'a' - fi.id = 2 - trlu.set_existing(fi) - assert len(trlu._existing) == 2 - # was added to existing - assert trlu._existing[id(fi)] is fi - # is in ID lookup - assert trlu._id_look_up[fi.id] is fi - # is in path lookup - assert trlu._path_look_up[fi.path] is fi - - all_miss, all_exi = trlu.create_record_lists() - assert fi.fragments[0] in all_exi - assert exist.fragments[0] in all_exi - assert miss.fragments[0] in all_miss - - # If a Record was added using the ID, the ID must be used to identify it even though later an - # identifiable may be passed as well - exist.identifiable = Identifiable(name='b') - assert trlu.get_equivalent(exist) is exist +# fi.path = 'a' +# trlu.set_id_of_node(fi, 2) +# assert len(trlu._existing) == 2 +# # was added to existing +# assert trlu._existing[id(fi)] is fi +# # is in ID lookup +# assert trlu._id_look_up[fi.id] is fi +# # is in path lookup +# assert trlu._path_look_up[fi.path] is fi +# +# all_miss, all_exi = trlu.create_record_lists() +# assert fi.fragments[0] in all_exi +# assert exist.fragments[0] in all_exi +# assert miss.fragments[0] in all_miss +# +# # If a Record was added using the ID, the ID must be used to identify it even though later an +# # identifiable may be passed as well +# exist.identifiable = Identifiable(name='b') +# assert trlu.get_equivalent(exist) is exist def test_merge_entity_with_identifying_reference(crawler_mocked_identifiable_retrieve): diff --git a/unittests/test_identifiable.py b/unittests/test_identifiable.py index c79498ed8700f8418461c8047f9823529747ae47..32bd729e5c09e61f064cb58750c78ae265f56539 100644 --- a/unittests/test_identifiable.py +++ b/unittests/test_identifiable.py @@ -27,6 +27,7 @@ test identifiable module import caosdb as db import pytest from caoscrawler.identifiable import Identifiable +from caoscrawler.sync_node import SyncNode def test_create_hashable_string(): @@ -42,20 +43,20 @@ def test_create_hashable_string(): assert ( Identifiable._create_hashable_string( Identifiable(name="A", record_type="B", - properties={'a': db.Record(id=12)}) + properties={'a': SyncNode(db.Record(id=12))}) ) == "P<B>N<A>R<[]>a:12") a = Identifiable._create_hashable_string( - Identifiable(name="A", record_type="B", properties={'a': [db.Record(id=12)]})) + Identifiable(name="A", record_type="B", properties={'a': [SyncNode(db.Record(id=12))]})) assert (a == "P<B>N<A>R<[]>a:[12]") assert (Identifiable._create_hashable_string( Identifiable(name="A", record_type="B", properties={'a': [12]})) == "P<B>N<A>R<[]>a:[12]") assert ( Identifiable._create_hashable_string( Identifiable(name="A", record_type="B", properties={ - 'a': [db.Record(id=12), 11]}) + 'a': [SyncNode(db.Record(id=12)), 11]}) ) == "P<B>N<A>R<[]>a:[12, 11]") assert Identifiable._create_hashable_string( - Identifiable(name="A", record_type="B", backrefs=[123, db.Entity(id=124)], + Identifiable(name="A", record_type="B", backrefs=[123, SyncNode(db.Record(id=124))], properties={'a': 5})) == "P<B>N<A>R<['123', '124']>a:5" @@ -68,9 +69,9 @@ def test_repr(): # only test that something meaningful is returned assert 'properties' in str(Identifiable(name="A", record_type="B")) assert str(Identifiable(name="A", record_type="B", properties={'a': 0})).split( - "properties:\n")[1].split('\n')[0] == '{"a": 0}' + "properties:\n")[1].split('\n')[0] == '{"a": "0"}' assert str(Identifiable(name="A", record_type="B", properties={'a': 0, 'b': "test"})).split( - "properties:\n")[1].split('\n')[0] == '{"a": 0, "b": "test"}' + "properties:\n")[1].split('\n')[0] == '{"a": "0", "b": "test"}' # TODO(henrik): Add a test using backrefs once that's implemented. diff --git a/unittests/test_identifiable_adapters.py b/unittests/test_identifiable_adapters.py index 167c84e1cf4e11e1b19d5cee9fa633dc9e934dee..3f30f798d43cdaa09aeef26c193302f4cc704412 100644 --- a/unittests/test_identifiable_adapters.py +++ b/unittests/test_identifiable_adapters.py @@ -138,8 +138,8 @@ def test_wildcard_ref(): dummy = SyncNode(db.Record(), None) dummy.id = 1 identifiable = ident.get_identifiable(SyncNode(rec, db.RecordType() - .add_parent(name="Person") - .add_property(name="is_referenced_by", value=["*"])), + .add_parent(name="Person") + .add_property(name="is_referenced_by", value=["*"])), [dummy] ) @@ -163,13 +163,12 @@ def test_get_identifiable(): .add_property(name="date", value="2022-02-01") .add_property(name="result", value="FAIL")) se = SyncNode(rec, - ident.get_registered_identifiable(rec)) + ident.get_registered_identifiable(rec)) id_r0 = ident.get_identifiable(se, []) - r_cur = se.fragments[0] - assert r_cur.parents[0].name == id_r0.record_type - assert r_cur.get_property("date").value == id_r0.properties["date"] - assert len(r_cur.parents) == 1 - assert len(r_cur.properties) == 2 + assert rec.parents[0].name == id_r0.record_type + assert rec.get_property("date").value == id_r0.properties["date"] + assert len(rec.parents) == 1 + assert len(rec.properties) == 2 assert len(id_r0.properties) == 1 ident = CaosDBIdentifiableAdapter() @@ -180,7 +179,7 @@ def test_get_identifiable(): .add_property(name="a", value="2022-02-01") .add_property(name="result", value="FAIL")) se = SyncNode(rec, ident.get_registered_identifiable(rec)) - se.fragments.extend([ + for el in [ db.Record() .add_parent(name="A", id=3) .add_property(name="a", value="2022-02-01") @@ -189,12 +188,12 @@ def test_get_identifiable(): .add_parent(name="A", id=3) .add_property(name="a", value="2022-02-01") .add_property(name="result", value="FAIL"), - ]) + ]: + se.update(SyncNode(el)) id_r0 = ident.get_identifiable(se, []) - r_cur = se.fragments[0] - assert r_cur.parents[0].name == id_r0.record_type - assert r_cur.get_property("a").value == id_r0.properties["a"] + assert "A" == id_r0.record_type + assert "2022-02-01" == id_r0.properties["a"] assert 'a' == id_r0.name assert len(id_r0.properties) == 1 @@ -203,11 +202,11 @@ def test_get_identifiable(): .add_property(name="a", value="2") ) se = SyncNode(rec, ident.get_registered_identifiable(rec)) - se.fragments.extend([ + se.update(SyncNode( db.Record(name='a') .add_parent(name="A") .add_property(name="a", value="3") - ]) + )) with pytest.raises(RuntimeError): id_r0 = ident.get_identifiable(se, []) diff --git a/unittests/test_sync_graph.py b/unittests/test_sync_graph.py index 01efa4f4a25febfddfaf387cf2a6f344f03cefc5..0c39899e8c21bffeb249a20ed00d177c5360f5e5 100644 --- a/unittests/test_sync_graph.py +++ b/unittests/test_sync_graph.py @@ -391,6 +391,66 @@ def test_set_id_of_node(simple_adapter): assert st.nodes[0].identifiable is not None assert len(st.nodes) == 2 + # Test for meaningful exception when referencing a list of unmergeable entities. + # + # Datamodel + # --------- + # A: + # B: LIST<B> + # prop_ident: INTEGER + # + # B: + # prop_ident: + # + # + # Identifiables + # ------------- + # + # id_A: [prop_ident] + # id_B: [prop_ident, "is_referenced_by: A"] + # + # Data + # ---- + # + # + # b1: ("same", c1) + # b2: ("same", c2) + # + # a: ([b1, b2]) + + prop_ident = db.Property("prop_ident", datatype=db.INTEGER) + prop_other = db.Property("prop_ident", datatype=db.INTEGER) + # Somehow it is necessary that `B` has a reference property. Dunno if C must have an + # identifiable as well. + rt_b = db.RecordType("B").add_property(prop_ident).add_property("C") + rt_a = db.RecordType("A").add_property(prop_ident).add_property("LIST<B>") + + ident_a = db.RecordType().add_parent("A").add_property("prop_ident") + ident_b = db.RecordType().add_parent("B").add_property("prop_ident").add_property( + "is_referenced_by", value="A") + + rec_a = db.Record("a").add_parent(rt_a).add_property("prop_ident", value=1234) + rec_b = [] + for value in [23, 42]: + rec_b.append(db.Record().add_parent(rt_b).add_property("prop_ident", value=2020)) + rec_a.add_property("B", rec_b) + + ident_adapter = CaosDBIdentifiableAdapter() + ident_adapter.register_identifiable("A", ident_a) + ident_adapter.register_identifiable("B", ident_b) + + st = SyncGraph([rec_a, *rec_b], ident_adapter) + for node in st.nodes: + print(node.id, node.parents) + assert st.identity_relies_on_unchecked_entity(st.nodes[0]) is False + assert st.identity_relies_on_unchecked_entity(st.nodes[1]) + assert st.identity_relies_on_unchecked_entity(st.nodes[2]) + assert len(st.nodes) == 3 + assert len(st.unchecked) == 3 + st.set_id_of_node(st.nodes[0]) + assert len(st.nodes) == 2 + assert len(st.unchecked) == 0 + @patch("caoscrawler.sync_graph.cached_get_entity_by", new=Mock(side_effect=mock_get_entity_by)) @@ -497,9 +557,9 @@ def test_sync_node(): sn_a.update(sn_b) assert sn_a.id == rec_b.id assert sn_a.name == rec_a.name - for p in rec_a.parents+rec_b.parents: + for p in rec_a.parents + rec_b.parents: assert p in sn_a.parents - for p in rec_a.properties+rec_b.properties: + for p in rec_a.properties + rec_b.properties: assert p in sn_a.properties assert sn_a.description == rec_b.description assert sn_a.role == rec_a.role @@ -507,13 +567,13 @@ def test_sync_node(): export = sn_a.export_entity() assert export.id == rec_b.id assert export.name == rec_a.name - for p in rec_a.parents+rec_b.parents: + for p in rec_a.parents + rec_b.parents: assert p in export.parents # if p.name is not None: # assert p.name in [el.name for el in export.parents] # if p.id is not None: # assert p.id in [el.id for el in export.parents] - for p in rec_a.properties+rec_b.properties: + for p in rec_a.properties + rec_b.properties: if p.name is not None: assert p.name in [el.name for el in export.properties] if p.id is not None: @@ -540,9 +600,9 @@ def test_sync_node(): sn_a.update(sn_b) assert sn_a.id == rec_b.id assert sn_a.name == rec_a.name - for p in rec_a.parents+rec_b.parents: + for p in rec_a.parents + rec_b.parents: assert p in sn_a.parents - for p in rec_a.properties+rec_b.properties: + for p in rec_a.properties + rec_b.properties: assert p in sn_a.properties assert sn_a.description == rec_b.description assert sn_a.role == rec_a.role