diff --git a/src/caoscrawler/identifiable_adapters.py b/src/caoscrawler/identifiable_adapters.py index 1b8525db01508018a819b72721bc829b1e304276..7f54f39674ef69703615fe3a985aeadb9296d627 100644 --- a/src/caoscrawler/identifiable_adapters.py +++ b/src/caoscrawler/identifiable_adapters.py @@ -309,8 +309,7 @@ startswith: bool, optional raise MissingReferencingEntityError( f"Could not find referencing entities of type(s): {prop.value}\n" f"for registered identifiable:\n{registered_identifiable}\n" - f"There were {len(identifiable_backrefs) - } referencing entities to choose from.\n" + f"There were {len(identifiable_backrefs)} referencing entities to choose from.\n" f"This error can also occur in case of merge conflicts in the referencing entities." ) elif len([e.id for e in identifiable_backrefs if el.id is None]) > 0: diff --git a/src/caoscrawler/sync_graph.py b/src/caoscrawler/sync_graph.py index 26ef7f82c2ba953f0bf1f55bff0f8ce73f3daefb..6d2f9336fe0f739fc8ffeb70c9c6e39bdc44d947 100644 --- a/src/caoscrawler/sync_graph.py +++ b/src/caoscrawler/sync_graph.py @@ -113,6 +113,7 @@ class SyncGraph(): self._identifiable_look_up: Dict[str, SyncNode] = {} self._missing: Dict[int, SyncNode] = {} self._existing: Dict[int, SyncNode] = {} + self._nonidentifiable: Dict[int, SyncNode] = {} # entities that are missing get negative IDs to allow identifiable creation self._remote_missing_counter = -1 @@ -408,6 +409,7 @@ class SyncGraph(): for node in list(self.nodes[::-1]): if "nonidentifiable" in [p.name for p in node.registered_identifiable.properties]: self.unchecked.remove(node) + self._nonidentifiable[id(node)] = node def _merge_into(self, source: SyncNode, target: SyncNode): """ FIXME tries to merge record into newrecord @@ -465,23 +467,27 @@ class SyncGraph(): self.nodes.remove(source) if source in self.unchecked: self.unchecked.remove(source) - else: - self.unchecked.remove(target) - assert id(source) not in self._missing - assert id(source) not in self._existing + # update look ups if target.id is not None: self._id_look_up[target.id] = target if target.path is not None: self._path_look_up[target.path] = target + if target.identifiable is not None: + self._identifiable_look_up[target.identifiable.get_representation()] = target + + if ((id(source) in self._existing and id(target) in self._missing) + or (id(target) in self._existing and id(source) in self._missing)): + raise RuntimeError("Trying to merge missing and existing") + - # due to the merge it might now be possible to create an identifiable - if self._identifiable_is_needed(target): - self._set_identifiable_of_node(target) if id(source) in self._missing and id(target) not in self._missing: self._mark_missing(target) if id(source) in self._existing and id(target) not in self._existing: self._mark_existing(target) + # due to the merge it might now be possible to create an identifiable + if self._identifiable_is_needed(target): + self._set_identifiable_of_node(target) # This is one of three cases that affect other nodes: # - mark existing # - mark missing diff --git a/src/caoscrawler/sync_node.py b/src/caoscrawler/sync_node.py index 492839a9b9106ef591f8219f3610aca4997866c3..1daf358f1877c9f454d5982d8bfe999aa7edf6ec 100644 --- a/src/caoscrawler/sync_node.py +++ b/src/caoscrawler/sync_node.py @@ -74,6 +74,8 @@ class SyncNode(): if other.identifiable is not None and self.identifiable is not None: assert (other.identifiable.get_representation() == self.identifiable.get_representation()) + if other.identifiable: + self.identifiable = other.identifiable for attr in ["id", "path", "file", "role", "path", "name", "description"]: if other.__getattribute__(attr) is not None: if self.__getattribute__(attr) is None: diff --git a/unittests/test_sync_graph.py b/unittests/test_sync_graph.py index 7927c705dc3677c0a92a6eb6e8967ca0eb8c309a..1c34daf034708a68ec324933404ea8b09af4053e 100644 --- a/unittests/test_sync_graph.py +++ b/unittests/test_sync_graph.py @@ -332,7 +332,7 @@ def test_set_id_of_node(simple_adapter): st.set_id_of_node(st.unchecked[0], 101) assert len(st.nodes) == 1 assert len(st.unchecked) == 0 - assert st.nodes[0].properties[0].name == "a" + assert st.nodes[0].properties[0].name == "RT2" # setting the id to None should lead to depending nodes marked as missing ent_list = [ @@ -410,9 +410,9 @@ def test_set_id_of_node(simple_adapter): st.export_record_lists() ent_list = [ db.Record().add_parent("RT3").add_property('a', value=1) - .add_property('b', value=db.Record().add_parent("RT5")), + .add_property('b', value=db.Record(name='b').add_parent("RT5")), db.Record().add_parent("RT3").add_property('a', value=1) - .add_property('b', value=db.Record().add_parent("RT5")), + .add_property('b', value=db.Record(name='a').add_parent("RT5")), ] st = SyncGraph(ent_list, simple_adapter)