diff --git a/src/caoscrawler/sync_graph.py b/src/caoscrawler/sync_graph.py index dd6bcfaa65ea70f5aeb7b7fbbfb853c033c7ad88..dce6e05375e70443175272bb6c01098d6edebdd7 100644 --- a/src/caoscrawler/sync_graph.py +++ b/src/caoscrawler/sync_graph.py @@ -158,22 +158,10 @@ class SyncGraph(): if se.id is not None: raise RuntimeError('cannot update id') if node_id is None: - if se.path is None and se.identifiable is None: - raise RuntimeError("no identifying information") - se.id = self._remote_missing_counter - self._remote_missing_counter -= 1 - self._add_any(se, self._missing) - self.unchecked.remove(se) - - for other_missing in (self.backward_id_references[se.uuid] - + self.forward_id_referenced_by[se.uuid]): - self.set_id_of_node(other_missing) - + self._treat_missing(se) else: - assert node_id > 0 se.id = node_id - self._add_any(se, self._existing) - self.unchecked.remove(se) + self._treat_existing(se) def set_identifiable_of_node(self, se: SyncNode, identifiable: Identifiable): se.identifiable = identifiable @@ -407,26 +395,28 @@ class SyncGraph(): def _mark_entities_with_path_or_id(self): """ A path or an ID is sufficiently identifying. Thus, those entities can be marked as checked """ - for node in list(self.nodes[::-1]): - if node.id is None and node.path is None: - continue + for node in list(self.nodes): + if node.id is not None: + if self.get_equivalent(node) is not None: + self._merge_into(node, self.get_equivalent(node)) + else: + self._id_look_up[node.id] = node + self._treat_existing(node) + + for node in list(self.nodes): if node.path is not None: - try: - existing = cached_get_entity_by(path=node.path) - except EmptyUniqueQueryError: - existing = None - if existing is not None: - node.identify_with(existing) - - # at this point, node has an ID if it is existing - treated_before = self.get_equivalent(node) - if treated_before is None: - if node.id is None or node.id < 0: - self.set_missing(node) + if self.get_equivalent(node) is not None: + self._merge_into(node, self.get_equivalent(node)) else: - self.set_existing(node) - else: - self._merge_into(node, treated_before) + try: + existing = cached_get_entity_by(path=node.path) + except EmptyUniqueQueryError: + existing = None + remote_id = None + if existing is not None: + remote_id = existing.id + self._path_look_up[node.path] = node + self.set_id_of_node(node, remote_id) def _remove_non_identifiables(self): """ A path or an ID is sufficiently identifying. Thus, those entities can be marked as @@ -515,3 +505,20 @@ the respective attributes exist. else: if id(p.value) in se_lookup: p.value = se_lookup[id(p.value)] + + def _treat_missing(self, node): + if node.path is None and node.identifiable is None: + raise RuntimeError("no identifying information") + node.id = self._remote_missing_counter + self._remote_missing_counter -= 1 + self._add_any(node, self._missing) + self.unchecked.remove(node) + + for other_missing in (self.backward_id_references[node.uuid].union( + self.forward_id_referenced_by[node.uuid])): + self.set_id_of_node(other_missing) + + def _treat_existing(self, node): + assert node.id > 0 + self._add_any(node, self._existing) + self.unchecked.remove(node) diff --git a/unittests/test_sync_graph.py b/unittests/test_sync_graph.py index e2b8fc0622c6f0bec0e4d4f1b151696d431a18e1..e5321cdafeeb30b8c52cc7ad3f0d375a92265003 100644 --- a/unittests/test_sync_graph.py +++ b/unittests/test_sync_graph.py @@ -315,10 +315,10 @@ def test_merging(): st = SyncGraph(entlist, ident_adapter) assert len(st.unchecked) == 2 st.set_identifiable_of_node(st.nodes[0], - Identifiable(recordtype="A", name='101', properties={'a': 1})) + Identifiable(record_type="A", name='101', properties={'a': 1})) assert len(st.unchecked) == 2 st.set_identifiable_of_node(st.nodes[1], - Identifiable(recordtype="A", name='101', properties={'a': 1})) + Identifiable(record_type="A", name='101', properties={'a': 1})) assert len(st.unchecked) == 1 assert len(st.nodes) == 1 assert st.nodes[1].id is None