From db8db7fcb025024331dced2f7af880960fe5957a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Henrik=20tom=20W=C3=B6rden?= <h.tomwoerden@indiscale.com>
Date: Thu, 18 Apr 2024 22:46:11 +0200
Subject: [PATCH] wip

---
 src/caoscrawler/sync_graph.py | 71 +++++++++++++++++++----------------
 unittests/test_sync_graph.py  |  4 +-
 2 files changed, 41 insertions(+), 34 deletions(-)

diff --git a/src/caoscrawler/sync_graph.py b/src/caoscrawler/sync_graph.py
index dd6bcfaa..dce6e053 100644
--- a/src/caoscrawler/sync_graph.py
+++ b/src/caoscrawler/sync_graph.py
@@ -158,22 +158,10 @@ class SyncGraph():
         if se.id is not None:
             raise RuntimeError('cannot update id')
         if node_id is None:
-            if se.path is None and se.identifiable is None:
-                raise RuntimeError("no identifying information")
-            se.id = self._remote_missing_counter
-            self._remote_missing_counter -= 1
-            self._add_any(se, self._missing)
-            self.unchecked.remove(se)
-
-            for other_missing in (self.backward_id_references[se.uuid]
-                                  + self.forward_id_referenced_by[se.uuid]):
-                self.set_id_of_node(other_missing)
-
+            self._treat_missing(se)
         else:
-            assert node_id > 0
             se.id = node_id
-            self._add_any(se, self._existing)
-            self.unchecked.remove(se)
+            self._treat_existing(se)
 
     def set_identifiable_of_node(self, se: SyncNode, identifiable: Identifiable):
         se.identifiable = identifiable
@@ -407,26 +395,28 @@ class SyncGraph():
     def _mark_entities_with_path_or_id(self):
         """ A path or an ID is sufficiently identifying. Thus, those entities can be marked as
         checked """
-        for node in list(self.nodes[::-1]):
-            if node.id is None and node.path is None:
-                continue
+        for node in list(self.nodes):  # pass 1: nodes that already have an ID
+            if node.id is not None:
+                if self.get_equivalent(node) is not None:  # an equivalent node was found
+                    self._merge_into(node, self.get_equivalent(node))
+                else:
+                    self._id_look_up[node.id] = node  # make node retrievable by ID
+                    self._treat_existing(node)
+
+        for node in list(self.nodes):  # pass 2: nodes that have a path
             if node.path is not None:
-                try:
-                    existing = cached_get_entity_by(path=node.path)
-                except EmptyUniqueQueryError:
-                    existing = None
-                if existing is not None:
-                    node.identify_with(existing)
-
-            # at this point, node has an ID if it is existing
-            treated_before = self.get_equivalent(node)
-            if treated_before is None:
-                if node.id is None or node.id < 0:
-                    self.set_missing(node)
+                if self.get_equivalent(node) is not None:
+                    self._merge_into(node, self.get_equivalent(node))
                 else:
-                    self.set_existing(node)
-            else:
-                self._merge_into(node, treated_before)
+                    try:
+                        existing = cached_get_entity_by(path=node.path)
+                    except EmptyUniqueQueryError:
+                        existing = None
+                    remote_id = None  # stays None when the path query found nothing
+                    if existing is not None:
+                        remote_id = existing.id
+                    self._path_look_up[node.path] = node  # make node retrievable by path
+                    self.set_id_of_node(node, remote_id)  # None is treated as missing
 
     def _remove_non_identifiables(self):
         """ A path or an ID is sufficiently identifying. Thus, those entities can be marked as
@@ -515,3 +505,20 @@ the respective attributes exist.
                 else:
                     if id(p.value) in se_lookup:
                         p.value = se_lookup[id(p.value)]
+
+    def _treat_missing(self, node):  # file node under self._missing with a counter-based ID
+        if node.path is None and node.identifiable is None:
+            raise RuntimeError("no identifying information")
+        node.id = self._remote_missing_counter  # ID is taken from the running counter
+        self._remote_missing_counter -= 1  # next ID handed out is one lower
+        self._add_any(node, self._missing)
+        self.unchecked.remove(node)  # node no longer counts as unchecked
+
+        for other_missing in (self.backward_id_references[node.uuid].union(
+                              self.forward_id_referenced_by[node.uuid])):
+            self.set_id_of_node(other_missing)  # no node_id given; presumably defaults to None
+
+    def _treat_existing(self, node):  # file node under self._existing; expects node.id > 0
+        assert node.id > 0  # NOTE: assert statements are skipped when Python runs with -O
+        self._add_any(node, self._existing)
+        self.unchecked.remove(node)  # node no longer counts as unchecked
diff --git a/unittests/test_sync_graph.py b/unittests/test_sync_graph.py
index e2b8fc06..e5321cda 100644
--- a/unittests/test_sync_graph.py
+++ b/unittests/test_sync_graph.py
@@ -315,10 +315,10 @@ def test_merging():
     st = SyncGraph(entlist, ident_adapter)
     assert len(st.unchecked) == 2
     st.set_identifiable_of_node(st.nodes[0],
-                                Identifiable(recordtype="A", name='101', properties={'a': 1}))
+                                Identifiable(record_type="A", name='101', properties={'a': 1}))
     assert len(st.unchecked) == 2
     st.set_identifiable_of_node(st.nodes[1],
-                                Identifiable(recordtype="A", name='101', properties={'a': 1}))
+                                Identifiable(record_type="A", name='101', properties={'a': 1}))
     assert len(st.unchecked) == 1
     assert len(st.nodes) == 1
     assert st.nodes[1].id is None
-- 
GitLab