From 21da31c253de06ffa8e562deaa2bdcc6e995ec44 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Henrik=20tom=20W=C3=B6rden?= <h.tomwoerden@indiscale.com>
Date: Tue, 28 May 2024 11:08:00 +0200
Subject: [PATCH] rename_refs

---
 src/caoscrawler/sync_graph.py |  87 ++++++++---------
 unittests/test_crawler.py     |   4 +-
 unittests/test_sync_graph.py  | 174 +++++++++++++++++-----------------
 3 files changed, 133 insertions(+), 132 deletions(-)

diff --git a/src/caoscrawler/sync_graph.py b/src/caoscrawler/sync_graph.py
index 652f96a5..d317b0c8 100644
--- a/src/caoscrawler/sync_graph.py
+++ b/src/caoscrawler/sync_graph.py
@@ -156,14 +156,14 @@ class SyncGraph():
         # list all SemanticEntities that have not yet been checked
         self.unchecked = list(self.nodes)
 
-        # initialize reference mappings
+        # initialize reference mappings (see _create_reference_mapping)
         (
             self.forward_references,
             self.backward_references,
-            self.forward_id_references,
-            self.backward_id_references,
-            self.forward_id_referenced_by,
-            self.backward_id_referenced_by,
+            self.forward_references_id_props,
+            self.backward_references_id_props,
+            self.forward_references_backref,
+            self.backward_references_backref,
         ) = self._create_reference_mapping(self.nodes)
 
         # remove entities with path or ID from unchecked list
@@ -250,9 +250,9 @@ class SyncGraph():
         """
 
         return any([id(ent) not in self._missing and id(ent) not in self._existing
-                    for ent in self.forward_id_references[id(node)]]
+                    for ent in self.forward_references_id_props[id(node)]]
                    + [id(ent) not in self._missing and id(ent) not in self._existing
-                      for ent in self.backward_id_referenced_by[id(node)]])
+                      for ent in self.backward_references_backref[id(node)]])
 
     def unchecked_contains_circular_dependency(self):
         """
@@ -328,7 +328,7 @@ class SyncGraph():
         if identifiable is None:
             self.identifiableAdapter.all_identifying_properties_exist(node)
             identifiable = self.identifiableAdapter.get_identifiable(
-                node, self.backward_id_referenced_by[id(node)])
+                node, self.backward_references_backref[id(node)])
         node.identifiable = identifiable
         equivalent_se = self.get_equivalent(node)
         if equivalent_se is not None and equivalent_se is not node:
@@ -362,8 +362,8 @@ class SyncGraph():
 
         Last review by Alexander Schlemmer on 2024-05-24.
         """
-        return (self.backward_id_references[id(node)].union(
-                self.forward_id_referenced_by[id(node)]))
+        return (self.backward_references_id_props[id(node)].union(
+                self.forward_references_backref[id(node)]))
 
     @staticmethod
     def _create_flat_list(ent_list: list[db.Entity], flat: Optional[list[db.Entity]] = None):
@@ -407,25 +407,26 @@ class SyncGraph():
 
         Then there are three kinds of maps being generated: One includes all references
         ("_references"), one includes references that are values of identifying properties
-        ("_id_references") and one includes references that are relevant for identifying
-        backreferences/"is_referenced_by" ("_id_references_by").
+        ("_references_id_props") and one includes references that are relevant for identifying
+        backreferences/"is_referenced_by" ("_references_backref"). I.e. the latter two are subsets
+        of the first ("_references") map.
         """
         # TODO we need to treat children of RecordTypes somehow.
         forward_references: dict[int, set[SyncNode]] = {}
         backward_references: dict[int, set[SyncNode]] = {}
-        forward_id_references: dict[int, set[SyncNode]] = {}
-        backward_id_references: dict[int, set[SyncNode]] = {}
-        forward_id_referenced_by: dict[int, set[SyncNode]] = {}
-        backward_id_referenced_by: dict[int, set[SyncNode]] = {}
+        forward_references_id_props: dict[int, set[SyncNode]] = {}
+        backward_references_id_props: dict[int, set[SyncNode]] = {}
+        forward_references_backref: dict[int, set[SyncNode]] = {}
+        backward_references_backref: dict[int, set[SyncNode]] = {}
 
         # initialize with empty lists/dict
         for node in flat:
             forward_references[id(node)] = set()
             backward_references[id(node)] = set()
-            forward_id_references[id(node)] = set()
-            backward_id_references[id(node)] = set()
-            forward_id_referenced_by[id(node)] = set()
-            backward_id_referenced_by[id(node)] = set()
+            forward_references_id_props[id(node)] = set()
+            backward_references_id_props[id(node)] = set()
+            forward_references_backref[id(node)] = set()
+            backward_references_backref[id(node)] = set()
         for node in flat:
             for p in node.properties:
                 val = p.value
@@ -439,16 +440,16 @@ class SyncGraph():
                                 and len([el.name
                                          for el in node.registered_identifiable.properties if
                                          el.name == p.name]) > 0):
-                            forward_id_references[id(node)].add(v)
-                            backward_id_references[id(v)].add(node)
+                            forward_references_id_props[id(node)].add(v)
+                            backward_references_id_props[id(v)].add(node)
                         if (v.registered_identifiable is not None and
                                 IdentifiableAdapter.referencing_entity_has_appropriate_type(
                                 node.parents, v.registered_identifiable)):
-                            forward_id_referenced_by[id(node)].add(v)
-                            backward_id_referenced_by[id(v)].add(node)
+                            forward_references_backref[id(node)].add(v)
+                            backward_references_backref[id(v)].add(node)
 
-        return (forward_references, backward_references, forward_id_references,
-                backward_id_references, forward_id_referenced_by, backward_id_referenced_by,
+        return (forward_references, backward_references, forward_references_id_props,
+                backward_references_id_props, forward_references_backref, backward_references_backref,
                 )
 
     def _mark_entities_with_path_or_id(self):
@@ -523,23 +524,23 @@ class SyncGraph():
             self.forward_references[id(node)].remove(source)
             self.forward_references[id(node)].add(target)
 
-        for node in self.forward_id_references.pop(id(source)):
-            self.forward_id_references[id(target)].add(node)
-            self.backward_id_references[id(node)].remove(source)
-            self.backward_id_references[id(node)].add(target)
-        for node in self.backward_id_references.pop(id(source)):
-            self.backward_id_references[id(target)].add(node)
-            self.forward_id_references[id(node)].remove(source)
-            self.forward_id_references[id(node)].add(target)
-
-        for node in self.forward_id_referenced_by.pop(id(source)):
-            self.forward_id_referenced_by[id(target)].add(node)
-            self.backward_id_referenced_by[id(node)].remove(source)
-            self.backward_id_referenced_by[id(node)].add(target)
-        for node in self.backward_id_referenced_by.pop(id(source)):
-            self.backward_id_referenced_by[id(target)].add(node)
-            self.forward_id_referenced_by[id(node)].remove(source)
-            self.forward_id_referenced_by[id(node)].add(target)
+        for node in self.forward_references_id_props.pop(id(source)):
+            self.forward_references_id_props[id(target)].add(node)
+            self.backward_references_id_props[id(node)].remove(source)
+            self.backward_references_id_props[id(node)].add(target)
+        for node in self.backward_references_id_props.pop(id(source)):
+            self.backward_references_id_props[id(target)].add(node)
+            self.forward_references_id_props[id(node)].remove(source)
+            self.forward_references_id_props[id(node)].add(target)
+
+        for node in self.forward_references_backref.pop(id(source)):
+            self.forward_references_backref[id(target)].add(node)
+            self.backward_references_backref[id(node)].remove(source)
+            self.backward_references_backref[id(node)].add(target)
+        for node in self.backward_references_backref.pop(id(source)):
+            self.backward_references_backref[id(target)].add(node)
+            self.forward_references_backref[id(node)].remove(source)
+            self.forward_references_backref[id(node)].add(target)
 
         # remove unneeded SyncNode
         self.nodes.remove(source)
diff --git a/unittests/test_crawler.py b/unittests/test_crawler.py
index ccd441e7..4e8b057e 100644
--- a/unittests/test_crawler.py
+++ b/unittests/test_crawler.py
@@ -546,7 +546,7 @@ def test_split_into_inserts_and_updates_mult_backref(crawler_mocked_for_backref_
 
     identifiable = crawler.identifiableAdapter.get_identifiable(
         st.nodes[0],
-        st.backward_id_referenced_by[id(st.nodes[0])])
+        st.backward_references_backref[id(st.nodes[0])])
     assert len(identifiable.backrefs) == 2
 
     # check the split...
@@ -570,7 +570,7 @@ def test_split_into_inserts_and_updates_diff_backref(crawler_mocked_for_backref_
     st = SyncGraph(entlist, crawler.identifiableAdapter)
     identifiable = crawler.identifiableAdapter.get_identifiable(
         st.nodes[0],
-        st.backward_id_referenced_by[id(st.nodes[0])])
+        st.backward_references_backref[id(st.nodes[0])])
 
     assert len(identifiable.backrefs) == 2
 
diff --git a/unittests/test_sync_graph.py b/unittests/test_sync_graph.py
index e14c8346..1ec2b3b2 100644
--- a/unittests/test_sync_graph.py
+++ b/unittests/test_sync_graph.py
@@ -89,9 +89,9 @@ def test_create_reference_mapping():
     for index, mapping in product((0, 1), mappings):
         assert id(ses[index]) in mapping
 
-    (forward_references, backward_references, forward_id_references,
-     backward_id_references, forward_id_referenced_by,
-     backward_id_referenced_by) = mappings
+    (forward_references, backward_references, forward_references_id_props,
+     backward_references_id_props, forward_references_backref,
+     backward_references_backref) = mappings
 
     # a has no ref
     assert len(forward_references[id(a)]) == 0
@@ -100,17 +100,17 @@ def test_create_reference_mapping():
     assert forward_references[id(b)] == set([a])
     assert backward_references[id(b)] == set()
     # a has no identifying reference
-    assert forward_id_references[id(a)] == set()
-    assert backward_id_references[id(a)] == set([b])
+    assert forward_references_id_props[id(a)] == set()
+    assert backward_references_id_props[id(a)] == set([b])
     # b has an identifying reference
-    assert forward_id_references[id(b)] == set([a])
-    assert backward_id_references[id(b)] == set()
+    assert forward_references_id_props[id(b)] == set([a])
+    assert backward_references_id_props[id(b)] == set()
     # a has an identifying back reference
-    assert forward_id_referenced_by[id(a)] == set()
-    assert backward_id_referenced_by[id(a)] == set([b])
+    assert forward_references_backref[id(a)] == set()
+    assert backward_references_backref[id(a)] == set([b])
     # b does not
-    assert forward_id_referenced_by[id(b)] == set([a])
-    assert backward_id_referenced_by[id(b)] == set()
+    assert forward_references_backref[id(b)] == set([a])
+    assert backward_references_backref[id(b)] == set()
 
 
 @patch("caoscrawler.sync_graph.cached_get_entity_by",
@@ -203,23 +203,23 @@ def test_merge_into_trivial(simple_adapter):
     assert len(st.backward_references[id(se_c)]) == 1
     assert se_a in st.backward_references[id(se_c)]
 
-    assert len(st.forward_id_references[id(se_a)]) == 1
-    assert se_c in st.forward_id_references[id(se_a)]
-    assert len(st.forward_id_references[id(se_b)]) == 0
-    assert len(st.forward_id_references[id(se_c)]) == 0
-    assert len(st.backward_id_references[id(se_a)]) == 0
-    assert len(st.backward_id_references[id(se_b)]) == 0
-    assert len(st.backward_id_references[id(se_c)]) == 1
-    assert se_a in st.backward_id_references[id(se_c)]
-
-    assert len(st.forward_id_referenced_by[id(se_a)]) == 1
-    assert se_c in st.forward_id_referenced_by[id(se_a)]
-    assert len(st.forward_id_referenced_by[id(se_b)]) == 0
-    assert len(st.forward_id_referenced_by[id(se_c)]) == 0
-    assert len(st.backward_id_referenced_by[id(se_a)]) == 0
-    assert len(st.backward_id_referenced_by[id(se_b)]) == 0
-    assert len(st.backward_id_referenced_by[id(se_c)]) == 1
-    assert se_a in st.backward_id_referenced_by[id(se_c)]
+    assert len(st.forward_references_id_props[id(se_a)]) == 1
+    assert se_c in st.forward_references_id_props[id(se_a)]
+    assert len(st.forward_references_id_props[id(se_b)]) == 0
+    assert len(st.forward_references_id_props[id(se_c)]) == 0
+    assert len(st.backward_references_id_props[id(se_a)]) == 0
+    assert len(st.backward_references_id_props[id(se_b)]) == 0
+    assert len(st.backward_references_id_props[id(se_c)]) == 1
+    assert se_a in st.backward_references_id_props[id(se_c)]
+
+    assert len(st.forward_references_backref[id(se_a)]) == 1
+    assert se_c in st.forward_references_backref[id(se_a)]
+    assert len(st.forward_references_backref[id(se_b)]) == 0
+    assert len(st.forward_references_backref[id(se_c)]) == 0
+    assert len(st.backward_references_backref[id(se_a)]) == 0
+    assert len(st.backward_references_backref[id(se_b)]) == 0
+    assert len(st.backward_references_backref[id(se_c)]) == 1
+    assert se_a in st.backward_references_backref[id(se_c)]
 
     st.set_id_of_node(se_a, 101)
 
@@ -234,23 +234,23 @@ def test_merge_into_trivial(simple_adapter):
     assert len(st.backward_references[id(se_c)]) == 1
     assert se_b in st.backward_references[id(se_c)]
 
-    assert id(se_a) not in st.forward_id_references
-    assert len(st.forward_id_references[id(se_b)]) == 1
-    assert se_c in st.forward_id_references[id(se_b)]
-    assert len(st.forward_id_references[id(se_c)]) == 0
-    assert id(se_a) not in st.backward_id_references
-    assert len(st.backward_id_references[id(se_b)]) == 0
-    assert len(st.backward_id_references[id(se_c)]) == 1
-    assert se_b in st.backward_id_references[id(se_c)]
-
-    assert id(se_a) not in st.forward_id_referenced_by
-    assert len(st.forward_id_referenced_by[id(se_b)]) == 1
-    assert se_c in st.forward_id_referenced_by[id(se_b)]
-    assert len(st.forward_id_referenced_by[id(se_c)]) == 0
-    assert id(se_a) not in st.backward_id_referenced_by
-    assert len(st.backward_id_referenced_by[id(se_b)]) == 0
-    assert len(st.backward_id_referenced_by[id(se_c)]) == 1
-    assert se_b in st.backward_id_referenced_by[id(se_c)]
+    assert id(se_a) not in st.forward_references_id_props
+    assert len(st.forward_references_id_props[id(se_b)]) == 1
+    assert se_c in st.forward_references_id_props[id(se_b)]
+    assert len(st.forward_references_id_props[id(se_c)]) == 0
+    assert id(se_a) not in st.backward_references_id_props
+    assert len(st.backward_references_id_props[id(se_b)]) == 0
+    assert len(st.backward_references_id_props[id(se_c)]) == 1
+    assert se_b in st.backward_references_id_props[id(se_c)]
+
+    assert id(se_a) not in st.forward_references_backref
+    assert len(st.forward_references_backref[id(se_b)]) == 1
+    assert se_c in st.forward_references_backref[id(se_b)]
+    assert len(st.forward_references_backref[id(se_c)]) == 0
+    assert id(se_a) not in st.backward_references_backref
+    assert len(st.backward_references_backref[id(se_b)]) == 0
+    assert len(st.backward_references_backref[id(se_c)]) == 1
+    assert se_b in st.backward_references_backref[id(se_c)]
 
 
 def test_merge_into_simple(simple_adapter):
@@ -277,27 +277,27 @@ def test_merge_into_simple(simple_adapter):
     se_a in st.backward_references[id(se_c)]
     se_b in st.backward_references[id(se_c)]
 
-    assert len(st.forward_id_references[id(se_a)]) == 1
-    se_c in st.forward_id_references[id(se_a)]
-    assert len(st.forward_id_references[id(se_b)]) == 1
-    se_c in st.forward_id_references[id(se_b)]
-    assert len(st.forward_id_references[id(se_c)]) == 0
-    assert len(st.backward_id_references[id(se_a)]) == 0
-    assert len(st.backward_id_references[id(se_b)]) == 0
-    assert len(st.backward_id_references[id(se_c)]) == 2
-    se_a in st.backward_id_references[id(se_c)]
-    se_b in st.backward_id_references[id(se_c)]
-
-    assert len(st.forward_id_referenced_by[id(se_a)]) == 1
-    se_c in st.forward_id_referenced_by[id(se_a)]
-    assert len(st.forward_id_referenced_by[id(se_b)]) == 1
-    se_c in st.forward_id_referenced_by[id(se_b)]
-    assert len(st.forward_id_referenced_by[id(se_c)]) == 0
-    assert len(st.backward_id_referenced_by[id(se_a)]) == 0
-    assert len(st.backward_id_referenced_by[id(se_b)]) == 0
-    assert len(st.backward_id_referenced_by[id(se_c)]) == 2
-    se_a in st.backward_id_referenced_by[id(se_c)]
-    se_b in st.backward_id_referenced_by[id(se_c)]
+    assert len(st.forward_references_id_props[id(se_a)]) == 1
+    se_c in st.forward_references_id_props[id(se_a)]
+    assert len(st.forward_references_id_props[id(se_b)]) == 1
+    se_c in st.forward_references_id_props[id(se_b)]
+    assert len(st.forward_references_id_props[id(se_c)]) == 0
+    assert len(st.backward_references_id_props[id(se_a)]) == 0
+    assert len(st.backward_references_id_props[id(se_b)]) == 0
+    assert len(st.backward_references_id_props[id(se_c)]) == 2
+    se_a in st.backward_references_id_props[id(se_c)]
+    se_b in st.backward_references_id_props[id(se_c)]
+
+    assert len(st.forward_references_backref[id(se_a)]) == 1
+    se_c in st.forward_references_backref[id(se_a)]
+    assert len(st.forward_references_backref[id(se_b)]) == 1
+    se_c in st.forward_references_backref[id(se_b)]
+    assert len(st.forward_references_backref[id(se_c)]) == 0
+    assert len(st.backward_references_backref[id(se_a)]) == 0
+    assert len(st.backward_references_backref[id(se_b)]) == 0
+    assert len(st.backward_references_backref[id(se_c)]) == 2
+    se_a in st.backward_references_backref[id(se_c)]
+    se_b in st.backward_references_backref[id(se_c)]
 
     st._merge_into(se_a, se_b)
 
@@ -313,26 +313,26 @@ def test_merge_into_simple(simple_adapter):
     assert len(st.backward_references[id(se_c)]) == 1
     se_b in st.backward_references[id(se_c)]
 
-    assert id(se_a) not in st.forward_id_references
-    assert len(st.forward_id_references[id(se_b)]) == 1
-    se_c in st.forward_id_references[id(se_b)]
-    assert len(st.forward_id_references[id(se_c)]) == 0
-    assert id(se_a) not in st.backward_id_references
-    assert len(st.backward_id_references[id(se_b)]) == 0
-    assert len(st.backward_id_references[id(se_c)]) == 1
-    se_b in st.backward_id_references[id(se_c)]
-
-    assert id(se_a) not in st.forward_id_referenced_by
-    assert len(st.forward_id_referenced_by[id(se_b)]) == 1
-    se_c in st.forward_id_referenced_by[id(se_b)]
-    assert len(st.forward_id_referenced_by[id(se_c)]) == 0
-    assert id(se_a) not in st.backward_id_referenced_by
-    assert len(st.backward_id_referenced_by[id(se_b)]) == 0
-    assert len(st.backward_id_referenced_by[id(se_c)]) == 1
-    se_b in st.backward_id_referenced_by[id(se_c)]
-
-
-def test_backward_id_referenced_by():
+    assert id(se_a) not in st.forward_references_id_props
+    assert len(st.forward_references_id_props[id(se_b)]) == 1
+    se_c in st.forward_references_id_props[id(se_b)]
+    assert len(st.forward_references_id_props[id(se_c)]) == 0
+    assert id(se_a) not in st.backward_references_id_props
+    assert len(st.backward_references_id_props[id(se_b)]) == 0
+    assert len(st.backward_references_id_props[id(se_c)]) == 1
+    se_b in st.backward_references_id_props[id(se_c)]
+
+    assert id(se_a) not in st.forward_references_backref
+    assert len(st.forward_references_backref[id(se_b)]) == 1
+    se_c in st.forward_references_backref[id(se_b)]
+    assert len(st.forward_references_backref[id(se_c)]) == 0
+    assert id(se_a) not in st.backward_references_backref
+    assert len(st.backward_references_backref[id(se_b)]) == 0
+    assert len(st.backward_references_backref[id(se_c)]) == 1
+    se_b in st.backward_references_backref[id(se_c)]
+
+
+def test_backward_references_backref():
     # We use the reference as identifying reference in both directions. Thus the map is the same
     # for all three categories: references, id_references and id_referenced_by
     ident_a = db.RecordType().add_parent("BR").add_property("name")
@@ -345,7 +345,7 @@ def test_backward_id_referenced_by():
     ent_list = [referenced, db.Record(name="A").add_parent("BR").add_property("ref", referenced), ]
 
     st = SyncGraph(ent_list, ident_adapter)
-    assert st.nodes[1] in st.backward_id_referenced_by[id(st.nodes[0])]
+    assert st.nodes[1] in st.backward_references_backref[id(st.nodes[0])]
 
 
 def test_set_id_of_node(simple_adapter):
-- 
GitLab