mixed

28f222af · Henrik tom Wörden · f3fcb67c · 28f222af · 28f222af · 28f222af
Commit 28f222af authored 1 year ago by Henrik tom Wörden
--- a/src/caoscrawler/sync_graph.py
+++ b/src/caoscrawler/sync_graph.py
@@ -20,7 +20,8 @@
 #
 """
-A data model class for the semantic data that shall be created by synchronization of the crawler.
+A data model class for the graph of entities that shall be created during synchronization of the
+crawler.
 """
 from __future__ import annotations
@@ -42,6 +43,16 @@ logger = logging.getLogger(__name__)
 def _for_each_scalar_value(node, condition, kind, value=None):
+    """ helper function that performs an action on each value element of each property of a node
+    The action (remove or set) is performed on each property value of each property: in case of
+    lists, it is performed on each list element. The action is only performed if the condition that
+    is provided is fulfilled, i.e. the callable ``condition`` returns True. The callable
+    ``condition`` must take the property value (or list element) as the sole argument.
+    Thus, with "remove" you can conditionally remove values and with "set" you can conditionally
+    replace values
+    """
    for p in node.properties:
        if isinstance(p.value, list):
            for ii, el in enumerate(p.value):
@@ -58,21 +69,29 @@ def _for_each_scalar_value(node, condition, kind, value=None):
 def _remove_each_scalar_value(node, condition):
+    """ "remove" version of _for_each_scalar_value """
    _for_each_scalar_value(node, condition, "remove")
 def _set_each_scalar_value(node, condition, value):
+    """ "set" version of _for_each_scalar_value """
    _for_each_scalar_value(node, condition, "set", value=value)
 class SyncGraph():
-    """ combines nodes in the graph based on their identity in order to create a graph of objects
+    """
-    that can either be inserted or updated in(to) the remote server.
+    A data model class for the graph of entities that shall be created during synchronization of
+    the crawler.
+    The SyncGraph combines nodes in the graph based on their identity in order to create a graph of
+    objects that can either be inserted or updated in(to) the remote server. This combination of
+    SyncNodes happens during initialization and later on when the ID of SyncNodes is set.
    When the SyncGraph is initialized, the properties of given entities are scanned and used to
-    create multiple reference maps that track how SemanticEntities reference each other.
+    create multiple reference maps that track how SyncNodes reference each other.
    These maps are kept up to date when SyncNodes are merged because they are identified with each
-    other.
+    other. During initialization, SyncNodes are first merged based on their ID, path or
+    identifiable.
    When additional information is added to the graph by setting the ID of a node
    (via `set_id_of_node`) then the graph is updated accordingly:
@@ -83,14 +102,13 @@ class SyncGraph():
    - The new ID might make it possible to create the identifiables of connected nodes and thus
      might trigger further merging of nodes based on the new identifiables.
-    The target entities are composed using the information that was collected in the node.
+    A SyncGraph should only be manipulated via one function:
-    This is information like name, parents and properties and,
-    importantly, references and other properties.
-    This model should only be manipulated via one function:
    - set_id_of_node: a positive integer means the Entity exists, None means it is missing
    TODO what about String IDs
+    The SyncGraph can be converted back to lists of entities that can be used to perform the
+    desired inserts and updates.
    Usage:
    - Initialize the Graph with a list of entities. Those will be converted to the SyncNodes of the
      graph.
@@ -158,9 +176,22 @@ class SyncGraph():
                self._mark_existing(node)
    def export_record_lists(self):
+        """ exports the SyncGraph in form of db.Entities
+        All nodes are converted to db.Entity objects and reference values that are SyncNodes are
+        replaced by their corresponding (newly created) db.Entity objects.
+        Since the result is returned in form of two lists, one with Entities that have a valid ID
+        and one with those that haven't, an error is raised if there are any SyncNodes without a
+        (possibly negative) ID.
+        """
        if len(self.unchecked) > 1:
            self.unchecked_contains_circular_dependency()
+        for el in self.nodes:
+            if el.id is None:
+                raise RuntimeError("Exporting unchecked entities is not supported")
        entities = []
        node_map = {}
        for el in self.nodes:
@@ -176,10 +207,6 @@ class SyncGraph():
        existing = [el for el in entities if el.id > 0]
        # remove negative IDs
        for el in missing:
-            if el.id is None:
-                raise RuntimeError("This should not happen")  # TODO remove
-            if el.id >= 0:
-                raise RuntimeError("This should not happen")  # TODO remove
            el.id = None
        return (missing, existing)

--- a/src/caoscrawler/sync_node.py
+++ b/src/caoscrawler/sync_node.py
@@ -130,13 +130,16 @@ class SyncNode():
                    entval = [entval]
                if not isinstance(pval, list):
                    pval = [pval]
-                for e_el, p_el in zip(entval, pval):
+                if len(entval) != len(pval):
-                    if isinstance(e_el, SyncNode) and e_el.id is not None:
+                    unequal = True
-                        e_el = e_el.id
+                else:
-                    if isinstance(p_el, SyncNode) and p_el.id is not None:
+                    for e_el, p_el in zip(entval, pval):
-                        p_el = p_el.id
+                        if isinstance(e_el, SyncNode) and e_el.id is not None:
-                    if e_el != p_el:
+                            e_el = e_el.id
-                        unequal = True
+                        if isinstance(p_el, SyncNode) and p_el.id is not None:
+                            p_el = p_el.id
+                        if e_el != p_el:
+                            unequal = True
                if unequal:
                    logger.error("The Crawler is trying to create an entity,"

--- a/unittests/test_sync_node.py
+++ b/unittests/test_sync_node.py
@@ -294,3 +294,12 @@ def test_export_node():
    with pytest.raises(ImpossibleMergeError):
        exp = SyncNode(rec_a).export_entity()
+    # different list sizes
+    rec_a = (db.Record(id=101)
+             .add_parent("B")
+             .add_property(name="a", value=[SyncNode(db.Record(id=1))])
+             .add_property(name="a", value=[SyncNode(db.Record(id=1)), SyncNode(db.Record(id=1))]))
+    with pytest.raises(ImpossibleMergeError):
+        exp = SyncNode(rec_a).export_entity()