diff --git a/src/caoscrawler/sync_graph.py b/src/caoscrawler/sync_graph.py index 7bc95111474510ce14e5ee8b32ab570d2c324579..380b8a6a42675c1b952f3e34b20d51a0ab10052f 100644 --- a/src/caoscrawler/sync_graph.py +++ b/src/caoscrawler/sync_graph.py @@ -20,7 +20,8 @@ # """ -A data model class for the semantic data that shall be created by synchronization of the crawler. +A data model class for the graph of entities that shall be created during synchronization of the +crawler. """ from __future__ import annotations @@ -42,6 +43,16 @@ logger = logging.getLogger(__name__) def _for_each_scalar_value(node, condition, kind, value=None): + """ helper function that performs an action on each value element of each property of a node + + The action (remove or set) is performed on each property value of each property: in case of + lists, it is performed on each list element. The action is only performed if the condition that + is provided is fulfilled, i.e. the callable ``condition`` returns True. The callable + ``condition`` must take the property value (or list element) as the sole argument. + + Thus, with "remove" you can conditionally remove values and with "set" you can conditionally + replace values + """ for p in node.properties: if isinstance(p.value, list): for ii, el in enumerate(p.value): @@ -58,21 +69,29 @@ def _for_each_scalar_value(node, condition, kind, value=None): def _remove_each_scalar_value(node, condition): + """ "remove" version of _for_each_scalar_value """ _for_each_scalar_value(node, condition, "remove") def _set_each_scalar_value(node, condition, value): + """ "set" version of _for_each_scalar_value """ _for_each_scalar_value(node, condition, "set", value=value) class SyncGraph(): - """ combines nodes in the graph based on their identity in order to create a graph of objects - that can either be inserted or updated in(to) the remote server. + """ + A data model class for the graph of entities that shall be created during synchronization of + the crawler. 
+ + The SyncGraph combines nodes in the graph based on their identity in order to create a graph of + objects that can either be inserted or updated in(to) the remote server. This combination of + SyncNodes happens during initialization and later on when the ID of SyncNodes is set. When the SyncGraph is initialized, the properties of given entities are scanned and used to - create multiple reference maps that track how SemanticEntities reference each other. + create multiple reference maps that track how SyncNodes reference each other. These maps are kept up to date when SyncNodes are merged because they are identified with each - other. + other. During initialization, SyncNodes are first merged based on their ID, path or + identifiable. When additional information is added to the graph by setting the ID of a node (via `set_id_of_node`) then the graph is updated accordingly: @@ -83,14 +102,13 @@ class SyncGraph(): - The new ID might make it possible to create the identifiables of connected nodes and thus might trigger further merging of nodes based on the new identifiables. - The target entities are composed using the information that was collected in the node. - This is information like name, parents and properties and, - importantly, references and other properties. - - This model should only be manipulated via one function: + A SyncGraph should only be manipulated via one function: - set_id_of_node: a positive integer means the Entity exists, None means it is missing TODO what about String IDs + The SyncGraph can be converted back to lists of entities which allow performing the desired + inserts and updates. + Usage: - Initialize the Graph with a list of entities. Those will be converted to the SyncNodes of the graph. 
@@ -158,9 +176,22 @@ class SyncGraph(): self._mark_existing(node) def export_record_lists(self): + """ exports the SyncGraph in form of db.Entities + + All nodes are converted to db.Entity objects and reference values that are SyncNodes are + replaced by their corresponding (newly created) db.Entity objects. + + Since the result is returned in form of two lists, one with Entities that have a valid ID and + one with those that haven't, an error is raised if there are any SyncNodes without a + (possibly negative) ID. + """ if len(self.unchecked) > 1: self.unchecked_contains_circular_dependency() + for el in self.nodes: + if el.id is None: + raise RuntimeError("Exporting unchecked entities is not supported") + entities = [] node_map = {} for el in self.nodes: @@ -176,10 +207,6 @@ class SyncGraph(): existing = [el for el in entities if el.id > 0] # remove negative IDs for el in missing: - if el.id is None: - raise RuntimeError("This should not happen") # TODO remove - if el.id >= 0: - raise RuntimeError("This should not happen") # TODO remove el.id = None return (missing, existing) diff --git a/src/caoscrawler/sync_node.py b/src/caoscrawler/sync_node.py index 4a384cfd0a9c4d9c21b7b279a699c198773808fb..69437b1876baaa31454fdaf813244d0bb5e19643 100644 --- a/src/caoscrawler/sync_node.py +++ b/src/caoscrawler/sync_node.py @@ -130,13 +130,16 @@ class SyncNode(): entval = [entval] if not isinstance(pval, list): pval = [pval] - for e_el, p_el in zip(entval, pval): - if isinstance(e_el, SyncNode) and e_el.id is not None: - e_el = e_el.id - if isinstance(p_el, SyncNode) and p_el.id is not None: - p_el = p_el.id - if e_el != p_el: - unequal = True + if len(entval) != len(pval): + unequal = True + else: + for e_el, p_el in zip(entval, pval): + if isinstance(e_el, SyncNode) and e_el.id is not None: + e_el = e_el.id + if isinstance(p_el, SyncNode) and p_el.id is not None: + p_el = p_el.id + if e_el != p_el: + unequal = True if unequal: logger.error("The Crawler is trying to create an 
entity," diff --git a/unittests/test_sync_node.py b/unittests/test_sync_node.py index ff9e0f214c9501933b9692b602bf6ade20918bf0..54d1769af96d18161a7b3052d176dded077ac497 100644 --- a/unittests/test_sync_node.py +++ b/unittests/test_sync_node.py @@ -294,3 +294,12 @@ def test_export_node(): with pytest.raises(ImpossibleMergeError): exp = SyncNode(rec_a).export_entity() + + # different list sizes + rec_a = (db.Record(id=101) + .add_parent("B") + .add_property(name="a", value=[SyncNode(db.Record(id=1))]) + .add_property(name="a", value=[SyncNode(db.Record(id=1)), SyncNode(db.Record(id=1))])) + + with pytest.raises(ImpossibleMergeError): + exp = SyncNode(rec_a).export_entity()