Skip to content
Snippets Groups Projects
Commit 28f222af authored by Henrik tom Wörden's avatar Henrik tom Wörden
Browse files

mixed

parent f3fcb67c
No related branches found
No related tags found
2 merge requests: !178 "FIX: #96 Better error output for crawl.py script.", !167 "Sync Graph"
Pipeline #50896 failed
...@@ -20,7 +20,8 @@ ...@@ -20,7 +20,8 @@
# #
""" """
A data model class for the semantic data that shall be created by synchronization of the crawler. A data model class for the graph of entities that shall be created during synchronization of the
crawler.
""" """
from __future__ import annotations from __future__ import annotations
...@@ -42,6 +43,16 @@ logger = logging.getLogger(__name__) ...@@ -42,6 +43,16 @@ logger = logging.getLogger(__name__)
def _for_each_scalar_value(node, condition, kind, value=None): def _for_each_scalar_value(node, condition, kind, value=None):
""" helper function that performs an action on each value element of each property of a node.
The action (remove or set) is performed on each value of each property: in the case of
lists, it is performed on each list element. The action is only performed if the condition that
is provided is fulfilled, i.e. the callable ``condition`` returns True. The callable
``condition`` must take the property value (or list element) as the sole argument.
Thus, with "remove" you can conditionally remove values and with "set" you can conditionally
replace values.
"""
for p in node.properties: for p in node.properties:
if isinstance(p.value, list): if isinstance(p.value, list):
for ii, el in enumerate(p.value): for ii, el in enumerate(p.value):
...@@ -58,21 +69,29 @@ def _for_each_scalar_value(node, condition, kind, value=None): ...@@ -58,21 +69,29 @@ def _for_each_scalar_value(node, condition, kind, value=None):
def _remove_each_scalar_value(node, condition): def _remove_each_scalar_value(node, condition):
""" "remove" version of _for_each_scalar_value """
_for_each_scalar_value(node, condition, "remove") _for_each_scalar_value(node, condition, "remove")
def _set_each_scalar_value(node, condition, value): def _set_each_scalar_value(node, condition, value):
""" "set" version of _for_each_scalar_value """
_for_each_scalar_value(node, condition, "set", value=value) _for_each_scalar_value(node, condition, "set", value=value)
class SyncGraph(): class SyncGraph():
""" combines nodes in the graph based on their identity in order to create a graph of objects """
that can either be inserted or updated in(to) the remote server. A data model class for the graph of entities that shall be created during synchronization of
the crawler.
The SyncGraph combines nodes in the graph based on their identity in order to create a graph of
objects that can either be inserted or updated in(to) the remote server. This combination of
SyncNodes happens during initialization and later on when the ID of SyncNodes is set.
When the SyncGraph is initialized, the properties of given entities are scanned and used to When the SyncGraph is initialized, the properties of given entities are scanned and used to
create multiple reference maps that track how SemanticEntities reference each other. create multiple reference maps that track how SyncNodes reference each other.
These maps are kept up to date when SyncNodes are merged because they are identified with each These maps are kept up to date when SyncNodes are merged because they are identified with each
other. other. During initialization, SyncNodes are first merged based on their ID, path or
identifiable.
When additional information is added to the graph by setting the ID of a node When additional information is added to the graph by setting the ID of a node
(via `set_id_of_node`) then the graph is updated accordingly: (via `set_id_of_node`) then the graph is updated accordingly:
...@@ -83,14 +102,13 @@ class SyncGraph(): ...@@ -83,14 +102,13 @@ class SyncGraph():
- The new ID might make it possible to create the identifiables of connected nodes and thus - The new ID might make it possible to create the identifiables of connected nodes and thus
might trigger further merging of nodes based on the new identifiables. might trigger further merging of nodes based on the new identifiables.
The target entities are composed using the information that was collected in the node. A SyncGraph should only be manipulated via one function:
This is information like name, parents and properties and,
importantly, references and other properties.
This model should only be manipulated via one function:
- set_id_of_node: a positive integer means the Entity exists, None means it is missing - set_id_of_node: a positive integer means the Entity exists, None means it is missing
TODO what about String IDs TODO what about String IDs
The SyncGraph can be converted back to lists of entities, which allows the desired inserts and
updates to be performed.
Usage: Usage:
- Initialize the Graph with a list of entities. Those will be converted to the SyncNodes of the - Initialize the Graph with a list of entities. Those will be converted to the SyncNodes of the
graph. graph.
...@@ -158,9 +176,22 @@ class SyncGraph(): ...@@ -158,9 +176,22 @@ class SyncGraph():
self._mark_existing(node) self._mark_existing(node)
def export_record_lists(self): def export_record_lists(self):
""" exports the SyncGraph in form of db.Entities
All nodes are converted to db.Entity objects and reference values that are SyncNodes are
replaced by their corresponding (newly created) db.Entity objects.
Since the result is returned in form of two lists, one with Entities that have a valid ID and
one with those that haven't, an error is raised if there are any SyncNodes without a
(possibly negative) ID.
"""
if len(self.unchecked) > 1: if len(self.unchecked) > 1:
self.unchecked_contains_circular_dependency() self.unchecked_contains_circular_dependency()
for el in self.nodes:
if el.id is None:
raise RuntimeError("Exporting unchecked entities is not supported")
entities = [] entities = []
node_map = {} node_map = {}
for el in self.nodes: for el in self.nodes:
...@@ -176,10 +207,6 @@ class SyncGraph(): ...@@ -176,10 +207,6 @@ class SyncGraph():
existing = [el for el in entities if el.id > 0] existing = [el for el in entities if el.id > 0]
# remove negative IDs # remove negative IDs
for el in missing: for el in missing:
if el.id is None:
raise RuntimeError("This should not happen") # TODO remove
if el.id >= 0:
raise RuntimeError("This should not happen") # TODO remove
el.id = None el.id = None
return (missing, existing) return (missing, existing)
......
...@@ -130,13 +130,16 @@ class SyncNode(): ...@@ -130,13 +130,16 @@ class SyncNode():
entval = [entval] entval = [entval]
if not isinstance(pval, list): if not isinstance(pval, list):
pval = [pval] pval = [pval]
for e_el, p_el in zip(entval, pval): if len(entval) != len(pval):
if isinstance(e_el, SyncNode) and e_el.id is not None: unequal = True
e_el = e_el.id else:
if isinstance(p_el, SyncNode) and p_el.id is not None: for e_el, p_el in zip(entval, pval):
p_el = p_el.id if isinstance(e_el, SyncNode) and e_el.id is not None:
if e_el != p_el: e_el = e_el.id
unequal = True if isinstance(p_el, SyncNode) and p_el.id is not None:
p_el = p_el.id
if e_el != p_el:
unequal = True
if unequal: if unequal:
logger.error("The Crawler is trying to create an entity," logger.error("The Crawler is trying to create an entity,"
......
...@@ -294,3 +294,12 @@ def test_export_node(): ...@@ -294,3 +294,12 @@ def test_export_node():
with pytest.raises(ImpossibleMergeError): with pytest.raises(ImpossibleMergeError):
exp = SyncNode(rec_a).export_entity() exp = SyncNode(rec_a).export_entity()
# different list sizes
rec_a = (db.Record(id=101)
.add_parent("B")
.add_property(name="a", value=[SyncNode(db.Record(id=1))])
.add_property(name="a", value=[SyncNode(db.Record(id=1)), SyncNode(db.Record(id=1))]))
with pytest.raises(ImpossibleMergeError):
exp = SyncNode(rec_a).export_entity()
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment