diff --git a/src/caoscrawler/sync_graph.py b/src/caoscrawler/sync_graph.py index 72bd3bd23c05aa41d707a6f3b62020ee5dc5f29a..897918e275f6820bc82b3c5e3aa3490fdecc6609 100644 --- a/src/caoscrawler/sync_graph.py +++ b/src/caoscrawler/sync_graph.py @@ -27,24 +27,25 @@ crawler. from __future__ import annotations import logging -from typing import Any, Optional, Union, Callable +from typing import Any, Optional, Union, Callable import linkahead as db -from linkahead.apiutils import (EntityMergeConflictError, compare_entities, - merge_entities) -from linkahead.cached import cache_clear, cached_get_entity_by +from linkahead.cached import cached_get_entity_by from linkahead.exceptions import EmptyUniqueQueryError -from .exceptions import ImpossibleMergeError, MissingReferencingEntityError from .identifiable_adapters import IdentifiableAdapter from .identifiable import Identifiable from .sync_node import SyncNode, TempID +import re + logger = logging.getLogger(__name__) -def _set_each_scalar_value(node: SyncNode, condition: Callable[[Any], bool], value: Any): - """ helper function that conditionally replaces each value element of each property of a node +def _set_each_scalar_value( + node: SyncNode, condition: Callable[[Any], bool], value: Any +): + """helper function that conditionally replaces each value element of each property of a node If the property value is a list, the replacement is done for each list entry. The replacement is only performed if the condition that @@ -70,7 +71,7 @@ def _set_each_scalar_value(node: SyncNode, condition: Callable[[Any], bool], val p.value = value(p.value) -class SyncGraph(): +class SyncGraph: """ A data model class for the graph of entities that shall be created during synchronization of the crawler. @@ -133,7 +134,9 @@ class SyncGraph(): # Note, that when ever one node is changed, we check all dependend nodes (see usage of # `_get_nodes_whose_identity_relies_on`) whether something should be updated. Thus, we cannot # miss a necessary update. - def __init__(self, entities: list[db.Entity], identifiableAdapter: IdentifiableAdapter): + def __init__( + self, entities: list[db.Entity], identifiableAdapter: IdentifiableAdapter + ): self.identifiableAdapter = identifiableAdapter # A dictionary allowing for quick lookup of sync nodes using their (possibly negative) IDs. # This dictionary is initially set using _mark_entities_with_path_or_id and later updated @@ -192,8 +195,10 @@ class SyncGraph(): Last review by Alexander Schlemmer on 2024-05-24. """ if node.id is not None: - raise RuntimeError('Cannot update ID.\n' - f'It already is {node.id} and shall be set to {node_id}.') + raise RuntimeError( + "Cannot update ID.\n" + f"It already is {node.id} and shall be set to {node_id}." + ) if node_id is None: node_id = TempID(self._get_new_id()) node.id = node_id @@ -207,7 +212,7 @@ class SyncGraph(): self._mark_existing(node) def export_record_lists(self): - """ exports the SyncGraph in form of db.Entities + """exports the SyncGraph in form of db.Entities All nodes are converted to db.Entity objects and reference values that are SyncNodes are replaced by their corresponding (newly created) db.Entity objects. @@ -233,9 +238,11 @@ class SyncGraph(): node_map[id(el)] = entities[-1] for ent in entities: - _set_each_scalar_value(ent, - condition=lambda val: isinstance(val, SyncNode), - value=lambda val: node_map[id(val)]) + _set_each_scalar_value( + ent, + condition=lambda val: isinstance(val, SyncNode), + value=lambda val: node_map[id(val)], + ) missing = [el for el in entities if el.id < 0] existing = [el for el in entities if el.id > 0] @@ -253,10 +260,16 @@ class SyncGraph(): Last review by Alexander Schlemmer on 2024-05-27. """ - return any([id(ent) not in self._missing and id(ent) not in self._existing - for ent in self.forward_references_id_props[id(node)]] - + [id(ent) not in self._missing and id(ent) not in self._existing - for ent in self.backward_references_backref[id(node)]]) + return any( + [ + id(ent) not in self._missing and id(ent) not in self._existing + for ent in self.forward_references_id_props[id(node)] + ] + + [ + id(ent) not in self._missing and id(ent) not in self._existing + for ent in self.backward_references_backref[id(node)] + ] + ) def unchecked_contains_circular_dependency(self): """ @@ -309,23 +322,28 @@ class SyncGraph(): candidate = self._path_look_up[entity.path] if candidate is not entity: return candidate - if (entity.identifiable is not None and entity.identifiable.get_representation() in - self._identifiable_look_up): - candidate = self._identifiable_look_up[entity.identifiable.get_representation()] + if ( + entity.identifiable is not None + and entity.identifiable.get_representation() in self._identifiable_look_up + ): + candidate = self._identifiable_look_up[ + entity.identifiable.get_representation() + ] if candidate is not entity: return candidate return None def _get_new_id(self): - """ returns the next unused temporary ID + """returns the next unused temporary ID Last review by Alexander Schlemmer on 2024-05-24. """ self._remote_missing_counter -= 1 return self._remote_missing_counter - def _set_identifiable_of_node(self, node: SyncNode, - identifiable: Optional[Identifiable] = None): + def _set_identifiable_of_node( + self, node: SyncNode, identifiable: Optional[Identifiable] = None + ): """sets the identifiable and checks whether an equivalent node can be found with that new information. If an equivalent node is found, 'node' is merged into that node. @@ -333,12 +351,14 @@ class SyncGraph(): Raises a ValueError if the equivalent node found does not have an identifiable. Raises a RuntimeError if there is no equivalent node found and - the (unique) string representation of the identifiable of node is already contained in the identifiable_look_up. + the (unique) string representation of the identifiable of node is already contained in + the identifiable_look_up. """ if identifiable is None: self.identifiableAdapter.all_identifying_properties_exist(node) identifiable = self.identifiableAdapter.get_identifiable( - node, self.backward_references_backref[id(node)]) + node, self.backward_references_backref[id(node)] + ) node.identifiable = identifiable equivalent_se = self.get_equivalent(node) if equivalent_se is not None and equivalent_se is not node: @@ -361,9 +381,13 @@ class SyncGraph(): if ent.role == "Record" and len(ent.parents) == 0: raise ValueError(f"Records must have a parent.\n{ent}") if isinstance(ent.id, int) and ent.id < 0: - raise ValueError(f"Records must not have negative integers as IDs.\n{ent}") + raise ValueError( + f"Records must not have negative integers as IDs.\n{ent}" + ) if isinstance(ent.id, str) and re.match(r"^-\d+$", ent.id): - raise ValueError(f"Records must not have negative integers as IDs.\n{ent}") + raise ValueError( + f"Records must not have negative integers as IDs.\n{ent}" + ) def _get_nodes_whose_identity_relies_on(self, node: SyncNode): """returns a set of nodes that reference the given node as identifying property or are @@ -372,11 +396,14 @@ class SyncGraph(): Last review by Alexander Schlemmer on 2024-05-24. """ - return (self.backward_references_id_props[id(node)].union( - self.forward_references_backref[id(node)])) + return self.backward_references_id_props[id(node)].union( + self.forward_references_backref[id(node)] + ) @staticmethod - def _create_flat_list(ent_list: list[db.Entity], flat: Optional[list[db.Entity]] = None): + def _create_flat_list( + ent_list: list[db.Entity], flat: Optional[list[db.Entity]] = None + ): """ Recursively adds entities and all their properties contained in ent_list to the output list flat. @@ -446,25 +473,40 @@ class SyncGraph(): if isinstance(v, SyncNode): forward_references[id(node)].add(v) backward_references[id(v)].add(node) - if (node.registered_identifiable is not None - and len([el.name - for el in node.registered_identifiable.properties if - el.name == p.name]) > 0): + if ( + node.registered_identifiable is not None + and len( + [ + el.name + for el in node.registered_identifiable.properties + if el.name == p.name + ] + ) + > 0 + ): forward_references_id_props[id(node)].add(v) backward_references_id_props[id(v)].add(node) - if (v.registered_identifiable is not None and - IdentifiableAdapter.referencing_entity_has_appropriate_type( - node.parents, v.registered_identifiable)): + if ( + v.registered_identifiable is not None + and IdentifiableAdapter.referencing_entity_has_appropriate_type( + node.parents, v.registered_identifiable + ) + ): forward_references_backref[id(node)].add(v) backward_references_backref[id(v)].add(node) - return (forward_references, backward_references, forward_references_id_props, - backward_references_id_props, forward_references_backref, backward_references_backref, - ) + return ( + forward_references, + backward_references, + forward_references_id_props, + backward_references_id_props, + forward_references_backref, + backward_references_backref, + ) def _mark_entities_with_path_or_id(self): - """ A path or an ID is sufficiently identifying. Thus, those entities can be marked as - checked """ + """A path or an ID is sufficiently identifying. Thus, those entities can be marked as + checked""" for node in list(self.nodes): if node.id is not None: if self.get_equivalent(node) is not None: @@ -490,7 +532,7 @@ class SyncGraph(): self.set_id_of_node(node, remote_id) def _merge_into(self, source: SyncNode, target: SyncNode): - """ tries to merge source into target and performs the necessary updates: + """tries to merge source into target and performs the necessary updates: - update the membervariables of target using source (``target.update(source)``). - replaces reference values to source by target - updates the reference map @@ -509,20 +551,22 @@ class SyncGraph(): if self._id_look_up[source.id] != source: raise ValueError( "It is assumed that always only one node exists with a certain ID and that " - "node is in the look up") + "node is in the look up" + ) if target.path is None and source.path is not None: if self._id_look_up[source.path] != source: raise ValueError( "It is assumed that always only one node exists with a certain path and that" - " node is in the look up") + " node is in the look up" + ) target.update(source) # replace actual reference property values for node in self.backward_references[id(source)]: - _set_each_scalar_value(node, - condition=lambda val: val is source, - value=lambda val: target) + _set_each_scalar_value( + node, condition=lambda val: val is source, value=lambda val: target + ) # update reference mappings for node in self.forward_references.pop(id(source)): @@ -562,10 +606,13 @@ class SyncGraph(): if target.path is not None: self._path_look_up[target.path] = target if target.identifiable is not None: - self._identifiable_look_up[target.identifiable.get_representation()] = target + self._identifiable_look_up[target.identifiable.get_representation()] = ( + target + ) - if ((id(source) in self._existing and id(target) in self._missing) - or (id(target) in self._existing and id(source) in self._missing)): + if (id(source) in self._existing and id(target) in self._missing) or ( + id(target) in self._existing and id(source) in self._missing + ): raise RuntimeError("Trying to merge missing and existing") if id(source) in self._missing and id(target) not in self._missing: @@ -595,12 +642,16 @@ class SyncGraph(): Last review by Alexander Schlemmer on 2024-05-24. """ - return (node.identifiable is None and not self._identity_relies_on_unchecked_entity(node) - and self.identifiableAdapter.all_identifying_properties_exist( - node, raise_exception=False)) + return ( + node.identifiable is None + and not self._identity_relies_on_unchecked_entity(node) + and self.identifiableAdapter.all_identifying_properties_exist( + node, raise_exception=False + ) + ) def _initialize_nodes(self, entities: list[db.Entity]): - """ create initial set of SyncNodes from provided Entity list""" + """create initial set of SyncNodes from provided Entity list""" self._sanity_check(entities) entities = self._create_flat_list(entities) se_lookup: dict[int, SyncNode] = {} # lookup: python id -> SyncNode @@ -608,19 +659,24 @@ class SyncGraph(): # Create new sync nodes from the list of entities, their registered identifiables # are set from the identifiable adapter. for el in entities: - self.nodes.append(SyncNode( - el, - self.identifiableAdapter.get_registered_identifiable(el))) + self.nodes.append( + SyncNode(el, self.identifiableAdapter.get_registered_identifiable(el)) + ) se_lookup[id(el)] = self.nodes[-1] # replace db.Entity objects with SyncNodes in references: for node in self.nodes: - _set_each_scalar_value(node, - condition=lambda val: id(val) in se_lookup, - value=lambda val: se_lookup[id(val)]) + _set_each_scalar_value( + node, + condition=lambda val: id(val) in se_lookup, + value=lambda val: se_lookup[id(val)], + ) def _add_identifiables_to_dependend_nodes(self, node): - """ For each dependent node, we check whether this allows to create an identifiable """ + """For each dependent node, we check whether this allows to create an identifiable + + Last review by Alexander Schlemmer on 2024-05-29. + """ for other_node in self._get_nodes_whose_identity_relies_on(node): if self._identifiable_is_needed(other_node): self._set_identifiable_of_node(other_node)