diff --git a/src/caoscrawler/sync_graph.py b/src/caoscrawler/sync_graph.py index 89ad3fb08e262d88a5dfe47beaf55e046391fc2c..e345ca01150a2ab87be0dadbd39aecb0df4fd99f 100644 --- a/src/caoscrawler/sync_graph.py +++ b/src/caoscrawler/sync_graph.py @@ -193,8 +193,9 @@ class SyncGraph(): one with those that haven't, an error is raised if there are any SyncNodes without an (possibly negative) ID. """ - if len(self.unchecked) > 1: - self.unchecked_contains_circular_dependency() + # TODO reactivate once the implementation is appropriate + # if len(self.unchecked) > 1: + # self.unchecked_contains_circular_dependency() for el in self.nodes: if el.id is None: @@ -239,6 +240,10 @@ class SyncGraph(): TODO: for the sake of detecting problems for split_into_inserts_and_updates we should only consider references that are identifying properties. """ + raise NotImplementedError("This function is not yet properly implemented") + # TODO if the first element is not part of the circle, then + # this will not work + # We must create a better implementation (see also TODO in docstring) circle = [self.unchecked[0]] closed = False while not closed: @@ -258,6 +263,8 @@ class SyncGraph(): Return an equivalent SyncNode. Equivalent means that ID, path or identifiable are the same. + + Returns None if no equivalent node is found. """ if entity.id is not None and entity.id in self._id_look_up: candidate = self._id_look_up[entity.id] @@ -275,11 +282,17 @@ class SyncGraph(): return None def _get_new_id(self): + """ returns the next unused temporary ID""" self._remote_missing_counter -= 1 return self._remote_missing_counter def _set_identifiable_of_node(self, node: SyncNode, identifiable: Optional[Identifiable] = None): + """sets the identifiable and checks whether an equivalent node can be found with that new + information.
+ + if no identifiable is given, the identifiable is retrieved from the identifiable adapter + """ if identifiable is None: self.identifiableAdapter.check_identifying_props(node) identifiable = self.identifiableAdapter.get_identifiable( @@ -300,6 +313,10 @@ class SyncGraph(): raise RuntimeError(f"Records must have a parent.\n{ent}") def _get_nodes_whose_identity_relies_on(self, node: SyncNode): + """returns a set of nodes that reference the given node as identifying property or are + referenced by the given node and the parent of the given node is listed as + "is_referenced_by" + """ return (self.backward_id_references[id(node)].union( self.forward_id_referenced_by[id(node)])) @@ -336,17 +353,17 @@ class SyncGraph(): @staticmethod def _create_reference_mapping(flat: list[SyncNode]): """ - TODO update docstring - Create a dictionary of dictionaries of the form: - dict[int, dict[str, list[Union[int,None]]]] - - - The integer index is the Python id of the value object. - - The string is the name of the first parent of the referencing object. - - Each value objects is taken from the values of all properties from the list flat. - - So the returned mapping maps ids of entities to the ids of objects which are referring - to them. + Create six dictionaries that describe references among SyncNodes. All dictionaries use the + Python ID of SyncNodes as keys. + There is always one dictionary to describe the direction of the reference, i.e. + map[id(node)] -> other where other is a set of SyncNodes that are being referenced by node. + And then there is always one dictionary for the inverse direction. The two dictionaries are + named "forward_" and "backward_", respectively. 
+ + Then there are three kinds of maps being generated: One includes all references + ("_references"), one includes references that are values of identifying properties + ("_id_references") and one includes references that are relevant for identifying + backreferences/"is_referenced_by" ("_id_referenced_by"). """ # TODO we need to treat children of RecordTypes somehow. forward_references: dict[int, set[SyncNode]] = {} @@ -425,35 +442,46 @@ class SyncGraph(): self._nonidentifiable[id(node)] = node def _merge_into(self, source: SyncNode, target: SyncNode): - """ FIXME tries to merge record into newrecord - - If it fails, record is added to the try_to_merge_later list. - In any case, references are bent to the newrecord object. - + """ tries to merge source into target and performs the necessary updates: + - update the member variables of target using source (``target.update(source)``). + - replaces reference values to source by target + - updates the reference map + - updates lookup tables + - removes source from node lists + - marks target as missing/existing if source was marked that way + - adds an identifiable if now possible (e.g.
merging based on ID might allow creating an + identifiable when neither of the two nodes had sufficient properties on its own before) + - check whether dependent nodes can now get an identifiable (the merge might have set the + ID such that dependent nodes can now create an identifiable) """ # sanity checks - assert source is not target + if source is target: + raise ValueError("source must not be target") if target.id is None and source.id is not None: - assert self._id_look_up[source.id] == source, ( - "It is assumed that always only one node exists with a certain ID and that node is" - " in the look up") + if self._id_look_up[source.id] != source: + raise ValueError( + "It is assumed that always only one node exists with a certain ID and that " + "node is in the look up") if target.path is None and source.path is not None: - assert self._id_look_up[source.path] == source, ( - "It is assumed that always only one node exists with a certain path and that node " - "is in the look up") + if self._id_look_up[source.path] != source: + raise ValueError( + "It is assumed that always only one node exists with a certain path and that" + " node is in the look up") target.update(source) + # replace actual reference property values (no pop: entry is consumed below) + for node in self.backward_references[id(source)]: + _set_each_scalar_value(node, + condition=lambda val: val is source, + value=lambda val: target) + # update reference mappings for node in self.forward_references.pop(id(source)): self.forward_references[id(target)].add(node) self.backward_references[id(node)].remove(source) self.backward_references[id(node)].add(target) for node in self.backward_references.pop(id(source)): - # replace actual reference property values - _set_each_scalar_value(node, - condition=lambda val: val is source, - value=lambda val: target) self.backward_references[id(target)].add(node) self.forward_references[id(node)].remove(source) self.forward_references[id(node)].add(target) @@ -515,8 +543,8 @@ class SyncGraph():
def _initialize_nodes(self, entities: list[db.Entity]): """ create initial set of SyncNodes from provided Entity list""" - entities = self._create_flat_list(entities) self._sanity_check(entities) + entities = self._create_flat_list(entities) se_lookup: dict[int, SyncNode] = {} # lookup: python id -> SyncNode for el in entities: self.nodes.append(SyncNode(