diff --git a/src/caoscrawler/sync_graph.py b/src/caoscrawler/sync_graph.py index bacfde80c339cda85eaceb486bc6961ba44bf123..93ca4902343427250ea126e3c3462f69222db321 100644 --- a/src/caoscrawler/sync_graph.py +++ b/src/caoscrawler/sync_graph.py @@ -506,13 +506,18 @@ class SyncGraph: """A path or an ID is sufficiently identifying. Thus, those entities can be marked as checked + When this function returns, there is only one node for each ID (i.e. no two nodes with the + same ID). The same is true for paths. + + This function also updates _id_look_up and _path_look_up + Last review by Alexander Schlemmer on 2024-05-29. """ for node in list(self.nodes): if node.id is not None: eq_node = self.get_equivalent(node) if eq_node is not None: - self._merge_into(node, eq_node) + self._basic_merge_into(node, eq_node) else: self._id_look_up[node.id] = node self._mark_existing(node) @@ -521,7 +526,7 @@ class SyncGraph: if node.path is not None: eq_node = self.get_equivalent(node) if eq_node is not None: - self._merge_into(node, eq_node) + self._basic_merge_into(node, eq_node) else: self._path_look_up[node.path] = node try: @@ -531,39 +536,19 @@ class SyncGraph: remote_id = None if existing is not None: remote_id = existing.id - self._path_look_up[node.path] = node self.set_id_of_node(node, remote_id) - def _merge_into(self, source: SyncNode, target: SyncNode): - """tries to merge source into target and performs the necessary updates: - - update the member variables of target using source (``target.update(source)``). - - replaces reference values to source by target - - updates the reference map - - updates lookup tables - - removes source from node lists - - marks target as missing/existing if source was marked that way - - adds an identifiable if now possible (e.g. 
merging based on ID might allow create an - identifiable when none of the two nodes had the sufficient properties on its own before) - - check whether dependent nodes can now get an identifiable (the merge might have set the - ID such that dependent nodes can now create an identifiable) + def _basic_merge_into(self, source: SyncNode, target: SyncNode): + """tries to merge source into target and updates member variables - Last review by Alexander Schlemmer on 2024-05-29. + - reference maps are updated + - self.nodes is updated + - self.unchecked is updated + - lookups are being updated """ # sanity checks if source is target: raise ValueError("source must not be target") - if target.id is None and source.id is not None: - if self._id_look_up[source.id] != source: - raise ValueError( - "It is assumed that always only one node exists with a certain ID and that " - "node is in the look up" - ) - if target.path is None and source.path is not None: - if self._id_look_up[source.path] != source: - raise ValueError( - "It is assumed that always only one node exists with a certain path and that" - " node is in the look up" - ) target.update(source) @@ -575,12 +560,13 @@ class SyncGraph: # update reference mappings for setA, setB in ( - (self.forward_references, self.backward_references), - (self.backward_references, self.forward_references), - (self.forward_references_id_props, self.backward_references_id_props), - (self.backward_references_id_props, self.forward_references_id_props), - (self.forward_references_backref, self.backward_references_backref), - (self.backward_references_backref, self.forward_references_backref),): + (self.forward_references, self.backward_references), # ref: source -> other + (self.backward_references, self.forward_references), # ref: other -> source + (self.forward_references_id_props, self.backward_references_id_props), + (self.backward_references_id_props, self.forward_references_id_props), + (self.forward_references_backref, 
self.backward_references_backref), + (self.backward_references_backref, self.forward_references_backref), + ): for node in setA.pop(id(source)): setA[id(target)].add(node) setB[id(node)].remove(source) @@ -596,9 +582,24 @@ class SyncGraph: if target.path is not None: self._path_look_up[target.path] = target if target.identifiable is not None: - self._identifiable_look_up[target.identifiable.get_representation()] = ( - target - ) + self._identifiable_look_up[target.identifiable.get_representation()] = target + + def _merge_into(self, source: SyncNode, target: SyncNode): + """tries to merge source into target and performs the necessary updates: + - updates the member variables of target using source (``target.update(source)``). + - replaces reference values to source by target + - updates the reference map + - updates lookup tables + - removes source from node lists + - marks target as missing/existing if source was marked that way + - adds an identifiable if now possible (e.g. merging based on ID might allow creating an + identifiable when neither of the two nodes had sufficient properties on its own before) + - checks whether dependent nodes can now get an identifiable (the merge might have set the + ID such that dependent nodes can now create an identifiable) + + Last review by Alexander Schlemmer on 2024-05-29. + """ + self._basic_merge_into(source, target) if (id(source) in self._existing and id(target) in self._missing) or ( id(target) in self._existing and id(source) in self._missing @@ -607,7 +608,7 @@ class SyncGraph: if id(source) in self._missing and id(target) not in self._missing: self._mark_missing(target) - if id(source) in self._existing and id(target) not in self._existing: + elif id(source) in self._existing and id(target) not in self._existing: self._mark_existing(target) # due to the merge it might now be possible to create an identifiable