diff --git a/src/caoscrawler/crawl.py b/src/caoscrawler/crawl.py index 3b516a127f2e50020a5852276be84a81f086701c..da0ec0f4851f40a3e47256836be0e4e2e79c91e6 100644 --- a/src/caoscrawler/crawl.py +++ b/src/caoscrawler/crawl.py @@ -307,19 +307,10 @@ class Crawler(object): while entity_was_treated and len(st.unchecked) > 0: entity_was_treated = False - # For each element we try to find out whether we can find it in the server or whether - # it does not yet exist. Since a Record may reference other unkown Records it might not - # be possible to answer this right away. - for se in list(st.unchecked): - if se not in st.unchecked: # the node might have been checke in the mean time + for se in st.unchecked: + if se.identifiable is None: # we cannot yet identify this node continue - if st.identity_relies_on_unchecked_entity(se): # no remote server check possible - continue - - if se.identifiable is None: # there should be an identifiable - raise Exception("NOOOO") - # check remote server identified_record = ( st.identifiableAdapter.retrieve_identified_record_for_identifiable( @@ -331,6 +322,7 @@ class Crawler(object): # as missing st.set_id_of_node(se, remote_id) entity_was_treated = True + break # one or more nodes were just removed from st.unchecked -> back to start # This only might add properties of the postponed records to the already used ones. if len(st.unchecked) > 0: diff --git a/src/caoscrawler/identifiable_adapters.py b/src/caoscrawler/identifiable_adapters.py index 5ccff2766232b93644f647a727e74f049f3dac24..1b8525db01508018a819b72721bc829b1e304276 100644 --- a/src/caoscrawler/identifiable_adapters.py +++ b/src/caoscrawler/identifiable_adapters.py @@ -148,23 +148,35 @@ identifiabel, identifiable and identified record) for a Record. query_string = query_string[:-len(" AND ")] return query_string - def check_identifying_props(self, node): + def check_identifying_props(self, node, raise_exception=True): if node.registered_identifiable is None: - raise RuntimeError("no registered_identifiable") + if raise_exception: + raise RuntimeError("no registered_identifiable") + else: + return False for prop in node.registered_identifiable.properties: if prop.name.lower() == "is_referenced_by": continue if prop.name.lower() == "name": if node.name is None: - i = MissingIdentifyingProperty(f"The node has no name.") - i.prop = "name" - raise i + if raise_exception: + i = MissingIdentifyingProperty(f"The node has no name.") + i.prop = "name" + raise i + else: + return False else: continue if (len([el for el in node.properties if el.name.lower() == prop.name.lower()]) == 0): - i = MissingIdentifyingProperty(f"The property {prop.name} is missing.") - i.prop = prop.name + if raise_exception: + i = MissingIdentifyingProperty(f"The property {prop.name} is missing.") + i.prop = prop.name + raise i + else: + return False + + return True @staticmethod def __create_pov_snippet(pname: str, pvalue, startswith: bool = False): @@ -297,7 +309,8 @@ startswith: bool, optional raise MissingReferencingEntityError( f"Could not find referencing entities of type(s): {prop.value}\n" f"for registered identifiable:\n{registered_identifiable}\n" - f"There were {len(identifiable_backrefs)} referencing entities to choose from.\n" + f"There were {len(identifiable_backrefs) + } referencing entities to choose from.\n" f"This error can also occur in case of merge conflicts in the referencing entities." ) elif len([e.id for e in identifiable_backrefs if el.id is None]) > 0: diff --git a/src/caoscrawler/sync_graph.py b/src/caoscrawler/sync_graph.py index 91b9291f2de1a90922f2de449665af4c7839a57e..26ef7f82c2ba953f0bf1f55bff0f8ce73f3daefb 100644 --- a/src/caoscrawler/sync_graph.py +++ b/src/caoscrawler/sync_graph.py @@ -67,27 +67,43 @@ def _set_each_scalar_value(node, condition, value): class SyncGraph(): """ combines nodes in the graph based on their identity in order to create a graph of objects - that can either be inserted or update in(to) the remote server. + that can either be inserted or updated in(to) the remote server. - When additional information is added to a node of the graph, e.g. via `set_id_of_node`, then - the graph is updated accordingly: + When the SyncGraph is initialized, the properties of given entities are scanned and used to + create multiple reference maps that track how SemanticEntities reference each other. + These maps are kept up to date when SyncNodes are merged because they are identified with each + other. + + When additional information is added to the graph by setting the ID of a node + (via `set_id_of_node`) then the graph is updated accordingly: - if this information implies that the node is equivalent to another node (e.g. has same ID), then they are merged - if knowing that one node does not exist in the remote server, then this might imply that some other node also does not exist if its identity relies on the latter. - - The target entities are composed using the information of the entity fragments (db.Entity - objects) of SemanticEntities. This is information like name, parents and properties and, - importantly, references. Those references are typically given by a Python reference to some - other db.Entity object. These references are scanned initially and used to create multiple - reference maps that track how SemanticEntities reference each other. These maps are kept up to - date when SemanticEntities are merged because they are identified with each other. When - creating the final list of db.Entity objects (``export_record_lists``) the Python references - are updated according to the reference map. - - This model should only be manipulated via three functions: - - set_existing: declares that a SyncNode is existing on the remote server - - set_missing: declares that a SyncNode is NOT existing on the remote server + - The new ID might make it possible to create the identifiables of connected nodes and thus + might trigger further merging of nodes based on the new identifiables. + + The target entities are composed using the information that was collected in the node. + This is information like name, parents and properties and, + importantly, references and other properties. + + This model should only be manipulated via one function: + - set_id_of_node: a positive integer means the Entity exists, None means it is missing + TODO what about String IDs + + Usage: + - Initialize the Graph with a list of entities. Those will be converted to the SyncNodes of the + graph. + - SyncNodes that can be merged are automatically merged and SyncNodes where the existance can + be determined are automatically removed from the list of unchecked SyncNodes: + graph.unchecked. + - You manipulate the graph by setting the ID of a SyncNode (either to a valid ID or to None). + For example, you can check whether a SyncNode has an identifiable and then query the remote + server and use the result to set the ID. + - After each manipulation, the graph updates accordingly (see above) + - Ideally, the unchecked list is empty after some manipulation. + - You can export a list of entities to be inserted and one of entities to be updated with + export_record_lists. """ def __init__(self, entities: List[db.Entity], identifiableAdapter): @@ -106,6 +122,7 @@ class SyncGraph(): self.unchecked = list(self.nodes) # initialize reference mappings + # TODO only use python IDs for maps ( self.forward_references, self.backward_references, @@ -117,13 +134,10 @@ class SyncGraph(): self._mark_entities_with_path_or_id() for node in list(self.nodes): - try: - identifiable = self.identifiableAdapter.get_identifiable( - node, self.backward_id_referenced_by[node.uuid]) - self.set_identifiable_of_node(node, identifiable) - except Exception as es: - print(es) - pass + if self._identifiable_is_needed(node): + self._set_identifiable_of_node(node) + for node in self.unchecked: + self.identifiableAdapter.check_identifying_props(node) def set_id_of_node(self, node: SyncNode, node_id: Optional[str] = None): """sets the ID attribute of the given SyncNode. If node_id is None, a negative Id will be @@ -139,33 +153,20 @@ class SyncGraph(): else: self._id_look_up[node.id] = node if node.id < 0: - self._treat_missing(node) + self._mark_missing(node) else: - self._treat_existing(node) - - def _get_new_id(self): - self._remote_missing_counter -= 1 - return self._remote_missing_counter - - def set_identifiable_of_node(self, node: SyncNode, identifiable: Identifiable): - node.identifiable = identifiable - equivalent_se = self.get_equivalent(node) - if equivalent_se is not None and equivalent_se is not node: - self._merge_into(node, equivalent_se) - assert equivalent_se.identifiable is not None - else: - assert node.identifiable.get_representation() not in self._identifiable_look_up - self._identifiable_look_up[node.identifiable.get_representation()] = node + self._mark_existing(node) def export_record_lists(self): + if len(self.unchecked) > 1: + self.unchecked_contains_circular_dependency() + entities = [] node_map = {} for el in self.nodes: entities.append(el.export_entity()) node_map[id(el)] = entities[-1] - if len(self.unchecked) > 1: - self.unchecked_contains_circular_dependency() for ent in entities: _set_each_scalar_value(ent, condition=lambda val: isinstance(val, SyncNode), @@ -183,7 +184,7 @@ class SyncGraph(): return (missing, existing) - def identity_relies_on_unchecked_entity(self, node: SyncNode): + def _identity_relies_on_unchecked_entity(self, node: SyncNode): """ If a record for which it could not yet be verified whether it exists in LA or not is part of the identifying properties, this returns True, otherwise False @@ -259,12 +260,35 @@ class SyncGraph(): # else: # return None + def _get_new_id(self): + self._remote_missing_counter -= 1 + return self._remote_missing_counter + + def _set_identifiable_of_node(self, node: SyncNode, + identifiable: Optional[Identifiable] = None): + if identifiable is None: + self.identifiableAdapter.check_identifying_props(node) + identifiable = self.identifiableAdapter.get_identifiable( + node, self.backward_id_referenced_by[node.uuid]) + node.identifiable = identifiable + equivalent_se = self.get_equivalent(node) + if equivalent_se is not None and equivalent_se is not node: + self._merge_into(node, equivalent_se) + assert equivalent_se.identifiable is not None + else: + assert node.identifiable.get_representation() not in self._identifiable_look_up + self._identifiable_look_up[node.identifiable.get_representation()] = node + @staticmethod def _sanity_check(entities: List[db.Entity]): for ent in entities: if ent.role == "Record" and len(ent.parents) == 0: raise RuntimeError(f"Records must have a parent.\n{ent}") + def _get_nodes_whose_identity_relies_on(self, node: SyncNode): + return (self.backward_id_references[node.uuid].union( + self.forward_id_referenced_by[node.uuid])) + @staticmethod def _create_flat_list(ent_list: List[db.Entity], flat: Optional[List[db.Entity]] = None): """ @@ -360,7 +384,7 @@ class SyncGraph(): self._merge_into(node, self.get_equivalent(node)) else: self._id_look_up[node.id] = node - self._treat_existing(node) + self._mark_existing(node) for node in list(self.nodes): if node.path is not None: @@ -382,9 +406,7 @@ class SyncGraph(): """ A path or an ID is sufficiently identifying. Thus, those entities can be marked as checked """ for node in list(self.nodes[::-1]): - if "nonidentifiable" in [p.name for p in - node.registered_identifiable.properties]: - + if "nonidentifiable" in [p.name for p in node.registered_identifiable.properties]: self.unchecked.remove(node) def _merge_into(self, source: SyncNode, target: SyncNode): @@ -394,7 +416,17 @@ class SyncGraph(): In any case, references are bent to the newrecord object. """ + # sanity checks assert source is not target + if target.id is None and source.id is not None: + assert self._id_look_up[source.id] == source, ( + "It is assumed that always only one node exists with a certain ID and that node is" + " in the look up") + if target.path is None and source.path is not None: + assert self._id_look_up[source.path] == source, ( + "It is assumed that always only one node exists with a certain path and that node " + "is in the look up") + target.update(source) # update reference mappings @@ -443,17 +475,25 @@ class SyncGraph(): self._path_look_up[target.path] = target # due to the merge it might now be possible to create an identifiable - if (target.identifiable is None and not self.identity_relies_on_unchecked_entity(target)): - try: - identifiable = self.identifiableAdapter.get_identifiable( - target, self.backward_id_referenced_by[target.uuid]) - self.set_identifiable_of_node(target, identifiable) - except Exception as es: - pass + if self._identifiable_is_needed(target): + self._set_identifiable_of_node(target) if id(source) in self._missing and id(target) not in self._missing: - self._treat_missing(target) + self._mark_missing(target) if id(source) in self._existing and id(target) not in self._existing: - self._treat_existing(target) + self._mark_existing(target) + + # This is one of three cases that affect other nodes: + # - mark existing + # - mark missing + # - merge + # For each dependend node, we check whether this allows to create an identifiable + for other_node in self._get_nodes_whose_identity_relies_on(target): + if self._identifiable_is_needed(other_node): + self._set_identifiable_of_node(other_node) + + def _identifiable_is_needed(self, node): + return (node.identifiable is None and not self._identity_relies_on_unchecked_entity(node) + and self.identifiableAdapter.check_identifying_props(node, raise_exception=False)) def _initialize_nodes(self, entities): """ create initial set of SemanticEntities from provided Entity list""" @@ -466,37 +506,42 @@ class SyncGraph(): self.identifiableAdapter.get_registered_identifiable(el))) se_lookup[id(el)] = self.nodes[-1] for node in self.nodes: + # replace db.Entity objects with SyncNodes in references _set_each_scalar_value(node, condition=lambda val: id(val) in se_lookup, value=lambda val: se_lookup[id(val)]) - def _treat_missing(self, node): + def _mark_missing(self, node): self._missing[id(node)] = node self.unchecked.remove(node) - for other_node in (self.backward_id_references[node.uuid].union( - self.forward_id_referenced_by[node.uuid])): - try: - identifiable = self.identifiableAdapter.get_identifiable( - other_node, self.backward_id_referenced_by[other_node.uuid]) - self.set_identifiable_of_node(other_node, identifiable) - except Exception as es: - pass + # This is one of three cases that affect other nodes: + # - mark existing + # - mark missing + # - merge + # For each dependend node, we check whether this allows to create an identifiable + for other_node in self._get_nodes_whose_identity_relies_on(node): + if self._identifiable_is_needed(other_node): + self._set_identifiable_of_node(other_node) + # For each dependend node, we set the ID to None (missing) + for other_node in self._get_nodes_whose_identity_relies_on(node): if other_node in self.unchecked: + print(f"set\n{other_node}\n to missing due to missing\n{node}") self.set_id_of_node(other_node) - def _treat_existing(self, node): - assert node.id > 0 + def _mark_existing(self, node): + if node.id <= 0: + raise ValueError("ID must be positive for existing entities") self._existing[id(node)] = node self.unchecked.remove(node) - for other_node in (self.backward_id_references[node.uuid].union( - self.forward_id_referenced_by[node.uuid])): - try: - identifiable = self.identifiableAdapter.get_identifiable( - other_node, self.backward_id_referenced_by[other_node.uuid]) - self.set_identifiable_of_node(other_node, identifiable) - except Exception as es: - pass + # This is one of three cases that affect other nodes: + # - mark existing + # - mark missing + # - merge + # For each dependend node, we check whether this allows to create an identifiable + for other_node in self._get_nodes_whose_identity_relies_on(node): + if self._identifiable_is_needed(other_node): + self._set_identifiable_of_node(other_node) def __repr__(self): return f"SyncNode with ID={self.id}" diff --git a/unittests/test_crawler.py b/unittests/test_crawler.py index 90726f7756333abd8c71ae90c91018cf25fc00f1..0d4bb4209b9a881adce096ce9084a8b3d8ceaf82 100644 --- a/unittests/test_crawler.py +++ b/unittests/test_crawler.py @@ -282,10 +282,8 @@ def test_split_into_inserts_and_updates_trivial(): def test_split_into_inserts_and_updates_unidentified(crawler_mocked_identifiable_retrieve): crawler = crawler_mocked_identifiable_retrieve - st = SyncGraph([db.Record().add_parent("someparent")], - crawler.identifiableAdapter) with raises(MissingIdentifyingProperty) as err: - crawler.split_into_inserts_and_updates(st) + st = SyncGraph([db.Record().add_parent("someparent")], crawler.identifiableAdapter) assert str(err.value).startswith("The node has no name.") @@ -484,11 +482,11 @@ a: ([b1, b2]) st = SyncGraph(deepcopy([rec_a, *rec_b, *rec_c]), crawler.identifiableAdapter) for node in st.nodes: print(node.id, node.parents) - assert st.identity_relies_on_unchecked_entity(st.nodes[0]) is False - assert st.identity_relies_on_unchecked_entity(st.nodes[1]) - assert st.identity_relies_on_unchecked_entity(st.nodes[2]) - assert st.identity_relies_on_unchecked_entity(st.nodes[3]) - assert st.identity_relies_on_unchecked_entity(st.nodes[4]) + assert st._identity_relies_on_unchecked_entity(st.nodes[0]) is False + assert st._identity_relies_on_unchecked_entity(st.nodes[1]) + assert st._identity_relies_on_unchecked_entity(st.nodes[2]) + assert st._identity_relies_on_unchecked_entity(st.nodes[3]) + assert st._identity_relies_on_unchecked_entity(st.nodes[4]) print("SEEEEEEEEEEEEEEEET") print(st.nodes[1].properties) print(st.nodes[2].properties) @@ -546,16 +544,16 @@ def test_synchronization_no_commit(upmock, insmock): assert len(ups) == 1 -@ patch("caoscrawler.crawl.cached_get_entity_by", - new=Mock(side_effect=mock_get_entity_by)) -@ patch("caoscrawler.identifiable_adapters.cached_get_entity_by", - new=Mock(side_effect=mock_get_entity_by)) -@ patch("caoscrawler.identifiable_adapters.CaosDBIdentifiableAdapter." - "retrieve_identified_record_for_identifiable", - new=Mock(side_effect=mock_retrieve_record)) -@ patch("caoscrawler.crawl.db.Container.insert") -@ patch("caoscrawler.crawl.db.Container.update") -@ patch("caoscrawler.crawl.UpdateCache.insert") +@patch("caoscrawler.crawl.cached_get_entity_by", + new=Mock(side_effect=mock_get_entity_by)) +@patch("caoscrawler.identifiable_adapters.cached_get_entity_by", + new=Mock(side_effect=mock_get_entity_by)) +@patch("caoscrawler.identifiable_adapters.CaosDBIdentifiableAdapter." + "retrieve_identified_record_for_identifiable", + new=Mock(side_effect=mock_retrieve_record)) +@patch("caoscrawler.crawl.db.Container.insert") +@patch("caoscrawler.crawl.db.Container.update") +@patch("caoscrawler.crawl.UpdateCache.insert") def test_security_mode(updateCacheMock, upmock, insmock): # trivial case: nothing to do crawled_data = [r.copy() for r in EXAMPLE_SERVER_STATE if r.role == "Record"] @@ -704,11 +702,10 @@ def test_split_into_inserts_and_updates_backref(crawler_mocked_for_backref_test) referenced = db.Record(name="B").add_parent("C") entlist = [referenced, db.Record(name="A").add_parent("BR").add_property("ref", referenced), ] - st = SyncGraph([db.Record(name="B").add_parent("C")], crawler.identifiableAdapter) # Test without referencing object # currently a RuntimeError is raised if necessary properties are missing. with raises(MissingReferencingEntityError): - crawler.split_into_inserts_and_updates(st) + st = SyncGraph([db.Record(name="B").add_parent("C")], crawler.identifiableAdapter) # identifiables were not yet checked st = SyncGraph(entlist, crawler.identifiableAdapter) @@ -1016,48 +1013,6 @@ def test_replace_name_with_referenced_entity(): assert caoscrawler.crawl.cached_get_entity_by.call_count == 3 -def test_treated_record_lookup(): - ident_adapter = CaosDBIdentifiableAdapter() - trlu = SyncGraph([db.Record().add_parent( - 'A'), db.Record().add_parent('A'), db.File()], ident_adapter) - exist = trlu.nodes[0] - miss = trlu.nodes[1] - fi = trlu.nodes[2] - trlu.set_id_of_node(exist, 1) - assert len(trlu._existing) == 1 - # was added to existing - assert trlu._existing[id(exist)] is exist - # is in ID lookup - assert trlu._id_look_up[exist.id] is exist - - trlu.set_identifiable_of_node(miss, Identifiable(name='a')) - trlu.set_id_of_node(miss) - # was added to missing - assert trlu._missing[id(miss)] is miss - # is in ident lookup - assert trlu._identifiable_look_up[miss.identifiable.get_representation()] is miss - -# fi.path = 'a' -# trlu.set_id_of_node(fi, 2) -# assert len(trlu._existing) == 2 -# # was added to existing -# assert trlu._existing[id(fi)] is fi -# # is in ID lookup -# assert trlu._id_look_up[fi.id] is fi -# # is in path lookup -# assert trlu._path_look_up[fi.path] is fi -# -# all_miss, all_exi = trlu.create_record_lists() -# assert fi.fragments[0] in all_exi -# assert exist.fragments[0] in all_exi -# assert miss.fragments[0] in all_miss -# -# # If a Record was added using the ID, the ID must be used to identify it even though later an -# # identifiable may be passed as well -# exist.identifiable = Identifiable(name='b') -# assert trlu.get_equivalent(exist) is exist - - def test_merge_entity_with_identifying_reference(crawler_mocked_identifiable_retrieve): # When one python object representing a record is merged into another python object # representing the same record, the former object can be forgotten and references from it to diff --git a/unittests/test_sync_graph.py b/unittests/test_sync_graph.py index 30f65d6c15ce19558fbe1ad1e8b503f276697470..7927c705dc3677c0a92a6eb6e8967ca0eb8c309a 100644 --- a/unittests/test_sync_graph.py +++ b/unittests/test_sync_graph.py @@ -49,6 +49,9 @@ def simple_adapter(): ident_adapter.register_identifiable( "RT4", db.RecordType().add_parent("RT4").add_property("RT3")) + ident_adapter.register_identifiable( + "RT5", + db.RecordType().add_parent("RT5").add_property("name")) return ident_adapter @@ -117,7 +120,7 @@ def test_create_reference_mapping(): def test_SyncGraph(): - a = db.Record().add_parent("A") + a = db.Record(id=101).add_parent("A") ident_a = db.RecordType().add_parent("A").add_property("prop_ident") ident_adapter = CaosDBIdentifiableAdapter() ident_adapter.register_identifiable("A", ident_a) @@ -129,13 +132,16 @@ def test_merge_into_trivial(simple_adapter): # b # (a reference c; b does not reference anything; a & b have the same target record) c = db.Record(name='c').add_parent("RT2") - a = db.Record().add_parent("RT1").add_property('RT2', c) - b = db.Record().add_parent("RT1") + a = db.Record(name='a').add_parent("RT1").add_property('RT2', c) + b = db.Record(id=101).add_parent("RT1") st = SyncGraph([a, b], simple_adapter) se_a = st.nodes[0] se_b = st.nodes[1] se_c = st.nodes[2] + assert se_a.name is 'a' + assert se_b.id is 101 + assert se_c.name is 'c' # CHECK REFERENCE MAP (before merge): # c is referenced by a @@ -166,7 +172,7 @@ def test_merge_into_trivial(simple_adapter): assert len(st.backward_id_referenced_by[se_c.uuid]) == 1 se_a in st.backward_id_referenced_by[se_c.uuid] - st._merge_into(se_a, se_b) + st.set_id_of_node(se_a, 101) # CHECK REFERENCE MAP (after merge): # c is now referenced by b @@ -295,7 +301,7 @@ def test_backward_id_referenced_by(): def test_set_id_of_node(simple_adapter): # setting the id should lead to the node being marked as existing - ent_list = [db.Record().add_parent("RT1")] + ent_list = [db.Record(name='a').add_parent("RT5")] st = SyncGraph(ent_list, simple_adapter) assert len(st.nodes) == 1 assert len(st.unchecked) == 1 @@ -318,8 +324,8 @@ def test_set_id_of_node(simple_adapter): # setting the id to one that already exists should lead to a merge ent_list = [ - db.Record(id=101).add_parent("RT1"), - db.Record().add_parent("RT1").add_property(name="a", value=1)] + db.Record(id=101).add_parent("RT5"), + db.Record(name='a').add_parent("RT5").add_property(name="RT2", value=1)] st = SyncGraph(ent_list, simple_adapter) assert len(st.nodes) == 2 assert len(st.unchecked) == 1 @@ -638,3 +644,18 @@ def test_export_node(): assert len(p.value) == len(exp.get_property(p.name).value) assert len(exp.properties) == len(rec_a.properties) assert len(exp.parents) == len(rec_a.parents) + + +# TODO create test that tests the assumptions after initialization: +# - no two (or more) nodes with the same id +# - no two (or more) nodes with the same path +# - nodes with id or path are not in unchecked +# - every node with id or path is in the corresponding look ups +# - every node that does not rely on something unchecked has an identifiable + + +# TODO create test that tests whether a cascading information application is fully applied +# TODO create test that tests whether setting something to missing allows to create an identifiable +# and thus a merge + +# TODO test that msising identifying props are reported after initialization