diff --git a/src/caoscrawler/crawl.py b/src/caoscrawler/crawl.py index 610bf7ca0651eed4f2a423d48d48c1d4d49c18ab..17171e22d3571a6509128733d41299fb11edfb2c 100644 --- a/src/caoscrawler/crawl.py +++ b/src/caoscrawler/crawl.py @@ -371,7 +371,7 @@ class Crawler(object): continue if se.identifiable is None: - se.identifiable = self.identifiableAdapter.new_get_identifiable( + se.identifiable = self.identifiableAdapter.get_identifiable( se, st.backward_id_referenced_by[se.uuid]) equivalent_se = st.get_equivalent(se) diff --git a/src/caoscrawler/identifiable_adapters.py b/src/caoscrawler/identifiable_adapters.py index da80f77366c6e48ed32fe2978e7e8395513885c4..3e23f153fac4f4d909698831479c8279a18a8642 100644 --- a/src/caoscrawler/identifiable_adapters.py +++ b/src/caoscrawler/identifiable_adapters.py @@ -233,7 +233,7 @@ startswith: bool, optional refs.append(val) return refs - def new_get_identifiable(self, se: SemanticEntity, identifiable_backrefs): + def get_identifiable(self, se: SemanticEntity, identifiable_backrefs): """ Retrieve the registered identifiable and fill the property values to create an identifiable. @@ -275,14 +275,21 @@ startswith: bool, optional # treated elsewhere if prop.name.lower() == "is_referenced_by": + if len(identifiable_backrefs) == 0: + raise RuntimeError( + f"Could not find referencing entities of type(s): {prop.value}\n" + f"for registered identifiable:\n{registered_identifiable}\n" + f"There were {len(identifiable_backrefs)} referencing entities to choose from.\n" + f"This error can also occur in case of merge conflicts in the referencing entities." + ) continue options = [f.get_property(prop.name) for f in se.fragments - if f.get_property(prop.name) is None] + if f.get_property(prop.name) is not None] if len(options) == 0: raise NotImplementedError( f"The following record is missing an identifying property:\n" - f"RECORD\n{se}\nIdentifying PROPERTY\n{prop.name}" + f"RECORD\n{se.fragments[0]}\nIdentifying PROPERTY\n{prop.name}" ) assert all([f.value == options[0].value for f in options]) record_prop = options[0] @@ -315,81 +322,7 @@ startswith: bool, optional logger.error(f"Error while creating identifiable for this record:\n{se}") raise - def get_identifiable(self, record: db.Record, identifiable_backrefs): - """ - Retrieve the registered identifiable and fill the property values to create an - identifiable. - - Args: - record: the record for which the Identifiable shall be created. - referencing_entities: a dictionary (Type: dict[str, list[db.Entity]]), that - allows to look up entities with a certain RecordType, that reference ``record`` - - Returns: - Identifiable, the identifiable for record. - """ - - registered_identifiable = self.get_registered_identifiable(record) - - property_name_list_A = [] - property_name_list_B = [] - identifiable_props = {} - identifiable_backrefs = [] - name_is_identifying_property = False - - if registered_identifiable is not None: - # fill the values: - for prop in registered_identifiable.properties: - if prop.name == "name": - # The name can be an identifiable, but it isn't a property - name_is_identifying_property = True - continue - # problem: what happens with multi properties? - # case A: in the registered identifiable - # case B: in the identifiable - - # treated above - if prop.name.lower() == "is_referenced_by": - continue - - record_prop = record.get_property(prop.name) - if record_prop is None: - # TODO: how to handle missing values in identifiables - # raise an exception? - # TODO: is this the appropriate error? - raise NotImplementedError( - f"The following record is missing an identifying property:\n" - f"RECORD\n{record}\nIdentifying PROPERTY\n{prop.name}" - ) - identifiable_props[record_prop.name] = record_prop.value - property_name_list_A.append(prop.name) - - # check for multi properties in the record: - for prop in property_name_list_A: - property_name_list_B.append(prop) - if (len(set(property_name_list_B)) != len(property_name_list_B) or len( - set(property_name_list_A)) != len(property_name_list_A)): - raise RuntimeError( - "Multi properties used in identifiables could cause unpredictable results and " - "are not allowed. You might want to consider a Property with a list as value.") - - # use the RecordType of the registered Identifiable if it exists - # We do not use parents of Record because it might have multiple - try: - return Identifiable( - record_id=record.id, - record_type=(registered_identifiable.parents[0].name - if registered_identifiable else None), - name=record.name if name_is_identifying_property else None, - properties=identifiable_props, - path=record.path, - backrefs=identifiable_backrefs - ) - except Exception: - logger.error(f"Error while creating identifiable for this record:\n{record}") - raise - - @ abstractmethod + @abstractmethod def retrieve_identified_record_for_identifiable(self, identifiable: Identifiable): """ Retrieve identifiable record for a given identifiable. @@ -419,7 +352,7 @@ startswith: bool, optional return self.retrieve_identified_record_for_identifiable(identifiable) - @ staticmethod + @staticmethod def referencing_entity_has_appropriate_type(parents, register_identifiable): if register_identifiable.get_property("is_referenced_by") is None: return False diff --git a/src/caoscrawler/semantic_target.py b/src/caoscrawler/semantic_target.py index cfba71fc6059762b356c341730f3c3255d0eb719..5432c55a5d1d839af9e3908f7b743dc20b2b7ac1 100644 --- a/src/caoscrawler/semantic_target.py +++ b/src/caoscrawler/semantic_target.py @@ -48,12 +48,6 @@ class SemanticEntity(): self.registered_identifiable = registered_identifiable self.uuid = uuid() - def is_unchecked(): - return len([el.id for el in self.fragments if el.id is None]) > 0 - - def is_missing(): - return len([el.id for el in self.fragments if el.id < 0]) > 0 - def identify_with(self, remote_entity): for f in self.fragments: # side effect @@ -246,8 +240,10 @@ class SemanticTarget(): of the identifying properties, this returns True, otherwise False """ - return any([ent.is_unchecked() for ent in self.forward_id_references[se.uuid]] - + [ent.is_unchecked() for ent in self.backward_id_referenced_by[se.uuid]]) + return any([id(ent) not in self._missing and id(ent) not in self._existing + for ent in self.forward_id_references[se.uuid]] + + [id(ent) not in self._missing and id(ent) not in self._existing + for ent in self.backward_id_referenced_by[se.uuid]]) def identity_relies_on_missing_entity(self, se: SemanticEntity): """ @@ -256,8 +252,8 @@ class SemanticTarget(): properties, it means that it references another Entity, where we checked whether it exists remotely and it was not found. """ - return any([ent.is_missing() for ent in self.forward_id_references[se.uuid]] - + [ent.is_missing() for ent in self.backward_id_referenced_by[se.uuid]]) + return any([id(ent) in self._missing for ent in self.forward_id_references[se.uuid]] + + [id(ent) in self._missing for ent in self.backward_id_referenced_by[se.uuid]]) @staticmethod def sanity_check(entities: list[db.Entity]): @@ -397,6 +393,7 @@ class SemanticTarget(): el.name == p.name]) > 0: forward_id_references[se.uuid].add(vse) backward_id_references[vse.uuid].add(se) + ent.parents, vse.registered_identifiable)) if IdentifiableAdapter.referencing_entity_has_appropriate_type( ent.parents, vse.registered_identifiable): forward_id_referenced_by[se.uuid].add(vse) @@ -411,18 +408,18 @@ class SemanticTarget(): checked """ for semantic_entity in self.se: assert len(semantic_entity.fragments) == 1 - entity = semantic_entity.fragments[0] + entity= semantic_entity.fragments[0] if entity.id is None and entity.path is None: continue if entity.path is not None: try: - existing = cached_get_entity_by(path=entity.path) + existing= cached_get_entity_by(path=entity.path) except EmptyUniqueQueryError: - existing = None + existing= None if existing is not None: semantic_entity.identify_with(existing) - treated_before = self.get_equivalent(semantic_entity) + treated_before= self.get_equivalent(semantic_entity) if treated_before is None: if semantic_entity.id is None: self.add_to_missing(semantic_entity) @@ -431,7 +428,7 @@ class SemanticTarget(): else: self.merge_into(semantic_entity, self.se_lookup[id(treated_before)]) - @staticmethod + @ staticmethod def bend_references_to_new_object(old, new, entities): """ Bend references to the other object Iterate over all entities in `entities` and check the values of all properties of @@ -442,16 +439,16 @@ class SemanticTarget(): if isinstance(p.value, list): for index, val in enumerate(p.value): if val is old: - p.value[index] = new + p.value[index]= new else: if p.value is old: - p.value = new + p.value= new def _add_any(self, entity: SemanticEntity, lookup): if entity.id is not None: - self._id_look_up[entity.id] = entity + self._id_look_up[entity.id]= entity if entity.path is not None: - self._path_look_up[entity.path] = entity + self._path_look_up[entity.path]= entity if entity.identifiable is not None: - self._identifiable_look_up[entity.identifiable.get_representation()] = entity - lookup[id(entity)] = entity + self._identifiable_look_up[entity.identifiable.get_representation()]= entity + lookup[id(entity)]= entity diff --git a/unittests/test_crawler.py b/unittests/test_crawler.py index e7a226d7c17d07ebc00cbc1479de83b6f9a98832..baf41d04502f7c3ffa086b8cf87b6f2180817b02 100644 --- a/unittests/test_crawler.py +++ b/unittests/test_crawler.py @@ -480,8 +480,15 @@ a: ([b1, b2]) crawler = Crawler(identifiableAdapter=ident_adapter) - crawler.synchronize(commit_changes=False, - crawled_data=[rec_a, *rec_b, *rec_c]) + st = SemanticTarget([rec_a, *rec_b, *rec_c], crawler.identifiableAdapter) + assert st.identity_relies_on_unchecked_entity(st.se[0]) is False + assert st.identity_relies_on_unchecked_entity(st.se[1]) + assert st.identity_relies_on_unchecked_entity(st.se[2]) + assert st.identity_relies_on_unchecked_entity(st.se[3]) + assert st.identity_relies_on_unchecked_entity(st.se[4]) + st.add_to_missing(st.se[0]) + assert st.identity_relies_on_unchecked_entity(st.se[1]) is False + with raises(RuntimeError) as rte: crawler.synchronize(commit_changes=False, crawled_data=[rec_a, *rec_b, *rec_c]) @@ -537,29 +544,6 @@ def test_has_missing_object_in_references(): crawler.identifiableAdapter.get_registered_identifiable.assert_called() -@ pytest.mark.xfail() -def test_references_entities_without_ids(): - crawler = Crawler() - assert not crawler._has_reference_value_without_id(db.Record().add_parent("Person") - .add_property('last_name', 123) - .add_property('first_name', 123)) - # id and rec with id - assert not crawler._has_reference_value_without_id(db.Record().add_parent("Person") - .add_property('first_name', 123) - .add_property('last_name', - db.Record(id=123))) - # id and rec with id and one unneeded prop - assert crawler._has_reference_value_without_id(db.Record().add_parent("Person") - .add_property('first_name', 123) - .add_property('stuff', db.Record()) - .add_property('last_name', db.Record(id=123))) - - # one identifying prop is missing - assert crawler._has_reference_value_without_id(db.Record().add_parent("Person") - .add_property('first_name', 123) - .add_property('last_name', db.Record())) - - def test_replace_entities_with_ids(): crawler = Crawler() a = (db.Record().add_parent("B").add_property("A", 12345) @@ -761,17 +745,16 @@ def test_split_into_inserts_and_updates_backref(crawler_mocked_for_backref_test) referenced = db.Record(name="B").add_parent("C") entlist = [referenced, db.Record(name="A").add_parent("BR").add_property("ref", referenced), ] + st = SemanticTarget([db.Record(name="B").add_parent("C")], crawler.identifiableAdapter) # Test without referencing object # currently a RuntimeError is raised if necessary properties are missing. with raises(RuntimeError): - crawler.split_into_inserts_and_updates([db.Record(name="B").add_parent("C")]) + crawler.split_into_inserts_and_updates(st) # identifiables were not yet checked - assert crawler.treated_records_lookup.get_any(entlist[1], identlist[0]) is None - assert crawler.treated_records_lookup.get_any(entlist[0], identlist[1]) is None - # one with reference, one without - assert not crawler._has_reference_value_without_id(identlist[0]) - assert crawler._has_reference_value_without_id(identlist[1]) + st = SemanticTarget(entlist, crawler.identifiableAdapter) + assert st.get_equivalent(st.se[1]) is None + assert st.get_equivalent(st.se[0]) is None # one can be found remotely, one not assert crawler.identifiableAdapter.retrieve_identified_record_for_record( identlist[0]).id == 1111 @@ -779,7 +762,7 @@ def test_split_into_inserts_and_updates_backref(crawler_mocked_for_backref_test) identlist[1]) is None # check the split... - insert, update = crawler.split_into_inserts_and_updates(deepcopy(entlist)) + insert, update = crawler.split_into_inserts_and_updates(st) # A was found remotely and is therefore in the update list assert len(update) == 1 assert update[0].name == "A" diff --git a/unittests/test_semantic_target.py b/unittests/test_semantic_target.py index 58e52a3073efcababd86333a3275e21964fe5066..300851f293c934079f3568155a767e7d1fda4557 100644 --- a/unittests/test_semantic_target.py +++ b/unittests/test_semantic_target.py @@ -254,3 +254,19 @@ def test_merge_into(): assert len(st.backward_id_referenced_by[se_b.uuid]) == 0 assert len(st.backward_id_referenced_by[se_c.uuid]) == 1 se_b in st.backward_id_referenced_by[se_c.uuid] + + +def test_backward_id_referenced_by(): + # We use the reference as identifying reference in both directions. Thus the map is the same + # for all three categories: references, id_references and id_referenced_by + ident_a = db.RecordType().add_parent("BR").add_property("name") + ident_b = db.RecordType().add_parent("C").add_property("is_referenced_by", ["BR"]) + ident_adapter = CaosDBIdentifiableAdapter() + ident_adapter.register_identifiable("BR", ident_a) + ident_adapter.register_identifiable("C", ident_b) + + referenced = db.Record(name="B").add_parent("C") + entlist = [referenced, db.Record(name="A").add_parent("BR").add_property("ref", referenced), ] + + st = SemanticTarget(entlist, ident_adapter) + assert st.se[1] in st.backward_id_referenced_by[st.se[0].uuid]