diff --git a/src/caoscrawler/crawl.py b/src/caoscrawler/crawl.py index 935db0a7e9c8320eb89011e935c1e2547b766c17..e53bbbe892272b67ee90e04dfbd20e328f195c5d 100644 --- a/src/caoscrawler/crawl.py +++ b/src/caoscrawler/crawl.py @@ -538,7 +538,7 @@ class Crawler(object): Crawler.create_flat_list([p.value], flat) return flat - def _has_missing_object_in_references(self, ident: Identifiable, referencing_entities: list): + def _has_missing_object_in_references(self, ident: Identifiable, referencing_entities: dict): """ returns False if any value in the properties attribute is a db.Entity object that is contained in the `remote_missing_cache`. If ident has such an object in @@ -551,20 +551,21 @@ class Crawler(object): # Entity instead of ID and not cached locally if (isinstance(pvalue, list)): for el in pvalue: - if (isinstance(el, db.Entity) and self.treated_records_lookup.get_missing( - el, - self.identifiableAdapter.get_identifiable( - el, referencing_entities)) is not None): + elident = self.identifiableAdapter.get_identifiable( + el, referencing_entities[id(el)]) + if (isinstance(el, db.Entity) + and self.treated_records_lookup.get_missing(el, elident) is not None): return True if (isinstance(pvalue, db.Entity) and self.treated_records_lookup.get_missing( pvalue, - self.identifiableAdapter.get_identifiable(pvalue, referencing_entities) + self.identifiableAdapter.get_identifiable(pvalue, + referencing_entities[id(pvalue)]) ) is not None): # might be checked when reference is resolved return True return False - def replace_references_with_cached(self, record: db.Record, referencing_entities: list): + def replace_references_with_cached(self, record: db.Record, referencing_entities: dict): """ Replace all references with the versions stored in the cache. @@ -577,7 +578,8 @@ class Crawler(object): if (isinstance(el, db.Entity) and el.id is None): cached = self.treated_records_lookup.get_any( el, - self.identifiableAdapter.get_identifiable(el, referencing_entities)) + self.identifiableAdapter.get_identifiable( + el, referencing_entities[id(el)])) if cached is None: lst.append(el) continue @@ -598,8 +600,9 @@ class Crawler(object): lst.append(el) p.value = lst if (isinstance(p.value, db.Entity) and p.value.id is None): - cached = self.treated_records_lookup.get_any(p.value, - self.identifiableAdapter.get_identifiable(p.value, referencing_entities)) + cached = self.treated_records_lookup.get_any( + p.value, self.identifiableAdapter.get_identifiable( + p.value, referencing_entities[id(p.value)])) if cached is None: continue if not check_identical(cached, p.value, True): @@ -658,23 +661,44 @@ class Crawler(object): entities=all_records ) + def _identity_relies_on_unchecked_entities(self, record: db.Record, referencing_entities): + """ + If a record for which it could not yet be verified whether it exists in LA or not is part + of the identifying properties, this returns True, otherwise False + """ + + registered_identifiable = self.identifiableAdapter.get_registered_identifiable(record) + refs = self.identifiableAdapter.get_identifying_referencing_entities(referencing_entities, + registered_identifiable) + if any([el is None for el in refs]): + return True + + refs = self.identifiableAdapter.get_identifying_referenced_entities( + record, registered_identifiable) + if any([self.treated_records_lookup.get_any(el) is None for el in refs]): + return True + + return False + @staticmethod def create_reference_mapping(flat: list[db.Entity]): """ Create a dictionary of dictionaries of the form: - dict[int, dict[str, list[db.Entity]]] + dict[int, dict[str, list[Union[int,None]]]] - The integer index is the Python id of the value object. - The string is the name of the first parent of the referencing object. Each value objects is taken from the values of all properties from the list flat. - So the returned mapping maps ids of entities to the objects which are referring + So the returned mapping maps ids of entities to the ids of objects which are referring to them. """ # TODO we need to treat children of RecordTypes somehow. - references: dict[int, dict[str, list[db.Entity]]] = {} + references: dict[int, dict[str, list[Union[int, None]]]] = {} for ent in flat: + if id(ent) not in references: + references[id(ent)] = {} for p in ent.properties: val = p.value if not isinstance(val, list): @@ -685,7 +709,7 @@ class Crawler(object): references[id(v)] = {} if ent.parents[0].name not in references[id(v)]: references[id(v)][ent.parents[0].name] = [] - references[id(v)][ent.parents[0].name].append(ent) + references[id(v)][ent.parents[0].name].append(ent.id) return references @@ -748,9 +772,14 @@ class Crawler(object): # 3. Does it have to be new since a needed reference is missing? for i in reversed(range(len(flat))): record = flat[i] + + if self._identity_relies_on_unchecked_entities(record, + referencing_entities[id(record)]): + continue + identifiable = self.identifiableAdapter.get_identifiable( record, - referencing_entities=referencing_entities) + referencing_entities=referencing_entities[id(record)]) # 1. Is it in the cache of already checked Records? if self.treated_records_lookup.get_any(record, identifiable) is not None: @@ -796,7 +825,7 @@ class Crawler(object): for record in try_to_merge_later: identifiable = self.identifiableAdapter.get_identifiable( record, - referencing_entities=referencing_entities) + referencing_entities=referencing_entities[id(record)]) newrecord = self.treated_records_lookup.get_any(record, identifiable) merge_entities(newrecord, record, merge_id_with_resolved_entity=True) if len(flat) > 0: @@ -834,7 +863,7 @@ class Crawler(object): if val.id is not None: el.value[index] = val.id - @staticmethod + @ staticmethod def compact_entity_list_representation(circle): """ a more readable representation than the standard xml representation @@ -850,7 +879,7 @@ class Crawler(object): return text + "--------\n" - @staticmethod + @ staticmethod def detect_circular_dependency(flat: list[db.Entity]): """ Detects whether there are circular references in the given entity list and returns a list @@ -883,7 +912,7 @@ class Crawler(object): return None return circle - @staticmethod + @ staticmethod def _merge_properties_from_remote( crawled_data: list[db.Record], identified_records: list[db.Record] @@ -925,7 +954,7 @@ class Crawler(object): return to_be_updated - @staticmethod + @ staticmethod def remove_unnecessary_updates( crawled_data: list[db.Record], identified_records: list[db.Record] @@ -951,7 +980,7 @@ class Crawler(object): return actual_updates - @staticmethod + @ staticmethod def execute_parent_updates_in_list(to_be_updated, securityMode, run_id, unique_names): """ Execute the updates of changed parents. @@ -994,13 +1023,13 @@ class Crawler(object): "mode. This might lead to a failure of inserts that follow.") logger.info(parent_updates) - @staticmethod + @ staticmethod def _get_property_id_for_datatype(rtname: str, name: str): return cached_get_entity_by( query=f"FIND Entity '{escape_squoted_text(rtname)}' " f"with name='{escape_squoted_text(name)}'").id - @staticmethod + @ staticmethod def replace_name_with_referenced_entity_id(prop: db.Property): """changes the given property in place if it is a reference property that has a name as value @@ -1045,7 +1074,7 @@ class Crawler(object): propval.append(el) prop.value = propval - @staticmethod + @ staticmethod def execute_inserts_in_list(to_be_inserted, securityMode, run_id: Optional[uuid.UUID] = None, unique_names=True): @@ -1065,7 +1094,7 @@ class Crawler(object): update_cache = UpdateCache() update_cache.insert(to_be_inserted, run_id, insert=True) - @staticmethod + @ staticmethod def set_ids_and_datatype_of_parents_and_properties(rec_list): for record in rec_list: for parent in record.parents: @@ -1077,7 +1106,7 @@ class Crawler(object): prop.id = entity.id _resolve_datatype(prop, entity) - @staticmethod + @ staticmethod def execute_updates_in_list(to_be_updated, securityMode, run_id: Optional[uuid.UUID] = None, unique_names=True): @@ -1091,7 +1120,7 @@ class Crawler(object): update_cache = UpdateCache() update_cache.insert(to_be_updated, run_id) - @staticmethod + @ staticmethod def check_whether_parent_exists(records: list[db.Entity], parents: list[str]): """ returns a list of all records in `records` that have a parent that is in `parents`""" problems = [] @@ -1211,7 +1240,7 @@ class Crawler(object): return (to_be_inserted, to_be_updated) - @staticmethod + @ staticmethod def create_entity_summary(entities: list[db.Entity]): """ Creates a summary string reprensentation of a list of entities.""" parents = {} @@ -1230,7 +1259,7 @@ class Crawler(object): output = output[:-2] + "\n" return output - @staticmethod + @ staticmethod def inform_about_pending_changes(pending_changes, run_id, path, inserts=False): # Sending an Email with a link to a form to authorize updates is if get_config_setting("send_crawler_notifications"): @@ -1251,7 +1280,7 @@ ____________________\n""".format(i + 1, len(pending_changes)) + str(el[3])) + " by invoking the crawler" " with the run id: {rid}\n".format(rid=run_id)) - @staticmethod + @ staticmethod def debug_build_usage_tree(converter: Converter): res: dict[str, dict[str, Any]] = { converter.name: { diff --git a/src/caoscrawler/identifiable_adapters.py b/src/caoscrawler/identifiable_adapters.py index dd8c032041a74fa05b16d93abb06186e7e6fa569..069afc668561ad6ba56bd48d00c6eb2a3c0e70d6 100644 --- a/src/caoscrawler/identifiable_adapters.py +++ b/src/caoscrawler/identifiable_adapters.py @@ -186,6 +186,49 @@ identifiabel, identifiable and identified record) for a Record. """ pass + @staticmethod + def get_identifying_referencing_entities(referencing_entities, registered_identifiable): + refs = [] + for prop in registered_identifiable.properties: + if prop.name.lower() != "is_referenced_by": + continue + for givenrt in prop.value: + found = False + if givenrt == "*": + for val in referencing_entities.values(): + if len(val) > 0: + found = True + refs.extend(val) + else: + rt_and_children = get_children_of_rt(givenrt) + for rtname in rt_and_children: + if (rtname in referencing_entities): + refs.extend(referencing_entities[rtname]) + found = True + if not found: + raise NotImplementedError( + f"An identifying property:\n" + f"\nIdentifying PROPERTY\n{prop.name}" + ) + return refs + + @staticmethod + def get_identifying_referenced_entities(record, registered_identifiable): + refs = [] + for prop in registered_identifiable.properties: + pname = prop.name.lower() + if pname == "name" or pname == "is_referenced_by": + continue + if record.get_property(prop.name) is None: + raise RuntimeError("Missing identifying Property") + pval = record.get_property(prop.name).value + if not isinstance(prop.value, list): + pval = [prop.value] + for val in pval: + if isinstance(val, db.Entity): + refs.append(val) + return refs + def get_identifiable(self, record: db.Record, referencing_entities=None): """ retrieve the registered identifiable and fill the property values to create an @@ -193,7 +236,7 @@ identifiabel, identifiable and identified record) for a Record. Args: record: the record for which the Identifiable shall be created. - referencing_entities: a dictionary (Type: dict[int, dict[str, list[db.Entity]]]), that + referencing_entities: a dictionary (Type: dict[str, list[db.Entity]]), that allows to look up entities with a certain RecordType, that reference ``record`` Returns: @@ -212,6 +255,8 @@ identifiabel, identifiable and identified record) for a Record. name_is_identifying_property = False if registered_identifiable is not None: + identifiable_backrefs = self.get_identifying_referencing_entities( + referencing_entities, registered_identifiable) # fill the values: for prop in registered_identifiable.properties: if prop.name == "name": @@ -222,31 +267,8 @@ identifiabel, identifiable and identified record) for a Record. # case A: in the registered identifiable # case B: in the identifiable - # TODO: similar to the Identifiable class, Registered Identifiable should be a - # separate class too + # treated above if prop.name.lower() == "is_referenced_by": - for givenrt in prop.value: - found = False - if givenrt == "*": - if id(record) not in referencing_entities: - continue - for rt, rec in referencing_entities[id(record)].items(): - identifiable_backrefs.extend(rec) - found = True - else: - rt_and_children = get_children_of_rt(givenrt) - for rtname in rt_and_children: - if (id(record) in referencing_entities - and (rtname in referencing_entities[id(record)])): - identifiable_backrefs.extend( - referencing_entities[id(record)][rtname]) - found = True - if not found: - # TODO: is this the appropriate error? - raise NotImplementedError( - f"The following record is missing an identifying property:\n" - f"RECORD\n{record}\nIdentifying PROPERTY\n{prop.name}" - ) continue record_prop = record.get_property(prop.name) diff --git a/unittests/test_crawler.py b/unittests/test_crawler.py index 7c62b004b207da42a37bf26dce295810ec9e3075..23b374849543a6e132c605cd83ec0b51e4c2cbe9 100644 --- a/unittests/test_crawler.py +++ b/unittests/test_crawler.py @@ -369,36 +369,40 @@ def test_has_missing_object_in_references(): # one reference with id -> check assert not crawler._has_missing_object_in_references( - Identifiable(name="C", record_type="RTC", properties={'d': 123}), []) + Identifiable(name="C", record_type="RTC", properties={'d': 123}), {}) # one ref with Entity with id -> check + rec = db.Record(id=123).add_parent("C") assert not crawler._has_missing_object_in_references( - Identifiable(name="C", record_type="RTC", properties={'d': db.Record(id=123) - .add_parent("C")}), []) + Identifiable(name="C", record_type="RTC", properties={'d': rec}), {id(rec): {'C': [None]}}) # one ref with id one with Entity with id (mixed) -> check + rec = db.Record(id=123).add_parent("RTC") assert not crawler._has_missing_object_in_references( Identifiable(name="C", record_type="RTD", - properties={'d': 123, 'b': db.Record(id=123).add_parent("RTC")}), []) + properties={'d': 123, 'b': rec}), {id(rec): {'C': [None]}}) # entity to be referenced in the following a = db.Record(name="C").add_parent("C").add_property("d", 12311) # one ref with id one with Entity without id (but not identifying) -> fail assert not crawler._has_missing_object_in_references( - Identifiable(name="C", record_type="RTC", properties={'d': 123, 'e': a}), []) + Identifiable(name="C", record_type="RTC", properties={'d': 123, 'e': a}), + {id(a): {'C': [None]}}) # one ref with id one with Entity without id (mixed) -> fail assert not crawler._has_missing_object_in_references( - Identifiable(name="D", record_type="RTD", properties={'d': 123, 'e': a}), []) + Identifiable(name="D", record_type="RTD", properties={'d': 123, 'e': a}), + {id(a): {'C': [None]}}) crawler.treated_records_lookup.add(a, Identifiable(name="C", record_type="RTC", properties={'d': 12311})) # one ref with id one with Entity without id but in cache -> check assert crawler._has_missing_object_in_references( - Identifiable(name="D", record_type="RTD", properties={'d': 123, 'e': a}), []) + Identifiable(name="D", record_type="RTD", properties={'d': 123, 'e': a}), + {id(a): {'C': [None]}}) # if this ever fails, the mock up may be removed crawler.identifiableAdapter.get_registered_identifiable.assert_called() -@pytest.mark.xfail() +@ pytest.mark.xfail() def test_references_entities_without_ids(): crawler = Crawler() assert not crawler._has_reference_value_without_id(db.Record().add_parent("Person") @@ -448,15 +452,15 @@ def mock_retrieve_record(identifiable: Identifiable): return None -@patch("caoscrawler.crawl.cached_get_entity_by", - new=Mock(side_effect=mock_get_entity_by)) -@patch("caoscrawler.identifiable_adapters.cached_get_entity_by", - new=Mock(side_effect=mock_get_entity_by)) -@patch("caoscrawler.identifiable_adapters.CaosDBIdentifiableAdapter." - "retrieve_identified_record_for_identifiable", - new=Mock(side_effect=mock_retrieve_record)) -@patch("caoscrawler.crawl.db.Container.insert") -@patch("caoscrawler.crawl.db.Container.update") +@ patch("caoscrawler.crawl.cached_get_entity_by", + new=Mock(side_effect=mock_get_entity_by)) +@ patch("caoscrawler.identifiable_adapters.cached_get_entity_by", + new=Mock(side_effect=mock_get_entity_by)) +@ patch("caoscrawler.identifiable_adapters.CaosDBIdentifiableAdapter." + "retrieve_identified_record_for_identifiable", + new=Mock(side_effect=mock_retrieve_record)) +@ patch("caoscrawler.crawl.db.Container.insert") +@ patch("caoscrawler.crawl.db.Container.update") def test_synchronization_no_commit(upmock, insmock): crawled_data = [r.copy() for r in EXAMPLE_SERVER_STATE if r.role == "Record"] # change one; add one @@ -473,16 +477,16 @@ def test_synchronization_no_commit(upmock, insmock): assert len(ups) == 1 -@patch("caoscrawler.crawl.cached_get_entity_by", - new=Mock(side_effect=mock_get_entity_by)) -@patch("caoscrawler.identifiable_adapters.cached_get_entity_by", - new=Mock(side_effect=mock_get_entity_by)) -@patch("caoscrawler.identifiable_adapters.CaosDBIdentifiableAdapter." - "retrieve_identified_record_for_identifiable", - new=Mock(side_effect=mock_retrieve_record)) -@patch("caoscrawler.crawl.db.Container.insert") -@patch("caoscrawler.crawl.db.Container.update") -@patch("caoscrawler.crawl.UpdateCache.insert") +@ patch("caoscrawler.crawl.cached_get_entity_by", + new=Mock(side_effect=mock_get_entity_by)) +@ patch("caoscrawler.identifiable_adapters.cached_get_entity_by", + new=Mock(side_effect=mock_get_entity_by)) +@ patch("caoscrawler.identifiable_adapters.CaosDBIdentifiableAdapter." + "retrieve_identified_record_for_identifiable", + new=Mock(side_effect=mock_retrieve_record)) +@ patch("caoscrawler.crawl.db.Container.insert") +@ patch("caoscrawler.crawl.db.Container.update") +@ patch("caoscrawler.crawl.UpdateCache.insert") def test_security_mode(updateCacheMock, upmock, insmock): # trivial case: nothing to do crawled_data = [r.copy() for r in EXAMPLE_SERVER_STATE if r.role == "Record"] @@ -581,12 +585,13 @@ def test_security_mode(updateCacheMock, upmock, insmock): def test_create_reference_mapping(): a = db.Record().add_parent("A") - b = db.Record().add_parent("B").add_property('a', a) + b = db.Record(id=132).add_parent("B").add_property('a', a) ref = Crawler.create_reference_mapping([a, b]) assert id(a) in ref - assert id(b) not in ref + assert id(b) in ref assert "B" in ref[id(a)] - assert ref[id(a)]["B"] == [b] + assert {} == ref[id(b)] + assert ref[id(a)]["B"] == [132] def test_create_flat_list(): @@ -609,7 +614,7 @@ def test_create_flat_list(): assert c in flat -@pytest.fixture +@ pytest.fixture def crawler_mocked_for_backref_test(): crawler = Crawler() # mock retrieval of registered identifiabls: return Record with just a parent @@ -653,8 +658,8 @@ def test_validation_error_print(caplog): caplog.clear() -@patch("caoscrawler.identifiable_adapters.get_children_of_rt", - new=Mock(side_effect=lambda x: [x])) +@ patch("caoscrawler.identifiable_adapters.get_children_of_rt", + new=Mock(side_effect=lambda x: [x])) def test_split_into_inserts_and_updates_backref(crawler_mocked_for_backref_test): crawler = crawler_mocked_for_backref_test identlist = [Identifiable(name="A", record_type="BR"), @@ -689,8 +694,8 @@ def test_split_into_inserts_and_updates_backref(crawler_mocked_for_backref_test) assert insert[0].name == "B" -@patch("caoscrawler.identifiable_adapters.get_children_of_rt", - new=Mock(side_effect=lambda x: [x])) +@ patch("caoscrawler.identifiable_adapters.get_children_of_rt", + new=Mock(side_effect=lambda x: [x])) def test_split_into_inserts_and_updates_mult_backref(crawler_mocked_for_backref_test): # test whether multiple references of the same record type are correctly used crawler = crawler_mocked_for_backref_test @@ -702,7 +707,9 @@ def test_split_into_inserts_and_updates_mult_backref(crawler_mocked_for_backref_ # test whether both entities are listed in the backref attribute of the identifiable referencing_entities = crawler.create_reference_mapping(entlist) - identifiable = crawler.identifiableAdapter.get_identifiable(referenced, referencing_entities) + identifiable = crawler.identifiableAdapter.get_identifiable( + referenced, + referencing_entities[id(referenced)]) assert len(identifiable.backrefs) == 2 # check the split... @@ -711,8 +718,8 @@ def test_split_into_inserts_and_updates_mult_backref(crawler_mocked_for_backref_ assert len(insert) == 2 -@patch("caoscrawler.identifiable_adapters.get_children_of_rt", - new=Mock(side_effect=lambda x: [x])) +@ patch("caoscrawler.identifiable_adapters.get_children_of_rt", + new=Mock(side_effect=lambda x: [x])) def test_split_into_inserts_and_updates_diff_backref(crawler_mocked_for_backref_test): # test whether multiple references of the different record types are correctly used crawler = crawler_mocked_for_backref_test @@ -724,7 +731,10 @@ def test_split_into_inserts_and_updates_diff_backref(crawler_mocked_for_backref_ # test whether both entities are listed in the backref attribute of the identifiable referencing_entities = crawler.create_reference_mapping(entlist) - identifiable = crawler.identifiableAdapter.get_identifiable(referenced, referencing_entities) + identifiable = crawler.identifiableAdapter.get_identifiable( + referenced, + referencing_entities[id(referenced)]) + assert len(identifiable.backrefs) == 2 # check the split... @@ -737,7 +747,7 @@ def mock_create_values(values, element): pass -@patch("caoscrawler.converters.IntegerElementConverter.create_values") +@ patch("caoscrawler.converters.IntegerElementConverter.create_values") def test_restricted_path(create_mock): """ The restricted_path argument allows to ignroe part of the crawled data structure. Here, we make @@ -830,7 +840,7 @@ def test_split_restricted_path(): # Filter the warning because we want to have it here and this way it does not hinder running # tests with -Werror. -@pytest.mark.filterwarnings("ignore:The prefix:DeprecationWarning") +@ pytest.mark.filterwarnings("ignore:The prefix:DeprecationWarning") def test_deprecated_prefix_option(): """Test that calling the crawler's main function with the deprecated `prefix` option raises the correct errors and warnings. @@ -896,8 +906,8 @@ def mock_get_entity_by_query(query=None): return db.Record(id=1111, name='rec_name').add_parent('RT') -@patch("caoscrawler.crawl.cached_get_entity_by", - new=Mock(side_effect=mock_get_entity_by_query)) +@ patch("caoscrawler.crawl.cached_get_entity_by", + new=Mock(side_effect=mock_get_entity_by_query)) def test_replace_name_with_referenced_entity(): test_text = 'lkajsdf' test_int = 134343 diff --git a/unittests/test_identifiable_adapters.py b/unittests/test_identifiable_adapters.py index 7759fa55ce37e1c30ff9092eac3260ca80348bca..ee0e0d6cd7c791f78e7cd2307dc6f34698326b4a 100644 --- a/unittests/test_identifiable_adapters.py +++ b/unittests/test_identifiable_adapters.py @@ -143,10 +143,9 @@ def test_wildcard_ref(): .add_property(name="last_name", value='Tom')) identifiable = ident.get_identifiable(rec, referencing_entities={ - id(rec): - {'A': [db.Record(id=1).add_parent("A")]}} + 'A': [1]} ) - assert identifiable.backrefs[0].id == 1 + assert identifiable.backrefs[0] == 1 def test_convert_value():