diff --git a/src/newcrawler/crawl.py b/src/newcrawler/crawl.py
index c17605874df48c952ae51054bd71c2a4d0577d70..a0670eb149f82d348a515df4cc143f91e24e3e00 100644
--- a/src/newcrawler/crawl.py
+++ b/src/newcrawler/crawl.py
@@ -247,6 +247,8 @@ class Crawler(object):
            that the same identifiable exists twice)
         """
         identifiable = self.identifiableAdapter.get_identifiable(record)
+        if identifiable is None:
+            return None
         if identifiable in self.identified_cache:
             return self.identified_cache[identifiable]
         else:
@@ -261,6 +263,8 @@ class Crawler(object):
            that the same identifiable exists twice)
         """
         identifiable = self.identifiableAdapter.get_identifiable(record)
+        if identifiable is None:
+            raise RuntimeError()
         self.identified_cache.add(identifiable=identifiable, record=record)
 
     def copy_attributes(self, fro: db.Entity, to: db.Entity):
@@ -270,9 +274,14 @@ class Crawler(object):
         to_be_inserted = []
         to_be_updated = []
         flat = list(ent_list)
-        # assure all entities are direct members
+        # assure all entities are direct members TODO Can this be removed at some point? Check only?
        self.create_flat_list(ent_list, flat)
 
+        # TODO: can the following be removed at some point
+        for ent in flat:
+            if len(ent.parents) == 0:
+                raise RuntimeError("Records must have a parent.")
+
         resolved_references = True
         # flat contains Entities which could not yet be checked against the remote server
         while resolved_references and len(flat) > 0:
@@ -310,6 +319,8 @@ class Crawler(object):
                     # side effect
                     record.id = identified_record.id
                     to_be_updated.append(record)
+                    # TODO think this through
+                    # self.add_identified_record_to_local_cache(record)
                     del flat[i]
                     resolved_references = True
 
@@ -324,13 +335,22 @@ class Crawler(object):
 
         return to_be_inserted, to_be_updated
 
+    def replace_entities_by_ids(self, rec: db.Record):
+        for el in rec.properties:
+            if isinstance(el.value, db.Entity):
+                el.value = el.value.id
+            elif isinstance(el.value, list):
+                for index, val in enumerate(el.value):
+                    if isinstance(val, db.Entity):
+                        el.value[index] = val.id
+
     def remove_unnecessary_updates(self, updateList: list[db.Record]):
         """
         checks whether all relevant attributes (especially Property values) are equal
         """
         for i in reversed(range(len(updateList))):
             record = updateList[i]
-            identifiable = self.get_identifiable(record)
+            identifiable = self.identifiableAdapter.retrieve_identifiable(record)
             comp = compare_entities(record, identifiable)
             identical = True
 
@@ -382,13 +402,16 @@ class Crawler(object):
         to_be_inserted, to_be_updated = self.split_into_inserts_and_updates(updateList)
 
         # remove unnecessary updates from list
+        for el in to_be_updated:
+            self.replace_entities_by_ids(el)
+
         self.remove_unnecessary_updates(to_be_updated)
 
         # TODO
-        self.execute_inserts_in_list(insertList)
-        self.execute_updates_in_list(updateList)
+        # self.execute_inserts_in_list(to_be_inserted)
+        # self.execute_updates_in_list(to_be_updated)
 
-        return (insertList, updateList)
+        return (to_be_inserted, to_be_updated)
 
     @staticmethod
     def debug_build_usage_tree(converter: Converter):
diff --git a/src/newcrawler/identifiable_adapters.py b/src/newcrawler/identifiable_adapters.py
index 27f2bae1884cecf1d73d3a1f3ca375bc5c9792a4..01eb55bbb3ec4b54a49633ef839b2ed99ab5b398 100644
--- a/src/newcrawler/identifiable_adapters.py
+++ b/src/newcrawler/identifiable_adapters.py
@@ -99,7 +99,7 @@ class IdentifiableAdapter(object):
         pass
 
     @abstractmethod
-    def resolve_references(self, record: db.Record):
+    def resolve_reference(self, record: db.Record):
         pass
 
     def get_identifiable(self, record: db.Record):
@@ -112,7 +112,7 @@ class IdentifiableAdapter(object):
         if registered_identifiable is None:
             return None
 
-        identifiable = db.Record()
+        identifiable = db.Record(name=record.name)
         if len(registered_identifiable.parents) != 1:
             raise RuntimeError("Multiple parents for identifiables not supported.")
         identifiable.add_parent(registered_identifiable.parents[0])
@@ -125,8 +125,24 @@ class IdentifiableAdapter(object):
 
             # case A: in the registered identifiable
             # case B: in the identifiable
-            # TODO use id if value is Entity
-            identifiable.add_property(record.get_property(prop.name))
+            record_prop = record.get_property(prop.name)
+            newval = record_prop.value
+            if isinstance(record_prop.value, db.Entity):
+                newval = self.resolve_reference(record_prop.value)
+            elif isinstance(record_prop.value, list):
+                newval = list()
+                for element in record_prop.value:
+                    if isinstance(element, db.Entity):
+                        newval.append(self.resolve_reference(element))
+                    else:
+                        newval.append(element)
+            record_prop_new = db.Property(name=record_prop.name,
+                                          id=record_prop.id,
+                                          description=record_prop.description,
+                                          datatype=record_prop.datatype,
+                                          value=newval,
+                                          unit=record_prop.unit)
+            identifiable.add_property(record_prop_new)
             property_name_list_A.append(prop.name)
 
         # check for multi properties in the record:
@@ -272,7 +288,7 @@ class LocalStorageIdentifiableAdapter(IdentifiableAdapter):
             return None
         return candidates[0]
 
-    def _resolve_reference(self, value: db.Entity):
+    def resolve_reference(self, value: db.Record):
         if self.get_registered_identifiable(value) is None:
             raise NotImplementedError("Non-identifiable references cannot"
                                       " be used as properties in identifiables.")
@@ -287,14 +303,3 @@ class LocalStorageIdentifiableAdapter(IdentifiableAdapter):
             raise RuntimeError("The entity has not been assigned an ID.")
 
         return value_identifiable.id
-
-    def resolve_references(self, record: db.Record):
-        for prop in record.properties:
-            if isinstance(prop.value, db.Entity):
-                prop.value = self._resolve_reference(prop.value)
-
-            if isinstance(prop.value, list):
-                for element_index in range(len(prop.value)):
-                    element = prop.value[element_index]
-                    if isinstance(element, db.Entity):
-                        prop.value[element_index] = self._resolve_reference(element)
diff --git a/unittests/test_tool.py b/unittests/test_tool.py
index 330716a5cc80fad3d3d0c101999192cde0935d4b..23912ff133fdb7dceb1805907d24a57578fc63ee 100755
--- a/unittests/test_tool.py
+++ b/unittests/test_tool.py
@@ -216,6 +216,7 @@ def test_ambigious_records(crawler, ident):
 
 
 def test_crawler_update_list(crawler, ident):
+    crawler.copy_attributes = Mock()
     # If the following assertions fail, that is a hint, that the test file records.xml is
     # incorrect:
     assert len(ident.get_records()) == 18
@@ -353,10 +354,8 @@ def test_provenance_debug_data(crawler):
     assert check_key_count("Person") == 14
 
 
-def test_split_into_inserts_and_updates(crawler):
-    # Try trivial argument
-    crawler.split_into_inserts_and_updates([])
-
+@pytest.fixture
+def mock_retrieve(crawler):
     # simulate remote server content by using the names to identify records
     def base_mocked_lookup(rec, known):
         if rec.name in known:
@@ -364,77 +363,165 @@ def test_split_into_inserts_and_updates(crawler):
         else:
             return None
 
-    cache = []
-
-    def trivial_cache_loockup(stuff):
-        print("current cache", cache)
-        if stuff.name in cache:
-            return stuff
-        else:
-            return None
-
-    def trivial_cache_add(stuff):
-        cache.append(stuff.name)
-
-    crawler.get_identified_record_from_local_cache = Mock(side_effect=trivial_cache_loockup)
-    crawler.add_identified_record_to_local_cache = Mock(side_effect=trivial_cache_add)
 
     crawler.copy_attributes = Mock()
 
     # a record that is found remotely and should be added to the update list and one that is not
     # found and should be added to the insert one
     remote_known = {"A": db.Record(id=1111, name="A")}
-    entlist = [db.Record(name="A"), db.Record(name="B")]
     crawler.identifiableAdapter.retrieve_identifiable = Mock(side_effect=partial(
         base_mocked_lookup, known=remote_known))
+    crawler.identifiableAdapter.get_registered_identifiable = (
+        lambda x: db.Record().add_parent(x.parents[0].name))
+    return crawler
+
+
+def test_split_into_inserts_and_updates_trivial(crawler):
+    # Try trivial argument
+    crawler.split_into_inserts_and_updates([])
+
+
+def test_split_into_inserts_and_updates_single(mock_retrieve):
+    crawler = mock_retrieve
+
+    entlist = [db.Record(name="A").add_parent("C"), db.Record(name="B").add_parent("C")]
+
+    assert crawler.get_identified_record_from_local_cache(entlist[0]) is None
+    assert crawler.get_identified_record_from_local_cache(entlist[1]) is None
+    assert crawler.can_be_checked_externally(entlist[0])
+    assert crawler.can_be_checked_externally(entlist[1])
+    assert crawler.identifiableAdapter.retrieve_identifiable(entlist[0]).id == 1111
+    assert crawler.identifiableAdapter.retrieve_identifiable(entlist[1]) is None
+    insert, update = crawler.split_into_inserts_and_updates(deepcopy(entlist))
-    print(crawler.identifiableAdapter.retrieve_identifiable.call_args_list)
-    print(entlist)
-    # crawler.identifiableAdapter.retrieve_identifiable.assert_any_call(entlist[0])
-    # crawler.identifiableAdapter.retrieve_identifiable.assert_any_call(entlist[1])
     assert len(insert) == 1
     assert insert[0].name == "B"
     assert len(update) == 1
     assert update[0].name == "A"
 
-    # reset cache
-    cache.clear()
 
+def test_split_into_inserts_and_updates_with_duplicate(mock_retrieve):
+    crawler = mock_retrieve
+    # try it with a reference
+    a = db.Record(name="A").add_parent("C")
+    b = db.Record(name="B").add_parent("C")
+    b.add_property("A", a)
+    c = db.Record(name="A").add_parent("C")
+    entlist = [a, b, c]
+    insert, update = crawler.split_into_inserts_and_updates(entlist)
+    assert len(insert) == 1
+    assert insert[0].name == "B"
+    assert len(update) == 1
+    assert update[0].name == "A"
+
+
+def test_split_into_inserts_and_updates_with_ref(mock_retrieve):
+    crawler = mock_retrieve
     # try it with a reference
-    a = db.Record(name="A")
-    b = db.Record(name="B")
+    a = db.Record(name="A").add_parent("C")
+    b = db.Record(name="B").add_parent("C")
     b.add_property("A", a)
     entlist = [a, b]
-    crawler.identifiableAdapter.retrieve_identifiable = Mock(side_effect=partial(
-        base_mocked_lookup, known=remote_known))
     insert, update = crawler.split_into_inserts_and_updates(entlist)
     assert len(insert) == 1
     assert insert[0].name == "B"
     assert len(update) == 1
     assert update[0].name == "A"
 
-    # reset cache
-    cache.clear()
 
+def test_split_into_inserts_and_updates_with_circ(mock_retrieve):
     # try circular
-    a = db.Record(name="A")
-    b = db.Record(name="B")
+    crawler = mock_retrieve
+    a = db.Record(name="A").add_parent("C")
+    b = db.Record(name="B").add_parent("C")
     b.add_property("A", a)
     a.add_property("B", b)
     entlist = [a, b]
-    with raises(RuntimeError):
-        crawler.split_into_inserts_and_updates(entlist)
 
+
+def test_split_into_inserts_and_updates_with_complex(mock_retrieve):
+    crawler = mock_retrieve
+    # A
+    # ^
+    # |
+    # F <- B <- G
+    a = db.Record(name="A").add_parent("C").add_property('d', 13).add_property('e', "lskdjlsfdj")
+    b = db.Record(name="B").add_parent("C")
+    g = db.Record(name="G").add_parent("C")
+    f = db.Record(name="F").add_parent("C")
+    g.add_property("A", a)
+    b.add_property("A", f)
+    b.add_property("A", a)
+    entlist = [a, b, g]
+    insert, update = crawler.split_into_inserts_and_updates(entlist)
+    assert len(insert) == 3
+    assert "B" in [el.name for el in insert]
+    assert len(update) == 1
+    assert update[0].name == "A"
+
+    # TODO write test where the unresolved entity is not part of the identifiable
+
+
+def test_split_into_inserts_and_updates_with_copy_attr(mock_retrieve):
+    crawler = mock_retrieve
     # assume identifiable is only the name
-    a = db.Record(name="A")
+    a = db.Record(name="A").add_parent("C")
     a.add_property("foo", 1)
-    b = db.Record(name="A")
+    b = db.Record(name="A").add_parent("C")
     b.add_property("bar", 2)
+    entlist = [a, b]
+    insert, update = crawler.split_into_inserts_and_updates(entlist)
 
     # expected TODO
-    #assert result.has_property("foo").value == 1
-    #assert result.has_property("bar").value == 1
+    assert update[0].get_property("bar").value == 2
+    assert update[0].get_property("foo").value == 1
 
 
 def test_all_references_are_existing_already(crawler):
-    pass
-    # crawler.all_references_are_existing_already(record)
+    def base_mocked_lookup(rec, known):
+        if rec.name in known:
+            return known[rec.name]
+        else:
+            return None
+    crawler.identifiableAdapter.get_registered_identifiable = Mock(side_effect=partial(
+        base_mocked_lookup, known={"A": db.Record(name="A").add_parent("C"),
+                                   "B": db.Record(name="B").add_parent("C")}))
+
+    assert crawler.all_references_are_existing_already(db.Record().add_property('a', 123))
+    assert crawler.all_references_are_existing_already(db.Record()
+                                                       .add_property('a', db.Record(id=123)))
+    assert crawler.all_references_are_existing_already(db.Record()
+                                                       .add_property('a', 123)
+                                                       .add_property('b', db.Record(id=123)))
+    assert not crawler.all_references_are_existing_already(db.Record()
+                                                           .add_property('a', 123)
+                                                           .add_property('b', db.Record(name="A")
+                                                                              .add_parent("C")))
+    a = db.Record(name="A").add_parent("C")
+    crawler.add_identified_record_to_local_cache(a)
+    assert crawler.all_references_are_existing_already(db.Record()
+                                                       .add_property('a', 123)
+                                                       .add_property('b', a))
+
+
+def test_can_be_checked_externally(crawler):
+    assert crawler.can_be_checked_externally(db.Record().add_property('a', 123))
+    assert crawler.can_be_checked_externally(db.Record()
+                                             .add_property('a', db.Record(id=123)))
+    assert crawler.can_be_checked_externally(db.Record()
+                                             .add_property('a', 123)
+                                             .add_property('b', db.Record(id=123)))
+
+    assert not crawler.can_be_checked_externally(db.Record()
+                                                 .add_property('a', 123)
+                                                 .add_property('b', db.Record()))
+
+
+def test_replace_entities_by_ids(crawler):
+    a = (db.Record().add_parent("B").add_property("A", 12345)
+         .add_property("B", db.Record(id=12345))
+         .add_property("C", [db.Record(id=12345), 233324]))
+
+    crawler.replace_entities_by_ids(a)
+    assert a.get_property("A").value == 12345
+    assert a.get_property("B").value == 12345
+    assert a.get_property("C").value == [12345, 233324]