From 33e7f66f7e9417cdf6850d59c1d2e3aa9105dd91 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Henrik=20tom=20W=C3=B6rden?= <h.tomwoerden@indiscale.com> Date: Sun, 23 Oct 2022 14:53:40 +0200 Subject: [PATCH] WIP: some fixes --- src/caoscrawler/crawl.py | 20 +++++-------- src/caoscrawler/identifiable_adapters.py | 37 +++++++++++++++--------- unittests/test_tool.py | 28 ++++++++++++++---- 3 files changed, 54 insertions(+), 31 deletions(-) diff --git a/src/caoscrawler/crawl.py b/src/caoscrawler/crawl.py index 72caf81a..f28eb5e8 100644 --- a/src/caoscrawler/crawl.py +++ b/src/caoscrawler/crawl.py @@ -697,8 +697,8 @@ class Crawler(object): Create a dictionary of dictionaries of the form: Dict[int, Dict[str, db.Entity]] - - The integer index is the id of the value object. - - The string is the name of the first parent of the value object. + - The integer index is the Python id of the value object. + - The string is the name of the first parent of the referencing object. Each value objects is taken from the values of all properties from the list flat. @@ -715,9 +715,7 @@ class Crawler(object): if isinstance(v, db.Entity): if id(v) not in references: references[id(v)] = {} - # TODO: Shouldn't the parent's name always be the same for - # a single id? - references[id(v)][v.parents[0].name] = ent + references[id(v)][ent.parents[0].name] = ent return references def split_into_inserts_and_updates(self, ent_list: List[db.Entity]): @@ -734,8 +732,7 @@ class Crawler(object): if ent.role == "Record" and len(ent.parents) == 0: raise RuntimeError("Records must have a parent.") - # TODO: where is this used: - references = self.create_reference_mapping(flat) + referencing_entities = self.create_reference_mapping(flat) resolved_references = True # flat contains Entities which could not yet be checked against the remote server @@ -763,10 +760,9 @@ class Crawler(object): # can we check whether the record(identifiable) exists on the remote server? elif not self.has_reference_value_without_id( self.identifiableAdapter.get_identifiable(record)): - # TODO: remove deepcopy? identified_record = ( self.identifiableAdapter.retrieve_identified_record_for_record( - deepcopy(record))) + record, referencing_entities=referencing_entities)) if identified_record is None: # identifiable does not exist remotely -> record needs to be inserted to_be_inserted.append(record) @@ -801,7 +797,7 @@ class Crawler(object): raise RuntimeError( "Could not resolve all Entity references. Circular Dependency?") - return to_be_inserted, to_be_updated + return to_be_inserted, to_be_updated, referencing_entities def replace_entities_with_ids(self, rec: db.Record): for el in rec.properties: @@ -949,7 +945,7 @@ class Crawler(object): if self.identifiableAdapter is None: raise RuntimeError("Should not happen.") - to_be_inserted, to_be_updated = self.split_into_inserts_and_updates( + to_be_inserted, to_be_updated, referencing_entities = self.split_into_inserts_and_updates( target_data) # TODO: refactoring of typo @@ -959,7 +955,7 @@ class Crawler(object): identified_records = [ self.identifiableAdapter.retrieve_identified_record_for_record( - record) + record, referencing_entities) for record in to_be_updated] # remove unnecessary updates from list by comparing the target records to the existing ones self.remove_unnecessary_updates(to_be_updated, identified_records) diff --git a/src/caoscrawler/identifiable_adapters.py b/src/caoscrawler/identifiable_adapters.py index 4c0b17fa..7e2d9864 100644 --- a/src/caoscrawler/identifiable_adapters.py +++ b/src/caoscrawler/identifiable_adapters.py @@ -100,6 +100,10 @@ class IdentifiableAdapter(metaclass=ABCMeta): "Multiple parents for identifiables not supported.") query_string = "FIND Record " + ident.get_parents()[0].name + if ident.get_property("is_referenced_by") is not None: + query_string += (" WHICH IS REFERENCED BY " + + ident.get_property("is_referenced_by").value + " AND") + query_string += " WITH " if ident.name is None and len(ident.get_properties()) == 0: @@ -118,7 +122,9 @@ class IdentifiableAdapter(metaclass=ABCMeta): def create_property_query(entity: db.Entity): query_string = "" for p in entity.get_properties(): - if p.value is None: + if p.name == "is_referenced_by": + continue + elif p.value is None: query_string += "'" + p.name + "' IS NULL AND " elif isinstance(p.value, list): for v in p.value: @@ -156,7 +162,7 @@ class IdentifiableAdapter(metaclass=ABCMeta): # ------------------------- # TODO: this uses an active database connection which is discouraged here! @staticmethod - def get_recordTypeName(rt_id): + def get_recordtype_name(rt_id): return db.RecordType(rt_id).retrieve().name # ------------------------- @@ -213,10 +219,11 @@ class IdentifiableAdapter(metaclass=ABCMeta): # case A: in the registered identifiable # case B: in the identifiable - if prop.name == "is_referenced_by": - rtname = self.get_recordTypeName(prop.value) - if rtname in referencing_entities: - record.add_property("is_referenced_by", referencing_entities[rtname]) + if prop.name.lower() == "is_referenced_by": + rtname = self.get_recordtype_name(prop.value) + if (id(record) in referencing_entities + and rtname in referencing_entities[id(record)]): + identifiable.add_property("is_referenced_by", referencing_entities[rtname]) continue record_prop = record.get_property(prop.name) if record_prop is None: @@ -247,7 +254,8 @@ class IdentifiableAdapter(metaclass=ABCMeta): return identifiable @abstractmethod - def retrieve_identified_record_for_identifiable(self, identifiable: db.Record): + def retrieve_identified_record_for_identifiable(self, identifiable: db.Record, + referencing_entities=None): """ Retrieve identifiable record for a given identifiable. @@ -260,7 +268,7 @@ class IdentifiableAdapter(metaclass=ABCMeta): # TODO: remove side effect # TODO: use ID if record has one? - def retrieve_identified_record_for_record(self, record: db.Record): + def retrieve_identified_record_for_record(self, record: db.Record, referencing_entities=None): """ This function combines all functionality of the IdentifierAdapter by returning the identifiable after having checked for an appropriate @@ -269,7 +277,7 @@ class IdentifiableAdapter(metaclass=ABCMeta): In case there was no appropriate registered identifiable or no identifiable could be found return value is None. """ - identifiable = self.get_identifiable(record) + identifiable = self.get_identifiable(record, referencing_entities=referencing_entities) if identifiable is None: return None @@ -277,7 +285,8 @@ class IdentifiableAdapter(metaclass=ABCMeta): if identifiable.role == "File": return self.get_file(identifiable) - return self.retrieve_identified_record_for_identifiable(identifiable) + return self.retrieve_identified_record_for_identifiable( + identifiable, referencing_entities=referencing_entities) class LocalStorageIdentifiableAdapter(IdentifiableAdapter): @@ -388,7 +397,8 @@ class LocalStorageIdentifiableAdapter(IdentifiableAdapter): return False return True - def retrieve_identified_record_for_identifiable(self, identifiable: db.Record): + def retrieve_identified_record_for_identifiable(self, identifiable: + db.Record, referencing_entities=None): candidates = [] for record in self._records: if self.check_record(record, identifiable): @@ -473,8 +483,9 @@ class CaosDBIdentifiableAdapter(IdentifiableAdapter): return record return record.id - def retrieve_identified_record_for_identifiable(self, identifiable: db.Record): - query_string = self.create_query_for_identifiable(identifiable) + def retrieve_identified_record_for_identifiable(self, identifiable: db.Record, + referencing_entities=None): + query_string = self.create_query_for_identifiable(identifiable, referencing_entities) candidates = db.execute_query(query_string) if len(candidates) > 1: raise RuntimeError( diff --git a/unittests/test_tool.py b/unittests/test_tool.py index f08f8061..51ea835d 100755 --- a/unittests/test_tool.py +++ b/unittests/test_tool.py @@ -290,6 +290,12 @@ def test_identifiable_adapter(): .add_property("first_name", value="A") .add_property("last_name", value="B")) assert query.lower() == "find record person with 'first_name'='a' and 'last_name'='b' " + query = IdentifiableAdapter.create_query_for_identifiable( + db.Record().add_parent("Person") + .add_property("is_referenced_by", value="14433") + .add_property("last_name", value="B")) + assert query.lower() == ("find record person which is referenced by 14433 and with " + "'last_name'='b' ") def test_remove_unnecessary_updates(): @@ -359,7 +365,7 @@ def test_provenance_debug_data(crawler): assert check_key_count("Person") == 14 -def basic_retrieve_by_name_mock_up(rec, known): +def basic_retrieve_by_name_mock_up(rec, referencing_entities=None, known=None): """ returns a stored Record if rec.name is an existing key, None otherwise """ if rec.name in known: return known[rec.name] @@ -399,7 +405,7 @@ def test_split_into_inserts_and_updates_single(crawler_mocked_identifiable_retri assert crawler.identifiableAdapter.retrieve_identified_record_for_record( entlist[1]) is None - insert, update = crawler.split_into_inserts_and_updates(deepcopy(entlist)) + insert, update, _ = crawler.split_into_inserts_and_updates(deepcopy(entlist)) assert len(insert) == 1 assert insert[0].name == "B" assert len(update) == 1 @@ -417,7 +423,7 @@ def test_split_into_inserts_and_updates_with_duplicate(crawler_mocked_identifiab # This is identical to a and should be removed c = db.Record(name="A").add_parent("C") entlist = [a, b, c] - insert, update = crawler.split_into_inserts_and_updates(deepcopy(entlist)) + insert, update, _ = crawler.split_into_inserts_and_updates(deepcopy(entlist)) assert len(insert) == 1 assert insert[0].name == "B" assert len(update) == 1 @@ -434,7 +440,7 @@ def test_split_into_inserts_and_updates_with_ref(crawler_mocked_identifiable_ret b = db.Record(name="B").add_parent("C") b.add_property("A", a) entlist = [a, b] - insert, update = crawler.split_into_inserts_and_updates(entlist) + insert, update, _ = crawler.split_into_inserts_and_updates(entlist) assert len(insert) == 1 assert insert[0].name == "B" assert len(update) == 1 @@ -469,7 +475,7 @@ def test_split_into_inserts_and_updates_with_complex(crawler_mocked_identifiable b.add_property("A", f) b.add_property("A", a) entlist = [a, b, g] - insert, update = crawler.split_into_inserts_and_updates(entlist) + insert, update, _ = crawler.split_into_inserts_and_updates(entlist) assert len(insert) == 3 assert "B" in [el.name for el in insert] assert len(update) == 1 @@ -489,7 +495,7 @@ def test_split_into_inserts_and_updates_with_copy_attr(crawler_mocked_identifiab b = db.Record(name="A").add_parent("C") b.add_property("bar", 2) entlist = [a, b] - insert, update = crawler.split_into_inserts_and_updates(entlist) + insert, update, _ = crawler.split_into_inserts_and_updates(entlist) assert update[0].get_property("bar").value == 2 assert update[0].get_property("foo").value == 1 @@ -713,3 +719,13 @@ def test_security_mode(updateCacheMock, upmock, insmock, ident): reset_mocks([updateCacheMock, insmock, upmock]) # restore original ident ident._records = deepcopy(records_backup) + + +def test_create_reference_mapping(): + a = db.Record().add_parent("A") + b = db.Record().add_parent("B").add_property('a', a) + ref = Crawler.create_reference_mapping([a, b]) + assert id(a) in ref + assert id(b) not in ref + assert "B" in ref[id(a)] + assert ref[id(a)]["B"] is b -- GitLab