diff --git a/.gitignore b/.gitignore index 5599d7d263c8927025e128c37eabb185025bf96b..0cf92ee328c382ed5e225d8a9c689e150611266b 100644 --- a/.gitignore +++ b/.gitignore @@ -3,7 +3,6 @@ src/caoscrawler.egg-info/ __pycache__ .tox TAGS -src/.coverage build/ *~ .pdbrc diff --git a/integrationtests/test_realworld_example.py b/integrationtests/test_realworld_example.py index da3fb69ce635ae69cd33cbf01de9df8ebf019661..7360d9670554a039b879ec337b0ed78763d1d589 100644 --- a/integrationtests/test_realworld_example.py +++ b/integrationtests/test_realworld_example.py @@ -83,19 +83,7 @@ def clear_database(): def create_identifiable_adapter(): ident = CaosDBIdentifiableAdapter() - ident.register_identifiable("license", ( - db.RecordType() - .add_parent("license") - .add_property("name"))) - ident.register_identifiable("project_type", ( - db.RecordType() - .add_parent("project_type") - .add_property("name"))) - ident.register_identifiable("Person", ( - db.RecordType() - .add_parent("Person") - .add_property("full_name"))) - + ident.load_from_yaml_definition(os.path.join(DATADIR, "identifiables.yml")) return ident diff --git a/src/caoscrawler/crawl.py b/src/caoscrawler/crawl.py index b4c9c9ffeedfb0eaf82860b07afe695215ece04d..57b2735c8d2ec590822c0a1bf1ebf40cf6208f2c 100644 --- a/src/caoscrawler/crawl.py +++ b/src/caoscrawler/crawl.py @@ -193,9 +193,11 @@ class Crawler(object): Please use SecurityMode Enum """ - # TODO: check if this feature is really needed - - self.identified_cache = IdentifiedCache() + # The following caches store records, where we checked whether they exist on the remote + # server. Since, it is important to know whether they exist or not, we store them into two + # different caches. + self.remote_existing_cache = IdentifiedCache() + self.remote_missing_cache = IdentifiedCache() self.recordStore = RecordStore() self.securityMode = securityMode @@ -480,30 +482,23 @@ class Crawler(object): return self._synchronize(self.target_data, commit_changes, unique_names=unique_names) - def can_be_checked_externally(self, record: db.Record): + def has_reference_value_without_id(self, record: db.Record): """ - Returns False if there is at least one property in record which: + Returns True if there is at least one property in `record` which: a) is a reference property AND b) where the value is set to a db.Entity (instead of an ID) AND - c) where the ID of the value is not set (to an integer) + c) where the ID of the value (the db.Entity object in b)) is not set (to an integer) - Returns True otherwise. + Returns False otherwise. """ for p in record.properties: if isinstance(p.value, list): for el in p.value: if isinstance(el, db.Entity) and el.id is None: - return False - # TODO: please check! - # I removed the condition "is_reference", because the datatype field - # that is checked within this function is not always present for references - # parsed from the file structure. We have to rely on the condition, that - # if a property value is of type entity, it can be assumed to be a reference. - # elif (is_reference(p) and isinstance(p.value, db.Entity) - # and p.value.id is None): + return True elif isinstance(p.value, db.Entity) and p.value.id is None: - return False - return True + return True + return False def create_flat_list(self, ent_list: List[db.Entity], flat: List[db.Entity]): """ @@ -529,24 +524,26 @@ class Crawler(object): # TODO: move inside if block? self.create_flat_list([p.value], flat) - def all_references_are_existing_already(self, record: db.Record): + def has_missing_object_in_references(self, record: db.Record): """ - returns true if all references either have IDs or were checked remotely and not found (i.e. - they exist in the local cache) + returns False if any property value is a db.Entity object that + is contained in the `remote_missing_cache`. If the record has such an object in the + reference properties, it means that it references another Entity, where we checked + whether it exists remotely and it was not found. """ for p in record.properties: # if (is_reference(p) # Entity instead of ID and not cached locally if (isinstance(p.value, list)): for el in p.value: - if (isinstance(el, db.Entity) and el.id is None - and self.get_identified_record_from_local_cache(el) is None): - return False - if (isinstance(p.value, db.Entity) and p.value.id is None - and self.get_identified_record_from_local_cache(p.value) is None): + if (isinstance(el, db.Entity) + and self.get_from_remote_missing_cache(el) is not None): + return True + if (isinstance(p.value, db.Entity) + and self.get_from_remote_missing_cache(p.value) is not None): # might be checked when reference is resolved - return False - return True + return True + return False def replace_references_with_cached(self, record: db.Record): """ @@ -559,7 +556,7 @@ class Crawler(object): lst = [] for el in p.value: if (isinstance(el, db.Entity) and el.id is None): - cached = self.get_identified_record_from_local_cache( + cached = self.get_from_any_cache( el) if cached is None: raise RuntimeError("Not in cache.") @@ -574,7 +571,7 @@ class Crawler(object): lst.append(el) p.value = lst if (isinstance(p.value, db.Entity) and p.value.id is None): - cached = self.get_identified_record_from_local_cache(p.value) + cached = self.get_from_any_cache(p.value) if cached is None: raise RuntimeError("Not in cache.") if not check_identical(cached, p.value, True): @@ -585,7 +582,7 @@ class Crawler(object): raise RuntimeError("Not identical.") p.value = cached - def get_identified_record_from_local_cache(self, record: db.Record): + def get_from_remote_missing_cache(self, record: db.Record): """ returns the identifiable if an identifiable with the same values already exists locally (Each identifiable that is not found on the remote server, is 'cached' locally to prevent @@ -599,12 +596,57 @@ class Crawler(object): identifiable = record # return None - if identifiable in self.identified_cache: - return self.identified_cache[identifiable] + if identifiable in self.remote_missing_cache: + return self.remote_missing_cache[identifiable] else: return None - def add_identified_record_to_local_cache(self, record: db.Record): + def get_from_any_cache(self, record: db.Record): + """ + returns the identifiable if an identifiable with the same values already exists locally + (Each identifiable that is not found on the remote server, is 'cached' locally to prevent + that the same identifiable exists twice) + """ + if self.identifiableAdapter is None: + raise RuntimeError("Should not happen.") + identifiable = self.identifiableAdapter.get_identifiable(record) + if identifiable is None: + # TODO: check whether the same idea as below works here + identifiable = record + # return None + + if identifiable in self.remote_existing_cache: + return self.remote_existing_cache[identifiable] + elif identifiable in self.remote_missing_cache: + return self.remote_missing_cache[identifiable] + else: + return None + + def add_to_remote_missing_cache(self, record: db.Record): + """ + adds the given identifiable to the local cache + + No identifiable with the same values must exist locally. + (Each identifiable that is not found on the remote server, is 'cached' locally to prevent + that the same identifiable exists twice) + + Return False if there is no identifiable for this record and True otherwise. + """ + self.add_to_cache(record=record, cache=self.remote_missing_cache) + + def add_to_remote_existing_cache(self, record: db.Record): + """ + adds the given identifiable to the local cache + + No identifiable with the same values must exist locally. + (Each identifiable that is not found on the remote server, is 'cached' locally to prevent + that the same identifiable exists twice) + + Return False if there is no identifiable for this record and True otherwise. + """ + self.add_to_cache(record=record, cache=self.remote_existing_cache) + + def add_to_cache(self, record: db.Record, cache): """ adds the given identifiable to the local cache @@ -629,14 +671,23 @@ class Crawler(object): # if there is no identifiable, for the cache that is the same # as if the complete entity is the identifiable: identifiable = record - self.identified_cache.add(identifiable=identifiable, record=record) + cache.add(identifiable=identifiable, record=record) - def copy_attributes(self, fro: db.Entity, to: db.Entity): - """ - Copy all attributes from one entity to another entity. + @staticmethod + def bend_references_to_new_object(old, new, entities): + """ Bend references to the other object + Iterate over all entities in `entities` and check the values of all properties of + occurances of old Entity and replace them with new Entity """ - - merge_entities(to, fro) + for el in entities: + for p in el.properties: + if isinstance(p.value, list): + for index, val in enumerate(p.value): + if val is old: + p.value[index] = new + else: + if p.value is old: + p.value = new def split_into_inserts_and_updates(self, ent_list: List[db.Entity]): if self.identifiableAdapter is None: @@ -664,94 +715,54 @@ class Crawler(object): if (record.id is not None or record in to_be_inserted): raise RuntimeError("This should not be reached since treated elements" "are removed from the list") - # Check the local cache first for duplicate - elif self.get_identified_record_from_local_cache(record) is not None: - - # This record is a duplicate that can be removed. Make sure we do not lose - # information - # Update an (local) identified record that will be inserted - newrecord = self.get_identified_record_from_local_cache( - record) - self.copy_attributes(fro=record, to=newrecord) - # Bend references to the other object - # TODO refactor this - for el in flat + to_be_inserted + to_be_updated: - for p in el.properties: - if isinstance(p.value, list): - for index, val in enumerate(p.value): - if val is record: - p.value[index] = newrecord - else: - if p.value is record: - p.value = newrecord + # Check whether this record is a duplicate that can be removed + elif self.get_from_any_cache(record) is not None: + # We merge the two in order to prevent loss of information + newrecord = self.get_from_any_cache(record) + merge_entities(newrecord, record) + Crawler.bend_references_to_new_object( + old=record, new=newrecord, entities=flat+to_be_updated+to_be_inserted) del flat[i] + resolved_references = True - # all references need to be IDs that exist on the remote server - elif self.can_be_checked_externally(record): - - # Check remotely + # can we check whether the record(identifiable) exists on the remote server? + elif not self.has_reference_value_without_id( + self.identifiableAdapter.get_identifiable(record)): # TODO: remove deepcopy? - identified_record = self.identifiableAdapter.retrieve_identified_record_for_record( - deepcopy(record)) + identified_record = ( + self.identifiableAdapter.retrieve_identified_record_for_record( + deepcopy(record))) if identified_record is None: - # identifiable does not exist remotely + # identifiable does not exist remotely -> record needs to be inserted to_be_inserted.append(record) - self.add_identified_record_to_local_cache(record) + self.add_to_remote_missing_cache(record) del flat[i] else: # side effect record.id = identified_record.id - # On update every property needs to have an ID. - # This will be achieved by the function execute_updates_in_list below. - # For files this is not enough, we also need to copy over - # checksum and size: + # Copy over checksum and size too if it is a file if isinstance(record, db.File): record._size = identified_record._size record._checksum = identified_record._checksum to_be_updated.append(record) - # TODO think this through - self.add_identified_record_to_local_cache(record) + self.add_to_remote_existing_cache(record) del flat[i] resolved_references = True - # e.g. references an identifiable that does not exist remotely - elif self.all_references_are_existing_already(record): - - # TODO: (for review) - # This was the old version, but also for this case the - # check for identifiables has to be done. - # to_be_inserted.append(record) - # self.add_identified_record_to_local_cache(record) - # del flat[i] - - # TODO: (for review) - # If the following replacement is not done, the cache will - # be invalid as soon as references are resolved. - # replace references by versions from cache: - self.replace_references_with_cached(record) - - identified_record = self.identifiableAdapter.retrieve_identified_record_for_record( - deepcopy(record)) - if identified_record is None: - # identifiable does not exist remotely - to_be_inserted.append(record) - self.add_identified_record_to_local_cache(record) - del flat[i] - else: - # side effect - record.id = identified_record.id - # On update every property needs to have an ID. - # This will be achieved by the function execute_updates_in_list below. - - to_be_updated.append(record) - # TODO think this through - self.add_identified_record_to_local_cache(record) - del flat[i] - + # is it impossible to check this record because an identifiable references a + # missing record? + elif self.has_missing_object_in_references( + self.identifiableAdapter.get_identifiable(record)): + to_be_inserted.append(record) + self.add_to_remote_missing_cache(record) + del flat[i] resolved_references = True + for record in flat: + self.replace_references_with_cached(record) + if len(flat) > 0: raise RuntimeError( "Could not resolve all Entity references. Circular Dependency?") diff --git a/src/caoscrawler/identifiable_adapters.py b/src/caoscrawler/identifiable_adapters.py index d4c2b1d04316946dc28fec15489e0dc390cb9dd3..15b006c0fb73f61bb3f65d90d828889c44e734e8 100644 --- a/src/caoscrawler/identifiable_adapters.py +++ b/src/caoscrawler/identifiable_adapters.py @@ -208,18 +208,10 @@ class IdentifiableAdapter(metaclass=ABCMeta): # TODO: how to handle missing values in identifiables # raise an exception? raise NotImplementedError( - f"RECORD\n{record}\nPROPERTY\n{prop.name}" + f"The following record is missing an identifying property:" + f"RECORD\n{record}\nIdentifying PROPERTY\n{prop.name}" ) newval = record_prop.value - if isinstance(record_prop.value, db.Entity): - newval = self.resolve_reference(record_prop.value) - elif isinstance(record_prop.value, list): - newval = list() - for element in record_prop.value: - if isinstance(element, db.Entity): - newval.append(self.resolve_reference(element)) - else: - newval.append(element) record_prop_new = db.Property(name=record_prop.name, id=record_prop.id, description=record_prop.description, @@ -371,16 +363,13 @@ class LocalStorageIdentifiableAdapter(IdentifiableAdapter): # a) prop_record.value has a registered identifiable: # in this case, fetch the identifiable and set the value accordingly if isinstance(prop.value, db.Entity): # lists are not checked here - registered = self.get_registered_identifiable(prop.value) + otherid = prop_record.value + if isinstance(prop_record.value, db.Entity): + otherid = prop_record.value.id + if prop.value.id != otherid: + return False - if registered is None: - raise NotImplementedError("Non-identifiable references cannot" - " be used as properties in identifiables.") - - raise RuntimeError("The identifiable which is used as property" - " here has to be inserted first.") - - if prop.value != prop_record.value: + elif prop.value != prop_record.value: return False return True diff --git a/unittests/test_tool.py b/unittests/test_tool.py index a190efdeaaa9b3ede8d6fc1b9d1fb2d6e0d9c210..f08f8061728e2819b70079cabc0a10796154f619 100755 --- a/unittests/test_tool.py +++ b/unittests/test_tool.py @@ -390,10 +390,10 @@ def test_split_into_inserts_and_updates_single(crawler_mocked_identifiable_retri entlist = [db.Record(name="A").add_parent( "C"), db.Record(name="B").add_parent("C")] - assert crawler.get_identified_record_from_local_cache(entlist[0]) is None - assert crawler.get_identified_record_from_local_cache(entlist[1]) is None - assert crawler.can_be_checked_externally(entlist[0]) - assert crawler.can_be_checked_externally(entlist[1]) + assert crawler.get_from_any_cache(entlist[0]) is None + assert crawler.get_from_any_cache(entlist[1]) is None + assert not crawler.has_reference_value_without_id(entlist[0]) + assert not crawler.has_reference_value_without_id(entlist[1]) assert crawler.identifiableAdapter.retrieve_identified_record_for_record( entlist[0]).id == 1111 assert crawler.identifiableAdapter.retrieve_identified_record_for_record( @@ -498,45 +498,67 @@ def test_split_into_inserts_and_updates_with_copy_attr(crawler_mocked_identifiab crawler.identifiableAdapter.retrieve_identified_record_for_record.assert_called() -def test_all_references_are_existing_already(crawler): +def test_has_missing_object_in_references(crawler): # Simulate remote server content by using the names to identify records # There are only two known Records with name A and B crawler.identifiableAdapter.get_registered_identifiable = Mock(side_effect=partial( - basic_retrieve_by_name_mock_up, known={"A": db.Record(name="A").add_parent("C"), - "B": db.Record(name="B").add_parent("C")})) - - assert crawler.all_references_are_existing_already( - db.Record().add_property('a', 123)) - assert crawler.all_references_are_existing_already(db.Record() - .add_property('a', db.Record(id=123))) - assert crawler.all_references_are_existing_already(db.Record() - .add_property('a', 123) - .add_property('b', db.Record(id=123))) - assert not crawler.all_references_are_existing_already(db.Record() - .add_property('a', 123) - .add_property('b', db.Record(name="A") - .add_parent("C"))) - a = db.Record(name="A").add_parent("C") - crawler.add_identified_record_to_local_cache(a) - assert crawler.all_references_are_existing_already(db.Record() - .add_property('a', 123) - .add_property('b', a)) + basic_retrieve_by_name_mock_up, known={"C": db.Record(name="C").add_parent("RTC") + .add_property("d"), + "D": db.Record(name="D").add_parent("RTD") + .add_property("d").add_property("e"), + })) + + # one reference with id -> check + assert not crawler.has_missing_object_in_references(db.Record(name="C") + .add_parent("RTC").add_property('d', 123)) + # one ref with Entity with id -> check + assert not crawler.has_missing_object_in_references(db.Record(name="C") + .add_parent("RTC") + .add_property('d', db.Record(id=123) + .add_parent("C"))) + # one ref with id one with Entity with id (mixed) -> check + assert not crawler.has_missing_object_in_references(db.Record(name="C").add_parent("RTD") + .add_property('d', 123) + .add_property('b', db.Record(id=123) + .add_parent("RTC"))) + # entity to be referenced in the following + a = db.Record(name="C").add_parent("C").add_property("d", 12311) + # one ref with id one with Entity without id (but not identifying) -> fail + assert not crawler.has_missing_object_in_references(db.Record(name="C").add_parent("RTC") + .add_property('d', 123) + .add_property('e', a)) + + # one ref with id one with Entity without id (mixed) -> fail + assert not crawler.has_missing_object_in_references(db.Record(name="D").add_parent("RTD") + .add_property('d', 123) + .add_property('e', a)) + crawler.add_to_remote_missing_cache(a) + # one ref with id one with Entity without id but in cache -> check + assert crawler.has_missing_object_in_references(db.Record(name="D").add_parent("RTD") + .add_property('d', 123) + .add_property('e', a)) # if this ever fails, the mock up may be removed crawler.identifiableAdapter.get_registered_identifiable.assert_called() -def test_can_be_checked_externally(crawler): - assert crawler.can_be_checked_externally( - db.Record().add_property('a', 123)) - assert crawler.can_be_checked_externally(db.Record() - .add_property('a', db.Record(id=123))) - assert crawler.can_be_checked_externally(db.Record() - .add_property('a', 123) - .add_property('b', db.Record(id=123))) - - assert not crawler.can_be_checked_externally(db.Record() - .add_property('a', 123) - .add_property('b', db.Record())) +def test_references_entities_without_ids(crawler, ident): + assert not crawler.has_reference_value_without_id(db.Record().add_parent("Person") + .add_property('last_name', 123) + .add_property('first_name', 123)) + # id and rec with id + assert not crawler.has_reference_value_without_id(db.Record().add_parent("Person") + .add_property('first_name', 123) + .add_property('last_name', db.Record(id=123))) + # id and rec with id and one unneeded prop + assert crawler.has_reference_value_without_id(db.Record().add_parent("Person") + .add_property('first_name', 123) + .add_property('stuff', db.Record()) + .add_property('last_name', db.Record(id=123))) + + # one identifying prop is missing + assert crawler.has_reference_value_without_id(db.Record().add_parent("Person") + .add_property('first_name', 123) + .add_property('last_name', db.Record())) def test_replace_entities_with_ids(crawler):