diff --git a/src/caoscrawler/crawl.py b/src/caoscrawler/crawl.py index 201b01edcef16f9656b4ca1ad174e55e8ff8d947..d9b06a505c72dc6e8de91d09d2447b3d0c2840b4 100644 --- a/src/caoscrawler/crawl.py +++ b/src/caoscrawler/crawl.py @@ -740,31 +740,13 @@ class Crawler(object): else: # side effect record.id = identified_record.id - # Special treatment for name and description in case - # they have been set in the server independently from - # the crawler - for attr in ["name", "description"]: - if getattr(record, attr) is None: - # The crawler didn't find any name or - # description, i.e., not an empty one. In this - # case (and only in this), keep any existing - # name or description. - setattr(record, attr, getattr( - identified_record, attr)) # Copy over checksum and size too if it is a file if isinstance(record, db.File): record._size = identified_record._size record._checksum = identified_record._checksum - # Create a temporary copy since the merge will be conducted in place - tmp = deepcopy(identified_record) - # A force merge will overwrite any properties that both - # the identified and the crawled record have with the - # values of the crawled record while keeping existing - # properties intact. - merge_entities(tmp, record, force=True) - to_be_updated.append(tmp) - self.add_to_remote_existing_cache(tmp) + to_be_updated.append(record) + self.add_to_remote_existing_cache(record) del flat[i] resolved_references = True @@ -798,12 +780,56 @@ class Crawler(object): el.value[index] = val.id @staticmethod - def remove_unnecessary_updates(target_data: List[db.Record], - identified_records: List[db.Record]): + def _merge_properties_from_remote( + target_data: List[db.Record], + identified_records: List[db.Record] + ): + """Merge planned updates with remotely found identified records + s.th. new properties and property values are updated correctly but + additional properties are not overwritten. + + Parameters + ---------- + target_data : list[db.Record] + List of the updates found by the crawler + identified_records : list[db.Record] + List of identified remote Records + + Returns + ------- + to_be_updated : list[db.Record] + List of merged records """ - checks whether all relevant attributes (especially Property values) are equal + to_be_updated = [] + for target, identified in zip(target_data, identified_records): + # Special treatment for name and description in case they have been + # set in the server independently from the crawler + for attr in ["name", "description"]: + if getattr(target, attr) is None: + # The crawler didn't find any name or description, i.e., not + # an empty one. In this case (and only in this), keep any + # existing name or description. + setattr(target, attr, getattr(identified, attr)) + + # Create a temporary copy since the merge will be conducted in place + tmp = deepcopy(identified) + # A force merge will overwrite any properties that both the + # identified and the crawled record have with the values of the + # crawled record while keeping existing properties intact. + merge_entities(tmp, target, force=True) + to_be_updated.append(tmp) + + return to_be_updated - Returns (in future) + @staticmethod + def remove_unnecessary_updates( + target_data: List[db.Record], + identified_records: List[db.Record] + ): + """Compare the Records to be updated with their remote + correspondant. Only update if there are actual differences. + + Returns ------- update list without unecessary updates @@ -811,15 +837,13 @@ class Crawler(object): if len(target_data) != len(identified_records): raise RuntimeError("The lists of updates and of identified records need to be of the " "same length!") - # TODO this can now easily be changed to a function without side effect + actual_updates = [] for i in reversed(range(len(target_data))): - identical = check_identical(target_data[i], identified_records[i]) - if identical: - del target_data[i] - continue - else: - pass + if not check_identical(target_data[i], identified_records[i]): + actual_updates.append(target_data[i]) + + return actual_updates @staticmethod def execute_parent_updates_in_list(to_be_updated, securityMode, run_id, unique_names): @@ -944,8 +968,13 @@ class Crawler(object): self.identifiableAdapter.retrieve_identified_record_for_record( record) for record in to_be_updated] - # remove unnecessary updates from list by comparing the target records to the existing ones - self.remove_unnecessary_updates(to_be_updated, identified_records) + # Merge with existing data to prevent unwanted overwrites + to_be_updated = self._merge_properties_from_remote(to_be_updated, + identified_records) + # remove unnecessary updates from list by comparing the target records + # to the existing ones + to_be_updated = self.remove_unnecessary_updates( + to_be_updated, identified_records) if commit_changes: self.execute_parent_updates_in_list(to_be_updated, securityMode=self.securityMode, diff --git a/unittests/test_tool.py b/unittests/test_tool.py index 85440fc76ca3dbdec4a88862abbc7d961fee8f5f..6fe0b44b9ad3a3505a30e4ed0efeba1929f4eaea 100755 --- a/unittests/test_tool.py +++ b/unittests/test_tool.py @@ -296,8 +296,8 @@ def test_remove_unnecessary_updates(): # test trvial case upl = [db.Record().add_parent("A")] irs = [db.Record().add_parent("A")] - Crawler.remove_unnecessary_updates(upl, irs) - assert len(upl) == 0 + updates = Crawler.remove_unnecessary_updates(upl, irs) + assert len(updates) == 0 # test property difference case # TODO this should work right? @@ -309,24 +309,24 @@ def test_remove_unnecessary_updates(): # test value difference case upl = [db.Record().add_parent("A").add_property("a", 5)] irs = [db.Record().add_parent("A").add_property("a")] - Crawler.remove_unnecessary_updates(upl, irs) - assert len(upl) == 1 + updates = Crawler.remove_unnecessary_updates(upl, irs) + assert len(updates) == 1 upl = [db.Record().add_parent("A").add_property("a", 5)] irs = [db.Record().add_parent("A").add_property("a", 5)] - Crawler.remove_unnecessary_updates(upl, irs) - assert len(upl) == 0 + updates = Crawler.remove_unnecessary_updates(upl, irs) + assert len(updates) == 0 # test unit difference case upl = [db.Record().add_parent("A").add_property("a", unit='cm')] irs = [db.Record().add_parent("A").add_property("a")] - Crawler.remove_unnecessary_updates(upl, irs) - assert len(upl) == 1 + updates = Crawler.remove_unnecessary_updates(upl, irs) + assert len(updates) == 1 # test None difference case upl = [db.Record().add_parent("A").add_property("a")] irs = [db.Record().add_parent("A").add_property("a", 5)] - Crawler.remove_unnecessary_updates(upl, irs) - assert len(upl) == 1 + updates = Crawler.remove_unnecessary_updates(upl, irs) + assert len(updates) == 1 # Current status: