diff --git a/CHANGELOG.md b/CHANGELOG.md
index 7aade2344e6ac0ed9e66a12ec6c4e3e001ab0905..6fd18cee55ad484fa54aa527b997fd757fbedde0 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -18,8 +18,12 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ### Removed ###
 
 ### Fixed ###
-* FIX: https://gitlab.com/caosdb/caosdb-crawler/-/issues/31 Identified cache: Hash is the same for Records without IDs
-* FIX: https://gitlab.com/caosdb/caosdb-crawler/-/issues/30
+
+* [#31](https://gitlab.com/caosdb/caosdb-crawler/-/issues/31) Identified cache:
+  Hash is the same for Records without IDs
+* [#30](https://gitlab.com/caosdb/caosdb-crawler/-/issues/30)
+* [#23](https://gitlab.com/caosdb/caosdb-crawler/-/issues/23) Crawler may
+  overwrite and delete existing data in case of manually added properties
 
 ### Security ###
diff --git a/integrationtests/basic_example/test_basic.py b/integrationtests/basic_example/test_basic.py
index b24a1c658cfc9e23ca0ba2de266161864cb6b66c..571720ab1dafd0384672a55ec5c6e8a8aeffb605 100755
--- a/integrationtests/basic_example/test_basic.py
+++ b/integrationtests/basic_example/test_basic.py
@@ -108,7 +108,7 @@ def crawler_extended(ident):
     cr = Crawler(debug=True, identifiableAdapter=ident)
     crawl_standard_test_directory(cr, cfood="scifolder_extended.yml")
     # correct paths for current working directory
-    file_list = [r for r in cr.target_data if r.role == "File"]
+    file_list = [r for r in cr.crawled_data if r.role == "File"]
     for f in file_list:
         f.file = rfp("..", "..", "unittests", "test_directories", f.file)
     return cr
@@ -160,7 +160,7 @@ def test_insertion(clear_database, usemodel, ident, crawler):
     # Do a second run on the same data, there should a new insert:
     cr = Crawler(debug=True, identifiableAdapter=ident)
     crawl_standard_test_directory(cr, "example_insert")
-    assert len(cr.target_data) == 3
+    assert len(cr.crawled_data) == 3
     ins, ups = cr.synchronize()
     assert len(ins) == 1
     assert len(ups) == 0
@@ -168,7 +168,7 @@ def test_insertion(clear_database, usemodel, ident, crawler):
     # Do it again to check whether nothing is changed:
     cr = Crawler(debug=True, identifiableAdapter=ident)
     crawl_standard_test_directory(cr, "example_insert")
-    assert len(cr.target_data) == 3
+    assert len(cr.crawled_data) == 3
     ins, ups = cr.synchronize()
     assert len(ins) == 0
     assert len(ups) == 0
@@ -180,7 +180,7 @@ def test_insert_auth(clear_database, usemodel, ident, crawler):
     # Do a second run on the same data, there should a new insert:
     cr = Crawler(debug=True, identifiableAdapter=ident, securityMode=SecurityMode.RETRIEVE)
     crawl_standard_test_directory(cr, "example_insert")
-    assert len(cr.target_data) == 3
+    assert len(cr.crawled_data) == 3
     ins, ups = cr.synchronize()
     assert len(ins) == 1
     assert not ins[0].is_valid()
@@ -190,7 +190,7 @@ def test_insert_auth(clear_database, usemodel, ident, crawler):
     # Do it again to check whether nothing is changed:
     cr = Crawler(debug=True, identifiableAdapter=ident)
     crawl_standard_test_directory(cr, "example_insert")
-    assert len(cr.target_data) == 3
+    assert len(cr.crawled_data) == 3
     ins, ups = cr.synchronize()
     assert len(ins) == 0
     assert len(ups) == 0
@@ -205,9 +205,9 @@ def test_insertion_and_update(clear_database, usemodel, ident, crawler):
 
     cr = Crawler(debug=True, identifiableAdapter=ident)
     crawl_standard_test_directory(cr, "example_overwrite_1")
-    # print(cr.target_data)
+    # print(cr.crawled_data)
     # cr.save_debug_data(rfp("provenance.yml"))
-    assert len(cr.target_data) == 3
+    assert len(cr.crawled_data) == 3
     ins, ups = cr.synchronize()
     assert len(ins) == 0
     assert len(ups) == 1
@@ -222,7 +222,7 @@ def test_identifiable_update(clear_database, usemodel, ident, crawler):
     crawl_standard_test_directory(cr)
 
     # Test the addition of a single property:
-    l = cr.target_data
+    l = cr.crawled_data
     for record in l:
         if (record.parents[0].name == "Measurement" and
                 record.get_property("date").value == "2020-01-03"):
@@ -238,7 +238,7 @@ def test_identifiable_update(clear_database, usemodel, ident, crawler):
     # Test the change within one property:
     cr = Crawler(debug=True, identifiableAdapter=ident)
     crawl_standard_test_directory(cr)
-    l = cr.target_data
+    l = cr.crawled_data
     for record in l:
         if (record.parents[0].name == "Measurement" and
                 record.get_property("date").value == "2020-01-03"):
@@ -252,7 +252,7 @@ def test_identifiable_update(clear_database, usemodel, ident, crawler):
     # Changing the date should result in a new insertion:
     cr = Crawler(debug=True, identifiableAdapter=ident)
     crawl_standard_test_directory(cr)
-    l = cr.target_data
+    l = cr.crawled_data
     for record in l:
         if (record.parents[0].name == "Measurement" and
                 record.get_property("date").value == "2020-01-03"):
@@ -269,7 +269,7 @@ def test_file_insertion_dry(clear_database, usemodel, ident):
     crawler_extended = Crawler(debug=True, identifiableAdapter=ident)
     crawl_standard_test_directory(
         crawler_extended, cfood="scifolder_extended.yml")
-    file_list = [r for r in crawler_extended.target_data if r.role == "File"]
+    file_list = [r for r in crawler_extended.crawled_data if r.role == "File"]
     assert len(file_list) == 11
 
     for f in file_list:
@@ -305,7 +305,7 @@ def test_file_update(clear_database, usemodel, ident, crawler_extended):
     cr = Crawler(debug=True, identifiableAdapter=ident)
     crawl_standard_test_directory(cr, cfood="scifolder_extended.yml")
 
-    file_list = [r for r in cr.target_data if r.role == "File"]
+    file_list = [r for r in cr.crawled_data if r.role == "File"]
     for f in file_list:
         f.file = rfp("..", "..", "unittests", "test_directories", f.file)
     ins2, ups2 = cr.synchronize(commit_changes=True)
@@ -320,7 +320,7 @@ def test_file_update(clear_database, usemodel, ident, crawler_extended):
     cr2 = Crawler(debug=True, identifiableAdapter=ident)
     crawl_standard_test_directory(cr2, cfood="scifolder_extended2.yml")
 
-    file_list = [r for r in cr2.target_data if r.role == "File"]
+    file_list = [r for r in cr2.crawled_data if r.role == "File"]
     for f in file_list:
         f.file = rfp("..", "..", "unittests", "test_directories", f.file)
     ins3, ups3 = cr2.synchronize(commit_changes=True)
diff --git a/integrationtests/test_issues.py b/integrationtests/test_issues.py
index f0f2a7876f9db85aeff768c3659915ef2b0cb9f5..78beb63c55bbce9cb1082345bdb4ccb1f9a3129f 100644
--- a/integrationtests/test_issues.py
+++ b/integrationtests/test_issues.py
@@ -30,11 +30,12 @@ def clear_database():
     db.execute_query("FIND ENTITY").delete(raise_exception_on_error=False)
 
 
-@mark.xfail(reason="See issue https://gitlab.com/caosdb/caosdb-crawler/-/issues/23")
 def test_issue_23(clear_database):
     """Test that an update leaves existing properties, that were not found by
     the crawler, unchanged.
 
+    See issue https://gitlab.com/caosdb/caosdb-crawler/-/issues/23
+
     """
     # insert a simplistic model an arecord of type TestType with identifying
diff --git a/setup.cfg b/setup.cfg
index 3a9ad9b613380831eaae0cd3502e3a8ed4f22e5a..994ba7cacf2fc3cf5ff93a17a23eb5bcec34067f 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -20,7 +20,7 @@ packages = find:
 python_requires = >=3.8
 install_requires =
     importlib-resources
-    caosdb
+    caosdb > 0.9.0
     caosadvancedtools >= 0.6.0
     yaml-header-tools >= 0.2.1
     pyyaml
diff --git a/src/caoscrawler/crawl.py b/src/caoscrawler/crawl.py
index 6187322b902ed9ea5861a945c318c79b018ccf16..ad77e678a31a9bd950c89019f38ce58a20d9c2e3 100644
--- a/src/caoscrawler/crawl.py
+++ b/src/caoscrawler/crawl.py
@@ -113,7 +113,7 @@ def check_identical(record1: db.Entity, record2: db.Entity, ignore_id=False):
             return False
         for attribute in ("datatype", "importance", "unit"):
             # only make an update for those attributes if there is a value difference and
-            # the value in the target_data is not None
+            # the value in the crawled_data is not None
             if attribute in comp[0]["properties"][key]:
                 attr_val = comp[0]["properties"][key][attribute]
                 other_attr_val = (comp[1]["properties"][key][attribute]
@@ -447,7 +447,7 @@ class Crawler(object):
 
         Returns
         -------
-        target_data : list
+        crawled_data : list
            the final list with the target state of Records.
 
        """
@@ -463,14 +463,14 @@ class Crawler(object):
         local_converters = Crawler.initialize_converters(
             crawler_definition, converter_registry)
         # This recursive crawling procedure generates the update list:
-        self.target_data: List[db.Record] = []
+        self.crawled_data: List[db.Record] = []
         self._crawl(items, local_converters, self.generalStore,
                     self.recordStore, [], [])
 
         if self.debug:
             self.debug_converters = local_converters
 
-        return self.target_data
+        return self.crawled_data
 
     def synchronize(self, commit_changes: bool = True, unique_names=True):
         """
@@ -480,7 +480,7 @@
 
         # After the crawling, the actual synchronization with the database, based on the
         # update list is carried out:
-        return self._synchronize(self.target_data, commit_changes, unique_names=unique_names)
+        return self._synchronize(self.crawled_data, commit_changes, unique_names=unique_names)
 
     def has_reference_value_without_id(self, record: db.Record):
         """
@@ -780,28 +780,70 @@ class Crawler(object):
                             el.value[index] = val.id
 
     @staticmethod
-    def remove_unnecessary_updates(target_data: List[db.Record],
-                                   identified_records: List[db.Record]):
+    def _merge_properties_from_remote(
+            crawled_data: List[db.Record],
+            identified_records: List[db.Record]
+    ):
+        """Merge entity representation that was created by crawling the data with remotely found
+        identified records s.th. new properties and property values are updated correctly but
+        additional properties are not overwritten.
+
+        Parameters
+        ----------
+        crawled_data : list[db.Record]
+            List of the Entities created by the crawler
+        identified_records : list[db.Record]
+            List of identified remote Records
+
+        Returns
+        -------
+        to_be_updated : list[db.Record]
+            List of merged records
         """
-        checks whether all relevant attributes (especially Property values) are equal
+        to_be_updated = []
+        for target, identified in zip(crawled_data, identified_records):
+            # Special treatment for name and description in case they have been
+            # set in the server independently from the crawler
+            for attr in ["name", "description"]:
+                if getattr(target, attr) is None:
+                    # The crawler didn't find any name or description, i.e., not
+                    # an empty one. In this case (and only in this), keep any
+                    # existing name or description.
+                    setattr(target, attr, getattr(identified, attr))
+
+            # Create a temporary copy since the merge will be conducted in place
+            tmp = deepcopy(identified)
+            # A force merge will overwrite any properties that both the
+            # identified and the crawled record have with the values of the
+            # crawled record while keeping existing properties intact.
+            merge_entities(tmp, target, force=True)
+            to_be_updated.append(tmp)
+
+        return to_be_updated
 
-        Returns (in future)
+    @staticmethod
+    def remove_unnecessary_updates(
+            crawled_data: List[db.Record],
+            identified_records: List[db.Record]
+    ):
+        """Compare the Records to be updated with their remote
+        correspondant. Only update if there are actual differences.
+
+        Returns
         -------
         update list without unecessary updates
         """
-        if len(target_data) != len(identified_records):
+        if len(crawled_data) != len(identified_records):
             raise RuntimeError("The lists of updates and of identified records need to be of the "
                                "same length!")
-        # TODO this can now easily be changed to a function without side effect
-        for i in reversed(range(len(target_data))):
-            identical = check_identical(target_data[i], identified_records[i])
+        actual_updates = []
+        for i in reversed(range(len(crawled_data))):
 
-            if identical:
-                del target_data[i]
-                continue
-            else:
-                pass
+            if not check_identical(crawled_data[i], identified_records[i]):
+                actual_updates.append(crawled_data[i])
+
+        return actual_updates
 
     @staticmethod
     def execute_parent_updates_in_list(to_be_updated, securityMode, run_id, unique_names):
@@ -894,12 +936,12 @@ class Crawler(object):
             update_cache = UpdateCache()
             update_cache.insert(to_be_updated, run_id)
 
-    def _synchronize(self, target_data: List[db.Record], commit_changes: bool = True,
+    def _synchronize(self, crawled_data: List[db.Record], commit_changes: bool = True,
                      unique_names=True):
         """
         This function applies several stages:
-        1) Retrieve identifiables for all records in target_data.
-        2) Compare target_data with existing records.
+        1) Retrieve identifiables for all records in crawled_data.
+        2) Compare crawled_data with existing records.
         3) Insert and update records based on the set of identified differences.
 
         This function makes use of an IdentifiableAdapter which is used to retrieve
@@ -914,8 +956,7 @@ class Crawler(object):
         if self.identifiableAdapter is None:
             raise RuntimeError("Should not happen.")
 
-        to_be_inserted, to_be_updated = self.split_into_inserts_and_updates(
-            target_data)
+        to_be_inserted, to_be_updated = self.split_into_inserts_and_updates(crawled_data)
 
         # TODO: refactoring of typo
         for el in to_be_updated:
@@ -926,8 +967,13 @@ class Crawler(object):
             self.identifiableAdapter.retrieve_identified_record_for_record(
                 record) for record in to_be_updated]
 
-        # remove unnecessary updates from list by comparing the target records to the existing ones
-        self.remove_unnecessary_updates(to_be_updated, identified_records)
+        # Merge with existing data to prevent unwanted overwrites
+        to_be_updated = self._merge_properties_from_remote(to_be_updated,
+                                                           identified_records)
+        # remove unnecessary updates from list by comparing the target records
+        # to the existing ones
+        to_be_updated = self.remove_unnecessary_updates(
+            to_be_updated, identified_records)
 
         if commit_changes:
             self.execute_parent_updates_in_list(to_be_updated, securityMode=self.securityMode,
@@ -1086,7 +1132,7 @@ ____________________\n""".format(i + 1, len(pending_changes)) + str(el[3]))
         # to the general update container.
         scoped_records = recordStore.get_records_current_scope()
         for record in scoped_records:
-            self.target_data.append(record)
+            self.crawled_data.append(record)
 
         # TODO: the scoped variables should be cleaned up as soon if the variables
         # are no longer in the current scope. This can be implemented as follows,
@@ -1099,7 +1145,7 @@ ____________________\n""".format(i + 1, len(pending_changes)) + str(el[3]))
         # del recordStore[name]
         # del generalStore[name]
 
-        return self.target_data
+        return self.crawled_data
 
 
 def crawler_main(crawled_directory_path: str,
@@ -1161,7 +1207,7 @@ def crawler_main(crawled_directory_path: str,
                             "update": updates}))
         else:
             rtsfinder = dict()
-            for elem in crawler.target_data:
+            for elem in crawler.crawled_data:
                 if isinstance(elem, db.File):
                     # correct the file path:
                     # elem.file = os.path.join(args.path, elem.file)
diff --git a/unittests/test_tool.py b/unittests/test_tool.py
index 85440fc76ca3dbdec4a88862abbc7d961fee8f5f..0eef86b3a9f5ef6f64d9ccb9ce0102cd87208fa4 100755
--- a/unittests/test_tool.py
+++ b/unittests/test_tool.py
@@ -184,7 +184,7 @@ def test_record_structure_generation(crawler):
 
 def test_ambigious_records(crawler, ident):
     ident.get_records().clear()
-    ident.get_records().extend(crawler.target_data)
+    ident.get_records().extend(crawler.crawled_data)
     r = ident.get_records()
     id_r0 = ident.get_identifiable(r[0])
     with raises(RuntimeError, match=".*unambigiously.*"):
@@ -206,7 +206,7 @@ def test_crawler_update_list(crawler, ident):
     ) == 2
 
     # The crawler contains lots of duplicates, because identifiables have not been resolved yet:
-    assert len(ident.get_records()) != len(crawler.target_data)
+    assert len(ident.get_records()) != len(crawler.crawled_data)
 
     # Check consistency:
     # Check whether identifiables retrieved from current identifiable store return
@@ -296,8 +296,8 @@ def test_remove_unnecessary_updates():
     # test trvial case
     upl = [db.Record().add_parent("A")]
     irs = [db.Record().add_parent("A")]
-    Crawler.remove_unnecessary_updates(upl, irs)
-    assert len(upl) == 0
+    updates = Crawler.remove_unnecessary_updates(upl, irs)
+    assert len(updates) == 0
 
     # test property difference case
     # TODO this should work right?
@@ -309,24 +309,24 @@ def test_remove_unnecessary_updates():
     # test value difference case
     upl = [db.Record().add_parent("A").add_property("a", 5)]
     irs = [db.Record().add_parent("A").add_property("a")]
-    Crawler.remove_unnecessary_updates(upl, irs)
-    assert len(upl) == 1
+    updates = Crawler.remove_unnecessary_updates(upl, irs)
+    assert len(updates) == 1
 
     upl = [db.Record().add_parent("A").add_property("a", 5)]
     irs = [db.Record().add_parent("A").add_property("a", 5)]
-    Crawler.remove_unnecessary_updates(upl, irs)
-    assert len(upl) == 0
+    updates = Crawler.remove_unnecessary_updates(upl, irs)
+    assert len(updates) == 0
 
     # test unit difference case
     upl = [db.Record().add_parent("A").add_property("a", unit='cm')]
     irs = [db.Record().add_parent("A").add_property("a")]
-    Crawler.remove_unnecessary_updates(upl, irs)
-    assert len(upl) == 1
+    updates = Crawler.remove_unnecessary_updates(upl, irs)
+    assert len(updates) == 1
 
     # test None difference case
     upl = [db.Record().add_parent("A").add_property("a")]
    irs = [db.Record().add_parent("A").add_property("a", 5)]
-    Crawler.remove_unnecessary_updates(upl, irs)
-    assert len(upl) == 1
+    updates = Crawler.remove_unnecessary_updates(upl, irs)
+    assert len(updates) == 1
 
     # Current status:
@@ -339,7 +339,7 @@ def test_identifiable_adapter_no_identifiable(crawler, ident):
     insl, updl = crawler.synchronize()
     assert len(updl) == 0
 
-    pers = [r for r in crawler.target_data if r.parents[0].name == "Person"]
+    pers = [r for r in crawler.crawled_data if r.parents[0].name == "Person"]
     # All persons are inserted, because they are not identifiable:
     assert len(insl) == len(pers)