diff --git a/integrationtests/basic_example/test_basic.py b/integrationtests/basic_example/test_basic.py
index b24a1c658cfc9e23ca0ba2de266161864cb6b66c..571720ab1dafd0384672a55ec5c6e8a8aeffb605 100755
--- a/integrationtests/basic_example/test_basic.py
+++ b/integrationtests/basic_example/test_basic.py
@@ -108,7 +108,7 @@ def crawler_extended(ident):
     cr = Crawler(debug=True, identifiableAdapter=ident)
     crawl_standard_test_directory(cr, cfood="scifolder_extended.yml")
     # correct paths for current working directory
-    file_list = [r for r in cr.target_data if r.role == "File"]
+    file_list = [r for r in cr.crawled_data if r.role == "File"]
     for f in file_list:
         f.file = rfp("..", "..", "unittests", "test_directories", f.file)
     return cr
@@ -160,7 +160,7 @@ def test_insertion(clear_database, usemodel, ident, crawler):
     # Do a second run on the same data, there should a new insert:
     cr = Crawler(debug=True, identifiableAdapter=ident)
     crawl_standard_test_directory(cr, "example_insert")
-    assert len(cr.target_data) == 3
+    assert len(cr.crawled_data) == 3
     ins, ups = cr.synchronize()
     assert len(ins) == 1
     assert len(ups) == 0
@@ -168,7 +168,7 @@ def test_insertion(clear_database, usemodel, ident, crawler):
     # Do it again to check whether nothing is changed:
     cr = Crawler(debug=True, identifiableAdapter=ident)
     crawl_standard_test_directory(cr, "example_insert")
-    assert len(cr.target_data) == 3
+    assert len(cr.crawled_data) == 3
     ins, ups = cr.synchronize()
     assert len(ins) == 0
     assert len(ups) == 0
@@ -180,7 +180,7 @@ def test_insert_auth(clear_database, usemodel, ident, crawler):
     # Do a second run on the same data, there should a new insert:
     cr = Crawler(debug=True, identifiableAdapter=ident, securityMode=SecurityMode.RETRIEVE)
     crawl_standard_test_directory(cr, "example_insert")
-    assert len(cr.target_data) == 3
+    assert len(cr.crawled_data) == 3
     ins, ups = cr.synchronize()
     assert len(ins) == 1
     assert not ins[0].is_valid()
@@ -190,7 +190,7 @@ def test_insert_auth(clear_database, usemodel, ident, crawler):
     # Do it again to check whether nothing is changed:
     cr = Crawler(debug=True, identifiableAdapter=ident)
     crawl_standard_test_directory(cr, "example_insert")
-    assert len(cr.target_data) == 3
+    assert len(cr.crawled_data) == 3
     ins, ups = cr.synchronize()
     assert len(ins) == 0
     assert len(ups) == 0
@@ -205,9 +205,9 @@ def test_insertion_and_update(clear_database, usemodel, ident, crawler):

     cr = Crawler(debug=True, identifiableAdapter=ident)
     crawl_standard_test_directory(cr, "example_overwrite_1")
-    # print(cr.target_data)
+    # print(cr.crawled_data)
     # cr.save_debug_data(rfp("provenance.yml"))
-    assert len(cr.target_data) == 3
+    assert len(cr.crawled_data) == 3
     ins, ups = cr.synchronize()
     assert len(ins) == 0
     assert len(ups) == 1
@@ -222,7 +222,7 @@ def test_identifiable_update(clear_database, usemodel, ident, crawler):
     crawl_standard_test_directory(cr)

     # Test the addition of a single property:
-    l = cr.target_data
+    l = cr.crawled_data
     for record in l:
         if (record.parents[0].name == "Measurement" and
                 record.get_property("date").value == "2020-01-03"):
@@ -238,7 +238,7 @@ def test_identifiable_update(clear_database, usemodel, ident, crawler):
     # Test the change within one property:
     cr = Crawler(debug=True, identifiableAdapter=ident)
     crawl_standard_test_directory(cr)
-    l = cr.target_data
+    l = cr.crawled_data
     for record in l:
         if (record.parents[0].name == "Measurement" and
                 record.get_property("date").value == "2020-01-03"):
@@ -252,7 +252,7 @@ def test_identifiable_update(clear_database, usemodel, ident, crawler):
     # Changing the date should result in a new insertion:
     cr = Crawler(debug=True, identifiableAdapter=ident)
     crawl_standard_test_directory(cr)
-    l = cr.target_data
+    l = cr.crawled_data
     for record in l:
         if (record.parents[0].name == "Measurement" and
                 record.get_property("date").value == "2020-01-03"):
@@ -269,7 +269,7 @@ def test_file_insertion_dry(clear_database, usemodel, ident):
     crawler_extended = Crawler(debug=True, identifiableAdapter=ident)
     crawl_standard_test_directory(
         crawler_extended, cfood="scifolder_extended.yml")
-    file_list = [r for r in crawler_extended.target_data if r.role == "File"]
+    file_list = [r for r in crawler_extended.crawled_data if r.role == "File"]
     assert len(file_list) == 11

     for f in file_list:
@@ -305,7 +305,7 @@ def test_file_update(clear_database, usemodel, ident, crawler_extended):

     cr = Crawler(debug=True, identifiableAdapter=ident)
     crawl_standard_test_directory(cr, cfood="scifolder_extended.yml")
-    file_list = [r for r in cr.target_data if r.role == "File"]
+    file_list = [r for r in cr.crawled_data if r.role == "File"]
     for f in file_list:
         f.file = rfp("..", "..", "unittests", "test_directories", f.file)
     ins2, ups2 = cr.synchronize(commit_changes=True)
@@ -320,7 +320,7 @@ def test_file_update(clear_database, usemodel, ident, crawler_extended):

     cr2 = Crawler(debug=True, identifiableAdapter=ident)
     crawl_standard_test_directory(cr2, cfood="scifolder_extended2.yml")
-    file_list = [r for r in cr2.target_data if r.role == "File"]
+    file_list = [r for r in cr2.crawled_data if r.role == "File"]
     for f in file_list:
         f.file = rfp("..", "..", "unittests", "test_directories", f.file)
     ins3, ups3 = cr2.synchronize(commit_changes=True)
diff --git a/src/caoscrawler/crawl.py b/src/caoscrawler/crawl.py
index d9b06a505c72dc6e8de91d09d2447b3d0c2840b4..ad77e678a31a9bd950c89019f38ce58a20d9c2e3 100644
--- a/src/caoscrawler/crawl.py
+++ b/src/caoscrawler/crawl.py
@@ -113,7 +113,7 @@ def check_identical(record1: db.Entity, record2: db.Entity, ignore_id=False):
                 return False
         for attribute in ("datatype", "importance", "unit"):
             # only make an update for those attributes if there is a value difference and
-            # the value in the target_data is not None
+            # the value in the crawled_data is not None
             if attribute in comp[0]["properties"][key]:
                 attr_val = comp[0]["properties"][key][attribute]
                 other_attr_val = (comp[1]["properties"][key][attribute]
@@ -447,7 +447,7 @@ class Crawler(object):

         Returns
         -------
-        target_data : list
+        crawled_data : list
             the final list with the target state of Records.
""" @@ -463,14 +463,14 @@ class Crawler(object): local_converters = Crawler.initialize_converters( crawler_definition, converter_registry) # This recursive crawling procedure generates the update list: - self.target_data: List[db.Record] = [] + self.crawled_data: List[db.Record] = [] self._crawl(items, local_converters, self.generalStore, self.recordStore, [], []) if self.debug: self.debug_converters = local_converters - return self.target_data + return self.crawled_data def synchronize(self, commit_changes: bool = True, unique_names=True): """ @@ -480,7 +480,7 @@ class Crawler(object): # After the crawling, the actual synchronization with the database, based on the # update list is carried out: - return self._synchronize(self.target_data, commit_changes, unique_names=unique_names) + return self._synchronize(self.crawled_data, commit_changes, unique_names=unique_names) def has_reference_value_without_id(self, record: db.Record): """ @@ -781,17 +781,17 @@ class Crawler(object): @staticmethod def _merge_properties_from_remote( - target_data: List[db.Record], + crawled_data: List[db.Record], identified_records: List[db.Record] ): - """Merge planned updates with remotely found identified records - s.th. new properties and property values are updated correctly but + """Merge entity representation that was created by crawling the data with remotely found + identified records s.th. new properties and property values are updated correctly but additional properties are not overwritten. Parameters ---------- - target_data : list[db.Record] - List of the updates found by the crawler + crawled_data : list[db.Record] + List of the Entities created by the crawler identified_records : list[db.Record] List of identified remote Records @@ -801,7 +801,7 @@ class Crawler(object): List of merged records """ to_be_updated = [] - for target, identified in zip(target_data, identified_records): + for target, identified in zip(crawled_data, identified_records): # Special treatment for name and description in case they have been # set in the server independently from the crawler for attr in ["name", "description"]: @@ -823,7 +823,7 @@ class Crawler(object): @staticmethod def remove_unnecessary_updates( - target_data: List[db.Record], + crawled_data: List[db.Record], identified_records: List[db.Record] ): """Compare the Records to be updated with their remote @@ -834,14 +834,14 @@ class Crawler(object): update list without unecessary updates """ - if len(target_data) != len(identified_records): + if len(crawled_data) != len(identified_records): raise RuntimeError("The lists of updates and of identified records need to be of the " "same length!") actual_updates = [] - for i in reversed(range(len(target_data))): + for i in reversed(range(len(crawled_data))): - if not check_identical(target_data[i], identified_records[i]): - actual_updates.append(target_data[i]) + if not check_identical(crawled_data[i], identified_records[i]): + actual_updates.append(crawled_data[i]) return actual_updates @@ -936,12 +936,12 @@ class Crawler(object): update_cache = UpdateCache() update_cache.insert(to_be_updated, run_id) - def _synchronize(self, target_data: List[db.Record], commit_changes: bool = True, + def _synchronize(self, crawled_data: List[db.Record], commit_changes: bool = True, unique_names=True): """ This function applies several stages: - 1) Retrieve identifiables for all records in target_data. - 2) Compare target_data with existing records. + 1) Retrieve identifiables for all records in crawled_data. 
+        2) Compare crawled_data with existing records.
         3) Insert and update records based on the set of identified differences.

         This function makes use of an IdentifiableAdapter which is used to retrieve
@@ -956,8 +956,7 @@ class Crawler(object):
         if self.identifiableAdapter is None:
             raise RuntimeError("Should not happen.")

-        to_be_inserted, to_be_updated = self.split_into_inserts_and_updates(
-            target_data)
+        to_be_inserted, to_be_updated = self.split_into_inserts_and_updates(crawled_data)

         # TODO: refactoring of typo
         for el in to_be_updated:
@@ -1133,7 +1132,7 @@ ____________________\n""".format(i + 1, len(pending_changes)) + str(el[3]))
         # to the general update container.
         scoped_records = recordStore.get_records_current_scope()
         for record in scoped_records:
-            self.target_data.append(record)
+            self.crawled_data.append(record)

         # TODO: the scoped variables should be cleaned up as soon if the variables
         # are no longer in the current scope. This can be implemented as follows,
@@ -1146,7 +1145,7 @@ ____________________\n""".format(i + 1, len(pending_changes)) + str(el[3]))
         #     del recordStore[name]
         #     del generalStore[name]

-        return self.target_data
+        return self.crawled_data


 def crawler_main(crawled_directory_path: str,
@@ -1208,7 +1207,7 @@ def crawler_main(crawled_directory_path: str,
                                          "update": updates}))
         else:
             rtsfinder = dict()
-            for elem in crawler.target_data:
+            for elem in crawler.crawled_data:
                 if isinstance(elem, db.File):
                     # correct the file path:
                     # elem.file = os.path.join(args.path, elem.file)
diff --git a/unittests/test_tool.py b/unittests/test_tool.py
index 6fe0b44b9ad3a3505a30e4ed0efeba1929f4eaea..0eef86b3a9f5ef6f64d9ccb9ce0102cd87208fa4 100755
--- a/unittests/test_tool.py
+++ b/unittests/test_tool.py
@@ -184,7 +184,7 @@ def test_record_structure_generation(crawler):

 def test_ambigious_records(crawler, ident):
     ident.get_records().clear()
-    ident.get_records().extend(crawler.target_data)
+    ident.get_records().extend(crawler.crawled_data)
     r = ident.get_records()
     id_r0 = ident.get_identifiable(r[0])
     with raises(RuntimeError, match=".*unambigiously.*"):
@@ -206,7 +206,7 @@ def test_crawler_update_list(crawler, ident):
     ) == 2

     # The crawler contains lots of duplicates, because identifiables have not been resolved yet:
-    assert len(ident.get_records()) != len(crawler.target_data)
+    assert len(ident.get_records()) != len(crawler.crawled_data)

     # Check consistency:
     # Check whether identifiables retrieved from current identifiable store return
@@ -339,7 +339,7 @@ def test_identifiable_adapter_no_identifiable(crawler, ident):
     insl, updl = crawler.synchronize()
     assert len(updl) == 0

-    pers = [r for r in crawler.target_data if r.parents[0].name == "Person"]
+    pers = [r for r in crawler.crawled_data if r.parents[0].name == "Person"]
     # All persons are inserted, because they are not identifiable:
     assert len(insl) == len(pers)
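
For reference, a minimal usage sketch of the renamed attribute (not part of the patch). It mirrors
the pattern of the integration tests in this diff: `ident` is the identifiable-adapter fixture and
`crawl_standard_test_directory` the helper defined in the test suite; only API that appears in the
diff itself (Crawler, crawled_data, synchronize) is used, and the remaining variable names are
illustrative.

    # Sketch, assuming the fixtures/helpers from integrationtests/basic_example/test_basic.py.
    from caoscrawler.crawl import Crawler

    cr = Crawler(debug=True, identifiableAdapter=ident)
    crawl_standard_test_directory(cr, cfood="scifolder_extended.yml")

    # Crawled records are now exposed as `crawled_data` (formerly `target_data`).
    file_records = [r for r in cr.crawled_data if r.role == "File"]

    # Synchronization is unchanged: it still returns the inserted and updated entities.
    ins, ups = cr.synchronize(commit_changes=True)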