From 7234cf345555afe7ca1b0d9a90a639a7dedfa925 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Henrik=20tom=20W=C3=B6rden?= <h.tomwoerden@indiscale.com>
Date: Thu, 14 Jul 2022 17:13:28 +0200
Subject: [PATCH] MAINT: some refactoring of the main function in crawl.py
 (and collateral)

---
 integrationtests/basic_example/test.py        |  26 ++--
 integrationtests/test_realworld_example.py    |   3 +-
 .../test_use_case_simple_presentation.py      |   3 +-
 src/caoscrawler/crawl.py                      | 119 ++++++++++--------
 unittests/test_tool.py                        |   6 +-
 5 files changed, 83 insertions(+), 74 deletions(-)

diff --git a/integrationtests/basic_example/test.py b/integrationtests/basic_example/test.py
index 6e35f7f2..2c972fa9 100755
--- a/integrationtests/basic_example/test.py
+++ b/integrationtests/basic_example/test.py
@@ -102,8 +102,7 @@ def crawler_extended(ident):
     cr = Crawler(debug=True, identifiableAdapter=ident)
     crawl_standard_test_directory(cr, cfood="scifolder_extended.yml")
     # correct paths for current working directory
-    updateList = cr.updateList
-    fileList = [r for r in updateList if r.role == "File"]
+    fileList = [r for r in cr.targetData if r.role == "File"]
     for f in fileList:
         f.file = rfp("..", "unittests", "test_directories",
                      "examples_article", f.file)
@@ -157,7 +156,7 @@ def test_insertion(clear_database, usemodel, ident, crawler):
     # Do a second run on the same data, there should be a new insert:
     cr = Crawler(debug=True, identifiableAdapter=ident)
     crawl_standard_test_directory(cr, "example_insert")
-    assert len(cr.updateList) == 3
+    assert len(cr.targetData) == 3
     ins, ups = cr.synchronize()
     assert len(ins) == 1
     assert len(ups) == 0
@@ -165,7 +164,7 @@ def test_insertion(clear_database, usemodel, ident, crawler):
     # Do it again to check whether nothing is changed:
     cr = Crawler(debug=True, identifiableAdapter=ident)
     crawl_standard_test_directory(cr, "example_insert")
-    assert len(cr.updateList) == 3
+    assert len(cr.targetData) == 3
     ins, ups = cr.synchronize()
     assert len(ins) == 0
     assert len(ups) == 0
@@ -180,9 +179,9 @@ def test_insertion_and_update(clear_database, usemodel, ident, crawler):
     cr = Crawler(debug=True, identifiableAdapter=ident)
     crawl_standard_test_directory(cr, "example_overwrite_1")
 
-    # print(cr.updateList)
+    # print(cr.targetData)
     # cr.save_debug_data(rfp("provenance.yml"))
-    assert len(cr.updateList) == 3
+    assert len(cr.targetData) == 3
     ins, ups = cr.synchronize()
     assert len(ins) == 0
     assert len(ups) == 1
@@ -197,7 +196,7 @@ def test_identifiable_update(clear_database, usemodel, ident, crawler):
     crawl_standard_test_directory(cr)
 
     # Test the addition of a single property:
-    l = cr.updateList
+    l = cr.targetData
     for record in l:
         if (record.parents[0].name == "Measurement" and
                 record.get_property("date").value == "2020-01-03"):
@@ -213,7 +212,7 @@ def test_identifiable_update(clear_database, usemodel, ident, crawler):
     # Test the change within one property:
     cr = Crawler(debug=True, identifiableAdapter=ident)
     crawl_standard_test_directory(cr)
-    l = cr.updateList
+    l = cr.targetData
     for record in l:
         if (record.parents[0].name == "Measurement" and
                 record.get_property("date").value == "2020-01-03"):
@@ -227,7 +226,7 @@ def test_identifiable_update(clear_database, usemodel, ident, crawler):
     # Changing the date should result in a new insertion:
     cr = Crawler(debug=True, identifiableAdapter=ident)
     crawl_standard_test_directory(cr)
-    l = cr.updateList
+    l = cr.targetData
    for record in l:
         if (record.parents[0].name == "Measurement" and
                 record.get_property("date").value == "2020-01-03"):
@@ -244,8 +243,7 @@ def test_file_insertion_dry(clear_database, usemodel, ident):
     crawler_extended = Crawler(debug=True, identifiableAdapter=ident)
     crawl_standard_test_directory(
         crawler_extended, cfood="scifolder_extended.yml")
-    updateList = crawler_extended.updateList
-    fileList = [r for r in updateList if r.role == "File"]
+    fileList = [r for r in crawler_extended.targetData if r.role == "File"]
     assert len(fileList) == 11
 
     for f in fileList:
@@ -281,8 +279,7 @@ def test_file_update(clear_database, usemodel, ident, crawler_extended):
 
     cr = Crawler(debug=True, identifiableAdapter=ident)
     crawl_standard_test_directory(cr, cfood="scifolder_extended.yml")
-    updateList = cr.updateList
-    fileList = [r for r in updateList if r.role == "File"]
+    fileList = [r for r in cr.targetData if r.role == "File"]
     for f in fileList:
         f.file = rfp("..", "unittests", "test_directories",
                      "examples_article", f.file)
@@ -298,8 +295,7 @@ def test_file_update(clear_database, usemodel, ident, crawler_extended):
 
     cr2 = Crawler(debug=True, identifiableAdapter=ident)
     crawl_standard_test_directory(cr2, cfood="scifolder_extended2.yml")
-    updateList = cr2.updateList
-    fileList = [r for r in updateList if r.role == "File"]
+    fileList = [r for r in cr2.targetData if r.role == "File"]
     for f in fileList:
         f.file = rfp("..", "unittests", "test_directories",
                      "examples_article", f.file)
diff --git a/integrationtests/test_realworld_example.py b/integrationtests/test_realworld_example.py
index 342a3d13..feb9893f 100644
--- a/integrationtests/test_realworld_example.py
+++ b/integrationtests/test_realworld_example.py
@@ -29,7 +29,7 @@
 import os
 
 import caosdb as db
-from caoscrawler.crawl import Crawler, crawler_main
+from caoscrawler.crawl import Crawler, main as crawler_main
 from caoscrawler.converters import JSONFileConverter, DictConverter
 from caoscrawler.identifiable_adapters import CaosDBIdentifiableAdapter
 from caoscrawler.structure_elements import File, JSONFile, Directory
@@ -137,7 +137,6 @@ def test_event_update(clear_database, usemodel):
         identifiable_path,
         True,
         os.path.join(DATADIR, "provenance.yml"),
-        False,
         True,
         ""
     )
diff --git a/integrationtests/test_use_case_simple_presentation.py b/integrationtests/test_use_case_simple_presentation.py
index ba8009fc..f1c838d1 100644
--- a/integrationtests/test_use_case_simple_presentation.py
+++ b/integrationtests/test_use_case_simple_presentation.py
@@ -32,7 +32,7 @@
 from subprocess import run
 import caosdb as db
 from caosadvancedtools.loadFiles import loadpath
 from caosadvancedtools.models import parser as parser
-from caoscrawler.crawl import crawler_main
+from caoscrawler.crawl import main as crawler_main
 
 # TODO(fspreck) Re-enable once this is part of dev in advancedusertools.
@@ -77,7 +77,6 @@ def test_complete_crawler(
         True,
         os.path.join(DATADIR, "provenance.yml"),
         False,
-        True,
         "/use_case_simple_presentation")
 
     res = db.execute_query("FIND Record Experiment")
diff --git a/src/caoscrawler/crawl.py b/src/caoscrawler/crawl.py
index 4c096d19..ac1bba3b 100644
--- a/src/caoscrawler/crawl.py
+++ b/src/caoscrawler/crawl.py
@@ -102,7 +102,7 @@ def check_identical(record1: db.Entity, record2: db.Entity, ignore_id=False):
             return False
         for attribute in ("datatype", "importance", "unit"):
             # only make an update for those attributes if there is a value difference and
-            # the value in the updateList is not None
+            # the value in the targetData is not None
             if attribute in comp[0]["properties"][key]:
                 attr_val = comp[0]["properties"][key][attribute]
                 other_attr_val = (comp[1]["properties"][key][attribute]
@@ -386,7 +386,7 @@ class Crawler(object):
         local_converters = Crawler.create_local_converters(crawler_definition,
                                                            converter_registry)
         # This recursive crawling procedure generates the update list:
-        self.updateList: list[db.Record] = []
+        self.targetData: list[db.Record] = []
         self._crawl(items, self.global_converters, local_converters, self.generalStore,
                     self.recordStore, [], [])
 
@@ -394,7 +394,7 @@ class Crawler(object):
         if self.debug:
             self.debug_converters = self.global_converters + local_converters
 
-        return self.updateList
+        return self.targetData
 
     def synchronize(self, commit_changes: bool = True):
         """
@@ -404,7 +404,7 @@ class Crawler(object):
 
         # After the crawling, the actual synchronization with the database, based on the
         # update list is carried out:
-        return self._synchronize(self.updateList, commit_changes)
+        return self._synchronize(self.targetData, commit_changes)
 
     def can_be_checked_externally(self, record: db.Record):
         """
@@ -696,7 +696,7 @@ class Crawler(object):
                         el.value[index] = val.id
 
     @staticmethod
-    def remove_unnecessary_updates(updateList: list[db.Record],
+    def remove_unnecessary_updates(targetData: list[db.Record],
                                    identified_records: list[db.Record]):
         """
         checks whether all relevant attributes (especially Property values) are equal
@@ -706,15 +706,15 @@ class Crawler(object):
         update list without unnecessary updates
         """
-        if len(updateList) != len(identified_records):
+        if len(targetData) != len(identified_records):
             raise RuntimeError("The lists of updates and of identified records need to be of the "
                                "same length!")
 
         # TODO this can now easily be changed to a function without side effect
-        for i in reversed(range(len(updateList))):
-            identical = check_identical(updateList[i], identified_records[i])
+        for i in reversed(range(len(targetData))):
+            identical = check_identical(targetData[i], identified_records[i])
 
             if identical:
-                del updateList[i]
+                del targetData[i]
                 continue
             else:
                 pass
@@ -747,11 +747,11 @@ class Crawler(object):
             if len(to_be_updated) > 0:
                 db.Container().extend(to_be_updated).update()
 
-    def _synchronize(self, updateList: list[db.Record], commit_changes: bool = True):
+    def _synchronize(self, targetData: list[db.Record], commit_changes: bool = True):
         """
         This function applies several stages:
-        1) Retrieve identifiables for all records in updateList.
-        2) Compare updateList with existing records.
+        1) Retrieve identifiables for all records in targetData.
+        2) Compare targetData with existing records.
         3) Insert and update records based on the set of identified differences.
 
         This function makes use of an IdentifiableAdapter which is used to retrieve
@@ -760,13 +760,13 @@ class Crawler(object):
 
         if commit_changes is True, the changes are synchronized to the CaosDB server.
         For debugging it can be useful to set this to False.
 
-        Return the final insertList and updateList as tuple.
+        Return the final insertList and targetData as tuple.
         """
         if self.identifiableAdapter is None:
             raise RuntimeError("Should not happen.")
 
-        to_be_inserted, to_be_updated = self.split_into_inserts_and_updates(updateList)
+        to_be_inserted, to_be_updated = self.split_into_inserts_and_updates(targetData)
 
         # TODO: refactoring of typo
         for el in to_be_updated:
@@ -896,7 +896,7 @@ class Crawler(object):
         # to the general update container.
         scoped_records = recordStore.get_records_current_scope()
         for record in scoped_records:
-            self.updateList.append(record)
+            self.targetData.append(record)
 
         # TODO: the scoped variables should be cleaned up as soon as the variables
         # are no longer in the current scope. This can be implemented as follows,
@@ -909,29 +909,52 @@ class Crawler(object):
         #     del recordStore[name]
         #     del generalStore[name]
 
-        return self.updateList
+        return self.targetData
 
 
-def crawler_main(args_path,
-                 args_cfood,
-                 args_load_identifiables,
-                 args_debug,
-                 args_provenance,
-                 args_dry_sync,
-                 args_sync,
-                 args_prefix):
-    crawler = Crawler(debug=args_debug)
-    crawler.crawl_directory(args_path, args_cfood)
-    if args_provenance is not None:
-        crawler.save_debug_data(args_provenance)
+def main(crawled_directory_path: str,
+         cfood_file_name: str,
+         identifiables_definition_file: str = None,
+         debug: bool = False,
+         provenance_file: str = None,
+         dry_run: bool = False,
+         prefix: str = ""):
+    """
+
+    Parameters
+    ----------
+    crawled_directory_path : str
+        path to be crawled
+    cfood_file_name : str
+        filename of the cfood to be used
+    identifiables_definition_file : str
+        filename of an identifiable definition yaml file
+    debug : bool
+        whether or not to run in debug mode
+    provenance_file : str
+        provenance information will be stored in a file with given filename
+    dry_run : bool
+        do not commit any changes to the server
+    prefix : str
+        remove the given prefix from file paths
+
+    Returns
+    -------
+    return_value : int
+        0 if successful
+    """
+    crawler = Crawler(debug=debug)
+    crawler.crawl_directory(crawled_directory_path, cfood_file_name)
+    if provenance_file is not None:
+        crawler.save_debug_data(provenance_file)
 
-    if args_load_identifiables is not None:
+    if identifiables_definition_file is not None:
         ident = CaosDBIdentifiableAdapter()
-        ident.load_from_yaml_definition(args_load_identifiables)
+        ident.load_from_yaml_definition(identifiables_definition_file)
         crawler.identifiableAdapter = ident
 
-    if args_dry_sync:
+    if dry_run:
         ins, upd = crawler.synchronize(commit_changes=False)
         inserts = [str(i) for i in ins]
         updates = [str(i) for i in upd]
@@ -939,14 +962,14 @@ def crawler_main(args_path,
             f.write(yaml.dump({
                 "insert": inserts,
                 "update": updates}))
-    elif args_sync:
+    else:
         rtsfinder = dict()
-        for elem in crawler.updateList:
+        for elem in crawler.targetData:
             if isinstance(elem, db.File):
                 # correct the file path:
                 # elem.file = os.path.join(args.path, elem.file)
-                if elem.path.startswith(args_prefix):
-                    elem.path = elem.path[len(args_prefix):]
+                if elem.path.startswith(prefix):
+                    elem.path = elem.path[len(prefix):]
                 elem.file = None
                 # TODO: as long as the new file backend is not finished
                 # we are using the loadFiles function to insert symlinks.
@@ -978,18 +1001,18 @@ def crawler_main(args_path,
 
 def parse_args():
     parser = argparse.ArgumentParser(description=__doc__,
                                      formatter_class=RawTextHelpFormatter)
-    parser.add_argument("cfood",
+    parser.add_argument("cfood_file_name",
                         help="Path name of the cfood yaml file to be used.")
     parser.add_argument("--provenance", required=False,
                         help="Path name of the provenance yaml file. "
                         "This file will only be generated if this option is set.")
     parser.add_argument("--debug", required=False, action="store_true",
                         help="Path name of the cfood yaml file to be used.")
-    parser.add_argument("path",
+    parser.add_argument("crawled_directory_path",
                         help="The subtree of files below the given path will "
                         "be considered. Use '/' for everything.")
-    parser.add_argument("-n", "--dry-sync", action="store_true",
+    parser.add_argument("-n", "--dry-run", action="store_true",
                         help="Create two files dry.yml to show"
                         "what would actually be committed without doing the synchronization.")
 
@@ -997,9 +1020,6 @@ def parse_args():
     parser.add_argument("-i", "--load-identifiables",
                         help="Load identifiables from "
                         "the given yaml file.")
-    parser.add_argument("-s", "--sync", action="store_true",
-                        help="Do the synchronization. This is probably the expected "
-                        "standard behavior of the crawler.")
 
     parser.add_argument("-p", "--prefix",
                         help="Remove the given prefix from the paths "
@@ -1008,19 +1028,14 @@ def parse_args():
     return parser.parse_args()
 
 
-def main():
+if __name__ == "__main__":
     args = parse_args()
-    return crawler_main(
-        args.path,
-        args.cfood,
+    sys.exit(main(
+        args.crawled_directory_path,
+        args.cfood_file_name,
         args.load_identifiables,
         args.debug,
         args.provenance,
-        args.dry_sync,
-        args.sync,
+        args.dry_run,
         args.prefix
-    )
-
-
-if __name__ == "__main__":
-    sys.exit(main())
+    ))
diff --git a/unittests/test_tool.py b/unittests/test_tool.py
index 61fc4f48..d6187d18 100755
--- a/unittests/test_tool.py
+++ b/unittests/test_tool.py
@@ -173,7 +173,7 @@ def test_record_structure_generation(crawler):
 
 def test_ambigious_records(crawler, ident):
     ident.get_records().clear()
-    ident.get_records().extend(crawler.updateList)
+    ident.get_records().extend(crawler.targetData)
     r = ident.get_records()
     id_r0 = ident.get_identifiable(r[0])
     with raises(RuntimeError, match=".*unambigiously.*"):
@@ -195,7 +195,7 @@ def test_crawler_update_list(crawler, ident):
     ) == 2
 
     # The crawler contains lots of duplicates, because identifiables have not been resolved yet:
-    assert len(ident.get_records()) != len(crawler.updateList)
+    assert len(ident.get_records()) != len(crawler.targetData)
 
     # Check consistency:
     # Check whether identifiables retrieved from current identifiable store return the same results.
@@ -327,7 +327,7 @@ def test_identifiable_adapter_no_identifiable(crawler, ident):
     insl, updl = crawler.synchronize()
     assert len(updl) == 0
 
-    pers = [r for r in crawler.updateList if r.parents[0].name == "Person"]
+    pers = [r for r in crawler.targetData if r.parents[0].name == "Person"]
     # All persons are inserted, because they are not identifiable:
     assert len(insl) == len(pers)
-- 
GitLab
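
Usage note: with the --sync/--dry-sync flags replaced by a single dry_run parameter, the renamed
entry point can also be called directly from Python, as the adjusted integration tests do via
"from caoscrawler.crawl import main as crawler_main". The following is a minimal sketch of such a
call; the parameter names come from the new signature above, while all concrete directory, cfood
and provenance file names are placeholders, not values taken from this patch.

    # Minimal sketch of calling the refactored entry point programmatically.
    # All concrete values below are hypothetical placeholders.
    from caoscrawler.crawl import main as crawler_main

    crawler_main(
        crawled_directory_path="/data/example_project",  # hypothetical directory to crawl
        cfood_file_name="scifolder_extended.yml",        # cfood definition, as used in the tests above
        identifiables_definition_file=None,              # optionally a yaml file defining identifiables
        debug=True,                                      # keep debug data so provenance can be written
        provenance_file="provenance.yml",                # written because it is not None
        dry_run=True,                                    # report planned inserts/updates instead of committing
        prefix="",                                       # prefix stripped from crawled file paths
    )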