From 292b52ac9769688c3b64318a50fc21851f98e1a6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Henrik=20tom=20W=C3=B6rden?= <h.tomwoerden@indiscale.com>
Date: Mon, 27 Mar 2023 21:17:07 +0200
Subject: [PATCH] DOC: add docstrings

---
 src/caoscrawler/crawl.py | 44 ++++++++++++++++++++++++++++++++++------
 1 file changed, 38 insertions(+), 6 deletions(-)

diff --git a/src/caoscrawler/crawl.py b/src/caoscrawler/crawl.py
index caff4584..992615b8 100644
--- a/src/caoscrawler/crawl.py
+++ b/src/caoscrawler/crawl.py
@@ -977,6 +977,11 @@ ____________________\n""".format(i + 1, len(pending_changes)) + str(el[3]))
 
 
 def _create_status_record(logfile_url, run_id):
+    """Insert a CrawlerRun Record.
+
+    CrawlerRun Records are used to provide (somewhat) persistent feedback from crawler runs
+    that is easily accessible by users.
+    """
     if get_config_setting("create_crawler_status_records"):
         (db.Record()
          .add_parent('CrawlerRun')
@@ -988,18 +993,34 @@ def _create_status_record(logfile_url, run_id):
 
 
 def _update_status_record(run_id, n_inserts, n_updates, status):
+    """Update the CrawlerRun Record.
+
+    The Record is identified using the run_id. Its status is changed and some information
+    about the run is added.
+    """
     if get_config_setting("create_crawler_status_records"):
         cr_rec = db.execute_query(f"FIND RECORD CrawlerRun WITH run_id={run_id}", unique=True)
         cr_rec.get_property('status').value = status
         (cr_rec
-         .add_property(db.execute_query(f"FIND Property with name='number_of_inserted_entities'", unique=True).id, n_inserts)
-         .add_property(db.execute_query(f"FIND Property with name='number_of_updated_entities'", unique=True).id, n_updates)
-         .add_property(db.execute_query(f"FIND Property with name='finished'", unique=True).id, datetime.now().isoformat()))
-        print(cr_rec)
+         .add_property(db.execute_query(
+             f"FIND Property with name='number_of_inserted_entities'", unique=True).id,
+             n_inserts)
+         .add_property(
+             db.execute_query(f"FIND Property with name='number_of_updated_entities'",
+                              unique=True).id, n_updates)
+         .add_property(
+             db.execute_query(f"FIND Property with name='finished'",
+                              unique=True).id, datetime.now().isoformat()))
         cr_rec.update()
 
 
 def _notify_about_inserts_and_updates(n_inserts, n_updates, logfile, run_id):
+    """Send an email notification.
+
+    The notification is only sent if there were inserts or updates.
+
+    The email contains some basic information and a link to the log and the CrawlerRun Record.
+    """
     if not get_config_setting("send_crawler_notifications"):
         return
     if n_inserts == 0 and n_updates == 0:
@@ -1024,6 +1045,7 @@ the CaosDB Crawler successfully crawled the data and
 
 
 def _treat_deprecated_prefix(prefix, remove_prefix):
+    """Warn about the deprecated `prefix` argument and return the prefix value to be used."""
     if prefix != "":
         warnings.warn(DeprecationWarning("The prefix argument is deprecated and will be removed "
                                          "in the future. Please use `remove_prefix` instead."))
@@ -1035,7 +1057,12 @@ def _treat_deprecated_prefix(prefix, remove_prefix):
     return remove_prefix
 
 
-def _fix_file_paths(crawled_data, remove_prefix):
+def _fix_file_paths(crawled_data, add_prefix, remove_prefix):
+    """Adjust the file paths according to add_prefix and remove_prefix.
+
+    Also remove the `file` attribute from File entities (because inserts currently need to
+    be done by loadfiles).
+    """
     for elem in crawled_data:
         if isinstance(elem, db.File):
             # correct the file path:
@@ -1058,6 +1085,10 @@ def _fix_file_paths(crawled_data, remove_prefix):
 
 
 def _check_record_types(crawled_data):
+    """Check for all parents in crawled_data whether they exist.
+
+    Raise an error if a parent does not exist.
+    """
     rtsfinder = dict()
 
     for elem in crawled_data:
@@ -1080,6 +1111,7 @@ def _check_record_types(crawled_data):
 
 
 def _store_dry_run_data(ins, upd):
+    """Write the inserts and updates to a file ("dry.yml")."""
     inserts = [str(i) for i in ins]
     updates = [str(i) for i in upd]
     with open("dry.yml", "w") as f:
@@ -1148,7 +1180,7 @@ def crawler_main(crawled_directory_path: str,
     debug_tree = DebugTree()
     crawled_data = scan_directory(
         crawled_directory_path, cfood_file_name, restricted_path, debug_tree=debug_tree)
-    _fix_file_paths(crawled_data, remove_prefix)
+    _fix_file_paths(crawled_data, add_prefix, remove_prefix)
    _check_record_types(crawled_data)
 
     if provenance_file is not None and debug:
-- 
GitLab