From 292b52ac9769688c3b64318a50fc21851f98e1a6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Henrik=20tom=20W=C3=B6rden?= <h.tomwoerden@indiscale.com>
Date: Mon, 27 Mar 2023 21:17:07 +0200
Subject: [PATCH] DOC: add docstrings

---
 src/caoscrawler/crawl.py | 44 ++++++++++++++++++++++++++++++++++------
 1 file changed, 38 insertions(+), 6 deletions(-)

diff --git a/src/caoscrawler/crawl.py b/src/caoscrawler/crawl.py
index caff4584..992615b8 100644
--- a/src/caoscrawler/crawl.py
+++ b/src/caoscrawler/crawl.py
@@ -977,6 +977,11 @@ ____________________\n""".format(i + 1, len(pending_changes)) + str(el[3]))
 
 
 def _create_status_record(logfile_url, run_id):
+    """Insert a CrawlerRun Record
+
+    CrawlerRun Records are used to have a (somewhat) persistent feedback from crawler runs that
+    are easyly accessible by users.
+    """
     if get_config_setting("create_crawler_status_records"):
         (db.Record()
             .add_parent('CrawlerRun')
@@ -988,18 +993,34 @@ def _create_status_record(logfile_url, run_id):
 
 
 def _update_status_record(run_id, n_inserts, n_updates, status):
+    """Update the CrawlerRun Record
+
+    The Record is identified using the run_id. The status is changed and some information about the
+    run is added.
+    """
     if get_config_setting("create_crawler_status_records"):
         cr_rec = db.execute_query(f"FIND RECORD CrawlerRun WITH run_id={run_id}", unique=True)
         cr_rec.get_property('status').value = status
         (cr_rec
-            .add_property(db.execute_query(f"FIND Property with name='number_of_inserted_entities'", unique=True).id, n_inserts)
-            .add_property(db.execute_query(f"FIND Property with name='number_of_updated_entities'", unique=True).id, n_updates)
-            .add_property(db.execute_query(f"FIND Property with name='finished'", unique=True).id, datetime.now().isoformat()))
-        print(cr_rec)
+            .add_property(db.execute_query(
+                f"FIND Property with name='number_of_inserted_entities'", unique=True).id,
+                n_inserts)
+            .add_property(
+                db.execute_query(f"FIND Property with name='number_of_updated_entities'",
+                                 unique=True).id, n_updates)
+            .add_property(
+                db.execute_query(f"FIND Property with name='finished'",
+                                 unique=True).id, datetime.now().isoformat()))
         cr_rec.update()
 
 
 def _notify_about_inserts_and_updates(n_inserts, n_updates, logfile, run_id):
+    """send an email notification
+
+    Only if there were inserts or updates.
+
+    The email contains some basic information and a link to the log and the CrawlerRun Record.
+    """
     if not get_config_setting("send_crawler_notifications"):
         return
     if n_inserts == 0 and n_updates == 0:
@@ -1024,6 +1045,7 @@ the CaosDB Crawler successfully crawled the data and
 
 
 def _treat_deprecated_prefix(prefix, remove_prefix):
+    """notify about deprecation and use given value"""
     if prefix != "":
         warnings.warn(DeprecationWarning("The prefix argument is deprecated and will be removed "
                                          "in the future. Please use `remove_prefix` instead."))
@@ -1035,7 +1057,12 @@ def _treat_deprecated_prefix(prefix, remove_prefix):
     return remove_prefix
 
 
-def _fix_file_paths(crawled_data, remove_prefix):
+def _fix_file_paths(crawled_data, add_prefix, remove_prefix):
+    """adjust the path according to add_/remove_prefix
+
+    Also remove the `file` attribute from File entities (because inserts need currently be done
+    by loadfiles.
+    """
     for elem in crawled_data:
         if isinstance(elem, db.File):
             # correct the file path:
@@ -1058,6 +1085,10 @@ def _fix_file_paths(crawled_data, remove_prefix):
 
 
 def _check_record_types(crawled_data):
+    """Check for all parents in crawled_data whether they exists
+
+    raise Error if it does not
+    """
     rtsfinder = dict()
 
     for elem in crawled_data:
@@ -1080,6 +1111,7 @@ def _check_record_types(crawled_data):
 
 
 def _store_dry_run_data(ins, upd):
+    """write insets and updates to a file """
     inserts = [str(i) for i in ins]
     updates = [str(i) for i in upd]
     with open("dry.yml", "w") as f:
@@ -1148,7 +1180,7 @@ def crawler_main(crawled_directory_path: str,
         debug_tree = DebugTree()
         crawled_data = scan_directory(
             crawled_directory_path, cfood_file_name, restricted_path, debug_tree=debug_tree)
-        _fix_file_paths(crawled_data, remove_prefix)
+        _fix_file_paths(crawled_data, add_prefix, remove_prefix)
         _check_record_types(crawled_data)
 
         if provenance_file is not None and debug:
-- 
GitLab
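
Usage note (not part of the patch): the two status-record helpers documented above are tied
together by a shared run_id and by the "create_crawler_status_records" config setting. The
sketch below only illustrates that flow; the function and parameter names are taken verbatim
from the diff, while the run_id value, log URL, status string and counts are made-up
placeholders, and the helpers are private, so real code would normally go through
crawler_main instead.

    from caoscrawler.crawl import _create_status_record, _update_status_record

    run_id = "1234"                       # placeholder; the real crawler assigns its own id
    logfile_url = "https://example.org/Shared/crawler.log"  # hypothetical log location

    # Creates a CrawlerRun Record, but only if "create_crawler_status_records" is enabled.
    _create_status_record(logfile_url, run_id)

    # ... scanning the directory and synchronizing entities happens here ...

    # The same run_id identifies the Record again so that the final status and the numbers
    # of inserted/updated entities can be filled in.
    _update_status_record(run_id, n_inserts=3, n_updates=1, status="OK")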