From 7234cf345555afe7ca1b0d9a90a639a7dedfa925 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Henrik=20tom=20W=C3=B6rden?= <h.tomwoerden@indiscale.com>
Date: Thu, 14 Jul 2022 17:13:28 +0200
Subject: [PATCH] MAINT: some refactoring of the main function in crawl.py (and
 collateral changes)

---
 integrationtests/basic_example/test.py        |  26 ++--
 integrationtests/test_realworld_example.py    |   3 +-
 .../test_use_case_simple_presentation.py      |   3 +-
 src/caoscrawler/crawl.py                      | 119 ++++++++++--------
 unittests/test_tool.py                        |   6 +-
 5 files changed, 83 insertions(+), 74 deletions(-)
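
Usage note: the former `crawler_main()` is renamed to `main()` and now takes
descriptive keyword parameters instead of the positional `args_*` arguments,
and the `--sync`/`--dry-sync` switch pair collapses into a single `--dry-run`
flag. A minimal sketch of calling the renamed entry point directly (the file
paths below are placeholders, not part of this patch):

    from caoscrawler.crawl import main

    ret = main(
        crawled_directory_path="/path/to/data",   # directory tree to crawl
        cfood_file_name="cfood.yml",              # cfood definition to apply
        identifiables_definition_file=None,       # optional identifiable yaml
        debug=False,
        provenance_file=None,                     # set a path to store provenance
        dry_run=True,                             # do not commit, only report
        prefix="",                                # prefix stripped from file paths
    )
    # According to the new docstring, 0 is returned on success.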

diff --git a/integrationtests/basic_example/test.py b/integrationtests/basic_example/test.py
index 6e35f7f2..2c972fa9 100755
--- a/integrationtests/basic_example/test.py
+++ b/integrationtests/basic_example/test.py
@@ -102,8 +102,7 @@ def crawler_extended(ident):
     cr = Crawler(debug=True, identifiableAdapter=ident)
     crawl_standard_test_directory(cr, cfood="scifolder_extended.yml")
     # correct paths for current working directory
-    updateList = cr.updateList
-    fileList = [r for r in updateList if r.role == "File"]
+    fileList = [r for r in cr.targetData if r.role == "File"]
     for f in fileList:
         f.file = rfp("..", "unittests", "test_directories",
                      "examples_article", f.file)
@@ -157,7 +156,7 @@ def test_insertion(clear_database, usemodel, ident, crawler):
     # Do a second run on the same data, there should a new insert:
     cr = Crawler(debug=True, identifiableAdapter=ident)
     crawl_standard_test_directory(cr, "example_insert")
-    assert len(cr.updateList) == 3
+    assert len(cr.targetData) == 3
     ins, ups = cr.synchronize()
     assert len(ins) == 1
     assert len(ups) == 0
@@ -165,7 +164,7 @@ def test_insertion(clear_database, usemodel, ident, crawler):
     # Do it again to check whether nothing is changed:
     cr = Crawler(debug=True, identifiableAdapter=ident)
     crawl_standard_test_directory(cr, "example_insert")
-    assert len(cr.updateList) == 3
+    assert len(cr.targetData) == 3
     ins, ups = cr.synchronize()
     assert len(ins) == 0
     assert len(ups) == 0
@@ -180,9 +179,9 @@ def test_insertion_and_update(clear_database, usemodel, ident, crawler):
 
     cr = Crawler(debug=True, identifiableAdapter=ident)
     crawl_standard_test_directory(cr, "example_overwrite_1")
-    # print(cr.updateList)
+    # print(cr.targetData)
     # cr.save_debug_data(rfp("provenance.yml"))
-    assert len(cr.updateList) == 3
+    assert len(cr.targetData) == 3
     ins, ups = cr.synchronize()
     assert len(ins) == 0
     assert len(ups) == 1
@@ -197,7 +196,7 @@ def test_identifiable_update(clear_database, usemodel, ident, crawler):
     crawl_standard_test_directory(cr)
 
     # Test the addition of a single property:
-    l = cr.updateList
+    l = cr.targetData
     for record in l:
         if (record.parents[0].name == "Measurement" and
                 record.get_property("date").value == "2020-01-03"):
@@ -213,7 +212,7 @@ def test_identifiable_update(clear_database, usemodel, ident, crawler):
     # Test the change within one property:
     cr = Crawler(debug=True, identifiableAdapter=ident)
     crawl_standard_test_directory(cr)
-    l = cr.updateList
+    l = cr.targetData
     for record in l:
         if (record.parents[0].name == "Measurement" and
                 record.get_property("date").value == "2020-01-03"):
@@ -227,7 +226,7 @@ def test_identifiable_update(clear_database, usemodel, ident, crawler):
     # Changing the date should result in a new insertion:
     cr = Crawler(debug=True, identifiableAdapter=ident)
     crawl_standard_test_directory(cr)
-    l = cr.updateList
+    l = cr.targetData
     for record in l:
         if (record.parents[0].name == "Measurement" and
                 record.get_property("date").value == "2020-01-03"):
@@ -244,8 +243,7 @@ def test_file_insertion_dry(clear_database, usemodel, ident):
     crawler_extended = Crawler(debug=True, identifiableAdapter=ident)
     crawl_standard_test_directory(
         crawler_extended, cfood="scifolder_extended.yml")
-    updateList = crawler_extended.updateList
-    fileList = [r for r in updateList if r.role == "File"]
+    fileList = [r for r in crawler_extended.targetData if r.role == "File"]
     assert len(fileList) == 11
 
     for f in fileList:
@@ -281,8 +279,7 @@ def test_file_update(clear_database, usemodel, ident, crawler_extended):
     cr = Crawler(debug=True, identifiableAdapter=ident)
     crawl_standard_test_directory(cr, cfood="scifolder_extended.yml")
 
-    updateList = cr.updateList
-    fileList = [r for r in updateList if r.role == "File"]
+    fileList = [r for r in cr.targetData if r.role == "File"]
     for f in fileList:
         f.file = rfp("..", "unittests", "test_directories",
                      "examples_article", f.file)
@@ -298,8 +295,7 @@ def test_file_update(clear_database, usemodel, ident, crawler_extended):
     cr2 = Crawler(debug=True, identifiableAdapter=ident)
     crawl_standard_test_directory(cr2, cfood="scifolder_extended2.yml")
 
-    updateList = cr2.updateList
-    fileList = [r for r in updateList if r.role == "File"]
+    fileList = [r for r in cr2.targetData if r.role == "File"]
     for f in fileList:
         f.file = rfp("..", "unittests", "test_directories",
                      "examples_article", f.file)
diff --git a/integrationtests/test_realworld_example.py b/integrationtests/test_realworld_example.py
index 342a3d13..feb9893f 100644
--- a/integrationtests/test_realworld_example.py
+++ b/integrationtests/test_realworld_example.py
@@ -29,7 +29,7 @@ import os
 
 import caosdb as db
 
-from caoscrawler.crawl import Crawler, crawler_main
+from caoscrawler.crawl import Crawler, main as crawler_main
 from caoscrawler.converters import JSONFileConverter, DictConverter
 from caoscrawler.identifiable_adapters import CaosDBIdentifiableAdapter
 from caoscrawler.structure_elements import File, JSONFile, Directory
@@ -137,7 +137,6 @@ def test_event_update(clear_database, usemodel):
         identifiable_path,
         True,
         os.path.join(DATADIR, "provenance.yml"),
-        False,
         True,
         ""
     )
diff --git a/integrationtests/test_use_case_simple_presentation.py b/integrationtests/test_use_case_simple_presentation.py
index ba8009fc..f1c838d1 100644
--- a/integrationtests/test_use_case_simple_presentation.py
+++ b/integrationtests/test_use_case_simple_presentation.py
@@ -32,7 +32,7 @@ from subprocess import run
 import caosdb as db
 from caosadvancedtools.loadFiles import loadpath
 from caosadvancedtools.models import parser as parser
-from caoscrawler.crawl import crawler_main
+from caoscrawler.crawl import main as crawler_main
 
 
 # TODO(fspreck) Re-eneable once this is part of dev in advancedusertools.
@@ -77,7 +77,6 @@ def test_complete_crawler(
                  True,
                  os.path.join(DATADIR, "provenance.yml"),
                  False,
-                 True,
                  "/use_case_simple_presentation")
 
     res = db.execute_query("FIND Record Experiment")
diff --git a/src/caoscrawler/crawl.py b/src/caoscrawler/crawl.py
index 4c096d19..ac1bba3b 100644
--- a/src/caoscrawler/crawl.py
+++ b/src/caoscrawler/crawl.py
@@ -102,7 +102,7 @@ def check_identical(record1: db.Entity, record2: db.Entity, ignore_id=False):
             return False
         for attribute in ("datatype", "importance", "unit"):
             # only make an update for those attributes if there is a value difference and
-            # the value in the updateList is not None
+            # the value in the targetData is not None
             if attribute in comp[0]["properties"][key]:
                 attr_val = comp[0]["properties"][key][attribute]
                 other_attr_val = (comp[1]["properties"][key][attribute]
@@ -386,7 +386,7 @@ class Crawler(object):
         local_converters = Crawler.create_local_converters(crawler_definition,
                                                            converter_registry)
         # This recursive crawling procedure generates the update list:
-        self.updateList: list[db.Record] = []
+        self.targetData: list[db.Record] = []
         self._crawl(items,
                     self.global_converters, local_converters, self.generalStore, self.recordStore,
                     [], [])
@@ -394,7 +394,7 @@ class Crawler(object):
         if self.debug:
             self.debug_converters = self.global_converters + local_converters
 
-        return self.updateList
+        return self.targetData
 
     def synchronize(self, commit_changes: bool = True):
         """
@@ -404,7 +404,7 @@ class Crawler(object):
         # After the crawling, the actual synchronization with the database, based on the
         # update list is carried out:
 
-        return self._synchronize(self.updateList, commit_changes)
+        return self._synchronize(self.targetData, commit_changes)
 
     def can_be_checked_externally(self, record: db.Record):
         """
@@ -696,7 +696,7 @@ class Crawler(object):
                             el.value[index] = val.id
 
     @staticmethod
-    def remove_unnecessary_updates(updateList: list[db.Record],
+    def remove_unnecessary_updates(targetData: list[db.Record],
                                    identified_records: list[db.Record]):
         """
         checks whether all relevant attributes (especially Property values) are equal
@@ -706,15 +706,15 @@ class Crawler(object):
         update list without unecessary updates
 
         """
-        if len(updateList) != len(identified_records):
+        if len(targetData) != len(identified_records):
             raise RuntimeError("The lists of updates and of identified records need to be of the "
                                "same length!")
         # TODO this can now easily be changed to a function without side effect
-        for i in reversed(range(len(updateList))):
-            identical = check_identical(updateList[i], identified_records[i])
+        for i in reversed(range(len(targetData))):
+            identical = check_identical(targetData[i], identified_records[i])
 
             if identical:
-                del updateList[i]
+                del targetData[i]
                 continue
             else:
                 pass
@@ -747,11 +747,11 @@ class Crawler(object):
         if len(to_be_updated) > 0:
             db.Container().extend(to_be_updated).update()
 
-    def _synchronize(self, updateList: list[db.Record], commit_changes: bool = True):
+    def _synchronize(self, targetData: list[db.Record], commit_changes: bool = True):
         """
         This function applies several stages:
-        1) Retrieve identifiables for all records in updateList.
-        2) Compare updateList with existing records.
+        1) Retrieve identifiables for all records in targetData.
+        2) Compare targetData with existing records.
         3) Insert and update records based on the set of identified differences.
 
         This function makes use of an IdentifiableAdapter which is used to retrieve
@@ -760,13 +760,13 @@ class Crawler(object):
         if commit_changes is True, the changes are synchronized to the CaosDB server.
         For debugging in can be useful to set this to False.
 
-        Return the final insertList and updateList as tuple.
+        Return the final insertList and targetData as a tuple.
         """
 
         if self.identifiableAdapter is None:
             raise RuntimeError("Should not happen.")
 
-        to_be_inserted, to_be_updated = self.split_into_inserts_and_updates(updateList)
+        to_be_inserted, to_be_updated = self.split_into_inserts_and_updates(targetData)
 
         # TODO: refactoring of typo
         for el in to_be_updated:
@@ -896,7 +896,7 @@ class Crawler(object):
         # to the general update container.
         scoped_records = recordStore.get_records_current_scope()
         for record in scoped_records:
-            self.updateList.append(record)
+            self.targetData.append(record)
 
         # TODO: the scoped variables should be cleaned up as soon if the variables
         #       are no longer in the current scope. This can be implemented as follows,
@@ -909,29 +909,52 @@ class Crawler(object):
         #     del recordStore[name]
         #     del generalStore[name]
 
-        return self.updateList
+        return self.targetData
 
 
-def crawler_main(args_path,
-                 args_cfood,
-                 args_load_identifiables,
-                 args_debug,
-                 args_provenance,
-                 args_dry_sync,
-                 args_sync,
-                 args_prefix):
-    crawler = Crawler(debug=args_debug)
-    crawler.crawl_directory(args_path, args_cfood)
-    if args_provenance is not None:
-        crawler.save_debug_data(args_provenance)
+def main(crawled_directory_path: str,
+         cfood_file_name: str,
+         identifiables_definition_file: str = None,
+         debug: bool = False,
+         provenance_file: str = None,
+         dry_run: bool = False,
+         prefix: str = ""):
+    """
+
+    Parameters
+    ----------
+    crawled_directory_path : str
+        path to be crawled
+    cfood_file_name : str
+        filename of the cfood to be used
+    identifiables_definition_file : str
+        filename of an identifiable definition yaml file
+    debug : bool
+        whether or not to run in debug mode
+    provenance_file : str
+        provenance information will be stored in a file with the given filename
+    dry_run : bool
+        do not commit any changes to the server
+    prefix : str
+        remove the given prefix from file paths
+
+    Returns
+    -------
+    return_value : int
+        0 if successful
+    """
+    crawler = Crawler(debug=debug)
+    crawler.crawl_directory(crawled_directory_path, cfood_file_name)
+    if provenance_file is not None:
+        crawler.save_debug_data(provenance_file)
 
-    if args_load_identifiables is not None:
+    if identifiables_definition_file is not None:
 
         ident = CaosDBIdentifiableAdapter()
-        ident.load_from_yaml_definition(args_load_identifiables)
+        ident.load_from_yaml_definition(identifiables_definition_file)
         crawler.identifiableAdapter = ident
 
-    if args_dry_sync:
+    if dry_run:
         ins, upd = crawler.synchronize(commit_changes=False)
         inserts = [str(i) for i in ins]
         updates = [str(i) for i in upd]
@@ -939,14 +962,14 @@ def crawler_main(args_path,
             f.write(yaml.dump({
                 "insert": inserts,
                 "update": updates}))
-    elif args_sync:
+    else:
         rtsfinder = dict()
-        for elem in crawler.updateList:
+        for elem in crawler.targetData:
             if isinstance(elem, db.File):
                 # correct the file path:
                 # elem.file = os.path.join(args.path, elem.file)
-                if elem.path.startswith(args_prefix):
-                    elem.path = elem.path[len(args_prefix):]
+                if elem.path.startswith(prefix):
+                    elem.path = elem.path[len(prefix):]
                 elem.file = None
                 # TODO: as long as the new file backend is not finished
                 #       we are using the loadFiles function to insert symlinks.
@@ -978,18 +1001,18 @@ def crawler_main(args_path,
 def parse_args():
     parser = argparse.ArgumentParser(description=__doc__,
                                      formatter_class=RawTextHelpFormatter)
-    parser.add_argument("cfood",
+    parser.add_argument("cfood_file_name",
                         help="Path name of the cfood yaml file to be used.")
     parser.add_argument("--provenance", required=False,
                         help="Path name of the provenance yaml file. "
                         "This file will only be generated if this option is set.")
     parser.add_argument("--debug", required=False, action="store_true",
                         help="Path name of the cfood yaml file to be used.")
-    parser.add_argument("path",
+    parser.add_argument("crawled_directory_path",
                         help="The subtree of files below the given path will "
                         "be considered. Use '/' for everything.")
 
-    parser.add_argument("-n", "--dry-sync", action="store_true",
+    parser.add_argument("-n", "--dry-run", action="store_true",
                         help="Create two files dry.yml to show"
                         "what would actually be committed without doing the synchronization.")
 
@@ -997,9 +1020,6 @@ def parse_args():
     parser.add_argument("-i", "--load-identifiables",
                         help="Load identifiables from "
                         "the given yaml file.")
-    parser.add_argument("-s", "--sync", action="store_true",
-                        help="Do the synchronization. This is probably the expected "
-                        "standard behavior of the crawler.")
 
     parser.add_argument("-p", "--prefix",
                         help="Remove the given prefix from the paths "
@@ -1008,19 +1028,14 @@ def parse_args():
     return parser.parse_args()
 
 
-def main():
+if __name__ == "__main__":
     args = parse_args()
-    return crawler_main(
-        args.path,
-        args.cfood,
+    sys.exit(main(
+        args.crawled_directory_path,
+        args.cfood_file_name,
         args.load_identifiables,
         args.debug,
         args.provenance,
-        args.dry_sync,
-        args.sync,
+        args.dry_run,
         args.prefix
-    )
-
-
-if __name__ == "__main__":
-    sys.exit(main())
+    ))
diff --git a/unittests/test_tool.py b/unittests/test_tool.py
index 61fc4f48..d6187d18 100755
--- a/unittests/test_tool.py
+++ b/unittests/test_tool.py
@@ -173,7 +173,7 @@ def test_record_structure_generation(crawler):
 
 def test_ambigious_records(crawler, ident):
     ident.get_records().clear()
-    ident.get_records().extend(crawler.updateList)
+    ident.get_records().extend(crawler.targetData)
     r = ident.get_records()
     id_r0 = ident.get_identifiable(r[0])
     with raises(RuntimeError, match=".*unambigiously.*"):
@@ -195,7 +195,7 @@ def test_crawler_update_list(crawler, ident):
     ) == 2
 
     # The crawler contains lots of duplicates, because identifiables have not been resolved yet:
-    assert len(ident.get_records()) != len(crawler.updateList)
+    assert len(ident.get_records()) != len(crawler.targetData)
 
     # Check consistency:
     # Check whether identifiables retrieved from current identifiable store return the same results.
@@ -327,7 +327,7 @@ def test_identifiable_adapter_no_identifiable(crawler, ident):
     insl, updl = crawler.synchronize()
     assert len(updl) == 0
 
-    pers = [r for r in crawler.updateList if r.parents[0].name == "Person"]
+    pers = [r for r in crawler.targetData if r.parents[0].name == "Person"]
     # All persons are inserted, because they are not identifiable:
     assert len(insl) == len(pers)
 
-- 
GitLab