From d90d85019791b11eff3a618798996281e0c09c02 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Henrik=20tom=20W=C3=B6rden?= <h.tomwoerden@indiscale.com>
Date: Fri, 15 Jul 2022 13:39:21 +0200
Subject: [PATCH] ENH: add authorization of changes

---
 README.md                  |   2 +
 integrationtests/README.md |   1 +
 src/caoscrawler/crawl.py   | 121 +++++++++++++++++++++++++++++--------
 3 files changed, 100 insertions(+), 24 deletions(-)

diff --git a/README.md b/README.md
index 59b88aaa..8576e5c9 100644
--- a/README.md
+++ b/README.md
@@ -25,6 +25,8 @@ After installation of the package run (within the project folder):
 pytest
 ```
 
+## Integration Tests
+see `integrationtests/README.md`
 
 # Contributers
 
diff --git a/integrationtests/README.md b/integrationtests/README.md
index 96789ed9..88d55902 100644
--- a/integrationtests/README.md
+++ b/integrationtests/README.md
@@ -1,2 +1,3 @@
 1. Mount test_data/extroot as extroot folder in the CaosDB server
 2. use an empty server
+3. run pytest from `src`: `python -m pytest ../integrationtests`
diff --git a/src/caoscrawler/crawl.py b/src/caoscrawler/crawl.py
index 3458b32b..ab544397 100644
--- a/src/caoscrawler/crawl.py
+++ b/src/caoscrawler/crawl.py
@@ -28,13 +28,19 @@
 Crawl a file structure using a yaml cfood definition and synchronize
 the acuired data with CaosDB.
 """
+import importlib
+from caosadvancedtools.cache import UpdateCache
+import uuid
 import sys
 import os
 import yaml
+from enum import Enum
+import logging
 from importlib_resources import files
 import argparse
 from argparse import RawTextHelpFormatter
 import caosdb as db
+from caosadvancedtools.crawler import Crawler as OldCrawler
 from caosdb.common.datatype import is_reference
 from .stores import GeneralStore, RecordStore
 from .identified_cache import IdentifiedCache
@@ -49,8 +55,8 @@
 from caosdb.apiutils import compare_entities, merge_entities
 from copy import deepcopy
 from jsonschema import validate
 
+logger = logging.getLogger(__name__)
 
-import importlib
 
 SPECIAL_PROPERTIES_STRICT = ("description", "name", "id", "path")
 SPECIAL_PROPERTIES_NOT_STRICT = ("file", "checksum", "size")
@@ -139,6 +145,12 @@ def _resolve_datatype(prop: db.Property, remote_entity: db.Entity):
     return prop
 
 
+class SecurityMode(Enum):
+    RETRIEVE = 0
+    INSERT = 1
+    UPDATE = 2
+
+
 class Crawler(object):
     """
     Crawler class that encapsulates crawling functions.
@@ -146,23 +158,35 @@ class Crawler(object):
     storage for values (general store).
     """
 
-    def __init__(self, converters: list[Converter] = [],
+    def __init__(self,
+                 converters: list[Converter] = [],
                  generalStore: Optional[GeneralStore] = None,
                  debug: bool = False,
-                 identifiableAdapter: IdentifiableAdapter = None):
+                 identifiableAdapter: IdentifiableAdapter = None,
+                 securityMode: SecurityMode = SecurityMode.UPDATE
+                 ):
         """
         Create a new crawler and initialize an empty RecordStore and GeneralStore.
 
-        converters: The set of converters used for this crawler.
-        recordStore: An initial GeneralStore which might store e.g. environment variables.
-
-        debug: Create a debugging information tree when set to True.
-               The debugging information tree is a variable stored in
-               self.debug_tree. It is a dictionary mapping directory entries
-               to a tuple of general stores and record stores which are valid for the directory scope.
-               Furthermore, it is stored in a second tree named self.debug_copied whether the
-               objects in debug_tree had been copied from a higher level in the hierarchy
-               of the structureelements.
+        Parameters
+        ----------
+        converters : list[Converter]
+             The set of converters used for this crawler.
+        generalStore : GeneralStore
+             An initial GeneralStore which might store e.g. environment variables.
+        debug : bool
+             Create a debugging information tree when set to True.
+             The debugging information tree is a variable stored in
+             self.debug_tree. It is a dictionary mapping directory entries
+             to a tuple of general stores and record stores which are valid for the directory scope.
+             Furthermore, it is stored in a second tree named self.debug_copied whether the
+             objects in debug_tree had been copied from a higher level in the hierarchy
+             of the structure elements.
+        identifiableAdapter : IdentifiableAdapter
+             TODO describe
+        securityMode : SecurityMode
+             Whether only retrieves are allowed or also inserts or even updates.
+             Please use the SecurityMode Enum.
         """
 
         # TODO: check if this feature is really needed
@@ -170,6 +194,7 @@ class Crawler(object):
 
         self.identified_cache = IdentifiedCache()
         self.recordStore = RecordStore()
+        self.securityMode = securityMode
 
         self.generalStore = generalStore
         if generalStore is None:
@@ -178,7 +203,8 @@ class Crawler(object):
         self.identifiableAdapter = identifiableAdapter
         if identifiableAdapter is None:
             self.identifiableAdapter = LocalStorageIdentifiableAdapter()
-
+        # If a directory is crawled, this may hold the path to that directory.
+        self.crawled_directory = None
         self.debug = debug
         if self.debug:
             # order in the tuple:
@@ -328,6 +354,7 @@ class Crawler(object):
             raise ValueError(
                 "You have to provide a non-empty path for crawling.")
         dir_structure_name = os.path.basename(dirname)
+        self.crawled_directory = dirname
         if not dir_structure_name and dirname.endswith('/'):
             if dirname == '/':
                 # Crawling the entire file system
@@ -390,6 +417,7 @@ class Crawler(object):
         if not isinstance(items, list):
             items = [items]
 
+        self.run_id = uuid.uuid1()
        local_converters = Crawler.create_local_converters(crawler_definition,
                                                            converter_registry)
         # This recursive crawling procedure generates the update list:
@@ -727,18 +755,23 @@ class Crawler(object):
         pass
 
     @staticmethod
-    def execute_inserts_in_list(to_be_inserted):
+    def execute_inserts_in_list(to_be_inserted, securityMode: SecurityMode, run_id=None):
         for record in to_be_inserted:
             for prop in record.properties:
                 entity = db.Entity(name=prop.name).retrieve()
+                # TODO return value unused
                 prop = _resolve_datatype(prop, entity)
         print("INSERT")
         print(to_be_inserted)
         if len(to_be_inserted) > 0:
-            db.Container().extend(to_be_inserted).insert()
+            if securityMode.value > SecurityMode.RETRIEVE.value:
+                db.Container().extend(to_be_inserted).insert()
+            elif run_id is not None:
+                update_cache = UpdateCache()
+                update_cache.insert(to_be_inserted, run_id)
 
     @staticmethod
-    def execute_updates_in_list(to_be_updated):
+    def execute_updates_in_list(to_be_updated, securityMode: SecurityMode, run_id=None):
         # retrieve ids of properties when missing:
         for record in to_be_updated:
             for parent in record.parents:
@@ -748,11 +781,16 @@ class Crawler(object):
             if prop.id is None:
                 entity = db.Entity(name=prop.name).retrieve()
                 prop.id = entity.id
+                # TODO return value unused
                 prop = _resolve_datatype(prop, entity)
         print("UPDATE")
         print(to_be_updated)
         if len(to_be_updated) > 0:
-            db.Container().extend(to_be_updated).update()
+            if securityMode.value > SecurityMode.INSERT.value:
+                db.Container().extend(to_be_updated).update()
+            elif run_id is not None:
+                update_cache = UpdateCache()
+                update_cache.insert(to_be_updated, run_id)
 
     def _synchronize(self, targetData: list[db.Record], commit_changes: bool = True):
         """
@@ -787,11 +825,38 @@ class Crawler(object):
         self.remove_unnecessary_updates(to_be_updated, identified_records)
 
         if commit_changes:
-            self.execute_inserts_in_list(to_be_inserted)
-            self.execute_updates_in_list(to_be_updated)
+            self.execute_inserts_in_list(to_be_inserted, self.securityMode, self.run_id)
+            self.execute_updates_in_list(to_be_updated, self.securityMode, self.run_id)
+
+            update_cache = UpdateCache()
+            pending_changes = update_cache.get_updates(self.run_id)
+
+            if pending_changes:
+                Crawler.inform_about_pending_changes(
+                    pending_changes, self.run_id, self.crawled_directory)
 
         return (to_be_inserted, to_be_updated)
 
+    @staticmethod
+    def inform_about_pending_changes(pending_changes, run_id, path):
+        # Sending an Email with a link to a form to authorize updates is
+        # only done in SSS mode
+
+        if "SHARED_DIR" in os.environ:
+            filename = OldCrawler.save_form([el[3] for el in pending_changes], path, run_id)
+            OldCrawler.send_mail([el[3] for el in pending_changes], filename)
+
+        for i, el in enumerate(pending_changes):
+
+            logger.debug(
+                """
+UNAUTHORIZED UPDATE ({} of {}):
+____________________\n""".format(i+1, len(pending_changes)) + str(el[3]))
+        logger.info("There were unauthorized changes (see above). An "
+                    "email was sent to the curator.\n"
+                    "You can authorize the updates by invoking the crawler"
+                    " with the run id: {rid}\n".format(rid=run_id))
+
     @staticmethod
     def debug_build_usage_tree(converter: Converter):
         res: dict[str, dict[str, Any]] = {
@@ -925,7 +990,8 @@ def main(crawled_directory_path: str,
          debug: bool = False,
          provenance_file: str = None,
          dry_run: bool = False,
-         prefix: str = ""):
+         prefix: str = "",
+         securityMode: SecurityMode = SecurityMode.UPDATE):
     """
 
     Parameters
@@ -944,13 +1010,15 @@ def main(crawled_directory_path: str,
         do not commit any chnages to the server
     prefix : str
         remove the given prefix from file paths
+    securityMode : SecurityMode
+        securityMode passed to the Crawler
 
     Returns
     -------
     return_value : int
         0 if successful
     """
-    crawler = Crawler(debug=debug)
+    crawler = Crawler(debug=debug, securityMode=securityMode)
     crawler.crawl_directory(crawled_directory_path, cfood_file_name)
     if provenance_file is not None:
         crawler.save_debug_data(provenance_file)
@@ -1018,7 +1086,9 @@ def parse_args():
     parser.add_argument("crawled_directory_path",
                         help="The subtree of files below the given path will "
                         "be considered. Use '/' for everything.")
-
+    parser.add_argument("-s", "--security-mode", choices=["retrieve", "insert", "update"], default="update",
+                        help="Determines whether entities may only be read from the server, or "
+                        "whether inserts or even updates may be done.")
     parser.add_argument("-n", "--dry-run", action="store_true",
                         help="Create two files dry.yml to show"
                         "what would actually be committed without doing the synchronization.")
@@ -1044,5 +1114,8 @@ if __name__ == "__main__":
         args.debug,
         args.provenance,
         args.dry_run,
-        args.prefix
+        args.prefix,
+        {"retrieve": SecurityMode.RETRIEVE,
+         "insert": SecurityMode.INSERT,
+         "update": SecurityMode.UPDATE}[args.security_mode]
     ))
-- 
GitLab
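
Usage note: the sketch below illustrates how the `SecurityMode` gating introduced in this patch is meant to be driven from Python. It relies only on the API visible in the diff above; the import path follows this patch (`caoscrawler.crawl`), and the directory and cfood file names are placeholders, not part of the patch.

```python
# Minimal usage sketch for the SecurityMode gating added in this patch.
# Assumptions: the caoscrawler package from this patch is installed; the
# paths below are placeholders.
from caoscrawler.crawl import Crawler, SecurityMode

# INSERT mode: during synchronization, new entities may be inserted directly,
# but updates of existing entities are not sent to the server. They are put
# into the UpdateCache under the crawler's run_id and wait for authorization
# by a curator (in SSS mode an email with a link to an authorization form is
# sent in addition).
crawler = Crawler(debug=False, securityMode=SecurityMode.INSERT)
crawler.crawl_directory("/path/to/extroot", "cfood.yml")  # placeholder arguments
# The subsequent synchronization step (not shown in this diff) applies the
# gating and logs the run id a curator needs in order to authorize the
# pending changes.

# SecurityMode.RETRIEVE additionally defers the inserts, while
# SecurityMode.UPDATE (the default) keeps the previous behaviour and commits
# both inserts and updates immediately.
```

On the command line the same choice is exposed through the new `-s`/`--security-mode` option, e.g. `--security-mode retrieve` for a run that writes nothing to the server directly.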