From d90d85019791b11eff3a618798996281e0c09c02 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Henrik=20tom=20W=C3=B6rden?= <h.tomwoerden@indiscale.com>
Date: Fri, 15 Jul 2022 13:39:21 +0200
Subject: [PATCH] ENH: add authorization of changes

---
 README.md                  |   2 +
 integrationtests/README.md |   1 +
 src/caoscrawler/crawl.py   | 121 +++++++++++++++++++++++++++++--------
 3 files changed, 100 insertions(+), 24 deletions(-)

diff --git a/README.md b/README.md
index 59b88aaa..8576e5c9 100644
--- a/README.md
+++ b/README.md
@@ -25,6 +25,8 @@ After installation of the package run (within the project folder):
 pytest
 ```
 
+## Integration Tests
+see `integrationtests/README.md`
 
 # Contributers
 
diff --git a/integrationtests/README.md b/integrationtests/README.md
index 96789ed9..88d55902 100644
--- a/integrationtests/README.md
+++ b/integrationtests/README.md
@@ -1,2 +1,3 @@
 1. Mount test_data/extroot as extroot folder in the CaosDB server
 2. use an empty server
+3. run pytest from `src`: `python -m pytest ../integrationtests`
diff --git a/src/caoscrawler/crawl.py b/src/caoscrawler/crawl.py
index 3458b32b..ab544397 100644
--- a/src/caoscrawler/crawl.py
+++ b/src/caoscrawler/crawl.py
@@ -28,13 +28,19 @@
 Crawl a file structure using a yaml cfood definition and synchronize
 the acuired data with CaosDB.
 """
+import importlib
+from caosadvancedtools.cache import UpdateCache
+import uuid
 import sys
 import os
 import yaml
+from enum import Enum
+import logging
 from importlib_resources import files
 import argparse
 from argparse import RawTextHelpFormatter
 import caosdb as db
+from caosadvancedtools.crawler import Crawler as OldCrawler
 from caosdb.common.datatype import is_reference
 from .stores import GeneralStore, RecordStore
 from .identified_cache import IdentifiedCache
@@ -49,8 +55,8 @@
 from caosdb.apiutils import compare_entities, merge_entities
 from copy import deepcopy
 from jsonschema import validate
 
+logger = logging.getLogger(__name__)
 
-import importlib
 
 SPECIAL_PROPERTIES_STRICT = ("description", "name", "id", "path")
 SPECIAL_PROPERTIES_NOT_STRICT = ("file", "checksum", "size")
@@ -139,6 +145,12 @@ def _resolve_datatype(prop: db.Property, remote_entity: db.Entity):
     return prop
 
 
+class SecurityMode(Enum):
+    RETRIEVE = 0
+    INSERT = 1
+    UPDATE = 2
+
+
 class Crawler(object):
     """
     Crawler class that encapsulates crawling functions.
@@ -146,23 +158,35 @@ class Crawler(object):
     storage for values (general store).
     """
 
-    def __init__(self, converters: list[Converter] = [],
+    def __init__(self,
+                 converters: list[Converter] = [],
                  generalStore: Optional[GeneralStore] = None,
                  debug: bool = False,
-                 identifiableAdapter: IdentifiableAdapter = None):
+                 identifiableAdapter: IdentifiableAdapter = None,
+                 securityMode: SecurityMode = SecurityMode.UPDATE
+                 ):
         """
         Create a new crawler and initialize an empty RecordStore and GeneralStore.
 
-        converters: The set of converters used for this crawler.
-        recordStore: An initial GeneralStore which might store e.g. environment variables.
-
-        debug: Create a debugging information tree when set to True.
-               The debugging information tree is a variable stored in
-               self.debug_tree. It is a dictionary mapping directory entries
-               to a tuple of general stores and record stores which are valid for the directory scope.
-               Furthermore, it is stored in a second tree named self.debug_copied whether the
-               objects in debug_tree had been copied from a higher level in the hierarchy
-               of the structureelements.
+        Parameters
+        ----------
+        converters : list[Converter]
+             The set of converters used for this crawler.
+        generalStore : GeneralStore
+             An initial GeneralStore which might store e.g. environment variables.
+        debug : bool
+             Create a debugging information tree when set to True.
+             The debugging information tree is a variable stored in
+             self.debug_tree. It is a dictionary mapping directory entries
+             to a tuple of general stores and record stores which are valid for the directory scope.
+             Furthermore, it is stored in a second tree named self.debug_copied whether the
+             objects in debug_tree had been copied from a higher level in the hierarchy
+             of the structure elements.
+        identifiableAdapter : IdentifiableAdapter
+             TODO describe
+        securityMode : SecurityMode
+             Whether only retrieves are allowed or also inserts or even updates.
+             Please use the SecurityMode Enum.
         """
 
         # TODO: check if this feature is really needed
@@ -170,6 +194,7 @@ class Crawler(object):
 
         self.identified_cache = IdentifiedCache()
         self.recordStore = RecordStore()
+        self.securityMode = securityMode
 
         self.generalStore = generalStore
         if generalStore is None:
@@ -178,7 +203,8 @@ class Crawler(object):
         self.identifiableAdapter = identifiableAdapter
         if identifiableAdapter is None:
             self.identifiableAdapter = LocalStorageIdentifiableAdapter()
-
+        # If a directory is crawled, this may hold the path to that directory.
+        self.crawled_directory = None
         self.debug = debug
         if self.debug:
             # order in the tuple:
@@ -328,6 +354,7 @@ class Crawler(object):
             raise ValueError(
                 "You have to provide a non-empty path for crawling.")
         dir_structure_name = os.path.basename(dirname)
+        self.crawled_directory = dirname
         if not dir_structure_name and dirname.endswith('/'):
             if dirname == '/':
                 # Crawling the entire file system
@@ -390,6 +417,7 @@ class Crawler(object):
         if not isinstance(items, list):
             items = [items]
 
+        self.run_id = uuid.uuid1()
        local_converters = Crawler.create_local_converters(crawler_definition,
                                                            converter_registry)
         # This recursive crawling procedure generates the update list:
@@ -727,18 +755,23 @@ class Crawler(object):
         pass
 
     @staticmethod
-    def execute_inserts_in_list(to_be_inserted):
+    def execute_inserts_in_list(to_be_inserted, securityMode: SecurityMode, run_id=None):
         for record in to_be_inserted:
             for prop in record.properties:
                 entity = db.Entity(name=prop.name).retrieve()
+                # TODO return value unused
                 prop = _resolve_datatype(prop, entity)
         print("INSERT")
         print(to_be_inserted)
         if len(to_be_inserted) > 0:
-            db.Container().extend(to_be_inserted).insert()
+            if securityMode.value > SecurityMode.RETRIEVE.value:
+                db.Container().extend(to_be_inserted).insert()
+            elif run_id is not None:
+                update_cache = UpdateCache()
+                update_cache.insert(to_be_inserted, run_id)
 
     @staticmethod
-    def execute_updates_in_list(to_be_updated):
+    def execute_updates_in_list(to_be_updated, securityMode: SecurityMode, run_id=None):
         # retrieve ids of properties when missing:
         for record in to_be_updated:
             for parent in record.parents:
@@ -748,11 +781,16 @@ class Crawler(object):
             if prop.id is None:
                 entity = db.Entity(name=prop.name).retrieve()
                 prop.id = entity.id
+                # TODO return value unused
                 prop = _resolve_datatype(prop, entity)
         print("UPDATE")
         print(to_be_updated)
         if len(to_be_updated) > 0:
-            db.Container().extend(to_be_updated).update()
+            if securityMode.value > SecurityMode.INSERT.value:
+                db.Container().extend(to_be_updated).update()
+            elif run_id is not None:
+                update_cache = UpdateCache()
+                update_cache.insert(to_be_updated, run_id)
 
     def _synchronize(self, targetData: list[db.Record], commit_changes: bool = True):
         """
@@ -787,11 +825,38 @@ class Crawler(object):
         self.remove_unnecessary_updates(to_be_updated, identified_records)
 
         if commit_changes:
-            self.execute_inserts_in_list(to_be_inserted)
-            self.execute_updates_in_list(to_be_updated)
+            self.execute_inserts_in_list(to_be_inserted, self.securityMode, self.run_id)
+            self.execute_updates_in_list(to_be_updated, self.securityMode, self.run_id)
+
+            update_cache = UpdateCache()
+            pending_changes = update_cache.get_updates(self.run_id)
+
+            if pending_changes:
+                Crawler.inform_about_pending_changes(
+                    pending_changes, self.run_id, self.crawled_directory)
 
         return (to_be_inserted, to_be_updated)
 
+    @staticmethod
+    def inform_about_pending_changes(pending_changes, run_id, path):
+        # Sending an Email with a link to a form to authorize updates is
+        # only done in SSS mode
+
+        if "SHARED_DIR" in os.environ:
+            filename = OldCrawler.save_form([el[3] for el in pending_changes], path, run_id)
+            OldCrawler.send_mail([el[3] for el in pending_changes], filename)
+
+        for i, el in enumerate(pending_changes):
+
+            logger.debug(
+                """
+UNAUTHORIZED UPDATE ({} of {}):
+____________________\n""".format(i+1, len(pending_changes)) + str(el[3]))
+        logger.info("There were unauthorized changes (see above). An "
+                    "email was sent to the curator.\n"
+                    "You can authorize the updates by invoking the crawler"
+                    " with the run id: {rid}\n".format(rid=run_id))
+
     @staticmethod
     def debug_build_usage_tree(converter: Converter):
         res: dict[str, dict[str, Any]] = {
@@ -925,7 +990,8 @@ def main(crawled_directory_path: str,
          debug: bool = False,
          provenance_file: str = None,
          dry_run: bool = False,
-         prefix: str = ""):
+         prefix: str = "",
+         securityMode: SecurityMode = SecurityMode.UPDATE):
     """
 
     Parameters
@@ -944,13 +1010,15 @@ def main(crawled_directory_path: str,
         do not commit any chnages to the server
     prefix : str
         remove the given prefix from file paths
+    securityMode : SecurityMode
+        securityMode passed to the Crawler
 
     Returns
     -------
     return_value : int
         0 if successful
     """
-    crawler = Crawler(debug=debug)
+    crawler = Crawler(debug=debug, securityMode=securityMode)
     crawler.crawl_directory(crawled_directory_path, cfood_file_name)
     if provenance_file is not None:
         crawler.save_debug_data(provenance_file)
@@ -1018,7 +1086,9 @@ def parse_args():
     parser.add_argument("crawled_directory_path",
                         help="The subtree of files below the given path will "
                         "be considered. Use '/' for everything.")
-
+    parser.add_argument("-s", "--security-mode", choices=["retrieve", "insert", "update"], default="update",
+                        help="Determines whether entities may only be read from the server, or "
+                        "whether inserts or even updates may be done.")
     parser.add_argument("-n", "--dry-run", action="store_true",
                         help="Create two files dry.yml to show"
                         "what would actually be committed without doing the synchronization.")
@@ -1044,5 +1114,8 @@ if __name__ == "__main__":
         args.debug,
         args.provenance,
         args.dry_run,
-        args.prefix
+        args.prefix,
+        {"retrieve": SecurityMode.RETRIEVE,
+         "insert": SecurityMode.INSERT,
+         "update": SecurityMode.UPDATE}[args.security_mode]
     ))
-- 
GitLab
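
Usage note: the sketch below illustrates how the `SecurityMode` gating introduced in this patch is meant to be driven from Python. It relies only on the API visible in the diff above; the import path follows this patch (`caoscrawler.crawl`), and the directory and cfood file names are placeholders, not part of the patch.

```python
# Minimal usage sketch for the SecurityMode gating added in this patch.
# Assumptions: the caoscrawler package from this patch is installed; the
# paths below are placeholders.
from caoscrawler.crawl import Crawler, SecurityMode

# INSERT mode: during synchronization, new entities may be inserted directly,
# but updates of existing entities are not sent to the server. They are put
# into the UpdateCache under the crawler's run_id and wait for authorization
# by a curator (in SSS mode an email with a link to an authorization form is
# sent in addition).
crawler = Crawler(debug=False, securityMode=SecurityMode.INSERT)
crawler.crawl_directory("/path/to/extroot", "cfood.yml")  # placeholder arguments
# The subsequent synchronization step (not shown in this diff) applies the
# gating and logs the run id a curator needs in order to authorize the
# pending changes.

# SecurityMode.RETRIEVE additionally defers the inserts, while
# SecurityMode.UPDATE (the default) keeps the previous behaviour and commits
# both inserts and updates immediately.
```

On the command line the same choice is exposed through the new `-s`/`--security-mode` option, e.g. `--security-mode retrieve` for a run that writes nothing to the server directly.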