From df571be8d7c6884e6df3bbf570b789ab9585e54c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Henrik=20tom=20W=C3=B6rden?= <h.tomwoerden@indiscale.com>
Date: Mon, 27 Mar 2023 13:54:47 +0200
Subject: [PATCH] ENH: setup logging and reporting for serverside execution

- crawler_run_model.yml contains a model for Records that contain
  information on crawler jobs
- the new logging module sets up logging for serverside jobs
- crawl.py was refactored and now uses the above additions
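
An example of how these settings could be configured in the "caoscrawler"
section of the CaosDB client configuration (section and option names follow
the new config.py; host and mail addresses are placeholders, and the sendmail
options are only needed when notifications are enabled):

    [caoscrawler]
    send_crawler_notifications=True
    create_crawler_status_records=True
    public_host_url=https://caosdb.example.org
    sendmail_from_address=crawler@example.org
    sendmail_to_address=curator@example.org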
---
 crawler_run_model.yml      |  22 ++++
 src/caoscrawler/config.py  |  34 ++++++
 src/caoscrawler/crawl.py   | 227 +++++++++++++++++++++++++------------
 src/caoscrawler/logging.py |  73 ++++++++++++
 4 files changed, 282 insertions(+), 74 deletions(-)
 create mode 100644 crawler_run_model.yml
 create mode 100644 src/caoscrawler/config.py
 create mode 100644 src/caoscrawler/logging.py

diff --git a/crawler_run_model.yml b/crawler_run_model.yml
new file mode 100644
index 00000000..e80244b5
--- /dev/null
+++ b/crawler_run_model.yml
@@ -0,0 +1,22 @@
+
+CrawlerRun:
+  recommended_properties:
+    logfile:
+      datatype: TEXT
+      description: 'A log file.'
+    status:
+      datatype: TEXT
+      description: 'RUNNING, FAILED or SUCCESS'
+    run_id:
+      datatype: TEXT
+      description: 'Unique crawler run identifier'
+    started:
+      datatype: DATETIME
+      description: 'Time when the crawler started.'
+    finished:
+      datatype: DATETIME
+      description: 'Time when the crawler finished.'
+    number_of_inserted_entities:
+      datatype: INTEGER
+    number_of_updated_entities:
+      datatype: INTEGER
diff --git a/src/caoscrawler/config.py b/src/caoscrawler/config.py
new file mode 100644
index 00000000..18993b53
--- /dev/null
+++ b/src/caoscrawler/config.py
@@ -0,0 +1,34 @@
+# This file is a part of the CaosDB Project.
+#
+# Copyright (C) 2023 Indiscale GmbH <info@indiscale.com>
+# Copyright (C) 2023 Henrik tom Wörden <h.tomwoerden@indiscale.com>
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <https://www.gnu.org/licenses/>.
+#
+
+import caosdb as db
+
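+# Default values for the crawler settings; they can be overridden in the
+# "caoscrawler" section of the CaosDB client configuration (e.g. pycaosdb.ini).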
+DEFAULTS = {
+    "send_crawler_notifications": False,
+    "create_crawler_status_records": False,
+    "public_host_url": "/",
+}
+
+
+def get_config_setting(setting):
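+    """Return the value of ``setting`` from the ``caoscrawler`` section of the
+    CaosDB client configuration, or the value from DEFAULTS if it is not set
+    there."""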
+    caosdb_config = db.configuration.get_config()
+    if "caoscrawler" in caosdb_config and setting in caosdb_config["caoscrawler"]:
+        return caosdb_config["caoscrawler"][setting]
+    else:
+        return DEFAULTS[setting]
diff --git a/src/caoscrawler/crawl.py b/src/caoscrawler/crawl.py
index bacc5356..6f2354ad 100644
--- a/src/caoscrawler/crawl.py
+++ b/src/caoscrawler/crawl.py
@@ -32,6 +32,7 @@ the acuired data with CaosDB.
 from __future__ import annotations
 
 import argparse
+from datetime import datetime
 import importlib
 import logging
 import os
@@ -43,6 +44,7 @@ import yaml
 from argparse import RawTextHelpFormatter
 from collections import defaultdict
 from copy import deepcopy
+
 from enum import Enum
 from importlib_resources import files
 from jsonschema import validate
@@ -53,6 +55,7 @@ import caosdb as db
 from caosadvancedtools.utils import create_entity_link
 from caosadvancedtools.cache import UpdateCache, Cache
 from caosadvancedtools.crawler import Crawler as OldCrawler
+from caosadvancedtools.serverside.helper import send_mail
 from caosdb.apiutils import (compare_entities, EntityMergeConflictError,
                              merge_entities)
 from caosdb.common.datatype import is_reference
@@ -67,6 +70,8 @@ from .macros import defmacro_constructor, macro_constructor
 from .stores import GeneralStore, RecordStore
 from .structure_elements import StructureElement, Directory, NoneElement
 from .version import check_cfood_version
+from .config import get_config_setting
+from .logging import configure_server_side_logging
 
 from .scanner import (scan_directory,
                       load_definition,
@@ -213,7 +218,10 @@ class Crawler(object):
         # different caches.
         self.remote_existing_cache = IdentifiedCache()
         self.remote_missing_cache = IdentifiedCache()
+        # TODO does it make sense to have this as a member variable?
         self.securityMode = securityMode
+        # TODO does it make sense to have this as a member variable (run_id)?
+        self.generate_run_id()
 
         self.identifiableAdapter: IdentifiableAdapter = LocalStorageIdentifiableAdapter()
         if identifiableAdapter is not None:
@@ -830,8 +838,6 @@ class Crawler(object):
                 "use for example the Scanner to create this data."))
             crawled_data = self.crawled_data
 
-        self.generate_run_id()
-
         to_be_inserted, to_be_updated = self.split_into_inserts_and_updates(crawled_data)
         referencing_entities = self.create_reference_mapping(to_be_updated + to_be_inserted)
 
@@ -902,9 +908,7 @@ class Crawler(object):
     @staticmethod
     def inform_about_pending_changes(pending_changes, run_id, path, inserts=False):
-        # Sending an Email with a link to a form to authorize updates is
-        # only done in SSS mode
-
-        if "SHARED_DIR" in os.environ:
+        # Sending an Email with a link to a form to authorize updates is
+        # only done if the crawler is configured to send notifications.
+        if get_config_setting("send_crawler_notifications"):
             filename = OldCrawler.save_form(
                 [el[3] for el in pending_changes], path, run_id)
             OldCrawler.send_mail([el[3] for el in pending_changes], filename)
@@ -972,6 +976,118 @@ ____________________\n""".format(i + 1, len(pending_changes)) + str(el[3]))
             f.write(yaml.dump(paths, sort_keys=False))
 
 
+def _create_status_record(logfile_url, run_id):
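+    """Insert a CrawlerRun Record with status RUNNING for this crawler run, if
+    status records are enabled in the configuration."""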
+    if get_config_setting("create_crawler_status_records"):
+        (db.Record()
+            .add_parent('CrawlerRun')
+            .add_property('logfile', logfile_url)
+            .add_property('status', "RUNNING")
+            .add_property('run_id', run_id)
+            .add_property('started', datetime.now().isoformat())
+         .insert())
+
+
+def _update_status_record(run_id, n_inserts, n_updates, status):
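+    """Update the CrawlerRun Record of this run with the final status and the
+    number of inserted and updated entities, if status records are enabled."""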
+    if get_config_setting("create_crawler_status_records"):
+        cr_rec = db.execute_query(f"FIND RECORD CrawlerRun WITH run_id='{run_id}'", unique=True)
+        cr_rec.get_property('status').value = status
+        (cr_rec
+            .add_property(db.execute_query("FIND Property with name='number_of_inserted_entities'", unique=True).id, n_inserts)
+            .add_property(db.execute_query("FIND Property with name='number_of_updated_entities'", unique=True).id, n_updates)
+            .add_property(db.execute_query("FIND Property with name='finished'", unique=True).id, datetime.now().isoformat()))
+        logger.debug(cr_rec)
+        cr_rec.update()
+
+
+def _notify_about_inserts_and_updates(n_inserts, n_updates, logfile, run_id):
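+    """Send a notification email about the inserted and updated entities, if
+    notifications are enabled and anything was changed."""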
+    if not get_config_setting("send_crawler_notifications"):
+        return
+    if n_inserts == 0 and n_updates == 0:
+        return
+    text = f"""Dear Curator,
+the CaosDB Crawler successfully crawled the data and
+- inserted {n_inserts} new Entities and
+- updated {n_updates} existing Entities.
+
+"""
+
+    domain = get_config_setting("public_host_url")
+    if get_config_setting("create_crawler_status_records"):
+        text += ("You can check out the CrawlerRun Record for more information:\n"
+                 f"{domain}/Entity/?P=0L10&query=find%20crawlerrun%20with%20run_id=%27{run_id}%27\n\n")
+    text += f"You can download the logfile here:\n{domain}/Shared/{logfile}"
+    send_mail(
+        from_addr=get_config_setting("sendmail_from_address"),
+        to=get_config_setting("sendmail_to_address"),
+        subject="Crawler Update",
+        body=text)
+
+
+def _treat_deprecated_prefix(prefix, remove_prefix):
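+    """Return the effective prefix to be removed, warning about the deprecated
+    ``prefix`` argument and rejecting the use of both arguments at once."""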
+    if prefix != "":
+        warnings.warn(DeprecationWarning("The prefix argument is deprecated and will be removed "
+                                         "in the future. Please use `remove_prefix` instead."))
+        if remove_prefix is not None:
+            raise ValueError("Please do not supply the (deprecated) `prefix` and the "
+                             "`remove_prefix` argument at the same time. Only use "
+                             "`remove_prefix` instead.")
+        return prefix
+    return remove_prefix
+
+
+def _fix_file_paths(crawled_data, add_prefix, remove_prefix):
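+    """Adjust the paths of all File entities: strip ``remove_prefix``, prepend
+    ``add_prefix`` and unset the local file so that no upload takes place."""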
+    for elem in crawled_data:
+        if isinstance(elem, db.File):
+            # correct the file path:
+            # elem.file = os.path.join(args.path, elem.file)
+            if remove_prefix:
+                if elem.path.startswith(remove_prefix):
+                    elem.path = elem.path[len(remove_prefix):]
+                else:
+                    raise RuntimeError("Prefix shall be removed from file path but the path "
+                                       "does not start with the prefix:"
+                                       f"\n{remove_prefix}\n{elem.path}")
+            if add_prefix:
+                elem.path = add_prefix + elem.path
+            elem.file = None
+            # TODO: as long as the new file backend is not finished
+            #       we are using the loadFiles function to insert symlinks.
+            #       Therefore, I am setting the files to None here.
+            #       Otherwise, the symlinks in the database would be replaced
+            #       by uploads of the files which we currently do not want to happen.
+
+
+def _check_record_types(crawled_data):
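+    """Check that the parent RecordTypes of all crawled entities exist on the
+    server; raise a RuntimeError listing those that are missing."""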
+    rtsfinder = dict()
+
+    for elem in crawled_data:
+        # Check whether all needed RecordTypes exist:
+        if len(elem.parents) > 0:
+            for parent in elem.parents:
+                if parent.name in rtsfinder:
+                    continue
+
+                rt = db.RecordType(name=parent.name)
+                try:
+                    rt.retrieve()
+                    rtsfinder[parent.name] = True
+                except db.TransactionError:
+                    rtsfinder[parent.name] = False
+
+    notfound = [k for k, v in rtsfinder.items() if not v]
+    if len(notfound) > 0:
+        raise RuntimeError("Missing RecordTypes: {}".format(", ".join(notfound)))
+
+
+def _store_dry_run_data(ins, upd):
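+    """Write the prospective inserts and updates to ``dry.yml`` without
+    committing them to the server."""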
+    inserts = [str(i) for i in ins]
+    updates = [str(i) for i in upd]
+    with open("dry.yml", "w") as f:
+        f.write(yaml.dump({
+            "insert": inserts,
+            "update": updates}))
+
+
 def crawler_main(crawled_directory_path: str,
                  cfood_file_name: str,
                  identifiables_definition_file: Optional[str] = None,
@@ -996,7 +1112,7 @@ def crawler_main(crawled_directory_path: str,
     identifiables_definition_file : str
         filename of an identifiable definition yaml file
     debug : bool
-        whether or not to run in debug mode
+        DEPRECATED, whether or not to run in debug mode
     provenance_file : str
         provenance information will be stored in a file with given filename
     dry_run : bool
@@ -1020,82 +1136,45 @@ def crawler_main(crawled_directory_path: str,
     return_value : int
         0 if successful
     """
-    crawler = Crawler(securityMode=securityMode)
     try:
+        crawler = Crawler(securityMode=securityMode)
+
+        # Set up logging and reporting for server-side execution.
+        userlog_public = ""  # only set to a real file name in server-side execution
+        if "SHARED_DIR" in os.environ:
+            userlog_public, htmluserlog_public, _ = configure_server_side_logging()
+            _create_status_record(
+                get_config_setting("public_host_url") + "/Shared/" + htmluserlog_public, crawler.run_id)
 
         debug_tree = DebugTree()
         crawled_data = scan_directory(
             crawled_directory_path, cfood_file_name, restricted_path, debug_tree=debug_tree)
-    except ConverterValidationError as err:
-        logger.error(err)
-        return 1
-    if provenance_file is not None and debug:
-        crawler.save_debug_data(debug_tree, provenance_file)
+        remove_prefix = _treat_deprecated_prefix(prefix, remove_prefix)
+        _fix_file_paths(crawled_data, add_prefix, remove_prefix)
+        _check_record_types(crawled_data)
 
-    if identifiables_definition_file is not None:
-        ident = CaosDBIdentifiableAdapter()
-        ident.load_from_yaml_definition(identifiables_definition_file)
-        crawler.identifiableAdapter = ident
+        if provenance_file is not None and debug:
+            crawler.save_debug_data(debug_tree, provenance_file)
 
-    if prefix != "":
-        warnings.warn(DeprecationWarning("The prefix argument is deprecated and will be removed "
-                                         "in the future. Please use `remove_prefix` instead."))
-        if remove_prefix is not None:
-            raise ValueError("Please do not supply the (deprecated) `prefix` and the "
-                             "`remove_prefix` argument at the same time. Only use "
-                             "`remove_prefix` instead.")
-        remove_prefix = prefix
-
-    if dry_run:
-        ins, upd = crawler.synchronize(commit_changes=False, crawled_data=crawled_data)
-        inserts = [str(i) for i in ins]
-        updates = [str(i) for i in upd]
-        with open("dry.yml", "w") as f:
-            f.write(yaml.dump({
-                "insert": inserts,
-                "update": updates}))
-    else:
-        rtsfinder = dict()
-        for elem in crawled_data:
-            if isinstance(elem, db.File):
-                # correct the file path:
-                # elem.file = os.path.join(args.path, elem.file)
-                if remove_prefix:
-                    if elem.path.startswith(remove_prefix):
-                        elem.path = elem.path[len(remove_prefix):]
-                    else:
-                        raise RuntimeError("Prefix shall be removed from file path but the path "
-                                           "does not start with the prefix:"
-                                           f"\n{remove_prefix}\n{elem.path}")
-                if add_prefix:
-                    elem.path = add_prefix + elem.path
-                elem.file = None
-                # TODO: as long as the new file backend is not finished
-                #       we are using the loadFiles function to insert symlinks.
-                #       Therefore, I am setting the files to None here.
-                #       Otherwise, the symlinks in the database would be replaced
-                #       by uploads of the files which we currently do not want to happen.
-
-            # Check whether all needed RecordTypes exist:
-            if len(elem.parents) > 0:
-                for parent in elem.parents:
-                    if parent.name in rtsfinder:
-                        continue
+        if identifiables_definition_file is not None:
+            ident = CaosDBIdentifiableAdapter()
+            ident.load_from_yaml_definition(identifiables_definition_file)
+            crawler.identifiableAdapter = ident
 
-                    rt = db.RecordType(name=parent.name)
-                    try:
-                        rt.retrieve()
-                        rtsfinder[parent.name] = True
-                    except db.TransactionError:
-                        rtsfinder[parent.name] = False
-        notfound = [k for k, v in rtsfinder.items() if not v]
-        if len(notfound) > 0:
-            raise RuntimeError("Missing RecordTypes: {}".
-                               format(", ".join(notfound)))
-
-        crawler.synchronize(commit_changes=True, unique_names=unique_names,
-                            crawled_data=crawled_data)
-    return 0
+        if dry_run:
+            inserts, updates = crawler.synchronize(commit_changes=False, crawled_data=crawled_data)
+            _store_dry_run_data(inserts, updates)
+        else:
+            inserts, updates = crawler.synchronize(commit_changes=True, unique_names=unique_names,
+                                                   crawled_data=crawled_data)
+            _notify_about_inserts_and_updates(len(inserts), len(updates), userlog_public,
+                                              crawler.run_id)
+            _update_status_record(crawler.run_id, len(inserts), len(updates), status="SUCCESS")
+        return 0
+    except ConverterValidationError as err:
+        logger.error(err)
+        _update_status_record(crawler.run_id, 0, 0, status="FAILED")
+        return 1
 
 
 def parse_args():
diff --git a/src/caoscrawler/logging.py b/src/caoscrawler/logging.py
new file mode 100644
index 00000000..2ebe2c5f
--- /dev/null
+++ b/src/caoscrawler/logging.py
@@ -0,0 +1,73 @@
+#!/usr/bin/env python3
+# encoding: utf-8
+#
+# This file is a part of the CaosDB Project.
+#
+# Copyright (C) 2023 Henrik tom Wörden <h.tomwoerden@indiscale.com>
+# Copyright (C) 2023 IndiScale GmbH    <info@indiscale.com>
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <https://www.gnu.org/licenses/>.
+
+import logging
+
+from caosadvancedtools.webui_formatter import WebUI_Formatter
+from caosadvancedtools.serverside.helper import get_shared_filename
+import sys
+
+
+def configure_server_side_logging():
+    """
+    Set logging up to save one plain debugging log file, one plain info log
+    file (for users) and a stdout stream with messages wrapped in html elements
+
+    returns the path to the file with debugging output
+    """
+    adv_logger = logging.getLogger("caosadvancedtools")
+    adv_logger.setLevel(level=logging.DEBUG)
+
+    cr_logger = logging.getLogger("caoscrawler")
+    cr_logger.setLevel(level=logging.DEBUG)
+
+    userlog_public, userlog_internal = get_shared_filename("userlog.txt")
+
+    root_logger = logging.getLogger()
+    root_logger.setLevel(level=logging.INFO)
+
+    # this is a log file with INFO level for the user
+    user_file_handler = logging.FileHandler(filename=userlog_internal)
+    user_file_handler.setLevel(logging.INFO)
+    root_logger.addHandler(user_file_handler)
+
+    # The output shall be printed in the webui. Thus wrap it in html elements.
+    formatter = WebUI_Formatter(full_file="/Shared/{}".format(userlog_public))
+    web_handler = logging.StreamHandler(stream=sys.stdout)
+    web_handler.setFormatter(formatter)
+    web_handler.setLevel(logging.INFO)
+    root_logger.addHandler(web_handler)
+
+    # Also create an HTML version for later use.
+    htmluserlog_public, htmluserlog_internal = get_shared_filename("userlog.html")
+    formatter = WebUI_Formatter(full_file="/Shared/{}".format(htmluserlog_public))
+    lweb_handler = logging.FileHandler(filename=htmluserlog_internal)
+    lweb_handler.setFormatter(formatter)
+    lweb_handler.setLevel(logging.INFO)
+    root_logger.addHandler(lweb_handler)
+
+    # one log file with debug level output
+    debuglog_public, debuglog_internal = get_shared_filename("debuglog.txt")
+    debug_handler = logging.FileHandler(filename=debuglog_internal)
+    debug_handler.setLevel(logging.DEBUG)
+    root_logger.addHandler(debug_handler)
+
+    return userlog_public, htmluserlog_public, debuglog_public
-- 
GitLab