From df571be8d7c6884e6df3bbf570b789ab9585e54c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Henrik=20tom=20W=C3=B6rden?= <h.tomwoerden@indiscale.com>
Date: Mon, 27 Mar 2023 13:54:47 +0200
Subject: [PATCH] ENH: set up logging and reporting for server-side execution

- crawler_run_model.yml contains a model for Records that contain
  information on crawler jobs
- the new logging module sets up logging for server-side jobs
- crawl.py was refactored and now uses the above additions
---
 crawler_run_model.yml      |  22 ++++
 src/caoscrawler/config.py  |  34 ++++++
 src/caoscrawler/crawl.py   | 227 +++++++++++++++++++++++++------------
 src/caoscrawler/logging.py |  73 ++++++++++++
 4 files changed, 282 insertions(+), 74 deletions(-)
 create mode 100644 crawler_run_model.yml
 create mode 100644 src/caoscrawler/config.py
 create mode 100644 src/caoscrawler/logging.py

diff --git a/crawler_run_model.yml b/crawler_run_model.yml
new file mode 100644
index 00000000..e80244b5
--- /dev/null
+++ b/crawler_run_model.yml
@@ -0,0 +1,22 @@
+
+CrawlerRun:
+  recommended_properties:
+    logfile:
+      datatype: TEXT
+      description: 'A log file.'
+    status:
+      datatype: TEXT
+      description: 'RUNNING, FAILED or SUCCESS'
+    run_id:
+      datatype: TEXT
+      description: 'Unique crawler run identifier'
+    started:
+      datatype: DATETIME
+      description: 'Time when the crawler started.'
+    finished:
+      datatype: DATETIME
+      description: 'Time when the crawler finished.'
+    number_of_inserted_entities:
+      datatype: INTEGER
+    number_of_updated_entities:
+      datatype: INTEGER
diff --git a/src/caoscrawler/config.py b/src/caoscrawler/config.py
new file mode 100644
index 00000000..18993b53
--- /dev/null
+++ b/src/caoscrawler/config.py
@@ -0,0 +1,34 @@
+# This file is a part of the CaosDB Project.
+#
+# Copyright (C) 2023 Indiscale GmbH <info@indiscale.com>
+# Copyright (C) 2023 Henrik tom Wörden <h.tomwoerden@indiscale.com>
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <https://www.gnu.org/licenses/>.
+#
+
+import caosdb as db
+
+DEFAULTS = {
+    "send_crawler_notifications": False,
+    "create_crawler_status_records": False,
+    "public_host_url": "/",
+}
+
+
+def get_config_setting(setting):
+    caosdb_config = db.configuration.get_config()
+    if "caoscrawler" in caosdb_config and setting in caosdb_config["caoscrawler"]:
+        return caosdb_config["caoscrawler"][setting]
+    else:
+        return DEFAULTS[setting]
diff --git a/src/caoscrawler/crawl.py b/src/caoscrawler/crawl.py
index bacc5356..6f2354ad 100644
--- a/src/caoscrawler/crawl.py
+++ b/src/caoscrawler/crawl.py
@@ -32,6 +32,7 @@ the acuired data with CaosDB.
 from __future__ import annotations
 
 import argparse
+from datetime import datetime
 import importlib
 import logging
 import os
@@ -43,6 +44,7 @@ import yaml
 from argparse import RawTextHelpFormatter
 from collections import defaultdict
 from copy import deepcopy
+
 from enum import Enum
 from importlib_resources import files
 from jsonschema import validate
@@ -53,6 +55,7 @@ import caosdb as db
 from caosadvancedtools.utils import create_entity_link
 from caosadvancedtools.cache import UpdateCache, Cache
 from caosadvancedtools.crawler import Crawler as OldCrawler
+from caosadvancedtools.serverside.helper import send_mail
 from caosdb.apiutils import (compare_entities, EntityMergeConflictError,
                              merge_entities)
 from caosdb.common.datatype import is_reference
@@ -67,6 +70,8 @@ from .macros import defmacro_constructor, macro_constructor
 from .stores import GeneralStore, RecordStore
 from .structure_elements import StructureElement, Directory, NoneElement
 from .version import check_cfood_version
+from .config import get_config_setting
+from .logging import configure_server_side_logging
 from .scanner import (scan_directory,
                       load_definition,
@@ -213,7 +218,10 @@ class Crawler(object):
         # different caches.
         self.remote_existing_cache = IdentifiedCache()
         self.remote_missing_cache = IdentifiedCache()
+        # TODO does it make sense to have this as a member variable?
         self.securityMode = securityMode
+        # TODO does it make sense to have this as a member variable (run_id)?
+        self.generate_run_id()
 
         self.identifiableAdapter: IdentifiableAdapter = LocalStorageIdentifiableAdapter()
         if identifiableAdapter is not None:
@@ -830,8 +838,6 @@ class Crawler(object):
                               "use for example the Scanner to create this data."))
             crawled_data = self.crawled_data
 
-        self.generate_run_id()
-
         to_be_inserted, to_be_updated = self.split_into_inserts_and_updates(crawled_data)
         referencing_entities = self.create_reference_mapping(to_be_updated + to_be_inserted)
@@ -902,9 +908,7 @@ class Crawler(object):
     @staticmethod
     def inform_about_pending_changes(pending_changes, run_id, path, inserts=False):
-        # Sending an Email with a link to a form to authorize updates is
-        # only done in SSS mode
-
-        if "SHARED_DIR" in os.environ:
+        # An email with a link to an authorization form is only sent if notifications are enabled.
+        if get_config_setting("send_crawler_notifications"):
             filename = OldCrawler.save_form(
                 [el[3] for el in pending_changes], path, run_id)
             OldCrawler.send_mail([el[3] for el in pending_changes], filename)
@@ -972,6 +976,118 @@ ____________________\n""".format(i + 1, len(pending_changes)) + str(el[3]))
         f.write(yaml.dump(paths, sort_keys=False))
 
 
+def _create_status_record(logfile_url, run_id):
+    if get_config_setting("create_crawler_status_records"):
+        (db.Record()
+         .add_parent('CrawlerRun')
+         .add_property('logfile', logfile_url)
+         .add_property('status', "RUNNING")
+         .add_property('run_id', run_id)
+         .add_property('started', datetime.now().isoformat())
+         .insert())
+
+
+def _update_status_record(run_id, n_inserts, n_updates, status):
+    if get_config_setting("create_crawler_status_records"):
+        cr_rec = db.execute_query(f"FIND RECORD CrawlerRun WITH run_id='{run_id}'", unique=True)
+        cr_rec.get_property('status').value = status
+        (cr_rec
+         .add_property(db.execute_query("FIND Property with name='number_of_inserted_entities'", unique=True).id, n_inserts)
+         .add_property(db.execute_query("FIND Property with name='number_of_updated_entities'", unique=True).id, n_updates)
+         .add_property(db.execute_query("FIND Property with name='finished'", unique=True).id, datetime.now().isoformat()))
+        logger.debug(cr_rec)
+        cr_rec.update()
+
+
+def _notify_about_inserts_and_updates(n_inserts, n_updates, logfile, run_id):
+    if not get_config_setting("send_crawler_notifications"):
+        return
+    if n_inserts == 0 and n_updates == 0:
+        return
+    text = f"""Dear Curator,
+the CaosDB Crawler successfully crawled the data and
+- inserted {n_inserts} new Entities and
+- updated {n_updates} existing Entities.
+
+"""
+
+    if get_config_setting("create_crawler_status_records"):
+        domain = get_config_setting("public_host_url")
+        text += ("You can check out the CrawlerRun Record for more information:\n"
+                 f"{domain}/Entity/?P=0L10&query=find%20crawlerrun%20with%20run_id=%27{run_id}%27\n\n")
+        text += (f"You can download the logfile here:\n{domain}/Shared/" + logfile)
+    send_mail(
+        from_addr=get_config_setting("sendmail_from_address"),
+        to=get_config_setting("sendmail_to_address"),
+        subject="Crawler Update",
+        body=text)
+
+
+def _treat_deprecated_prefix(prefix, remove_prefix):
+    if prefix != "":
+        warnings.warn(DeprecationWarning("The prefix argument is deprecated and will be removed "
+                                         "in the future. Please use `remove_prefix` instead."))
+        if remove_prefix is not None:
+            raise ValueError("Please do not supply the (deprecated) `prefix` and the "
+                             "`remove_prefix` argument at the same time. Only use "
+                             "`remove_prefix` instead.")
+        return prefix
+    return remove_prefix
+
+
+def _fix_file_paths(crawled_data, add_prefix, remove_prefix):
+    for elem in crawled_data:
+        if isinstance(elem, db.File):
+            # correct the file path:
+            # elem.file = os.path.join(args.path, elem.file)
+            if remove_prefix:
+                if elem.path.startswith(remove_prefix):
+                    elem.path = elem.path[len(remove_prefix):]
+                else:
+                    raise RuntimeError("Prefix shall be removed from file path but the path "
+                                       "does not start with the prefix:"
+                                       f"\n{remove_prefix}\n{elem.path}")
+            if add_prefix:
+                elem.path = add_prefix + elem.path
+            elem.file = None
+            # TODO: as long as the new file backend is not finished
+            # we are using the loadFiles function to insert symlinks.
+            # Therefore, I am setting the files to None here.
+            # Otherwise, the symlinks in the database would be replaced
+            # by uploads of the files which we currently do not want to happen.
+
+
+def _check_record_types(crawled_data):
+    rtsfinder = dict()
+
+    for elem in crawled_data:
+        # Check whether all needed RecordTypes exist:
+        if len(elem.parents) > 0:
+            for parent in elem.parents:
+                if parent.name in rtsfinder:
+                    continue
+
+                rt = db.RecordType(name=parent.name)
+                try:
+                    rt.retrieve()
+                    rtsfinder[parent.name] = True
+                except db.TransactionError:
+                    rtsfinder[parent.name] = False
+
+    notfound = [k for k, v in rtsfinder.items() if not v]
+    if len(notfound) > 0:
+        raise RuntimeError("Missing RecordTypes: {}".
+                           format(", ".join(notfound)))
+
+
+def _store_dry_run_data(ins, upd):
+    inserts = [str(i) for i in ins]
+    updates = [str(i) for i in upd]
+    with open("dry.yml", "w") as f:
+        f.write(yaml.dump({
+            "insert": inserts,
+            "update": updates}))
+
+
 def crawler_main(crawled_directory_path: str,
                  cfood_file_name: str,
                  identifiables_definition_file: Optional[str] = None,
@@ -996,7 +1112,7 @@ def crawler_main(crawled_directory_path: str,
     identifiables_definition_file : str
         filename of an identifiable definition yaml file
     debug : bool
-        whether or not to run in debug mode
+        DEPRECATED, whether or not to run in debug mode
     provenance_file : str
         provenance information will be stored in a file with given filename
     dry_run : bool
@@ -1020,82 +1136,45 @@ def crawler_main(crawled_directory_path: str,
     return_value : int
         0 if successful
     """
-    crawler = Crawler(securityMode=securityMode)
     try:
+        crawler = Crawler(securityMode=securityMode)
+        userlog_public, htmluserlog_public = "", ""
+        # Set up logging and reporting in case of server-side execution.
+        if "SHARED_DIR" in os.environ:
+            userlog_public, htmluserlog_public, _ = configure_server_side_logging()
+            _create_status_record(
+                get_config_setting("public_host_url") + "/Shared/" + htmluserlog_public, crawler.run_id)
         debug_tree = DebugTree()
         crawled_data = scan_directory(
             crawled_directory_path, cfood_file_name, restricted_path, debug_tree=debug_tree)
-    except ConverterValidationError as err:
-        logger.error(err)
-        return 1
 
-    if provenance_file is not None and debug:
-        crawler.save_debug_data(debug_tree, provenance_file)
+        remove_prefix = _treat_deprecated_prefix(prefix, remove_prefix)
+        _fix_file_paths(crawled_data, add_prefix, remove_prefix)
+        _check_record_types(crawled_data)
 
-    if identifiables_definition_file is not None:
-        ident = CaosDBIdentifiableAdapter()
-        ident.load_from_yaml_definition(identifiables_definition_file)
-        crawler.identifiableAdapter = ident
+        if provenance_file is not None and debug:
+            crawler.save_debug_data(debug_tree, provenance_file)
 
-    if prefix != "":
-        warnings.warn(DeprecationWarning("The prefix argument is deprecated and will be removed "
-                                         "in the future. Please use `remove_prefix` instead."))
-        if remove_prefix is not None:
-            raise ValueError("Please do not supply the (deprecated) `prefix` and the "
-                             "`remove_prefix` argument at the same time. Only use "
-                             "`remove_prefix` instead.")
-        remove_prefix = prefix
-
-    if dry_run:
-        ins, upd = crawler.synchronize(commit_changes=False, crawled_data=crawled_data)
-        inserts = [str(i) for i in ins]
-        updates = [str(i) for i in upd]
-        with open("dry.yml", "w") as f:
-            f.write(yaml.dump({
-                "insert": inserts,
-                "update": updates}))
-    else:
-        rtsfinder = dict()
-        for elem in crawled_data:
-            if isinstance(elem, db.File):
-                # correct the file path:
-                # elem.file = os.path.join(args.path, elem.file)
-                if remove_prefix:
-                    if elem.path.startswith(remove_prefix):
-                        elem.path = elem.path[len(remove_prefix):]
-                    else:
-                        raise RuntimeError("Prefix shall be removed from file path but the path "
-                                           "does not start with the prefix:"
-                                           f"\n{remove_prefix}\n{elem.path}")
-                if add_prefix:
-                    elem.path = add_prefix + elem.path
-                elem.file = None
-                # TODO: as long as the new file backend is not finished
-                # we are using the loadFiles function to insert symlinks.
-                # Therefore, I am setting the files to None here.
-                # Otherwise, the symlinks in the database would be replaced
-                # by uploads of the files which we currently do not want to happen.
-
-                # Check whether all needed RecordTypes exist:
-                for parent in elem.parents:
-                    if parent.name in rtsfinder:
-                        continue
+        if identifiables_definition_file is not None:
+            ident = CaosDBIdentifiableAdapter()
+            ident.load_from_yaml_definition(identifiables_definition_file)
+            crawler.identifiableAdapter = ident
 
-                    rt = db.RecordType(name=parent.name)
-                    try:
-                        rt.retrieve()
-                        rtsfinder[parent.name] = True
-                    except db.TransactionError:
-                        rtsfinder[parent.name] = False
-        notfound = [k for k, v in rtsfinder.items() if not v]
-        if len(notfound) > 0:
-            raise RuntimeError("Missing RecordTypes: {}".
-                               format(", ".join(notfound)))
-
-        crawler.synchronize(commit_changes=True, unique_names=unique_names,
-                            crawled_data=crawled_data)
-    return 0
+
+        if dry_run:
+            inserts, updates = crawler.synchronize(commit_changes=False, crawled_data=crawled_data)
+            _store_dry_run_data(inserts, updates)
+        else:
+            inserts, updates = crawler.synchronize(commit_changes=True, unique_names=unique_names,
+                                                   crawled_data=crawled_data)
+            _notify_about_inserts_and_updates(len(inserts), len(updates), userlog_public,
+                                              crawler.run_id)
+            _update_status_record(crawler.run_id, len(inserts), len(updates), status="SUCCESS")
+        return 0
+    except ConverterValidationError as err:
+        logger.error(err)
+        _update_status_record(crawler.run_id, 0, 0, status="FAILED")
+        return 1
 
 
 def parse_args():
diff --git a/src/caoscrawler/logging.py b/src/caoscrawler/logging.py
new file mode 100644
index 00000000..2ebe2c5f
--- /dev/null
+++ b/src/caoscrawler/logging.py
@@ -0,0 +1,73 @@
+#!/usr/bin/env python3
+# encoding: utf-8
+#
+# This file is a part of the CaosDB Project.
+#
+# Copyright (C) 2023 Henrik tom Wörden <h.tomwoerden@indiscale.com>
+# Copyright (C) 2023 IndiScale GmbH <info@indiscale.com>
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <https://www.gnu.org/licenses/>.
+
+import logging
+
+from caosadvancedtools.webui_formatter import WebUI_Formatter
+from caosadvancedtools.serverside.helper import get_shared_filename
+import sys
+
+
+def configure_server_side_logging():
+    """
+    Set up logging such that it saves one plain debug log file, one plain info
+    log file (for users) and a stdout stream with messages wrapped in HTML elements.
+
+    Returns the public names of the user log, the HTML user log and the debug log.
+    """
+    adv_logger = logging.getLogger("caosadvancedtools")
+    adv_logger.setLevel(level=logging.DEBUG)
+
+    cr_logger = logging.getLogger("caoscrawler")
+    cr_logger.setLevel(level=logging.DEBUG)
+
+    userlog_public, userlog_internal = get_shared_filename("userlog.txt")
+
+    root_logger = logging.getLogger()
+    root_logger.setLevel(level=logging.INFO)
+
+    # This is a plain log file with INFO level for the user.
+    user_file_handler = logging.FileHandler(filename=userlog_internal)
+    user_file_handler.setLevel(logging.INFO)
+    root_logger.addHandler(user_file_handler)
+
+    # The output shall be printed in the webui. Thus wrap it in HTML elements.
+    formatter = WebUI_Formatter(full_file="/Shared/{}".format(userlog_public))
+    web_handler = logging.StreamHandler(stream=sys.stdout)
+    web_handler.setFormatter(formatter)
+    web_handler.setLevel(logging.INFO)
+    root_logger.addHandler(web_handler)
+
+    # Also create an HTML version for later use.
+    htmluserlog_public, htmluserlog_internal = get_shared_filename("userlog.html")
+    formatter = WebUI_Formatter(full_file="/Shared/{}".format(htmluserlog_public))
+    lweb_handler = logging.FileHandler(filename=htmluserlog_internal)
+    lweb_handler.setFormatter(formatter)
+    lweb_handler.setLevel(logging.INFO)
+    root_logger.addHandler(lweb_handler)
+
+    # One log file with DEBUG level output.
+    debuglog_public, debuglog_internal = get_shared_filename("debuglog.txt")
+    debug_handler = logging.FileHandler(filename=debuglog_internal)
+    debug_handler.setLevel(logging.DEBUG)
+    root_logger.addHandler(debug_handler)
+
+    return userlog_public, htmluserlog_public, debuglog_public
-- 
GitLab
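
Illustration (not part of the patch): a minimal sketch of how the new config
and logging helpers fit together in a server-side job. Only names introduced
by this patch are used; the ini values in the comment are hypothetical
examples for the settings defined in DEFAULTS in src/caoscrawler/config.py.

    # Hypothetical pycaosdb.ini excerpt, read via db.configuration.get_config():
    #
    #   [caoscrawler]
    #   send_crawler_notifications = True
    #   create_crawler_status_records = True
    #   public_host_url = https://caosdb.example.org
    import os

    from caoscrawler.config import get_config_setting
    from caoscrawler.logging import configure_server_side_logging

    # SHARED_DIR is only set when the CaosDB server executes a server-side script.
    if "SHARED_DIR" in os.environ:
        # The return values are the public file names used in /Shared/ URLs.
        userlog_public, htmluserlog_public, debuglog_public = configure_server_side_logging()
        # This is the logfile URL that _create_status_record() stores in the
        # CrawlerRun Record.
        logfile_url = get_config_setting("public_host_url") + "/Shared/" + htmluserlog_public
        print(logfile_url)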
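Before enabling create_crawler_status_records, the CrawlerRun data model from
crawler_run_model.yml has to exist on the server. A sketch of one way to
insert it, assuming the usual caosadvancedtools yaml-model helpers are
available in the deployed version:

    from caosadvancedtools.models.parser import parse_model_from_yaml

    # Parse crawler_run_model.yml and create/update the corresponding
    # RecordTypes and Properties (CrawlerRun, run_id, status, ...) on the server.
    model = parse_model_from_yaml("crawler_run_model.yml")
    model.sync_data_model(noquestion=True)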