diff --git a/CHANGELOG.md b/CHANGELOG.md
index f1a1f00586bda96fc0fbcd693953d02e1593459f..3fef595a99507b5781bcde96e22e27c5658f93b8 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -9,6 +9,14 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ### Added ###
 
+- Standard logging for server side execution
+- Email notification if the `pycaosdb.ini` contains a `[caoscrawler]` section with
+  `send_crawler_notifications=True`.
+- Creation of CrawlerRun Records that contain status information about the data
+  integration of the crawler if the `pycaosdb.ini` contains a `[caoscrawler]` section
+  with `create_crawler_status_records=True`.
+
+
 ### Changed ###
 
 ### Deprecated ###
diff --git a/crawler_run_model.yml b/crawler_run_model.yml
new file mode 100644
index 0000000000000000000000000000000000000000..e80244b5f449206410f25504b4d3e8bdbcfe0416
--- /dev/null
+++ b/crawler_run_model.yml
@@ -0,0 +1,22 @@
+
+CrawlerRun:
+  recommended_properties:
+    logfile:
+      datatype: TEXT
+      description: 'A log file.'
+    status:
+      datatype: TEXT
+      description: 'RUNNING, FAILED or SUCCESS'
+    run_id:
+      datatype: TEXT
+      description: 'Unique crawler run identifier'
+    started:
+      datatype: DATETIME
+      description: 'Time when the crawler started.'
+    finished:
+      datatype: DATETIME
+      description: 'Time when the crawler finished.'
+    number_of_inserted_entities:
+      datatype: INTEGER
+    number_of_updated_entities:
+      datatype: INTEGER
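The CrawlerRun data model above is not installed automatically; it can be synchronized into the
server with the caosadvancedtools YAML model parser, the same API the integration test below uses
for its own model file. A minimal sketch (filename as shipped with this change):

# Sketch: insert/update the CrawlerRun RecordType and its properties.
from caosadvancedtools.models import parser

model = parser.parse_model_from_yaml("crawler_run_model.yml")
model.sync_data_model(noquestion=True)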
diff --git a/integrationtests/test_use_case_simple_presentation.py b/integrationtests/test_use_case_simple_presentation.py
index 0f48677d4bf64158374a0eb0865eb2b85ea715db..5f66e6fae7a77a315437f4a030110ded8d0ce867 100644
--- a/integrationtests/test_use_case_simple_presentation.py
+++ b/integrationtests/test_use_case_simple_presentation.py
@@ -22,6 +22,7 @@
 # ** end header
 #
 
+import logging
 import os
 import pytest
 from subprocess import run
@@ -38,7 +39,7 @@ DATADIR = os.path.join(os.path.dirname(__file__), "test_data",
                        "extroot", "use_case_simple_presentation")
 
 
-def test_complete_crawler(clear_database):
+def test_complete_crawler(clear_database, caplog):
     # Setup the data model:
     model = parser.parse_model_from_yaml(os.path.join(DATADIR, "model.yml"))
     model.sync_data_model(noquestion=True, verbose=False)
@@ -56,16 +57,17 @@ def test_complete_crawler(clear_database):
                         forceAllowSymlinks=False)
 
     # test that a bad value for "remove_prefix" leads to runtime error
-    with pytest.raises(RuntimeError) as re:
-        crawler_main(
-            crawled_directory_path=os.path.join(DATADIR),
-            cfood_file_name=os.path.join(DATADIR, "cfood.yml"),
-            identifiables_definition_file=os.path.join(DATADIR, "identifiables.yml"),
-            provenance_file=os.path.join(DATADIR, "provenance.yml"),
-            dry_run=False,
-            remove_prefix="sldkfjsldf",
-        )
-    assert "path does not start with the prefix" in str(re.value)
+    caplog.set_level(logging.DEBUG, logger="caoscrawler.crawl")
+    assert 1 == crawler_main(
+        crawled_directory_path=os.path.join(DATADIR),
+        cfood_file_name=os.path.join(DATADIR, "cfood.yml"),
+        identifiables_definition_file=os.path.join(DATADIR, "identifiables.yml"),
+        provenance_file=os.path.join(DATADIR, "provenance.yml"),
+        dry_run=False,
+        remove_prefix="sldkfjsldf",
+    )
+    assert "path does not start with the prefix" in caplog.text
+    caplog.clear()
 
     crawler_main(
         crawled_directory_path=os.path.join(DATADIR),
diff --git a/src/caoscrawler/config.py b/src/caoscrawler/config.py
new file mode 100644
index 0000000000000000000000000000000000000000..18993b539a09aa58fa280759333b3e7fd315c5e0
--- /dev/null
+++ b/src/caoscrawler/config.py
@@ -0,0 +1,34 @@
+# This file is a part of the CaosDB Project.
+#
+# Copyright (C) 2023 Indiscale GmbH <info@indiscale.com>
+# Copyright (C) 2023 Henrik tom Wörden <h.tomwoerden@indiscale.com>
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <https://www.gnu.org/licenses/>.
+#
+
+import caosdb as db
+
+DEFAULTS = {
+    "send_crawler_notifications": False,
+    "create_crawler_status_records": False,
+    "public_host_url": "/",
+}
+
+
+def get_config_setting(setting):
+    caosdb_config = db.configuration.get_config()
+    if "caoscrawler" in caosdb_config and setting in caosdb_config["caoscrawler"]:
+        return caosdb_config["caoscrawler"][setting]
+    else:
+        return DEFAULTS[setting]
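get_config_setting() looks up a key in the optional ``[caoscrawler]`` section of the
``pycaosdb.ini`` and falls back to DEFAULTS. A small usage sketch (note that values read from the
ini file arrive as strings, e.g. "True", while the fallbacks are Python values):

# Sketch: reading crawler settings; the ini section is documented in
# src/doc/getting_started/optionalfeatures.rst below.
from caoscrawler.config import get_config_setting

notify = get_config_setting("send_crawler_notifications")  # "True" from the ini, else False
host = get_config_setting("public_host_url")               # configured URL, else "/"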
diff --git a/src/caoscrawler/crawl.py b/src/caoscrawler/crawl.py
index 3c93f12aee49f3ab2b67e8ed89c84c69fb9dfeae..cadd7798d93b94bf4f11c76d18fe8431e61c5d0a 100644
--- a/src/caoscrawler/crawl.py
+++ b/src/caoscrawler/crawl.py
@@ -32,6 +32,7 @@ the acuired data with CaosDB.
 from __future__ import annotations
 
 import argparse
+from datetime import datetime
 import importlib
 import logging
 import os
@@ -43,6 +44,7 @@ import yaml
 from argparse import RawTextHelpFormatter
 from collections import defaultdict
 from copy import deepcopy
+
 from enum import Enum
 from importlib_resources import files
 from jsonschema import validate
@@ -53,6 +55,7 @@ import caosdb as db
 from caosadvancedtools.utils import create_entity_link
 from caosadvancedtools.cache import UpdateCache, Cache
 from caosadvancedtools.crawler import Crawler as OldCrawler
+from caosadvancedtools.serverside.helper import send_mail
 from caosdb.apiutils import (compare_entities, EntityMergeConflictError,
                              merge_entities)
 from caosdb.common.datatype import is_reference
@@ -67,6 +70,8 @@ from .macros import defmacro_constructor, macro_constructor
 from .stores import GeneralStore, RecordStore
 from .structure_elements import StructureElement, Directory, NoneElement
 from .version import check_cfood_version
+from .config import get_config_setting
+from .logging import configure_server_side_logging
 
 from .scanner import (scan_directory,
                       load_definition,
@@ -213,7 +218,10 @@ class Crawler(object):
         # different caches.
         self.remote_existing_cache = IdentifiedCache()
         self.remote_missing_cache = IdentifiedCache()
+        # TODO does it make sense to have this as member variable?
         self.securityMode = securityMode
+        # TODO does it make sense to have this as member variable (run_id)?
+        self.generate_run_id()
 
         self.identifiableAdapter: IdentifiableAdapter = LocalStorageIdentifiableAdapter()
         if identifiableAdapter is not None:
@@ -830,8 +838,6 @@ class Crawler(object):
                 "use for example the Scanner to create this data."))
             crawled_data = self.crawled_data
 
-        self.generate_run_id()
-
         to_be_inserted, to_be_updated = self.split_into_inserts_and_updates(crawled_data)
         referencing_entities = self.create_reference_mapping(to_be_updated + to_be_inserted)
 
@@ -902,9 +908,7 @@ class Crawler(object):
     @staticmethod
     def inform_about_pending_changes(pending_changes, run_id, path, inserts=False):
         # Sending an Email with a link to a form to authorize updates is
-        # only done in SSS mode
-
-        if "SHARED_DIR" in os.environ:
+        if get_config_setting("send_crawler_notifications"):
             filename = OldCrawler.save_form(
                 [el[3] for el in pending_changes], path, run_id)
             OldCrawler.send_mail([el[3] for el in pending_changes], filename)
@@ -972,6 +976,150 @@ ____________________\n""".format(i + 1, len(pending_changes)) + str(el[3]))
         f.write(yaml.dump(paths, sort_keys=False))
 
 
+def _create_status_record(logfile_url, run_id):
+    """Insert a CrawlerRun Record
+
+    CrawlerRun Records are used to have a (somewhat) persistent feedback from crawler runs that
+    is easily accessible by users.
+    """
+    if get_config_setting("create_crawler_status_records"):
+        (db.Record()
+         .add_parent('CrawlerRun')
+         .add_property('logfile', logfile_url)
+         .add_property('status', "RUNNING")
+         .add_property('run_id', run_id)
+         .add_property('started', datetime.now().isoformat())
+         .insert())
+
+
+def _update_status_record(run_id, n_inserts, n_updates, status):
+    """Update the CrawlerRun Record
+
+    The Record is identified using the run_id. The status is changed and some information about
+    the run is added.
+    """
+    if get_config_setting("create_crawler_status_records"):
+        cr_rec = db.execute_query(f"FIND RECORD CrawlerRun WITH run_id={run_id}", unique=True)
+        cr_rec.get_property('status').value = status
+        (cr_rec
+         .add_property(db.execute_query(
+             f"FIND Property with name='number_of_inserted_entities'", unique=True).id,
+             n_inserts)
+         .add_property(
+             db.execute_query(f"FIND Property with name='number_of_updated_entities'",
+                              unique=True).id, n_updates)
+         .add_property(
+             db.execute_query(f"FIND Property with name='finished'",
+                              unique=True).id, datetime.now().isoformat()))
+        cr_rec.update()
+
+
+def _notify_about_inserts_and_updates(n_inserts, n_updates, logfile, run_id):
+    """Send an email notification
+
+    Only if there were inserts or updates.
+
+    The email contains some basic information and a link to the log and the CrawlerRun Record.
+    """
+    if not get_config_setting("send_crawler_notifications"):
+        return
+    if n_inserts == 0 and n_updates == 0:
+        return
+    text = f"""Dear Curator,
+the CaosDB Crawler successfully crawled the data and
+- inserted {n_inserts} new Entities and
+- updated {n_updates} existing Entities.
+
+"""
+
+    domain = get_config_setting("public_host_url")
+    if get_config_setting("create_crawler_status_records"):
+        text += ("You can check out the CrawlerRun Record for more information:\n"
+                 f"{domain}/Entity/?P=0L10&query=find%20crawlerrun%20with%20run_id=%27{run_id}%27\n\n")
+    text += (f"You can download the logfile here:\n{domain}/Shared/" + logfile)
+    send_mail(
+        from_addr=get_config_setting("sendmail_from_address"),
+        to=get_config_setting("sendmail_to_address"),
+        subject="Crawler Update",
+        body=text)
+
+
+def _treat_deprecated_prefix(prefix, remove_prefix):
+    """Notify about the deprecation and use the given value."""
+    if prefix != "":
+        warnings.warn(DeprecationWarning("The prefix argument is deprecated and will be removed "
+                                         "in the future. Please use `remove_prefix` instead."))
+        if remove_prefix is not None:
+            raise ValueError("Please do not supply the (deprecated) `prefix` and the "
+                             "`remove_prefix` argument at the same time. Only use "
+                             "`remove_prefix` instead.")
+        return prefix
+    return remove_prefix
+
+
+def _fix_file_paths(crawled_data, add_prefix, remove_prefix):
+    """Adjust the path according to add_prefix/remove_prefix
+
+    Also remove the `file` attribute from File entities (because inserts currently need to be
+    done by loadFiles).
+    """
+    for elem in crawled_data:
+        if isinstance(elem, db.File):
+            # correct the file path:
+            # elem.file = os.path.join(args.path, elem.file)
+            if remove_prefix:
+                if elem.path.startswith(remove_prefix):
+                    elem.path = elem.path[len(remove_prefix):]
+                else:
+                    raise RuntimeError("Prefix shall be removed from file path but the path "
+                                       "does not start with the prefix:"
+                                       f"\n{remove_prefix}\n{elem.path}")
+            if add_prefix:
+                elem.path = add_prefix + elem.path
+            elem.file = None
+            # TODO: as long as the new file backend is not finished
+            # we are using the loadFiles function to insert symlinks.
+            # Therefore, I am setting the files to None here.
+            # Otherwise, the symlinks in the database would be replaced
+            # by uploads of the files which we currently do not want to happen.
+
+
+def _check_record_types(crawled_data):
+    """Check for all parents in crawled_data whether they exist
+
+    Raise a RuntimeError if any of them does not.
+    """
+    rtsfinder = dict()
+
+    for elem in crawled_data:
+        # Check whether all needed RecordTypes exist:
+        if len(elem.parents) > 0:
+            for parent in elem.parents:
+                if parent.name in rtsfinder:
+                    continue
+
+                rt = db.RecordType(name=parent.name)
+                try:
+                    rt.retrieve()
+                    rtsfinder[parent.name] = True
+                except db.TransactionError:
+                    rtsfinder[parent.name] = False
+
+    notfound = [k for k, v in rtsfinder.items() if not v]
+    if len(notfound) > 0:
+        raise RuntimeError("Missing RecordTypes: {}".format(", ".join(notfound)))
+
+
+def _store_dry_run_data(ins, upd):
+    """Write the inserts and updates of a dry run to a file."""
+    inserts = [str(i) for i in ins]
+    updates = [str(i) for i in upd]
+    with open("dry.yml", "w") as f:
+        f.write(yaml.dump({
+            "insert": inserts,
+            "update": updates}))
+
+
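A quick sketch of what _fix_file_paths does to a crawled File entity; the paths are made up and
the private helper is imported directly, just as the unit tests below do with
_treat_deprecated_prefix:

# Sketch: strip the crawled filesystem prefix and prepend the path used on the server.
import caosdb as db
from caoscrawler.crawl import _fix_file_paths

f = db.File(path="/tmp/data/experiment/readme.md")
_fix_file_paths([f], add_prefix="/data", remove_prefix="/tmp/data")
assert f.path == "/data/experiment/readme.md"
assert f.file is None  # files are still inserted via loadFiles, see the TODO above
# A path that does not start with remove_prefix raises the RuntimeError whose message the
# integration test above checks for in the crawler log.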
 def crawler_main(crawled_directory_path: str,
                  cfood_file_name: str,
                  identifiables_definition_file: Optional[str] = None,
@@ -996,7 +1144,7 @@ def crawler_main(crawled_directory_path: str,
     identifiables_definition_file : str
         filename of an identifiable definition yaml file
     debug : bool
-        whether or not to run in debug mode
+        DEPRECATED, whether or not to run in debug mode
     provenance_file : str
         provenance information will be stored in a file with given filename
     dry_run : bool
@@ -1020,82 +1168,55 @@ def crawler_main(crawled_directory_path: str,
     return_value : int
         0 if successful
     """
-    crawler = Crawler(securityMode=securityMode)
     try:
+        crawler = Crawler(securityMode=securityMode)
+
+        # setup logging and reporting if serverside execution
+        if "SHARED_DIR" in os.environ:
+            userlog_public, htmluserlog_public, debuglog_public = configure_server_side_logging()
+            _create_status_record(
+                get_config_setting("public_host_url") + "/Shared/" + htmluserlog_public,
+                crawler.run_id)
         debug_tree = DebugTree()
         crawled_data = scan_directory(
             crawled_directory_path, cfood_file_name, restricted_path, debug_tree=debug_tree)
+        _fix_file_paths(crawled_data, add_prefix, remove_prefix)
+        _check_record_types(crawled_data)
+
+        if provenance_file is not None and debug:
+            crawler.save_debug_data(debug_tree=debug_tree, filename=provenance_file)
+
+        if identifiables_definition_file is not None:
+            ident = CaosDBIdentifiableAdapter()
+            ident.load_from_yaml_definition(identifiables_definition_file)
+            crawler.identifiableAdapter = ident
+
+        remove_prefix = _treat_deprecated_prefix(prefix, remove_prefix)
+
+        if dry_run:
+            inserts, updates = crawler.synchronize(commit_changes=False, crawled_data=crawled_data)
+            _store_dry_run_data(inserts, updates)
+        else:
+            inserts, updates = crawler.synchronize(commit_changes=True, unique_names=unique_names,
+                                                   crawled_data=crawled_data)
+            if "SHARED_DIR" in os.environ:
+                _notify_about_inserts_and_updates(len(inserts), len(updates), userlog_public,
+                                                  crawler.run_id)
+            _update_status_record(crawler.run_id, len(inserts), len(updates), status="OK")
+        return 0
     except ConverterValidationError as err:
         logger.error(err)
+        _update_status_record(crawler.run_id, 0, 0, status="FAILED")
         return 1
-
-    if provenance_file is not None and debug:
-        crawler.save_debug_data(debug_tree=debug_tree, filename=provenance_file)
+    except Exception as err:
+        logger.debug(err)
 
-    if identifiables_definition_file is not None:
-        ident = CaosDBIdentifiableAdapter()
-        ident.load_from_yaml_definition(identifiables_definition_file)
-        crawler.identifiableAdapter = ident
-
-    if prefix != "":
-        warnings.warn(DeprecationWarning("The prefix argument is deprecated and will be removed "
-                                         "in the future. Please use `remove_prefix` instead."))
-        if remove_prefix is not None:
-            raise ValueError("Please do not supply the (deprecated) `prefix` and the "
-                             "`remove_prefix` argument at the same time. Only use "
-                             "`remove_prefix` instead.")
-        remove_prefix = prefix
-
-    if dry_run:
-        ins, upd = crawler.synchronize(commit_changes=False, crawled_data=crawled_data)
-        inserts = [str(i) for i in ins]
-        updates = [str(i) for i in upd]
-        with open("dry.yml", "w") as f:
-            f.write(yaml.dump({
-                "insert": inserts,
-                "update": updates}))
-    else:
-        rtsfinder = dict()
-        for elem in crawled_data:
-            if isinstance(elem, db.File):
-                # correct the file path:
-                # elem.file = os.path.join(args.path, elem.file)
-                if remove_prefix:
-                    if elem.path.startswith(remove_prefix):
-                        elem.path = elem.path[len(remove_prefix):]
-                    else:
-                        raise RuntimeError("Prefix shall be removed from file path but the path "
-                                           "does not start with the prefix:"
-                                           f"\n{remove_prefix}\n{elem.path}")
-                if add_prefix:
-                    elem.path = add_prefix + elem.path
-                elem.file = None
-                # TODO: as long as the new file backend is not finished
-                # we are using the loadFiles function to insert symlinks.
-                # Therefore, I am setting the files to None here.
-                # Otherwise, the symlinks in the database would be replaced
-                # by uploads of the files which we currently do not want to happen.
-
-                # Check whether all needed RecordTypes exist:
-                if len(elem.parents) > 0:
-                    for parent in elem.parents:
-                        if parent.name in rtsfinder:
-                            continue
-
-                        rt = db.RecordType(name=parent.name)
-                        try:
-                            rt.retrieve()
-                            rtsfinder[parent.name] = True
-                        except db.TransactionError:
-                            rtsfinder[parent.name] = False
-        notfound = [k for k, v in rtsfinder.items() if not v]
-        if len(notfound) > 0:
-            raise RuntimeError("Missing RecordTypes: {}".
-                               format(", ".join(notfound)))
-
-        crawler.synchronize(commit_changes=True, unique_names=unique_names,
-                            crawled_data=crawled_data)
-    return 0
+
+        if "SHARED_DIR" in os.environ:
+            domain = get_config_setting("public_host_url")
+            logger.error("Unexpected Error: Please tell your administrator about this and provide the"
+                         f" following path.\n{domain}/Shared/" + debuglog_public)
+        _update_status_record(crawler.run_id, 0, 0, status="FAILED")
+        return 1
 
 
 def parse_args():
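With this change, crawler_main reports problems through the log and its return value instead of
letting exceptions escape. A sketch of a typical invocation (directory and file names are
placeholders, mirroring the integration test above):

# Sketch: run the crawler and check the return code.
from caoscrawler.crawl import crawler_main

ret = crawler_main(
    crawled_directory_path="extroot/use_case_simple_presentation",
    cfood_file_name="cfood.yml",
    identifiables_definition_file="identifiables.yml",
    dry_run=False,
    remove_prefix="/extroot",
)
if ret != 0:
    print("Crawler run failed; check the log or the CrawlerRun Record if enabled.")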
diff --git a/src/caoscrawler/logging.py b/src/caoscrawler/logging.py
new file mode 100644
index 0000000000000000000000000000000000000000..69ec1fabb97e1d236162552540a35815e25a33fb
--- /dev/null
+++ b/src/caoscrawler/logging.py
@@ -0,0 +1,73 @@
+#!/usr/bin/env python3
+# encoding: utf-8
+#
+# This file is a part of the CaosDB Project.
+#
+# Copyright (C) 2023 Henrik tom Wörden <h.tomwoerden@indiscale.com>
+# Copyright (C) 2023 IndiScale GmbH <info@indiscale.com>
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <https://www.gnu.org/licenses/>.
+
+import logging
+
+from caosadvancedtools.webui_formatter import WebUI_Formatter
+from caosadvancedtools.serverside.helper import get_shared_filename
+import sys
+
+
+def configure_server_side_logging():
+    """
+    Set up logging to save one plain debugging log file, one plain info log
+    file (for users) and a stdout stream with messages wrapped in html elements.
+
+    Returns the public names of the user log, the HTML user log and the debug log file.
+    """
+    adv_logger = logging.getLogger("caosadvancedtools")
+    adv_logger.setLevel(level=logging.DEBUG)
+
+    cr_logger = logging.getLogger("caoscrawler")
+    cr_logger.setLevel(level=logging.DEBUG)
+
+    userlog_public, userlog_internal = get_shared_filename("userlog.txt")
+
+    root_logger = logging.getLogger()
+    root_logger.setLevel(level=logging.INFO)
+
+    # this is a log file with INFO level for the user
+    user_file_handler = logging.FileHandler(filename=userlog_internal)
+    user_file_handler.setLevel(logging.INFO)
+    root_logger.addHandler(user_file_handler)
+
+    # The output shall be printed in the webui. Thus wrap it in html elements.
+    formatter = WebUI_Formatter(full_file="/Shared/{}".format(userlog_public))
+    web_handler = logging.StreamHandler(stream=sys.stdout)
+    web_handler.setFormatter(formatter)
+    web_handler.setLevel(logging.INFO)
+    root_logger.addHandler(web_handler)
+
+    # Also create an HTML version for later use.
+    htmluserlog_public, htmluserlog_internal = get_shared_filename("userlog.html")
+    formatter = WebUI_Formatter(full_file="/Shared/{}".format(userlog_public))
+    lweb_handler = logging.FileHandler(filename=htmluserlog_internal)
+    lweb_handler.setFormatter(formatter)
+    lweb_handler.setLevel(logging.INFO)
+    root_logger.addHandler(lweb_handler)
+
+    # one log file with debug level output
+    debuglog_public, debuglog_internal = get_shared_filename("debuglog.txt")
+    debug_handler = logging.FileHandler(filename=debuglog_internal)
+    debug_handler.setLevel(logging.DEBUG)
+    root_logger.addHandler(debug_handler)
+
+    return userlog_public, htmluserlog_public, debuglog_public
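configure_server_side_logging() hangs all handlers onto the root logger, so messages from the
"caoscrawler" and "caosadvancedtools" loggers end up in the user, HTML and debug logs. A minimal
sketch of how a server side script could use it (this assumes the SHARED_DIR environment provided
by the CaosDB server side scripting machinery, which get_shared_filename relies on):

# Sketch: set up server side logging and point the user to the public debug log.
import logging
from caoscrawler.logging import configure_server_side_logging

userlog_public, htmluserlog_public, debuglog_public = configure_server_side_logging()
logger = logging.getLogger("caoscrawler")
logger.info("visible in the user log, the HTML log and on stdout")
logger.debug("only written to the debug log file")
print(f"Debug log: /Shared/{debuglog_public}")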
diff --git a/src/doc/getting_started/index.rst b/src/doc/getting_started/index.rst
index 74ffa7daeff393d05605e1066a5985984c2e9751..490c705f2feb9eeedc399e8c1d91e28abcd7fd12 100644
--- a/src/doc/getting_started/index.rst
+++ b/src/doc/getting_started/index.rst
@@ -9,6 +9,7 @@ Getting Started
    Installation<INSTALL>
    prerequisites
    helloworld
+   optionalfeatures
 
 This section will help you get going! From the first installation steps to the first simple crawl.
 
diff --git a/src/doc/getting_started/optionalfeatures.rst b/src/doc/getting_started/optionalfeatures.rst
new file mode 100644
index 0000000000000000000000000000000000000000..d326d7fce6f77a0278c9f2d05a641888203a2089
--- /dev/null
+++ b/src/doc/getting_started/optionalfeatures.rst
@@ -0,0 +1,47 @@
+Optional Features
+=================
+
+Email notifications
+-------------------
+
+The crawler can send email notifications if it made some changes or if
+new data was inserted. This is (currently) only available if the crawler
+runs as a server side script of CaosDB. You need to add the following
+section to your ``.pycaosdb.ini``:
+
+.. code:: ini
+
+   [caoscrawler]
+   send_crawler_notifications=True
+   public_host_url=https://example.eu
+   sendmail_to_address=someone@example.de
+   sendmail_from_address=caosdb-no-reply@example.eu
+
+This feature uses the ``sendmail`` functionality of
+``caosadvancedtools``. Thus, it uses the setting
+
+.. code:: ini
+
+   [Misc]
+   sendmail = /usr/sbin/sendmail
+   #sendmail = /usr/local/bin/sendmail_to_file
+
+to decide what tool is used for sending mails (use the upper one if you
+want to actually send mails). See the ``sendmail`` configuration in the
+LinkAhead docs.
+
+Crawler Status Records
+----------------------
+
+The crawler can insert and update Records that contain essential
+information about the data integration process. This is (currently) only
+available if the crawler runs as a server side script of CaosDB. To enable
+this, add the following to your ``.pycaosdb.ini``:
+
+.. code:: ini
+
+   [caoscrawler]
+   create_crawler_status_records=True
+
+You also need to add the data model needed for this as described by
+``crawler_run_model.yml``.
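Once the status records are enabled and the model from ``crawler_run_model.yml`` is present,
curators can inspect crawler runs with an ordinary query. A minimal sketch (property names follow
the model above):

# Sketch: list crawler runs that are still marked as RUNNING.
import caosdb as db

runs = db.execute_query("FIND RECORD CrawlerRun WITH status='RUNNING'")
for run in runs:
    print(run.get_property("run_id").value, run.get_property("started").value)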
diff --git a/unittests/test_tool.py b/unittests/test_tool.py
index 08b3a0e4f9623e996540746ac408801090b97aa3..b88720f4da89dfa735e782a4d2e41ccc3b0f4d3c 100755
--- a/unittests/test_tool.py
+++ b/unittests/test_tool.py
@@ -29,7 +29,8 @@ import logging
 
 from caoscrawler.stores import GeneralStore, RecordStore
 import os
-from caoscrawler.crawl import Crawler, SecurityMode, split_restricted_path
+from caoscrawler.crawl import (_treat_deprecated_prefix, Crawler, crawler_main,
+                               SecurityMode, split_restricted_path)
 from caoscrawler.identifiable import Identifiable
 from caoscrawler.structure_elements import File, DictTextElement, DictListElement, DictElement
 from caoscrawler.scanner import scan_directory
@@ -39,7 +40,6 @@ from simulated_server_data import full_data
 from functools import partial
 from copy import deepcopy
 from unittest.mock import patch
-from caoscrawler.crawl import crawler_main
 import caosdb.common.models as dbmodels
 from unittest.mock import MagicMock, Mock
 from os.path import join, dirname, basename
@@ -994,9 +994,13 @@ def test_deprecated_prefix_option():
     with pytest.deprecated_call():
         crawler_main("./", rfp("scifolder_cfood.yml"), prefix="to/be/removed")
 
+    # Check that crawler main terminates with an error
+    assert 1 == crawler_main("./", rfp("scifolder_cfood.yml"), prefix="to/be/removed",
+                             remove_prefix="to/be/removed")
+
     with raises(ValueError) as ve:
-        crawler_main("./", rfp("scifolder_cfood.yml"), prefix="to/be/removed",
-                     remove_prefix="to/be/removed")
+
+        _treat_deprecated_prefix(prefix="to/be/removed", remove_prefix="to/be/removed")
 
     assert "(deprecated) `prefix` and the `remove_prefix`" in str(ve.value)