diff --git a/CHANGELOG.md b/CHANGELOG.md
index fb0cdeca89be0e55a2b66a6f0bf74db83c4290c1..ccf689a7aa3666c92efda515bc01138a632319d0 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -20,6 +20,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 ```
 - Support for Python 3.13
 - ROCrateConverter, ELNFileConverter and ROCrateEntityConverter for crawling ROCrate and .eln files
+- `max_log_level` parameter to `logging.configure_server_side_logging`
+  to control the server-side debuglog's verbosity, and an optional
+  `sss_max_log_level` parameter to `crawler_main` to control the SSS
+  log level separately from the global `debug` option.
 
 ### Changed ###
 
@@ -30,6 +34,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
   - if `value` starts with '+', collection mode is "list".
   - if `value` starts with '*', collection mode is "multiproperty".
   - in all other cases, collection mode is "single".
+- The default server-side scripting debug level is now controlled by
+  the global `debug` option and set to log level `INFO` in case of
+  `debug=False`. The previous behavior can be restored by calling
+  `crawler_main` with `sss_max_log_level=logging.DEBUG`.
 
 ### Deprecated ###
 
@@ -39,6 +47,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 - Added better error message for some cases of broken converter and
   record definitions.
+- [#108](https://gitlab.com/linkahead/linkahead-crawler/-/issues/108)
+  Too verbose server-side scripting logs that could lead to high disk
+  usage.
 
 ### Security ###
 
diff --git a/src/caoscrawler/crawl.py b/src/caoscrawler/crawl.py
index 89b5ba001ed446c7f5a0f261898deecd3e7a5e00..b2eb031291b93b713f7f961172d862106e6ce2ff 100644
--- a/src/caoscrawler/crawl.py
+++ b/src/caoscrawler/crawl.py
@@ -1020,6 +1020,7 @@ def crawler_main(crawled_directory_path: str,
                  restricted_path: Optional[list[str]] = None,
                  remove_prefix: Optional[str] = None,
                  add_prefix: Optional[str] = None,
+                 sss_max_log_level: Optional[int] = None,
                  ):
     """
 
@@ -1053,6 +1054,12 @@ def crawler_main(crawled_directory_path: str,
     add_prefix : Optional[str]
         Add the given prefix to file paths.
         See docstring of '_fix_file_paths' for more details.
+    sss_max_log_level : Optional[int]
+        If given, set the maximum log level of the server-side
+        scripting log separately from the general ``debug`` option. If
+        None is given, the maximum SSS log level will be determined
+        from the value of ``debug``: ``logging.INFO`` if ``debug`` is
+        False, ``logging.DEBUG`` if ``debug`` is True.
 
     Returns
     -------
@@ -1063,7 +1070,11 @@ def crawler_main(crawled_directory_path: str,
     crawler = Crawler(securityMode=securityMode)
     if "SHARED_DIR" in os.environ:
         # setup logging and reporting if serverside execution
-        userlog_public, htmluserlog_public, debuglog_public = configure_server_side_logging()
+        if sss_max_log_level is None:
+            sss_max_log_level = logging.DEBUG if debug else logging.INFO
+        userlog_public, htmluserlog_public, debuglog_public = configure_server_side_logging(
+            max_log_level=sss_max_log_level
+        )
         # TODO make this optional
         _create_status_record(
             get_shared_resource_link(get_config_setting("public_host_url"), htmluserlog_public),
diff --git a/src/caoscrawler/logging.py b/src/caoscrawler/logging.py
index 69ec1fabb97e1d236162552540a35815e25a33fb..f22f9678de768586b04e6781e1448242802406ee 100644
--- a/src/caoscrawler/logging.py
+++ b/src/caoscrawler/logging.py
@@ -26,23 +26,40 @@ from caosadvancedtools.serverside.helper import get_shared_filename
 import sys
 
 
-def configure_server_side_logging():
+def configure_server_side_logging(max_log_level: int = logging.INFO):
     """
     Set logging up to save one plain debugging log file, one plain info log
     file (for users) and a stdout stream with messages wrapped in html elements
 
     returns the path to the file with debugging output
+
+    Parameters
+    ----------
+    max_log_level : int, optional
+        The maximum log level to use for SSS logs. Default is
+        ``logging.INFO``.
+
+    Returns
+    -------
+    userlog_public, htmluserlog_public, debuglog_public: str
+        Public paths of the respective log files.
     """
     adv_logger = logging.getLogger("caosadvancedtools")
-    adv_logger.setLevel(level=logging.DEBUG)
+    # The max_<level> variables set the logger levels to the respective
+    # maximum of the intended level and max_log_level, effectively
+    # cutting off logging that is more verbose than the specified
+    # max_log_level.
+    max_info = max(logging.INFO, max_log_level)
+    max_debug = max(logging.DEBUG, max_log_level)
+    adv_logger.setLevel(level=max_debug)
     cr_logger = logging.getLogger("caoscrawler")
-    cr_logger.setLevel(level=logging.DEBUG)
+    cr_logger.setLevel(level=max_debug)
 
     userlog_public, userlog_internal = get_shared_filename("userlog.txt")
 
     root_logger = logging.getLogger()
-    root_logger.setLevel(level=logging.INFO)
+    root_logger.setLevel(level=max_info)
 
     # this is a log file with INFO level for the user
     user_file_handler = logging.FileHandler(filename=userlog_internal)
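
For illustration only, not part of the patch: a minimal sketch of how a server-side script could use the new `max_log_level` parameter, mirroring the default derivation now done in `crawler_main`. The `debug` flag below is a hypothetical stand-in for the global `debug` option, and the call is only meaningful in a server-side scripting run where `SHARED_DIR` is set, since `get_shared_filename` relies on it. Passing `sss_max_log_level=logging.DEBUG` to `crawler_main` instead restores the previous, fully verbose behavior.

```python
import logging
import os

from caoscrawler.logging import configure_server_side_logging

# Hypothetical flag; in crawler_main this is the global ``debug`` option.
debug = False

# configure_server_side_logging() needs the SSS environment (SHARED_DIR).
if "SHARED_DIR" in os.environ:
    # Same default derivation as in the patched crawler_main: full DEBUG
    # logs only on request, otherwise cap the SSS logs at INFO.
    max_level = logging.DEBUG if debug else logging.INFO
    userlog_public, htmluserlog_public, debuglog_public = configure_server_side_logging(
        max_log_level=max_level
    )
```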