diff --git a/src/caoscrawler/converters.py b/src/caoscrawler/converters.py
index 9b759d3c4a0674f8cd647d5b86be2ece7c19d9aa..ed48c130c578734a1757eb59b6778814085a8bf4 100644
--- a/src/caoscrawler/converters.py
+++ b/src/caoscrawler/converters.py
@@ -43,6 +43,8 @@ from string import Template
 import yaml_header_tools
 
 import pandas as pd
+import logging
+
 import yaml
 
 
@@ -51,6 +53,8 @@ import yaml
 
 SPECIAL_PROPERTIES = ("description", "name", "id", "path",
                       "file", "checksum", "size")
 
+logger = logging.getLogger(__name__)
+
 
 def _only_max(children_with_keys):
@@ -364,7 +368,8 @@ class Converter(object, metaclass=ABCMeta):
 
         if rule not in FILTER_FUNCTIONS:
             raise RuntimeError(
-                f"{rule} is not a known filter rule. Only {list(FILTER_FUNCTIONS.keys())} are implemented."
+                f"{rule} is not a known filter rule. Only "
+                f"{list(FILTER_FUNCTIONS.keys())} are implemented."
             )
 
         to_be_filtered = []
@@ -391,19 +396,21 @@
         pass
 
     @staticmethod
-    def _debug_matching_template(name: str, regexp: list[str], matched: list[str], result: Optional[dict]):
+    def _debug_matching_template(name: str, regexp: list[str], matched: list[str],
+                                 result: Optional[dict]):
         """ Template for the debugging output for the match function """
-        print("\n--------", name, "-----------")
+        msg = "\n--------" + name + "-----------"
         for re, ma in zip(regexp, matched):
-            print("matching against:\n" + re)
-            print("matching:\n" + ma)
-            print("---------")
+            msg += "matching against:\n" + re
+            msg += "matching:\n" + ma
+            msg += "---------"
         if result is None:
-            print("No match")
+            msg += "No match"
         else:
-            print("Matched groups:")
-            print(result)
-            print("----------------------------------------")
+            msg += "Matched groups:"
+            msg += str(result)
+            msg += "----------------------------------------"
+        logger.debug(msg)
 
     @staticmethod
     def debug_matching(kind=None):
diff --git a/src/caoscrawler/crawl.py b/src/caoscrawler/crawl.py
index 66a81c77605cd42f772ad7aa7d73c1b02c702d55..fe6c63ff7af86a76fd6649323c96b8977b9fcc2f 100644
--- a/src/caoscrawler/crawl.py
+++ b/src/caoscrawler/crawl.py
@@ -1266,7 +1266,7 @@ def crawler_main(crawled_directory_path: str,
     try:
         crawler.crawl_directory(crawled_directory_path, cfood_file_name, restricted_path)
     except ConverterValidationError as err:
-        print(err)
+        logger.error(err)
         return 1
     if provenance_file is not None and debug:
         crawler.save_debug_data(provenance_file)
diff --git a/src/caoscrawler/identifiable.py b/src/caoscrawler/identifiable.py
index b793924d26c13078485c456bbb0989891a53059f..eda113d8fc0c5fc64a620ef7540dec4004401aef 100644
--- a/src/caoscrawler/identifiable.py
+++ b/src/caoscrawler/identifiable.py
@@ -25,6 +25,9 @@ from datetime import datetime
 import json
 from hashlib import sha256
 from typing import Union
+import logging
+
+logger = logging.getLogger(__name__)
 
 
 class Identifiable():
diff --git a/src/caoscrawler/identifiable_adapters.py b/src/caoscrawler/identifiable_adapters.py
index 40c801547a85afaf32e1ab6a668bc47d98d60b66..6ddc533021f2abb87040f0f90173f9435a7423db 100644
--- a/src/caoscrawler/identifiable_adapters.py
+++ b/src/caoscrawler/identifiable_adapters.py
@@ -33,6 +33,7 @@ import caosdb as db
 import logging
 from abc import abstractmethod, ABCMeta
 from .utils import has_parent
+
 logger = logging.getLogger(__name__)
 
 
diff --git a/unittests/test_converters.py b/unittests/test_converters.py
index 9390e65c08da6b19e335424b8021a88528b93e19..f72deda18152f9d12161d740e41271f90fcb848c 100644
--- a/unittests/test_converters.py
+++ b/unittests/test_converters.py
@@ -25,6 +25,8 @@ test the converters module
 """
 import json
 import yaml
+import logging
+import sys
 import importlib
 import os
 from itertools import product
@@ -371,7 +373,6 @@ def test_filter_children_of_directory(converter_registry, capsys):
     dc = DirectoryConverter(
         definition={
             "match": "(.*)",
-            "debug_match": True,
             "filter": {
                 "expr": "test_(?P<date>[0-9]{4,4}-[0-9]{2,2}-[0-9]{2,2}).json",
                 "group": "date",
@@ -384,14 +385,6 @@
     m = dc.match(test_dir)
     assert m is not None
 
-    # checking debug output
-    captured = capsys.readouterr()
-    # the name
-    assert "examples_filter_children" in captured.out
-    # the regexp
-    assert "(.*)" in captured.out
-    # the empty result set
-    assert "{}" in captured.out
 
     # This should only contain the youngest json and the csv that doesn't match
     # the above filter expression.
@@ -541,7 +534,8 @@ def test_converter_value_match(converter_registry):
     assert m is not None
 
 
-def test_match_debug(converter_registry, capsys):
+def test_match_debug(converter_registry, caplog):
+    caplog.set_level(logging.DEBUG, logger="caoscrawler.converters")
     for m, mn, mv in product([".*", None], [".*", None], [".*", None]):
         defi = {"debug_match": True}
         if m:
@@ -563,14 +557,13 @@
         mtch = dc.match(IntegerElement(name="a", value=4))
         if not (m is None and mn is None and mv is None):
             assert mtch is not None
-            # checking debug output
-            captured = capsys.readouterr()
             # the name
-            assert "a" in captured.out
+            assert "a" in caplog.text
             # the regexp
-            assert ".*" in captured.out
+            assert ".*" in caplog.text
             # the empty result set
-            assert "{}" in captured.out
+            assert "{}" in caplog.text
+            caplog.clear()
 
 
 def test_date_converter():
@@ -596,7 +589,7 @@
     matches = dictconverter.match(TextElement("text", "alve"))
     assert matches is None
 
-    
+
 def test_load_converters():
     c = Crawler()
     converter_registry = c.load_converters({})
diff --git a/unittests/test_tool.py b/unittests/test_tool.py
index 187ec06e097a3aba1053c865eac1190654a267c0..23b35f2dc9228eeda9137945198c49c19bf5c474 100755
--- a/unittests/test_tool.py
+++ b/unittests/test_tool.py
@@ -25,6 +25,7 @@ Tests for the tool using pytest
 
 Adapted from check-sfs
 """
+import logging
 from caoscrawler.stores import GeneralStore, RecordStore
 import os
 
@@ -780,7 +781,8 @@ def crawler_mocked_for_backref_test(crawler):
     return crawler
 
 
-def test_validation_error_print(capsys):
+def test_validation_error_print(caplog):
+    caplog.set_level(logging.DEBUG, logger="caoscrawler.converters")
     # there should be no server interaction since we only test the behavior if a validation error
     # occurs during the data collection stage
     DATADIR = os.path.join(os.path.dirname(__file__), "test_data", "failing_validation")
@@ -792,8 +794,8 @@
                      None,
                      False,
                      "/use_case_simple_presentation")
-    captured = capsys.readouterr()
-    assert "Couldn't validate" in captured.out
+    assert "Couldn't validate" in caplog.text
+    caplog.clear()
 
 
 def test_split_into_inserts_and_updates_backref(crawler_mocked_for_backref_test):
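Usage note (not part of the patch): after this change, converter match debugging and validation errors go through Python's standard `logging` machinery (e.g. the `caoscrawler.converters` logger at DEBUG level) instead of being printed to stdout, so a caller has to configure logging to see them. A minimal sketch of how a script embedding the crawler might surface that output; the handler setup below is illustrative and not part of this diff:

    # Sketch: enable the crawler's new debug output on the console.
    import logging

    # basicConfig attaches a stderr handler to the root logger; the handler
    # itself has no level filter, so records that propagate to it are emitted.
    logging.basicConfig(level=logging.INFO)

    # Lower only the converters logger to DEBUG so the match-debugging
    # messages built by _debug_matching_template() become visible.
    logging.getLogger("caoscrawler.converters").setLevel(logging.DEBUG)

In the test suite the same records are captured with pytest's built-in caplog fixture (see the updated test_match_debug and test_validation_error_print), which replaces the earlier capsys-based stdout checks.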