diff --git a/src/caoscrawler/converters/converters.py b/src/caoscrawler/converters/converters.py
index d06415f78df2949dfee5a7a352b631e4a0b0264f..da752f685ab3106f8dc8bc1226f1c7c3143ec4f1 100644
--- a/src/caoscrawler/converters/converters.py
+++ b/src/caoscrawler/converters/converters.py
@@ -769,6 +769,11 @@ class DirectoryConverter(Converter):
         m = re.match(self.definition["match"], element.name)
         if m is None:
             return None
+        if "match_newer_than_file" in self.definition:
+            last_modified = self._get_most_recent_change_in_dir(element)
+            reference = self._get_reference_file_timestamp()
+            if last_modified < reference:
+                return None
         return m.groupdict()
 
     @staticmethod
@@ -791,6 +796,61 @@ class DirectoryConverter(Converter):
 
         return children
 
+    @staticmethod
+    def _get_most_recent_change_in_dir(element: Directory) -> datetime.datetime:
+        """Return the datetime of the most recent change of any file
+        or directory in the given Directory element.
+
+        The result is a naive datetime in local time, as returned by
+        ``datetime.datetime.fromtimestamp``.
+
+        """
+        most_recent = os.path.getmtime(element.path)
+
+        for root, _, files in os.walk(element.path):
+            mtimes = [os.path.getmtime(root)] + \
+                [os.path.getmtime(os.path.join(root, fname)) for fname in files]
+            if max(mtimes) > most_recent:
+                most_recent = max(mtimes)
+
+        return datetime.datetime.fromtimestamp(most_recent)
+
+    def _get_reference_file_timestamp(self) -> datetime.datetime:
+        """Return a time stamp read from a reference file if it
+        exists. Otherwise return datetime.datetime.min, i.e., the
+        earliest datetime known to datetime.
+
+        The returned datetime is always naive (local time), so that it
+        can be compared to the result of
+        ``_get_most_recent_change_in_dir``. A time stamp with a UTC
+        offset is converted to local time first.
+
+        """
+
+        if "match_newer_than_file" not in self.definition:
+            logger.debug("No reference file specified.")
+            return datetime.datetime.min
+
+        elif not os.path.isfile(self.definition["match_newer_than_file"]):
+            logger.debug("Reference file doesn't exist.")
+            return datetime.datetime.min
+
+        with open(self.definition["match_newer_than_file"]) as ref_file:
+            stamp_str = ref_file.readline().strip()
+        try:
+            stamp = datetime.datetime.fromisoformat(stamp_str)
+        except ValueError:
+            logger.warning(
+                f"Reference file in {self.definition['match_newer_than_file']} "
+                "doesn't contain a ISO formatted datetime in its first line. "
+                "Match regardless of modification times."
+            )
+            return datetime.datetime.min
+        if stamp.tzinfo is not None:
+            # Comparing naive and aware datetimes raises a TypeError.
+            stamp = stamp.astimezone().replace(tzinfo=None)
+        return stamp
+
 
 class SimpleFileConverter(Converter):
     """Just a file, ignore the contents."""
diff --git a/unittests/test_converters.py b/unittests/test_converters.py
index 12285e463cdcab12f853931abc5f314ed6b20782..7b22aa84646729caea171e14666ea1dd843b5ed6 100644
--- a/unittests/test_converters.py
+++ b/unittests/test_converters.py
@@ -29,12 +29,15 @@ import importlib
 import json
 import logging
 import os
+import pytest
+import yaml
+
 from itertools import product
 from pathlib import Path
+from tempfile import NamedTemporaryFile
 
 import linkahead as db
-import pytest
-import yaml
+
 from caoscrawler.converters import (Converter, ConverterValidationError,
                                     DateElementConverter, DictElementConverter,
                                     DictIntegerElementConverter,
@@ -1070,3 +1073,65 @@ def test_dict_match_properties(converter_registry):
         "prop_d": 24  # duplicate matches
     })
     records = scan_structure_elements(root_dict_element, def_dict, converter_registry)
+
+
+def test_directory_converter_change_date(caplog, converter_registry):
+    """Test that only directories that were modified after a certain
+    date are crawled.
+
+    """
+    test_dir_element = Directory("test_directories", UNITTESTDIR / "test_directories")
+    date_of_dir_change = DirectoryConverter._get_most_recent_change_in_dir(test_dir_element)
+    past_date = date_of_dir_change - datetime.timedelta(days=1)
+    future_date = date_of_dir_change + datetime.timedelta(days=1)
+
+    tmpfi = NamedTemporaryFile(delete=False)
+    # Only the name is needed; close the handle so it doesn't leak.
+    tmpfi.close()
+
+    # Write down past
+    with open(tmpfi.name, "w") as fi:
+        fi.write(f"{past_date.isoformat()}\n")
+
+    converter_def = {
+        "type": "Directory",
+        "match": "^test_directories$",
+        "match_newer_than_file": tmpfi.name
+    }
+    dc = DirectoryConverter(name="DC1", definition=converter_def,
+                            converter_registry=converter_registry)
+    assert dc.match(test_dir_element) is not None
+
+    # Write down future, so nothing should match
+    with open(tmpfi.name, "w") as fi:
+        fi.write(f"{future_date.isoformat()}\n")
+    assert dc.match(test_dir_element) is None
+
+    # Also match in the corner case of equality:
+    with open(tmpfi.name, "w") as fi:
+        fi.write(f"{date_of_dir_change.isoformat()}\n")
+    assert dc.match(test_dir_element) is not None
+
+    # A time stamp with UTC offset must not break the comparison.
+    with open(tmpfi.name, "w") as fi:
+        fi.write(f"{past_date.astimezone().isoformat()}\n")
+    assert dc.match(test_dir_element) is not None
+
+    # Match but warn
+    with open(tmpfi.name, "w") as fi:
+        fi.write("This is garbage.\n")
+    assert dc.match(test_dir_element) is not None
+    assert len(caplog.record_tuples) == 1
+    assert caplog.record_tuples[0][1] == logging.WARNING
+    assert tmpfi.name in caplog.record_tuples[0][2]
+    assert "doesn't contain a ISO formatted datetime in its first line" in caplog.record_tuples[0][2]
+
+    # Match anything since file doesn't exist, inform in debug log.
+    os.remove(tmpfi.name)
+    # Clear log and enforce debug level.
+    caplog.clear()
+    caplog.set_level(logging.DEBUG)
+    assert dc.match(test_dir_element) is not None
+    assert len(caplog.record_tuples) == 1
+    assert caplog.record_tuples[0][1] == logging.DEBUG
+    assert "Reference file doesn't exist." == caplog.record_tuples[0][2]