diff --git a/CHANGELOG.md b/CHANGELOG.md index fe302156cc95b80a74bf546f262246b1d5914926..354024f9be37fc102f035a5d6562b6d522aaa915 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -30,6 +30,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 `securityMode=SecurityMode.INSERT` since the functionality to authoriye pending inserts or updates doesn't support path lists yet and will raise a NotImplementedError for now. +- `match_newer_than_file` option for `DirectoryConverter`: A reference + file containing (only) an ISO-formatted datetime string can be + specified here. Directories with this option won't match if all + their contents were last modified before that datetime. ### Changed ### diff --git a/src/caoscrawler/cfood-schema.yml b/src/caoscrawler/cfood-schema.yml index c5e0eaad092c12efbceb5f55b62b3d7cf8afdccf..d2e4cea24f0f2803499116420091b36e95b2c781 100644 --- a/src/caoscrawler/cfood-schema.yml +++ b/src/caoscrawler/cfood-schema.yml @@ -88,6 +88,12 @@ cfood: match_value: description: a regexp that is matched to the value of a key-value pair type: string + match_newer_than_file: + description: | + Only relevant for Directory. A path to a file containing + an ISO-formatted datetime. Only match if the contents of the + Directory have been modified after that datetime. + type: string record_from_dict: description: Only relevant for PropertiesFromDictElement. Specify the root record which is generated from the contained dictionary. 
type: object diff --git a/src/caoscrawler/converters/converters.py b/src/caoscrawler/converters/converters.py index d06415f78df2949dfee5a7a352b631e4a0b0264f..df0d77b1184a7a9418ccd0be292a7b4c452a8e6b 100644 --- a/src/caoscrawler/converters/converters.py +++ b/src/caoscrawler/converters/converters.py @@ -769,6 +769,11 @@ class DirectoryConverter(Converter): m = re.match(self.definition["match"], element.name) if m is None: return None + if "match_newer_than_file" in self.definition: + last_modified = self._get_most_recent_change_in_dir(element) + reference = self._get_reference_file_timestamp() + if last_modified < reference: + return None return m.groupdict() @staticmethod @@ -791,6 +796,49 @@ class DirectoryConverter(Converter): return children + @staticmethod + def _get_most_recent_change_in_dir(element: Directory) -> datetime.datetime: + """Return the datetime of the most recent change of any file + or directory in the given Directory element. + + """ + most_recent = os.path.getmtime(element.path) + + for root, _, files in os.walk(element.path): + mtimes = [os.path.getmtime(root)] + \ + [os.path.getmtime(os.path.join(root, fname)) for fname in files] + if max(mtimes) > most_recent: + most_recent = max(mtimes) + + return datetime.datetime.fromtimestamp(most_recent) + + def _get_reference_file_timestamp(self) -> datetime.datetime: + """Return a time stamp read from a reference file if it + exists. Otherwise return datetime.datetime.min, i.e., the + earliest datetime known to datetime. 
+ + """ + + if "match_newer_than_file" not in self.definition: + logger.debug("No reference file specified.") + return datetime.datetime.min + + elif not os.path.isfile(self.definition["match_newer_than_file"]): + logger.debug("Reference file doesn't exist.") + return datetime.datetime.min + + with open(self.definition["match_newer_than_file"]) as ref_file: + stamp_str = ref_file.readline().strip() + try: + return datetime.datetime.fromisoformat(stamp_str) + except ValueError as e: + logger.error( + f"Reference file in {self.definition['match_newer_than_file']} " + "doesn't contain a ISO formatted datetime in its first line. " + "Match regardless of modification times." + ) + raise e + class SimpleFileConverter(Converter): """Just a file, ignore the contents.""" diff --git a/src/doc/converters/standard_converters.rst b/src/doc/converters/standard_converters.rst index f7f18794496e5e658a8abdb5676b562d5e047675..5f86abb5b324e0cc1584e42e6abb2612acc8067f 100644 --- a/src/doc/converters/standard_converters.rst +++ b/src/doc/converters/standard_converters.rst @@ -6,9 +6,17 @@ These are the standard converters that exist in a default installation. For wri Directory Converter =================== -The Directory Converter creates StructureElements for each File and Directory -inside the current Directory. You can match a regular expression against the -directory name using the 'match' key. + +The Directory Converter creates StructureElements for each File and +Directory inside the current Directory. You can match a regular +expression against the directory name using the 'match' key. + +With the optional ``match_newer_than_file`` key, a path to a file +containing only an ISO-formatted datetime string can be specified. If +this is done, a directory will only match if it contains at least one +file or directory that has been modified since that datetime. If the +file doesn't exist, the directory will be matched regardless of the +modification times; an invalid string raises a ``ValueError``.
Simple File Converter ===================== diff --git a/unittests/test_converters.py b/unittests/test_converters.py index 12285e463cdcab12f853931abc5f314ed6b20782..e4b442d91060c7ba98cb1a910156b1800f050be3 100644 --- a/unittests/test_converters.py +++ b/unittests/test_converters.py @@ -29,12 +29,15 @@ import importlib import json import logging import os +import pytest +import yaml + from itertools import product from pathlib import Path +from tempfile import NamedTemporaryFile import linkahead as db -import pytest -import yaml + from caoscrawler.converters import (Converter, ConverterValidationError, DateElementConverter, DictElementConverter, DictIntegerElementConverter, @@ -1070,3 +1073,59 @@ def test_dict_match_properties(converter_registry): "prop_d": 24 # duplicate matches }) records = scan_structure_elements(root_dict_element, def_dict, converter_registry) + + +def test_directory_converter_change_date(caplog, converter_registry): + """Test that only directories that were modified after a certain + date are crawled. 
+ + """ + test_dir_element = Directory("test_directories", UNITTESTDIR / "test_directories") + date_of_dir_change = DirectoryConverter._get_most_recent_change_in_dir(test_dir_element) + past_date = date_of_dir_change - datetime.timedelta(days=1) + future_date = date_of_dir_change + datetime.timedelta(days=1) + + tmpfi = NamedTemporaryFile(delete=False) + + # Write down past + with open(tmpfi.name, "w") as fi: + fi.write(f"{past_date.isoformat()}\n") + + converter_def = { + "type": "Directory", + "match": "^test_directories$", + "match_newer_than_file": tmpfi.name + } + dc = DirectoryConverter(name="DC1", definition=converter_def, + converter_registry=converter_registry) + assert dc.match(test_dir_element) is not None + + # Write down future, so nothing should match + with open(tmpfi.name, "w") as fi: + fi.write(f"{future_date.isoformat()}\n") + assert dc.match(test_dir_element) is None + + # Also match in the corner case of equality: + with open(tmpfi.name, "w") as fi: + fi.write(f"{date_of_dir_change.isoformat()}\n") + assert dc.match(test_dir_element) is not None + + # Match but warn + with open(tmpfi.name, "w") as fi: + fi.write(f"This is garbage.\n") + with pytest.raises(ValueError): + dc.match(test_dir_element) + assert len(caplog.record_tuples) == 1 + assert caplog.record_tuples[0][1] == logging.ERROR + assert tmpfi.name in caplog.record_tuples[0][2] + assert "doesn't contain a ISO formatted datetime in its first line" in caplog.record_tuples[0][2] + + # Match anything since file doesn't exist, inform in debug log. + os.remove(tmpfi.name) + # Clear log and enforce debug level. + caplog.clear() + caplog.set_level(logging.DEBUG) + assert dc.match(test_dir_element) is not None + assert len(caplog.record_tuples) == 1 + assert caplog.record_tuples[0][1] == logging.DEBUG + assert "Reference file doesn't exist." == caplog.record_tuples[0][2]