Skip to content
Snippets Groups Projects
Commit a72b0fc0 authored by Florian Spreckelsen
Browse files

ENH: Add option to only match directories with contents newer than a reference file

parent e5d75eb6
No related branches found
No related tags found
2 merge requests: !217 "TST: Make NamedTemporaryFiles Windows-compatible", !206 "F dir change date"
......@@ -769,6 +769,11 @@ class DirectoryConverter(Converter):
m = re.match(self.definition["match"], element.name)
if m is None:
return None
if "match_newer_than_file" in self.definition:
last_modified = self._get_most_recent_change_in_dir(element)
reference = self._get_reference_file_timestamp()
if last_modified < reference:
return None
return m.groupdict()
@staticmethod
......@@ -791,6 +796,49 @@ class DirectoryConverter(Converter):
return children
@staticmethod
def _get_most_recent_change_in_dir(element: Directory) -> datetime.datetime:
"""Return the datetime of the most recent change of any file
or directory in the given Directory element.
"""
most_recent = os.path.getmtime(element.path)
for root, _, files in os.walk(element.path):
mtimes = [os.path.getmtime(root)] + \
[os.path.getmtime(os.path.join(root, fname)) for fname in files]
if max(mtimes) > most_recent:
most_recent = max(mtimes)
return datetime.datetime.fromtimestamp(most_recent)
def _get_reference_file_timestamp(self) -> datetime.datetime:
"""Return a time stamp read from a reference file if it
exists. Otherwise return datetime.datetime.min, i.e., the
earliest datetime known to datetime.
"""
if "match_newer_than_file" not in self.definition:
logger.debug("No reference file specified.")
return datetime.datetime.min
elif not os.path.isfile(self.definition["match_newer_than_file"]):
logger.debug("Reference file doesn't exist.")
return datetime.datetime.min
with open(self.definition["match_newer_than_file"]) as ref_file:
stamp_str = ref_file.readline().strip()
try:
return datetime.datetime.fromisoformat(stamp_str)
except ValueError:
logger.warn(
f"Reference file in {self.definition['match_newer_than_file']} "
"doesn't contain a ISO formatted datetime in its first line. "
"Match regardless of modification times."
)
return datetime.datetime.min
class SimpleFileConverter(Converter):
"""Just a file, ignore the contents."""
......
......@@ -29,12 +29,15 @@ import importlib
import json
import logging
import os
import pytest
import yaml
from itertools import product
from pathlib import Path
from tempfile import NamedTemporaryFile
import linkahead as db
import pytest
import yaml
from caoscrawler.converters import (Converter, ConverterValidationError,
DateElementConverter, DictElementConverter,
DictIntegerElementConverter,
......@@ -1070,3 +1073,58 @@ def test_dict_match_properties(converter_registry):
"prop_d": 24 # duplicate matches
})
records = scan_structure_elements(root_dict_element, def_dict, converter_registry)
def test_directory_converter_change_date(caplog, converter_registry):
    """Test that only directories that were modified after a certain
    date are crawled.

    The reference date is written to a temporary file that is referenced by
    the converter definition via ``match_newer_than_file``. The file content
    is rewritten several times to cover past, future, equal and invalid
    reference dates, and finally the file is removed.
    """
    test_dir_element = Directory("test_directories", UNITTESTDIR / "test_directories")
    date_of_dir_change = DirectoryConverter._get_most_recent_change_in_dir(test_dir_element)
    past_date = date_of_dir_change - datetime.timedelta(days=1)
    future_date = date_of_dir_change + datetime.timedelta(days=1)

    # Only the path is needed. Close the handle right away so that it is not
    # leaked and the file can be reopened and removed on every platform.
    with NamedTemporaryFile(delete=False) as tmpfi:
        ref_path = tmpfi.name

    try:
        # Write down past
        with open(ref_path, "w") as fi:
            fi.write(f"{past_date.isoformat()}\n")

        converter_def = {
            "type": "Directory",
            "match": "^test_directories$",
            "match_newer_than_file": ref_path
        }
        dc = DirectoryConverter(name="DC1", definition=converter_def,
                                converter_registry=converter_registry)
        assert dc.match(test_dir_element) is not None

        # Write down future, so nothing should match
        with open(ref_path, "w") as fi:
            fi.write(f"{future_date.isoformat()}\n")
        assert dc.match(test_dir_element) is None

        # Also match in the corner case of equality:
        with open(ref_path, "w") as fi:
            fi.write(f"{date_of_dir_change.isoformat()}\n")
        assert dc.match(test_dir_element) is not None

        # Match but warn
        with open(ref_path, "w") as fi:
            fi.write("This is garbage.\n")
        assert dc.match(test_dir_element) is not None
        assert len(caplog.record_tuples) == 1
        assert caplog.record_tuples[0][1] == logging.WARNING
        assert ref_path in caplog.record_tuples[0][2]
        assert ("doesn't contain a ISO formatted datetime in its first line"
                in caplog.record_tuples[0][2])

        # Match anything since file doesn't exist, inform in debug log.
        os.remove(ref_path)
        # Clear log and enforce debug level.
        caplog.clear()
        caplog.set_level(logging.DEBUG)
        assert dc.match(test_dir_element) is not None
        assert len(caplog.record_tuples) == 1
        assert caplog.record_tuples[0][1] == logging.DEBUG
        assert "Reference file doesn't exist." == caplog.record_tuples[0][2]
    finally:
        # Do not leave the temporary file behind if an assertion failed
        # before the explicit removal above.
        if os.path.exists(ref_path):
            os.remove(ref_path)
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment