Skip to content
Snippets Groups Projects
Commit 02325fd4 authored by Henrik tom Wörden's avatar Henrik tom Wörden
Browse files

Merge branch 'dev' into f-unify-notifications

parents caa680ae 493d1acc
No related branches found
No related tags found
2 merge requests!217TST: Make NamedTemporaryFiles Windows-compatible,!208ENH: Allow crawler_main to operate on a list of paths
Pipeline #58840 passed
......@@ -30,6 +30,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
`securityMode=SecurityMode.INSERT` since the functionality to
authorize pending inserts or updates doesn't support path lists yet
and will raise a NotImplementedError for now.
- `match_newer_than_file` option for `DirectoryConverter`: A reference
file containing (only) an ISO-formatted datetime string can be
specified here. Directories with this option won't match if all
their contents were last modified before that datetime.
### Changed ###
......
......@@ -88,6 +88,12 @@ cfood:
match_value:
description: a regexp that is matched to the value of a key-value pair
type: string
match_newer_than_file:
description: |
Only relevant for Directory. A path to a file containing
an ISO-formatted datetime. Only match if the contents of the
Directory have been modified after that datetime.
type: string
record_from_dict:
description: Only relevant for PropertiesFromDictElement. Specify the root record which is generated from the contained dictionary.
type: object
......
......@@ -769,6 +769,11 @@ class DirectoryConverter(Converter):
m = re.match(self.definition["match"], element.name)
if m is None:
return None
if "match_newer_than_file" in self.definition:
last_modified = self._get_most_recent_change_in_dir(element)
reference = self._get_reference_file_timestamp()
if last_modified < reference:
return None
return m.groupdict()
@staticmethod
......@@ -791,6 +796,49 @@ class DirectoryConverter(Converter):
return children
@staticmethod
def _get_most_recent_change_in_dir(element: Directory) -> datetime.datetime:
"""Return the datetime of the most recent change of any file
or directory in the given Directory element.
"""
most_recent = os.path.getmtime(element.path)
for root, _, files in os.walk(element.path):
mtimes = [os.path.getmtime(root)] + \
[os.path.getmtime(os.path.join(root, fname)) for fname in files]
if max(mtimes) > most_recent:
most_recent = max(mtimes)
return datetime.datetime.fromtimestamp(most_recent)
def _get_reference_file_timestamp(self) -> datetime.datetime:
"""Return a time stamp read from a reference file if it
exists. Otherwise return datetime.datetime.min, i.e., the
earliest datetime known to datetime.
"""
if "match_newer_than_file" not in self.definition:
logger.debug("No reference file specified.")
return datetime.datetime.min
elif not os.path.isfile(self.definition["match_newer_than_file"]):
logger.debug("Reference file doesn't exist.")
return datetime.datetime.min
with open(self.definition["match_newer_than_file"]) as ref_file:
stamp_str = ref_file.readline().strip()
try:
return datetime.datetime.fromisoformat(stamp_str)
except ValueError as e:
logger.error(
f"Reference file in {self.definition['match_newer_than_file']} "
"doesn't contain a ISO formatted datetime in its first line. "
"Match regardless of modification times."
)
raise e
class SimpleFileConverter(Converter):
"""Just a file, ignore the contents."""
......
......@@ -6,9 +6,17 @@ These are the standard converters that exist in a default installation. For wri
Directory Converter
===================
The Directory Converter creates StructureElements for each File and Directory
inside the current Directory. You can match a regular expression against the
directory name using the 'match' key.
The Directory Converter creates StructureElements for each File and
Directory inside the current Directory. You can match a regular
expression against the directory name using the 'match' key.
With the optional ``match_newer_than_file`` key, a path to a file
containing only an ISO-formatted datetime string can be specified. If
this is done, a directory will only match if it contains at least one
file or directory that has been modified since that datetime. If the
file doesn't exist, the directory will be matched regardless of the
modification times. If the file exists but its first line is not a
valid ISO-formatted datetime, an error is logged and a ``ValueError``
is raised.
Simple File Converter
=====================
......
......@@ -29,12 +29,15 @@ import importlib
import json
import logging
import os
import pytest
import yaml
from itertools import product
from pathlib import Path
from tempfile import NamedTemporaryFile
import linkahead as db
import pytest
import yaml
from caoscrawler.converters import (Converter, ConverterValidationError,
DateElementConverter, DictElementConverter,
DictIntegerElementConverter,
......@@ -1070,3 +1073,59 @@ def test_dict_match_properties(converter_registry):
"prop_d": 24 # duplicate matches
})
records = scan_structure_elements(root_dict_element, def_dict, converter_registry)
def test_directory_converter_change_date(caplog, converter_registry):
    """Test that only directories that were modified after a certain
    date are crawled.

    The reference datetime is written to a temporary file which is
    rewritten for each scenario: a past date (match), a future date (no
    match), the exact modification date (match), an invalid string
    (ValueError plus one error log entry) and finally a missing file
    (match plus one debug log entry).
    """
    test_dir_element = Directory("test_directories", UNITTESTDIR / "test_directories")
    date_of_dir_change = DirectoryConverter._get_most_recent_change_in_dir(test_dir_element)
    past_date = date_of_dir_change - datetime.timedelta(days=1)
    future_date = date_of_dir_change + datetime.timedelta(days=1)

    # Only the path is needed. Close the handle right away so that no
    # descriptor is leaked and the file can be reopened and removed on
    # platforms that forbid this while a handle is open.
    with NamedTemporaryFile(delete=False) as tmpfi:
        ref_path = tmpfi.name

    # Write down past
    with open(ref_path, "w") as fi:
        fi.write(f"{past_date.isoformat()}\n")

    converter_def = {
        "type": "Directory",
        "match": "^test_directories$",
        "match_newer_than_file": ref_path
    }
    dc = DirectoryConverter(name="DC1", definition=converter_def,
                            converter_registry=converter_registry)
    assert dc.match(test_dir_element) is not None

    # Write down future, so nothing should match
    with open(ref_path, "w") as fi:
        fi.write(f"{future_date.isoformat()}\n")
    assert dc.match(test_dir_element) is None

    # Also match in the corner case of equality:
    with open(ref_path, "w") as fi:
        fi.write(f"{date_of_dir_change.isoformat()}\n")
    assert dc.match(test_dir_element) is not None

    # Invalid content: an error is logged and the ValueError propagates
    with open(ref_path, "w") as fi:
        fi.write("This is garbage.\n")
    with pytest.raises(ValueError):
        dc.match(test_dir_element)
    assert len(caplog.record_tuples) == 1
    assert caplog.record_tuples[0][1] == logging.ERROR
    assert ref_path in caplog.record_tuples[0][2]
    assert "doesn't contain a ISO formatted datetime in its first line" in caplog.record_tuples[0][2]

    # Match anything since file doesn't exist, inform in debug log.
    os.remove(ref_path)
    # Clear log and enforce debug level.
    caplog.clear()
    caplog.set_level(logging.DEBUG)
    assert dc.match(test_dir_element) is not None
    assert len(caplog.record_tuples) == 1
    assert caplog.record_tuples[0][1] == logging.DEBUG
    assert "Reference file doesn't exist." == caplog.record_tuples[0][2]
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment