Skip to content
GitLab
Explore
Sign in
Register
Primary navigation
Search or go to…
Project
CaosDB Crawler
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Iterations
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Locked files
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Package registry
Container Registry
Model registry
Operate
Environments
Terraform modules
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Code review analytics
Issue analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
caosdb
Software
CaosDB Crawler
Commits
a72b0fc0
Commit
a72b0fc0
authored
5 months ago
by
Florian Spreckelsen
Browse files
Options
Downloads
Patches
Plain Diff
ENH: Add option to only match directories with contents newer than a reference file
parent
e5d75eb6
No related branches found
Branches containing commit
No related tags found
Tags containing commit
2 merge requests
!217
TST: Make NamedTemporaryFiles Windows-compatible
,
!206
F dir change date
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
src/caoscrawler/converters/converters.py
+48
-0
48 additions, 0 deletions
src/caoscrawler/converters/converters.py
unittests/test_converters.py
+60
-2
60 additions, 2 deletions
unittests/test_converters.py
with
108 additions
and
2 deletions
src/caoscrawler/converters/converters.py
+
48
−
0
View file @
a72b0fc0
...
...
@@ -769,6 +769,11 @@ class DirectoryConverter(Converter):
m
=
re
.
match
(
self
.
definition
[
"
match
"
],
element
.
name
)
if
m
is
None
:
return
None
if
"
match_newer_than_file
"
in
self
.
definition
:
last_modified
=
self
.
_get_most_recent_change_in_dir
(
element
)
reference
=
self
.
_get_reference_file_timestamp
()
if
last_modified
<
reference
:
return
None
return
m
.
groupdict
()
@staticmethod
...
...
@@ -791,6 +796,49 @@ class DirectoryConverter(Converter):
return
children
@staticmethod
def _get_most_recent_change_in_dir(element: Directory) -> datetime.datetime:
    """Return the datetime of the most recent change of any file
    or directory in the given Directory element.

    The tree below ``element.path`` is traversed with ``os.walk``. The
    modification time of the directory itself, of every visited
    sub-directory and of every listed file is taken into account and the
    largest value wins.

    Parameters
    ----------
    element : Directory
        The directory structure element. Only its ``path`` attribute is
        read here.

    Returns
    -------
    datetime.datetime
        The most recent modification time, converted with
        ``datetime.datetime.fromtimestamp`` (i.e., a naive datetime in
        local time).

    Raises
    ------
    OSError
        If the modification time of ``element.path`` itself cannot be
        read (e.g., the directory does not exist).
    """
    # Not guarded on purpose: if the top-level directory is unreadable,
    # there is no meaningful answer and the caller should learn about it.
    most_recent = os.path.getmtime(element.path)

    for root, _, files in os.walk(element.path):
        candidates = [root]
        candidates.extend(os.path.join(root, fname) for fname in files)
        for candidate in candidates:
            try:
                mtime = os.path.getmtime(candidate)
            except OSError:
                # os.walk lists dangling symlinks as files, and entries may
                # disappear while we are walking. Such entries carry no
                # usable change date, so skip them instead of aborting.
                continue
            if mtime > most_recent:
                most_recent = mtime

    return datetime.datetime.fromtimestamp(most_recent)
def _get_reference_file_timestamp(self) -> datetime.datetime:
    """Return a time stamp read from a reference file if it
    exists. Otherwise return datetime.datetime.min, i.e., the
    earliest datetime known to datetime.

    The path of the reference file is taken from the
    ``match_newer_than_file`` key of ``self.definition``. Only the first
    line of that file is read; it is expected to hold an ISO formatted
    datetime.

    Returns
    -------
    datetime.datetime
        A naive datetime. ``datetime.datetime.min`` is returned when no
        reference file is configured, when the configured path is not an
        existing file, or when the first line cannot be parsed. A
        time-zone-aware stamp is converted to naive local time so that it
        can be compared with naive local datetimes such as those produced
        by ``datetime.datetime.fromtimestamp``.
    """
    if "match_newer_than_file" not in self.definition:
        logger.debug("No reference file specified.")
        return datetime.datetime.min

    reference_path = self.definition["match_newer_than_file"]
    if not os.path.isfile(reference_path):
        logger.debug("Reference file doesn't exist.")
        return datetime.datetime.min

    with open(reference_path) as ref_file:
        stamp_str = ref_file.readline().strip()

    try:
        stamp = datetime.datetime.fromisoformat(stamp_str)
    except ValueError:
        # `Logger.warn` is deprecated; `warning` emits the same record.
        logger.warning(
            "Reference file in %s "
            "doesn't contain a ISO formatted datetime in its first line. "
            "Match regardless of modification times.",
            reference_path,
        )
        return datetime.datetime.min

    if stamp.utcoffset() is not None:
        # Comparing an aware with a naive datetime raises TypeError, so
        # express aware stamps in local time and drop the tzinfo.
        stamp = stamp.astimezone().replace(tzinfo=None)
    return stamp
class
SimpleFileConverter
(
Converter
):
"""
Just a file, ignore the contents.
"""
...
...
This diff is collapsed.
Click to expand it.
unittests/test_converters.py
+
60
−
2
View file @
a72b0fc0
...
...
@@ -29,12 +29,15 @@ import importlib
import
json
import
logging
import
os
import
pytest
import
yaml
from
itertools
import
product
from
pathlib
import
Path
from
tempfile
import
NamedTemporaryFile
import
linkahead
as
db
import
pytest
import
yaml
from
caoscrawler.converters
import
(
Converter
,
ConverterValidationError
,
DateElementConverter
,
DictElementConverter
,
DictIntegerElementConverter
,
...
...
@@ -1070,3 +1073,58 @@ def test_dict_match_properties(converter_registry):
"
prop_d
"
:
24
# duplicate matches
})
records
=
scan_structure_elements
(
root_dict_element
,
def_dict
,
converter_registry
)
def test_directory_converter_change_date(caplog, converter_registry):
    """Test that only directories that were modified after a certain
    date are crawled.

    The reference time stamp is written to a temporary file which is
    rewritten for each scenario: a past date (match), a future date (no
    match), the exact change date (match), an unparsable line (match plus
    warning) and finally a missing file (match plus debug message).
    """
    test_dir_element = Directory("test_directories", UNITTESTDIR / "test_directories")
    date_of_dir_change = DirectoryConverter._get_most_recent_change_in_dir(test_dir_element)
    past_date = date_of_dir_change - datetime.timedelta(days=1)
    future_date = date_of_dir_change + datetime.timedelta(days=1)

    # Only the file name is needed. Close the handle right away so that it
    # isn't leaked and so that the file can be re-opened by name on Windows.
    with NamedTemporaryFile(delete=False) as tmpfi:
        pass

    def write_reference(first_line):
        """Overwrite the reference file with the given first line."""
        with open(tmpfi.name, "w") as fi:
            fi.write(f"{first_line}\n")

    try:
        # Write down past
        write_reference(past_date.isoformat())

        converter_def = {
            "type": "Directory",
            "match": "^test_directories$",
            "match_newer_than_file": tmpfi.name
        }
        dc = DirectoryConverter(name="DC1", definition=converter_def,
                                converter_registry=converter_registry)
        assert dc.match(test_dir_element) is not None

        # Write down future, so nothing should match
        write_reference(future_date.isoformat())
        assert dc.match(test_dir_element) is None

        # Also match in the corner case of equality:
        write_reference(date_of_dir_change.isoformat())
        assert dc.match(test_dir_element) is not None

        # Match but warn
        write_reference("This is garbage.")
        assert dc.match(test_dir_element) is not None
        assert len(caplog.record_tuples) == 1
        assert caplog.record_tuples[0][1] == logging.WARNING
        assert tmpfi.name in caplog.record_tuples[0][2]
        assert "doesn't contain a ISO formatted datetime in its first line" in caplog.record_tuples[0][2]

        # Match anything since file doesn't exist, inform in debug log.
        os.remove(tmpfi.name)
        # Clear log and enforce debug level.
        caplog.clear()
        caplog.set_level(logging.DEBUG)
        assert dc.match(test_dir_element) is not None
        assert len(caplog.record_tuples) == 1
        assert caplog.record_tuples[0][1] == logging.DEBUG
        assert "Reference file doesn't exist." == caplog.record_tuples[0][2]
    finally:
        # Don't leave the temporary file behind if an assertion failed
        # before the explicit removal above.
        if os.path.exists(tmpfi.name):
            os.remove(tmpfi.name)
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment