Skip to content
GitLab
Explore
Sign in
Register
Primary navigation
Search or go to…
Project
CaosDB Crawler
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Iterations
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Locked files
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Package registry
Container registry
Model registry
Operate
Environments
Terraform modules
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Code review analytics
Issue analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
GitLab community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
caosdb
Software
CaosDB Crawler
Commits
cd98761d
Commit
cd98761d
authored
2 years ago
by
florian
Browse files
Options
Downloads
Patches
Plain Diff
ENH: Implement filter function for min and max
parent
7842197c
No related branches found
No related tags found
2 merge requests
!53
Release 0.1
,
!39
F children filter
Pipeline
#28719
failed
2 years ago
Stage: info
Stage: setup
Stage: cert
Stage: style
Stage: test
Changes
2
Pipelines
1
Show whitespace changes
Inline
Side-by-side
Showing
2 changed files
src/caoscrawler/converters.py
+44
-4
44 additions, 4 deletions
src/caoscrawler/converters.py
unittests/test_converters.py
+50
-14
50 additions, 14 deletions
unittests/test_converters.py
with
94 additions
and
18 deletions
src/caoscrawler/converters.py
+
44
−
4
View file @
cd98761d
...
@@ -35,7 +35,7 @@ from .structure_elements import (StructureElement, Directory, File, Dict, JSONFi
...
@@ -35,7 +35,7 @@ from .structure_elements import (StructureElement, Directory, File, Dict, JSONFi
DictIntegerElement
,
DictBooleanElement
,
DictIntegerElement
,
DictBooleanElement
,
DictFloatElement
,
DictDictElement
,
DictFloatElement
,
DictDictElement
,
TextElement
,
DictTextElement
,
DictElement
,
DictListElement
)
TextElement
,
DictTextElement
,
DictElement
,
DictListElement
)
from
typing
import
Dict
as
Dict_t
,
List
,
Optional
,
Union
from
typing
import
Dict
as
Dict_t
,
List
,
Optional
,
Tuple
,
Union
from
abc
import
abstractmethod
from
abc
import
abstractmethod
from
string
import
Template
from
string
import
Template
import
yaml_header_tools
import
yaml_header_tools
...
@@ -50,6 +50,23 @@ SPECIAL_PROPERTIES = ("description", "name", "id", "path",
...
@@ -50,6 +50,23 @@ SPECIAL_PROPERTIES = ("description", "name", "id", "path",
"
file
"
,
"
checksum
"
,
"
size
"
)
"
file
"
,
"
checksum
"
,
"
size
"
)
def _only_max(children_with_keys):
    """Keep only the child whose key is the largest.

    Parameters
    ----------
    children_with_keys : list of (child, key) tuples
        Candidates to choose from; the keys must be mutually comparable.

    Returns
    -------
    list
        A one-element list holding the child with the largest key, or an
        empty list if there are no candidates.  If several candidates share
        the largest key, the first one encountered is kept.
    """
    # max() raises ValueError on an empty sequence; no candidates simply
    # means that nothing is selected.
    if not children_with_keys:
        return []
    child, _ = max(children_with_keys, key=lambda x: x[1])
    return [child]


def _only_min(children_with_keys):
    """Keep only the child whose key is the smallest.

    Parameters
    ----------
    children_with_keys : list of (child, key) tuples
        Candidates to choose from; the keys must be mutually comparable.

    Returns
    -------
    list
        A one-element list holding the child with the smallest key, or an
        empty list if there are no candidates.  If several candidates share
        the smallest key, the first one encountered is kept.
    """
    # min() raises ValueError on an empty sequence; no candidates simply
    # means that nothing is selected.
    if not children_with_keys:
        return []
    child, _ = min(children_with_keys, key=lambda x: x[1])
    return [child]


# names of functions that can be used to filter children
FILTER_FUNCTIONS = {
    "only_max": _only_max,
    "only_min": _only_min,
}
def
str_to_bool
(
x
):
def
str_to_bool
(
x
):
if
str
(
x
).
lower
()
==
"
true
"
:
if
str
(
x
).
lower
()
==
"
true
"
:
return
True
return
True
...
@@ -315,10 +332,25 @@ class Converter(object):
...
@@ -315,10 +332,25 @@ class Converter(object):
records
,
records
,
self
.
definition
[
"
records
"
])
self
.
definition
[
"
records
"
])
def filter_children(self, children_with_strings: List[Tuple["StructureElement", str]],
                    expr: str, group: str, rule: str):
    """Filter children according to regexp `expr` and `rule`.

    Each child's string is matched against `expr` (anchored at the start of
    the string, as done by `re.match`).  Children that match are candidates
    for the filter rule, keyed by the text captured in the named group
    `group`; children that do not match are always kept.

    Parameters
    ----------
    children_with_strings : list of (child, str) tuples
        The children together with the string that `expr` is matched against.
    expr : str
        Regular expression containing a named group called `group`.
    group : str
        Name of the regexp group whose captured text is used as the key.
    rule : str
        Key into FILTER_FUNCTIONS selecting the filter to apply.

    Returns
    -------
    list
        The children selected by the rule, followed by all children whose
        string did not match `expr`.

    Raises
    ------
    KeyError
        If `group` is not a named group of `expr`, or if `rule` is not a key
        of FILTER_FUNCTIONS (the latter is only checked when at least one
        child matched).
    """
    pattern = re.compile(expr)  # compile once instead of once per child
    to_be_filtered = []
    unmatched_children = []

    for (child, name) in children_with_strings:
        m = pattern.match(name)
        if m is None:
            unmatched_children.append(child)
        else:
            to_be_filtered.append((child, m.groupdict()[group]))

    # Nothing matched the expression: there is nothing to choose from, and the
    # min/max based rules would fail on an empty candidate list.
    if not to_be_filtered:
        return unmatched_children

    filtered_children = FILTER_FUNCTIONS[rule](to_be_filtered)
    return filtered_children + unmatched_children
@abstractmethod
@abstractmethod
def
typecheck
(
self
,
element
:
StructureElement
):
def
typecheck
(
self
,
element
:
StructureElement
):
...
@@ -344,7 +376,15 @@ class DirectoryConverter(Converter):
...
@@ -344,7 +376,15 @@ class DirectoryConverter(Converter):
raise
RuntimeError
(
raise
RuntimeError
(
"
Directory converters can only create children from directories.
"
)
"
Directory converters can only create children from directories.
"
)
return
self
.
create_children_from_directory
(
element
)
children
=
self
.
create_children_from_directory
(
element
)
if
"
filter
"
in
self
.
definition
:
tuple_list
=
[(
c
,
c
.
name
)
for
c
in
children
]
return
self
.
filter_children
(
tuple_list
,
**
self
.
definition
[
"
filter
"
])
return
children
def
typecheck
(
self
,
element
:
StructureElement
):
def
typecheck
(
self
,
element
:
StructureElement
):
return
isinstance
(
element
,
Directory
)
return
isinstance
(
element
,
Directory
)
...
...
This diff is collapsed.
Click to expand it.
unittests/test_converters.py
+
50
−
14
View file @
cd98761d
...
@@ -26,16 +26,14 @@ test the converters module
...
@@ -26,16 +26,14 @@ test the converters module
from
caoscrawler.converters
import
Converter
from
caoscrawler.converters
import
Converter
from
caoscrawler.stores
import
GeneralStore
from
caoscrawler.stores
import
GeneralStore
from
caoscrawler.converters
import
(
ConverterValidationError
,
from
caoscrawler.converters
import
(
ConverterValidationError
,
DictConverter
,
MarkdownFileConverter
,
JSONFileConverter
,
DirectoryConverter
,
handle_value
,
DictConverter
)
MarkdownFileConverter
,
JSONFileConverter
)
from
caoscrawler.structure_elements
import
Directory
from
caoscrawler.structure_elements
import
(
File
,
DictTextElement
,
from
caoscrawler.structure_elements
import
(
File
,
DictTextElement
,
DictListElement
,
DictElement
,
DictListElement
,
DictElement
,
DictBooleanElement
,
DictDictElement
,
DictBooleanElement
,
DictDictElement
,
DictIntegerElement
,
DictFloatElement
)
DictIntegerElement
,
DictFloatElement
,
Directory
)
from
caoscrawler.converters
import
handle_value
from
test_tool
import
rfp
from
test_tool
import
rfp
...
@@ -279,17 +277,55 @@ def test_filter_children_of_directory(converter_registry):
...
@@ -279,17 +277,55 @@ def test_filter_children_of_directory(converter_registry):
correctly.
correctly.
"""
"""
test_dir
=
Directory
(
"
examples_filter_children
"
,
rfp
(
"
test_directories
"
,
"
examples_filter_children
"
))
test_dir
=
Directory
(
"
examples_filter_children
"
,
rfp
(
"
test_directories
"
,
"
examples_filter_children
"
))
dc
=
DirectoryConverter
(
dc
=
DirectoryConverter
(
defnition
=
{
def
i
nition
=
{
"
match
"
:
"
(.*)
"
,
"
match
"
:
"
(.*)
"
,
"
filter
"
:
{
"
filter
"
:
{
"
expr
"
:
""
,
"
expr
"
:
"
test_(?P<date>[0-9]{4,4}-[0-9]{2,2}-[0-9]{2,2}).json
"
,
"
group
"
:
"
date
"
,
"
group
"
:
"
date
"
,
"
filter
"
:
"
only_max
"
"
rule
"
:
"
only_max
"
}
},
},
name
=
"
TestOnlyMaxDirectoryConverter
"
,
name
=
"
TestOnlyMaxDirectoryConverter
"
,
converter_registry
=
converter_registry
converter_registry
=
converter_registry
)
m
=
dc
.
match
(
test_dir
)
assert
m
is
not
None
# This should only contain the youngest json and the csv that doesn't match
# the above filter expression.
children
=
dc
.
create_children
(
None
,
test_dir
)
assert
len
(
children
)
==
2
assert
children
[
0
].
__class__
==
File
assert
children
[
0
].
name
==
"
test_2022-02-02.json
"
assert
children
[
1
].
__class__
==
File
assert
children
[
1
].
name
==
"
some_other_file.csv
"
dc
=
DirectoryConverter
(
definition
=
{
"
match
"
:
"
(.*)
"
,
"
filter
"
:
{
"
expr
"
:
"
test_(?P<date>[0-9]{4,4}-[0-9]{2,2}-[0-9]{2,2}).json
"
,
"
group
"
:
"
date
"
,
"
rule
"
:
"
only_min
"
}
}
},
name
=
"
TestOnlyMinDirectoryConverter
"
,
converter_registry
=
converter_registry
)
)
m
=
dc
.
match
(
test_dir
)
assert
m
is
not
None
# This should only contain the oldest json and the csv that doesn't match
# the above filter expression.
children
=
dc
.
create_children
(
None
,
test_dir
)
assert
len
(
children
)
==
2
assert
children
[
0
].
__class__
==
File
assert
children
[
0
].
name
==
"
test_2022-01-01.json
"
assert
children
[
1
].
__class__
==
File
assert
children
[
1
].
name
==
"
some_other_file.csv
"
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment