Skip to content
Snippets Groups Projects
Commit cd98761d authored by florian's avatar florian
Browse files

ENH: Implement filter function for min and max

parent 7842197c
No related branches found
No related tags found
2 merge requests!53Release 0.1,!39F children filter
Pipeline #28719 failed
...@@ -35,7 +35,7 @@ from .structure_elements import (StructureElement, Directory, File, Dict, JSONFi ...@@ -35,7 +35,7 @@ from .structure_elements import (StructureElement, Directory, File, Dict, JSONFi
DictIntegerElement, DictBooleanElement, DictIntegerElement, DictBooleanElement,
DictFloatElement, DictDictElement, DictFloatElement, DictDictElement,
TextElement, DictTextElement, DictElement, DictListElement) TextElement, DictTextElement, DictElement, DictListElement)
from typing import Dict as Dict_t, List, Optional, Union from typing import Dict as Dict_t, List, Optional, Tuple, Union
from abc import abstractmethod from abc import abstractmethod
from string import Template from string import Template
import yaml_header_tools import yaml_header_tools
...@@ -50,6 +50,23 @@ SPECIAL_PROPERTIES = ("description", "name", "id", "path", ...@@ -50,6 +50,23 @@ SPECIAL_PROPERTIES = ("description", "name", "id", "path",
"file", "checksum", "size") "file", "checksum", "size")
def _only_max(children_with_keys):
return [max(children_with_keys, key=lambda x: x[1])[0]]
def _only_min(children_with_keys):
return [min(children_with_keys, key=lambda x: x[1])[0]]
# Names of functions that can be used to filter children, keyed by the value
# given as ``rule`` in a converter's "filter" definition. Each function is
# called with a list of (child, key) tuples and returns a list of children.
FILTER_FUNCTIONS = {
"only_max": _only_max,
"only_min": _only_min,
}
def str_to_bool(x): def str_to_bool(x):
if str(x).lower() == "true": if str(x).lower() == "true":
return True return True
...@@ -315,10 +332,25 @@ class Converter(object): ...@@ -315,10 +332,25 @@ class Converter(object):
records, records,
self.definition["records"]) self.definition["records"])
def filter_children(self, children_with_strings:
                    List[Tuple[StructureElement, str]], expr: str,
                    group: str, rule: str):
    """Filter children according to regexp `expr` and `rule`.

    Children whose string matches `expr` are passed, together with the
    value of the named group `group`, to the filter function registered
    under `rule` in FILTER_FUNCTIONS. Children whose string does not match
    are never filtered out.

    Parameters
    ----------
    children_with_strings
        Pairs of a child and the string that `expr` is matched against.
    expr
        Regular expression, matched from the start of each string.
    group
        Name of the named group in `expr` whose value is used as key.
    rule
        Key into FILTER_FUNCTIONS selecting the filter function.

    Returns
    -------
    list
        The children kept by the filter function, followed by all
        children that did not match `expr`.

    Raises
    ------
    KeyError
        If `rule` is not in FILTER_FUNCTIONS, or if `expr` has no named
        group `group`.
    """
    # Look the rule up first so that an unknown rule fails before any
    # matching work is done.
    filter_function = FILTER_FUNCTIONS[rule]
    pattern = re.compile(expr)

    to_be_filtered = []
    unmatched_children = []
    for (child, name) in children_with_strings:
        m = pattern.match(name)
        if m is None:
            unmatched_children.append(child)
        else:
            to_be_filtered.append((child, m.groupdict()[group]))

    # Only call the filter function when something matched: min()/max()
    # based rules raise on an empty candidate list.
    filtered_children = filter_function(to_be_filtered) if to_be_filtered else []
    return filtered_children + unmatched_children
@abstractmethod @abstractmethod
def typecheck(self, element: StructureElement): def typecheck(self, element: StructureElement):
...@@ -344,7 +376,15 @@ class DirectoryConverter(Converter): ...@@ -344,7 +376,15 @@ class DirectoryConverter(Converter):
raise RuntimeError( raise RuntimeError(
"Directory converters can only create children from directories.") "Directory converters can only create children from directories.")
return self.create_children_from_directory(element) children = self.create_children_from_directory(element)
if "filter" in self.definition:
tuple_list = [(c, c.name) for c in children]
return self.filter_children(tuple_list, **self.definition["filter"])
return children
def typecheck(self, element: StructureElement):
    """Return True if `element` is an instance of Directory, else False."""
    is_directory = isinstance(element, Directory)
    return is_directory
......
...@@ -26,16 +26,14 @@ test the converters module ...@@ -26,16 +26,14 @@ test the converters module
from caoscrawler.converters import Converter from caoscrawler.converters import Converter
from caoscrawler.stores import GeneralStore from caoscrawler.stores import GeneralStore
from caoscrawler.converters import (ConverterValidationError, from caoscrawler.converters import (ConverterValidationError, DictConverter,
MarkdownFileConverter, JSONFileConverter, DirectoryConverter, handle_value,
DictConverter) MarkdownFileConverter, JSONFileConverter)
from caoscrawler.structure_elements import Directory
from caoscrawler.structure_elements import (File, DictTextElement, from caoscrawler.structure_elements import (File, DictTextElement,
DictListElement, DictElement, DictListElement, DictElement,
DictBooleanElement, DictDictElement, DictBooleanElement, DictDictElement,
DictIntegerElement, DictFloatElement) DictIntegerElement,
DictFloatElement, Directory)
from caoscrawler.converters import handle_value
from test_tool import rfp from test_tool import rfp
...@@ -279,17 +277,55 @@ def test_filter_children_of_directory(converter_registry): ...@@ -279,17 +277,55 @@ def test_filter_children_of_directory(converter_registry):
correctly. correctly.
""" """
test_dir = Directory("examples_filter_children", rfp("test_directories", "examples_filter_children")) test_dir = Directory("examples_filter_children", rfp(
"test_directories", "examples_filter_children"))
dc = DirectoryConverter( dc = DirectoryConverter(
defnition={ definition={
"match": "(.*)", "match": "(.*)",
"filter": { "filter": {
"expr": "", "expr": "test_(?P<date>[0-9]{4,4}-[0-9]{2,2}-[0-9]{2,2}).json",
"group": "date", "group": "date",
"filter": "only_max" "rule": "only_max"
}
}, },
name="TestOnlyMaxDirectoryConverter", name="TestOnlyMaxDirectoryConverter",
converter_registry=converter_registry converter_registry=converter_registry
)
m = dc.match(test_dir)
assert m is not None
# This should only contain the youngest json and the csv that doesn't match
# the above filter expression.
children = dc.create_children(None, test_dir)
assert len(children) == 2
assert children[0].__class__ == File
assert children[0].name == "test_2022-02-02.json"
assert children[1].__class__ == File
assert children[1].name == "some_other_file.csv"
dc = DirectoryConverter(
definition={
"match": "(.*)",
"filter": {
"expr": "test_(?P<date>[0-9]{4,4}-[0-9]{2,2}-[0-9]{2,2}).json",
"group": "date",
"rule": "only_min"
} }
},
name="TestOnlyMinDirectoryConverter",
converter_registry=converter_registry
) )
m = dc.match(test_dir)
assert m is not None
# This should only contain the oldest json and the csv that doesn't match
# the above filter expression.
children = dc.create_children(None, test_dir)
assert len(children) == 2
assert children[0].__class__ == File
assert children[0].name == "test_2022-01-01.json"
assert children[1].__class__ == File
assert children[1].name == "some_other_file.csv"
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment