# Post-patch code for src/caoscrawler/converters.py, recovered from the
# whitespace-collapsed diff hunks that were on this line.
# ``StructureElement`` comes from ``.structure_elements`` (see the import
# context of the first hunk); it is referenced as a forward reference below.
import re
from typing import Dict as Dict_t, List, Optional, Tuple, Union


def _only_max(children_with_keys):
    """Return a one-element list with the child that has the largest key.

    ``children_with_keys`` is a sequence of ``(child, key)`` pairs.  An empty
    sequence yields an empty list; without this guard ``max`` would raise
    ``ValueError`` whenever no child matched the filter expression.
    """
    if not children_with_keys:
        return []
    return [max(children_with_keys, key=lambda x: x[1])[0]]


def _only_min(children_with_keys):
    """Return a one-element list with the child that has the smallest key.

    ``children_with_keys`` is a sequence of ``(child, key)`` pairs.  An empty
    sequence yields an empty list; without this guard ``min`` would raise
    ``ValueError`` whenever no child matched the filter expression.
    """
    if not children_with_keys:
        return []
    return [min(children_with_keys, key=lambda x: x[1])[0]]


# names of functions that can be used to filter children
FILTER_FUNCTIONS = {
    "only_max": _only_max,
    "only_min": _only_min,
}


class Converter(object):
    # Excerpt: the patch hunk names ``Converter`` as the enclosing class; only
    # the method changed by the patch is shown here.

    def filter_children(self, children_with_strings:
                        List[Tuple["StructureElement", str]], expr: str,
                        group: str, rule: str) -> list:
        """Filter children according to regexp `expr` and `rule`.

        Each ``(child, name)`` pair whose ``name`` matches ``expr`` (using
        ``re.match`` semantics, i.e. anchored at the start) becomes a
        candidate keyed by the value of the named group ``group``.  The
        function registered under ``rule`` in ``FILTER_FUNCTIONS`` selects
        among the candidates.  Children whose name does not match are never
        filtered out.

        Returns the selected candidates followed by all unmatched children.

        Raises ``KeyError`` if ``rule`` is not a key of ``FILTER_FUNCTIONS``
        or if ``group`` is not a named group of ``expr``.
        """
        # Resolve the rule and compile the pattern once, outside the loop.
        filter_function = FILTER_FUNCTIONS[rule]
        pattern = re.compile(expr)

        to_be_filtered = []
        unmatched_children = []
        for child, name in children_with_strings:
            m = pattern.match(name)
            if m is None:
                unmatched_children.append(child)
            else:
                to_be_filtered.append((child, m.groupdict()[group]))

        # ``to_be_filtered`` may be empty (nothing matched); the filter
        # functions return [] in that case, so only unmatched children remain.
        return filter_function(to_be_filtered) + unmatched_children
DirectoryConverter(Converter): raise RuntimeError( "Directory converters can only create children from directories.") - return self.create_children_from_directory(element) + children = self.create_children_from_directory(element) + + if "filter" in self.definition: + + tuple_list = [(c, c.name) for c in children] + + return self.filter_children(tuple_list, **self.definition["filter"]) + + return children def typecheck(self, element: StructureElement): return isinstance(element, Directory) diff --git a/unittests/test_converters.py b/unittests/test_converters.py index 0be8dbe7e5782c9f3889095d085df59246e27c4e..b50ceef134dc59dbbda6cff632f133568a140ddb 100644 --- a/unittests/test_converters.py +++ b/unittests/test_converters.py @@ -26,16 +26,14 @@ test the converters module from caoscrawler.converters import Converter from caoscrawler.stores import GeneralStore -from caoscrawler.converters import (ConverterValidationError, - MarkdownFileConverter, JSONFileConverter, - DictConverter) -from caoscrawler.structure_elements import Directory +from caoscrawler.converters import (ConverterValidationError, DictConverter, + DirectoryConverter, handle_value, + MarkdownFileConverter, JSONFileConverter) from caoscrawler.structure_elements import (File, DictTextElement, DictListElement, DictElement, DictBooleanElement, DictDictElement, - DictIntegerElement, DictFloatElement) - -from caoscrawler.converters import handle_value + DictIntegerElement, + DictFloatElement, Directory) from test_tool import rfp @@ -279,17 +277,55 @@ def test_filter_children_of_directory(converter_registry): correctly. 
""" - test_dir = Directory("examples_filter_children", rfp("test_directories", "examples_filter_children")) - + test_dir = Directory("examples_filter_children", rfp( + "test_directories", "examples_filter_children")) + dc = DirectoryConverter( - defnition={ + definition={ "match": "(.*)", "filter": { - "expr": "", + "expr": "test_(?P<date>[0-9]{4,4}-[0-9]{2,2}-[0-9]{2,2}).json", "group": "date", - "filter": "only_max" - }, + "rule": "only_max" + } + }, name="TestOnlyMaxDirectoryConverter", converter_registry=converter_registry - } ) + + m = dc.match(test_dir) + assert m is not None + + # This should only contain the youngest json and the csv that doesn't match + # the above filter expression. + children = dc.create_children(None, test_dir) + assert len(children) == 2 + assert children[0].__class__ == File + assert children[0].name == "test_2022-02-02.json" + assert children[1].__class__ == File + assert children[1].name == "some_other_file.csv" + + dc = DirectoryConverter( + definition={ + "match": "(.*)", + "filter": { + "expr": "test_(?P<date>[0-9]{4,4}-[0-9]{2,2}-[0-9]{2,2}).json", + "group": "date", + "rule": "only_min" + } + }, + name="TestOnlyMinDirectoryConverter", + converter_registry=converter_registry + ) + + m = dc.match(test_dir) + assert m is not None + + # This should only contain the oldest json and the csv that doesn't match + # the above filter expression. + children = dc.create_children(None, test_dir) + assert len(children) == 2 + assert children[0].__class__ == File + assert children[0].name == "test_2022-01-01.json" + assert children[1].__class__ == File + assert children[1].name == "some_other_file.csv"