class DateElementConverter(TextElementConverter):
    """
    Convert the text of a matched element into a Python ``datetime.date``.

    The text to be parsed must be captured by the regular expression group
    named "date". The format string can be supplied under "date_format" in the
    converter definition; if that key is absent, the ISO format "%Y-%m-%d" is
    used. Parsing is done with ``datetime.datetime.strptime``, so see the
    documentation of the ``datetime`` module for the available format codes.
    """

    # Fallback used when the definition does not provide "date_format".
    DEFAULT_DATE_FORMAT = "%Y-%m-%d"

    def match(self, element: StructureElement):
        """
        Match ``element`` like the parent class and parse the "date" group.

        Parameters
        ----------
        element : StructureElement
            The element that is handed to the parent class's ``match``.

        Returns
        -------
        The result of the parent class's ``match``. If that result is not
        ``None`` and contains the key "date", the value stored under "date" is
        replaced by the parsed ``datetime.date``; otherwise the result is
        returned unchanged.

        Raises
        ------
        ValueError
            Raised by ``strptime`` if the captured text does not fit the
            format string.
        """
        matches = super().match(element)
        # Nothing to convert: no match at all, or no "date" group captured.
        if matches is None or "date" not in matches:
            return matches

        date_format = self.definition.get("date_format", self.DEFAULT_DATE_FORMAT)
        parsed = datetime.datetime.strptime(matches["date"], date_format)
        # Only the calendar date is kept; the time part of the parse result is dropped.
        matches.update({"date": parsed.date()})
        return matches
"DateElementConverter", + "package": "caoscrawler.converters"}, "DictElement": { "converter": "DictElementConverter", "package": "caoscrawler.converters"}, @@ -64,9 +68,6 @@ def converter_registry(): "ListElement": { "converter": "ListElementConverter", "package": "caoscrawler.converters"}, - "TextElement": { - "converter": "TextElementConverter", - "package": "caoscrawler.converters"}, "JSONFile": { "converter": "JSONFileConverter", "package": "caoscrawler.converters"}, @@ -570,3 +571,15 @@ def test_match_debug(converter_registry, capsys): assert ".*" in captured.out # the empty result set assert "{}" in captured.out + + +def test_date_converter(): + dictconverter = DateElementConverter( + definition={"match_value": "(?P<date>.*)", + "date_format": "%Y-%m-%d"}, + name="conv", + converter_registry=converter_registry) + matches = dictconverter.match(TextElement("text", "2022-11-11")) + assert "date" in matches + assert isinstance(matches["date"], datetime.date) + assert matches["date"].year == 2022