From 3f2c3cd8464b570a4769ddeddbbf1d414c61be8e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Henrik=20tom=20W=C3=B6rden?= <h.tomwoerden@indiscale.com>
Date: Wed, 1 Feb 2023 11:56:51 +0100
Subject: [PATCH] ENH: add date converter

---
 src/caoscrawler/converters.py | 18 ++++++++++++++++++
 unittests/test_converters.py  | 21 +++++++++++++++++----
 2 files changed, 35 insertions(+), 4 deletions(-)

diff --git a/src/caoscrawler/converters.py b/src/caoscrawler/converters.py
index d4e25f73..a2ad1df4 100644
--- a/src/caoscrawler/converters.py
+++ b/src/caoscrawler/converters.py
@@ -1101,3 +1101,21 @@ class CSVTableConverter(TableConverter):
             child_elements.append(
                 DictElement(str(index), row.to_dict()))
         return child_elements
+
+
+class DateElementConverter(TextElementConverter):
+    """
+    Convert date strings matched by the converter to Python ``datetime.date`` objects.
+
+    The text to be parsed must be contained in the "date" group. The format string can be supplied
+    under "date_format" in the Converter definition (default: "%Y-%m-%d"); it is passed to
+    ``datetime.datetime.strptime``, so see the datetime documentation for the format codes.
+    """
+
+    def match(self, element: StructureElement):
+        matches = super().match(element)
+        if matches is not None and "date" in matches:
+            matches.update({"date": datetime.datetime.strptime(
+                matches["date"],
+                self.definition.get("date_format", "%Y-%m-%d")).date()})
+        return matches
diff --git a/unittests/test_converters.py b/unittests/test_converters.py
index 5942b1e1..83ebaf54 100644
--- a/unittests/test_converters.py
+++ b/unittests/test_converters.py
@@ -28,12 +28,13 @@ import yaml
 import importlib
 import os
 from itertools import product
+import datetime
 import pytest
 import yaml
 from caoscrawler.converters import (Converter, ConverterValidationError,
                                     DictElementConverter, DirectoryConverter,
                                     DictIntegerElementConverter,
-                                    handle_value, MarkdownFileConverter,
+                                    handle_value, MarkdownFileConverter, DateElementConverter,
                                     FloatElementConverter, IntegerElementConverter,
                                     JSONFileConverter, YAMLFileConverter)
 from caoscrawler.converters import _AbstractScalarValueElementConverter
@@ -55,6 +56,9 @@ def converter_registry():
         "MarkdownFile": {
             "converter": "MarkdownFileConverter",
             "package": "caoscrawler.converters"},
+        "Date": {
+            "converter": "DateElementConverter",
+            "package": "caoscrawler.converters"},
         "DictElement": {
             "converter": "DictElementConverter",
             "package": "caoscrawler.converters"},
@@ -64,9 +68,6 @@ def converter_registry():
         "ListElement": {
             "converter": "ListElementConverter",
             "package": "caoscrawler.converters"},
-        "TextElement": {
-            "converter": "TextElementConverter",
-            "package": "caoscrawler.converters"},
         "JSONFile": {
             "converter": "JSONFileConverter",
             "package": "caoscrawler.converters"},
@@ -570,3 +571,15 @@ def test_match_debug(converter_registry, capsys):
     assert ".*" in captured.out
     # the empty result set
     assert "{}" in captured.out
+
+
+def test_date_converter(converter_registry):
+    dictconverter = DateElementConverter(
+        definition={"match_value": "(?P<date>.*)",
+                    "date_format": "%Y-%m-%d"},
+        name="conv",
+        converter_registry=converter_registry)
+    matches = dictconverter.match(TextElement("text", "2022-11-11"))
+    assert "date" in matches
+    assert isinstance(matches["date"], datetime.date)
+    assert matches["date"].year == 2022
-- 
GitLab