class DateElementConverter(TextElementConverter):
    """
    Convert the text captured in the "date" group into a Python date object.

    The text to be parsed must be contained in the "date" group of the match.
    The format string can be supplied under "date_format" in the Converter
    definition; if that key is absent, "%Y-%m-%d" is used. The library used is
    datetime, so see its documentation for information on how to create the
    format string.
    """

    def match(self, element: StructureElement):
        """
        Match like the parent converter and turn the "date" entry into a date.

        Parameters
        ----------
        element : StructureElement
            The element that is handed to the parent converter's ``match``.

        Returns
        -------
        The result of the parent's ``match``. If that result is not None and
        contains a "date" entry, the entry is replaced by the
        ``datetime.date`` parsed from its text.

        Raises
        ------
        ValueError
            Raised by ``datetime.datetime.strptime`` when the captured text
            does not fit the format string.
        """
        matches = super().match(element)
        # Nothing to convert: either the parent did not match at all or the
        # pattern in the definition has no "date" group.
        if matches is None or "date" not in matches:
            return matches

        date_format = self.definition.get("date_format", "%Y-%m-%d")
        # strptime yields a datetime; only the calendar date is kept.
        matches["date"] = datetime.datetime.strptime(
            matches["date"], date_format).date()
        return matches
def test_date_converter(converter_registry):
    """
    Check that DateElementConverter turns the "date" group into a date object.

    ``converter_registry`` is requested as a pytest fixture so that the
    converters receive the registry itself rather than the fixture function.
    """
    # Without "date_format" in the definition the default "%Y-%m-%d" applies.
    converter = DateElementConverter(
        definition={"match_value": "(?P<date>.*)"},
        name="conv",
        converter_registry=converter_registry)
    matches = converter.match(TextElement("text", "2022-11-11"))
    assert "date" in matches
    assert isinstance(matches["date"], datetime.date)
    assert matches["date"].year == 2022

    # An explicit "date_format" with a two-digit year.
    converter = DateElementConverter(
        definition={"match_value": r"(?P<date>(\d|-)+)",
                    "date_format": "%y-%m-%d"},
        name="conv",
        converter_registry=converter_registry)
    matches = converter.match(TextElement("text", "22-11-11"))
    assert "date" in matches
    assert isinstance(matches["date"], datetime.date)
    assert matches["date"].year == 2022

    # Text that does not fit the match_value pattern yields no match at all.
    matches = converter.match(TextElement("text", "alve"))
    assert matches is None