From 3f2c3cd8464b570a4769ddeddbbf1d414c61be8e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Henrik=20tom=20W=C3=B6rden?= <h.tomwoerden@indiscale.com>
Date: Wed, 1 Feb 2023 11:56:51 +0100
Subject: [PATCH] ENH: add date converter

---
 src/caoscrawler/converters.py | 18 ++++++++++++++++++
 unittests/test_converters.py  | 21 +++++++++++++++++----
 2 files changed, 35 insertions(+), 4 deletions(-)

diff --git a/src/caoscrawler/converters.py b/src/caoscrawler/converters.py
index d4e25f73..a2ad1df4 100644
--- a/src/caoscrawler/converters.py
+++ b/src/caoscrawler/converters.py
@@ -1101,3 +1101,21 @@ class CSVTableConverter(TableConverter):
             child_elements.append(
                 DictElement(str(index), row.to_dict()))
         return child_elements
+
+
+class DateElementConverter(TextElementConverter):
+    """
+    Converts the text matched in the "date" group into a Python ``datetime.date`` object.
+
+    The format string is read from "date_format" in the Converter definition and defaults to
+    "%Y-%m-%d" when absent. It is passed to ``datetime.datetime.strptime``, so see the datetime
+    documentation for the format codes. A text that does not fit the format raises ValueError.
+    """
+
+    def match(self, element: StructureElement):
+        matches = super().match(element)
+        if matches is not None and "date" in matches:
+            matches.update({"date": datetime.datetime.strptime(
+                matches["date"],
+                self.definition.get("date_format", "%Y-%m-%d")).date()})
+        return matches
diff --git a/unittests/test_converters.py b/unittests/test_converters.py
index 5942b1e1..83ebaf54 100644
--- a/unittests/test_converters.py
+++ b/unittests/test_converters.py
@@ -28,12 +28,13 @@ import yaml
 import importlib
 import os
 from itertools import product
+import datetime
 import pytest
 import yaml
 
 from caoscrawler.converters import (Converter, ConverterValidationError, DictElementConverter,
                                     DirectoryConverter, DictIntegerElementConverter,
-                                    handle_value, MarkdownFileConverter,
+                                    handle_value, MarkdownFileConverter, DateElementConverter,
                                     FloatElementConverter, IntegerElementConverter,
                                     JSONFileConverter, YAMLFileConverter)
 from caoscrawler.converters import _AbstractScalarValueElementConverter
@@ -55,6 +56,9 @@ def converter_registry():
         "MarkdownFile": {
             "converter": "MarkdownFileConverter",
             "package": "caoscrawler.converters"},
+        "Date": {
+            "converter": "DateElementConverter",
+            "package": "caoscrawler.converters"},
         "DictElement": {
             "converter": "DictElementConverter",
             "package": "caoscrawler.converters"},
@@ -64,9 +68,6 @@ def converter_registry():
         "ListElement": {
             "converter": "ListElementConverter",
             "package": "caoscrawler.converters"},
-        "TextElement": {
-            "converter": "TextElementConverter",
-            "package": "caoscrawler.converters"},
         "JSONFile": {
             "converter": "JSONFileConverter",
             "package": "caoscrawler.converters"},
@@ -570,3 +571,15 @@ def test_match_debug(converter_registry, capsys):
             assert ".*" in captured.out
             # the empty result set
             assert "{}" in captured.out
+
+
+def test_date_converter(converter_registry):
+    """DateElementConverter turns the "date" match group into a datetime.date."""
+    date_converter = DateElementConverter(
+        definition={"match_value": "(?P<date>.*)",
+                    "date_format": "%Y-%m-%d"},
+        name="conv", converter_registry=converter_registry)
+    matches = date_converter.match(TextElement("text", "2022-11-11"))
+    assert "date" in matches
+    assert isinstance(matches["date"], datetime.date)
+    assert matches["date"].year == 2022
-- 
GitLab