From 2a7272a20b8f6526584b8ebe5ffe97bd41207fb2 Mon Sep 17 00:00:00 2001 From: Daniel <d.hornung@indiscale.com> Date: Thu, 27 Jun 2024 11:34:12 +0200 Subject: [PATCH] ENH: Datetime converter --- src/caoscrawler/cfood-schema.yml | 1 + src/caoscrawler/converters.py | 23 ++++++++++++++++++++++- src/caoscrawler/default_converters.yml | 3 +++ unittests/test_converters.py | 2 +- 4 files changed, 27 insertions(+), 2 deletions(-) diff --git a/src/caoscrawler/cfood-schema.yml b/src/caoscrawler/cfood-schema.yml index 47af0171..340e5b9d 100644 --- a/src/caoscrawler/cfood-schema.yml +++ b/src/caoscrawler/cfood-schema.yml @@ -28,6 +28,7 @@ cfood: - Definitions - Dict - Date + - Datetime - JSONFile - YAMLFile - CSVTableConverter diff --git a/src/caoscrawler/converters.py b/src/caoscrawler/converters.py index 6c63dc62..59272227 100644 --- a/src/caoscrawler/converters.py +++ b/src/caoscrawler/converters.py @@ -1241,7 +1241,7 @@ class DateElementConverter(TextElementConverter): """allows to convert different text formats of dates to Python date objects. The text to be parsed must be contained in the "date" group. The format string can be supplied - under "dateformat" in the Converter definition. The library used is datetime so see its + under "date_format" in the Converter definition. The library used is datetime so see its documentation for information on how to create the format string. """ @@ -1254,3 +1254,24 @@ class DateElementConverter(TextElementConverter): self.definition["date_format"] if "date_format" in self.definition else "%Y-%m-%d" ).date()}) return matches + + +class DatetimeElementConverter(TextElementConverter): + """Convert text so that it is formatted in a way that LinkAhead can understand it. + +The text to be parsed must be in the ``val`` parameter. The format string can be supplied in the +``datetime_format`` node. 
This class uses the ``datetime`` module, so ``datetime_format`` must +follow this specification: +https://docs.python.org/3/library/datetime.html#strftime-and-strptime-format-codes + + """ + + # TODO make `val` configurable + def match(self, element: StructureElement): + matches = super().match(element) + if matches is not None and "val" in matches: + fmt_default = "%Y-%m-%dT%H:%M:%S" + fmt = self.definition.get("datetime_format", fmt_default) + dt_str = datetime.datetime.strptime(matches["val"], fmt).strftime(fmt_default) + matches.update({"val": dt_str}) + return matches diff --git a/src/caoscrawler/default_converters.yml b/src/caoscrawler/default_converters.yml index af2b1c76..9a5fc248 100644 --- a/src/caoscrawler/default_converters.yml +++ b/src/caoscrawler/default_converters.yml @@ -8,6 +8,9 @@ BooleanElement: Date: converter: DateElementConverter package: caoscrawler.converters +Datetime: + converter: DatetimeElementConverter + package: caoscrawler.converters Dict: converter: DictElementConverter package: caoscrawler.converters diff --git a/unittests/test_converters.py b/unittests/test_converters.py index 1d2492a7..f5125e61 100644 --- a/unittests/test_converters.py +++ b/unittests/test_converters.py @@ -633,7 +633,7 @@ def test_load_converters(): # converter classes can be loaded from their respective packages. # Please adapt, if defaults change! - assert len(converter_registry) == 23 + assert len(converter_registry) == 24 # All of them are contained in caoscrawler.converters for conv_key, conv in converter_registry.items(): -- GitLab