From a78d24e58a5227a09a216eaf849af60e1173fcfd Mon Sep 17 00:00:00 2001 From: Daniel <d.hornung@indiscale.com> Date: Fri, 28 Jun 2024 12:22:30 +0200 Subject: [PATCH] ENH: `date_parse` transformer function --- src/caoscrawler/converters.py | 3 ++- src/caoscrawler/default_transformers.yml | 3 +++ src/caoscrawler/transformer_functions.py | 16 ++++++++++++++++ 3 files changed, 21 insertions(+), 1 deletion(-) diff --git a/src/caoscrawler/converters.py b/src/caoscrawler/converters.py index 59272227..40d3b72b 100644 --- a/src/caoscrawler/converters.py +++ b/src/caoscrawler/converters.py @@ -1246,6 +1246,7 @@ class DateElementConverter(TextElementConverter): """ + # TODO make `date` parameter name configurable def match(self, element: StructureElement): matches = super().match(element) if matches is not None and "date" in matches: @@ -1266,7 +1267,7 @@ https://docs.python.org/3/library/datetime.html#strftime-and-strptime-format-cod """ - # TODO make `val` configurable + # TODO make `val` parameter name configurable def match(self, element: StructureElement): matches = super().match(element) if matches is not None and "val" in matches: diff --git a/src/caoscrawler/default_transformers.yml b/src/caoscrawler/default_transformers.yml index 61639fe5..ffcb1b15 100644 --- a/src/caoscrawler/default_transformers.yml +++ b/src/caoscrawler/default_transformers.yml @@ -9,6 +9,9 @@ split: replace: package: caoscrawler.transformer_functions function: replace +date_parse: + package: caoscrawler.transformer_functions + function: date_parse datetime_parse: package: caoscrawler.transformer_functions function: datetime_parse diff --git a/src/caoscrawler/transformer_functions.py b/src/caoscrawler/transformer_functions.py index 2f3b5234..26f9de65 100644 --- a/src/caoscrawler/transformer_functions.py +++ b/src/caoscrawler/transformer_functions.py @@ -68,6 +68,22 @@ def replace(in_value: Any, in_parameters: dict): return in_value.replace(in_parameters['remove'], in_parameters['insert']) +def 
date_parse(in_value: str, params: dict): + """Transform text so that it is formatted in a way that LinkAhead can understand it. + +Parameters +========== + +- date_format: str, optional + A format string using the ``datetime`` specification: + https://docs.python.org/3/library/datetime.html#strftime-and-strptime-format-codes + """ + fmt_default = "%Y-%m-%d" + fmt = params.get("date_format", fmt_default) + dt_str = datetime.datetime.strptime(in_value, fmt).strftime(fmt_default) + return dt_str + + def datetime_parse(in_value: str, params: dict): """Transform text so that it is formatted in a way that LinkAhead can understand it. -- GitLab