diff --git a/src/caoscrawler/converters.py b/src/caoscrawler/converters.py index 592722279cdad2e0cb1ab5ff4ef42aaeceaf4987..40d3b72bfe7564cfb815e11a69a952f9142c3e55 100644 --- a/src/caoscrawler/converters.py +++ b/src/caoscrawler/converters.py @@ -1246,6 +1246,7 @@ class DateElementConverter(TextElementConverter): """ + # TODO make `date` parameter name configurable def match(self, element: StructureElement): matches = super().match(element) if matches is not None and "date" in matches: @@ -1266,7 +1267,7 @@ https://docs.python.org/3/library/datetime.html#strftime-and-strptime-format-cod """ - # TODO make `val` configurable + # TODO make `val` parameter name configurable def match(self, element: StructureElement): matches = super().match(element) if matches is not None and "val" in matches: diff --git a/src/caoscrawler/default_transformers.yml b/src/caoscrawler/default_transformers.yml index 61639fe5651c404256c3c90a3334f92374adce9a..ffcb1b15bd2bad71083cc8f0ba84172ee3daf2b0 100644 --- a/src/caoscrawler/default_transformers.yml +++ b/src/caoscrawler/default_transformers.yml @@ -9,6 +9,9 @@ split: replace: package: caoscrawler.transformer_functions function: replace +date_parse: + package: caoscrawler.transformer_functions + function: date_parse datetime_parse: package: caoscrawler.transformer_functions function: datetime_parse diff --git a/src/caoscrawler/transformer_functions.py b/src/caoscrawler/transformer_functions.py index 2f3b5234e8a59615350229071be1ce2ac2885e35..26f9de65b946046dacd0933744c2aca3362565ae 100644 --- a/src/caoscrawler/transformer_functions.py +++ b/src/caoscrawler/transformer_functions.py @@ -68,6 +68,22 @@ def replace(in_value: Any, in_parameters: dict): return in_value.replace(in_parameters['remove'], in_parameters['insert']) +def date_parse(in_value: str, params: dict): + """Transform text so that it is formatted in a way that LinkAhead can understand it. 
+ +Parameters +========== + +- date_format: str, optional + A format string using the ``datetime`` specification: + https://docs.python.org/3/library/datetime.html#strftime-and-strptime-format-codes + """ + fmt_default = "%Y-%m-%d" + fmt = params.get("date_format", fmt_default) + dt_str = datetime.datetime.strptime(in_value, fmt).strftime(fmt_default) + return dt_str + + def datetime_parse(in_value: str, params: dict): """Transform text so that it is formatted in a way that LinkAhead can understand it.