diff --git a/src/caoscrawler/cfood-schema.yml b/src/caoscrawler/cfood-schema.yml index 47af0171d8ca7942942f94375ab591bf97834603..340e5b9dec0e8f05b1c39ec2511196249ec87d31 100644 --- a/src/caoscrawler/cfood-schema.yml +++ b/src/caoscrawler/cfood-schema.yml @@ -28,6 +28,7 @@ cfood: - Definitions - Dict - Date + - Datetime - JSONFile - YAMLFile - CSVTableConverter diff --git a/src/caoscrawler/converters.py b/src/caoscrawler/converters.py index 6c63dc622b6d26ff1eb1ac69f5143d7b2b61b338..592722279cdad2e0cb1ab5ff4ef42aaeceaf4987 100644 --- a/src/caoscrawler/converters.py +++ b/src/caoscrawler/converters.py @@ -1241,7 +1241,7 @@ class DateElementConverter(TextElementConverter): """allows to convert different text formats of dates to Python date objects. The text to be parsed must be contained in the "date" group. The format string can be supplied - under "dateformat" in the Converter definition. The library used is datetime so see its + under "date_format" in the Converter definition. The library used is datetime so see its documentation for information on how to create the format string. """ @@ -1254,3 +1254,24 @@ class DateElementConverter(TextElementConverter): self.definition["date_format"] if "date_format" in self.definition else "%Y-%m-%d" ).date()}) return matches + + +class DatetimeElementConverter(TextElementConverter): + """Convert text so that it is formatted in a way that LinkAhead can understand it. + +The text to be parsed must be in the ``val`` parameter. The format string can be supplied in the +``datetime_format`` node. 
This class uses the ``datetime`` module, so ``datetime_format`` must +follow this specification: +https://docs.python.org/3/library/datetime.html#strftime-and-strptime-format-codes + + """ + + # TODO make `val` configurable + def match(self, element: StructureElement): + matches = super().match(element) + if matches is not None and "val" in matches: + fmt_default = "%Y-%m-%dT%H:%M:%S" + fmt = self.definition.get("datetime_format", fmt_default) + dt_str = datetime.datetime.strptime(matches["val"], fmt).strftime(fmt_default) + matches.update({"val": dt_str}) + return matches diff --git a/src/caoscrawler/default_converters.yml b/src/caoscrawler/default_converters.yml index af2b1c764ac637c1391c89861ddba12386e6240e..9a5fc248c45a77b848611c322ed7d2a5fdbd3721 100644 --- a/src/caoscrawler/default_converters.yml +++ b/src/caoscrawler/default_converters.yml @@ -8,6 +8,9 @@ BooleanElement: Date: converter: DateElementConverter package: caoscrawler.converters +Datetime: + converter: DatetimeElementConverter + package: caoscrawler.converters Dict: converter: DictElementConverter package: caoscrawler.converters diff --git a/unittests/test_converters.py b/unittests/test_converters.py index 1d2492a7c2eb59b0533d707bad7e1cb3e51529bd..f5125e61efa49fe627480696703e570ef9b70e6f 100644 --- a/unittests/test_converters.py +++ b/unittests/test_converters.py @@ -633,7 +633,7 @@ def test_load_converters(): # converter classes can be loaded from their respective packages. # Please adapt, if defaults change! - assert len(converter_registry) == 23 + assert len(converter_registry) == 24 # All of them are contained in caoscrawler.converters for conv_key, conv in converter_registry.items():