diff --git a/CHANGELOG.md b/CHANGELOG.md index 30b236582e28dd6d7057dbb443b91c36e4724e88..2c9160a5aee105188873e9a444706d1726de2211 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,13 +10,24 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Added ### * Support for Python 3.12 and experimental support for 3.13 -* `spss_to_datamodel` script. -* `SPSSConverter` class * CFood macros now accept complex objects as values, not just strings. +* More options for the `CSVTableConverter` +* New converters: + * `DatetimeElementConverter` + * `SPSSConverter` +* New scripts: + * `spss_to_datamodel` + * `csv_to_datamodel` +* New transformer functions: + * `date_parse` + * `datetime_parse` ### Changed ### * CFood macros do not render everything into strings now. +* Better internal handling of identifiable/reference resolving and merging of entities. This also + includes more understandable output for users. +* Better handling of missing imports, with nice messages for users. ### Deprecated ### @@ -27,11 +38,15 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Fixed ### * [93](https://gitlab.com/linkahead/linkahead-crawler/-/issues/93) cfood.yaml does not allow umlaut in $expression +* [96](https://gitlab.com/linkahead/linkahead-crawler/-/issues/96) Do not fail silently on transaction errors ### Security ### ### Documentation ### +* General improvement of the documentation, in many small places. +* The API documentation should now also include documentation of the constructors. 
+ ## [0.7.1] - 2024-03-21 ## ### Fixed ### diff --git a/src/caoscrawler/cfood-schema.yml b/src/caoscrawler/cfood-schema.yml index 47af0171d8ca7942942f94375ab591bf97834603..340e5b9dec0e8f05b1c39ec2511196249ec87d31 100644 --- a/src/caoscrawler/cfood-schema.yml +++ b/src/caoscrawler/cfood-schema.yml @@ -28,6 +28,7 @@ cfood: - Definitions - Dict - Date + - Datetime - JSONFile - YAMLFile - CSVTableConverter diff --git a/src/caoscrawler/converters.py b/src/caoscrawler/converters.py index c690da172ef36f7e1fdbe1fef22b25d18fabc677..40d3b72bfe7564cfb815e11a69a952f9142c3e55 100644 --- a/src/caoscrawler/converters.py +++ b/src/caoscrawler/converters.py @@ -432,6 +432,7 @@ class Converter(object, metaclass=ABCMeta): return for transformer_key, transformer in self.definition["transform"].items(): in_value = replace_variables(transformer["in"], values) + out_value = in_value for tr_func_el in transformer["functions"]: if not isinstance(tr_func_el, dict): @@ -1240,11 +1241,12 @@ class DateElementConverter(TextElementConverter): """allows to convert different text formats of dates to Python date objects. The text to be parsed must be contained in the "date" group. The format string can be supplied - under "dateformat" in the Converter definition. The library used is datetime so see its + under "date_format" in the Converter definition. The library used is datetime so see its documentation for information on how to create the format string. """ + # TODO make `date` parameter name configurable def match(self, element: StructureElement): matches = super().match(element) if matches is not None and "date" in matches: @@ -1253,3 +1255,24 @@ class DateElementConverter(TextElementConverter): self.definition["date_format"] if "date_format" in self.definition else "%Y-%m-%d" ).date()}) return matches + + +class DatetimeElementConverter(TextElementConverter): + """Convert text so that it is formatted in a way that LinkAhead can understand it. 
+ +The text to be parsed must be in the ``val`` parameter. The format string can be supplied in the +``datetime_format`` node. This class uses the ``datetime`` module, so ``datetime_format`` must +follow this specification: +https://docs.python.org/3/library/datetime.html#strftime-and-strptime-format-codes + + """ + + # TODO make `val` parameter name configurable + def match(self, element: StructureElement): + matches = super().match(element) + if matches is not None and "val" in matches: + fmt_default = "%Y-%m-%dT%H:%M:%S" + fmt = self.definition.get("datetime_format", fmt_default) + dt_str = datetime.datetime.strptime(matches["val"], fmt).strftime(fmt_default) + matches.update({"val": dt_str}) + return matches diff --git a/src/caoscrawler/crawl.py b/src/caoscrawler/crawl.py index 2ce5eae9afbd78cbf4b78db0b152fa7578258ee9..627e1dc4ef041ea4dae03efedd2fabadb37bd07f 100644 --- a/src/caoscrawler/crawl.py +++ b/src/caoscrawler/crawl.py @@ -55,6 +55,9 @@ from linkahead.apiutils import (compare_entities, merge_entities) from linkahead.cached import cache_clear, cached_get_entity_by from linkahead.common.datatype import get_list_datatype, is_reference +from linkahead.exceptions import ( + TransactionError, +) from linkahead.utils.escape import escape_squoted_text from .config import get_config_setting @@ -1081,15 +1084,24 @@ def crawler_main(crawled_directory_path: str, logger.error(err) _update_status_record(crawler.run_id, 0, 0, status="FAILED") return 1 + except TransactionError as err: + logger.debug(traceback.format_exc()) + logger.error(err) + logger.error("Transaction error details:") + for suberr in err.errors: + logger.error("---") + logger.error(suberr.msg) + logger.error(suberr.entity) + return 1 except Exception as err: logger.debug(traceback.format_exc()) - logger.debug(err) + logger.error(err) if "SHARED_DIR" in os.environ: # pylint: disable=E0601 domain = get_config_setting("public_host_url") - logger.error("Unexpected Error: Please tell your administrator 
about this and provide the" - f" following path.\n{domain}/Shared/" + debuglog_public) + logger.error("Unexpected Error: Please tell your administrator about this and provide " + f"the following path.\n{domain}/Shared/" + debuglog_public) _update_status_record(crawler.run_id, 0, 0, status="FAILED") return 1 diff --git a/src/caoscrawler/default_converters.yml b/src/caoscrawler/default_converters.yml index af2b1c764ac637c1391c89861ddba12386e6240e..9a5fc248c45a77b848611c322ed7d2a5fdbd3721 100644 --- a/src/caoscrawler/default_converters.yml +++ b/src/caoscrawler/default_converters.yml @@ -8,6 +8,9 @@ BooleanElement: Date: converter: DateElementConverter package: caoscrawler.converters +Datetime: + converter: DatetimeElementConverter + package: caoscrawler.converters Dict: converter: DictElementConverter package: caoscrawler.converters diff --git a/src/caoscrawler/default_transformers.yml b/src/caoscrawler/default_transformers.yml index d0ad23912176bdfbf2446aa6e04bd7fa6b858777..ffcb1b15bd2bad71083cc8f0ba84172ee3daf2b0 100644 --- a/src/caoscrawler/default_transformers.yml +++ b/src/caoscrawler/default_transformers.yml @@ -1,4 +1,4 @@ - +# Lookup table for matching functions and cfood yaml node names. 
submatch: package: caoscrawler.transformer_functions @@ -9,3 +9,9 @@ split: replace: package: caoscrawler.transformer_functions function: replace +date_parse: + package: caoscrawler.transformer_functions + function: date_parse +datetime_parse: + package: caoscrawler.transformer_functions + function: datetime_parse diff --git a/src/caoscrawler/identifiable_adapters.py b/src/caoscrawler/identifiable_adapters.py index a22ad488f444999cab735bf37828805d96d4d449..854ee614638712bdcf957c592ef2946dbdd43afc 100644 --- a/src/caoscrawler/identifiable_adapters.py +++ b/src/caoscrawler/identifiable_adapters.py @@ -173,7 +173,10 @@ class IdentifiableAdapter(metaclass=ABCMeta): """ if node.registered_identifiable is None: if raise_exception: - raise RuntimeError("no registered_identifiable") + parents = [p.name for p in node.parents] + parents_str = "\n".join(f"- {p}" for p in parents) + raise RuntimeError("No registered identifiable for node with these parents:\n" + + parents_str) else: return False for prop in node.registered_identifiable.properties: diff --git a/src/caoscrawler/scripts/generators.py b/src/caoscrawler/scripts/generators.py index 927d4dcaf7f6123a50d30657beff1cb1b32d381e..ba8e6e39cc03e9be1923d72ec5c8d699c01fa8f9 100644 --- a/src/caoscrawler/scripts/generators.py +++ b/src/caoscrawler/scripts/generators.py @@ -104,17 +104,27 @@ metadata: directory: # corresponds to the directory given to the crawler type: Directory match: .* # we do not care how it is named here + records: + DirRecord: # One record for each directory. 
subtree: # This is the file thisfile: type: []{file} match: []{match} + records: + DatFileRecord: # One record for each matching file + role: File + path: $thisfile + file: $thisfile subtree: entry: type: Dict match: .* # Name is irrelevant records: - MyParent: + BaseElement: # One BaseElement record for each row in the CSV/TSV file + DatFileRecord: $DatFileRecord + DirRecord: + BaseElement: +$BaseElement subtree: !macro """ @@ -196,8 +206,24 @@ cfood: str defs.append(def_str) del defs + sep = repr(sniffed.delimiter) + sep = f'"{sep[1:-1]}"' + match_str = f"""'.*[ct]sv' + sep: {sep} + # "header": [int] + # "names": [str] + # "index_col": [int] + # "usecols": [int] + # "true_values": [str] + # "false_values": [str] + # "na_values": [str] + # "skiprows": [int] + # "nrows": [int] + # "keep_default_na": [bool] + """ + cfood_str = (_CustomTemplate(CFOOD_TEMPLATE).substitute({"file": "CSVTableConverter", - "match": ".*\\[ct]sv"}) + "match": match_str}) + prefix[2:] + "ColumnValue:\n" + "".join(defs_col_value) + prefix[2:] + "ColumnValueReference:\n" + "".join(defs_col_value_ref) ) diff --git a/src/caoscrawler/transformer_functions.py b/src/caoscrawler/transformer_functions.py index eda9f3c2bc98c8d2561f152f9f6ddd422daee00a..ce08bc6bc05caa84f342cdc25f3243c5bab0b79c 100644 --- a/src/caoscrawler/transformer_functions.py +++ b/src/caoscrawler/transformer_functions.py @@ -20,9 +20,14 @@ # You should have received a copy of the GNU Affero General Public License # along with this program. If not, see <https://www.gnu.org/licenses/>. +"""Definition of default transformer functions. + +See https://docs.indiscale.com/caosdb-crawler/converters.html#transform-functions for more +information. + """ -Defnition of default transformer functions. 
-""" + +import datetime import re from typing import Any @@ -61,3 +66,36 @@ def replace(in_value: Any, in_parameters: dict): if not isinstance(in_value, str): raise RuntimeError("must be string") return in_value.replace(in_parameters['remove'], in_parameters['insert']) + + +def date_parse(in_value: str, params: dict) -> str: + """Transform text so that it is formatted in a way that LinkAhead can understand it. + +Parameters +========== + +- date_format: str, optional + A format string using the ``datetime`` specification: + https://docs.python.org/3/library/datetime.html#strftime-and-strptime-format-codes + """ + fmt_default = "%Y-%m-%d" + fmt = params.get("date_format", fmt_default) + dt_str = datetime.datetime.strptime(in_value, fmt).strftime(fmt_default) + return dt_str + + +def datetime_parse(in_value: str, params: dict) -> str: + """Transform text so that it is formatted in a way that LinkAhead can understand it. + + +Parameters +========== + +- datetime_format: str, optional + A format string using the ``datetime`` specification: + https://docs.python.org/3/library/datetime.html#strftime-and-strptime-format-codes + """ + fmt_default = "%Y-%m-%dT%H:%M:%S" + fmt = params.get("datetime_format", fmt_default) + dt_str = datetime.datetime.strptime(in_value, fmt).strftime(fmt_default) + return dt_str diff --git a/unittests/test_converters.py b/unittests/test_converters.py index 1d2492a7c2eb59b0533d707bad7e1cb3e51529bd..f5125e61efa49fe627480696703e570ef9b70e6f 100644 --- a/unittests/test_converters.py +++ b/unittests/test_converters.py @@ -633,7 +633,7 @@ def test_load_converters(): # converter classes can be loaded from their respective packages. # Please adapt, if defaults change! 
- assert len(converter_registry) == 23 + assert len(converter_registry) == 24 # All of them are contained in caoscrawler.converters for conv_key, conv in converter_registry.items(): diff --git a/unittests/test_transformers.py b/unittests/test_transformers.py index e7fae484c1a0010d952809fc502e25a02bfe6ec5..4ed12751d9052c839aa4db4abd586c419bed1018 100644 --- a/unittests/test_transformers.py +++ b/unittests/test_transformers.py @@ -46,6 +46,38 @@ from pytest import raises UNITTESTDIR = Path(__file__).parent +@pytest.fixture +def converter_registry(): + converter_registry: dict[str, dict[str, str]] = { + "Directory": { + "converter": "DirectoryConverter", + "package": "caoscrawler.converters"}, + "MarkdownFile": { + "converter": "MarkdownFileConverter", + "package": "caoscrawler.converters"}, + "Date": { + "converter": "DateElementConverter", + "package": "caoscrawler.converters"}, + "DictElement": { + "converter": "DictElementConverter", + "package": "caoscrawler.converters"}, + "TextElement": { + "converter": "TextElementConverter", + "package": "caoscrawler.converters"}, + "ListElement": { + "converter": "ListElementConverter", + "package": "caoscrawler.converters"}, + "JSONFile": { + "converter": "JSONFileConverter", + "package": "caoscrawler.converters"}, + } + + for key, value in converter_registry.items(): + module = importlib.import_module(value["package"]) + value["class"] = getattr(module, value["converter"]) + return converter_registry + + def test_simple_transformer(): """ Test the correct list of returned records by the scanner using the @@ -82,38 +114,6 @@ def test_simple_transformer(): assert False -@pytest.fixture -def converter_registry(): - converter_registry: dict[str, dict[str, str]] = { - "Directory": { - "converter": "DirectoryConverter", - "package": "caoscrawler.converters"}, - "MarkdownFile": { - "converter": "MarkdownFileConverter", - "package": "caoscrawler.converters"}, - "Date": { - "converter": "DateElementConverter", - "package": 
"caoscrawler.converters"}, - "DictElement": { - "converter": "DictElementConverter", - "package": "caoscrawler.converters"}, - "TextElement": { - "converter": "TextElementConverter", - "package": "caoscrawler.converters"}, - "ListElement": { - "converter": "ListElementConverter", - "package": "caoscrawler.converters"}, - "JSONFile": { - "converter": "JSONFileConverter", - "package": "caoscrawler.converters"}, - } - - for key, value in converter_registry.items(): - module = importlib.import_module(value["package"]) - value["class"] = getattr(module, value["converter"]) - return converter_registry - - def test_apply_replace(converter_registry): cfood_def = {"type": 'ListElement', "match_name": ".*", 'transform': {'test': {'in': '$a', 'out': '$b', 'functions': [{ @@ -146,3 +146,21 @@ def test_apply_replace_from_def(converter_registry): conv.apply_transformers(values, transformer_functions) assert values['b'] == "16:45" + + +def test_empty_functions_list(converter_registry): + cfood_def = {"type": 'ListElement', + "match_name": ".*", + 'transform': {'test': {'in': '$a', 'out': '$b', + 'functions': []}}} + values = GeneralStore() + values["a"] = "16_45" + + # transformer_functions = create_transformer_registry(crawler_definition) + transformer_functions = {"replace": replace} + + conv = ListElementConverter(definition=cfood_def, name='test', + converter_registry=converter_registry) + + conv.apply_transformers(values, transformer_functions) + assert values['b'] == "16_45"