diff --git a/.docker/Dockerfile b/.docker/Dockerfile index 539ac0d4e70bfbde2f630d4254cacc7419105611..1e9763f3496c9dca6cc33e6ba8217a654bed487e 100644 --- a/.docker/Dockerfile +++ b/.docker/Dockerfile @@ -1,27 +1,31 @@ -FROM debian:bullseye +FROM debian:bookworm RUN apt-get update && \ apt-get install \ curl \ git \ - openjdk-11-jdk-headless \ + openjdk-17-jdk-headless \ python3-autopep8 \ python3-pip \ python3-pytest \ python3-sphinx \ tox \ -y -RUN pip3 install pylint recommonmark sphinx-rtd-theme +RUN pip3 install --break-system-packages \ + pylint \ + recommonmark \ + sphinx-rtd-theme \ + ; COPY .docker/wait-for-it.sh /wait-for-it.sh ARG PYLIB ADD https://gitlab.indiscale.com/api/v4/projects/97/repository/commits/${PYLIB} \ pylib_version.json RUN git clone https://gitlab.indiscale.com/caosdb/src/caosdb-pylib.git && \ - cd caosdb-pylib && git checkout ${PYLIB} && pip3 install . + cd caosdb-pylib && git checkout ${PYLIB} && pip3 install --break-system-packages . ARG ADVANCED ADD https://gitlab.indiscale.com/api/v4/projects/104/repository/commits/${ADVANCED} \ advanced_version.json RUN git clone https://gitlab.indiscale.com/caosdb/src/caosdb-advanced-user-tools.git && \ - cd caosdb-advanced-user-tools && git checkout ${ADVANCED} && pip3 install .[h5-crawler] + cd caosdb-advanced-user-tools && git checkout ${ADVANCED} && pip3 install --break-system-packages .[h5-crawler] COPY . /git # Delete .git because it is huge. @@ -30,7 +34,7 @@ RUN rm -r /git/.git # Install pycaosdb.ini for the tests RUN mv /git/.docker/tester_pycaosdb.ini /git/integrationtests/pycaosdb.ini -RUN cd /git/ && pip3 install . +RUN cd /git/ && pip3 install --break-system-packages .[h5-crawler,spss] WORKDIR /git/integrationtests # wait for server, diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 9ec1727db301affd8e984df78abbb78a2b16ffaa..8812abacc0ef157c418e8f658a4fa7261bb04743 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -113,12 +113,13 @@ info: script: - *env -unittest_py3.9: +unittest_py3.11: tags: [cached-dind] stage: test image: $CI_REGISTRY_IMAGE script: - - tox + - python3 -c "import sys; assert sys.version.startswith('3.11')" + - tox unittest_py3.8: tags: [cached-dind] @@ -130,21 +131,21 @@ unittest_py3.8: # TODO: Use f-branch logic here - pip install git+https://gitlab.indiscale.com/caosdb/src/caosdb-pylib.git@dev - pip install git+https://gitlab.indiscale.com/caosdb/src/caosdb-advanced-user-tools.git@dev - - pip install .[h5-crawler] + - pip install .[h5-crawler,spss] # actual test - caosdb-crawler --help - pytest --cov=caosdb -vv ./unittests -unittest_py3.10: +unittest_py3.9: tags: [cached-dind] stage: test - image: python:3.10 + image: python:3.9 script: *python_test_script -unittest_py3.11: +unittest_py3.10: tags: [cached-dind] stage: test - image: python:3.11 + image: python:3.10 script: *python_test_script unittest_py3.12: @@ -160,18 +161,18 @@ unittest_py3.13: image: python:3.13-rc script: # TODO: Replace by '*python_test_script' as soon as 3.13 has been officially released. + # TODO Remove the "!" after 3.13 release, which serves as an xfail - apt update && apt install -y cargo # install dependencies - pip install pytest pytest-cov # TODO: Use f-branch logic here - pip install git+https://gitlab.indiscale.com/caosdb/src/caosdb-pylib.git@dev - - pip install git+https://gitlab.indiscale.com/caosdb/src/caosdb-advanced-user-tools.git@dev - - pip install .[h5-crawler] + - (! pip install git+https://gitlab.indiscale.com/caosdb/src/caosdb-advanced-user-tools.git@dev) + - (! 
pip install .[h5-crawler,spss]) # actual test - - caosdb-crawler --help - - pytest --cov=caosdb -vv ./unittests + - (! caosdb-crawler --help) + - (! pytest --cov=caosdb -vv ./unittests) - inttest: tags: [docker] services: @@ -306,7 +307,8 @@ code-style: - job: build-testenv optional: true script: - - autopep8 -r --diff --exit-code . + - autopep8 --version + - autopep8 -r --diff --exit-code . allow_failure: true pylint: diff --git a/CHANGELOG.md b/CHANGELOG.md index 2b4c8f976e83cfb73ebde3904b6aa5991dfca94c..15a35a01473f02a55ef5d9f04aac6f2e13af4ca6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,6 +10,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Added ### * Support for Python 3.12 and experimental support for 3.13 +* `spss_to_datamodel` script. +* `SPSSConverter` class ### Changed ### diff --git a/setup.cfg b/setup.cfg index 1b4a91859f39ff2695c36aace396b7db240a5f1f..848150363c42776993029c54e777f4ff6ccf72ea 100644 --- a/setup.cfg +++ b/setup.cfg @@ -25,6 +25,7 @@ install_requires = odfpy #make optional packaging pandas + pyarrow # Will be required by Pandas >= 3.0. pyyaml yaml-header-tools >= 0.2.1 @@ -39,8 +40,12 @@ per-file-ignores = __init__.py:F401 [options.entry_points] console_scripts = caosdb-crawler = caoscrawler.crawl:main + spss_to_datamodel = caoscrawler.conv_impl.spss:spss_to_datamodel_main + csv_to_datamodel = caoscrawler.scripts.generators:csv_to_datamodel_main [options.extras_require] h5-crawler = h5py >= 3.8 numpy +spss = + pandas[spss] diff --git a/src/caoscrawler/__init__.py b/src/caoscrawler/__init__.py index 05bad0b54d9098c0b7f165d8295a0faa2966fa32..41b96323b1106d8ce28caadc4a2da012f3dc22ea 100644 --- a/src/caoscrawler/__init__.py +++ b/src/caoscrawler/__init__.py @@ -1,4 +1,15 @@ +from . import converters, utils +try: + from .conv_impl.spss import SPSSConverter +except ImportError as err: + SPSSConverter: type = utils.MissingImport( + name="SPSSConverter", hint="Try installing with the `spss` extra option.", + err=err) from .crawl import Crawler, SecurityMode from .version import CfoodRequiredVersionError, get_caoscrawler_version __version__ = get_caoscrawler_version() + +# Convenience members ######################################################### +# mypy: disable-error-code="attr-defined" +converters.SPSSConverter = SPSSConverter diff --git a/src/caoscrawler/cfood-schema.yml b/src/caoscrawler/cfood-schema.yml index 5a6e1e50345382ca6e5a1e6ef3a8fbeafb806b84..85032ba30877dff97bdf4ff9ba904d070c4a95b2 100644 --- a/src/caoscrawler/cfood-schema.yml +++ b/src/caoscrawler/cfood-schema.yml @@ -31,6 +31,7 @@ cfood: - JSONFile - CSVTableConverter - XLSXTableConverter + - SPSSFile - H5File - H5Dataset - H5Group diff --git a/src/caoscrawler/conv_impl/__init__.py b/src/caoscrawler/conv_impl/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/src/caoscrawler/conv_impl/spss.py b/src/caoscrawler/conv_impl/spss.py new file mode 100644 index 0000000000000000000000000000000000000000..5dfad0ff8be55e2ca3ddf0db3397dbac5fc9f2b0 --- /dev/null +++ b/src/caoscrawler/conv_impl/spss.py @@ -0,0 +1,303 @@ +# This file is a part of the LinkAhead Project. 
+#
+# Copyright (C) 2024 IndiScale GmbH <info@indiscale.com>
+# Copyright (C) 2024 Daniel Hornung <d.hornung@indiscale.com>
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <https://www.gnu.org/licenses/>.
+
+"""Converter for SAV files (stored by SPSS)."""
+
+from __future__ import annotations  # Can be removed with 3.10.
+
+import argparse
+from collections import OrderedDict
+
+import numpy as np
+import pandas as pd
+import pyreadstat
+import yaml
+
+from .. import converters
+from ..stores import GeneralStore
+from ..structure_elements import (File, StructureElement)
+from typing import Optional, Any
+
+
+READSTAT_TYPES = {
+    "double": "DOUBLE",
+    "string": "TEXT",
+}
+ORIGINAL_TYPES = {
+    "EDATE8": "DATETIME",
+}
+
+
+class SPSSConverter(converters.TableConverter):
+    """Converter for SAV files (stored by SPSS)."""
+
+    def create_children(self, values: GeneralStore, element: StructureElement) -> list:
+        assert isinstance(element, File)
+        # The default dtype backend "numpy_nullable" does not handle dates well.
+        # Note that pandas.ArrowDtype is considered experimental (in Pandas 2.2).
+        df = pd.io.spss.read_spss(element.path, dtype_backend="pyarrow")
+        dtypes = read_column_types(element.path)
+
+        # Fix datetime columns
+        for name, dtype in dtypes.items():
+            if dtype != "DATETIME":
+                continue
+            col = df.loc[:, name]
+            col.fillna(np.nan, inplace=True)
+            col.replace([np.nan], [None], inplace=True)
+
+        return self._children_from_dataframe(df)
+
+
+def read_column_types(savfile: Optional[str] = None, meta: Optional[Any] = None) -> dict[str, str]:
+    """Read SAV file and return the column types.
+
+Optionally, take data from a previous reading.
+
+Parameters
+----------
+savfile : Optional[str]
+    The SAV file to read.
+
+meta : Optional
+    The meta data result from `pyreadstat.read_sav(...)`.
+
+Returns
+-------
+out : dict[str, str]
+    The column names and types.
+    """
+    if not meta:
+        _, meta = pyreadstat.read_sav(savfile, metadataonly=True)
+    elif savfile is not None:
+        raise ValueError("Only one of `savfile` and `meta` may be given.")
+    dtypes: dict[str, str] = {}
+    for name in meta.column_names:
+        datatype = ORIGINAL_TYPES.get(meta.original_variable_types[name],
+                                      READSTAT_TYPES[meta.readstat_variable_types[name]])
+        dtypes[name] = datatype
+    return dtypes
+
+
+def spss_to_yaml(savfile: str, yamlfile: str, cfood: Optional[str] = None) -> None:
+    """Parse the ``*.sav`` file and create a basic datamodel in ``yamlfile``.
+
+Parameters
+----------
+savfile: str
+    The SAV file to parse.
+
+yamlfile: str
+    The filename of the resulting yaml datamodel file.
+
+cfood: str
+    If given, also create a cfood skeleton.
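+
+Example
+-------
+A minimal usage sketch (the file names here are hypothetical):
+
+>>> spss_to_yaml(savfile="survey.sav", yamlfile="datamodel.yaml",
+...              cfood="cfood.yml")  # doctest: +SKIP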
+ """ + _, meta = pyreadstat.read_sav(savfile, metadataonly=True) + dtypes = read_column_types(meta=meta) + + cfood_str = """ +--- +metadata: + macros: + - !defmacro + # Simple column value -> property rule + name: ColumnValue + params: + name: null + belongsto: BaseElement + type: TextElement + definition: + ${name}: + type: ${type} + match_name: ^${name}$$ + match_value: (?P<val>.*) + records: + ${belongsto}: + ${name}: $$val + - !defmacro + # column value -> reference property + name: ColumnValueReference + params: + name: null + reftype: null # RecordType of the reference + belongsto: BaseElement + type: TextElement # References are always text, right? + definition: + ${name}: + type: ${type} + match_name: ^${name}$$ + match_value: (?P<val>.*) + records: + ${reftype}: + name: $$val + ${belongsto}: + ${name}: $$${reftype} + - !defmacro + # Same as "ColumnValue", but also give name of property. + name: ColumnValuePropname + params: + name: null + propname: null + belongsto: BaseElement + type: TextElement + definition: + ${name}: + type: ${type} + match_name: ^${name}$$ + match_value: (?P<val>.*) + records: + ${belongsto}: + ${propname}: $$val +--- +directory: # corresponds to the directory given to the crawler + type: Directory + match: .* # we do not care how it is named here + subtree: + # This is the file + thisfile: + type: SPSSFile + match: ".*sav" + subtree: + entry: + type: Dict + match: .* # Name is irrelevant + records: + MyParent: + subtree: !macro +""" + + enums: dict[str, list[str]] = {} + properties = OrderedDict() + + for name in meta.column_names: + prop = { + "datatype": dtypes[name], + } + desc = meta.column_names_to_labels.get(name) + if desc and desc != name: + prop["description"] = desc + # Handle categorial variables + if var_label := meta.variable_to_label.get(name): + vvl = meta.variable_value_labels[name] + # reproducible (and sensible) order + label_values = [vvl[key] for key in sorted(vvl.keys())] + if label_values not in enums.values(): + enums[var_label] = label_values + else: + var_label = [key for key, value in enums.items() if value == label_values][0] + prop["datatype"] = var_label + properties[name] = prop + + output = f"""# auto-generated data model from file "{savfile}". +# To insert a datamodel into LinkAhead, run: +# +# python3 -m caosadvancedtools.models.parser datamodel.yaml --sync + +""" + + # Actual datamodel + output += """ +######### +# Enums # +######### + +""" + for name, values in enums.items(): + output += f"""{name}: + description: + # possible values: {values}\n""" + + output += (""" +############### +# RecordTypes # +############### + +DummyRT: + description: Note: Change name and enter description. + recommended_properties: + """ + + " ".join(yaml.dump(dict(properties), # from OrderedDict to dict + allow_unicode=True, + sort_keys=False).splitlines(keepends=True))) + + # Experimental: Enum creation + output += """ +############### +# Enum values # +############### +""" + for name, values in enums.items(): + output += f"\n# ### {name} ###\n" + for value in values: + output += f""" +{value}: + role: Record + inherit_from_suggested: + - {name} +""" + + with open(yamlfile, encoding="utf-8", mode="w") as myfile: + myfile.write(output) + + if cfood: + defs_col_value: list[str] = [] + defs_col_value_ref: list[str] = [] + prefix = " " * 14 + for name, propdef in properties.items(): + def_str = prefix + f"- name: {name}\n" + dtype = None + reftype = None + defs = defs_col_value + # Which type? 
+ if propdef["datatype"] == "DOUBLE": + dtype = "FloatElement" + elif propdef["datatype"] in ("TEXT", "DATETIME"): + dtype = None + else: + reftype = propdef["datatype"] + defs = defs_col_value_ref + + # Append according to types: + if reftype: + def_str += prefix + f" reftype: {reftype}\n" + if dtype: + def_str += prefix + f" type: {dtype}\n" + + # Store result + defs.append(def_str) + del defs + + cfood_str += (prefix[2:] + "ColumnValue:\n" + "".join(defs_col_value) + + prefix[2:] + "ColumnValueReference:\n" + "".join(defs_col_value_ref) + ) + with open(cfood, encoding="utf-8", mode="w") as myfile: + myfile.write(cfood_str) + + +def _parse_arguments(): + """Parse the arguments.""" + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument('-i', '--input', help="The *.sav file.", required=True) + parser.add_argument('-o', '--outfile', help="Yaml filename to save the result", required=True) + parser.add_argument('--cfood', help="Yaml filename to create cfood output in", required=False) + + return parser.parse_args() + + +def spss_to_datamodel_main(): + """The main function of this script.""" + args = _parse_arguments() + spss_to_yaml(savfile=args.input, yamlfile=args.outfile, cfood=args.cfood) + print(f"Written datamodel to: {args.outfile}") + if args.cfood: + print(f"Written cfood to: {args.cfood}") diff --git a/src/caoscrawler/converters.py b/src/caoscrawler/converters.py index 6280f16cc58c96145bcafca12437d18b0a4b63ba..c690da172ef36f7e1fdbe1fef22b25d18fabc677 100644 --- a/src/caoscrawler/converters.py +++ b/src/caoscrawler/converters.py @@ -1,11 +1,11 @@ -#!/usr/bin/env python3 # encoding: utf-8 # -# ** header v3.0 -# This file is a part of the CaosDB Project. +# This file is a part of the LinkAhead Project. # +# Copyright (C) 2024 Indiscale GmbH <info@indiscale.com> # Copyright (C) 2021 Henrik tom Wörden -# 2021 Alexander Schlemmer +# Copyright (C) 2021 Alexander Schlemmer +# Copyright (C) 2024 Daniel Hornung <d.hornung@indiscale.com> # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU Affero General Public License as @@ -19,9 +19,8 @@ # # You should have received a copy of the GNU Affero General Public License # along with this program. If not, see <https://www.gnu.org/licenses/>. -# -# ** end header -# + +"""Converters take structure elements and create Records and new structure elements from them.""" from __future__ import annotations @@ -34,7 +33,7 @@ import warnings from abc import ABCMeta, abstractmethod from inspect import signature from string import Template -from typing import Any, Optional, Union +from typing import Any, Callable, Optional, Union import linkahead as db import pandas as pd @@ -189,7 +188,7 @@ out: tuple if "value" not in value: # TODO: how do we handle this case? Just ignore? # or disallow? - raise NotImplementedError() + raise NotImplementedError(f"This definition has no \"value\": {value}") propvalue = value["value"] # can be "single", "list" or "multiproperty" collection_mode = value["collection_mode"] @@ -290,9 +289,7 @@ def create_records(values: GeneralStore, records: RecordStore, def_records: dict propvalue = os.path.normpath(propvalue) setattr(c_record, key.lower(), propvalue) else: - if c_record.get_property(key) is None: - if collection_mode == "list": c_record.add_property(name=key, value=[propvalue]) elif (collection_mode == "multiproperty" or @@ -331,7 +328,8 @@ class Converter(object, metaclass=ABCMeta): definition dict must have. 
         converter_registry: dict
             A dictionary that contains converter names as keys and dicts as values. Those value dicts
-            have the keys 'converter' and 'package'.
+            have the keys 'converter', 'package' and 'class'. 'converter' is the class name,
+            'package' the module, and 'class' the converter class object itself.
         """
 
         self.definition = definition
@@ -369,7 +367,7 @@ class Converter(object, metaclass=ABCMeta):
 
     @staticmethod
     def converter_factory(definition: dict, name: str, converter_registry: dict):
-        """creates a Converter instance of the appropriate class.
+        """Create a Converter instance of the appropriate class.
 
         The `type` key in the `definition` defines the Converter class which is being used.
         """
@@ -521,8 +519,8 @@ class Converter(object, metaclass=ABCMeta):
                         result: Optional[dict]):
         """ Template for the debugging output for the match function """
         msg = "\n--------" + name + "-----------\n"
-        for re, ma in zip(regexp, matched):
-            msg += "matching reg:\t" + re + "\n"
+        for exp, ma in zip(regexp, matched):
+            msg += "matching reg:\t" + exp + "\n"
             msg += "matching val:\t" + ma + "\n"
         msg += "---------\n"
         if result is None:
@@ -822,7 +820,7 @@ class DictElementConverter(Converter):
 
 class DictConverter(DictElementConverter):
     def __init__(self, *args, **kwargs):
         warnings.warn(DeprecationWarning(
-            "This class is deprecated. Please use DictConverter."))
+            "This class is deprecated. Please use DictElementConverter."))
         super().__init__(*args, **kwargs)
@@ -894,7 +892,7 @@ out:
     """
     if "match_name" in definition:
         if "match" in definition:
-            raise RuntimeError(f"Do not supply both, 'match_name' and 'match'.")
+            raise RuntimeError("Do not supply both, 'match_name' and 'match'.")
 
         m1 = re.match(definition["match_name"], name)
         if m1 is None:
@@ -1048,7 +1046,7 @@ class TextElementConverter(_AbstractScalarValueElementConverter):
     def __init__(self, definition, *args, **kwargs):
         if "match" in definition:
             raise ValueError("""
-The 'match' key will in future be used to match a potential name of a TextElement. Please use
+The 'match' key is used to match a potential name of a TextElement. Please use
 the 'match_value' key to match the value of the TextElement and 'match_name' for matching the name.
 """)
@@ -1130,15 +1128,22 @@ class TableConverter(Converter):
 
     The rows can be matched using a DictElementConverter.
     """
-    @abstractmethod
-    def get_options(self):
-        """
-        This method needs to be overwritten by the specific table converter to provide
-        information about the possible options.
+
+    def get_options(self) -> dict:
+        """Get specific options, e.g. from ``self.definition``.
+
+This method may be overwritten by the specific table converter to provide information about the
+possible options. Implementors may use ``TableConverter._get_options(...)`` to get (and convert)
+options from ``self.definition``.
+
+Returns
+-------
+out: dict
+    An options dict.
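+
+Example
+-------
+A sketch of a possible subclass implementation; the option names are
+illustrative only, not a fixed API:
+
+>>> class MyTableConverter(TableConverter):  # doctest: +SKIP
+...     def get_options(self):
+...         # Convert the "header" option to int, pass "sep" through as-is.
+...         return self._get_options([("header", int),
+...                                   ("sep", str)])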
""" - pass + return {} - def _get_options(self, possible_options): + def _get_options(self, possible_options: list[tuple[str, Callable]]) -> dict: option_dict = dict() for opt_name, opt_conversion in possible_options: if opt_name in self.definition: @@ -1166,6 +1171,14 @@ class TableConverter(Converter): return None return m.groupdict() + @staticmethod + def _children_from_dataframe(dataframe: pd.DataFrame): + child_elements = list() + for index, row in dataframe.iterrows(): + child_elements.append( + DictElement(str(index), row.to_dict())) + return child_elements + class XLSXTableConverter(TableConverter): """ @@ -1195,11 +1208,7 @@ class XLSXTableConverter(TableConverter): if not isinstance(element, File): raise RuntimeError("Element must be a File.") table = pd.read_excel(element.path, **self.get_options()) - child_elements = list() - for index, row in table.iterrows(): - child_elements.append( - DictElement(str(index), row.to_dict())) - return child_elements + return self._children_from_dataframe(table) class CSVTableConverter(TableConverter): @@ -1224,11 +1233,7 @@ class CSVTableConverter(TableConverter): if not isinstance(element, File): raise RuntimeError("Element must be a File.") table = pd.read_csv(element.path, **self.get_options()) - child_elements = list() - for index, row in table.iterrows(): - child_elements.append( - DictElement(str(index), row.to_dict())) - return child_elements + return self._children_from_dataframe(table) class DateElementConverter(TextElementConverter): diff --git a/src/caoscrawler/crawl.py b/src/caoscrawler/crawl.py index 928bf472e97daa09bfdf6fd742b981d981ab9204..2ce5eae9afbd78cbf4b78db0b152fa7578258ee9 100644 --- a/src/caoscrawler/crawl.py +++ b/src/caoscrawler/crawl.py @@ -295,10 +295,18 @@ class Crawler(object): self.crawled_data = data return data - def split_into_inserts_and_updates(self, st: SyncGraph): - """ iteratively identifies nodes in the SyncGraph st and checks whether those exist on the - remote server such that in the end two list are being created that list entities that need - to be update or inserted""" + def _split_into_inserts_and_updates(self, st: SyncGraph): + """Classify nodes in the SyncGraph ``st`` with respect to their state on the server. + +This method iteratively checks whether those nodes exist on the remote server and creates two lists, +one with the entities that need to be updated and the other with entities to be inserted. + +.. todo:: + + Should this be made into a public method of SyncGraph instead? At the moment, this is a + purely static method that only operates on the state of ``st``. + + """ entity_was_treated = True # st.unchecked contains Entities which could not yet be checked against the remote server while entity_was_treated and len(st.unchecked) > 0: @@ -336,7 +344,7 @@ class Crawler(object): # ) raise RuntimeError( - "Could not finish split_into_inserts_and_updates. " + "Could not finish _split_into_inserts_and_updates. 
" "It might be due to a circular dependency") return st.export_record_lists() @@ -648,7 +656,7 @@ class Crawler(object): "use for example the Scanner to create this data.")) crawled_data = self.crawled_data - to_be_inserted, to_be_updated = self.split_into_inserts_and_updates( + to_be_inserted, to_be_updated = self._split_into_inserts_and_updates( SyncGraph(crawled_data, self.identifiableAdapter)) for el in to_be_updated: @@ -773,7 +781,7 @@ ____________________\n""".format(i + 1, len(pending_changes)) + str(el[3])) res[converter.name]["subtree"][k[0]] = d[k[0]] return res - def save_debug_data(self, filename: str, debug_tree: DebugTree = None): + def save_debug_data(self, filename: str, debug_tree: Optional[DebugTree] = None): """ Save the information contained in a debug_tree to a file named filename. """ diff --git a/src/caoscrawler/default_converters.yml b/src/caoscrawler/default_converters.yml index e192ab1b3bae70a6772cf6defba4a4592a92e584..af2b1c764ac637c1391c89861ddba12386e6240e 100644 --- a/src/caoscrawler/default_converters.yml +++ b/src/caoscrawler/default_converters.yml @@ -24,7 +24,7 @@ TextElement: converter: TextElementConverter package: caoscrawler.converters - + DictDictElement: # deprecated converter: DictElementConverter package: caoscrawler.converters @@ -60,7 +60,7 @@ File: # deprecated converter: SimpleFileConverter package: caoscrawler.converters - + SimpleFile: converter: SimpleFileConverter package: caoscrawler.converters @@ -81,6 +81,10 @@ CSVTableConverter: converter: CSVTableConverter package: caoscrawler.converters +SPSSFile: + converter: SPSSConverter + package: caoscrawler.converters + XLSXTableConverter: converter: XLSXTableConverter package: caoscrawler.converters diff --git a/src/caoscrawler/hdf5_converter.py b/src/caoscrawler/hdf5_converter.py index 5b1ff5775fb74919c989507c449636fd822db7f0..482d59c12d2d0b8540c01bd04da718d9c514ddc4 100644 --- a/src/caoscrawler/hdf5_converter.py +++ b/src/caoscrawler/hdf5_converter.py @@ -18,6 +18,8 @@ # along with this program. If not, see <https://www.gnu.org/licenses/>. # +from typing import Optional + try: import h5py except ModuleNotFoundError: @@ -94,8 +96,8 @@ def convert_h5_element(elt: Union[h5py.Group, h5py.Dataset], name: str): raise ValueError("The given element must be either a HDF5 Group or Dataset object.") -def convert_basic_element_with_nd_array(value, name: str = None, - internal_path: str = None, msg_prefix: str = ""): +def convert_basic_element_with_nd_array(value, name: Optional[str] = None, + internal_path: Optional[str] = None, msg_prefix: str = ""): """Convert a given object either to an ndarray structure element or to a basic scalar structure element. 
diff --git a/src/caoscrawler/identifiable.py b/src/caoscrawler/identifiable.py index c7312e12addb89c74d406bdc0e63e1e21e07e12a..f6c85c694e5ef0be7e6a9be8154a34c400bab008 100644 --- a/src/caoscrawler/identifiable.py +++ b/src/caoscrawler/identifiable.py @@ -25,7 +25,7 @@ import json import logging from datetime import datetime from hashlib import sha256 -from typing import Union +from typing import Optional, Union import linkahead as db @@ -53,9 +53,9 @@ class Identifiable(): backrefs: list, TODO future """ - def __init__(self, record_id: int = None, record_type: str = None, - name: str = None, properties: dict = None, - backrefs: list[Union[int, str]] = None): + def __init__(self, record_id: Optional[int] = None, record_type: Optional[str] = None, + name: Optional[str] = None, properties: Optional[dict] = None, + backrefs: Optional[list[Union[int, str]]] = None): if (record_id is None and name is None and (backrefs is None or len(backrefs) == 0) and (properties is None or len(properties) == 0)): @@ -80,7 +80,7 @@ class Identifiable(): def get_representation(self) -> str: return sha256(Identifiable._create_hashable_string(self).encode('utf-8')).hexdigest() - @staticmethod + @ staticmethod def _value_representation(value) -> str: """returns the string representation of property values to be used in the hash function @@ -103,7 +103,7 @@ class Identifiable(): else: raise ValueError(f"Unknown datatype of the value: {value}") - @staticmethod + @ staticmethod def _create_hashable_string(identifiable: Identifiable) -> str: """ creates a string from the attributes of an identifiable that can be hashed diff --git a/src/caoscrawler/identifiable_adapters.py b/src/caoscrawler/identifiable_adapters.py index 23883185fee90d3953c1f354b05e05e3cc7ba0d1..3aae9353cb4c0cf4d6c264616d770837d87e801e 100644 --- a/src/caoscrawler/identifiable_adapters.py +++ b/src/caoscrawler/identifiable_adapters.py @@ -89,9 +89,9 @@ class IdentifiableAdapter(metaclass=ABCMeta): - Parent record types - Properties - ``is_referenced_by`` statements - - An *identifiable* belongs to a concrete record. It consists of identifying attributes which "fill - in" the *registered identifiable*. In code, it can be represented as a Record based on the - *registered identifiable* with all the values filled in. + - An *identifiable* belongs to a concrete record. It consists of identifying attributes which + "fill in" the *registered identifiable*. In code, it can be represented as a Record based on + the *registered identifiable* with all the values filled in. - An *identified record* is the result of retrieving a record from the database, based on the *identifiable* (and its values). 
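+
+  For example (a sketch based on this repository's unit tests): a registered identifiable for the
+  ``Person`` RecordType might require a ``last_name`` property; the identifiable of one concrete
+  record then carries the actual value, e.g. ``last_name="Tom"``.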
@@ -287,6 +287,8 @@ class IdentifiableAdapter(metaclass=ABCMeta):
             if pname == "name" or pname == "is_referenced_by":
                 continue
             if record.get_property(prop.name) is None:
+                logger.error(f"Record with missing identifying property:\n{record}\n"
+                             f"This property is missing: {prop.name}\n")
                 raise RuntimeError("Missing identifying Property")
             pval = record.get_property(prop.name).value
             if not isinstance(prop.value, list):
diff --git a/src/caoscrawler/scanner.py b/src/caoscrawler/scanner.py
index fe8a6dbfdd7de93c84e8cabd28a9c0dae8b8468a..9f8f5e40beb729d73151bad38f3e390a4a8cecb4 100644
--- a/src/caoscrawler/scanner.py
+++ b/src/caoscrawler/scanner.py
@@ -62,11 +62,10 @@ def load_definition(crawler_definition_path: str):
     """
 
     # Load the cfood from a yaml file:
-    with open(crawler_definition_path, "r") as f:
+    with open(crawler_definition_path, encoding="utf-8") as f:
         crawler_definitions = list(yaml.safe_load_all(f))
 
-    crawler_definition = _load_definition_from_yaml_dict(
-        crawler_definitions)
+    crawler_definition = _load_definition_from_yaml_dict(crawler_definitions)
 
     return _resolve_validator_paths(crawler_definition, crawler_definition_path)
diff --git a/src/caoscrawler/scripts/__init__.py b/src/caoscrawler/scripts/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/src/caoscrawler/scripts/generators.py b/src/caoscrawler/scripts/generators.py
new file mode 100644
index 0000000000000000000000000000000000000000..927d4dcaf7f6123a50d30657beff1cb1b32d381e
--- /dev/null
+++ b/src/caoscrawler/scripts/generators.py
@@ -0,0 +1,221 @@
+# This file is a part of the LinkAhead Project.
+#
+# Copyright (C) 2024 IndiScale GmbH <info@indiscale.com>
+# Copyright (C) 2024 Daniel Hornung <d.hornung@indiscale.com>
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <https://www.gnu.org/licenses/>.
+
+"""Scripts and functions to generate datamodel yaml files and cfood skeletons.
+
+For example, from actual data files.
+"""
+
+import argparse
+import csv
+from collections import OrderedDict
+from string import Template
+from typing import Optional
+
+import pandas as pd
+import yaml
+
+
+DM_TEMPLATE = """# auto-generated data model from file "[]{infile}".
+# To insert a datamodel into LinkAhead, run:
+#
+# python3 -m caosadvancedtools.models.parser datamodel.yaml --sync
+"""
+
+HEADER_RT = """
+###############
+# RecordTypes #
+###############
+
+DummyRT:
+  description: "Note: Change name and enter description."
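+  # (The recommended properties below are auto-generated from the input file's columns.)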
+ recommended_properties: + """ + +CFOOD_TEMPLATE = """ +--- +metadata: + macros: + - !defmacro + # Simple column value -> property rule + name: ColumnValue + params: + name: null + belongsto: BaseElement + type: TextElement + definition: + ${name}: + type: ${type} + match_name: ^${name}$$ + match_value: (?P<val>.*) + records: + ${belongsto}: + ${name}: $$val + - !defmacro + # column value -> reference property + name: ColumnValueReference + params: + name: null + reftype: null # RecordType of the reference + belongsto: BaseElement + type: TextElement # References are always text, right? + definition: + ${name}: + type: ${type} + match_name: ^${name}$$ + match_value: (?P<val>.*) + records: + ${reftype}: + name: $$val + ${belongsto}: + ${name}: $$${reftype} + - !defmacro + # Same as "ColumnValue", but also give name of property. + name: ColumnValuePropname + params: + name: null + propname: null + belongsto: BaseElement + type: TextElement + definition: + ${name}: + type: ${type} + match_name: ^${name}$$ + match_value: (?P<val>.*) + records: + ${belongsto}: + ${propname}: $$val +--- +directory: # corresponds to the directory given to the crawler + type: Directory + match: .* # we do not care how it is named here + subtree: + # This is the file + thisfile: + type: []{file} + match: []{match} + subtree: + entry: + type: Dict + match: .* # Name is irrelevant + records: + MyParent: + subtree: !macro +""" + + +class _CustomTemplate(Template): + delimiter = "[]" # "$" is used too much by the yaml template. + + +def csv_to_datamodel(infile: str, outfile: str, cfood: Optional[str] = None): + """Parse the input csv and create basic datamodel in ``outfile``. + +Parameters +---------- +cfood: str + If given, also create a cfood skeleton. + """ + sniffer = csv.Sniffer() + with open(infile, encoding="utf-8") as f_infile: + max_sniff = 50000 + sniffed = sniffer.sniff(f_infile.read(max_sniff)) + df = pd.read_table(infile, sep=sniffed.delimiter, quotechar=sniffed.quotechar, + escapechar=sniffed.escapechar) + + properties = OrderedDict() + for colname in df.columns: + column = df[colname] + dtype: Optional[str] = "TEXT" + if pd.api.types.is_bool_dtype(column.dtype): + dtype = "BOOLEAN" + if pd.api.types.is_float_dtype(column.dtype): + dtype = "DOUBLE" + elif pd.api.types.is_integer_dtype(column.dtype): + dtype = "INTEGER" + properties[colname] = { + "datatype": dtype + } + + result = (_CustomTemplate(DM_TEMPLATE).substitute({"infile": infile}) + + HEADER_RT + + " ".join(yaml.dump(dict(properties), # from OrderedDict to dict + allow_unicode=True, + sort_keys=False).splitlines(keepends=True)) + ) + with open(outfile, encoding="utf-8", mode="w") as myfile: + myfile.write(result) + + ################# + # cfood section # + ################# + if cfood: + defs_col_value: list[str] = [] + defs_col_value_ref: list[str] = [] + prefix = " " * 14 + for name, propdef in properties.items(): + def_str = prefix + f"- name: {name}\n" + dtype = None + reftype = None + defs = defs_col_value + # Which type? 
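+            # BOOLEAN, INTEGER and DOUBLE columns get explicit element types;
+            # TEXT keeps the macro's default type (TextElement); anything else
+            # is treated as a reference and is emitted as a ColumnValueReference.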
+            if propdef["datatype"] == "BOOLEAN":
+                dtype = "BooleanElement"
+            elif propdef["datatype"] == "INTEGER":
+                dtype = "IntegerElement"
+            elif propdef["datatype"] == "DOUBLE":
+                dtype = "FloatElement"
+            elif propdef["datatype"] == "TEXT":
+                dtype = None
+            else:
+                reftype = propdef["datatype"]
+                defs = defs_col_value_ref
+
+            # Append according to types:
+            if reftype:
+                def_str += prefix + f"  reftype: {reftype}\n"
+            if dtype:
+                def_str += prefix + f"  type: {dtype}\n"
+
+            # Store result
+            defs.append(def_str)
+            del defs
+
+        cfood_str = (_CustomTemplate(CFOOD_TEMPLATE).substitute({"file": "CSVTableConverter",
+                                                                 "match": ".*\\.[ct]sv"})
+                     + prefix[2:] + "ColumnValue:\n" + "".join(defs_col_value)
+                     + prefix[2:] + "ColumnValueReference:\n" + "".join(defs_col_value_ref)
+                     )
+        with open(cfood, encoding="utf-8", mode="w") as myfile:
+            myfile.write(cfood_str)
+
+
+def _parse_args_csv():
+    """Parse the arguments."""
+    parser = argparse.ArgumentParser(description="Create datamodel and cfood from CSV files.")
+    parser.add_argument('-i', '--input', help="The input file.", required=True, dest="infile")
+    parser.add_argument('-o', '--outfile', help="Yaml filename to save the result", required=True)
+    parser.add_argument('--cfood', help="Yaml filename to create cfood output in", required=False)
+
+    return parser.parse_args()
+
+
+def csv_to_datamodel_main():
+    """The main function for csv data handling."""
+    args = _parse_args_csv()
+    csv_to_datamodel(**vars(args))
diff --git a/src/caoscrawler/structure_elements.py b/src/caoscrawler/structure_elements.py
index ff070626ebfdd580c16bbbf2dc30ab330dc162f0..0efba91c185446e0bfbecbbb53f68aaa8a8e15d1 100644
--- a/src/caoscrawler/structure_elements.py
+++ b/src/caoscrawler/structure_elements.py
@@ -23,7 +23,6 @@
 # ** end header
 #
 
-from typing import Dict as tDict
 import warnings
@@ -39,7 +38,7 @@ name: str
 
     def __init__(self, name: str):
         # Used to store usage information for debugging:
-        self.metadata: tDict[str, set[str]] = {
+        self.metadata: dict[str, set[str]] = {
             "usage": set()
         }
diff --git a/src/caoscrawler/utils.py b/src/caoscrawler/utils.py
index c62f44eeaa75ca42579aa3d6ead437e901cd38ff..096fde9b573f4ff60995498144cad3589ce7dbb2 100644
--- a/src/caoscrawler/utils.py
+++ b/src/caoscrawler/utils.py
@@ -25,6 +25,9 @@
 
 # Some utility functions, e.g. for extending pylib.
 
+import sys
+from typing import Optional
+
 import linkahead as db
@@ -39,3 +42,30 @@ def has_parent(entity: db.Entity, name: str):
         if parent.name == name:
             return True
     return False
+
+
+def MissingImport(name: str, hint: str = "", err: Optional[Exception] = None) -> type:
+    """Factory for dummy classes, which may be assigned to variables but never used."""
+    def _error():
+        error_msg = f"This class ({name}) cannot be used, because some libraries are missing."
+        if hint:
+            error_msg += "\n\n" + hint
+
+        if err:
+            print(error_msg, file=sys.stdout)
+            raise RuntimeError(error_msg) from err
+        raise RuntimeError(error_msg)
+
+    class _Meta(type):
+        def __getattribute__(cls, *args, **kwargs):
+            _error()
+
+        def __call__(cls, *args, **kwargs):
+            _error()
+
+    class _DummyClass(metaclass=_Meta):
+        pass
+
+    _DummyClass.__name__ = name
+
+    return _DummyClass
diff --git a/src/caoscrawler/version.py b/src/caoscrawler/version.py
index fdc8323452cd190cc3628efa57c15992f30fabeb..0b72dd65116fbc102a4dc2492d726698cad5a13b 100644
--- a/src/caoscrawler/version.py
+++ b/src/caoscrawler/version.py
@@ -17,11 +17,7 @@
 # You should have received a copy of the GNU Affero General Public License
 # along with this program. If not, see <https://www.gnu.org/licenses/>.
 #
-try:
-    from importlib import metadata as importlib_metadata
-except ImportError:  # Python<3.8 dowesn"t support this so use
-    import importlib_metadata
-
+from importlib import metadata as importlib_metadata
 from packaging.version import parse as parse_version
 from warnings import warn
 
@@ -43,7 +39,7 @@ def check_cfood_version(metadata: dict):
 
     if not metadata or "crawler-version" not in metadata:
         msg = """
-No crawler version specified in cfood definition, so there is now guarantee that
+No crawler version specified in cfood definition, so there is no guarantee that
 the cfood definition matches the installed crawler version.
 
 Specifying a version is highly recommended to ensure that the definition works
diff --git a/src/doc/concepts.rst b/src/doc/concepts.rst
index 7100bcd1790edb3e040a1a90663a32a09b7c8eaf..770731857112b93205f0e80d623fa9183c4aa885 100644
--- a/src/doc/concepts.rst
+++ b/src/doc/concepts.rst
@@ -1,3 +1,4 @@
+========
 Concepts
 ========
 
@@ -5,6 +6,10 @@ The CaosDB Crawler can handle any kind of hierarchical data structure. The typic
 directory tree that is traversed. We use the following terms/concepts to describe how the CaosDB
 Crawler works.
 
+Basics
+======
+
+
 Structure Elements
 ++++++++++++++++++
 
@@ -29,7 +34,7 @@ existing StructureElements, Converters create a tree of StructureElements.
 .. image:: img/converter.png
   :height: 170
 
-See :std:doc:`converters<converters>` for details.
+See the chapter :std:doc:`Converters<converters>` for details.
 
 Relevant sources in:
@@ -183,8 +188,7 @@ TODO
 Caching
 +++++++
 
-The Crawler uses the cached library function ``cached_get_entity_by``. The
-cache is cleared automatically, when the Crawler does updates, but if you would
-run the same Python process indefinetely the Crawler would not see changes due
-to the Cache. Thus, please make sure to clear the cache if you create long
-running Python processes.
+The Crawler uses the cached library function ``cached_get_entity_by``. The cache is cleared
+automatically when the Crawler does updates, but if you ran the same Python process indefinitely,
+the Crawler would not see changes in LinkAhead due to the cache. Thus, please make sure to clear the
+cache if you create long running Python processes.
diff --git a/src/doc/conf.py b/src/doc/conf.py
index 3cce99d03728d229c848ba6374d15de9fe73ec7b..3248726ed63dd80fdee7c06da3c27caace93f22c 100644
--- a/src/doc/conf.py
+++ b/src/doc/conf.py
@@ -53,6 +53,7 @@ extensions = [
     'sphinx.ext.autosectionlabel',
     'sphinx.ext.intersphinx',
     'sphinx.ext.napoleon',  # For Google style docstrings
+    "sphinx.ext.todo",
     "recommonmark",  # For markdown files.
     "sphinx_rtd_theme",
 ]
@@ -213,6 +214,10 @@ intersphinx_mapping = {
 
 # TODO Which options do we want?
 autodoc_default_options = {
-    'members': None,
-    'undoc-members': None,
+    'members': True,
+    'undoc-members': True,
+    'member-order': 'bysource',
+    'special-members': ["__init__"],
 }
+
+todo_include_todos = True
diff --git a/src/doc/converters.rst b/src/doc/converters.rst
index 9b28c9a61eec4d9707b9640720b9c6a44a8fe25e..d7e11c235fafa1e42f53342a24255ceb0d275ed4 100644
--- a/src/doc/converters.rst
+++ b/src/doc/converters.rst
@@ -8,10 +8,6 @@ existing StructureElements, Converters create a tree of StructureElements.
 .. image:: img/converter.png
   :height: 170
 
-The ``cfood.yml`` definition also describes which
-Converters shall be used to treat the generated child StructureElements. The
-definition therefore itself also defines a tree.
- Each StructureElement in the tree has a set of properties, organized as key-value pairs. Some of those properties are specified by the type of StructureElement. For example, @@ -19,15 +15,18 @@ a file could have the file name as property: ``'filename': myfile.dat``. Converters may define additional functions that create further values. For example, a regular expression could be used to get a date from a file name. +CFood definition +++++++++++++++++ -A converter is defined via a yml file or part of it. The definition states -what kind of StructureElement it treats (typically one). -Also, it defines how children of the current StructureElement are -created and what Converters shall be used to treat those. +Converter application to data is specified via a tree-like yml file (called ``cfood.yml``, by +convention). The yml file specifies which Converters shall be used on which StructureElements, and +how to treat the generated *child* StructureElements. The yaml definition may look like this: -TODO: outdated, see cfood-schema.yml +.. todo:: + + This is outdated, see ``cfood-schema.yml`` for the current specification of a ``cfood.yml``. .. code-block:: yaml @@ -47,13 +46,18 @@ TODO: outdated, see cfood-schema.yml subtree: (...) -The **<NodeName>** is a description of what it represents (e.g. -'experiment-folder') and is used as identifier. +The **<NodeName>** is a description of what the current block represents (e.g. +``experiment-folder``) and is used as an identifier. **<type>** selects the converter that is going to be matched against the current structure element. If the structure element matches (this is a combination of a typecheck and a detailed -match, see :py:class:`~caoscrawler.converters.Converter` for details) the converter is used -to generate records (see :py:meth:`~caoscrawler.converters.Converter.create_records`) and to possibly process a subtree, as defined by the function :func:`caoscrawler.converters.create_children`. +match, see the :py:class:`~caoscrawler.converters.Converter` source documentation for details), the +converter will: + +- generate records (with :py:meth:`~caoscrawler.converters.Converter.create_records`) +- possibly process a subtree (with :py:meth:`caoscrawler.converters.Converter.create_children`) + +**match** *TODO* **records** is a dict of definitions that define the semantic structure (see details below). @@ -151,6 +155,9 @@ The following StructureElement types are typically created by the DictElement co - ListElement - DictElement +Note that you may use ``TextElement`` for anything that exists in a text format that can be +interpreted by the server, such as date and datetime strings in ISO-8601 format. + Scalar Value Converters ======================= `BooleanElementConverter`, `FloatElementConverter`, `TextElementConverter`, and @@ -253,13 +260,13 @@ HDF5 Converters For treating `HDF5 Files <https://docs.hdfgroup.org/hdf5/develop/_s_p_e_c.html>`_, there are in total -four individual converters corresponding to the internal structure of HDF5 files: -the :ref:`H5FileConverter` which opens the file itself and creates further -structure elements from HDF5 groups, datasets, and included multi-dimensional -arrays that are in turn treated by the :ref:`H5GroupConverter`, the -:ref:`H5DatasetConverter`, and the :ref:`H5NdarrayConverter`, respectively. You -need to install the LinkAhead crawler with its optional ``h5crawler`` dependency -for using these converters. 
+four individual converters corresponding to the internal structure of HDF5 +files: the :ref:`H5FileConverter` which opens the file itself and creates +further structure elements from HDF5 groups, datasets, and included +multi-dimensional arrays that are in turn treated by the +:ref:`H5GroupConverter`, the :ref:`H5DatasetConverter`, and the +:ref:`H5NdarrayConverter`, respectively. You need to install the LinkAhead +crawler with its optional ``h5-crawler`` dependency for using these converters. The basic idea when crawling HDF5 files is to treat them very similar to :ref:`dictionaries <DictElement Converter>` in which the attributes on root, diff --git a/src/doc/macros.rst b/src/doc/macros.rst index d093d9b69f5d2c14b5bfbb2fe292545fc7943ca7..3a234973ee17791aaa2a0bd9e4b81836207a07e0 100644 --- a/src/doc/macros.rst +++ b/src/doc/macros.rst @@ -1,6 +1,9 @@ Macros ------ +Introduction +============ + Macros highly facilitate the writing of complex :doc:`CFoods<cfood>`. Consider the following common example: @@ -83,16 +86,46 @@ The expanded version of `ExperimentalData` will look like: This :ref:`example<example_files_2>` can also be found in the macro unit tests (see :func:`unittests.test_macros.test_documentation_example_2`). -Complex Example -=============== -The following, more complex example, demonstrates the use -of macro variable substitutions that generate crawler variable substitutions: +Mixing macros and plain definitions +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +You can also mix macros and plain definitions. Whenever a name cannot be resolved to a macro, a +plain yaml node definition is used as a fallback: + +.. code:: yaml + + --- + metadata: + macros: + - !defmacro + name: MarkdownFile + # ... Definition here ... + --- + ExperimentalData: + type: Directory + match: ExperimentalData + subtree: !macro + MarkdownFile: + - name: README + filename: ^README.md$ + OtherContent: # There is no macro named "OtherContent", so this is parsed as normal content. + type: SimpleFile + match: .*txt + records: + # ... Normal content ... + -- `$$$nodename` will lead to a macro variable substitution of variable `$nodename` during macro expansion. -- `$$` will be turned into `$` -- So in the crawler cfood, the string will appear as `$value` if variable `nodename` would be set to `value` when using the macro. +Complex example +=============== + +Let's try something more complex: what happens to multiple ``$``? This example demonstrates the use +of `macro` variable substitutions to generate `crawler` variable substitutions: +- ``$$`` will be converted into ``$``. +- ``$$$nodename`` will retain a single ``$`` and substitute ``$nodename`` during macro expansion. +- So in the cfood, if ``nodename: value``, the string ``$$$nodename`` will be converted to + ``$value``. .. _example_1: .. code-block:: yaml @@ -118,7 +151,8 @@ of macro variable substitutions that generate crawler variable substitutions: Simulation: $recordtype: +$File -The expanded version of :ref:`example<example_1>` can be seen in :ref:`example<example_1_expanded>`. +The expanded version of the :ref:`example above<example_1>` (with ``nodename: Dataset``) can be seen +:ref:`here<example_1_expanded>`: .. _example_1_expanded: @@ -141,11 +175,11 @@ The expanded version of :ref:`example<example_1>` can be seen in :ref:`example<e type: SimpleFile type: Directory -This :ref:`example<example_1>` can also be found in the macro unit tests (see :func:`unittests.test_macros.test_documentation_example_1`). 
- +This example can also be found in the macro unit tests (see +:func:`unittests.test_macros.test_documentation_example_1`). -Using Macros Multiple Times +Using macros multiple times =========================== To use the same macro multiple times in the same yaml node, lists can be used: @@ -198,11 +232,11 @@ use the same top level key. Because later versions would overwrite previous ones. Here we used ``$macro_name`` to prevent that. -Limitation -========== +Limitations +=========== -Currently it is not possible to use the same macro twice in the same yaml node, but in different -positions. Consider: +Currently it is not possible to use the same macro twice in the same yaml node, if it occurs in +different positions. Consider: .. _example_multiple_limitation: .. code-block:: yaml @@ -227,14 +261,13 @@ positions. Consider: Other_node: type: test - test_twice: # This is NOT possible as each - # dictionary element can only appear once in a yaml node. + test_twice: # This is NOT possible as each key + # can only appear once in a yaml node. - macro_name: twice # <- This is the second one, with different arguments a: 5 - {} # <- This is the third one, just using default arguments -However, this should not be a real limitation, as the crawler is designed in a way, -that the order of the nodes in the same level should not matter. +This should not be a real limitation however, as the order of nodes does not matter for the crawler. Using macros within macro definitions diff --git a/src/doc/tutorials/parameterfile.rst b/src/doc/tutorials/parameterfile.rst index 9369ba8b83df8c484a4af8f240e1a1de2f4c10fb..2442969541eebf9a4e058b797b48995b39372a3e 100644 --- a/src/doc/tutorials/parameterfile.rst +++ b/src/doc/tutorials/parameterfile.rst @@ -88,6 +88,10 @@ regular expressions do: We can use the groups from the regular expressions that are used for matching. In our example, we use the "value" group to assign the "frequency" value to the "Experiment". +.. note:: + + For more information on the ``cfood.yml`` specification, read on in the chapter :ref:`Converters`. + A fully grown CFood ------------------- @@ -148,4 +152,6 @@ the CFood file is in the current working directory): caosdb-crawler -s update -i identifiables.yml cfood.yml . +.. note:: + ``caosdb-crawler`` currently only works with cfoods which have a directory as top level element. diff --git a/tox.ini b/tox.ini index 36807a619a9536a02908c36364e02ba52c1a0d69..41249e4277391c5ffa4ec13fc4da1a6ee1f48491 100644 --- a/tox.ini +++ b/tox.ini @@ -3,15 +3,14 @@ envlist = py38, py39, py310, py311, py312, py313 skip_missing_interpreters = true [testenv] -deps = . +deps = .[h5-crawler,spss] pytest pytest-cov - h5py # TODO: Make this f-branch sensitive git+https://gitlab.indiscale.com/caosdb/src/caosdb-pylib.git@dev git+https://gitlab.indiscale.com/caosdb/src/caosdb-advanced-user-tools.git@dev commands = caosdb-crawler --help - py.test --cov=caosdb -vv {posargs} + py.test --cov=caoscrawler -vv {posargs} [flake8] max-line-length = 100 diff --git a/unittests/test_converters.py b/unittests/test_converters.py index 52ece13dc2269a3e3b16e6378166e91b084f4a7c..2f62ef9216974bc4939667c0cb28971044c1f80c 100644 --- a/unittests/test_converters.py +++ b/unittests/test_converters.py @@ -3,8 +3,9 @@ # # This file is a part of the CaosDB Project. 
 #
-# Copyright (C) 2021,2022 Indiscale GmbH <info@indiscale.com>
+# Copyright (C) 2021-2024 Indiscale GmbH <info@indiscale.com>
 # Copyright (C) 2021,2022 Henrik tom Wörden <h.tomwoerden@indiscale.com>
+# Copyright (C) 2024 Daniel Hornung <d.hornung@indiscale.com>
 #
 # This program is free software: you can redistribute it and/or modify
 # it under the terms of the GNU Affero General Public License as
@@ -140,7 +141,7 @@ def test_markdown_converter(converter_registry):
     converter = MarkdownFileConverter({"match": "(.*)"}, "TestMarkdownFileConverter",
                                       converter_registry)
 
-    with pytest.raises(ConverterValidationError) as err:
+    with pytest.raises(ConverterValidationError):
         converter.create_children(None, File("test_tool.py", UNITTESTDIR / "test_crawler.py"))
 
     m = converter.match(test_readme)
@@ -632,7 +633,7 @@ def test_load_converters():
     # converter classes can be loaded from their respective packages.
 
     # Please adapt, if defaults change!
-    assert len(converter_registry) == 22
+    assert len(converter_registry) == 23
 
     # All of them are contained in caoscrawler.converters
     for conv_key, conv in converter_registry.items():
diff --git a/unittests/test_crawler.py b/unittests/test_crawler.py
index b50dfb97c1adafb1de58a4aeea99f61c4ab87142..e7a03e3322da0d937bf3c1330f21b90768b478d8 100644
--- a/unittests/test_crawler.py
+++ b/unittests/test_crawler.py
@@ -324,7 +324,7 @@ def test_remove_unnecessary_updates():
 def test_split_into_inserts_and_updates_trivial():
     crawler = Crawler()
     st = SyncGraph([], crawler.identifiableAdapter)
-    crawler.split_into_inserts_and_updates(st)
+    crawler._split_into_inserts_and_updates(st)
 
 
 def test_split_into_inserts_and_updates_simple(crawler_mocked_identifiable_retrieve):
@@ -338,7 +338,7 @@ def test_split_into_inserts_and_updates_simple(crawler_mocked_identifiable_retri
     st = SyncGraph(entlist, crawler.identifiableAdapter)
 
     # check setup
-    insert, update = crawler.split_into_inserts_and_updates(st)
+    insert, update = crawler._split_into_inserts_and_updates(st)
     assert len(insert) == 1
     assert insert[0].name == "B"
     assert len(update) == 1
@@ -361,7 +361,7 @@ def test_split_into_inserts_and_updates_with_circ(crawler_mocked_identifiable_re
 
     st = SyncGraph([a, b], crawler.identifiableAdapter)
     with pytest.raises(RuntimeError):
-        crawler.split_into_inserts_and_updates(st)
+        crawler._split_into_inserts_and_updates(st)
 
 
 def test_split_into_inserts_and_updates_with_complex(crawler_mocked_identifiable_retrieve):
@@ -380,7 +380,7 @@ def test_split_into_inserts_and_updates_with_complex(crawler_mocked_identifiable
     b.add_property("C", f)
     entlist = [a, b, g]
     st = SyncGraph(entlist, crawler.identifiableAdapter)
-    insert, update = crawler.split_into_inserts_and_updates(st)
+    insert, update = crawler._split_into_inserts_and_updates(st)
     assert len(insert) == 3
     assert "B" in [el.name for el in insert]
     assert len(update) == 1
@@ -478,7 +478,7 @@ a: ([b1, b2])
     # The Cs cannot be merged due to different identifying properties
     # The Bs cannot be merged due to different references to Cs
     with raises(ImpossibleMergeError) as rte:
-        crawler.split_into_inserts_and_updates(st)
+        crawler._split_into_inserts_and_updates(st)
     # TODO
     # assert not isinstance(rte.value, NotImplementedError), \
     #     "Exception must not be NotImplementedError, but plain RuntimeError."
@@ -508,7 +508,7 @@ def test_split_into_inserts_and_updates_backref(crawler_mocked_for_backref_test)
     # one can be found remotely, one not
 
     # check the split...
-    insert, update = crawler.split_into_inserts_and_updates(st)
+    insert, update = crawler._split_into_inserts_and_updates(st)
     # A was found remotely and is therefore in the update list
     assert len(update) == 1
     assert update[0].name == "A"
@@ -537,7 +537,7 @@ def test_split_into_inserts_and_updates_mult_backref(crawler_mocked_for_backref_
     assert len(identifiable.backrefs) == 2
 
     # check the split...
-    insert, update = crawler.split_into_inserts_and_updates(st)
+    insert, update = crawler._split_into_inserts_and_updates(st)
     assert len(update) == 2
     assert len(insert) == 1
 
@@ -562,7 +562,7 @@ def test_split_into_inserts_and_updates_diff_backref(crawler_mocked_for_backref_
     assert len(identifiable.backrefs) == 2
 
     # check the split...
-    insert, update = crawler.split_into_inserts_and_updates(st)
+    insert, update = crawler._split_into_inserts_and_updates(st)
     assert len(update) == 2
     assert len(insert) == 1
 
diff --git a/unittests/test_identifiable_adapters.py b/unittests/test_identifiable_adapters.py
index bdcfeacb6dea514ad689156bf2f61e712c665a4e..e37c1ad4953880f988bb1efc3f6804766805b4ee 100644
--- a/unittests/test_identifiable_adapters.py
+++ b/unittests/test_identifiable_adapters.py
@@ -125,10 +125,11 @@ def test_load_from_yaml_file():
 
 def test_non_default_name():
     ident = CaosDBIdentifiableAdapter()
     identifiable = ident.get_identifiable(SyncNode(db.Record(name="don't touch it")
-                                          .add_parent("Person")
-                                          .add_property(name="last_name", value='Tom'), db.RecordType()
-                                          .add_parent(name="Person")
-                                          .add_property(name="last_name")), [])
+                                                   .add_parent("Person")
+                                                   .add_property(name="last_name", value='Tom'),
+                                                   db.RecordType()
+                                                   .add_parent(name="Person")
+                                                   .add_property(name="last_name")), [])
     assert identifiable.name is None
 
 
diff --git a/unittests/test_issues.py b/unittests/test_issues.py
index e6fe06efcd055945b9a13576b83f6bf470eaccdd..1678280555e739bae55819fa7fe42a53c938c4e5 100644
--- a/unittests/test_issues.py
+++ b/unittests/test_issues.py
@@ -24,7 +24,6 @@ from pytest import mark
 
 from caoscrawler.converters import replace_variables, CrawlerTemplate
 from caoscrawler.crawl import Crawler
-
 from caoscrawler.structure_elements import DictElement
 from caoscrawler.stores import GeneralStore
 from caoscrawler.scanner import create_converter_registry, scan_structure_elements
diff --git a/unittests/test_spss_converter.py b/unittests/test_spss_converter.py
new file mode 100644
index 0000000000000000000000000000000000000000..7ffc18dba43a6f7cd3c9fbc9273da349b4ec3c6e
--- /dev/null
+++ b/unittests/test_spss_converter.py
@@ -0,0 +1,83 @@
+# This file is a part of the LinkAhead Project.
+#
+# Copyright (C) 2024 IndiScale GmbH <info@indiscale.com>
+# Copyright (C) 2024 Daniel Hornung <d.hornung@indiscale.com>
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <https://www.gnu.org/licenses/>.
+
+"""Testing converter for SPSS files."""
+
+import datetime
+import importlib
+import re
+from pathlib import Path
+
+import numpy as np
+import pytest
+
+from caoscrawler.converters import (
+    ConverterValidationError,
+    SPSSConverter,
+)
+from caoscrawler.structure_elements import (BooleanElement, DictElement,
+                                            Directory, File, FloatElement,
+                                            IntegerElement, ListElement,
+                                            TextElement)
+
+UNITTESTDIR = Path(__file__).parent
+
+
+@pytest.fixture
+def converter_registry():
+    converter_registry: dict[str, dict[str, str]] = {
+        "Directory": {
+            "converter": "DirectoryConverter",
+            "package": "caoscrawler.converters"},
+    }
+
+    for key, value in converter_registry.items():
+        module = importlib.import_module(value["package"])
+        value["class"] = getattr(module, value["converter"])
+    return converter_registry
+
+
+def test_spss_converter(converter_registry):
+    converter = SPSSConverter({
+        "match": ("sample.sav")
+    },
+        "ThisConverterNameIsIrrelevant", converter_registry
+    )
+
+    spss_dir = UNITTESTDIR / "test_tables" / "spss"
+    for sav_file, length, thistype in [
+            (File("sample.sav", spss_dir / "sample.sav"), 5, str),
+            (File("sample.sav", spss_dir / "sample_large.sav"), 485, int),
+    ]:
+        m = converter.match(sav_file)
+        assert m is not None
+        assert len(m) == 0
+
+        children = converter.create_children(None, sav_file)
+        assert len(children) == length
+
+        for ii, child in enumerate(children):
+            assert child.__class__ == DictElement
+            assert child.name == str(ii)
+            my_dict = child.value
+            assert isinstance(my_dict["mychar"], str)
+            assert isinstance(my_dict["mydate"], datetime.date) or np.isnan(my_dict["mydate"])
+            assert isinstance(my_dict["dtime"], datetime.datetime) or np.isnan(my_dict["dtime"])
+            assert isinstance(my_dict["mytime"], datetime.time) or np.isnan(my_dict["mytime"])
+            assert isinstance(my_dict["mylabl"], thistype), f"{type(my_dict['mylabl'])}"
+            assert isinstance(my_dict["myord"], thistype), f"{type(my_dict['myord'])}"
diff --git a/unittests/test_sync_graph.py b/unittests/test_sync_graph.py
index 2c63cb54aceeaef98df36630ba0873cd62ebf7e3..a7c1539118a4cd87d8c46bf6e18b07b90a90361a 100644
--- a/unittests/test_sync_graph.py
+++ b/unittests/test_sync_graph.py
@@ -18,6 +18,8 @@
 # You should have received a copy of the GNU Affero General Public License
 # along with this program. If not, see <https://www.gnu.org/licenses/>.
 #
+
+import logging
 from functools import partial
 from unittest.mock import MagicMock, Mock, patch
 
@@ -631,8 +633,8 @@ def test_detect_circular_dependency(crawler_mocked_identifiable_retrieve, caplog
     assert Crawler.detect_circular_dependency([d]) is None
 
     st = SyncGraph(flat, crawler.identifiableAdapter)
-    with raises(RuntimeError):
-        _, _ = crawler.split_into_inserts_and_updates(st)
+    with pytest.raises(RuntimeError):
+        _, _ = crawler._split_into_inserts_and_updates(st)
     caplog.set_level(logging.ERROR, logger="caoscrawler.converters")
     assert "Found circular dependency" in caplog.text
     assert "\n--------\n\n> Parent: C\n\n>> Name: a\n['C']" in caplog.text
diff --git a/unittests/test_tables/spss/CITATION.cff b/unittests/test_tables/spss/CITATION.cff
new file mode 100644
index 0000000000000000000000000000000000000000..140fcc071bf2d5f5709cf31bf11bd9676b81ca5f
--- /dev/null
+++ b/unittests/test_tables/spss/CITATION.cff
@@ -0,0 +1,11 @@
+cff-version: 1.2.0
+message: "If you use this software, please cite it as below."
+authors:
+- family-names: "Fajardo"
+  given-names: "Otto"
+  orcid: "https://orcid.org/0000-0002-3363-9287"
+title: "Pyreadstat"
+version: 1.2.7
+doi: 10.5281/zenodo.6612282
+date-released: 2018-09-24
+url: "https://github.com/Roche/pyreadstat"
diff --git a/unittests/test_tables/spss/LICENSE b/unittests/test_tables/spss/LICENSE
new file mode 100644
index 0000000000000000000000000000000000000000..a2f94b1a2a5a4255fc8ef6d0beb94cce89f545e8
--- /dev/null
+++ b/unittests/test_tables/spss/LICENSE
@@ -0,0 +1,210 @@
+Test data files were copied from [pyreadstat](https://github.com/Roche/pyreadstat); they are
+licensed under the Apache License, cited below.
+
+Copyright (C) 2018-2024 Otto Fajardo
+Copyright (C) 2024 Indiscale GmbH <info@indiscale.com>
+Copyright (C) 2024 Daniel Hornung <d.hornung@indiscale.com>
+
+pyreadstat license:
+---------------------------------------------------------------------------
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner.
+      For the purposes of this definition, "submitted" means any form of
+      electronic, verbal, or written communication sent to the Licensor or
+      its representatives, including but not limited to communication on
+      electronic mailing lists, source code control systems, and issue
+      tracking systems that are managed by, or on behalf of, the Licensor
+      for the purpose of discussing and improving the Work, but excluding
+      communication that is conspicuously marked or otherwise designated
+      in writing by the copyright owner as "Not a Contribution."
+
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+
+   END OF TERMS AND CONDITIONS
+
+   APPENDIX: How to apply the Apache License to your work.
+
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!) The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+
+   Copyright [yyyy] [name of copyright owner]
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
\ No newline at end of file
diff --git a/unittests/test_tables/spss/sample.sav b/unittests/test_tables/spss/sample.sav
new file mode 100644
index 0000000000000000000000000000000000000000..20d0c5ce6689a60adfa329a17b4347274e9a863b
Binary files /dev/null and b/unittests/test_tables/spss/sample.sav differ
diff --git a/unittests/test_tables/spss/sample_large.sav b/unittests/test_tables/spss/sample_large.sav
new file mode 100644
index 0000000000000000000000000000000000000000..b0c16c1390a15a4f62a859ade76aa17b89c6ae40
Binary files /dev/null and b/unittests/test_tables/spss/sample_large.sav differ
diff --git a/unittests/test_utilities.py b/unittests/test_utilities.py
index 5a80ab9b230db4540d741bf8fa4f9d11b5158aab..dfb79c8b6b10909952174cf24c3aa9198f3b7743 100644
--- a/unittests/test_utilities.py
+++ b/unittests/test_utilities.py
@@ -19,7 +19,10 @@
 # along with this program. If not, see <https://www.gnu.org/licenses/>.
 #
 
+import pytest
+
 from caoscrawler.crawl import split_restricted_path
+from caoscrawler.utils import MissingImport
 
 
 def test_split_restricted_path():
@@ -33,3 +36,33 @@ def test_split_restricted_path():
     assert split_restricted_path("/test//bla") == ["test", "bla"]
     assert split_restricted_path("//test/bla") == ["test", "bla"]
     assert split_restricted_path("///test//bla////") == ["test", "bla"]
+
+
+def test_dummy_class():
+    Missing = MissingImport(name="Not Important", hint="Do the thing instead.")
+    with pytest.raises(RuntimeError) as err_info_1:
+        print(Missing.__name__)
+    with pytest.raises(RuntimeError) as err_info_2:
+        Missing()
+    with pytest.raises(RuntimeError) as err_info_3:
+        print(Missing.foo)
+
+    for err_info in (err_info_1, err_info_2, err_info_3):
+        msg = str(err_info.value)
+        assert "(Not Important)" in msg
+        assert msg.endswith("Do the thing instead.")
+
+    MissingErr = MissingImport(name="Not Important", hint="Do the thing instead.",
+                               err=ImportError("Old error"))
+    with pytest.raises(RuntimeError) as err_info_1:
+        print(MissingErr.__name__)
+    with pytest.raises(RuntimeError) as err_info_2:
+        MissingErr()
+    with pytest.raises(RuntimeError) as err_info_3:
+        print(MissingErr.foo)
+
+    for err_info in (err_info_1, err_info_2, err_info_3):
+        msg = str(err_info.value)
+        assert "(Not Important)" in msg
+        orig_msg = str(err_info.value.__cause__)
+        assert orig_msg == "Old error"
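
Note on the `MissingImport` helper exercised by `test_dummy_class` above: the new tests pin down
its contract. Any attribute access or call on the placeholder must raise a `RuntimeError` whose
message contains the placeholder's name in parentheses and ends with the hint, and a passed-in
`ImportError` must be chained as the exception's `__cause__`. The shipped implementation lives in
`caoscrawler.utils` and is not part of this diff; the following is only a minimal sketch that would
satisfy those assertions, with everything except the `MissingImport` name chosen here for
illustration.

    from typing import Optional


    class MissingImport:
        """Sketch of a placeholder for a class whose import failed (not the shipped code)."""

        def __init__(self, name: str, hint: str = "", err: Optional[ImportError] = None):
            # Write directly to the instance dict so that lookups of these
            # attributes succeed later and do not trigger __getattr__.
            object.__setattr__(self, "_name", name)
            object.__setattr__(self, "_hint", hint)
            object.__setattr__(self, "_err", err)

        def _fail(self) -> None:
            # Message format mirrors the test assertions: the name appears in
            # parentheses and the message ends with the hint.
            msg = f"This class ({self._name}) cannot be used because an import failed. {self._hint}"
            if self._err is not None:
                raise RuntimeError(msg) from self._err  # chained error becomes __cause__
            raise RuntimeError(msg)

        def __getattr__(self, attr: str):
            # Fires for any attribute not set in __init__, including __name__ and foo.
            self._fail()

        def __call__(self, *args, **kwargs):
            self._fail()

Failing lazily at first use rather than at import time is what lets `import caoscrawler` succeed
even when an optional dependency (such as the `spss` extra for `SPSSConverter`) is not installed,
while still pointing the user at the missing extra as soon as the placeholder is touched.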