diff --git a/CHANGELOG.md b/CHANGELOG.md index a6382f9c6050c78d974ce954dbd3efa9d6c2d5de..e9185233190dbfe2a9bf3b052af12af9f4cd9116 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,34 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## [0.11.0] - 2024-07-09 ## + +### Added ### + +- XLSX handling: conversion from XLSX to JSON +- `linkahead-loadfiles` is now also installed as a standalone script + +### Changed ### + +- The `TableImporter` and its subclasses now change all integer datatypes to the + nullable `pandas.Int64Dtype` so that integer columns with empty fields can be + treated properly. In case you don't want the datatypes to be changed + automatically, initialize the `TableImporter` with + `convert_int_to_nullable_int=False`. + +### Fixed ### + +- Blacklisted a buggy openpyxl version +- [#62](https://gitlab.com/linkahead/linkahead-advanced-user-tools/-/issues/62) + The `TableImporter` now handles empty fields in integer columns by supporting + the corresponding [nullable integer + types](https://pandas.pydata.org/docs/user_guide/integer_na.html) in Pandas. + +### Documentation ### + +- `loadFiles` now has better `-h` documentation +- Rudimentary documentation for `table_importer` module + ## [0.10.0] - 2024-04-24 ## ### Added ### diff --git a/CITATION.cff b/CITATION.cff index 6f7b69b665ada694ec5756fd0a6bb16c490c23a5..cb8d326b2b21a51398e21c7945cf1aa0a626999f 100644 --- a/CITATION.cff +++ b/CITATION.cff @@ -20,6 +20,6 @@ authors: given-names: Stefan orcid: https://orcid.org/0000-0001-7214-8125 title: CaosDB - Advanced User Tools -version: 0.10.0 +version: 0.11.0 doi: 10.3390/data4020083 -date-released: 2024-04-24 \ No newline at end of file +date-released: 2024-07-09 \ No newline at end of file diff --git a/Makefile b/Makefile index 724d9e5a4f06cb3a2880df9016adfe527314c338..26f5c8182545b2a57b3921f3745d16ff6305a0cc 100644 --- a/Makefile +++ b/Makefile @@ -41,5 +41,5 @@ style: .PHONY: style lint: - pylint --unsafe-load-any-extension=y --fail-under=9.59 -d R,C --ignore=swagger_client src/caosadvancedtools + pylint --unsafe-load-any-extension=y --fail-under=9.72 -d R,C --ignore=swagger_client src/caosadvancedtools .PHONY: lint diff --git a/README_SETUP.md b/README_SETUP.md index bf4f25d92106c19cccc276389b6c97aa22904923..3a7f0197a4b06694c7ae787d0baa6e8a89de0e5e 100644 --- a/README_SETUP.md +++ b/README_SETUP.md @@ -64,6 +64,7 @@ Build documentation in `build/` with `make doc`. - `sphinx` - `sphinx-autoapi` +- `sphinx-rtd-theme` - `recommonmark >= 0.6.0` ### How to contribute ### diff --git a/setup.py b/setup.py index 06133d5bb93e35b6f33240e544cae2b09ce2cc08..acc3a8de96acf27a553a696175d99c708ac85dd9 100755 --- a/setup.py +++ b/setup.py @@ -46,7 +46,7 @@ from setuptools import find_packages, setup ######################################################################## MAJOR = 0 -MINOR = 10 +MINOR = 11 MICRO = 0 PRE = "" # e.g. 
rc0, alpha.1, 0.beta-23 ISRELEASED = True @@ -159,17 +159,25 @@ def setup_package(): "jsonref", "jsonschema[format]>=4.4.0", "numpy>=1.24.0, < 2", - "openpyxl>=3.0.7", + "openpyxl>=3.1.2,!=3.1.3", "pandas>=1.2.0", "xlrd>=2.0", ], extras_require={"h5-crawler": ["h5py>=3.3.0", ], "gitignore-parser": ["gitignore-parser >=0.1.0", ], }, + setup_requires=["pytest-runner>=2.0,<3dev"], + tests_require=["pytest", + "pytest-pythonpath", + "pytest-cov", + "coverage>=4.4.2", + ], packages=find_packages('src'), package_dir={'': 'src'}, - setup_requires=["pytest-runner>=2.0,<3dev"], - tests_require=["pytest", "pytest-pythonpath", "pytest-cov", "coverage>=4.4.2"], + entry_points={"console_scripts": [ + "linkahead-loadfiles = caosadvancedtools.loadFiles:main", + ] + }, ) try: setup(**metadata) diff --git a/src/caosadvancedtools/crawler.py b/src/caosadvancedtools/crawler.py index 5e84bc8a60c1b358150c4db389efb62656af0631..724004479e1e909057764f74e7d459a9aac72dc1 100644 --- a/src/caosadvancedtools/crawler.py +++ b/src/caosadvancedtools/crawler.py @@ -589,6 +589,9 @@ ____________________\n""".format(i+1, len(pending_changes)) + str(el[3])) if "SHARED_DIR" in os.environ: directory = os.environ["SHARED_DIR"] + else: + directory = "." + logger.info("No 'SHARED_DIR' in environment, using '.' as fallback.") filename = str(run_id)+".html" randname = os.path.basename(os.path.abspath(directory)) filepath = os.path.abspath(os.path.join(directory, filename)) diff --git a/src/caosadvancedtools/loadFiles.py b/src/caosadvancedtools/loadFiles.py index 405b3d135c8af89e32c74015bd04f76f21828e20..77872d1dfe896688e54285551ba2e4eb9a02af99 100755 --- a/src/caosadvancedtools/loadFiles.py +++ b/src/caosadvancedtools/loadFiles.py @@ -190,7 +190,26 @@ def main(argv=None): sys.argv.extend(argv) # Setup argument parser - parser = ArgumentParser() + parser = ArgumentParser(description=""" +Make files that the LinkAhead server can see available as FILE entities. + +In a typical scenario where LinkAhead runs in a Docker container and a host directory `mydir` is +mounted as an extroot with name `myext`, loadFiles could be called like this: + +> loadFiles -p foo /opt/caosdb/mnt/extroot/myext/ + +This call would result in + +1. On the LinkAhead server: There are FILE entities for all files in `mydir`. +2. In the `caosroot` directory inside the Docker container, there are symlinks like this: + + foo/myext/somefile.txt -> /opt/caosdb/mnt/extroot/myext/somefile.txt + foo/myext/dir/other.bin -> /opt/caosdb/mnt/extroot/myext/dir/other.bin + +The FILE entity for `somefile.txt`, for example, now has the path "foo/myext/somefile.txt" and its +content can be retrieved via LinkAhead's API. 
+ +""", formatter_class=argparse.RawDescriptionHelpFormatter) parser.add_argument("-p", "--prefix", dest="prefix", help="store files with this prefix into the server's" " file system.") diff --git a/src/caosadvancedtools/models/parser.py b/src/caosadvancedtools/models/parser.py index f9bea92455e948eb40e337a43ad87b6d79156fce..175f2f7fbfc5408e70e37740d7ae0506d547c628 100644 --- a/src/caosadvancedtools/models/parser.py +++ b/src/caosadvancedtools/models/parser.py @@ -1015,7 +1015,7 @@ if __name__ == "__main__": elif args.data_model.endswith(".yml") or args.data_model.endswith(".yaml"): model = parse_model_from_yaml(args.data_model) else: - RuntimeError("did not recognize file ending") + raise RuntimeError(f"Unknown file ending of data model: {args.data_model}") if args.print: print(model) if args.sync: diff --git a/src/caosadvancedtools/table_importer.py b/src/caosadvancedtools/table_importer.py index bae813b23195c93ccfd369a626424dd069164fb0..5efd0500a4c5a797a27a92caf0cd2a49165fddd2 100755 --- a/src/caosadvancedtools/table_importer.py +++ b/src/caosadvancedtools/table_importer.py @@ -205,18 +205,47 @@ def string_in_list(val, options, ignore_case=True): return val +def _pandas_typecheck(candidate, dtype): + if pd.api.types.is_integer_dtype(dtype): + return pd.api.types.is_integer_dtype(candidate) + if pd.api.types.is_float_dtype(dtype): + return pd.api.types.is_float_dtype(candidate) + if pd.api.types.is_bool_dtype(dtype): + return pd.api.types.is_bool_dtype(candidate) + return None + + +def _is_subtype_of(candidate, supertype): + """Check whether `candidate` has a subtype of `supertype`, also respecting + pandas types that np.issubdtype is not aware of. + + """ + pandas_typecheck = _pandas_typecheck(candidate, supertype) + if pandas_typecheck is not None: + return pandas_typecheck + return np.issubdtype(candidate, supertype) + + +def _is_instance_of_type(candidate, dtype): + """Wrape `isinstance` so that pandas datatypes can be handled.""" + pandas_typecheck = _pandas_typecheck(type(candidate), dtype) + if pandas_typecheck is not None: + return pandas_typecheck + return isinstance(candidate, dtype) + + class TableImporter(): """Abstract base class for importing data from tables. """ def __init__(self, converters, obligatory_columns=None, unique_keys=None, - datatypes=None, existing_columns=None): + datatypes=None, existing_columns=None, convert_int_to_nullable_int=True): """ Parameters ---------- converters : dict - Dict with column names as keys and converter functions as values. This dict also defines - what columns are required to exist throught the existing keys. The converter functions are + Dict with column names as keys and converter functions as values. This dict's keys also + define what columns must exist. The converter functions are applied to the cell values. They should also check for ValueErrors, such that a separate value check is not necessary. @@ -234,6 +263,12 @@ class TableImporter(): existing_columns : list, optional List of column names that must exist but may have missing (NULL) values + + convert_int_to_nullable_int : bool, optional + Whether to convert all integer datatypes to ``pandas.Int64Dtype()`` + which is nullable, to allow for integer columns with empty fields. If + set to False, a ``DataInconsistencyError`` will be raised in case of + empty fields in integer columns. Default is True. 
""" if converters is None: @@ -250,7 +285,14 @@ class TableImporter(): if datatypes is None: datatypes = {} - self.datatypes = datatypes + self.datatypes = datatypes.copy() + + self.convert_int_to_nullable_int = convert_int_to_nullable_int + + if convert_int_to_nullable_int is True: + for key, dtype in self.datatypes.items(): + if pd.api.types.is_integer_dtype(dtype): + self.datatypes[key] = pd.Int64Dtype() if existing_columns is None: existing_columns = [] @@ -333,22 +375,25 @@ class TableImporter(): """ for key, datatype in self.datatypes.items(): if key not in df.columns: + # We ignore all datatype definitions that are not present in the + # dataframe. continue + col_dtype = df.dtypes[key] + # Check for castable numeric types first: We unconditionally cast int to the default # float, because CaosDB does not have different sizes anyway. - col_dtype = df.dtypes[key] - if not strict and not np.issubdtype(col_dtype, datatype): + if not strict and not _is_subtype_of(col_dtype, datatype): # These special cases should be fine. if ((datatype == str) - or (np.issubdtype(col_dtype, np.integer) - and np.issubdtype(datatype, np.floating)) + or (pd.api.types.is_integer_dtype(col_dtype) + and pd.api.types.is_float_dtype(datatype)) ): # NOQA df[key] = df[key].astype(datatype) # Now check each element for idx, val in df.loc[pd.notnull(df.loc[:, key]), key].items(): - if not isinstance(val, datatype): + if not _is_instance_of_type(val, datatype): msg = ( "In row no. {rn} and column '{c}' of file '{fi}' the " "datatype was {was} but it should be " @@ -483,7 +528,8 @@ class CSVImporter(TableImporter): **kwargs) applicable_converters = {k: v for k, v in self.converters.items() if k in tmpdf.columns} - df = pd.read_csv(filename, sep=sep, converters=applicable_converters, + df = pd.read_csv(filename, sep=sep, + converters=applicable_converters, dtype=self.datatypes, **kwargs) except ValueError as ve: logger.warning( @@ -497,22 +543,6 @@ class CSVImporter(TableImporter): return df -class TSVImporter(TableImporter): +class TSVImporter(CSVImporter): def read_file(self, filename, **kwargs): - try: - tmpdf = pd.read_csv(filename, sep="\t", converters=self.converters, - **kwargs) - applicable_converters = {k: v for k, v in self.converters.items() - if k in tmpdf.columns} - df = pd.read_csv(filename, sep="\t", converters=self.converters, - **kwargs) - except ValueError as ve: - logger.warning( - "Cannot parse {}.\n{}".format(filename, ve), - extra={'identifier': str(filename), - 'category': "inconsistency"}) - raise DataInconsistencyError(*ve.args) - - df = self.check_dataframe(df, filename) - - return df + return super().read_file(filename, sep="\t", **kwargs) diff --git a/src/caosadvancedtools/table_json_conversion/convert.py b/src/caosadvancedtools/table_json_conversion/convert.py new file mode 100644 index 0000000000000000000000000000000000000000..09882f963fd976583d4acbc8f2dd3b67ef510ac8 --- /dev/null +++ b/src/caosadvancedtools/table_json_conversion/convert.py @@ -0,0 +1,497 @@ +# encoding: utf-8 +# +# This file is a part of the LinkAhead Project. +# +# Copyright (C) 2024 Indiscale GmbH <info@indiscale.com> +# Copyright (C) 2024 Daniel Hornung <d.hornung@indiscale.com> +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. 
+# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <https://www.gnu.org/licenses/>. + +"""Convert XLSX files to JSON dictionaries.""" + +from __future__ import annotations + +import datetime +import itertools +import sys +from functools import reduce +from operator import getitem +from types import SimpleNamespace +from typing import Any, BinaryIO, Callable, TextIO, Union +from warnings import warn + +import jsonschema +from openpyxl import load_workbook +from openpyxl.worksheet.worksheet import Worksheet + +from caosadvancedtools.table_json_conversion import xlsx_utils +from caosadvancedtools.table_json_conversion.fill_xlsx import read_or_dict + + +def _strict_bool(value: Any) -> bool: + """Convert value to bool, but only if it really is a valid XLSX bool.""" + if isinstance(value, bool): + return value + raise TypeError(f"Not a good boolean: {repr(value)}") + + +class ForeignError(KeyError): + def __init__(self, *args, definitions: list, message: str = ""): + super().__init__(message, *args) + self.definitions = definitions + + +class XLSXConverter: + """Class for conversion from XLSX to JSON. + +For a detailed description of the required formatting of the XLSX files, see ``specs.md`` in the +documentation. + """ + + PARSER: dict[str, Callable] = { + "string": str, + "number": float, + "integer": int, + "boolean": _strict_bool, + } + + def __init__(self, xlsx: Union[str, BinaryIO], schema: Union[dict, str, TextIO], + strict: bool = False): + """ +Parameters +---------- +xlsx: Union[str, BinaryIO] + Path to the XLSX file or opened file object. + +schema: Union[dict, str, TextIO] + Schema for validation of XLSX content. + +strict: bool, optional + If True, fail faster. +""" + self._workbook = load_workbook(xlsx) + self._schema = read_or_dict(schema) + self._defining_path_index = xlsx_utils.get_defining_paths(self._workbook) + self._check_columns(fail_fast=strict) + self._handled_sheets: set[str] = set() + self._result: dict = {} + self._errors: dict = {} + + def to_dict(self, validate: bool = False, collect_errors: bool = True) -> dict: + """Convert the xlsx contents to a dict. + +Parameters +---------- +validate: bool, optional + If True, validate the result against the schema. + +collect_errors: bool, optional + If True, do not fail at the first error, but try to collect as many errors as possible. After an + Exception is raised, the errors can be collected with ``get_errors()`` and printed with + ``get_error_str()``. + +Returns +------- +out: dict + A dict representing the JSON with the extracted data. 
+ """ + self._handled_sheets = set() + self._result = {} + self._errors = {} + for sheetname in self._workbook.sheetnames: + if sheetname not in self._handled_sheets: + self._handle_sheet(self._workbook[sheetname], fail_later=collect_errors) + if validate: + jsonschema.validate(self._result, self._schema) + if self._errors: + raise RuntimeError("There were error while handling the XLSX file.") + return self._result + + def get_errors(self) -> dict: + """Return a dict with collected errors.""" + return self._errors + + def get_error_str(self) -> str: + """Return a beautiful string with the collected errors.""" + result = "" + for loc, value in self._errors.items(): + result += f"Sheet: {loc[0]}\tRow: {loc[1] + 1}\n" + for item in value: + result += f"\t\t{item[:-1]}:\t{item[-1]}\n" + return result + + def _check_columns(self, fail_fast: bool = False): + """Check if the columns correspond to the schema.""" + def missing(path): + message = f"Missing column: {xlsx_utils.p2s(path)}" + if fail_fast: + raise ValueError(message) + else: + warn(message) + for sheetname in self._workbook.sheetnames: + sheet = self._workbook[sheetname] + parents: dict = {} + col_paths = [] + for col in xlsx_utils.get_data_columns(sheet).values(): + parents[xlsx_utils.p2s(col.path[:-1])] = col.path[:-1] + col_paths.append(col.path) + for path in parents.values(): + subschema = xlsx_utils.get_subschema(path, self._schema) + + # Unfortunately, there are a lot of special cases to handle here. + if subschema.get("type") == "array": + subschema = subschema["items"] + if "enum" in subschema: # Was handled in parent level already + continue + for child, content in subschema["properties"].items(): + child_path = path + [child] + if content == {'type': 'string', 'format': 'data-url'}: + continue # skip files + if content.get("type") == "array" and ( + content.get("items").get("type") == "object"): + if child_path not in itertools.chain(*self._defining_path_index.values()): + missing(child_path) + elif content.get("type") == "array" and "enum" in content.get("items", []) and ( + content.get("uniqueItems") is True): + # multiple choice + for choice in content["items"]["enum"]: + if child_path + [choice] not in col_paths: + missing(child_path + [choice]) + elif content.get("type") == "object": + pass + else: + if child_path not in col_paths: + missing(child_path) + + def _handle_sheet(self, sheet: Worksheet, fail_later: bool = False) -> None: + """Add the contents of the sheet to the result (stored in ``self._result``). + +Each row in the sheet corresponds to one entry in an array in the result. Which array exactly is +defined by the sheet's "proper name" and the content of the foreign columns. + +Look at ``xlsx_utils.get_path_position`` for the specification of the "proper name". + + +Parameters +---------- +fail_later: bool, optional + If True, do not fail with unresolvable foreign definitions, but collect all errors. +""" + row_type_column = xlsx_utils.get_row_type_column_index(sheet) + foreign_columns = xlsx_utils.get_foreign_key_columns(sheet) + foreign_column_paths = {col.index: col.path for col in foreign_columns.values()} + data_columns = xlsx_utils.get_data_columns(sheet) + data_column_paths = {col.index: col.path for col in data_columns.values()} + # Parent path, insert in correct order. 
+ parent, proper_name = xlsx_utils.get_path_position(sheet) + if parent: + parent_sheetname = xlsx_utils.get_worksheet_for_path(parent, self._defining_path_index) + if parent_sheetname not in self._handled_sheets: + self._handle_sheet(self._workbook[parent_sheetname], fail_later=fail_later) + + # # We save single entries in lists, indexed by their foreign key contents. Each entry + # # consists of: + # # - foreign: Dict with path -> value for the foreign columns + # # - data: The actual data of this entry, a dict. + # entries: dict[str, list[SimpleNamespace]] = {} + + for row_idx, row in enumerate(sheet.iter_rows(values_only=True)): + # Skip non-data rows. + if row[row_type_column] is not None: + continue + foreign_repr = "" + foreign = [] # A list of lists, each of which is: [path1, path2, ..., leaf, value] + data: dict = {} # Local data dict + # Collect data (in dict relative to current level) and foreign data information + for col_idx, value in enumerate(row): + if col_idx in foreign_column_paths: + foreign_repr += str(value) + foreign.append(foreign_column_paths[col_idx] + [value]) + continue + + if col_idx in data_column_paths: + path = data_column_paths[col_idx] + if self._is_multiple_choice(path): + real_value = path.pop() # Last component is the enum value, insert above + # set up list + try: + _set_in_nested(mydict=data, path=path, value=[], prefix=parent, skip=1) + except ValueError as err: + if not str(err).startswith("There is already some value at"): + raise + if not xlsx_utils.parse_multiple_choice(value): + continue + _set_in_nested(mydict=data, path=path, value=real_value, prefix=parent, + skip=1, append_to_list=True) + else: + value = self._validate_and_convert(value, path) + _set_in_nested(mydict=data, path=path, value=value, prefix=parent, skip=1) + continue + + try: + # Find current position in tree + parent_dict = self._get_parent_dict(parent_path=parent, foreign=foreign) + + # Append data to current position's list + if proper_name not in parent_dict: + parent_dict[proper_name] = [] + parent_dict[proper_name].append(data) + except ForeignError as kerr: + if not fail_later: + raise + self._errors[(sheet.title, row_idx)] = kerr.definitions + self._handled_sheets.add(sheet.title) + + def _is_multiple_choice(self, path: list[str]) -> bool: + """Test if the path belongs to a multiple choice section.""" + if not path: + return False + subschema = self._get_subschema(path[:-1]) + if (subschema["type"] == "array" + and subschema.get("uniqueItems") is True + and "enum" in subschema["items"]): + return True + return False + + def _get_parent_dict(self, parent_path: list[str], foreign: list[list]) -> dict: + """Return the dict into which values can be inserted. + +This method returns, from the current result-in-making, the entry at ``parent_path`` which matches +the values given in the ``foreign`` specification. +""" + foreign_groups = _group_foreign_paths(foreign, common=parent_path) + + current_object = self._result + for group in foreign_groups: + # Find list for which foreign definitions are relevant. + current_object = reduce(getitem, group.subpath, current_object) + assert isinstance(current_object, list) + # Test all candidates. 
+ for cand in current_object: + if all(reduce(getitem, definition[:-1], cand) == definition[-1] + for definition in group.definitions): + current_object = cand + break + else: + message = f"Cannot find an element at {parent_path} for these foreign defs:\n" + for name, value in group.definitions: + message += f" {name}: {value}\n" + print(message, file=sys.stderr) + error = ForeignError(definitions=group.definitions, message=message) + raise error + assert isinstance(current_object, dict) + return current_object + + def _validate_and_convert(self, value: Any, path: list[str]): + """Apply some basic validation and conversion steps. + +This includes: +- Validation against the type given in the schema +- List typed values are split at semicolons and validated individually + """ + if value is None: + return value + subschema = self._get_subschema(path) + # Array handling only if schema says it's an array. + if subschema.get("type") == "array": + array_type = subschema["items"]["type"] + if isinstance(value, str) and ";" in value: + values = [self.PARSER[array_type](v) for v in value.split(";")] + return values + try: + # special case: datetime or date + if ("anyOf" in subschema): + if isinstance(value, datetime.datetime) and ( + {'type': 'string', 'format': 'date-time'} in subschema["anyOf"]): + return value + if isinstance(value, datetime.date) and ( + {'type': 'string', 'format': 'date'} in subschema["anyOf"]): + return value + jsonschema.validate(value, subschema) + except jsonschema.ValidationError as verr: + print(verr) + print(path) + raise + + # Finally: convert to target type + return self.PARSER[subschema.get("type", "string")](value) + + def _get_subschema(self, path: list[str], schema: dict = None) -> dict: + """Return the sub schema at ``path``.""" + if schema is None: + schema = self._schema + assert schema is not None + assert isinstance(schema, dict) + + return xlsx_utils.get_subschema(path, schema) + + +def _group_foreign_paths(foreign: list[list], common: list[str]) -> list[SimpleNamespace]: + """Group the foreign keys by their base paths. + +Parameters +---------- +foreign: list[list] + A list of foreign definitions, consisting of path components, property and possibly value. + +common: list[list[str]] + A common path which defines the final target of the foreign definitions. This helps to understand + where the ``foreign`` paths shall be split. + +Returns +------- +out: list[dict[str, list[list]]] + + A list of foreign path segments, grouped by their common segments. Each element is a namespace + with detailed information of all those elements which form the group. The namespace has the + following attributes: + + - ``path``: The full path to this path segment. This is always the previous segment's ``path`` + plus this segment's ``subpath``. + - ``stringpath``: The stringified ``path``, might be useful for comparison or sorting. + - ``subpath``: The path, relative from the previous segment. + - ``definitions``: A list of the foreign definitions for this segment, but stripped of the + ``path`` components. + """ + # Build a simple dict first, without subpath. 
+ results = {} + for f_path in foreign: + path = [] + for component in f_path: + path.append(component) + if path != common[:len(path)]: + break + path.pop() + definition = f_path[len(path):] + stringpath = xlsx_utils.p2s(path) + if stringpath not in results: + results[stringpath] = SimpleNamespace(stringpath=stringpath, path=path, + definitions=[definition]) + else: + results[stringpath].definitions.append(definition) + + # Then sort by stringpath and calculate subpath. + stringpaths = sorted(results.keys()) + + resultlist = [] + last_level = 0 + for stringpath in stringpaths: + elem = results[stringpath] + elem.subpath = elem.path[last_level:] + last_level = len(elem.path) + resultlist.append(elem) + + # from IPython import embed + # embed() + + if last_level != len(common): + raise ValueError("Foreign keys must cover the complete `common` depth.") + return resultlist + + +# pylint: disable-next=dangerous-default-value,too-many-arguments +def _set_in_nested(mydict: dict, path: list, value: Any, prefix: list = [], skip: int = 0, + overwrite: bool = False, append_to_list: bool = False) -> dict: + """Set a value in a nested dict. + +Parameters +---------- +mydict: dict + The dict into which the ``value`` shall be inserted. +path: list + A list of keys, denoting the location of the value. +value + The value which shall be set inside the dict. +prefix: list + A list of keys which shall be removed from ``path``. A KeyError is raised if ``path`` does not + start with the elements of ``prefix``. +skip: int = 0 + Remove this many additional levels from the path, *after* removing the prefix. +overwrite: bool = False + If True, allow overwriting existing content. Otherwise, attempting to overwrite existing values + leads to an exception. +append_to_list: bool = False + If True, assume that the element at ``path`` is a list and append the value to it. If the list + does not exist, create it. If there is a non-list at ``path`` already, overwrite it with a new + list, if ``overwrite`` is True, otherwise raise a ValueError. + +Returns +------- +mydict: dict + The same dictionary that was given as a parameter, but modified. + """ + for idx, el in enumerate(prefix): + if path[idx] != el: + raise KeyError(f"Path does not start with prefix: {prefix} not in {path}") + path = path[len(prefix):] + if skip: + assert len(path) > skip, f"Path must be long enough to remove skip={skip} elements." + path = path[skip:] + + tmp_dict = mydict + while len(path) > 1: + key = path.pop(0) + if key not in tmp_dict: + tmp_dict[key] = {} + if not isinstance(tmp_dict[key], dict): + if overwrite: + tmp_dict[key] = {} + else: + raise ValueError(f"There is already some value at {path}") + tmp_dict = tmp_dict[key] + key = path.pop() + if append_to_list: + if key not in tmp_dict: + tmp_dict[key] = [] + if not isinstance(tmp_dict[key], list): + if overwrite: + tmp_dict[key] = [] + else: + raise ValueError(f"There is already some non-list value at [{key}]") + tmp_dict[key].append(value) + else: + if key in tmp_dict and not overwrite: + raise ValueError(f"There is already some value at [{key}]") + if key not in tmp_dict: + tmp_dict[key] = {} + tmp_dict[key] = value + return mydict + + +def to_dict(xlsx: Union[str, BinaryIO], schema: Union[dict, str, TextIO], + validate: bool = None, strict: bool = False) -> dict: + """Convert the xlsx contents to a dict; it must follow a schema. + +Parameters +---------- +xlsx: Union[str, BinaryIO] + Path to the XLSX file or opened file object. 
+ +schema: Union[dict, str, TextIO] + Schema for validation of XLSX content. + +validate: bool, optional + If True, validate the result against the schema. + +strict: bool, optional + If True, fail faster. + + +Returns +------- +out: dict + A dict representing the JSON with the extracted data. + """ + converter = XLSXConverter(xlsx, schema, strict=strict) + return converter.to_dict() diff --git a/src/caosadvancedtools/table_json_conversion/fill_xlsx.py b/src/caosadvancedtools/table_json_conversion/fill_xlsx.py index 60b5c96c7de141b1ecb12254e6928252fe4a9f5c..f2e0abc3fc684172065d683c99c1c4309c80d6c0 100644 --- a/src/caosadvancedtools/table_json_conversion/fill_xlsx.py +++ b/src/caosadvancedtools/table_json_conversion/fill_xlsx.py @@ -1,4 +1,3 @@ -#!/usr/bin/env python3 # encoding: utf-8 # # This file is a part of the LinkAhead Project. @@ -19,118 +18,32 @@ # # You should have received a copy of the GNU Affero General Public License # along with this program. If not, see <https://www.gnu.org/licenses/>. +"""Class and function to fill an XLSX template from actual data.""" from __future__ import annotations -import json +import datetime import pathlib -from collections import OrderedDict from types import SimpleNamespace -from typing import Any, Dict, List, Optional, TextIO, Union +from typing import Any, Optional, TextIO, Union from warnings import warn from jsonschema import FormatChecker, validate from jsonschema.exceptions import ValidationError -from openpyxl import Workbook, load_workbook +from openpyxl import load_workbook, Workbook from openpyxl.cell.cell import ILLEGAL_CHARACTERS_RE -from openpyxl.worksheet.worksheet import Worksheet -from .table_generator import ColumnType, RowType -from .utils import p2s - - -def _is_exploded_sheet(sheet: Worksheet) -> bool: - """Return True if this is a an "exploded" sheet. - - An exploded sheet is a sheet whose data entries are LIST valued properties of entries in another - sheet. A sheet is detected as exploded iff it has FOREIGN columns. - """ - column_types = _get_column_types(sheet) - return ColumnType.FOREIGN.name in column_types.values() - - -def _get_column_types(sheet: Worksheet) -> OrderedDict: - """Return an OrderedDict: column index -> column type for the sheet. - """ - result = OrderedDict() - type_row_index = _get_row_type_column_index(sheet) - for idx, col in enumerate(sheet.columns): - type_cell = col[type_row_index] - result[idx] = type_cell.value if type_cell.value is not None else ColumnType.IGNORE.name - assert (hasattr(ColumnType, result[idx]) - or result[idx] == RowType.COL_TYPE.name), ( - f"Unexpected column type value ({idx}{type_row_index}): {type_cell.value}") - return result - - -def _get_foreign_key_columns(sheet: Worksheet) -> Dict[str, SimpleNamespace]: - """Return the foreign keys of the worksheet. - -Returns -------- -out: dict[str, SimpleNamespace] - The keys are the stringified paths. The values are SimpleNamespace objects with ``index``, - ``path`` and ``column`` attributes. 
- """ - column_types = _get_column_types(sheet) - path_rows = _get_path_rows(sheet) - result = OrderedDict() - for for_idx, name in column_types.items(): - if name != ColumnType.FOREIGN.name: - continue - path = [] - for row in path_rows: - component = sheet.cell(row=row+1, column=for_idx+1).value - if component is None: - break - assert isinstance(component, str), f"Expected string: {component}" - path.append(component) - result[p2s(path)] = SimpleNamespace(index=for_idx, path=path, - column=list(sheet.columns)[for_idx]) - return result - - -def _get_row_type_column_index(sheet: Worksheet): - """Return the column index (0-indexed) of the column which defines the row types. - """ - for col in sheet.columns: - for cell in col: - if cell.value == RowType.COL_TYPE.name: - return cell.column - 1 - raise ValueError("The column which defines row types (COL_TYPE, PATH, ...) is missing") - - -def _get_path_rows(sheet: Worksheet): - """Return the 0-based indices of the rows which represent paths.""" - rows = [] - rt_col = _get_row_type_column_index(sheet) - for cell in list(sheet.columns)[rt_col]: - if cell.value == RowType.PATH.name: - rows.append(cell.row-1) - return rows - - -def _next_row_index(sheet: Worksheet) -> int: - """Return the index for the next data row. - - This is defined as the first row without any content. - """ - return sheet.max_row - - -def _read_or_dict(data: Union[dict, str, TextIO]) -> dict: - """If data is a json file name or input stream, read data from there.""" - if isinstance(data, dict): - pass - elif isinstance(data, str): - with open(data, encoding="utf-8") as infile: - data = json.load(infile) - elif hasattr(data, "read"): - data = json.load(data) - else: - raise ValueError(f"I don't know how to handle the datatype of `data`: {type(data)}") - assert isinstance(data, dict) - return data +from .xlsx_utils import ( + array_schema_from_model_schema, + get_foreign_key_columns, + get_row_type_column_index, + is_exploded_sheet, + next_row_index, + p2s, + read_or_dict, + ColumnType, + RowType +) class TemplateFiller: @@ -143,6 +56,7 @@ class TemplateFiller: @property def workbook(self): + """Return the workbook of this TemplateFiller.""" return self._workbook def fill_data(self, data: dict): @@ -161,7 +75,7 @@ class TemplateFiller: """ - def __init__(self, current_path: List[str] = None, props: Dict[str, Any] = None): + def __init__(self, current_path: list[str] = None, props: dict[str, Any] = None): self._current_path = current_path if current_path is not None else [] self._props = props if props is not None else {} # this is flat @@ -172,11 +86,12 @@ class TemplateFiller: return result def next_level(self, next_level: str) -> TemplateFiller.Context: + """Return a copy of this Context, with the path appended by ``next_level``.""" result = self.copy() result._current_path.append(next_level) # pylint: disable=protected-access return result - def __getitem__(self, path: Union[List[str], str], owner=None) -> Any: + def __getitem__(self, path: Union[list[str], str], owner=None) -> Any: if isinstance(path, list): path = p2s(path) return self._props[path] @@ -185,7 +100,7 @@ class TemplateFiller: fullpath = p2s(self._current_path + [propname]) self._props[fullpath] = value - def fill_from_data(self, data: Dict[str, Any]): + def fill_from_data(self, data: dict[str, Any]): # TODO recursive for dicts and list? 
"""Fill current level with all scalar elements of ``data``.""" for name, value in data.items(): @@ -206,12 +121,13 @@ class TemplateFiller: Index the sheets by all path arrays leading to them. Also create a simple column index by column type and path. + This method creates and populates the dict ``self._sheet_index``. """ self._sheet_index = {} for sheetname in self._workbook.sheetnames: sheet = self._workbook[sheetname] type_column = [x.value for x in list(sheet.columns)[ - _get_row_type_column_index(sheet)]] + get_row_type_column_index(sheet)]] # 0-indexed, as everything outside of sheet.cell(...): coltype_idx = type_column.index(RowType.COL_TYPE.name) path_indices = [i for i, typ in enumerate(type_column) if typ == RowType.PATH.name] @@ -227,7 +143,8 @@ class TemplateFiller: path.append(col[path_idx].value) # col_key = p2s([col[coltype_idx].value] + path) # col_index[col_key] = SimpleNamespace(column=col, col_index=col_idx) - if col[coltype_idx].value not in [ColumnType.SCALAR.name, ColumnType.LIST.name]: + if col[coltype_idx].value not in [ColumnType.SCALAR.name, ColumnType.LIST.name, + ColumnType.MULTIPLE_CHOICE.name]: continue path_str = p2s(path) @@ -236,10 +153,11 @@ class TemplateFiller: sheetname=sheetname, sheet=sheet, col_index=col_idx, col_type=col[coltype_idx].value) - def _handle_data(self, data: dict, current_path: List[str] = None, + def _handle_data(self, data: dict, current_path: list[str] = None, context: TemplateFiller.Context = None, only_collect_insertables: bool = False, - ) -> Optional[Dict[str, Any]]: + utc: bool = False, + ) -> Optional[dict[str, Any]]: """Handle the data and write it into ``workbook``. Parameters @@ -260,6 +178,8 @@ context: TemplateFiller.Context, optional only_collect_insertables: bool, optional If True, do not insert anything on this level, but return a dict with entries to be inserted. +utc: bool, optional + If True, store times as UTC. Else store as local time on a best-effort base. Returns ------- @@ -274,7 +194,7 @@ out: union[dict, None] context = TemplateFiller.Context() context.fill_from_data(data) - insertables: Dict[str, Any] = {} + insertables: dict[str, Any] = {} for name, content in data.items(): # TODO is this the best way to do it???? 
if name == "file": @@ -289,10 +209,19 @@ out: union[dict, None] assert len(set(type(entry) for entry in content)) == 1 if isinstance(content[0], dict): # all elements are dicts - # An array of objects: must go into exploded sheet - for entry in content: - self._handle_data(data=entry, current_path=path, context=next_context) - continue + # Heuristic to detect enum entries (only id and name): + if all(set(entry.keys()) == {"id", "name"} for entry in content): + # Convert to list of names, do not recurse + content = [entry["name"] for entry in content] + else: + # An array of objects: must go into exploded sheet + for entry in content: + self._handle_data(data=entry, current_path=path, context=next_context) + continue + # Heuristic to detect enum entries (dict with only id and name): + elif isinstance(content, dict) and set(content.keys()) == {"id", "name"}: + content = [content["name"]] + # "Normal" dicts elif isinstance(content, dict): # we recurse and simply use the result if not current_path: # Special handling for top level self._handle_data(content, current_path=path, context=next_context) @@ -308,16 +237,27 @@ out: union[dict, None] # collecting the data assert isinstance(content, list) - if len(content) > 1: - content = [ILLEGAL_CHARACTERS_RE.sub("", str(x)) for x in content] - value = ";".join(content) # TODO we need escaping of values - else: - value = content[0] - if isinstance(value, str): - value = ILLEGAL_CHARACTERS_RE.sub("", value) - path_str = p2s(path) - assert path_str not in insertables - insertables[path_str] = value + to_insert = self._try_multiple_choice(path, values=content) + if not to_insert: + if len(content) > 1: + content = [ILLEGAL_CHARACTERS_RE.sub("", str(x)) for x in content] + value = ";".join(content) # TODO we need escaping of values + else: + value = content[0] + if isinstance(value, str): + value = ILLEGAL_CHARACTERS_RE.sub("", value) + if isinstance(value, datetime.datetime): + if value.tzinfo is not None: + if utc: + value = value.astimezone(datetime.timezone.utc).replace(tzinfo=None) + else: + # Remove timezone, store in local timezone. + value = value.astimezone().replace(tzinfo=None) + + path_str = p2s(path) + assert path_str not in insertables + to_insert = {path_str: value} + insertables.update(to_insert) if only_collect_insertables: return insertables if not current_path: # Top level returns, because there are only sheets for the children. @@ -328,7 +268,8 @@ out: union[dict, None] sheet = None for path_str, value in insertables.items(): if self._graceful and path_str not in self._sheet_index: - warn(f"Ignoring path with missing sheet index: {path_str}") + if not (value is None or path_str.endswith(".id") or path_str.endswith(".name")): + warn(f"Ignoring path with missing sheet index: {path_str}") continue sheet_meta = self._sheet_index[path_str] if sheet is None: @@ -336,14 +277,14 @@ out: union[dict, None] assert sheet is sheet_meta.sheet, "All entries must be in the same sheet." 
col_index = sheet_meta.col_index if insert_row is None: - insert_row = _next_row_index(sheet) + insert_row = next_row_index(sheet) sheet.cell(row=insert_row+1, column=col_index+1, value=value) # Insert foreign keys - if insert_row is not None and sheet is not None and _is_exploded_sheet(sheet): + if insert_row is not None and sheet is not None and is_exploded_sheet(sheet): try: - foreigns = _get_foreign_key_columns(sheet) + foreigns = get_foreign_key_columns(sheet) except ValueError: print(f"Sheet: {sheet}") raise @@ -353,6 +294,40 @@ out: union[dict, None] return None + def _try_multiple_choice(self, path: list[str], values: list[str]) -> Optional[dict[str, str]]: + """Try to create sheet content for a multiple choice property. + +Parameters +---------- +path: list[str] + The Path to this property. +values: list[str] + A list of possible choices, should be unique. + +Returns +------- +to_insert: Optional[dict[str, str]] + A path-value dict. None if this doesn't seem to be a multiple choice data set. + """ + try: + assert len(set(values)) == len(values) + to_insert = {} + found_sheet = None + for value in values: + assert isinstance(value, str) + path_str = p2s(path + [value]) + assert path_str in self._sheet_index + sheet_meta = self._sheet_index[path_str] + # All matches shall be on the same sheet + assert found_sheet is None or found_sheet == sheet_meta.sheetname + found_sheet = sheet_meta.sheetname + # Correct type + assert sheet_meta.col_type == ColumnType.MULTIPLE_CHOICE.name + to_insert[path_str] = "x" + except AssertionError: + return None + return to_insert + def fill_template(data: Union[dict, str, TextIO], template: str, result: str, validation_schema: Union[dict, str, TextIO] = None) -> None: @@ -374,17 +349,18 @@ validation_schema: dict, optional fails. If no validation schema is given, try to ignore more errors in the data when filling the XLSX template. """ - data = _read_or_dict(data) + data = read_or_dict(data) assert isinstance(data, dict) # Validation if validation_schema is not None: - validation_schema = _read_or_dict(validation_schema) + validation_schema = array_schema_from_model_schema(read_or_dict(validation_schema)) try: + # FIXME redefine checker for datetime validate(data, validation_schema, format_checker=FormatChecker()) - except ValidationError as ve: - print(ve.message) - raise ve + except ValidationError as verr: + print(verr.message) + raise verr else: print("No validation schema given, continue at your own risk.") diff --git a/src/caosadvancedtools/table_json_conversion/table_generator.py b/src/caosadvancedtools/table_json_conversion/table_generator.py index 8ca026a8758361b658e98cabbcff42b849bb07fe..b8c50e7d8d40775f86c1a01d0934effe570cf20d 100644 --- a/src/caosadvancedtools/table_json_conversion/table_generator.py +++ b/src/caosadvancedtools/table_json_conversion/table_generator.py @@ -24,32 +24,18 @@ This module allows to generate template tables from JSON schemas. 
""" +from __future__ import annotations + import pathlib import re from abc import ABC, abstractmethod -from enum import Enum -from typing import Dict, List, Optional, Tuple +from typing import Optional from openpyxl import Workbook from openpyxl.styles import PatternFill from openpyxl.workbook.child import INVALID_TITLE_REGEX -from .utils import p2s - - -class ColumnType(Enum): - """ column types enum """ - SCALAR = 1 - LIST = 2 - FOREIGN = 3 - IGNORE = 4 - - -class RowType(Enum): - """ row types enum """ - COL_TYPE = 1 - PATH = 2 - IGNORE = 3 +from .xlsx_utils import p2s, ColumnType, RowType class TableTemplateGenerator(ABC): @@ -85,11 +71,16 @@ class TableTemplateGenerator(ABC): Example: ``{"Training": {"__this__": ["date"], "Person": ["name", "email"]}}`` Here, ``date`` is the sole foreign key for Training. + + | It probably is worth extending the first example, with a case where a "Training" shall + be distiguished by the "name" and "email" of a "Person" which it references. The + foreign keys for this example are specified like this: + | ``{"Training": {"__this__": [["Person", "name"], ["Person", "email"]]}}`` """ def _generate_sheets_from_schema(self, schema: dict, foreign_keys: Optional[dict] = None - ) -> Dict[str, Dict[str, - Tuple[ColumnType, Optional[str], list]]]: + ) -> dict[str, dict[str, + tuple[ColumnType, Optional[str], list]]]: """Generate a sheet definition from a given JSON schema. Parameters @@ -128,30 +119,49 @@ class TableTemplateGenerator(ABC): foreign_keys = {} # here, we treat the top level # sheets[sheetname][colname]= (COL_TYPE, description, [path]) - sheets: Dict[str, Dict[str, Tuple[ColumnType, Optional[str], list]]] = {} + sheets: dict[str, dict[str, tuple[ColumnType, Optional[str], list]]] = {} for rt_name, rt_def in schema["properties"].items(): sheets[rt_name] = self._treat_schema_element(schema=rt_def, sheets=sheets, path=[rt_name], foreign_keys=foreign_keys) return sheets - def _get_foreign_keys(self, keys: dict, path: list) -> list: - """Return the foreign keys that are needed at the location to which path points.""" - msg = f"A foreign key definition is missing for path:\n{path}\nKeys are:\n{keys}" + def _get_foreign_keys(self, keys: dict, path: list) -> list[list[str]]: + """Return the foreign keys that are needed at the location to which path points. + +Returns +------- +foreign_keys: list[list[str]] + Contains lists of strings, each element is the path to one foreign key. 
+""" + msg_missing = f"A foreign key definition is missing for path:\n{path}\nKeys are:\n{keys}" + orig_path = path.copy() while path: if keys is None or path[0] not in keys: - raise ValueError(msg) + raise ValueError(msg_missing) keys = keys[path[0]] path = path[1:] if isinstance(keys, dict) and "__this__" in keys: - return keys["__this__"] - if isinstance(keys, list): - return keys - raise ValueError(msg) - - def _treat_schema_element(self, schema: dict, sheets: dict, path: List[str], + keys = keys["__this__"] + if isinstance(keys, str): + raise ValueError("Foreign keys must be a list of strings, but a single " + "string was given:\n" + f"{orig_path} -> {keys}") + if not isinstance(keys, list): + raise ValueError(msg_missing) + + # Keys must be either all lists or all strings + types = {type(key) for key in keys} + if len(types) > 1: + raise ValueError("The keys of this path must bei either all lists or all strings:" + f" {orig_path}") + if types.pop() is str: + keys = [[key] for key in keys] + return keys + + def _treat_schema_element(self, schema: dict, sheets: dict, path: list[str], foreign_keys: Optional[dict] = None, level_in_sheet_name: int = 1, array_paths: Optional[list] = None - ) -> Dict[str, Tuple[ColumnType, Optional[str], list]]: + ) -> dict[str, tuple[ColumnType, Optional[str], list]]: """Recursively transform elements from the schema into column definitions. ``sheets`` is modified in place. @@ -189,15 +199,16 @@ class TableTemplateGenerator(ABC): # if it is an array, value defs are in 'items' if schema.get('type') == 'array': - if (schema['items'].get('type') == 'object' - and len(path) > 1): # list of references; special treatment + items = schema['items'] + # list of references; special treatment + if (items.get('type') == 'object' and len(path) > 1): # we add a new sheet with columns generated from the subtree of the schema sheetname = p2s(path) if sheetname in sheets: raise ValueError("The schema would lead to two sheets with the same name, " f"which is forbidden: {sheetname}") col_def = self._treat_schema_element( - schema=schema['items'], sheets=sheets, path=path, foreign_keys=foreign_keys, + schema=items, sheets=sheets, path=path, foreign_keys=foreign_keys, level_in_sheet_name=len(path), array_paths=array_paths+[path] # since this level is an array extend the list ) @@ -206,25 +217,37 @@ class TableTemplateGenerator(ABC): # and add the foreign keys that are necessary up to this point for array_path in array_paths: foreigns = self._get_foreign_keys(foreign_keys, array_path) - if isinstance(foreigns, str): - raise ValueError("Foreign keys must be a list of strings, but a single " - "string was given:\n" - f"{array_path} -> {foreigns}") for foreign in foreigns: - internal_key = p2s(array_path + [foreign]) + internal_key = p2s(array_path + foreign) if internal_key in sheets[sheetname]: - raise ValueError("The schema would lead to two columns with the same " - "name, which is forbidden:\n" + raise ValueError("The schema would lead to two columns with the " + "same name, which is forbidden:\n" f"{foreign} -> {internal_key}") ref_sheet = p2s(array_path) sheets[sheetname][internal_key] = ( - ColumnType.FOREIGN, f"see sheet '{ref_sheet}'", array_path + [foreign]) + ColumnType.FOREIGN, f"see sheet '{ref_sheet}'", + array_path + foreign) # Columns are added to the new sheet, thus we do not return any columns for the # current sheet. 
return {} + # List of enums: represent as checkbox columns + if (schema.get("uniqueItems") is True and "enum" in items and len(items) == 1): + choices = items["enum"] + assert len(path) >= 1 + prop_name = path[-1] + result = {} + for choice in choices: + name = f"{prop_name}.{choice}" + result[name] = ( + ColumnType.MULTIPLE_CHOICE, + schema.get('description'), + path + [str(choice)], + ) + return result + # it is a list of primitive types -> semicolon separated list - schema = schema['items'] + schema = items ctype = ColumnType.LIST # This should only be the case for "new or existing reference". @@ -247,9 +270,8 @@ class TableTemplateGenerator(ABC): return cols # The schema is a leaf. - description = schema['description'] if 'description' in schema else None # definition of a single column - default_return = {p2s(path[level_in_sheet_name:]): (ctype, description, path)} + default_return = {p2s(path[level_in_sheet_name:]): (ctype, schema.get('description'), path)} if 'type' not in schema and 'enum' in schema: return default_return if 'type' not in schema and 'anyOf' in schema: @@ -300,7 +322,7 @@ class XLSXTemplateGenerator(TableTemplateGenerator): definition dict You need to pass the dict of a single sheet to this function. """ - return max([len(path) for _, _, path in sheetdef.values()]) + return max(len(path) for _, _, path in sheetdef.values()) @staticmethod def _get_ordered_cols(sheetdef: dict) -> list: @@ -320,7 +342,7 @@ class XLSXTemplateGenerator(TableTemplateGenerator): return ordered_cols def _create_workbook_from_sheets_def( - self, sheets: Dict[str, Dict[str, Tuple[ColumnType, Optional[str], list]]]): + self, sheets: dict[str, dict[str, tuple[ColumnType, Optional[str], list]]]): """Create and return a nice workbook for the given sheets.""" wb = Workbook() yellowfill = PatternFill(fill_type="solid", fgColor='00FFFFAA') @@ -350,12 +372,12 @@ class XLSXTemplateGenerator(TableTemplateGenerator): ordered_cols = self._get_ordered_cols(sheetdef) # create other columns - for index, (colname, ct, desc, path) in enumerate(ordered_cols): - ws.cell(1, 2 + index, ct.name) + for index, (colname, coltype, desc, path) in enumerate(ordered_cols): + ws.cell(1, 2 + index, coltype.name) for path_index, el in enumerate(path): ws.cell(2 + path_index, 2 + index, el) ws.cell(header_index, 2 + index, colname) - if ct == ColumnType.FOREIGN: + if coltype == ColumnType.FOREIGN: # Visual highlighting ws.cell(header_index, 2 + index).fill = yellowfill if desc: diff --git a/src/caosadvancedtools/table_json_conversion/utils.py b/src/caosadvancedtools/table_json_conversion/utils.py deleted file mode 100644 index 15ae488d7cb8e142afba58424b49e8fc3a15e0d6..0000000000000000000000000000000000000000 --- a/src/caosadvancedtools/table_json_conversion/utils.py +++ /dev/null @@ -1,25 +0,0 @@ -# This file is a part of the LinkAhead Project. -# -# Copyright (C) 2024 IndiScale GmbH <info@indiscale.com> -# Copyright (C) 2024 Daniel Hornung <d.hornung@indiscale.com> -# -# This program is free software: you can redistribute it and/or modify -# it under the terms of the GNU Affero General Public License as -# published by the Free Software Foundation, either version 3 of the -# License, or (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU Affero General Public License for more details. 
-# -# You should have received a copy of the GNU Affero General Public License -# along with this program. If not, see <https://www.gnu.org/licenses/>. - -from typing import List - - -def p2s(path: List[str]): - """Path to string: dot-separated. - """ - return ".".join(path) diff --git a/src/caosadvancedtools/table_json_conversion/xlsx_utils.py b/src/caosadvancedtools/table_json_conversion/xlsx_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..5002f3ac7fe4bd78accffe0697cd7ecc7273dc27 --- /dev/null +++ b/src/caosadvancedtools/table_json_conversion/xlsx_utils.py @@ -0,0 +1,378 @@ +# encoding: utf-8 +# +# This file is a part of the LinkAhead Project. +# +# Copyright (C) 2024 Indiscale GmbH <info@indiscale.com> +# Copyright (C) 2024 Daniel Hornung <d.hornung@indiscale.com> +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <https://www.gnu.org/licenses/>. + +"""General utilities to work with XLSX files with (hidden) column and row annotations and typing. + +The most prominent functions are: + +- ``p2s``: Path to string: ``["some", "path"] -> "some.path"`` +- ``read_or_dict``: Load JSON object from path, file or dict. + +This module also defines these enums: + +- ColumnType +- RowType +""" + +from __future__ import annotations + +import json + +from collections import OrderedDict +from copy import deepcopy +from enum import Enum +from types import SimpleNamespace +from typing import Any, TextIO, Union + +from openpyxl import Workbook +from openpyxl.worksheet.worksheet import Worksheet + + +TRUTHY = {"true", "wahr", "x", "√", "yes", "ja", "y", "j"} # For multiple choice columns +FALSY = {"false", "falsch", "-", "no", "nein", "n"} # For multiple choice columns + + +class ColumnType(Enum): + """ column types enum """ + SCALAR = 1 + LIST = 2 + FOREIGN = 3 + MULTIPLE_CHOICE = 4 + IGNORE = 5 + + +class RowType(Enum): + """ row types enum """ + COL_TYPE = 1 + PATH = 2 + IGNORE = 3 + + +def array_schema_from_model_schema(model_schema: dict) -> dict: + """Convert a *data model* schema to a *data array* schema. + +Practically, this means that the top level properties are converted into lists. In a simplified +notation, this can be expressed as: + +``array_schema = { elem: [elem typed data...] for elem in model_schema }`` + +Parameters +---------- +model_schema: dict + The schema description of the data model. Must be a json schema *object*, with a number of + *object* typed properties. + +Returns +------- +array_schema: dict + A corresponding json schema, where the properties are arrays with the types of the input's + top-level properties. 
+ """ + assert model_schema["type"] == "object" + result = deepcopy(model_schema) + for name, prop in result["properties"].items(): + assert prop["type"] == "object" + new_prop = { + "type": "array", + "items": prop + } + result["properties"][name] = new_prop + return result + + +def get_defining_paths(workbook: Workbook) -> dict[str, list[list[str]]]: + """For all sheets in ``workbook``, list the paths which they define. + +A sheet is said to define a path, if it has data columns for properties inside that path. For +example, consider the following worksheet: + +| `COL_TYPE` | `SCALAR` | `SCALAR` | `LIST` | `SCALAR` | +| `PATH` | `Training` | `Training` | `Training` | `Training` | +| `PATH` | `url` | `date` | `subjects` | `supervisor` | +| `PATH` | | | | `email` | +|------------|----------------|---------------|--------------|--------------------| +| | example.com/mp | 2024-02-27 | Math;Physics | steve@example.com | +| | example.com/m | 2024-02-27 | Math | stella@example.com | + +This worksheet defines properties for the paths `["Training"]` and `["Training", "supervisor"]`, and +thus these two path lists would be returned for the key with this sheet's sheetname. + +Parameters +---------- +workbook: Workbook + The workbook to analyze. + +Returns +------- +out: dict[str, list[list[str]] + A dict with worksheet names as keys and lists of paths (represented as string lists) as values. +""" + result: dict[str, list[list[str]]] = {} + for sheet in workbook.worksheets: + paths = [] + added = set() + for col in get_data_columns(sheet).values(): + rep = p2s(col.path[:-1]) + if rep not in added: + paths.append(col.path[:-1]) + added.add(rep) + result[sheet.title] = paths + return result + + +def get_data_columns(sheet: Worksheet) -> dict[str, SimpleNamespace]: + """Return the data paths of the worksheet. + +Returns +------- +out: dict[str, SimpleNamespace] + The keys are the stringified paths. The values are SimpleNamespace objects with ``index``, + ``path`` and ``column`` attributes. + """ + column_types = _get_column_types(sheet) + path_rows = get_path_rows(sheet) + result = OrderedDict() + for for_idx, name in column_types.items(): + if name not in ( + ColumnType.SCALAR.name, + ColumnType.LIST.name, + ColumnType.MULTIPLE_CHOICE.name, + ): + continue + path = [] + for row in path_rows: + component = sheet.cell(row=row+1, column=for_idx+1).value + if component is None: + break + assert isinstance(component, str), f"Expected string: {component}" + path.append(component) + result[p2s(path)] = SimpleNamespace(index=for_idx, path=path, + column=list(sheet.columns)[for_idx]) + return result + + +def get_foreign_key_columns(sheet: Worksheet) -> dict[str, SimpleNamespace]: + """Return the foreign keys of the worksheet. + +Returns +------- +out: dict[str, SimpleNamespace] + The keys are the stringified paths. The values are SimpleNamespace objects with ``index``, + ``path`` and ``column`` attributes. 
+ """ + column_types = _get_column_types(sheet) + path_rows = get_path_rows(sheet) + result = OrderedDict() + for for_idx, name in column_types.items(): + if name != ColumnType.FOREIGN.name: + continue + path = [] + for row in path_rows: + component = sheet.cell(row=row+1, column=for_idx+1).value + if component is None: + break + assert isinstance(component, str), f"Expected string: {component}" + path.append(component) + result[p2s(path)] = SimpleNamespace(index=for_idx, path=path, + column=list(sheet.columns)[for_idx]) + return result + + +def get_path_position(sheet: Worksheet) -> tuple[list[str], str]: + """Return a path which represents the parent element, and the sheet's "proper name". + +For top-level sheets / entries (those without foreign columns), the path is an empty list. + +A sheet's "proper name" is detected from the data column paths: it is the first component after the +parent components. + +Returns +------- +parent: list[str] + Path to the parent element. Note that there may be list elements on the path which are **not** + represented in this return value. + +proper_name: str + The "proper name" of this sheet. This defines an array where all the data lives, relative to the + parent path. + """ + # Parent element: longest common path shared among any foreign column and all the data columns + parent: list[str] = [] + + # longest common path in data colums + data_paths = [el.path for el in get_data_columns(sheet).values()] + for ii in range(min(len(path) for path in data_paths)): + components_at_index = {path[ii] for path in data_paths} + if len(components_at_index) > 1: + break + longest_data_path = data_paths[0][:ii] + + # longest common overall path + foreign_paths = [el.path for el in get_foreign_key_columns(sheet).values()] + ii = 0 # If no foreign_paths, proper name is the first element + for foreign_path in foreign_paths: + for ii in range(min([len(foreign_path), len(longest_data_path)])): + components_at_index = {foreign_path[ii], longest_data_path[ii]} + if len(components_at_index) > 1: + break + if ii > len(parent): + parent = foreign_path[:ii] + + return parent, data_paths[0][ii] + + +def get_path_rows(sheet: Worksheet): + """Return the 0-based indices of the rows which represent paths.""" + rows = [] + rt_col = get_row_type_column_index(sheet) + for cell in list(sheet.columns)[rt_col]: + if cell.value == RowType.PATH.name: + rows.append(cell.row-1) + return rows + + +def get_row_type_column_index(sheet: Worksheet): + """Return the column index (0-indexed) of the column which defines the row types. + """ + for col in sheet.columns: + for cell in col: + if cell.value == RowType.COL_TYPE.name: + return cell.column - 1 + raise ValueError("The column which defines row types (COL_TYPE, PATH, ...) 
is missing") + + +def get_subschema(path: list[str], schema: dict) -> dict: + """Return the sub schema at ``path``.""" + if path: + if schema["type"] == "object": + next_schema = schema["properties"][path[0]] + return get_subschema(path=path[1:], schema=next_schema) + if schema["type"] == "array": + items = schema["items"] + if "enum" in items: + return schema + next_schema = items["properties"][path[0]] + return get_subschema(path=path[1:], schema=next_schema) + return schema + + +def get_worksheet_for_path(path: list[str], defining_path_index: dict[str, list[list[str]]]) -> str: + """Find the sheet name which corresponds to the given path.""" + for sheetname, paths in defining_path_index.items(): + if path in paths: + return sheetname + raise KeyError(f"Could not find defining worksheet for path: {path}") + + +def is_exploded_sheet(sheet: Worksheet) -> bool: + """Return True if this is a an "exploded" sheet. + + An exploded sheet is a sheet whose data entries are LIST valued properties of entries in another + sheet. A sheet is detected as exploded iff it has FOREIGN columns. + """ + column_types = _get_column_types(sheet) + return ColumnType.FOREIGN.name in column_types.values() + + +def next_row_index(sheet: Worksheet) -> int: + """Return the index for the next data row. + + This is defined as the first row without any content. + """ + return sheet.max_row + + +def p2s(path: list[str]) -> str: + """Path to string: dot-separated. + """ + return ".".join(path) + + +def parse_multiple_choice(value: Any) -> bool: + """Interpret ``value`` as a multiple choice input. + +*Truthy* values are: +- The boolean ``True``. +- The number "1". +- The (case-insensitive) strings ``true``, ``wahr``, ``x``, ``√``, ``yes``, ``ja``, ``y``, ``j``. + +*Falsy* values are: +- The boolean ``False``. +- ``None``, empty strings, lists, dicts. +- The number "0". +- The (case-insensitive) strings ``false``, ``falsch``, ``-``, ``no``, ``nein``, ``n``. +- Everything else. + +Returns +------- +out: bool + The interpretation result of ``value``. + """ + # Non-string cases first: + # pylint: disable-next=too-many-boolean-expressions + if (value is None or value is False or value == 0 + or value == [] or value == {} or value == ""): + return False + if (value is True or value == 1): + return True + + # String cases follow: + if not isinstance(value, str): + return False + value = value.lower() + + if value in TRUTHY: + return True + + # Strictly speaking, this test is not necessary, but I think it's good practice. + if value in FALSY: + return False + return False + + +def read_or_dict(data: Union[dict, str, TextIO]) -> dict: + """If data is a json file name or input stream, read data from there. +If it is a dict already, just return it.""" + if isinstance(data, dict): + return data + + if isinstance(data, str): + with open(data, encoding="utf-8") as infile: + data = json.load(infile) + elif hasattr(data, "read"): + data = json.load(data) + else: + raise ValueError(f"I don't know how to handle the datatype of `data`: {type(data)}") + assert isinstance(data, dict) + return data + + +def _get_column_types(sheet: Worksheet) -> OrderedDict: + """Return an OrderedDict: column index -> column type for the sheet. 
+ """ + result = OrderedDict() + type_row_index = get_row_type_column_index(sheet) + for idx, col in enumerate(sheet.columns): + type_cell = col[type_row_index] + result[idx] = type_cell.value if type_cell.value is not None else ( + ColumnType.IGNORE.name) + assert (hasattr(ColumnType, result[idx]) or result[idx] == RowType.COL_TYPE.name), ( + f"Unexpected column type value ({idx}{type_row_index}): {type_cell.value}") + return result diff --git a/src/doc/conf.py b/src/doc/conf.py index b94d3060eac8f165f62b6d6fc0417c3c52d4c2a5..e3a95f36ce69146adbc3b91c9f28a3169286d63d 100644 --- a/src/doc/conf.py +++ b/src/doc/conf.py @@ -27,9 +27,9 @@ copyright = '2023, IndiScale GmbH' author = 'Daniel Hornung' # The short X.Y version -version = '0.10.0' +version = '0.11.0' # The full version, including alpha/beta/rc tags -release = '0.10.0' +release = '0.11.0' # -- General configuration --------------------------------------------------- diff --git a/src/doc/index.rst b/src/doc/index.rst index 7fa017ec4202f25fe9f94a154ed8762c4581eebc..7032e2c24ea32b0f1efad2bd2e5b7930259daf61 100644 --- a/src/doc/index.rst +++ b/src/doc/index.rst @@ -18,6 +18,7 @@ This documentation helps you to :doc:`get started<README_SETUP>`, explains the m Specifying a datamodel with JSON schema <json_schema_interface> Convert a data model into a json schema <json_schema_exporter> Conversion between XLSX, JSON and LinkAhead Entities <table-json-conversion/specs> + Other utilities <utilities> _apidoc/modules Related Projects <related_projects/index> Back to overview <https://docs.indiscale.com/> diff --git a/src/doc/table-json-conversion/specs.md b/src/doc/table-json-conversion/specs.md deleted file mode 100644 index 73e480440eccb923fd0979dc2adb653146667951..0000000000000000000000000000000000000000 --- a/src/doc/table-json-conversion/specs.md +++ /dev/null @@ -1,253 +0,0 @@ -# Conversion between LinkAhead data models, JSON schema, and XLSX (and vice versa) # - -This file describes the conversion between JSON schema files and XLSX templates, and between JSON -data files following a given schema and XLSX files with data. This conversion is handled by the -Python modules in the `table_json_conversion` library. - -Requirements: When converting from a json schema, the top level of the json schema must be a -dict. The keys of the dict are RecordType names. - -## Data models in JSON Schema and JSON data ## - -The data model in LinkAhead defines the types of records present in a LinkAhead instance and their -structure. This data model can also be represented in a JSON Schema, which defines the structure of -JSON files containing records pertaining to the data model. - -For example, the following JSON can describe a "Person" Record: - -```JSON -{ - "Person": { - "family_name": "Steve", - "given_name": "Stevie" - } -} -``` - -A *JSON Schema* specifies a concrete structure, and the associated JSON files can be used to -represent data for specific record structures. For instance, one could create a JSON Schema allowing -the storage of "Training" Records containing information about conducted trainings. This is -particularly valuable for data import and export. One could generate web forms from the JSON Schema -or use it to export objects stored in LinkAhead as JSON. - -## From JSON to XLSX: Data Representation ## - -The following describes how JSON files representing LinkAhead records are converted into XLSX files, -or how JSON files with records are created from XLSX files. 
- -The attribute name (e.g., "Person" above) determines the RecordType, and the value of this attribute -can either be an object or a list. If it is an object (as in the example above), a single record is -represented. In the case of a list, multiple records sharing the same RecordType as the parent are -represented. - -The *Properties* of the record (e.g., `family_name` and `given_name` above) become *columns* in the -XLSX file. These properties have an attribute name and a value. The value can be: - -a. A primitive (text, number, boolean, ...) -b. A record -c. A list of primitive types -d. A list of records - -In cases *a.* and *c.*, a cell is created in the column corresponding to the property in the XLSX -file. In case *b.*, columns are created for the Properties of the record, where for each of the -Properties the cases *a.* - *d.* are considered recursively. - -For case *d.* however, the two-dimensional structure of an XLSX sheet is not sufficient. Therefore, -for such cases, *new* XLSX sheets/tables are created. - -In these sheets/tables, the referenced records are treated as described above (new columns for the -Properties). However, there are now additional columns that indicate from which "external" record -these records are referenced. - -Let's now consider these four cases in detail and with examples: - -### a. Properties with Primitive Data Types ### - -```JSON -{ - "Training": { - "date": "2023-01-01", - "url": "www.indiscale.com", - "duration": 1.0, - "participants": 1, - "remote": false - } -} -``` - -This entry is represented in an XLSX sheet with the following content: -date url duration participants remote -2023-01-01 www.indiscale.com 1.0 1 false - - -### a. Properties mit primitiven Datentypen ### - -```JSON -{ - "Training": { - "date": "2023-01-01", - "url": "www.indiscale.com", - "duration": 1.0, - "participants": 1, - "remote": false - } -} -``` - -This entry will be represented in an XLSX sheet with the following content: - -| date | url | duration | participants | remote | -|------------|-------------------|----------|--------------|--------| -| 2023-01-01 | www.indiscale.com | 1.0 | 1 | false | - -### b. Property referencing a record ### - -```JSON -{ - "Training": { - "date": "2023-01-01", - "supervisor": { - "family_name": "Stevenson", - "given_name": "Stevie", - } - } -} -``` - -This entry will be represented in an XLSX sheet with the following content: - -| date | `supervisor.family_name` | `supervisor.given_name` | -|------------|--------------------------|-------------------------| -| 2023-01-01 | Stevenson | Stevie | - -Note that column names may be renamed. The mapping of columns to properties of records is ensured -through the content of hidden rows. (See below for the definition of hidden rows.) - -### c. Properties containing lists of primitive data types ### - -```JSON -{ - "Training": { - "url": "www.indiscale.com", - "subjects": ["Math", "Physics"], - } -} -``` - -This entry would be represented in an XLSX sheet with the following content: - -| url | subjects | -|-------------------|--------------| -| www.indiscale.com | Math;Physics | - -The list elements are written into the cell separated by `;` (semicolon). If the elements contain -the separator `;`, it is escaped with `\\`. - -### d. 
Properties containing lists with references ### - -```JSON -{ - "Training": { - "date": "2023-01-01", - "coach": [ - { - "family_name": "Sky", - "given_name": "Max", - }, - { - "family_name": "Sky", - "given_name": "Min", - } - ] - } -} -``` - -Since the two coaches cannot be represented properly in a single cell, another worksheet is needed -to contain the properties of the coaches. - -The sheet for the Trainings in this example only contains the "date" column - -| date | -|------------| -| 2023-01-01 | - -Additionally, there is *another* sheet where the coaches are stored. Here, it is crucial to define -how the correct element is chosen from potentially multiple "Trainings". In this case, it means that -the "date" must be unique. - -Note: This uniqueness requirement is not strictly checked right now, it is your responsibility as a -user that such "foreign properties" are truly unique. - -The second sheet looks like this: - -| date | `coach.family_name` | `coach.given_name` | -|------------|---------------------|--------------------| -| 2023-01-01 | Sky | Max | -| 2023-01-01 | Sky | Min | - -## Data in XLSX: Hidden automation logic ## - -### First column: Marker for row types ### - -The first column in each sheet will be hidden and it will contain an entry in each row that needs -special treatment. The following values are used: - -- ``IGNORE``: This row is ignored. It can be used for explanatory texts or layout. -- ``COL_TYPE``: Typically the first row that is not `IGNORE`. It indicates the row that defines the - type of columns (`FOREIGN`, `SCALAR`, `LIST`, `IGNORE`). This row may occur only once. -- ``PATH``: Indicates that the row is used to define the path within the JSON. These rows are - typically hidden for users. - -An example table could look like this: - -| `IGNORE` | | Welcome | to this | file! | | -| `IGNORE` | | Please | enter your | data here: | | -| `COL_TYPE` | `IGNORE` | `SCALAR` | `SCALAR` | `LIST` | `SCALAR` | -| `PATH` | | `Training` | `Training` | `Training` | `Training` | -| `PATH` | | `url` | `date` | `subjects` | `supervisor` | -| `PATH` | | | | | `email` | -| `IGNORE` | Please enter one training per line. | Training URL | Training date | Subjects | Supervisor's email | -|------------|-------------------------------------|----------------|---------------|--------------|--------------------| -| | | example.com/mp | 2024-02-27 | Math;Physics | steve@example.com | -| | | example.com/m | 2024-02-27 | Math | stella@example.com | - -### Parsing XLSX data ### - -To extract the value of a given cell, we traverse all path elements (in ``PATH`` rows) from top to -bottom. The final element of the path is the name of the Property to which the value belongs. In -the example above, `steve@example.com` is the value of the `email` Property in the path -`["Training", "supervisor", "email"]`. - -The path elements are sufficient to identify the object within a JSON, at least if the corresponding -JSON element is a single object. If the JSON element is an array, the appropriate object within the -array needs to be selected. - -For this selection additional ``FOREIGN`` columns are used. The paths in these columns must all have -the same *base* and one additional *unique key* component. For example, two `FOREIGN` columns could -be `["Training", "date"]` and `["Training", "url"]`, where `["Training"]` is the *base path* and -`"date"` and `"url"` are the *unique keys*. 
- -The base path defines the table (or recordtype) to which the entries belong, and the values of the -unique keys define the actual rows to which data belongs. - -For example, this table defines three coaches for the two trainings from the last table: - -| `COL_TYPE` | `FOREIGN` | `FOREIGN` | `SCALAR` | -| `PATH` | `Training` | `Training` | `Training` | -| `PATH` | `date` | `url` | `coach` | -| `PATH` | | | `given_name` | -| `IGNORE` | Date of training | URL of training | The coach's given name | -| `IGNORE` | from sheet 'Training' | from sheet 'Training' | | -|------------|-----------------------|-----------------------|------------------------| -| | 2024-02-27 | example.com/mp | Ada | -| | 2024-02-27 | example.com/mp | Berta | -| | 2024-02-27 | example.com/m | Chris | - -## Current limitations ## - -The current implementation still lacks the following: - -- Lists of enum references are not yet implemented as columns where matching cell can simply be - ticked/crossed. -- Files handling is not implemented yet. diff --git a/src/doc/table-json-conversion/specs.rst b/src/doc/table-json-conversion/specs.rst new file mode 100644 index 0000000000000000000000000000000000000000..c98eddc1180f552f1d2389b1bb57979e93550ab8 --- /dev/null +++ b/src/doc/table-json-conversion/specs.rst @@ -0,0 +1,527 @@ +Conversion between LinkAhead data models, JSON schema, and XLSX (and vice versa) +================================================================================ + +This file describes the conversion between JSON schema files and XLSX +templates, and between JSON data files following a given schema and XLSX +files with data. This conversion is handled by the Python modules in the +``table_json_conversion`` library. + +Data models in JSON Schema and JSON data +---------------------------------------- + +Let’s start simple! If you would describe a ``Person`` Record with the +Properties ``family_name`` and ``given_name`` in JSON, it would probably +look like this: + +.. code:: json + + { + "Person": + { + "family_name": "Steve", + "given_name": "Stevie" + } + } + +The data model in LinkAhead defines the types of records present in a +LinkAhead instance and their structure. This data model can also be +represented in a JSON Schema, which defines the structure of JSON files +containing records pertaining to the data model. + +You can define this kind of structure with the following JSON schema: + +.. code:: json + + { + "type": "object", + "properties": { + "Person": { + "type": "object", + "properties": { + "family_name": { + "type": "string" + }, + "given_name": { + "type": "string" + } + } + } + }, + "$schema": "https://json-schema.org/draft/2020-12/schema" + } + +The above schema (and schemas created by +``json_schema_exporter.merge_schemas(...)``) is, from a broad view, a +dict with all the top level recordtypes (the recordtype names are the +keys). This is sufficient to describe the data model. However, actual +data often consists of multiple entries of the same type (e.g. multiple +Persons). + +Since the data model schema does not match multiple data sets, there is +a utility function which creates a *data array* schema out of the *data +model* schema: It basically replaces the top-level entries of the data +model by lists which may contain data. + +For example, the following JSON describes two “Person” Records: + +.. 
code:: json + + { + "Person": [ + { + "family_name": "Steve", + "given_name": "Stevie" + }, + { + "family_name": "Star", + "given_name": "Stella" + } + ] + } + +The *JSON Schema* for a JSON like the above one could look like the +following: + +.. code:: json + + { + "type": "object", + "properties": { + "Person": { + "type": "array", + "items": { + "type": "object", + "properties": { + "family_name": { + "type": "string" + }, + "given_name": { + "type": "string" + } + } + } + } + }, + "$schema": "https://json-schema.org/draft/2020-12/schema" + } + +This would define that the top level object/dict may have a key +``Person`` which has as value an array of objects that in turn have the +properties ``family_name`` and ``given_name``. + +You can create a data array schema from a data model schema using +``xlsx_utils.array_schema_from_model_schema``. + +From JSON to XLSX: Data Representation +-------------------------------------- + +The following describes how JSON files representing LinkAhead records +are converted into XLSX files, or how JSON files with records are +created from XLSX files. + +The attribute name (e.g., “Person” above) determines the RecordType, and +the value of this attribute can either be an object or a list. If it is +an object (as in the example above), a single record is represented. In +the case of a list, multiple records sharing the same RecordType as the +parent are represented. + +The *Properties* of the record (e.g., ``family_name`` and ``given_name`` +above) become *columns* in the XLSX file. Thus the XLSX file created +from the above example would have a sheet “Person” with the following +table: + +========== =========== +given_name family_name +========== =========== +Stevie Steve +Stella Star +========== =========== + +The properties of objects (Records) in the JSON have an attribute name +and a value. The value can be: + +a. A primitive (text, number, boolean, …) +b. A record +c. A list of primitive types +d. A list of unique enums (multiple choice) +e. A list of records + +In cases *a.* and *c.*, a cell is created in the column corresponding to +the property in the XLSX file. In case *b.*, columns are created for the +Properties of the record, where for each of the Properties the cases +*a.* - *e.* are considered recursively. Case *d.* leads to a number of +columns, one for each of the possible choices. + +For case *e.* however, the two-dimensional structure of an XLSX sheet is +not sufficient. Therefore, for such cases, *new* XLSX sheets/tables are +created. + +In these sheets/tables, the referenced records are treated as described +above (new columns for the Properties). However, there are now +additional columns that indicate from which “external” record these +records are referenced. + +Let’s now consider these five cases in detail and with examples: + +a. Properties with primitive data types +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. 
code:: json
+
+   {
+     "Training": [
+       {
+         "date": "2023-01-01",
+         "url": "www.indiscale.com",
+         "duration": 1.0,
+         "participants": 1,
+         "remote": false
+       },
+       {
+         "date": "2023-06-15",
+         "url": "www.indiscale.com/next",
+         "duration": 2.5,
+         "participants": null,
+         "remote": true
+       }
+     ]
+   }
+
+This entry will be represented in an XLSX sheet with the following
+content:
+
++------------+------------------------+----------+--------------+--------+
+| date       | url                    | duration | participants | remote |
++============+========================+==========+==============+========+
+| 2023-01-01 | www.indiscale.com      | 1.0      | 1            | false  |
++------------+------------------------+----------+--------------+--------+
+| 2023-06-15 | www.indiscale.com/next | 2.5      |              | true   |
++------------+------------------------+----------+--------------+--------+
+
+b. Property referencing a record
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. code:: json
+
+   {
+     "Training": [
+       {
+         "date": "2023-01-01",
+         "supervisor": {
+           "family_name": "Stevenson",
+           "given_name": "Stevie"
+         }
+       }
+     ]
+   }
+
+This entry will be represented in an XLSX sheet named "Training" with the following
+content:
+
+========== ========================== =========================
+date       supervisor.family_name     supervisor.given_name
+========== ========================== =========================
+2023-01-01 Stevenson                  Stevie
+========== ========================== =========================
+
+
+c. Properties containing lists of primitive data types
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. code:: json
+
+   {
+     "Training": [
+       {
+         "url": "www.indiscale.com",
+         "subjects": ["Math", "Physics"]
+       }
+     ]
+   }
+
+This entry would be represented in an XLSX sheet with the following
+content:
+
+================= ============
+url               subjects
+================= ============
+www.indiscale.com Math;Physics
+================= ============
+
+The list elements are written into the cell separated by ``;``
+(semicolon). If the elements contain the separator ``;``, it is escaped
+with ``\``.
+
+d. Multiple choice properties
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. code:: json
+
+   {
+     "Training": [
+       {
+         "date": "2024-04-17",
+         "skills": [
+           "Planning",
+           "Evaluation"
+         ]
+       }
+     ]
+   }
+
+If the ``skills`` list is denoted as an ``enum`` array with
+``"uniqueItems": true`` in the json schema, this entry would be
+represented like this in an XLSX:
+
++------------+-----------------+----------------------+-------------------+
+| date       | skills.Planning | skills.Communication | skills.Evaluation |
++============+=================+======================+===================+
+| 2024-04-17 | x               |                      | x                 |
++------------+-----------------+----------------------+-------------------+
+
+Note that this example assumes that the list of possible choices, as
+given in the json schema, was “Planning, Communication, Evaluation”.
+
+e. Properties containing lists with references
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. code:: json
+
+   {
+     "Training": [
+       {
+         "date": "2023-01-01",
+         "coach": [
+           {
+             "family_name": "Sky",
+             "given_name": "Max"
+           },
+           {
+             "family_name": "Sky",
+             "given_name": "Min"
+           }
+         ]
+       }
+     ]
+   }
+
+Since the two coaches cannot be represented properly in a single cell,
+another worksheet is needed to contain the properties of the coaches.
+ +The sheet for the Trainings in this example only contains the “date” +column + ++------------+ +| date | ++============+ +| 2023-01-01 | ++------------+ + +Additionally, there is *another* sheet where the coaches are stored. +Here, it is crucial to define how the correct element is chosen from +potentially multiple “Trainings”. In this case, it means that the “date” +must be unique. + + +The second sheet looks like this: + +========== ===================== ==================== +date ``coach.family_name`` ``coach.given_name`` +========== ===================== ==================== +2023-01-01 Sky Max +2023-01-01 Sky Min +========== ===================== ==================== + +Note: This uniqueness requirement is not strictly checked right now, it +is your responsibility as a user that such “foreign properties” are +truly unique. + +When converting JSON files that contain Records that were exported from LinkAhead +it might be a good idea to use the LinkAhead ID as a unique identifier for Records. However, if +your Records do not yet have LinkAhead IDs you need to find some other identifying +properties/foreign keys. Note, that those properties only need to identify a Record uniquely within +the list of Records: In the above example the "coach" Record needs to be identified in the list of +coaches. + + +Data in XLSX: Hidden automation logic +------------------------------------- + +First column: Marker for row types +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The first column in each sheet will be hidden and it will contain an +entry in each row that needs special treatment. The following values are +used: + +- ``IGNORE``: This row is ignored. It can be used for explanatory texts + or layout. +- ``COL_TYPE``: Typically the first row that is not ``IGNORE``. It + indicates the row that defines the type of columns (``FOREIGN``, + ``SCALAR``, ``LIST``, ``MULTIPLE_CHOICE``, ``IGNORE``). This row must + occur exactly once per sheet. +- ``PATH``: Indicates that the row is used to define the path within + the JSON. These rows are typically hidden for users. + +An example table could look like this: + ++----------+-------------------------------------+----------------+---------------+--------------+---------------------+ +| IGNORE | | Welcome | to | this | file | ++==========+=====================================+================+===============+==============+=====================+ +| IGNORE | | Please | enter your | data here: | | ++----------+-------------------------------------+----------------+---------------+--------------+---------------------+ +| COL_TYPE | IGNORE | SCALAR | SCALAR | LIST | SCALAR | ++----------+-------------------------------------+----------------+---------------+--------------+---------------------+ +| PATH | | Training | Training | Training | Training | ++----------+-------------------------------------+----------------+---------------+--------------+---------------------+ +| PATH | | url | date | subjects | supervisor | ++----------+-------------------------------------+----------------+---------------+--------------+---------------------+ +| PATH | | | | | email | ++----------+-------------------------------------+----------------+---------------+--------------+---------------------+ +| IGNORE | Please enter one training per line. 
| Training URL | Training date | Subjects | Supervisor's email | ++----------+-------------------------------------+----------------+---------------+--------------+---------------------+ +| | | example.com/mp | 2024-02-27 | Math;Physics | steve@example.com | ++----------+-------------------------------------+----------------+---------------+--------------+---------------------+ +| | | example.com/m | 2024-02-28 | Math | stella@example.com | ++----------+-------------------------------------+----------------+---------------+--------------+---------------------+ + + +Parsing XLSX data +~~~~~~~~~~~~~~~~~ + +To extract the value of a given cell, we traverse all path elements (in +``PATH`` rows) from top to bottom. The final element of the path is the +name of the Property to which the value belongs. In the example above, +``steve@example.com`` is the value of the ``email`` Property in the path +``["Training", "supervisor", "email"]``. + +The path elements are sufficient to identify the object within a JSON, +at least if the corresponding JSON element is a single object. If the +JSON element is an array, the appropriate object within the array needs +to be selected. + +For this selection additional ``FOREIGN`` columns are used. The paths in +these columns must all have the same *base* and one additional *unique +key* component. For example, two ``FOREIGN`` columns could be +``["Training", "date"]`` and ``["Training", "url"]``, where +``["Training"]`` is the *base path* and ``"date"`` and ``"url"`` are the +*unique keys*. + +The base path defines the table (or recordtype) to which the entries +belong, and the values of the unique keys define the actual rows to +which data belongs. + +For example, this table defines three coaches for the two trainings from +the last table: + ++----------+-----------------------+-----------------------+------------------------+ +| COL_TYPE | FOREIGN | FOREIGN | SCALAR | ++----------+-----------------------+-----------------------+------------------------+ +| PATH | Training | Training | Training | ++----------+-----------------------+-----------------------+------------------------+ +| PATH | date | url | coach | ++----------+-----------------------+-----------------------+------------------------+ +| PATH | | | given_name | ++----------+-----------------------+-----------------------+------------------------+ +| IGNORE | Date of training | URL of training | The coach’s given name | ++----------+-----------------------+-----------------------+------------------------+ +| IGNORE | from sheet ‘Training’ | from sheet ‘Training’ | | ++----------+-----------------------+-----------------------+------------------------+ +| | 2024-02-27 | example.com/mp | Ada | ++----------+-----------------------+-----------------------+------------------------+ +| | 2024-02-27 | example.com/mp | Berta | ++----------+-----------------------+-----------------------+------------------------+ +| | 2024-02-28 | example.com/m | Chris | ++----------+-----------------------+-----------------------+------------------------+ + +Sepcial case: multiple choice “checkboxes” +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +As a special case, enum arrays with ``"uniqueItems": true`` can be +represented as multiple columns, with one column per choice. The choices +are denoted as the last PATH component, the column type must be +MULTIPLE_CHOICE. + +Stored data is denoted as an “x” character in the respective cell, empty +cells denote that the item was not selected. 
Additionally, the
+implementation also allows TRUE or 1 for selected items, and
+FALSE, 0 or cells with only whitespace characters for deselected
+items:
+
++----------+-----------------+----------------------+-------------------+
+| COL_TYPE | MULTIPLE_CHOICE | MULTIPLE_CHOICE      | MULTIPLE_CHOICE   |
++----------+-----------------+----------------------+-------------------+
+| PATH     | skills          | skills               | skills            |
++----------+-----------------+----------------------+-------------------+
+| PATH     | Planning        | Communication        | Evaluation        |
++----------+-----------------+----------------------+-------------------+
+| IGNORE   | skills.Planning | skills.Communication | skills.Evaluation |
++----------+-----------------+----------------------+-------------------+
+|          | x               |                      | X                 |
++----------+-----------------+----------------------+-------------------+
+|          | " "             | TRUE                 | FALSE             |
++----------+-----------------+----------------------+-------------------+
+|          | 0               | x                    | 1                 |
++----------+-----------------+----------------------+-------------------+
+
+These rows correspond to:
+
+1. Planning, Evaluation
+2. Communication
+3. Communication, Evaluation
+
+
+User Interaction
+----------------
+
+The primary and most straightforward use case of this utility is to export
+LinkAhead data as JSON and then as XLSX tables. This can be done fully
+automatically.
+
+TODO show how! (A preliminary sketch is given in the example section at the
+end of this document.)
+
+The hidden cells for automation are designed so that the generated XLSX
+template can be customized into a nicely formatted table. The hidden content
+must remain. See below for tips on how to manipulate the table.
+
+The second use case is to use XLSX to collect data and then import it into
+LinkAhead. Here, it may be necessary to define foreign keys in order to
+identify Records in lists.
+
+Table Manipulation
+~~~~~~~~~~~~~~~~~~
+
+- All formatting is ignored.
+- Nothing special has to be observed when adding new data rows.
+- When adding new descriptive rows (for example one with descriptions of the
+  columns), the ``COL_TYPE`` must be set to ``IGNORE``.
+- You can freely rename sheets.
+- You can freely rename columns (since the row containing the column names is
+  set to ``IGNORE``; the Property name is taken from the last path element).
+- You can change the order of columns. However, you have to make sure to move
+  the full column including hidden elements. Thus you should not select a range
+  of cells, but click on the column index in your spreadsheet program.
+
+Note: Requirements
+------------------
+
+This conversion does not allow arbitrary JSON schema files nor does it
+support arbitrary JSON files since conversion to XLSX files would not
+make sense. Instead, this utility is tailored to support the conversion of
+data (models) that are structured like data (models) in LinkAhead:
+
+- The JSON schema describes a data model of RecordTypes and Properties as it
+  would be generated by the ``caosadvancedtools.json_schema_exporter`` module.
+- The JSON files must contain arrays of Records complying with such a data
+  model.
+
+Thus, when converting from a JSON schema, the top level of the JSON
+schema must be a dict. The keys of the dict are RecordType names.
+
+
+Current limitations
+-------------------
+
+The current implementation still lacks the following:
+
+- Files handling is not implemented yet.
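+
+Example: Python API sketch
+--------------------------
+
+The following sketch illustrates how the conversion can be triggered
+programmatically. The file names are placeholders, an XLSX template matching
+the JSON schema is assumed to exist already, and the keyword arguments mirror
+those used in this repository's unit tests:
+
+.. code-block:: python
+
+   import json
+
+   from caosadvancedtools.table_json_conversion import convert
+   from caosadvancedtools.table_json_conversion.fill_xlsx import fill_template
+
+   # JSON -> XLSX: fill an existing template with the data from "data.json".
+   with open("data.json", encoding="utf-8") as json_file:
+       data = json.load(json_file)
+   fill_template(data=data, template="template.xlsx", result="filled.xlsx")
+
+   # XLSX -> JSON: read a filled workbook back into a Python dict, using the
+   # JSON schema to interpret the columns.
+   result = convert.to_dict(xlsx="filled.xlsx", schema="schema.json")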
+ diff --git a/src/doc/utilities.rst b/src/doc/utilities.rst new file mode 100644 index 0000000000000000000000000000000000000000..4d520ae2d4b7a9bbd81171ba002c4f736223713a --- /dev/null +++ b/src/doc/utilities.rst @@ -0,0 +1,37 @@ +Other utilities in LinkAhead Advanced User Tools +================================================ + +The table file importer +%%%%%%%%%%%%%%%%%%%%%%% + +The LinkAhead Advanced user tools provide a generic +:py:class:`~caosadvancedtools.table_importer.TableImporter` class which reads +different table file formats (at the time of writing of this documentation, +.xls(x), .csv, and .tsv) and converts them into :py:class:`pandas.DataFrame` +objects. It provides helper functions for converting column values (e.g., +converting the string values "yes" or "no" to ``True`` or ``False``), checking +the presence of obligatory columns in a table and whether those have missing +values, and datatype checks. + +The base class :py:class:`~caosadvancedtools.table_importer.TableImporter` +provides the general verification methods, while each subclass like +:py:class:`~caosadvancedtools.table_importer.XLSXImporter` or +:py:class:`~caosadvancedtools.table_importer.CSVImporter` implements its own +``read_file`` function that is used to convert a given table file into a +:py:class:`pandas.DataFrame`. + +Empty fields in integer columns +-------------------------------- + +Reading in table files that have integer-valued columns with missing data can +result in datatype contradictions (see the Pandas documentation on `nullable +integers <https://pandas.pydata.org/docs/user_guide/integer_na.html>`_) since +the default value for missing fields, ``numpy.nan``, is a float. This is why +from version 0.11 and above, the ``TableImporter`` uses +:py:class:`pandas.Int64Dtype` as the default datatype for all integer columns +which allows for empty fields while keeping all actual data integer-valued. This +behavior can be changed by initializing the ``TableImporter`` with +``convert_int_to_nullable_int=False`` in which case a +:py:class:`~caosadvancedtools.datainconsistency.DataInconsistencyError` is +raised when an empty field is encountered in a column with an non-nullable +integer datatype. diff --git a/src/doc/yaml_interface.rst b/src/doc/yaml_interface.rst index 48e27802e72177424fd2c6258492b283bbff0d3b..6d2ec17867bd2cea408059b93375624c5680ffe5 100644 --- a/src/doc/yaml_interface.rst +++ b/src/doc/yaml_interface.rst @@ -1,4 +1,3 @@ - =============================== YAML data model specification =============================== @@ -20,10 +19,10 @@ in the library sources. Person: recommended_properties: firstName: - datatype: TEXT + datatype: TEXT description: 'first name' lastName: - datatype: TEXT + datatype: TEXT description: 'last name' LabbookEntry: recommended_properties: @@ -53,7 +52,7 @@ This example defines 3 ``RecordTypes``: - A ``Project`` with one obligatory property ``datatype`` - A Person with a ``firstName`` and a ``lastName`` (as recommended properties) - A ``LabbookEntry`` with multiple recommended properties of different data types -- It is assumed that the server knows a RecordType or Property with the name +- It is assumed that the server knows a RecordType or Property with the name ``Textfile``. @@ -70,10 +69,58 @@ Note the difference between the three property declarations of ``LabbookEntry``: - ``responsible``: This defines and adds a property with name "responsible" to ``LabbookEntry`, which has a datatype ``Person``. ``Person`` is defined above. 
- ``firstName``: This defines and adds a property with the standard data type ``TEXT`` to record type ``Person``. -If the data model depends on record types or properties which already exist in CaosDB, those can be -added using the ``extern`` keyword: ``extern`` takes a list of previously defined names. +If the data model depends on record types or properties which already exist in +LinkAhead, those can be added using the ``extern`` keyword: ``extern`` takes a +list of previously defined names of Properties and/or RecordTypes. Note that if you happen to use an already existing ``REFERENCE`` property that has an already existing RecordType as datatype, you also need to add that RecordType's name to the ``extern`` list, e.g., + +.. code-block:: yaml + + extern: + # Let's assume the following is a reference property with datatype Person + - Author + # We need Person (since it's the datatype of Author) even though we might + # not use it explicitly + - Person + + Dataset: + recommended_properties: + Author: + +Reusing Properties +================== + +Properties defined once (either as a property of a Record or as a separate Property) can be reused +later in the yaml file. That requires that after the first occurrence of the property, the +attributes have to be empty. Otherwise the reuse of the property would be conflicting with its +original definition. + +Example: +-------- + + +.. code-block:: yaml + + Project: + obligatory_properties: + projectId: + datatype: INTEGER + description: 'UID of this project' + date: + datetype: DATETIME + description: Date of a project or an experiment + + Experiment: + obligatory_properties: + experimentId: + datatype: INTEGER + description: 'UID of this experiment' + date: # no further attributes here, since property was defined above in 'Project'! +The above example defines two Records: Project and Experiment +The property ``date`` is defined upon its first occurrence as a property of ``Project``. +Later, the same property is also added to ``Experiment`` where no additional attributes are +allowed to specify. Datatypes ========= @@ -117,7 +164,7 @@ You can use the yaml parser directly in python as follows: .. code-block:: python - + from caosadvancedtools.models import parser as parser model = parser.parse_model_from_yaml("model.yml") @@ -135,7 +182,7 @@ You can now use the functions from ``DataModel`` to synchronize the model with a CaosDB instance, e.g.: .. code-block:: python - + model.sync_data_model() .. 
LocalWords: yml projectId UID firstName lastName LabbookEntry entryId textElement labbook diff --git a/unittests/table_json_conversion/create_jsonschema.py b/unittests/table_json_conversion/create_jsonschema.py index 9585f5458edf8f9d3f785099295a3e675230932c..8ab4ad2d973b78522e858b3ee866b870ecf187a4 100755 --- a/unittests/table_json_conversion/create_jsonschema.py +++ b/unittests/table_json_conversion/create_jsonschema.py @@ -20,17 +20,18 @@ """ +from __future__ import annotations + import argparse import json -from typing import List import caosadvancedtools.json_schema_exporter as jsex from caosadvancedtools.models import parser # import tomli -def prepare_datamodel(modelfile, recordtypes: List[str], outfile: str, - do_not_create: List[str] = None): +def prepare_datamodel(modelfile, recordtypes: list[str], outfile: str, + do_not_create: list[str] = None): if do_not_create is None: do_not_create = [] model = parser.parse_model_from_yaml(modelfile) diff --git a/unittests/table_json_conversion/data/error_simple_data.json b/unittests/table_json_conversion/data/error_simple_data.json index bfea88b675ab2a6e0c1787fc401afec5c564c006..4d57b0335b4685ea82f1668d50b52a9d30ef1759 100644 --- a/unittests/table_json_conversion/data/error_simple_data.json +++ b/unittests/table_json_conversion/data/error_simple_data.json @@ -1,11 +1,11 @@ { - "Training": { + "Training": [{ "duration": 1.0, "participants": 0.5 - }, - "Person": { + }], + "Person": [{ "family_name": "Auric", "given_name": "Goldfinger", "Organisation": "Federal Reserve" - } + }] } diff --git a/unittests/table_json_conversion/data/indirect_data.json b/unittests/table_json_conversion/data/indirect_data.json index c77dd1ff2a703af6b6b2a0db19f450ac10616d9b..76db75d97e1dafff223ea2b27ecca1086d6bc4af 100644 --- a/unittests/table_json_conversion/data/indirect_data.json +++ b/unittests/table_json_conversion/data/indirect_data.json @@ -1,5 +1,5 @@ { - "Wrapper": { + "Wrapper": [{ "Results": [ { "year": 2022, @@ -14,5 +14,5 @@ "name": "Basic Training", "url": "www.example.com/training/basic" } - } + }] } diff --git a/unittests/table_json_conversion/data/indirect_data.xlsx b/unittests/table_json_conversion/data/indirect_data.xlsx index 894ec95f87aa32a618b3b70504727398f2ce2358..3d0cf3245a414a4161b99034051424c699b5d453 100644 Binary files a/unittests/table_json_conversion/data/indirect_data.xlsx and b/unittests/table_json_conversion/data/indirect_data.xlsx differ diff --git a/unittests/table_json_conversion/data/indirect_template.xlsx b/unittests/table_json_conversion/data/indirect_template.xlsx index cc614acb75b36e10143a29f28dff9fce7d5e006f..0c521e554027f565ecae6fe27783034361b8ff41 100644 Binary files a/unittests/table_json_conversion/data/indirect_template.xlsx and b/unittests/table_json_conversion/data/indirect_template.xlsx differ diff --git a/unittests/table_json_conversion/data/multiple_choice_data.json b/unittests/table_json_conversion/data/multiple_choice_data.json new file mode 100644 index 0000000000000000000000000000000000000000..ee24ef7adbd61abf22d47bb3d49f43f3e1e26501 --- /dev/null +++ b/unittests/table_json_conversion/data/multiple_choice_data.json @@ -0,0 +1,11 @@ +{ + "Training": [{ + "name": "Super Skill Training", + "date": "2024-04-17", + "skills": [ + "Planning", + "Evaluation" + ], + "exam_types": [] + }] +} diff --git a/unittests/table_json_conversion/data/multiple_choice_data.xlsx b/unittests/table_json_conversion/data/multiple_choice_data.xlsx new file mode 100644 index 
0000000000000000000000000000000000000000..28cf4007d8a1a061235863d12e5bdc5b5747f386 Binary files /dev/null and b/unittests/table_json_conversion/data/multiple_choice_data.xlsx differ diff --git a/unittests/table_json_conversion/data/multiple_choice_data_missing.xlsx b/unittests/table_json_conversion/data/multiple_choice_data_missing.xlsx new file mode 100644 index 0000000000000000000000000000000000000000..10d76529ab11d2e79ca0651313dd50f0cc326341 Binary files /dev/null and b/unittests/table_json_conversion/data/multiple_choice_data_missing.xlsx differ diff --git a/unittests/table_json_conversion/data/multiple_choice_schema.json b/unittests/table_json_conversion/data/multiple_choice_schema.json new file mode 100644 index 0000000000000000000000000000000000000000..71bf0379aba4ad6f8510581ba0defadb81a66609 --- /dev/null +++ b/unittests/table_json_conversion/data/multiple_choice_schema.json @@ -0,0 +1,57 @@ +{ + "type": "object", + "properties": { + "Training": { + "type": "object", + "required": [], + "additionalProperties": false, + "title": "Training", + "properties": { + "name": { + "type": "string", + "description": "The name of the Record to be created" + }, + "date": { + "description": "The date of the training.", + "anyOf": [ + { + "type": "string", + "format": "date" + }, + { + "type": "string", + "format": "date-time" + } + ] + }, + "skills": { + "description": "Skills that are trained.", + "type": "array", + "items": { + "enum": [ + "Planning", + "Communication", + "Evaluation" + ] + }, + "uniqueItems": true + }, + "exam_types": { + "type": "array", + "items": { + "enum": [ + "Oral", + "Written" + ] + }, + "uniqueItems": true + } + } + } + }, + "required": [ + "Training" + ], + "additionalProperties": false, + "$schema": "https://json-schema.org/draft/2020-12/schema" +} diff --git a/unittests/table_json_conversion/data/multiple_choice_template.xlsx b/unittests/table_json_conversion/data/multiple_choice_template.xlsx new file mode 100644 index 0000000000000000000000000000000000000000..e523506201ee7301dfa6f814e0315c01b95b08ee Binary files /dev/null and b/unittests/table_json_conversion/data/multiple_choice_template.xlsx differ diff --git a/unittests/table_json_conversion/data/multiple_refs_data.json b/unittests/table_json_conversion/data/multiple_refs_data.json index 5b8ce9136635832111abb2206d8afe1bc7c58444..fa7c7af8e25096d15683bc924a41fb9572db3eb5 100644 --- a/unittests/table_json_conversion/data/multiple_refs_data.json +++ b/unittests/table_json_conversion/data/multiple_refs_data.json @@ -1,5 +1,5 @@ { - "Training": { + "Training": [{ "trainer": [], "participant": [ { @@ -44,5 +44,5 @@ "date": "2024-03-21T14:12:00.000Z", "url": "www.indiscale.com", "name": "Example training with multiple organizations." 
- } + }] } diff --git a/unittests/table_json_conversion/data/multiple_refs_data_wrong_foreign.xlsx b/unittests/table_json_conversion/data/multiple_refs_data_wrong_foreign.xlsx new file mode 100644 index 0000000000000000000000000000000000000000..f6e9d9f9f2024708a0b70d8ed4660cc97e04ff27 Binary files /dev/null and b/unittests/table_json_conversion/data/multiple_refs_data_wrong_foreign.xlsx differ diff --git a/unittests/table_json_conversion/data/simple_data.json b/unittests/table_json_conversion/data/simple_data.json index 9997f17e76a46d5e97d842fdee40626047e7a347..92a1661a7e975747fa346997c0a3309e740c7324 100644 --- a/unittests/table_json_conversion/data/simple_data.json +++ b/unittests/table_json_conversion/data/simple_data.json @@ -1,5 +1,5 @@ { - "Training": { + "Training": [{ "date": "2023-01-01", "url": "www.indiscale.com", "coach": [ @@ -23,10 +23,10 @@ "participants": 1, "subjects": ["Math", "Physics"], "remote": false - }, - "Person": { + }], + "Person": [{ "family_name": "Steve", "given_name": "Stevie", "Organisation": "IMF" - } + }] } diff --git a/unittests/table_json_conversion/data/simple_data_ascii_chars.json b/unittests/table_json_conversion/data/simple_data_ascii_chars.json index b1d13ebee5d6e3949fa606a130e6f5819bfc4bc8..84e22b9bcbf3b5c053d955ed398b442379a99395 100644 --- a/unittests/table_json_conversion/data/simple_data_ascii_chars.json +++ b/unittests/table_json_conversion/data/simple_data_ascii_chars.json @@ -1,5 +1,5 @@ { - "Training": { + "Training": [{ "date": "2023-01-01", "url": "char: >\u0001\u0002\u0003\u0004\u0005\u0006\u0007\u0008\u0009<", "subjects": [ @@ -9,10 +9,10 @@ ">\u0020\u0021\u0022\u0023\u0024\u0025\u0026\u0027<", ">\u0028\u0029\u002a\u002b\u002c\u002d\u002e\u002f<" ] - }, - "Person": { + }], + "Person": [{ "family_name": "Steve", "given_name": "Stevie", "Organisation": "IMF" - } + }] } diff --git a/unittests/table_json_conversion/data/simple_data_datetime.xlsx b/unittests/table_json_conversion/data/simple_data_datetime.xlsx new file mode 100644 index 0000000000000000000000000000000000000000..c6752614e700ecfd6040c087ff3254a68fd5e158 Binary files /dev/null and b/unittests/table_json_conversion/data/simple_data_datetime.xlsx differ diff --git a/unittests/table_json_conversion/data/simple_data_missing.xlsx b/unittests/table_json_conversion/data/simple_data_missing.xlsx new file mode 100644 index 0000000000000000000000000000000000000000..6e6d3a39d965de81236f2ad14bd2116ac4d7669b Binary files /dev/null and b/unittests/table_json_conversion/data/simple_data_missing.xlsx differ diff --git a/unittests/table_json_conversion/data/simple_data_schema.json b/unittests/table_json_conversion/data/simple_data_schema.json new file mode 100644 index 0000000000000000000000000000000000000000..0a4d44f733b3a8301e2d053cd570c904ef02750f --- /dev/null +++ b/unittests/table_json_conversion/data/simple_data_schema.json @@ -0,0 +1,145 @@ +{ + "type": "object", + "properties": { + "Training": { + "type": "array", + "items": { + "type": "object", + "required": [], + "additionalProperties": false, + "title": "Training", + "properties": { + "name": { + "type": "string", + "description": "The name of the Record to be created" + }, + "date": { + "description": "The date of the training.", + "anyOf": [ + { + "type": "string", + "format": "date" + }, + { + "type": "string", + "format": "date-time" + } + ] + }, + "url": { + "type": "string", + "description": "The URL" + }, + "subjects": { + "type": "array", + "items": { + "type": "string" + } + }, + "coach": { + "type": "array", + "items": { + 
"type": "object", + "required": [], + "additionalProperties": false, + "title": "coach", + "properties": { + "name": { + "type": "string", + "description": "The name of the Record to be created" + }, + "family_name": { + "type": "string" + }, + "given_name": { + "type": "string" + }, + "Organisation": { + "enum": [ + "Federal Reserve", + "IMF", + "ECB" + ] + } + } + } + }, + "supervisor": { + "type": "object", + "required": [], + "additionalProperties": false, + "title": "supervisor", + "properties": { + "name": { + "type": "string", + "description": "The name of the Record to be created" + }, + "family_name": { + "type": "string" + }, + "given_name": { + "type": "string" + }, + "Organisation": { + "enum": [ + "Federal Reserve", + "IMF", + "ECB" + ] + } + } + }, + "duration": { + "type": "number" + }, + "participants": { + "type": "integer" + }, + "remote": { + "type": "boolean" + }, + "slides": { + "type": "string", + "format": "data-url" + } + }, + "$schema": "https://json-schema.org/draft/2020-12/schema" + } + }, + "Person": { + "type": "array", + "items": { + "type": "object", + "required": [], + "additionalProperties": false, + "title": "Person", + "properties": { + "name": { + "type": "string", + "description": "The name of the Record to be created" + }, + "family_name": { + "type": "string" + }, + "given_name": { + "type": "string" + }, + "Organisation": { + "enum": [ + "Federal Reserve", + "IMF", + "ECB" + ] + } + }, + "$schema": "https://json-schema.org/draft/2020-12/schema" + } + } + }, + "required": [ + "Training", + "Person" + ], + "additionalProperties": false, + "$schema": "https://json-schema.org/draft/2020-12/schema" +} diff --git a/unittests/table_json_conversion/data/simple_data_wrong_foreign.xlsx b/unittests/table_json_conversion/data/simple_data_wrong_foreign.xlsx new file mode 100644 index 0000000000000000000000000000000000000000..dbd91d29947ec673a704a762f474111223484e56 Binary files /dev/null and b/unittests/table_json_conversion/data/simple_data_wrong_foreign.xlsx differ diff --git a/unittests/table_json_conversion/test_fill_xlsx.py b/unittests/table_json_conversion/test_fill_xlsx.py index 946336da721f7c9affd5c553ccbb38cb46217eef..899bb81ef1f91f3326f214f49f135a55b97d299f 100644 --- a/unittests/table_json_conversion/test_fill_xlsx.py +++ b/unittests/table_json_conversion/test_fill_xlsx.py @@ -1,10 +1,10 @@ -#!/usr/bin/env python3 # encoding: utf-8 # # This file is a part of the LinkAhead Project. # # Copyright (C) 2024 Indiscale GmbH <info@indiscale.com> # Copyright (C) 2024 Henrik tom Wörden <h.tomwoerden@indiscale.com> +# Copyright (C) 2024 Daniel Hornung <d.hornung@indiscale.com> # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU Affero General Public License as @@ -19,6 +19,7 @@ # You should have received a copy of the GNU Affero General Public License # along with this program. If not, see <https://www.gnu.org/licenses/>. 
+import datetime import json import os import re @@ -26,10 +27,16 @@ import tempfile import jsonschema.exceptions as schema_exc import pytest -from caosadvancedtools.table_json_conversion.fill_xlsx import ( - _get_path_rows, _get_row_type_column_index, fill_template) from openpyxl import load_workbook +from caosadvancedtools.table_json_conversion import xlsx_utils +from caosadvancedtools.table_json_conversion.fill_xlsx import fill_template +from caosadvancedtools.table_json_conversion.xlsx_utils import ( + get_row_type_column_index, + get_path_rows, + read_or_dict, +) + from .utils import compare_workbooks @@ -67,8 +74,8 @@ custom_output: str, optional def test_detect(): example = load_workbook(rfp("data/simple_template.xlsx")) - assert 0 == _get_row_type_column_index(example['Person']) - assert [1, 2] == _get_path_rows(example['Person']) + assert 0 == get_row_type_column_index(example['Person']) + assert [1, 2] == get_path_rows(example['Person']) def test_temporary(): @@ -138,6 +145,36 @@ def test_fill_xlsx(): template_file=rfp("data/simple_template.xlsx"), known_good=rfp("data/simple_data_ascii_chars.xlsx"), schema=rfp("data/simple_schema.json")) + fill_and_compare(json_file=rfp("data/multiple_choice_data.json"), + template_file=rfp("data/multiple_choice_template.xlsx"), + known_good=rfp("data/multiple_choice_data.xlsx"), + schema=rfp("data/multiple_choice_schema.json")) + + +def test_datetime(): + """Datetime values from LinkAhead are not serialized as strings.""" + json_file = rfp("data/simple_data.json") + template_file = rfp("data/simple_template.xlsx") + known_good = rfp("data/simple_data_datetime.xlsx") + # TODO Implement checker for datetime + # schema = rfp("data/simple_schema.json") + + # Set datetime explicitly + json_data = read_or_dict(json_file) + json_data["Training"][0]["date"] = datetime.datetime(2023, 1, 1) + + # Code copied mostly from `fill_and_compare(...)` + with tempfile.TemporaryDirectory() as tmpdir: + outfile = os.path.join(tmpdir, 'test.xlsx') + assert not os.path.exists(outfile) + fill_template(data=json_data, template=template_file, result=outfile, + # validation_schema=schema + ) + assert os.path.exists(outfile) + generated = load_workbook(outfile) # workbook can be read + + known_good_wb = load_workbook(known_good) + compare_workbooks(generated, known_good_wb) def test_errors(): @@ -152,3 +189,10 @@ def test_errors(): known_good=rfp("data/simple_data.xlsx"), schema=rfp("data/simple_schema.json")) assert exc.value.message == "0.5 is not of type 'integer'" + + +def test_data_schema_generation(): + model_schema = xlsx_utils.read_or_dict(rfp("data/simple_schema.json")) + array_schema = xlsx_utils.array_schema_from_model_schema(model_schema) + expected = xlsx_utils.read_or_dict(rfp("data/simple_data_schema.json")) + assert array_schema == expected diff --git a/unittests/table_json_conversion/test_read_xlsx.py b/unittests/table_json_conversion/test_read_xlsx.py new file mode 100644 index 0000000000000000000000000000000000000000..0eec2e9caa1f800ad86ab43057b8c512dc09881f --- /dev/null +++ b/unittests/table_json_conversion/test_read_xlsx.py @@ -0,0 +1,221 @@ +# encoding: utf-8 +# +# This file is a part of the LinkAhead Project. 
+# +# Copyright (C) 2024 Indiscale GmbH <info@indiscale.com> +# Copyright (C) 2024 Daniel Hornung <d.hornung@indiscale.com> +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <https://www.gnu.org/licenses/>. +"""Testing the conversion from XLSX to JSON""" + + +import datetime +import json +import os +import re + +from types import SimpleNamespace + +import pytest +from caosadvancedtools.table_json_conversion import convert + +from .utils import assert_equal_jsons + + +def rfp(*pathcomponents): + """Return full path, a shorthand convenience function. + """ + return os.path.join(os.path.dirname(__file__), *pathcomponents) + + +def convert_and_compare(xlsx_file: str, schema_file: str, known_good_file: str, + known_good_data: dict = None, strict: bool = False, + validate: bool = True) -> dict: + """Convert an XLSX file and compare to a known result. + +Exactly one of ``known_good_file`` and ``known_good_data`` should be non-empty. + +Returns +------- +json: dict + The result of the conversion. + """ + result = convert.to_dict(xlsx=xlsx_file, schema=schema_file, validate=validate) + if known_good_file: + with open(known_good_file, encoding="utf-8") as myfile: + expected = json.load(myfile) + else: + expected = known_good_data + assert_equal_jsons(result, expected, allow_none=not strict, allow_empty=not strict) + return result + + +def test_conversions(): + """Test conversion from XLSX to JSON.""" + convert_and_compare(xlsx_file=rfp("data/simple_data.xlsx"), + schema_file=rfp("data/simple_schema.json"), + known_good_file=rfp("data/simple_data.json")) + convert_and_compare(xlsx_file=rfp("data/multiple_refs_data.xlsx"), + schema_file=rfp("data/multiple_refs_schema.json"), + known_good_file=rfp("data/multiple_refs_data.json")) + convert_and_compare(xlsx_file=rfp("data/indirect_data.xlsx"), + schema_file=rfp("data/indirect_schema.json"), + known_good_file=rfp("data/indirect_data.json")) + convert_and_compare(xlsx_file=rfp("data/multiple_choice_data.xlsx"), + schema_file=rfp("data/multiple_choice_schema.json"), + known_good_file=rfp("data/multiple_choice_data.json"), + strict=True) + + with open(rfp("data/simple_data.json"), encoding="utf-8") as myfile: + expected_datetime = json.load(myfile) + expected_datetime["Training"][0]["date"] = datetime.datetime(2023, 1, 1, 0, 0) + convert_and_compare(xlsx_file=rfp("data/simple_data_datetime.xlsx"), + schema_file=rfp("data/simple_schema.json"), + known_good_file="", known_good_data=expected_datetime) + + # Data loss when saving as xlsx + with pytest.raises(AssertionError) as err: + convert_and_compare(xlsx_file=rfp("data/simple_data_ascii_chars.xlsx"), + schema_file=rfp("data/simple_schema.json"), + known_good_file=rfp("data/simple_data_ascii_chars.json")) + assert str(err.value).startswith("Values at path ['Training', 0, ") + + +def test_missing_columns(): + with pytest.raises(ValueError) as caught: + convert.to_dict(xlsx=rfp("data/simple_data_missing.xlsx"), + 
schema=rfp("data/simple_schema.json"), strict=True) + assert str(caught.value) == "Missing column: Training.coach.given_name" + with pytest.warns(UserWarning) as caught: + convert.to_dict(xlsx=rfp("data/simple_data_missing.xlsx"), + schema=rfp("data/simple_schema.json")) + assert str(caught.pop().message) == "Missing column: Training.coach.given_name" + with pytest.warns(UserWarning) as caught: + convert.to_dict(xlsx=rfp("data/multiple_choice_data_missing.xlsx"), + schema=rfp("data/multiple_choice_schema.json")) + messages = {str(w.message) for w in caught} + for expected in [ + "Missing column: Training.skills.Communication", + "Missing column: Training.exam_types.Oral", + ]: + assert expected in messages + + +def test_faulty_foreign(): + # Simple wrong foreign key + converter = convert.XLSXConverter(xlsx=rfp("data/simple_data_wrong_foreign.xlsx"), + schema=rfp("data/simple_schema.json")) + with pytest.raises(RuntimeError): + converter.to_dict() + errors = converter.get_errors() + assert errors == {('Training.coach', 6): [['date', datetime.datetime(2023, 1, 2, 0, 0)], + ['url', 'www.indiscale.com']]} + + # More extensive example + converter = convert.XLSXConverter(xlsx=rfp("data/multiple_refs_data_wrong_foreign.xlsx"), + schema=rfp("data/multiple_refs_schema.json")) + with pytest.raises(RuntimeError): + converter.to_dict() + errors = converter.get_errors() + assert errors == { + ('Training.Organisation.Person', 8): [ + ['name', 'World Training Organization 2']], + ('Training.Organisation.Person', 9): [ + ['date', '2024-03-21T14:12:00.000Z'], + ['url', 'www.getlinkahead.com']], + ('Training.participant', 6): [ + ['date', '2024-03-21T14:12:00.000Z'], + ['url', None]], + ('Training.participant', 7): [ + ['date', '2024-03-21T14:12:00.000Z'], + ['url', None]], + } + + error_str = converter.get_error_str() + assert error_str == """Sheet: Training.Organisation.Person\tRow: 9 +\t\t['name']:\tWorld Training Organization 2 +Sheet: Training.Organisation.Person\tRow: 10 +\t\t['date']:\t2024-03-21T14:12:00.000Z +\t\t['url']:\twww.getlinkahead.com +Sheet: Training.participant\tRow: 7 +\t\t['date']:\t2024-03-21T14:12:00.000Z +\t\t['url']:\tNone +Sheet: Training.participant\tRow: 8 +\t\t['date']:\t2024-03-21T14:12:00.000Z +\t\t['url']:\tNone +""" + + +def test_set_in_nested(): + """Test the ``_set_in_nested`` function.""" + set_in_nested = convert._set_in_nested # pylint: disable=protected-access + + test_data_in = [ + {"mydict": {}, "path": ["a", 1], "value": 3}, + {"mydict": {"a": 1}, "path": ["a"], "value": 3, "overwrite": True}, + {"mydict": {"a": 1}, "path": ["a", 1], "value": 3, "overwrite": True}, + {"mydict": {"b": 2}, "path": ["a", 1, 3.141], "value": 3}, + {"mydict": {}, "path": ["X", "Y", "a", 1], "value": 3, "prefix": ["X", "Y"]}, + ] + test_data_out = [ + {"a": {1: 3}}, + {"a": 3}, + {"a": {1: 3}}, + {"a": {1: {3.141: 3}}, "b": 2}, + {"a": {1: 3}}, + ] + + for data_in, data_out in zip(test_data_in, test_data_out): + assert set_in_nested(**data_in) == data_out + + # Testing exceptions + test_data_in = [ + {"mydict": {"a": 1}, "path": ["a"], "value": 3}, + {"mydict": {"a": 1}, "path": ["a", 1], "value": 3}, + {"mydict": {}, "path": ["a", 1], "value": 3, "prefix": ["X", "Y", "Z"]}, + ] + exceptions = [ + [ValueError, r"There is already some value at \[a\]"], + [ValueError, r"There is already some value at \[1\]"], + [KeyError, r"Path does not start with prefix: \['X', 'Y', 'Z'\] not in \['a', 1\]"], + ] + + for data_in, (exc_out, match) in zip(test_data_in, exceptions): + with 
pytest.raises(exc_out, match=match): + set_in_nested(**data_in) + + +def test_group_foreign_paths(): + """Test the ``_group_foreign_paths`` function.""" + group = convert._group_foreign_paths # pylint: disable=protected-access + + foreign = [ + ["A", "x", 1.1], + ["A", "y", "z", "some text"], + ["A", "B", "CC", "x", 42], + ] + common = ["A", "B", "CC"] + common_wrong = ["A", "B", "C"] + expected = [ + SimpleNamespace(stringpath="A", path=["A"], subpath=["A"], + definitions=[["x", 1.1], ["y", "z", "some text"]]), + SimpleNamespace(stringpath="A.B.CC", path=["A", "B", "CC"], subpath=["B", "CC"], + definitions=[["x", 42]]), + ] + + with pytest.raises(ValueError, match=re.escape( + "Foreign keys must cover the complete `common` depth.")): + result = group(foreign=foreign, common=common_wrong) + result = group(foreign=foreign, common=common) + assert result == expected diff --git a/unittests/table_json_conversion/test_table_template_generator.py b/unittests/table_json_conversion/test_table_template_generator.py index 8fc7b216d0eb2aa54ece6ace986cbeb227cc3e45..d9a84dcf53ec991eec709aab406a7652881e6ea8 100644 --- a/unittests/table_json_conversion/test_table_template_generator.py +++ b/unittests/table_json_conversion/test_table_template_generator.py @@ -22,11 +22,10 @@ import json import os import tempfile -from typing import Tuple import pytest -from caosadvancedtools.table_json_conversion.table_generator import ( - ColumnType, XLSXTemplateGenerator) +from caosadvancedtools.table_json_conversion.table_generator import XLSXTemplateGenerator +from caosadvancedtools.table_json_conversion.xlsx_utils import ColumnType from openpyxl import load_workbook from .utils import compare_workbooks @@ -41,7 +40,7 @@ def rfp(*pathcomponents): def _compare_generated_to_known_good(schema_file: str, known_good: str, foreign_keys: dict = None, - outfile: str = None) -> Tuple: + outfile: str = None) -> tuple: """Generate an XLSX from the schema, then compare to known good output. 
Returns @@ -173,10 +172,10 @@ def test_generate_sheets_from_schema(): def test_get_foreign_keys(): generator = XLSXTemplateGenerator() fkd = {"Training": ['a']} - assert ['a'] == generator._get_foreign_keys(fkd, ['Training']) + assert [['a']] == generator._get_foreign_keys(fkd, ['Training']) fkd = {"Training": {"__this__": ['a']}} - assert ['a'] == generator._get_foreign_keys(fkd, ['Training']) + assert [['a']] == generator._get_foreign_keys(fkd, ['Training']) fkd = {"Training": {'hallo'}} with pytest.raises(ValueError, match=r"A foreign key definition is missing for path:\n\[" @@ -184,7 +183,7 @@ def test_get_foreign_keys(): generator._get_foreign_keys(fkd, ['Training']) fkd = {"Training": {"__this__": ['a'], 'b': ['c']}} - assert ['c'] == generator._get_foreign_keys(fkd, ['Training', 'b']) + assert [['c']] == generator._get_foreign_keys(fkd, ['Training', 'b']) with pytest.raises(ValueError, match=r"A foreign key definition is missing for .*"): generator._get_foreign_keys({}, ['Training']) @@ -264,7 +263,14 @@ def test_model_with_indirect_reference(): _compare_generated_to_known_good( schema_file=rfp("data/indirect_schema.json"), known_good=rfp("data/indirect_template.xlsx"), - foreign_keys={"Wrapper": ["Training.name", "Training.url"]}, + foreign_keys={"Wrapper": {"__this__": [["Training", "name"], ["Training", "url"]]}}, + outfile=None) + + +def test_model_with_multiple_choice(): + _compare_generated_to_known_good( + schema_file=rfp("data/multiple_choice_schema.json"), + known_good=rfp("data/multiple_choice_template.xlsx"), outfile=None) diff --git a/unittests/table_json_conversion/test_test_utils.py b/unittests/table_json_conversion/test_test_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..30171f61de26b1ae11fb25c730c96b31aa8f06a3 --- /dev/null +++ b/unittests/table_json_conversion/test_test_utils.py @@ -0,0 +1,42 @@ +# encoding: utf-8 +# +# This file is a part of the LinkAhead Project. +# +# Copyright (C) 2024 Indiscale GmbH <info@indiscale.com> +# Copyright (C) 2024 Daniel Hornung <d.hornung@indiscale.com> +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <https://www.gnu.org/licenses/>. 
+"""Testing the ``utils`` module in this folder.""" + + +from .utils import _is_recursively_none + + +def test_recursively_none(): + """Testing ``_is_recursively_none``.""" + assert _is_recursively_none(None) + assert _is_recursively_none([]) + assert _is_recursively_none({}) + assert _is_recursively_none([None]) + assert _is_recursively_none({"a": None}) + assert _is_recursively_none([[], [None, None]]) + assert _is_recursively_none({1: [], 2: [None], 3: {"3.1": None}, 4: {"4.1": [None]}}) + + assert not _is_recursively_none(1) + assert not _is_recursively_none([1]) + assert not _is_recursively_none({1: 2}) + assert not _is_recursively_none([[1]]) + assert not _is_recursively_none({"a": None, "b": "b"}) + assert not _is_recursively_none([[], [None, 2]]) + assert not _is_recursively_none({1: [], 2: [None], 3: {"3.1": 3.141}, 4: {"4.1": [None]}}) diff --git a/unittests/table_json_conversion/utils.py b/unittests/table_json_conversion/utils.py index 6c32117c1296e686290ad75bf5f704a1abfb2547..b95715f72b08384f75857e48bcba328488313ad5 100644 --- a/unittests/table_json_conversion/utils.py +++ b/unittests/table_json_conversion/utils.py @@ -1,3 +1,5 @@ +# encoding: utf-8 +# # This file is a part of the LinkAhead Project. # # Copyright (C) 2024 IndiScale GmbH <info@indiscale.com> @@ -19,9 +21,53 @@ """Utilities for the tests. """ +from typing import Iterable, Union + from openpyxl import Workbook +def assert_equal_jsons(json1, json2, allow_none: bool = True, allow_empty: bool = True, + path: list = None) -> None: + """Compare two json objects for near equality. + +Raise an assertion exception if they are not equal.""" + if path is None: + path = [] + assert isinstance(json1, dict) == isinstance(json2, dict), f"Type mismatch, path: {path}" + if isinstance(json1, dict): + keys = set(json1.keys()).union(json2.keys()) + for key in keys: + this_path = path + [key] + # Case 1: exists in both collections + if key in json1 and key in json2: + el1 = json1[key] + el2 = json2[key] + assert isinstance(el1, type(el2)), f"Type mismatch, path: {this_path}" + if isinstance(el1, (dict, list)): + # Iterables: Recursion + assert_equal_jsons(el1, el2, allow_none=allow_none, allow_empty=allow_empty, + path=this_path) + continue + assert el1 == el2, f"Values at path {this_path} are not equal:\n{el1},\n{el2}" + continue + # Case 2: exists only in one collection + existing = json1.get(key, json2.get(key)) + assert ((allow_none and _is_recursively_none(existing)) + or (allow_empty and existing == [])), ( + f"Element at path {this_path} is None or empty in one json and does not exist in " + "the other.") + return + assert isinstance(json1, list) and isinstance(json2, list), f"Is not a list, path: {path}" + assert len(json1) == len(json2), f"Lists must have equal length, path: {path}" + for idx, (el1, el2) in enumerate(zip(json1, json2)): + this_path = path + [idx] + if isinstance(el1, dict): + assert_equal_jsons(el1, el2, allow_none=allow_none, allow_empty=allow_empty, + path=this_path) + else: + assert el1 == el2, f"Values at path {this_path} are not equal:\n{el1},\n{el2}" + + def compare_workbooks(wb1: Workbook, wb2: Workbook, hidden: bool = True): """Compare two workbooks for equal content. 
@@ -52,3 +98,19 @@ hidden: bool, optional f"Sheet: {sheetname}, cell: {cell1.coordinate}, Values: \n" f"{cell1.value}\n{cell2.value}" ) + + +def _is_recursively_none(obj: Union[list, dict] = None): + """Test if ``obj`` is None or recursively consists only of None-like objects.""" + if obj is None: + return True + if isinstance(obj, (list, dict)): + if isinstance(obj, list): + mylist: Iterable = obj + else: + mylist = obj.values() + for element in mylist: + if not _is_recursively_none(element): + return False + return True + return False diff --git a/unittests/test_table_importer.py b/unittests/test_table_importer.py index 599ea535d95d0b6c1216a935813d71c8e90c1d3b..6d445056b240e5ede6c52cb055cdde86cfb6d3d7 100644 --- a/unittests/test_table_importer.py +++ b/unittests/test_table_importer.py @@ -325,6 +325,60 @@ class CSVImporterTest(TableImporterTest): importer = CSVImporter(**kwargs) importer.read_file(tmp.name) + def test_gaps_in_int_column(self): + """Test for + https://gitlab.com/linkahead/linkahead-advanced-user-tools/-/issues/62: + Datatype confusion when encountering empty values in integer columns. + + """ + tmpfile = NamedTemporaryFile(delete=False, suffix=".csv") + with open(tmpfile.name, 'w') as tmp: + tmp.write( + "int,int_with_gaps,float\n" + "1,1,1.1\n" + "2,,1.2\n" + "3,3,1.3\n" + ) + + kwargs = { + "datatypes": { + "int": int, + "int_with_gaps": int, + "float": float + }, + "obligatory_columns": ["int"], + "converters": {} + } + importer = CSVImporter(**kwargs) + assert importer.datatypes["int"] == "Int64" + assert importer.datatypes["int_with_gaps"] == "Int64" + assert importer.datatypes["float"] == float + df = importer.read_file(tmpfile.name) + # Default is to convert nullable ints + assert df["int"].dtype == "Int64" + assert df["int_with_gaps"].dtype == "Int64" + assert df["float"].dtype == float + + assert pd.isna(df["int_with_gaps"][1]) + + # When not converting, empty fields raise errors ... + importer_strict = CSVImporter(convert_int_to_nullable_int=False, **kwargs) + assert importer_strict.datatypes["int"] == int + assert importer_strict.datatypes["int_with_gaps"] == int + assert importer_strict.datatypes["float"] == float + with pytest.raises(DataInconsistencyError) as die: + df = importer_strict.read_file(tmpfile.name) + assert "Integer column has NA values in column 1" in str(die.value) + + # ... except when a nullable datatype is set manually beforehand + kwargs["datatypes"]["int_with_gaps"] = "Int64" + importer_strict = CSVImporter(convert_int_to_nullable_int=False, **kwargs) + df = importer_strict.read_file(tmpfile.name) + # Now only the one that has been specifically set to Int64 is nullable. + assert df["int"].dtype == int + assert df["int_with_gaps"].dtype == "Int64" + assert df["float"].dtype == float + class TSVImporterTest(TableImporterTest): def test_full(self): diff --git a/unittests/test_yaml_model_parser.py b/unittests/test_yaml_model_parser.py index 97c3450f654e7b836734335cafac37adc6e700bb..d6dbf718dfa539e97214c2329cccc5a6bbf172b6 100644 --- a/unittests/test_yaml_model_parser.py +++ b/unittests/test_yaml_model_parser.py @@ -512,6 +512,8 @@ TestExperiment: assert test_rec.get_property("temperature").value == 23 assert test_rec.get_property("additional_prop").value == 7 + assert test_rec.name == "TestExperiment" + def test_file_role(): """Not implemented for now, see