diff --git a/CHANGELOG.md b/CHANGELOG.md index 0dead01baa2ed6fe45ddd74931b2863a3bc5cf38..6d90e03e5f79070ccbb1da12b5fed42c0b07a756 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Added ### +- XLSX handling: conversion from XLSX to Json + ### Changed ### ### Deprecated ### diff --git a/Makefile b/Makefile index 724d9e5a4f06cb3a2880df9016adfe527314c338..26f5c8182545b2a57b3921f3745d16ff6305a0cc 100644 --- a/Makefile +++ b/Makefile @@ -41,5 +41,5 @@ style: .PHONY: style lint: - pylint --unsafe-load-any-extension=y --fail-under=9.59 -d R,C --ignore=swagger_client src/caosadvancedtools + pylint --unsafe-load-any-extension=y --fail-under=9.72 -d R,C --ignore=swagger_client src/caosadvancedtools .PHONY: lint diff --git a/README_SETUP.md b/README_SETUP.md index bf4f25d92106c19cccc276389b6c97aa22904923..3a7f0197a4b06694c7ae787d0baa6e8a89de0e5e 100644 --- a/README_SETUP.md +++ b/README_SETUP.md @@ -64,6 +64,7 @@ Build documentation in `build/` with `make doc`. - `sphinx` - `sphinx-autoapi` +- `sphinx-rtd-theme` - `recommonmark >= 0.6.0` ### How to contribute ### diff --git a/src/caosadvancedtools/table_json_conversion/convert.py b/src/caosadvancedtools/table_json_conversion/convert.py new file mode 100644 index 0000000000000000000000000000000000000000..586def7aae81b1f65b6132a1026a9de95f919987 --- /dev/null +++ b/src/caosadvancedtools/table_json_conversion/convert.py @@ -0,0 +1,388 @@ +# encoding: utf-8 +# +# This file is a part of the LinkAhead Project. +# +# Copyright (C) 2024 Indiscale GmbH <info@indiscale.com> +# Copyright (C) 2024 Daniel Hornung <d.hornung@indiscale.com> +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. 
+# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <https://www.gnu.org/licenses/>. + +"""Convert XLSX files to JSON dictionaries.""" + +from __future__ import annotations + +from functools import reduce +from operator import getitem +from types import SimpleNamespace +from typing import Any, BinaryIO, Callable, TextIO, Union + +from jsonschema import validate, ValidationError +from openpyxl import load_workbook +from openpyxl.worksheet.worksheet import Worksheet + +from caosadvancedtools.table_json_conversion import xlsx_utils +from caosadvancedtools.table_json_conversion.fill_xlsx import read_or_dict + + +def _strict_bool(value: Any) -> bool: + """Convert value to bool, but only if it really is a valid XLSX bool.""" + if isinstance(value, bool): + return value + raise TypeError(f"Not a good boolean: {repr(value)}") + + +class XLSXConverter: + """Class for conversion from XLSX to JSON. + +For a detailed description of the required formatting of the XLSX files, see ``specs.md`` in the +documentation. + """ + + PARSER: dict[str, Callable] = { + "string": str, + "number": float, + "integer": int, + "boolean": _strict_bool, + } + + def __init__(self, xlsx: Union[str, BinaryIO], schema: Union[dict, str, TextIO]): + """ +Parameters +---------- +xlsx: Union[str, BinaryIO] + Path to the XLSX file or opened file object. + +schema: Union[dict, str, TextIO] + Schema for validation of XLSX content. 
+""" + self._workbook = load_workbook(xlsx) + self._schema = read_or_dict(schema) + self._defining_path_index = xlsx_utils.get_defining_paths(self._workbook) + self._handled_sheets: set[str] = set() + self._result: dict = {} + + def to_dict(self) -> dict: + """Convert the xlsx contents to a dict. + +Returns +------- +out: dict + A dict representing the JSON with the extracted data. + """ + self._handled_sheets = set() + self._result = {} + for sheetname in self._workbook.sheetnames: + if sheetname not in self._handled_sheets: + self._handle_sheet(self._workbook[sheetname]) + return self._result + + def _handle_sheet(self, sheet: Worksheet) -> None: + """Add the contents of the sheet to the result (stored in ``self._result``). + +Each row in the sheet corresponds to one entry in an array in the result. Which array exactly is +defined by the sheet's "proper name" and the content of the foreign columns. + +Look at ``xlsx_utils.get_path_position`` for the specification of the "proper name". + +""" + row_type_column = xlsx_utils.get_row_type_column_index(sheet) + foreign_columns = xlsx_utils.get_foreign_key_columns(sheet) + foreign_column_paths = {col.index: col.path for col in foreign_columns.values()} + data_columns = xlsx_utils.get_data_columns(sheet) + data_column_paths = {col.index: col.path for col in data_columns.values()} + # Parent path, insert in correct order. + parent, proper_name = xlsx_utils.get_path_position(sheet) + if parent: + parent_sheetname = xlsx_utils.get_worksheet_for_path(parent, self._defining_path_index) + if parent_sheetname not in self._handled_sheets: + self._handle_sheet(self._workbook[parent_sheetname]) + + # # We save single entries in lists, indexed by their foreign key contents. Each entry + # # consists of: + # # - foreign: Dict with path -> value for the foreign columns + # # - data: The actual data of this entry, a dict. 
+ # entries: dict[str, list[SimpleNamespace]] = {} + + for row in sheet.iter_rows(values_only=True): + # Skip non-data rows. + if row[row_type_column] is not None: + continue + foreign_repr = "" + foreign = [] # A list of lists, each of which is: [path1, path2, ..., leaf, value] + data: dict = {} # Local data dict + # Collect data (in dict relative to current level) and foreign data information + for col_idx, value in enumerate(row): + if col_idx in foreign_column_paths: + foreign_repr += str(value) + foreign.append(foreign_column_paths[col_idx] + [value]) + continue + + if col_idx in data_column_paths: + path = data_column_paths[col_idx] + if self._is_multiple_choice(path): + real_value = path.pop() # Last component is the enum value, insert above + # set up list + try: + _set_in_nested(mydict=data, path=path, value=[], prefix=parent, skip=1) + except ValueError as err: + if not str(err).startswith("There is already some value at"): + raise + if not xlsx_utils.parse_multiple_choice(value): + continue + _set_in_nested(mydict=data, path=path, value=real_value, prefix=parent, + skip=1, append_to_list=True) + else: + value = self._validate_and_convert(value, path) + _set_in_nested(mydict=data, path=path, value=value, prefix=parent, skip=1) + continue + + # Find current position in tree + parent_dict = self._get_parent_dict(parent_path=parent, foreign=foreign) + + # Append data to current position's list + if proper_name not in parent_dict: + parent_dict[proper_name] = [] + parent_dict[proper_name].append(data) + self._handled_sheets.add(sheet.title) + + def _is_multiple_choice(self, path: list[str]) -> bool: + """Test if the path belongs to a multiple choice section.""" + if not path: + return False + subschema = self._get_subschema(path[:-1]) + if (subschema["type"] == "array" + and subschema.get("uniqueItems") is True + and "enum" in subschema["items"]): + return True + return False + + def _get_parent_dict(self, parent_path: list[str], foreign: list[list]) -> dict: 
+ """Return the dict into which values can be inserted. + +This method returns, from the current result-in-making, the entry at ``parent_path`` which matches +the values given in the ``foreign`` specification. +""" + foreign_groups = _group_foreign_paths(foreign, common=parent_path) + + current_object = self._result + for group in foreign_groups: + # Find list for which foreign definitions are relevant. + current_object = reduce(getitem, group.subpath, current_object) + assert isinstance(current_object, list) + # Test all candidates. + for cand in current_object: + if all(reduce(getitem, definition[:-1], cand) == definition[-1] + for definition in group.definitions): + current_object = cand + break + else: + raise KeyError("Cannot find an element which matches the foreign definitions") + assert isinstance(current_object, dict) + return current_object + + def _validate_and_convert(self, value: Any, path: list[str]): + """Apply some basic validation and conversion steps. + +This includes: +- Validation against the type given in the schema +- List typed values are split at semicolons and validated individually + """ + if value is None: + return value + subschema = self._get_subschema(path) + # Array handling only if schema says it's an array. 
+ if subschema.get("type") == "array": + array_type = subschema["items"]["type"] + if isinstance(value, str) and ";" in value: + values = [self.PARSER[array_type](v) for v in value.split(";")] + return values + try: + validate(value, subschema) + except ValidationError as verr: + print(verr) + print(path) + raise + + # Finally: convert to target type + return self.PARSER[subschema.get("type", "string")](value) + + def _get_subschema(self, path: list[str], schema: Union[dict, list] = None) -> dict: + """Return the sub schema at ``path``.""" + if schema is None: + schema = self._schema + assert schema is not None + assert isinstance(schema, dict) + if path: + if schema["type"] == "object": + next_schema = schema["properties"][path[0]] + return self._get_subschema(path=path[1:], schema=next_schema) + if schema["type"] == "array": + items = schema["items"] + if "enum" in items: + return schema + next_schema = items["properties"][path[0]] + return self._get_subschema(path=path[1:], schema=next_schema) + return schema + + +def _group_foreign_paths(foreign: list[list], common: list[str]) -> list[SimpleNamespace]: + """Group the foreign keys by their base paths. + +Parameters +---------- +foreign: list[list] + A list of foreign definitions, consisting of path components, property and possibly value. + +common: list[list[str]] + A common path which defines the final target of the foreign definitions. This helps to understand + where the ``foreign`` paths shall be split. + +Returns +------- +out: list[dict[str, list[list]]] + + A list of foreign path segments, grouped by their common segments. Each element is a namespace + with detailed information of all those elements which form the group. The namespace has the + following attributes: + + - ``path``: The full path to this path segment. This is always the previous segment's ``path`` + plus this segment's ``subpath``. + - ``stringpath``: The stringified ``path``, might be useful for comparison or sorting. 
+ - ``subpath``: The path, relative from the previous segment. + - ``definitions``: A list of the foreign definitions for this segment, but stripped of the + ``path`` components. + """ + # Build a simple dict first, without subpath. + results = {} + for f_path in foreign: + path = [] + for component in f_path: + path.append(component) + if path != common[:len(path)]: + break + path.pop() + definition = f_path[len(path):] + stringpath = xlsx_utils.p2s(path) + if stringpath not in results: + results[stringpath] = SimpleNamespace(stringpath=stringpath, path=path, + definitions=[definition]) + else: + results[stringpath].definitions.append(definition) + + # Then sort by stringpath and calculate subpath. + stringpaths = sorted(results.keys()) + + resultlist = [] + last_level = 0 + for stringpath in stringpaths: + elem = results[stringpath] + elem.subpath = elem.path[last_level:] + last_level = len(elem.path) + resultlist.append(elem) + + # from IPython import embed + # embed() + + if last_level != len(common): + raise ValueError("Foreign keys must cover the complete `common` depth.") + return resultlist + + +# pylint: disable-next=dangerous-default-value,too-many-arguments +def _set_in_nested(mydict: dict, path: list, value: Any, prefix: list = [], skip: int = 0, + overwrite: bool = False, append_to_list: bool = False) -> dict: + """Set a value in a nested dict. + +Parameters +---------- +mydict: dict + The dict into which the ``value`` shall be inserted. +path: list + A list of keys, denoting the location of the value. +value + The value which shall be set inside the dict. +prefix: list + A list of keys which shall be removed from ``path``. A KeyError is raised if ``path`` does not + start with the elements of ``prefix``. +skip: int = 0 + Remove this many additional levels from the path, *after* removing the prefix. +overwrite: bool = False + If True, allow overwriting existing content. Otherwise, attempting to overwrite existing values + leads to an exception. 
+append_to_list: bool = False
+    If True, assume that the element at ``path`` is a list and append the value to it. If the list
+    does not exist, create it. If there is a non-list at ``path`` already, overwrite it with a new
+    list, if ``overwrite`` is True, otherwise raise a ValueError.
+
+Returns
+-------
+mydict: dict
+    The same dictionary that was given as a parameter, but modified.
+    """
+    for idx, el in enumerate(prefix):
+        if path[idx] != el:
+            raise KeyError(f"Path does not start with prefix: {prefix} not in {path}")
+    path = path[len(prefix):]
+    if skip:
+        assert len(path) > skip, f"Path must be long enough to remove skip={skip} elements."
+        path = path[skip:]
+
+    tmp_dict = mydict
+    while len(path) > 1:
+        key = path.pop(0)
+        if key not in tmp_dict:
+            tmp_dict[key] = {}
+        if not isinstance(tmp_dict[key], dict):
+            if overwrite:
+                tmp_dict[key] = {}
+            else:
+                raise ValueError(f"There is already some value at {path}")
+        tmp_dict = tmp_dict[key]
+    key = path.pop()
+    if append_to_list:
+        if key not in tmp_dict:
+            tmp_dict[key] = []
+        if not isinstance(tmp_dict[key], list):
+            if overwrite:
+                tmp_dict[key] = []
+            else:
+                raise ValueError(f"There is already some non-list value at [{key}]")
+        tmp_dict[key].append(value)
+    else:
+        if key in tmp_dict and not overwrite:
+            raise ValueError(f"There is already some value at [{key}]")
+        if key not in tmp_dict:
+            tmp_dict[key] = {}
+        tmp_dict[key] = value
+    return mydict
+
+
+def to_dict(xlsx: Union[str, BinaryIO], schema: Union[dict, str, TextIO]) -> dict:
+    """Convert the xlsx contents to a dict, it must follow a schema.
+
+Parameters
+----------
+xlsx: Union[str, BinaryIO]
+    Path to the XLSX file or opened file object.
+
+schema: Union[dict, str, TextIO]
+    Schema for validation of XLSX content.
+
+Returns
+-------
+out: dict
+    A dict representing the JSON with the extracted data.
+ """ + converter = XLSXConverter(xlsx, schema) + return converter.to_dict() diff --git a/src/caosadvancedtools/table_json_conversion/fill_xlsx.py b/src/caosadvancedtools/table_json_conversion/fill_xlsx.py index 8019fb4440b361a5ac8623322df0e388375c4ece..66495584f611d76a2c2e7a661e046dee68681d2d 100644 --- a/src/caosadvancedtools/table_json_conversion/fill_xlsx.py +++ b/src/caosadvancedtools/table_json_conversion/fill_xlsx.py @@ -1,4 +1,3 @@ -#!/usr/bin/env python3 # encoding: utf-8 # # This file is a part of the LinkAhead Project. @@ -19,12 +18,13 @@ # # You should have received a copy of the GNU Affero General Public License # along with this program. If not, see <https://www.gnu.org/licenses/>. +"""Class and function to fill an XLSX template from actual data.""" from __future__ import annotations import pathlib from types import SimpleNamespace -from typing import Any, Dict, List, Optional, TextIO, Union +from typing import Any, Optional, TextIO, Union from warnings import warn from jsonschema import FormatChecker, validate @@ -74,7 +74,7 @@ class TemplateFiller: """ - def __init__(self, current_path: List[str] = None, props: Dict[str, Any] = None): + def __init__(self, current_path: list[str] = None, props: dict[str, Any] = None): self._current_path = current_path if current_path is not None else [] self._props = props if props is not None else {} # this is flat @@ -90,7 +90,7 @@ class TemplateFiller: result._current_path.append(next_level) # pylint: disable=protected-access return result - def __getitem__(self, path: Union[List[str], str], owner=None) -> Any: + def __getitem__(self, path: Union[list[str], str], owner=None) -> Any: if isinstance(path, list): path = p2s(path) return self._props[path] @@ -99,7 +99,7 @@ class TemplateFiller: fullpath = p2s(self._current_path + [propname]) self._props[fullpath] = value - def fill_from_data(self, data: Dict[str, Any]): + def fill_from_data(self, data: dict[str, Any]): # TODO recursive for dicts and list? 
"""Fill current level with all scalar elements of ``data``.""" for name, value in data.items(): @@ -152,10 +152,10 @@ class TemplateFiller: sheetname=sheetname, sheet=sheet, col_index=col_idx, col_type=col[coltype_idx].value) - def _handle_data(self, data: dict, current_path: List[str] = None, + def _handle_data(self, data: dict, current_path: list[str] = None, context: TemplateFiller.Context = None, only_collect_insertables: bool = False, - ) -> Optional[Dict[str, Any]]: + ) -> Optional[dict[str, Any]]: """Handle the data and write it into ``workbook``. Parameters @@ -190,7 +190,7 @@ out: union[dict, None] context = TemplateFiller.Context() context.fill_from_data(data) - insertables: Dict[str, Any] = {} + insertables: dict[str, Any] = {} for name, content in data.items(): # TODO is this the best way to do it???? if name == "file": diff --git a/src/caosadvancedtools/table_json_conversion/table_generator.py b/src/caosadvancedtools/table_json_conversion/table_generator.py index 851173e2d51acec0da1e7a5f1f776bcef8db0f97..b8c50e7d8d40775f86c1a01d0934effe570cf20d 100644 --- a/src/caosadvancedtools/table_json_conversion/table_generator.py +++ b/src/caosadvancedtools/table_json_conversion/table_generator.py @@ -24,10 +24,12 @@ This module allows to generate template tables from JSON schemas. """ +from __future__ import annotations + import pathlib import re from abc import ABC, abstractmethod -from typing import Dict, List, Optional, Tuple +from typing import Optional from openpyxl import Workbook from openpyxl.styles import PatternFill @@ -69,11 +71,16 @@ class TableTemplateGenerator(ABC): Example: ``{"Training": {"__this__": ["date"], "Person": ["name", "email"]}}`` Here, ``date`` is the sole foreign key for Training. + + | It probably is worth extending the first example, with a case where a "Training" shall + be distiguished by the "name" and "email" of a "Person" which it references. 
The + foreign keys for this example are specified like this: + | ``{"Training": {"__this__": [["Person", "name"], ["Person", "email"]]}}`` """ def _generate_sheets_from_schema(self, schema: dict, foreign_keys: Optional[dict] = None - ) -> Dict[str, Dict[str, - Tuple[ColumnType, Optional[str], list]]]: + ) -> dict[str, dict[str, + tuple[ColumnType, Optional[str], list]]]: """Generate a sheet definition from a given JSON schema. Parameters @@ -112,30 +119,49 @@ class TableTemplateGenerator(ABC): foreign_keys = {} # here, we treat the top level # sheets[sheetname][colname]= (COL_TYPE, description, [path]) - sheets: Dict[str, Dict[str, Tuple[ColumnType, Optional[str], list]]] = {} + sheets: dict[str, dict[str, tuple[ColumnType, Optional[str], list]]] = {} for rt_name, rt_def in schema["properties"].items(): sheets[rt_name] = self._treat_schema_element(schema=rt_def, sheets=sheets, path=[rt_name], foreign_keys=foreign_keys) return sheets - def _get_foreign_keys(self, keys: dict, path: list) -> list: - """Return the foreign keys that are needed at the location to which path points.""" - msg = f"A foreign key definition is missing for path:\n{path}\nKeys are:\n{keys}" + def _get_foreign_keys(self, keys: dict, path: list) -> list[list[str]]: + """Return the foreign keys that are needed at the location to which path points. + +Returns +------- +foreign_keys: list[list[str]] + Contains lists of strings, each element is the path to one foreign key. 
+""" + msg_missing = f"A foreign key definition is missing for path:\n{path}\nKeys are:\n{keys}" + orig_path = path.copy() while path: if keys is None or path[0] not in keys: - raise ValueError(msg) + raise ValueError(msg_missing) keys = keys[path[0]] path = path[1:] if isinstance(keys, dict) and "__this__" in keys: - return keys["__this__"] - if isinstance(keys, list): - return keys - raise ValueError(msg) - - def _treat_schema_element(self, schema: dict, sheets: dict, path: List[str], + keys = keys["__this__"] + if isinstance(keys, str): + raise ValueError("Foreign keys must be a list of strings, but a single " + "string was given:\n" + f"{orig_path} -> {keys}") + if not isinstance(keys, list): + raise ValueError(msg_missing) + + # Keys must be either all lists or all strings + types = {type(key) for key in keys} + if len(types) > 1: + raise ValueError("The keys of this path must bei either all lists or all strings:" + f" {orig_path}") + if types.pop() is str: + keys = [[key] for key in keys] + return keys + + def _treat_schema_element(self, schema: dict, sheets: dict, path: list[str], foreign_keys: Optional[dict] = None, level_in_sheet_name: int = 1, array_paths: Optional[list] = None - ) -> Dict[str, Tuple[ColumnType, Optional[str], list]]: + ) -> dict[str, tuple[ColumnType, Optional[str], list]]: """Recursively transform elements from the schema into column definitions. ``sheets`` is modified in place. 
@@ -191,19 +217,16 @@ class TableTemplateGenerator(ABC): # and add the foreign keys that are necessary up to this point for array_path in array_paths: foreigns = self._get_foreign_keys(foreign_keys, array_path) - if isinstance(foreigns, str): - raise ValueError("Foreign keys must be a list of strings, but a single " - "string was given:\n" - f"{array_path} -> {foreigns}") for foreign in foreigns: - internal_key = p2s(array_path + [foreign]) + internal_key = p2s(array_path + foreign) if internal_key in sheets[sheetname]: - raise ValueError("The schema would lead to two columns with the same " - "name, which is forbidden:\n" + raise ValueError("The schema would lead to two columns with the " + "same name, which is forbidden:\n" f"{foreign} -> {internal_key}") ref_sheet = p2s(array_path) sheets[sheetname][internal_key] = ( - ColumnType.FOREIGN, f"see sheet '{ref_sheet}'", array_path + [foreign]) + ColumnType.FOREIGN, f"see sheet '{ref_sheet}'", + array_path + foreign) # Columns are added to the new sheet, thus we do not return any columns for the # current sheet. return {} @@ -299,7 +322,7 @@ class XLSXTemplateGenerator(TableTemplateGenerator): definition dict You need to pass the dict of a single sheet to this function. 
""" - return max([len(path) for _, _, path in sheetdef.values()]) + return max(len(path) for _, _, path in sheetdef.values()) @staticmethod def _get_ordered_cols(sheetdef: dict) -> list: @@ -319,7 +342,7 @@ class XLSXTemplateGenerator(TableTemplateGenerator): return ordered_cols def _create_workbook_from_sheets_def( - self, sheets: Dict[str, Dict[str, Tuple[ColumnType, Optional[str], list]]]): + self, sheets: dict[str, dict[str, tuple[ColumnType, Optional[str], list]]]): """Create and return a nice workbook for the given sheets.""" wb = Workbook() yellowfill = PatternFill(fill_type="solid", fgColor='00FFFFAA') diff --git a/src/caosadvancedtools/table_json_conversion/xlsx_utils.py b/src/caosadvancedtools/table_json_conversion/xlsx_utils.py index 374cdefb70737907839ba0a3339fefad28949340..32ed8552ce26f0c2245f9739721ba82ed95c12bf 100644 --- a/src/caosadvancedtools/table_json_conversion/xlsx_utils.py +++ b/src/caosadvancedtools/table_json_conversion/xlsx_utils.py @@ -39,12 +39,16 @@ from collections import OrderedDict from copy import deepcopy from enum import Enum from types import SimpleNamespace -from typing import Dict, List, TextIO, Union +from typing import Any, TextIO, Union from openpyxl import Workbook from openpyxl.worksheet.worksheet import Worksheet +TRUTHY = {"true", "wahr", "x", "√", "yes", "ja", "y", "j"} # For multiple choice columns +FALSY = {"false", "falsch", "-", "no", "nein", "n"} # For multiple choice columns + + class ColumnType(Enum): """ column types enum """ SCALAR = 1 @@ -133,7 +137,7 @@ out: dict[str, list[list[str]] return result -def get_data_columns(sheet: Worksheet) -> Dict[str, SimpleNamespace]: +def get_data_columns(sheet: Worksheet) -> dict[str, SimpleNamespace]: """Return the data paths of the worksheet. 
Returns @@ -164,7 +168,7 @@ out: dict[str, SimpleNamespace] return result -def get_foreign_key_columns(sheet: Worksheet) -> Dict[str, SimpleNamespace]: +def get_foreign_key_columns(sheet: Worksheet) -> dict[str, SimpleNamespace]: """Return the foreign keys of the worksheet. Returns @@ -214,7 +218,7 @@ proper_name: str # longest common path in data colums data_paths = [el.path for el in get_data_columns(sheet).values()] - for ii in range(min([len(path) for path in data_paths])): + for ii in range(min(len(path) for path in data_paths)): components_at_index = {path[ii] for path in data_paths} if len(components_at_index) > 1: break @@ -282,12 +286,54 @@ def next_row_index(sheet: Worksheet) -> int: return sheet.max_row -def p2s(path: List[str]) -> str: +def p2s(path: list[str]) -> str: """Path to string: dot-separated. """ return ".".join(path) +def parse_multiple_choice(value: Any) -> bool: + """Interpret ``value`` as a multiple choice input. + +*Truthy* values are: +- The boolean ``True``. +- The number "1". +- The (case-insensitive) strings ``true``, ``wahr``, ``x``, ``√``, ``yes``, ``ja``, ``y``, ``j``. + +*Falsy* values are: +- The boolean ``False``. +- ``None``, empty strings, lists, dicts. +- The number "0". +- The (case-insensitive) strings ``false``, ``falsch``, ``-``, ``no``, ``nein``, ``n``. +- Everything else. + +Returns +------- +out: bool + The interpretation result of ``value``. + """ + # Non-string cases first: + # pylint: disable-next=too-many-boolean-expressions + if (value is None or value is False or value == 0 + or value == [] or value == {} or value == ""): + return False + if (value is True or value == 1): + return True + + # String cases follow: + if not isinstance(value, str): + return False + value = value.lower() + + if value in TRUTHY: + return True + + # Strictly speaking, this test is not necessary, but I think it's good practice. 
+ if value in FALSY: + return False + return False + + def read_or_dict(data: Union[dict, str, TextIO]) -> dict: """If data is a json file name or input stream, read data from there. If it is a dict already, just return it.""" diff --git a/src/doc/table-json-conversion/specs.md b/src/doc/table-json-conversion/specs.md deleted file mode 100644 index 5a5197473d82886fcb3ee54f8ac9c5865c456710..0000000000000000000000000000000000000000 --- a/src/doc/table-json-conversion/specs.md +++ /dev/null @@ -1,341 +0,0 @@ -# Conversion between LinkAhead data models, JSON schema, and XLSX (and vice versa) # - -This file describes the conversion between JSON schema files and XLSX templates, and between JSON -data files following a given schema and XLSX files with data. This conversion is handled by the -Python modules in the `table_json_conversion` library. - -Requirements: When converting from a json schema, the top level of the json schema must be a -dict. The keys of the dict are RecordType names. - -## Data models in JSON Schema and JSON data ## - -The data model in LinkAhead defines the types of records present in a LinkAhead instance and their -structure. This data model can also be represented in a JSON Schema, which defines the structure of -JSON files containing records pertaining to the data model. - -For example, the following JSON can describe a singe "Person" Record: - -```JSON -{ - "Person": [ - { - "family_name": "Steve", - "given_name": "Stevie" - } - ] -} -``` - -A *JSON Schema* specifies a concrete structure, and the associated JSON files can be used to -represent data for specific record structures. For instance, one could create a JSON Schema allowing -the storage of "Training" Records containing information about conducted trainings. This is -particularly valuable for data import and export. One could generate web forms from the JSON Schema -or use it to export objects stored in LinkAhead as JSON. 
- -### Note: Data models and data arrays ### - -The schema as created by ``json_schema_exporter.recordtype_to_json_schema(...)`` is, from a broad -view, a dict with all the top level recordtypes (the recordtype names are the keys). While this is -appropriate for the generation of user input forms, data often consists of multiple entries of the -same type. XLSX files are no exception, users expect that they may enter multiple rows of data. - -Since the data model schema does not match multiple data sets, there is a utility function which -create a *data array* schema out of the *data model* schema: It basically replaces the top-level -entries of the data model by lists which may contain data. - -A **short example** illustrates this well. Consider a *data model* schema which fits to this data -content: - -```JSON -{ - "Person": { - "name": "Charly" - } -} -``` - -Now the automatically generated *data array* schema would accept the following data: - -```JSON -{ - "Person": [ - { - "name": "Charly" - }, - { - "name": "Sam" - } - ] -} -``` - -## From JSON to XLSX: Data Representation ## - -The following describes how JSON files representing LinkAhead records are converted into XLSX files, -or how JSON files with records are created from XLSX files. - -The attribute name (e.g., "Person" above) determines the RecordType, and the value of this attribute -can either be an object or a list. If it is an object (as in the example above), a single record is -represented. In the case of a list, multiple records sharing the same RecordType as the parent are -represented. - -The *Properties* of the record (e.g., `family_name` and `given_name` above) become *columns* in the -XLSX file. These properties have an attribute name and a value. The value can be: - -a. A primitive (text, number, boolean, ...) -b. A record -c. A list of primitive types -d. A list of unique enums (multiple choice) -e. 
A list of records - -In cases *a.* and *c.*, a cell is created in the column corresponding to the property in the XLSX -file. In case *b.*, columns are created for the Properties of the record, where for each of the -Properties the cases *a.* - *e.* are considered recursively. Case *d.* leads to a number of -columns, one for each of the possible choices. - -For case *e.* however, the two-dimensional structure of an XLSX sheet is not sufficient. Therefore, -for such cases, *new* XLSX sheets/tables are created. - -In these sheets/tables, the referenced records are treated as described above (new columns for the -Properties). However, there are now additional columns that indicate from which "external" record -these records are referenced. - -Let's now consider these four cases in detail and with examples: - -### a. Properties with primitive data types ### - -```JSON -{ - "Training": [ - { - "date": "2023-01-01", - "url": "www.indiscale.com", - "duration": 1.0, - "participants": 1, - "remote": false - }, - { - "date": "2023-06-15", - "url": "www.indiscale.com/next", - "duration": 2.5, - "participants": None, - "remote": true - } - ] -} -``` - -This entry will be represented in an XLSX sheet with the following content: - -| date | url | duration | participants | remote | -|------------|------------------------|----------|--------------|--------| -| 2023-01-01 | www.indiscale.com | 1.0 | 1 | false | -| 2023-06-15 | www.indiscale.com/next | 2.5 | | true | - -### b. Property referencing a record ### - -```JSON -{ - "Training": [ - { - "date": "2023-01-01", - "supervisor": { - "family_name": "Stevenson", - "given_name": "Stevie", - } - } - ] -} -``` - -This entry will be represented in an XLSX sheet with the following content: - -| date | `supervisor.family_name` | `supervisor.given_name` | -|------------|--------------------------|-------------------------| -| 2023-01-01 | Stevenson | Stevie | - -Note that column names may be renamed. 
The mapping of columns to properties of records is ensured -through the content of hidden rows. (See below for the definition of hidden rows.) - -### c. Properties containing lists of primitive data types ### - -```JSON -{ - "Training": [ - { - "url": "www.indiscale.com", - "subjects": ["Math", "Physics"], - } - ] -} -``` - -This entry would be represented in an XLSX sheet with the following content: - -| url | subjects | -|-------------------|--------------| -| www.indiscale.com | Math;Physics | - -The list elements are written into the cell separated by `;` (semicolon). If the elements contain -the separator `;`, it is escaped with `\\`. - -### d. Multiple choice properties ### - -```JSON -{ - "Training": [ - { - "date": "2024-04-17", - "skills": [ - "Planning", - "Evaluation" - ] - } - ] -} -``` - -If the `skills` list is denoted as an `enum` array with `"uniqueItems": true` in the json schema, -this entry would be represented like this in an XLSX: - -| date | skills.Planning | skills.Communication | skills.Evaluation | -|------------|-----------------|----------------------|-------------------| -| 2024-04-17 | x | | x | - -Note that this example assumes that the list of possible choices, as given in the json schema, was -"Planning, Communication, Evaluation". - -### e. Properties containing lists with references ### - -```JSON -{ - "Training": [ - { - "date": "2023-01-01", - "coach": [ - { - "family_name": "Sky", - "given_name": "Max", - }, - { - "family_name": "Sky", - "given_name": "Min", - } - ] - } - ] -} -``` - -Since the two coaches cannot be represented properly in a single cell, another worksheet is needed -to contain the properties of the coaches. - -The sheet for the Trainings in this example only contains the "date" column - -| date | -|------------| -| 2023-01-01 | - -Additionally, there is *another* sheet where the coaches are stored. Here, it is crucial to define -how the correct element is chosen from potentially multiple "Trainings". 
In this case, it means that -the "date" must be unique. - -Note: This uniqueness requirement is not strictly checked right now, it is your responsibility as a -user that such "foreign properties" are truly unique. - -The second sheet looks like this: - -| date | `coach.family_name` | `coach.given_name` | -|------------|---------------------|--------------------| -| 2023-01-01 | Sky | Max | -| 2023-01-01 | Sky | Min | - -## Data in XLSX: Hidden automation logic ## - -### First column: Marker for row types ### - -The first column in each sheet will be hidden and it will contain an entry in each row that needs -special treatment. The following values are used: - -- ``IGNORE``: This row is ignored. It can be used for explanatory texts or layout. -- ``COL_TYPE``: Typically the first row that is not `IGNORE`. It indicates the row that defines the - type of columns (`FOREIGN`, `SCALAR`, `LIST`, `MULTIPLE_CHOICE`, `IGNORE`). This row must occur - exactly once per sheet. -- ``PATH``: Indicates that the row is used to define the path within the JSON. These rows are - typically hidden for users. - -An example table could look like this: - -| `IGNORE` | | Welcome | to this | file! | | -| `IGNORE` | | Please | enter your | data here: | | -| `COL_TYPE` | `IGNORE` | `SCALAR` | `SCALAR` | `LIST` | `SCALAR` | -| `PATH` | | `Training` | `Training` | `Training` | `Training` | -| `PATH` | | `url` | `date` | `subjects` | `supervisor` | -| `PATH` | | | | | `email` | -| `IGNORE` | Please enter one training per line. | Training URL | Training date | Subjects | Supervisor's email | -|------------|-------------------------------------|----------------|---------------|--------------|--------------------| -| | | example.com/mp | 2024-02-27 | Math;Physics | steve@example.com | -| | | example.com/m | 2024-02-27 | Math | stella@example.com | - -### Parsing XLSX data ### - -To extract the value of a given cell, we traverse all path elements (in ``PATH`` rows) from top to -bottom. 
The final element of the path is the name of the Property to which the value belongs. In -the example above, `steve@example.com` is the value of the `email` Property in the path -`["Training", "supervisor", "email"]`. - -The path elements are sufficient to identify the object within a JSON, at least if the corresponding -JSON element is a single object. If the JSON element is an array, the appropriate object within the -array needs to be selected. - -For this selection additional ``FOREIGN`` columns are used. The paths in these columns must all have -the same *base* and one additional *unique key* component. For example, two `FOREIGN` columns could -be `["Training", "date"]` and `["Training", "url"]`, where `["Training"]` is the *base path* and -`"date"` and `"url"` are the *unique keys*. - -The base path defines the table (or recordtype) to which the entries belong, and the values of the -unique keys define the actual rows to which data belongs. - -For example, this table defines three coaches for the two trainings from the last table: - -| `COL_TYPE` | `FOREIGN` | `FOREIGN` | `SCALAR` | -| `PATH` | `Training` | `Training` | `Training` | -| `PATH` | `date` | `url` | `coach` | -| `PATH` | | | `given_name` | -| `IGNORE` | Date of training | URL of training | The coach's given name | -| `IGNORE` | from sheet 'Training' | from sheet 'Training' | | -|------------|-----------------------|-----------------------|------------------------| -| | 2024-02-27 | example.com/mp | Ada | -| | 2024-02-27 | example.com/mp | Berta | -| | 2024-02-27 | example.com/m | Chris | - -#### Sepcial case: multiple choice "checkboxes" #### - -As a special case, enum arrays with `"uniqueItems": true` can be represented as multiple columns, -with one column per choice. The choices are denoted as the last `PATH` component, the column type -must be `MULTIPLE_CHOICE`. - -Stored data is denoted as an "x" character in the respective cell, empty cells denote that the item -was not selected. 
Additionally, the implementation also allows `TRUE` or `1` for selected items, -and `FALSE`, `0` or cells with only whitespace characters for deselected items: - -| `COL_TYPE` | `MULTIPLE_CHOICE` | `MULTIPLE_CHOICE` | `MULTIPLE_CHOICE` | -| `PATH` | `skills` | `skills` | `skills` | -| `PATH` | `Planning` | `Communication` | `Evaluation` | -| `IGNORE` | skills.Planning | skills.Communication | skills.Evaluation | -|------------|-------------------|----------------------|-------------------| -| | x | | X | -| | `" "` | `TRUE` | `FALSE` | -| | 0 | x | 1 | - -These rows correspond to: - -1. Planning, Evaluation -2. Communication -3. Communication, Evaluation - -## Current limitations ## - -The current implementation still lacks the following: - -- Files handling is not implemented yet. diff --git a/src/doc/table-json-conversion/specs.rst b/src/doc/table-json-conversion/specs.rst new file mode 100644 index 0000000000000000000000000000000000000000..c98eddc1180f552f1d2389b1bb57979e93550ab8 --- /dev/null +++ b/src/doc/table-json-conversion/specs.rst @@ -0,0 +1,527 @@ +Conversion between LinkAhead data models, JSON schema, and XLSX (and vice versa) +================================================================================ + +This file describes the conversion between JSON schema files and XLSX +templates, and between JSON data files following a given schema and XLSX +files with data. This conversion is handled by the Python modules in the +``table_json_conversion`` library. + +Data models in JSON Schema and JSON data +---------------------------------------- + +Let’s start simple! If you would describe a ``Person`` Record with the +Properties ``family_name`` and ``given_name`` in JSON, it would probably +look like this: + +.. code:: json + + { + "Person": + { + "family_name": "Steve", + "given_name": "Stevie" + } + } + +The data model in LinkAhead defines the types of records present in a +LinkAhead instance and their structure. 
This data model can also be +represented in a JSON Schema, which defines the structure of JSON files +containing records pertaining to the data model. + +You can define this kind of structure with the following JSON schema: + +.. code:: json + + { + "type": "object", + "properties": { + "Person": { + "type": "object", + "properties": { + "family_name": { + "type": "string" + }, + "given_name": { + "type": "string" + } + } + } + }, + "$schema": "https://json-schema.org/draft/2020-12/schema" + } + +The above schema (and schemas created by +``json_schema_exporter.merge_schemas(...)``) is, from a broad view, a +dict with all the top level recordtypes (the recordtype names are the +keys). This is sufficient to describe the data model. However, actual +data often consists of multiple entries of the same type (e.g. multiple +Persons). + +Since the data model schema does not match multiple data sets, there is +a utility function which creates a *data array* schema out of the *data +model* schema: It basically replaces the top-level entries of the data +model by lists which may contain data. + +For example, the following JSON describes two “Person” Records: + +.. code:: json + + { + "Person": [ + { + "family_name": "Steve", + "given_name": "Stevie" + }, + { + "family_name": "Star", + "given_name": "Stella" + } + ] + } + +The *JSON Schema* for a JSON like the above one could look like the +following: + +.. code:: json + + { + "type": "object", + "properties": { + "Person": { + "type": "array", + "items": { + "type": "object", + "properties": { + "family_name": { + "type": "string" + }, + "given_name": { + "type": "string" + } + } + } + } + }, + "$schema": "https://json-schema.org/draft/2020-12/schema" + } + +This would define that the top level object/dict may have a key +``Person`` which has as value an array of objects that in turn have the +properties ``family_name`` and ``given_name``. 
+ +You can create a data array schema from a data model schema using +``xlsx_utils.array_schema_from_model_schema``. + +From JSON to XLSX: Data Representation +-------------------------------------- + +The following describes how JSON files representing LinkAhead records +are converted into XLSX files, or how JSON files with records are +created from XLSX files. + +The attribute name (e.g., “Person” above) determines the RecordType, and +the value of this attribute can either be an object or a list. If it is +an object (as in the example above), a single record is represented. In +the case of a list, multiple records sharing the same RecordType as the +parent are represented. + +The *Properties* of the record (e.g., ``family_name`` and ``given_name`` +above) become *columns* in the XLSX file. Thus the XLSX file created +from the above example would have a sheet “Person” with the following +table: + +========== =========== +given_name family_name +========== =========== +Stevie Steve +Stella Star +========== =========== + +The properties of objects (Records) in the JSON have an attribute name +and a value. The value can be: + +a. A primitive (text, number, boolean, …) +b. A record +c. A list of primitive types +d. A list of unique enums (multiple choice) +e. A list of records + +In cases *a.* and *c.*, a cell is created in the column corresponding to +the property in the XLSX file. In case *b.*, columns are created for the +Properties of the record, where for each of the Properties the cases +*a.* - *e.* are considered recursively. Case *d.* leads to a number of +columns, one for each of the possible choices. + +For case *e.* however, the two-dimensional structure of an XLSX sheet is +not sufficient. Therefore, for such cases, *new* XLSX sheets/tables are +created. + +In these sheets/tables, the referenced records are treated as described +above (new columns for the Properties). 
However, there are now +additional columns that indicate from which “external” record these +records are referenced. + +Let’s now consider these five cases in detail and with examples: + +a. Properties with primitive data types +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. code:: json + + { + "Training": [ + { + "date": "2023-01-01", + "url": "www.indiscale.com", + "duration": 1.0, + "participants": 1, + "remote": false + }, + { + "date": "2023-06-15", + "url": "www.indiscale.com/next", + "duration": 2.5, + "participants": None, + "remote": true + } + ] + } + +This entry will be represented in an XLSX sheet with the following +content: + ++------------+------------------------+----------+--------------+--------+ +| date | url | duration | participants | remote | ++============+========================+==========+==============+========+ +| 2023-01-01 | www.indiscale.com | 1.0 | 1 | false | ++------------+------------------------+----------+--------------+--------+ +| 2023-06-15 | www.indiscale.com/next | 2.5 | | true | ++------------+------------------------+----------+--------------+--------+ + +b. Property referencing a record +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. code:: json + + { + "Training": [ + { + "date": "2023-01-01", + "supervisor": { + "family_name": "Stevenson", + "given_name": "Stevie" + } + } + ] + } + +This entry will be represented in an XLSX sheet named "Training" with the following +content: + +========== ========================== ========================= +date supervisor.family_name supervisor.given_name +========== ========================== ========================= +2023-01-01 Stevenson Stevie +========== ========================== ========================= + + +c. Properties containing lists of primitive data types +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. 
code:: json + + { + "Training": [ + { + "url": "www.indiscale.com", + "subjects": ["Math", "Physics"], + } + ] + } + +This entry would be represented in an XLSX sheet with the following +content: + +================= ============ +url subjects +================= ============ +www.indiscale.com Math;Physics +================= ============ + +The list elements are written into the cell separated by ``;`` +(semicolon). If the elements contain the separator ``;``, it is escaped +with ``\``. + +d. Multiple choice properties +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. code:: json + + { + "Training": [ + { + "date": "2024-04-17", + "skills": [ + "Planning", + "Evaluation" + ] + } + ] + } + +If the ``skills`` list is denoted as an ``enum`` array with +``"uniqueItems": true`` in the json schema, this entry would be +represented like this in an XLSX: + ++------------+-----------------+----------------------+-------------------+ +| date | skills.Planning | skills.Communication | skills.Evaluation | ++============+=================+======================+===================+ +| 2024-04-17 | x | | x | ++------------+-----------------+----------------------+-------------------+ + +Note that this example assumes that the list of possible choices, as +given in the json schema, was “Planning, Communication, Evaluation”. + +e. Properties containing lists with references +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. code:: json + + { + "Training": [ + { + "date": "2023-01-01", + "coach": [ + { + "family_name": "Sky", + "given_name": "Max", + }, + { + "family_name": "Sky", + "given_name": "Min", + } + ] + } + ] + } + +Since the two coaches cannot be represented properly in a single cell, +another worksheet is needed to contain the properties of the coaches. + +The sheet for the Trainings in this example only contains the “date” +column + ++------------+ +| date | ++============+ +| 2023-01-01 | ++------------+ + +Additionally, there is *another* sheet where the coaches are stored. 
+Here, it is crucial to define how the correct element is chosen from +potentially multiple “Trainings”. In this case, it means that the “date” +must be unique. + + +The second sheet looks like this: + +========== ===================== ==================== +date ``coach.family_name`` ``coach.given_name`` +========== ===================== ==================== +2023-01-01 Sky Max +2023-01-01 Sky Min +========== ===================== ==================== + +Note: This uniqueness requirement is not strictly checked right now, it +is your responsibility as a user that such “foreign properties” are +truly unique. + +When converting JSON files that contain Records that were exported from LinkAhead +it might be a good idea to use the LinkAhead ID as a unique identifier for Records. However, if +your Records do not yet have LinkAhead IDs you need to find some other identifying +properties/foreign keys. Note, that those properties only need to identify a Record uniquely within +the list of Records: In the above example the "coach" Record needs to be identified in the list of +coaches. + + +Data in XLSX: Hidden automation logic +------------------------------------- + +First column: Marker for row types +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The first column in each sheet will be hidden and it will contain an +entry in each row that needs special treatment. The following values are +used: + +- ``IGNORE``: This row is ignored. It can be used for explanatory texts + or layout. +- ``COL_TYPE``: Typically the first row that is not ``IGNORE``. It + indicates the row that defines the type of columns (``FOREIGN``, + ``SCALAR``, ``LIST``, ``MULTIPLE_CHOICE``, ``IGNORE``). This row must + occur exactly once per sheet. +- ``PATH``: Indicates that the row is used to define the path within + the JSON. These rows are typically hidden for users. 
+ +An example table could look like this: + ++----------+-------------------------------------+----------------+---------------+--------------+---------------------+ +| IGNORE | | Welcome | to | this | file | ++==========+=====================================+================+===============+==============+=====================+ +| IGNORE | | Please | enter your | data here: | | ++----------+-------------------------------------+----------------+---------------+--------------+---------------------+ +| COL_TYPE | IGNORE | SCALAR | SCALAR | LIST | SCALAR | ++----------+-------------------------------------+----------------+---------------+--------------+---------------------+ +| PATH | | Training | Training | Training | Training | ++----------+-------------------------------------+----------------+---------------+--------------+---------------------+ +| PATH | | url | date | subjects | supervisor | ++----------+-------------------------------------+----------------+---------------+--------------+---------------------+ +| PATH | | | | | email | ++----------+-------------------------------------+----------------+---------------+--------------+---------------------+ +| IGNORE | Please enter one training per line. | Training URL | Training date | Subjects | Supervisor's email | ++----------+-------------------------------------+----------------+---------------+--------------+---------------------+ +| | | example.com/mp | 2024-02-27 | Math;Physics | steve@example.com | ++----------+-------------------------------------+----------------+---------------+--------------+---------------------+ +| | | example.com/m | 2024-02-28 | Math | stella@example.com | ++----------+-------------------------------------+----------------+---------------+--------------+---------------------+ + + +Parsing XLSX data +~~~~~~~~~~~~~~~~~ + +To extract the value of a given cell, we traverse all path elements (in +``PATH`` rows) from top to bottom. 
The final element of the path is the +name of the Property to which the value belongs. In the example above, +``steve@example.com`` is the value of the ``email`` Property in the path +``["Training", "supervisor", "email"]``. + +The path elements are sufficient to identify the object within a JSON, +at least if the corresponding JSON element is a single object. If the +JSON element is an array, the appropriate object within the array needs +to be selected. + +For this selection additional ``FOREIGN`` columns are used. The paths in +these columns must all have the same *base* and one additional *unique +key* component. For example, two ``FOREIGN`` columns could be +``["Training", "date"]`` and ``["Training", "url"]``, where +``["Training"]`` is the *base path* and ``"date"`` and ``"url"`` are the +*unique keys*. + +The base path defines the table (or recordtype) to which the entries +belong, and the values of the unique keys define the actual rows to +which data belongs. + +For example, this table defines three coaches for the two trainings from +the last table: + ++----------+-----------------------+-----------------------+------------------------+ +| COL_TYPE | FOREIGN | FOREIGN | SCALAR | ++----------+-----------------------+-----------------------+------------------------+ +| PATH | Training | Training | Training | ++----------+-----------------------+-----------------------+------------------------+ +| PATH | date | url | coach | ++----------+-----------------------+-----------------------+------------------------+ +| PATH | | | given_name | ++----------+-----------------------+-----------------------+------------------------+ +| IGNORE | Date of training | URL of training | The coach’s given name | ++----------+-----------------------+-----------------------+------------------------+ +| IGNORE | from sheet ‘Training’ | from sheet ‘Training’ | | ++----------+-----------------------+-----------------------+------------------------+ +| | 2024-02-27 | 
example.com/mp | Ada | ++----------+-----------------------+-----------------------+------------------------+ +| | 2024-02-27 | example.com/mp | Berta | ++----------+-----------------------+-----------------------+------------------------+ +| | 2024-02-28 | example.com/m | Chris | ++----------+-----------------------+-----------------------+------------------------+ + +Sepcial case: multiple choice “checkboxes” +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +As a special case, enum arrays with ``"uniqueItems": true`` can be +represented as multiple columns, with one column per choice. The choices +are denoted as the last PATH component, the column type must be +MULTIPLE_CHOICE. + +Stored data is denoted as an “x” character in the respective cell, empty +cells denote that the item was not selected. Additionally, the +implementation also allows TRUE or 1 for selected items, and +FALSE, 0 or cells with only whitespace characters for deselected +items: + ++----------+-----------------+----------------------+-------------------+ +| COL_TYPE | MULTIPLE_CHOICE | MULTIPLE_CHOICE | MULTIPLE_CHOICE | ++----------+-----------------+----------------------+-------------------+ +| PATH | skills | skills | skills | ++----------+-----------------+----------------------+-------------------+ +| PATH | Planning | Communication | Evaluation | ++----------+-----------------+----------------------+-------------------+ +| IGNORE | skills.Planning | skills.Communication | skills.Evaluation | ++----------+-----------------+----------------------+-------------------+ +| | x | | X | ++----------+-----------------+----------------------+-------------------+ +| | " " | TRUE | FALSE | ++----------+-----------------+----------------------+-------------------+ +| | 0 | x | 1 | ++----------+-----------------+----------------------+-------------------+ + +These rows correspond to: + +1. Planning, Evaluation +2. Communication +3. 
Communication, Evaluation + + +User Interaction +---------------- +The primary and most straight forward use case of this utility is to export +LinkAhead data as JSON and then as XLSX tables. This can be done fully +automatic. + +TODO show how! + +The hidden cells for automation are designed such that the XLSX template that +is created can be customized such that it is a nicely formatted table. The +hidden content must remain. See below for tips on how to manipulate the table. + +The second use case is to use XLSX to collect data and then import it into +LinkAhead. Here, it may be necessary to define foreign keys in order to +identify Records in lists. + +Table Manipulation +~~~~~~~~~~~~~~~~~~ + +- All formatting is ignored +- Nothing has to be observed when adding new data rows +- When adding new descriptory rows (for example one for descriptions of the + columns), the ``COL_TYPE`` must be set to ``IGNORE`` +- You can freely rename sheets. +- You can freely rename columns (since the row containing the column names is + set to ``IGNROE``; the Property name is taken from the last path element) +- You can change the order of columns. However, you have to make sure to move + the full column including hidden elements. Thus you should not select a range + of cells, but click on the column index in your spread sheet program. + +Note: Requirements +------------------ + +This conversion does not allow arbitrary JSON schema files nor does it +support arbitrary JSON files since conversion to XLSX files would not +make sense. Instead, this utility is tailored to supported conversion of +data (models) that are structured like data (models) in LinkAhead: + +- The JSON schema describes a data model of RecordTypes and Properties as it would be generated by the caosadvancedtools.json_schema_exporter module. +- The JSON files must contain arrays of Records complying with such a data model. 
+ +Thus, when converting from a JSON schema, the top level of the JSON +schema must be a dict. The keys of the dict are RecordType names. + + + + +Current limitations +------------------- + +The current implementation still lacks the following: + +- Files handling is not implemented yet. + diff --git a/unittests/table_json_conversion/create_jsonschema.py b/unittests/table_json_conversion/create_jsonschema.py index 9585f5458edf8f9d3f785099295a3e675230932c..8ab4ad2d973b78522e858b3ee866b870ecf187a4 100755 --- a/unittests/table_json_conversion/create_jsonschema.py +++ b/unittests/table_json_conversion/create_jsonschema.py @@ -20,17 +20,18 @@ """ +from __future__ import annotations + import argparse import json -from typing import List import caosadvancedtools.json_schema_exporter as jsex from caosadvancedtools.models import parser # import tomli -def prepare_datamodel(modelfile, recordtypes: List[str], outfile: str, - do_not_create: List[str] = None): +def prepare_datamodel(modelfile, recordtypes: list[str], outfile: str, + do_not_create: list[str] = None): if do_not_create is None: do_not_create = [] model = parser.parse_model_from_yaml(modelfile) diff --git a/unittests/table_json_conversion/data/indirect_data.xlsx b/unittests/table_json_conversion/data/indirect_data.xlsx index 894ec95f87aa32a618b3b70504727398f2ce2358..3d0cf3245a414a4161b99034051424c699b5d453 100644 Binary files a/unittests/table_json_conversion/data/indirect_data.xlsx and b/unittests/table_json_conversion/data/indirect_data.xlsx differ diff --git a/unittests/table_json_conversion/data/indirect_template.xlsx b/unittests/table_json_conversion/data/indirect_template.xlsx index cc614acb75b36e10143a29f28dff9fce7d5e006f..0c521e554027f565ecae6fe27783034361b8ff41 100644 Binary files a/unittests/table_json_conversion/data/indirect_template.xlsx and b/unittests/table_json_conversion/data/indirect_template.xlsx differ diff --git a/unittests/table_json_conversion/test_read_data.py 
b/unittests/table_json_conversion/test_read_data.py new file mode 100644 index 0000000000000000000000000000000000000000..eef9a9d88ddbf26fbf658ac5c3753e5133a831be --- /dev/null +++ b/unittests/table_json_conversion/test_read_data.py @@ -0,0 +1,142 @@ +# encoding: utf-8 +# +# This file is a part of the LinkAhead Project. +# +# Copyright (C) 2024 Indiscale GmbH <info@indiscale.com> +# Copyright (C) 2024 Daniel Hornung <d.hornung@indiscale.com> +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <https://www.gnu.org/licenses/>. +"""Testing the conversion from XLSX to JSON""" + + +import json +import os +import re + +from types import SimpleNamespace + +import pytest +from caosadvancedtools.table_json_conversion import convert + +from .utils import assert_equal_jsons + + +def rfp(*pathcomponents): + """Return full path, a shorthand convenience function. + """ + return os.path.join(os.path.dirname(__file__), *pathcomponents) + + +def convert_and_compare(xlsx_file: str, schema_file: str, known_good_file: str, + strict: bool = False) -> dict: + """Convert an XLSX file and compare to a known result. + +Returns +------- +json: dict + The result of the conversion. 
+ """ + result = convert.to_dict(xlsx=xlsx_file, schema=schema_file) + with open(known_good_file, encoding="utf-8") as myfile: + expected = json.load(myfile) + assert_equal_jsons(result, expected, allow_none=not strict, allow_empty=not strict) + return result + + +def test_conversions(): + """Test conversion from XLSX to JSON.""" + convert_and_compare(xlsx_file=rfp("data/simple_data.xlsx"), + schema_file=rfp("data/simple_schema.json"), + known_good_file=rfp("data/simple_data.json")) + convert_and_compare(xlsx_file=rfp("data/multiple_refs_data.xlsx"), + schema_file=rfp("data/multiple_refs_schema.json"), + known_good_file=rfp("data/multiple_refs_data.json")) + convert_and_compare(xlsx_file=rfp("data/indirect_data.xlsx"), + schema_file=rfp("data/indirect_schema.json"), + known_good_file=rfp("data/indirect_data.json")) + convert_and_compare(xlsx_file=rfp("data/multiple_choice_data.xlsx"), + schema_file=rfp("data/multiple_choice_schema.json"), + known_good_file=rfp("data/multiple_choice_data.json"), + strict=True) + + # Data loss when saving as xlsx + with pytest.raises(AssertionError) as err: + convert_and_compare(xlsx_file=rfp("data/simple_data_ascii_chars.xlsx"), + schema_file=rfp("data/simple_schema.json"), + known_good_file=rfp("data/simple_data_ascii_chars.json")) + assert str(err.value).startswith("Values at path ['Training', 0, ") + + +def test_set_in_nested(): + """Test the ``_set_in_nested`` function.""" + set_in_nested = convert._set_in_nested # pylint: disable=protected-access + + test_data_in = [ + {"mydict": {}, "path": ["a", 1], "value": 3}, + {"mydict": {"a": 1}, "path": ["a"], "value": 3, "overwrite": True}, + {"mydict": {"a": 1}, "path": ["a", 1], "value": 3, "overwrite": True}, + {"mydict": {"b": 2}, "path": ["a", 1, 3.141], "value": 3}, + {"mydict": {}, "path": ["X", "Y", "a", 1], "value": 3, "prefix": ["X", "Y"]}, + ] + test_data_out = [ + {"a": {1: 3}}, + {"a": 3}, + {"a": {1: 3}}, + {"a": {1: {3.141: 3}}, "b": 2}, + {"a": {1: 3}}, + ] + + for 
data_in, data_out in zip(test_data_in, test_data_out): + assert set_in_nested(**data_in) == data_out + + # Testing exceptions + test_data_in = [ + {"mydict": {"a": 1}, "path": ["a"], "value": 3}, + {"mydict": {"a": 1}, "path": ["a", 1], "value": 3}, + {"mydict": {}, "path": ["a", 1], "value": 3, "prefix": ["X", "Y", "Z"]}, + ] + exceptions = [ + [ValueError, r"There is already some value at \[a\]"], + [ValueError, r"There is already some value at \[1\]"], + [KeyError, r"Path does not start with prefix: \['X', 'Y', 'Z'\] not in \['a', 1\]"], + ] + + for data_in, (exc_out, match) in zip(test_data_in, exceptions): + with pytest.raises(exc_out, match=match): + set_in_nested(**data_in) + + +def test_group_foreign_paths(): + """Test the ``_group_foreign_paths`` function.""" + group = convert._group_foreign_paths # pylint: disable=protected-access + + foreign = [ + ["A", "x", 1.1], + ["A", "y", "z", "some text"], + ["A", "B", "CC", "x", 42], + ] + common = ["A", "B", "CC"] + common_wrong = ["A", "B", "C"] + expected = [ + SimpleNamespace(stringpath="A", path=["A"], subpath=["A"], + definitions=[["x", 1.1], ["y", "z", "some text"]]), + SimpleNamespace(stringpath="A.B.CC", path=["A", "B", "CC"], subpath=["B", "CC"], + definitions=[["x", 42]]), + ] + + with pytest.raises(ValueError, match=re.escape( + "Foreign keys must cover the complete `common` depth.")): + result = group(foreign=foreign, common=common_wrong) + result = group(foreign=foreign, common=common) + assert result == expected diff --git a/unittests/table_json_conversion/test_table_template_generator.py b/unittests/table_json_conversion/test_table_template_generator.py index 070a7908dc3884a5a3f721140ff245617753d5e5..d9a84dcf53ec991eec709aab406a7652881e6ea8 100644 --- a/unittests/table_json_conversion/test_table_template_generator.py +++ b/unittests/table_json_conversion/test_table_template_generator.py @@ -22,7 +22,6 @@ import json import os import tempfile -from typing import Tuple import pytest from 
caosadvancedtools.table_json_conversion.table_generator import XLSXTemplateGenerator @@ -41,7 +40,7 @@ def rfp(*pathcomponents): def _compare_generated_to_known_good(schema_file: str, known_good: str, foreign_keys: dict = None, - outfile: str = None) -> Tuple: + outfile: str = None) -> tuple: """Generate an XLSX from the schema, then compare to known good output. Returns @@ -173,10 +172,10 @@ def test_generate_sheets_from_schema(): def test_get_foreign_keys(): generator = XLSXTemplateGenerator() fkd = {"Training": ['a']} - assert ['a'] == generator._get_foreign_keys(fkd, ['Training']) + assert [['a']] == generator._get_foreign_keys(fkd, ['Training']) fkd = {"Training": {"__this__": ['a']}} - assert ['a'] == generator._get_foreign_keys(fkd, ['Training']) + assert [['a']] == generator._get_foreign_keys(fkd, ['Training']) fkd = {"Training": {'hallo'}} with pytest.raises(ValueError, match=r"A foreign key definition is missing for path:\n\[" @@ -184,7 +183,7 @@ def test_get_foreign_keys(): generator._get_foreign_keys(fkd, ['Training']) fkd = {"Training": {"__this__": ['a'], 'b': ['c']}} - assert ['c'] == generator._get_foreign_keys(fkd, ['Training', 'b']) + assert [['c']] == generator._get_foreign_keys(fkd, ['Training', 'b']) with pytest.raises(ValueError, match=r"A foreign key definition is missing for .*"): generator._get_foreign_keys({}, ['Training']) @@ -264,7 +263,7 @@ def test_model_with_indirect_reference(): _compare_generated_to_known_good( schema_file=rfp("data/indirect_schema.json"), known_good=rfp("data/indirect_template.xlsx"), - foreign_keys={"Wrapper": ["Training.name", "Training.url"]}, + foreign_keys={"Wrapper": {"__this__": [["Training", "name"], ["Training", "url"]]}}, outfile=None) diff --git a/unittests/table_json_conversion/test_test_utils.py b/unittests/table_json_conversion/test_test_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..30171f61de26b1ae11fb25c730c96b31aa8f06a3 --- /dev/null +++ 
# encoding: utf-8
#
# This file is a part of the LinkAhead Project.
#
# Copyright (C) 2024 Indiscale GmbH <info@indiscale.com>
# Copyright (C) 2024 Daniel Hornung <d.hornung@indiscale.com>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
"""Testing the ``utils`` module in this folder."""


from .utils import _is_recursively_none


def test_recursively_none():
    """Testing ``_is_recursively_none``."""
    # Inputs consisting only of None, empty containers or nestings thereof.
    none_like = [
        None, [], {}, [None], {"a": None}, [[], [None, None]],
        {1: [], 2: [None], 3: {"3.1": None}, 4: {"4.1": [None]}},
    ]
    for candidate in none_like:
        assert _is_recursively_none(candidate)

    # Inputs containing at least one substantive value somewhere.
    substantive = [
        1, [1], {1: 2}, [[1]], {"a": None, "b": "b"}, [[], [None, 2]],
        {1: [], 2: [None], 3: {"3.1": 3.141}, 4: {"4.1": [None]}},
    ]
    for candidate in substantive:
        assert not _is_recursively_none(candidate)
a/unittests/table_json_conversion/utils.py +++ b/unittests/table_json_conversion/utils.py @@ -1,3 +1,5 @@ +# encoding: utf-8 +# # This file is a part of the LinkAhead Project. # # Copyright (C) 2024 IndiScale GmbH <info@indiscale.com> @@ -19,9 +21,53 @@ """Utilities for the tests. """ +from typing import Iterable, Union + from openpyxl import Workbook +def assert_equal_jsons(json1, json2, allow_none: bool = True, allow_empty: bool = True, + path: list = None) -> None: + """Compare two json objects for near equality. + +Raise an assertion exception if they are not equal.""" + if path is None: + path = [] + assert isinstance(json1, dict) == isinstance(json2, dict), f"Type mismatch, path: {path}" + if isinstance(json1, dict): + keys = set(json1.keys()).union(json2.keys()) + for key in keys: + this_path = path + [key] + # Case 1: exists in both collections + if key in json1 and key in json2: + el1 = json1[key] + el2 = json2[key] + assert isinstance(el1, type(el2)), f"Type mismatch, path: {this_path}" + if isinstance(el1, (dict, list)): + # Iterables: Recursion + assert_equal_jsons(el1, el2, allow_none=allow_none, allow_empty=allow_empty, + path=this_path) + continue + assert el1 == el2, f"Values at path {this_path} are not equal:\n{el1},\n{el2}" + continue + # Case 2: exists only in one collection + existing = json1.get(key, json2.get(key)) + assert ((allow_none and _is_recursively_none(existing)) + or (allow_empty and existing == [])), ( + f"Element at path {this_path} is None or empty in one json and does not exist in " + "the other.") + return + assert isinstance(json1, list) and isinstance(json2, list), f"Is not a list, path: {path}" + assert len(json1) == len(json2), f"Lists must have equal length, path: {path}" + for idx, (el1, el2) in enumerate(zip(json1, json2)): + this_path = path + [idx] + if isinstance(el1, dict): + assert_equal_jsons(el1, el2, allow_none=allow_none, allow_empty=allow_empty, + path=this_path) + else: + assert el1 == el2, f"Values at path 
{this_path} are not equal:\n{el1},\n{el2}" + + def compare_workbooks(wb1: Workbook, wb2: Workbook, hidden: bool = True): """Compare two workbooks for equal content. @@ -52,3 +98,19 @@ hidden: bool, optional f"Sheet: {sheetname}, cell: {cell1.coordinate}, Values: \n" f"{cell1.value}\n{cell2.value}" ) + + +def _is_recursively_none(obj: Union[list, dict] = None): + """Test if ``obj`` is None or recursively consists only of None-like objects.""" + if obj is None: + return True + if isinstance(obj, (list, dict)): + if isinstance(obj, list): + mylist: Iterable = obj + else: + mylist = obj.values() + for element in mylist: + if not _is_recursively_none(element): + return False + return True + return False