diff --git a/.gitignore b/.gitignore index 4c175607e5327472c301949f187c58d925f0d05e..2b50c0fc33c80b9d83bc913c1e23836b51049f1c 100644 --- a/.gitignore +++ b/.gitignore @@ -3,6 +3,7 @@ src/caosadvancedtools/version.py # compiled python and dist stuff +.venv *.egg .eggs *.egg-info/ diff --git a/CHANGELOG.md b/CHANGELOG.md index 62773ef94aeea0560bdea0bd3effa8b3e620f915..58ea93cdf96d767600f5ad7e48c47a60677eeb37 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Added ### +- XLSX template generator +- JSON schema exporter: + - has a new parameter `use_rt_pool` + - propagates more properties in the `make_array` function + ### Changed ### ### Deprecated ### @@ -19,6 +24,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Fixed ### +- JSON schema exporter handles reference properties better. + ### Security ### ### Documentation ### diff --git a/Makefile b/Makefile index d9b182cbd0b17490e9d81b900d6ba8cefadb1b64..0c586c1e43f75d0b54eea2b625f25d6a9d6318a9 100644 --- a/Makefile +++ b/Makefile @@ -36,10 +36,10 @@ unittest: pytest-3 unittests style: - pycodestyle --count src unittests --exclude=swagger_client - autopep8 -ar --diff --exit-code --exclude swagger_client . + pycodestyle --count --exclude=swagger_client src unittests + autopep8 -ar --diff --exit-code --exclude swagger_client src unittests .PHONY: style lint: - pylint --unsafe-load-any-extension=y -d all -e E,F --ignore=swagger_client src/caosadvancedtools + pylint --unsafe-load-any-extension=y -d R,C --ignore=swagger_client src/caosadvancedtools .PHONY: lint diff --git a/integrationtests/test_json_schema_exporter.py b/integrationtests/test_json_schema_exporter.py index 69edcf42d1fd285c030ad6d6ccb7f73f2d1b5536..44b428263ebbd9696fc2a171ea356d764482d5e3 100644 --- a/integrationtests/test_json_schema_exporter.py +++ b/integrationtests/test_json_schema_exporter.py @@ -20,9 +20,12 @@ # with this program. If not, see <https://www.gnu.org/licenses/>.
# +import json + import linkahead as db from caosadvancedtools.json_schema_exporter import recordtype_to_json_schema as rtjs +from caosadvancedtools.models.parser import parse_model_from_string def _delete_everything(): @@ -75,3 +78,37 @@ def test_uniqueness_of_reference_types(): assert one_of[1 - enum_index]["type"] == "object" # No properties in parent_type assert len(one_of[1 - enum_index]["properties"]) == 0 + + +def test_reference_property(): + model_string = """ +RT1: + description: Some recordtype +RT2: + obligatory_properties: + prop1: + description: Some reference property + datatype: RT1 + """ + model = parse_model_from_string(model_string) + model.sync_data_model(noquestion=True) + schema = rtjs(db.RecordType(name="RT2").retrieve()) + assert json.dumps(schema, indent=2) == """{ + "type": "object", + "required": [ + "prop1" + ], + "additionalProperties": true, + "title": "RT2", + "properties": { + "prop1": { + "type": "object", + "required": [], + "additionalProperties": true, + "description": "Some reference property", + "title": "prop1", + "properties": {} + } + }, + "$schema": "https://json-schema.org/draft/2020-12/schema" +}""" diff --git a/pylintrc b/pylintrc index 625f83ce950841f7a239538123ef7b5812fc5c5f..d3a2e89ae1990480e5377daf443e0a63224342bc 100644 --- a/pylintrc +++ b/pylintrc @@ -1,5 +1,3 @@ -# -*- mode:conf; -*- - [FORMAT] # Good variable names which should always be accepted, separated by a comma good-names=ii,rt,df @@ -17,3 +15,8 @@ init-hook= import sys; sys.path.extend(["src/caosadvancedtools"]); import astroid; astroid.context.InferenceContext.max_inferred = 500; +[MESSAGES CONTROL] +disable= + fixme, + logging-format-interpolation, + logging-not-lazy, diff --git a/src/caosadvancedtools/cfoods/h5.py b/src/caosadvancedtools/cfoods/h5.py index 4e6832f2e96e0950ed99146d4907f1ffb70d8494..f3d41dd0037ce9393b5b7dac3cc5bb6cc3db41d8 100644 --- a/src/caosadvancedtools/cfoods/h5.py +++ b/src/caosadvancedtools/cfoods/h5.py @@ -1,9 +1,9 @@ #!/usr/bin/env python3 -# This file is a part of the CaosDB Project. +# This file is a part of the LinkAhead Project. # # Copyright (C) 2020,2021 IndiScale GmbH <www.indiscale.com> -# Copyright (C) 2020 Daniel Hornung <d.hornung@indiscale.com> +# Copyright (C) 2020-2025 Daniel Hornung <d.hornung@indiscale.com> # Copyright (C) 2021 Henrik tom Wörden <h.tomwoerden@indiscale.com> # Copyright (C) 2021 Alexander Kreft # Copyright (C) 2021 Laboratory for Fluid Physics and Biocomplexity, @@ -33,19 +33,14 @@ attributes. Groups and datasets are mapped to Records and attributes to Properties. 
""" -import re from copy import deepcopy import caosdb as db import h5py import numpy as np from caosadvancedtools.cfood import fileguide -from caosdb.common.datatype import is_reference -from caosdb.common.utils import uuid -from ..cfood import (AbstractFileCFood, assure_has_description, - assure_has_parent, assure_has_property, - assure_property_is) +from ..cfood import AbstractFileCFood from ..structure_mapping import (EntityMapping, collect_existing_structure, update_structure) @@ -100,8 +95,7 @@ def h5_attr_to_property(val): if hasattr(val, 'ndim'): if not isinstance(val, np.ndarray) and val.ndim != 0: print(val, val.ndim) - raise Exception( - "Implementation assumes that only np.arrays have ndim.") + raise RuntimeError("Implementation assumes that only np.arrays have ndim.") return val, dtype @@ -127,6 +121,8 @@ class H5CFood(AbstractFileCFood): self.identifiable_root = None self.root_name = "root" self.hdf5Container = db.Container() + self.to_be_inserted = db.Container() + self.structure = db.Container() self.em = EntityMapping() def collect_information(self): @@ -165,7 +161,8 @@ class H5CFood(AbstractFileCFood): """ - self.structure._cuid = "root element" + # TODO Why do we need a protected member here? + self.structure._cuid = "root element" # pylint: disable=protected-access self.em.add(self.structure, self.identifiable_root) collect_existing_structure(self.structure, self.identifiable_root, self.em) @@ -282,7 +279,7 @@ class H5CFood(AbstractFileCFood): return rec def insert_missing_structure(self, target_structure: db.Record): - if target_structure._cuid not in self.em.to_existing: + if target_structure._cuid not in self.em.to_existing: # pylint: disable=protected-access self.to_be_inserted.append(target_structure) for prop in target_structure.get_properties(): diff --git a/src/caosadvancedtools/json_schema_exporter.py b/src/caosadvancedtools/json_schema_exporter.py index 700c24e890c36a5b4219a1c2cc7d74ce38d6d398..5d18ad066452968abdc04c3c5d0ff1c5ba63a384 100644 --- a/src/caosadvancedtools/json_schema_exporter.py +++ b/src/caosadvancedtools/json_schema_exporter.py @@ -28,7 +28,9 @@ from collections import OrderedDict from typing import Any, Dict, Iterable, List, Optional, Sequence, Tuple, Union import linkahead as db +from linkahead.cached import cached_query, cache_clear from linkahead.common.datatype import get_list_datatype, is_list_datatype +from .models.data_model import DataModel class JsonSchemaExporter: @@ -45,6 +47,7 @@ class JsonSchemaExporter: do_not_create: List[str] = None, do_not_retrieve: List[str] = None, no_remote: bool = False, + use_rt_pool: DataModel = None, multiple_choice: List[str] = None, wrap_files_in_objects: bool = False, ): @@ -73,16 +76,21 @@ class JsonSchemaExporter: description of the corresponding schema entry. If set to false, an additional `unit` key is added to the schema itself which is purely annotational and ignored, e.g., in validation. Default is True. - do_not_create : list[str] + do_not_create : list[str], optional A list of reference Property names, for which there should be no option to create them. Instead, only the choice of existing elements should be given. - do_not_retrieve : list[str] + do_not_retrieve : list[str], optional A list of RedcordType names, for which no Records shall be retrieved. Instead, only an object description should be given. If this list overlaps with the `do_not_create` parameter, the behavior is undefined. - no_remote : bool - If True, do not attempt to connect to a LinkAhead server at all. Default is False. 
+ no_remote : bool, optional + If True, do not attempt to connect to a LinkAhead server at all. Default is False. Note + that the exporter may fail if this option is activated and the data model is not + self-sufficient. + use_rt_pool : DataModel, optional + If given, do not attempt to retrieve RecordType information remotely; take it from this + data model instead. multiple_choice : list[str], optional A list of reference Property names which shall be denoted as multiple choice properties. This means that each option in this property may be selected at most once. This is not @@ -108,6 +116,8 @@ class JsonSchemaExporter: if not multiple_choice: multiple_choice = [] + cache_clear() + self._additional_properties = additional_properties self._name_property_for_new_records = name_property_for_new_records self._description_property_for_new_records = description_property_for_new_records @@ -118,6 +128,7 @@ class JsonSchemaExporter: self._do_not_create = do_not_create self._do_not_retrieve = do_not_retrieve self._no_remote = no_remote + self._use_rt_pool = use_rt_pool self._multiple_choice = multiple_choice self._wrap_files_in_objects = wrap_files_in_objects @@ -129,8 +140,6 @@ class JsonSchemaExporter: if rt.get_importance(prop.name) != db.OBLIGATORY: continue prop_name = prop.name - if isinstance(prop.datatype, db.Entity): - prop_name = prop.datatype.name required_list.append(prop_name) return required_list @@ -256,12 +265,24 @@ class JsonSchemaExporter: # Only a simple list of values json_prop["enum"] = values else: - if self._no_remote: + if self._use_rt_pool: + rt = self._use_rt_pool.get_deep(prop_name) + elif self._no_remote: rt = prop.datatype else: - rt = db.execute_query(f"FIND RECORDTYPE WITH name='{prop_name}'", - unique=True) + results = cached_query(f"FIND RECORDTYPE WITH name='{prop_name}'") + assert len(results) <= 1 + if len(results): + rt = results[0] + else: + rt = db.Entity() subschema, ui_schema = self._make_segment_from_recordtype(rt) + if prop.is_reference(): + if prop.name: + subschema["title"] = prop.name + if prop.description: + subschema["description"] = prop.description + # if inner_ui_schema: # ui_schema = inner_ui_schema if values: @@ -334,7 +355,7 @@ class JsonSchemaExporter: if self._no_remote: return [] - possible_values = db.execute_query(f"SELECT name, id FROM {role}") + possible_values = cached_query(f"SELECT name, id FROM {role}") vals = [] for val in possible_values: @@ -479,6 +500,7 @@ def recordtype_to_json_schema(rt: db.RecordType, additional_properties: bool = T do_not_create: List[str] = None, do_not_retrieve: List[str] = None, no_remote: bool = False, + use_rt_pool: DataModel = None, multiple_choice: List[str] = None, rjsf: bool = False, wrap_files_in_objects: bool = False @@ -524,6 +546,9 @@ def recordtype_to_json_schema(rt: db.RecordType, additional_properties: bool = T parameter, the behavior is undefined. no_remote : bool, optional If True, do not attempt to connect to a LinkAhead server at all. Default is False. + use_rt_pool : DataModel, optional + If given, do not attempt to retrieve RecordType information remotely; take it from this + data model instead. multiple_choice : list[str], optional A list of reference Property names which shall be denoted as multiple choice properties. This means that each option in this property may be selected at most once.
This is not @@ -560,6 +585,7 @@ def recordtype_to_json_schema(rt: db.RecordType, additional_properties: bool = T do_not_create=do_not_create, do_not_retrieve=do_not_retrieve, no_remote=no_remote, + use_rt_pool=use_rt_pool, multiple_choice=multiple_choice, wrap_files_in_objects=wrap_files_in_objects ) @@ -603,8 +629,16 @@ ui_schema : dict, optional "$schema": "https://json-schema.org/draft/2020-12/schema", } + if schema.get("description"): + result["description"] = schema["description"] + if rjsf_uischema is not None: ui_schema = {"items": rjsf_uischema} + # Propagate ui: options up one level. + for key in rjsf_uischema.keys(): + if key.startswith("ui:"): + ui_schema[key] = rjsf_uischema[key] + return result, ui_schema return result diff --git a/src/caosadvancedtools/scifolder/analysis_cfood.py b/src/caosadvancedtools/scifolder/analysis_cfood.py index 27cb871aed08f41531c367567ea36ea9a3faaf69..608054cf1fa0eacd3e86f086bfc98b526357302a 100644 --- a/src/caosadvancedtools/scifolder/analysis_cfood.py +++ b/src/caosadvancedtools/scifolder/analysis_cfood.py @@ -16,17 +16,14 @@ # You should have received a copy of the GNU Affero General Public License # along with this program. If not, see <https://www.gnu.org/licenses/>. -import os -from itertools import chain - import caosdb as db -from caosadvancedtools.cfood import (AbstractFileCFood, assure_has_parent, +from caosadvancedtools.cfood import (AbstractFileCFood, assure_has_property, - assure_object_is_in_list, get_entity) -from caosadvancedtools.read_md_header import get_header + assure_object_is_in_list, + ) from .generic_pattern import full_pattern -from .utils import (get_files_referenced_by_field, parse_responsibles, +from .utils import (parse_responsibles, reference_records_corresponding_to_files) from .withreadme import DATAMODEL as dm from .withreadme import (RESULTS, REVISIONOF, SCRIPTS, SOURCES, WithREADME, diff --git a/src/caosadvancedtools/table_json_conversion/__init__.py b/src/caosadvancedtools/table_json_conversion/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/src/caosadvancedtools/table_json_conversion/fill_xlsx.py b/src/caosadvancedtools/table_json_conversion/fill_xlsx.py new file mode 100644 index 0000000000000000000000000000000000000000..60b5c96c7de141b1ecb12254e6928252fe4a9f5c --- /dev/null +++ b/src/caosadvancedtools/table_json_conversion/fill_xlsx.py @@ -0,0 +1,398 @@ +#!/usr/bin/env python3 +# encoding: utf-8 +# +# This file is a part of the LinkAhead Project. +# +# Copyright (C) 2024 Indiscale GmbH <info@indiscale.com> +# Copyright (C) 2024 Henrik tom Wörden <h.tomwoerden@indiscale.com> +# Copyright (C) 2024 Daniel Hornung <d.hornung@indiscale.com> +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <https://www.gnu.org/licenses/>. 
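+ +"""Fill data from JSON documents into pre-generated XLSX templates. + +This module docstring is a summary added for orientation: the entry point is +:func:`fill_template`, while :class:`TemplateFiller` implements the actual filling logic. A +template is assumed to be an XLSX workbook as produced by +``table_generator.XLSXTemplateGenerator``, i.e. with hidden ``COL_TYPE`` and ``PATH`` rows (see +``src/doc/table-json-conversion/specs.md``). +"""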
+ +from __future__ import annotations + +import json +import pathlib +from collections import OrderedDict +from types import SimpleNamespace +from typing import Any, Dict, List, Optional, TextIO, Union +from warnings import warn + +from jsonschema import FormatChecker, validate +from jsonschema.exceptions import ValidationError +from openpyxl import Workbook, load_workbook +from openpyxl.cell.cell import ILLEGAL_CHARACTERS_RE +from openpyxl.worksheet.worksheet import Worksheet + +from .table_generator import ColumnType, RowType +from .utils import p2s + + +def _is_exploded_sheet(sheet: Worksheet) -> bool: + """Return True if this is an "exploded" sheet. + + An exploded sheet is a sheet whose data entries are LIST-valued properties of entries in another + sheet. A sheet is detected as exploded iff it has FOREIGN columns. + """ + column_types = _get_column_types(sheet) + return ColumnType.FOREIGN.name in column_types.values() + + +def _get_column_types(sheet: Worksheet) -> OrderedDict: + """Return an OrderedDict: column index -> column type for the sheet. + """ + result = OrderedDict() + type_row_index = _get_row_type_column_index(sheet) + for idx, col in enumerate(sheet.columns): + type_cell = col[type_row_index] + result[idx] = type_cell.value if type_cell.value is not None else ColumnType.IGNORE.name + assert (hasattr(ColumnType, result[idx]) + or result[idx] == RowType.COL_TYPE.name), ( + f"Unexpected column type value ({idx}, {type_row_index}): {type_cell.value}") + return result + + +def _get_foreign_key_columns(sheet: Worksheet) -> Dict[str, SimpleNamespace]: + """Return the foreign keys of the worksheet. + +Returns +------- +out: dict[str, SimpleNamespace] + The keys are the stringified paths. The values are SimpleNamespace objects with ``index``, + ``path`` and ``column`` attributes. + """ + column_types = _get_column_types(sheet) + path_rows = _get_path_rows(sheet) + result = OrderedDict() + for for_idx, name in column_types.items(): + if name != ColumnType.FOREIGN.name: + continue + path = [] + for row in path_rows: + component = sheet.cell(row=row+1, column=for_idx+1).value + if component is None: + break + assert isinstance(component, str), f"Expected string: {component}" + path.append(component) + result[p2s(path)] = SimpleNamespace(index=for_idx, path=path, + column=list(sheet.columns)[for_idx]) + return result + + +def _get_row_type_column_index(sheet: Worksheet): + """Return the column index (0-indexed) of the column which defines the row types. + """ + for col in sheet.columns: + for cell in col: + if cell.value == RowType.COL_TYPE.name: + return cell.column - 1 + raise ValueError("The column which defines row types (COL_TYPE, PATH, ...) is missing") + + +def _get_path_rows(sheet: Worksheet): + """Return the 0-based indices of the rows which represent paths.""" + rows = [] + rt_col = _get_row_type_column_index(sheet) + for cell in list(sheet.columns)[rt_col]: + if cell.value == RowType.PATH.name: + rows.append(cell.row-1) + return rows + + +def _next_row_index(sheet: Worksheet) -> int: + """Return the index for the next data row. + + This is defined as the first row without any content.
+ """ + return sheet.max_row + + +def _read_or_dict(data: Union[dict, str, TextIO]) -> dict: + """If data is a json file name or input stream, read data from there.""" + if isinstance(data, dict): + pass + elif isinstance(data, str): + with open(data, encoding="utf-8") as infile: + data = json.load(infile) + elif hasattr(data, "read"): + data = json.load(data) + else: + raise ValueError(f"I don't know how to handle the datatype of `data`: {type(data)}") + assert isinstance(data, dict) + return data + + +class TemplateFiller: + """Class to fill XLSX templates. Has an index for all relevant columns.""" + + def __init__(self, workbook: Workbook, graceful: bool = False): + self._workbook = workbook + self._graceful = graceful + self._create_index() + + @property + def workbook(self): + return self._workbook + + def fill_data(self, data: dict): + """Fill the data into the workbook.""" + self._handle_data(data=data) + + class Context: + """Context for an entry: simple properties of all ancestors, organized in a dict. + + This is similar to a dictionary with all scalar element properties at the tree nodes up to + the root. Siblings in lists and dicts are ignored. Additionally the context knows where + its current position is. + + Lookup of elements can easily be achieved by giving the path (as ``list[str]`` or + stringified path). + + """ + + def __init__(self, current_path: List[str] = None, props: Dict[str, Any] = None): + self._current_path = current_path if current_path is not None else [] + self._props = props if props is not None else {} # this is flat + + def copy(self) -> TemplateFiller.Context: + """Deep copy.""" + result = TemplateFiller.Context(current_path=self._current_path.copy(), + props=self._props.copy()) + return result + + def next_level(self, next_level: str) -> TemplateFiller.Context: + result = self.copy() + result._current_path.append(next_level) # pylint: disable=protected-access + return result + + def __getitem__(self, path: Union[List[str], str], owner=None) -> Any: + if isinstance(path, list): + path = p2s(path) + return self._props[path] + + def __setitem__(self, propname: str, value): + fullpath = p2s(self._current_path + [propname]) + self._props[fullpath] = value + + def fill_from_data(self, data: Dict[str, Any]): + # TODO recursive for dicts and list? + """Fill current level with all scalar elements of ``data``.""" + for name, value in data.items(): + if not isinstance(value, (dict, list)): + self[name] = value + elif isinstance(value, dict): + if not value or isinstance(list(value.items())[0], list): + continue + old_path = self._current_path + new_path = self._current_path.copy() + [name] + self._current_path = new_path + self.fill_from_data(data=value) + self._current_path = old_path + + def _create_index(self): + """Create a sheet index for the workbook. + + Index the sheets by all path arrays leading to them. Also create a simple column index by + column type and path. + + """ + self._sheet_index = {} + for sheetname in self._workbook.sheetnames: + sheet = self._workbook[sheetname] + type_column = [x.value for x in list(sheet.columns)[ + _get_row_type_column_index(sheet)]] + # 0-indexed, as everything outside of sheet.cell(...): + coltype_idx = type_column.index(RowType.COL_TYPE.name) + path_indices = [i for i, typ in enumerate(type_column) if typ == RowType.PATH.name] + + # Get the paths, use without the leaf component for sheet indexing, with type prefix and + # leaf for column indexing. 
+ for col_idx, col in enumerate(sheet.columns): + if col[coltype_idx].value == RowType.COL_TYPE.name: + continue + path = [] + for path_idx in path_indices: + if col[path_idx].value is not None: + path.append(col[path_idx].value) + # col_key = p2s([col[coltype_idx].value] + path) + # col_index[col_key] = SimpleNamespace(column=col, col_index=col_idx) + if col[coltype_idx].value not in [ColumnType.SCALAR.name, ColumnType.LIST.name]: + continue + + path_str = p2s(path) + assert path_str not in self._sheet_index + self._sheet_index[path_str] = SimpleNamespace( + sheetname=sheetname, sheet=sheet, col_index=col_idx, + col_type=col[coltype_idx].value) + + def _handle_data(self, data: dict, current_path: List[str] = None, + context: TemplateFiller.Context = None, + only_collect_insertables: bool = False, + ) -> Optional[Dict[str, Any]]: + """Handle the data and write it into ``workbook``. + +Parameters +---------- +data: dict + The data at the current path position. Elements may be dicts, lists or simple scalar values. + +current_path: list[str], optional + If this is None or empty, we are at the top level. This means that all children shall be entered + into their respective sheets and not into a sheet at this level. ``current_path`` and ``context`` + must either both be given, or none of them. + +context: TemplateFiller.Context, optional + Directory of scalar element properties at the tree nodes up to the root. Siblings in lists + and dicts are ignored. ``context`` and ``current_path`` must either both be given, or none of + them. + +only_collect_insertables: bool, optional + If True, do not insert anything on this level, but return a dict with entries to be inserted. + + +Returns +------- +out: union[dict, None] + If ``only_collect_insertables`` is True, return a dict (path string -> value); otherwise + return None. + """ + assert (current_path is None) is (context is None), ( + "`current_path` and `context` must either both be given, or none of them.") + if current_path is None: + current_path = [] + if context is None: + context = TemplateFiller.Context() + context.fill_from_data(data) + + insertables: Dict[str, Any] = {} + for name, content in data.items(): + # TODO is this the best way to do it???? + if name == "file": + continue + path = current_path + [name] + next_context = context.next_level(name) + # preprocessing + if isinstance(content, list): + if not content: # empty list + continue + # List elements must all be of the same type.
+ assert len(set(type(entry) for entry in content)) == 1 + + if isinstance(content[0], dict): # all elements are dicts + # An array of objects: must go into exploded sheet + for entry in content: + self._handle_data(data=entry, current_path=path, context=next_context) + continue + elif isinstance(content, dict): # we recurse and simply use the result + if not current_path: # Special handling for top level + self._handle_data(content, current_path=path, context=next_context) + continue + insert = self._handle_data(content, current_path=path, context=next_context.copy(), + only_collect_insertables=True) + assert isinstance(insert, dict) + assert not any(key in insertables for key in insert) + insertables.update(insert) + continue + else: # scalars + content = [content] # make list for unified treatment below + + # collecting the data + assert isinstance(content, list) + if len(content) > 1: + content = [ILLEGAL_CHARACTERS_RE.sub("", str(x)) for x in content] + value = ";".join(content) # TODO we need escaping of values + else: + value = content[0] + if isinstance(value, str): + value = ILLEGAL_CHARACTERS_RE.sub("", value) + path_str = p2s(path) + assert path_str not in insertables + insertables[path_str] = value + if only_collect_insertables: + return insertables + if not current_path: # Top level returns, because there are only sheets for the children. + return None + + # actual data insertion + insert_row = None + sheet = None + for path_str, value in insertables.items(): + if self._graceful and path_str not in self._sheet_index: + warn(f"Ignoring path with missing sheet index: {path_str}") + continue + sheet_meta = self._sheet_index[path_str] + if sheet is None: + sheet = sheet_meta.sheet + assert sheet is sheet_meta.sheet, "All entries must be in the same sheet." + col_index = sheet_meta.col_index + if insert_row is None: + insert_row = _next_row_index(sheet) + + sheet.cell(row=insert_row+1, column=col_index+1, value=value) + + # Insert foreign keys + if insert_row is not None and sheet is not None and _is_exploded_sheet(sheet): + try: + foreigns = _get_foreign_key_columns(sheet) + except ValueError: + print(f"Sheet: {sheet}") + raise + for index, path in ((f.index, f.path) for f in foreigns.values()): + value = context[path] + sheet.cell(row=insert_row+1, column=index+1, value=value) + + return None + + +def fill_template(data: Union[dict, str, TextIO], template: str, result: str, + validation_schema: Union[dict, str, TextIO] = None) -> None: + """Insert json data into an xlsx file, according to a template. + +This function fills the JSON data into the template stored at ``template`` and stores the result as +``result``. + +Parameters +---------- +data: Union[dict, str, TextIO] + The data, given as Python dict, path to a file or a file-like object. +template: str + Path to the XLSX template. +result: str + Path for the result XLSX. +validation_schema: dict, optional + If given, validate the data against this schema first. This raises an exception if the validation + fails. If no validation schema is given, try to ignore more errors in the data when filling the + XLSX template.
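+ +Examples +-------- +A minimal sketch with hypothetical file names (the template is assumed to match the structure of +the data): + +>>> fill_template(data="training_data.json", template="training_template.xlsx", +... result="training_filled.xlsx", +... validation_schema="training_schema.json")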
+""" + data = _read_or_dict(data) + assert isinstance(data, dict) + + # Validation + if validation_schema is not None: + validation_schema = _read_or_dict(validation_schema) + try: + validate(data, validation_schema, format_checker=FormatChecker()) + except ValidationError as ve: + print(ve.message) + raise ve + else: + print("No validation schema given, continue at your own risk.") + + # Filling the data + result_wb = load_workbook(template) + template_filler = TemplateFiller(result_wb, graceful=(validation_schema is None)) + template_filler.fill_data(data=data) + + parentpath = pathlib.Path(result).parent + parentpath.mkdir(parents=True, exist_ok=True) + result_wb.save(result) diff --git a/src/caosadvancedtools/table_json_conversion/table_generator.py b/src/caosadvancedtools/table_json_conversion/table_generator.py new file mode 100644 index 0000000000000000000000000000000000000000..21bcc7ae57ae0f4dc64c0770234b0abed256acfa --- /dev/null +++ b/src/caosadvancedtools/table_json_conversion/table_generator.py @@ -0,0 +1,379 @@ +#!/usr/bin/env python3 +# encoding: utf-8 +# +# This file is a part of the LinkAhead Project. +# +# Copyright (C) 2024 Indiscale GmbH <info@indiscale.com> +# Copyright (C) 2024 Henrik tom Wörden <h.tomwoerden@indiscale.com> +# Copyright (C) 2024 Daniel Hornung <d.hornung@indiscale.com> +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <https://www.gnu.org/licenses/>. + +""" +This module allows to generate template tables from JSON schemas. +""" + +import pathlib +import re +from abc import ABC, abstractmethod +from enum import Enum +from typing import Dict, List, Optional, Tuple + +from openpyxl import Workbook +from openpyxl.styles import PatternFill +from openpyxl.workbook.child import INVALID_TITLE_REGEX + +from .utils import p2s + + +class ColumnType(Enum): + """ column types enum """ + SCALAR = 1 + LIST = 2 + FOREIGN = 3 + IGNORE = 4 + + +class RowType(Enum): + """ row types enum """ + COL_TYPE = 1 + PATH = 2 + IGNORE = 3 + + +class TableTemplateGenerator(ABC): + """ base class for generating tables from json schema """ + + def __init__(self): + pass + + @abstractmethod + def generate(self, schema: dict, foreign_keys: dict, filepath: str): + """Generate a sheet definition from a given JSON schema. + + Parameters: + ----------- + schema: dict + Given JSON schema. + + foreign_keys: dict + A tree-like configuration (nested dict) that defines which attributes shall be used to + create additional columns when a list of references exists. The nested dict is + structured like the data model, its innermost elements are leaves of the path trees + within the JSON, they define the required keys. + + | Suppose we want to distinguish Persons that are referenced by Trainings, then + ``foreign_keys`` must at least contain the following: + | ``{"Training": {"Person": ["name", "email"]}}``. 
+ + Values within the dicts can be either a list representing the keys (as in the example + above) or a dict that allows setting additional foreign keys at higher depths. In the + latter case (dict instead of list), if foreign keys exist at that level (e.g. in the + above example there might be further levels below "Person"), then the foreign keys can + be set using the special ``__this__`` key. + + Example: ``{"Training": {"__this__": ["date"], "Person": ["name", "email"]}}`` + Here, ``date`` is the sole foreign key for Training. + """ + + def _generate_sheets_from_schema(self, schema: dict, foreign_keys: Optional[dict] = None + ) -> Dict[str, Dict[str, + Tuple[ColumnType, Optional[str], list]]]: + """Generate a sheet definition from a given JSON schema. + + Parameters + ---------- + schema: dict + given JSON schema + foreign_keys: dict, optional + a configuration that defines which attributes shall be used to create + additional columns when a list of references exists. See ``foreign_keys`` + argument of TableTemplateGenerator.generate. + + Returns + ------- + sheets: dict + A two-level dict which describes columns of template sheets. + + | The structure of this two-level dict is as follows: + | ``sheets[sheetname][colname]= (<col_type>, <description>, [<path>, ...])`` + + I.e. the outer dict contains sheet names as keys, the inner dict has column names as + keys and tuples as values. These tuples consist of: + - the column type + - the description of the corresponding property + - a list representing the path. + + """ + if not ("type" in schema or "anyOf" in schema): + raise ValueError("Inappropriate JSON schema: The following object must contain the " + f"'type' or 'anyOf' key:\n{schema}\n") + if "properties" not in schema: + raise ValueError("Inappropriate JSON schema: The following object must contain " + f"the 'properties' key:\n{schema}\n") + if "type" in schema: + assert schema["type"] == "object" + if foreign_keys is None: + foreign_keys = {} + # here, we treat the top level + # sheets[sheetname][colname]= (COL_TYPE, description, [path]) + sheets: Dict[str, Dict[str, Tuple[ColumnType, Optional[str], list]]] = {} + for rt_name, rt_def in schema["properties"].items(): + sheets[rt_name] = self._treat_schema_element(schema=rt_def, sheets=sheets, + path=[rt_name], foreign_keys=foreign_keys) + return sheets + + def _get_foreign_keys(self, keys: dict, path: list) -> list: + """Return the foreign keys that are needed at the location to which path points.""" + msg = f"A foreign key definition is missing for path:\n{path}\nKeys are:\n{keys}" + while path: + if keys is None or path[0] not in keys: + raise ValueError(msg) + keys = keys[path[0]] + path = path[1:] + if isinstance(keys, dict) and "__this__" in keys: + return keys["__this__"] + if isinstance(keys, list): + return keys + raise ValueError(msg) + + def _treat_schema_element(self, schema: dict, sheets: dict, path: List[str], + foreign_keys: Optional[dict] = None, level_in_sheet_name: int = 1, + array_paths: Optional[list] = None + ) -> Dict[str, Tuple[ColumnType, Optional[str], list]]: + """Recursively transform elements from the schema into column definitions. + + ``sheets`` is modified in place. + + Parameters + ---------- + schema: dict + Part of the JSON schema; it must be the level that contains the type definition + (e.g. 'type' or 'oneOf' key) + sheets: dict + All the sheets, indexed by their name. This is typically modified in place by this + method. + path: list[str] + The relevant (sub-)path for this schema part.
+ array_paths: list + A list of paths along the way to the current object where the JSON contains arrays. + + Returns + ------- + columns: dict + Dict describing the columns; see the docstring of ``_generate_sheets_from_schema``. + """ + if not ("type" in schema or "enum" in schema or "oneOf" in schema or "anyOf" in schema): + raise ValueError("Inappropriate JSON schema: The following schema part must contain " + f"'type', 'enum', 'oneOf' or 'anyOf':\n{schema}\n") + + if array_paths is None: + # if this is not set, we are at top level and the top level element may always be an + # array + array_paths = [path] + if foreign_keys is None: + foreign_keys = {} + + ctype = ColumnType.SCALAR + + # if it is an array, value defs are in 'items' + if schema.get('type') == 'array': + if (schema['items'].get('type') == 'object' + and len(path) > 1): # list of references; special treatment + # we add a new sheet with columns generated from the subtree of the schema + sheetname = p2s(path) + if sheetname in sheets: + raise ValueError("The schema would lead to two sheets with the same name, " + f"which is forbidden: {sheetname}") + col_def = self._treat_schema_element( + schema=schema['items'], sheets=sheets, path=path, foreign_keys=foreign_keys, + level_in_sheet_name=len(path), + array_paths=array_paths+[path] # since this level is an array, extend the list + ) + if col_def: + sheets[sheetname] = col_def + # and add the foreign keys that are necessary up to this point + for array_path in array_paths: + foreigns = self._get_foreign_keys(foreign_keys, array_path) + if isinstance(foreigns, str): + raise ValueError("Foreign keys must be a list of strings, but a single " + "string was given:\n" + f"{array_path} -> {foreigns}") + for foreign in foreigns: + internal_key = p2s(array_path + [foreign]) + if internal_key in sheets[sheetname]: + raise ValueError("The schema would lead to two columns with the same " + "name, which is forbidden:\n" + f"{foreign} -> {internal_key}") + ref_sheet = p2s(array_path) + sheets[sheetname][internal_key] = ( + ColumnType.FOREIGN, f"see sheet '{ref_sheet}'", array_path + [foreign]) + # Columns are added to the new sheet, thus we do not return any columns for the + # current sheet. + return {} + + # it is a list of primitive types -> semicolon-separated list + schema = schema['items'] + ctype = ColumnType.LIST + + # This should only be the case for "new or existing reference". + for el in schema.get('oneOf', []): + if 'type' in el: + schema = el + break + + if "properties" in schema: # recurse for each property, then return + cols = {} + for pname in schema["properties"]: + col_defs = self._treat_schema_element( + schema["properties"][pname], sheets, path+[pname], foreign_keys, + level_in_sheet_name, array_paths=array_paths) + for k in col_defs: + if k in cols: + raise ValueError(f"The schema would lead to two columns with the same " + f"name, which is forbidden: {k}") + cols.update(col_defs) + return cols + + # The schema is a leaf.
+ description = schema['description'] if 'description' in schema else None + # definition of a single column + default_return = {p2s(path[level_in_sheet_name:]): (ctype, description, path)} + if 'type' not in schema and 'enum' in schema: + return default_return + if 'type' not in schema and 'anyOf' in schema: + for d in schema['anyOf']: + # currently the only case where this occurs is date formats + assert d['type'] == 'string' + assert d['format'] == 'date' or d['format'] == 'date-time' + return default_return + if schema["type"] in ['string', 'number', 'integer', 'boolean']: + if 'format' in schema and schema['format'] == 'data-url': + return {} # file; ignore for now + return default_return + raise ValueError("Inappropriate JSON schema: The following part should define an" + f" object with properties or a primitive type:\n{schema}\n") + + +class XLSXTemplateGenerator(TableTemplateGenerator): + """Class for generating XLSX tables from JSON schema definitions.""" + + def __init__(self): + pass + + def generate(self, schema: dict, foreign_keys: dict, filepath: str) -> None: + """Generate an XLSX template file from a given JSON schema. + + Parameters + ---------- + schema: dict + Given JSON schema + foreign_keys: dict + A configuration that defines which attributes shall be used to create + additional columns when a list of references exists. See ``foreign_keys`` + argument of :ref:`TableTemplateGenerator.generate`. + filepath: str + The XLSX file will be stored under this path. + """ + sheets = self._generate_sheets_from_schema(schema, foreign_keys) + wb = self._create_workbook_from_sheets_def(sheets) + parentpath = pathlib.Path(filepath).parent + parentpath.mkdir(parents=True, exist_ok=True) + wb.save(filepath) + + @staticmethod + def _get_max_path_length(sheetdef: dict) -> int: + """Return the length of the longest path contained in the sheet definition. + + See TableTemplateGenerator._generate_sheets_from_schema for the structure of the sheets + definition dict. You need to pass the dict of a single sheet to this function. + """ + return max([len(path) for _, _, path in sheetdef.values()]) + + @staticmethod + def _get_ordered_cols(sheetdef: dict) -> list: + """ + Create a list of tuples (colname, column type, description, path) with the foreign-key + columns first. + """ + ordered_cols = [] + # first foreign cols + for colname, (ct, desc, path) in sheetdef.items(): + if ct == ColumnType.FOREIGN: + ordered_cols.append((colname, ct, desc, path)) + # now the others + for colname, (ct, desc, path) in sheetdef.items(): + if ct != ColumnType.FOREIGN: + ordered_cols.append((colname, ct, desc, path)) + + return ordered_cols + + def _create_workbook_from_sheets_def( + self, sheets: Dict[str, Dict[str, Tuple[ColumnType, Optional[str], list]]]): + """Create and return a nice workbook for the given sheets.""" + wb = Workbook() + yellowfill = PatternFill(fill_type="solid", fgColor='00FFFFAA') + # remove initial sheet + assert wb.sheetnames == ["Sheet"] + del wb['Sheet'] + + for sheetname, sheetdef in sheets.items(): + if not sheetdef: + continue + ws = wb.create_sheet(re.sub(INVALID_TITLE_REGEX, '_', sheetname)) + # First row will be the COL_TYPE row. + # First column will be the indicator column with values COL_TYPE, PATH, IGNORE. + # The COL_TYPE row will be followed by as many PATH rows as needed.
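+ # An illustrative sketch of the resulting layout (cf. the example in specs.md; the + # concrete paths and headers depend on the schema): + # + # | COL_TYPE | SCALAR | SCALAR | LIST | + # | PATH | Training | Training | Training | + # | PATH | url | date | subjects | + # | IGNORE | <column header> | <column header> | <column header> | + # | IGNORE | <description> | <description> | <description> | + # | <data rows follow> |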
+ + max_path_length = self._get_max_path_length(sheetdef) + header_index = 2 + max_path_length + description_index = 3 + max_path_length + + # create first column + ws.cell(1, 1, RowType.COL_TYPE.name) + for index in range(max_path_length): + ws.cell(2 + index, 1, RowType.PATH.name) + ws.cell(header_index, 1, RowType.IGNORE.name) + ws.cell(description_index, 1, RowType.IGNORE.name) + + ordered_cols = self._get_ordered_cols(sheetdef) + + # create other columns + for index, (colname, ct, desc, path) in enumerate(ordered_cols): + ws.cell(1, 2 + index, ct.name) + for path_index, el in enumerate(path): + ws.cell(2 + path_index, 2 + index, el) + ws.cell(header_index, 2 + index, colname) + if ct == ColumnType.FOREIGN: + # Visual highlighting + ws.cell(header_index, 2 + index).fill = yellowfill + if desc: + ws.cell(description_index, 2 + index, desc) + + # hide special rows + for index, row in enumerate(ws.rows): + if not (row[0].value is None or row[0].value == RowType.IGNORE.name): + ws.row_dimensions[index+1].hidden = True + + # hide special column + ws.column_dimensions['A'].hidden = True + + # order sheets + # for index, sheetname in enumerate(sorted(wb.sheetnames)): + # wb.move_sheet(sheetname, index-wb.index(wb[sheetname])) + # reverse sheets + for index, sheetname in enumerate(wb.sheetnames[::-1]): + wb.move_sheet(sheetname, index-wb.index(wb[sheetname])) + + return wb diff --git a/src/caosadvancedtools/table_json_conversion/utils.py b/src/caosadvancedtools/table_json_conversion/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..15ae488d7cb8e142afba58424b49e8fc3a15e0d6 --- /dev/null +++ b/src/caosadvancedtools/table_json_conversion/utils.py @@ -0,0 +1,25 @@ +# This file is a part of the LinkAhead Project. +# +# Copyright (C) 2024 IndiScale GmbH <info@indiscale.com> +# Copyright (C) 2024 Daniel Hornung <d.hornung@indiscale.com> +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <https://www.gnu.org/licenses/>. + +from typing import List + + +def p2s(path: List[str]): + """Path to string: dot-separated. + """ + return ".".join(path) diff --git a/src/doc/table-json-conversion/specs.md b/src/doc/table-json-conversion/specs.md new file mode 100644 index 0000000000000000000000000000000000000000..3a5fcef587d9facd1f0960298226c0f96307beb9 --- /dev/null +++ b/src/doc/table-json-conversion/specs.md @@ -0,0 +1,229 @@ +# Conversion between LinkAhead data models, JSON Schema, and XLSX (and back) # + +The top level of the JSON must be a dict; the keys of the dict are RecordType names. + + +Question: is the first RT never an array? + + +Do not use the sheet name, only the content of the hidden rows. + +## Data models in JSON Schema and JSON data ## + +The data model in LinkAhead defines which kinds of Records exist in a LinkAhead instance +and what they look like.
This data model can also be represented by a JSON Schema, which +defines the structure of JSON files that contain Records belonging to the data model. + +For example, the following JSON may describe the Record of a Person: + +```JSON +{ + "Person": { + "family_name": "Steve", + "given_name": "Stevie" + } +} +``` + +A *JSON Schema* prescribes a concrete structure, and the corresponding JSON files can be used to +represent data for specific Record structures. For example, one could create a JSON Schema that +allows storing "Training" Records with information about trainings that took place. This is +particularly valuable for data import and export. One could generate web forms from the JSON +Schema, or use it to export objects stored in LinkAhead as JSON. + +## From JSON to XLSX: data representation ## + +The following describes how JSON files which represent LinkAhead Records are converted into XLSX +files, and how JSON files with Records are created from XLSX files. + +The attribute name (above: "Person") defines the RecordType, and the value of this attribute can +either be an object or a list. If it is an object (as in the example above), a single Record is +represented. A list represents several Records which have the same RecordType as Parent. + +The Record's *Properties* (above: `family_name` and `given_name`) become *columns* in the XLSX +file. Properties in turn have an attribute name and a value. The value can be + +a. primitive (text, number, boolean, ...) +b. a Record +c. a list of primitive types +d. a list of Records + +In cases *a.* and *c.*, a cell is created in the column belonging to the Property. In case *b.*, +columns are created for the Properties of the referenced Record; in fact, the referenced Record is +treated just like the original one, i.e. the cases a.-d. are considered again for each of its +Properties. + +For case *d.*, the two-dimensional structure of an XLSX sheet is not sufficient. Therefore, *new* +XLSX sheets/tables are created for such cases. + +In these sheets, the referenced Records are treated as described above. However, there are +additional columns which make it possible to recognize by which "external" Record these Records +are referenced. + +We now consider these four cases in detail: + +### a. Properties with primitive data types ### + +```JSON +{ + "Training": { + "date": "2023-01-01", + "url": "www.indiscale.com", + "duration": 1.0, + "participants": 1, + "remote": false + } +} +``` +This entry is mapped to an XLSX sheet with the following content: + +| date | url | duration | participants | remote | |------------|-------------------|----------|--------------|--------| | 2023-01-01 | www.indiscale.com | 1.0 | 1 | false | + +### b. A Property referencing a Record ### + +```JSON +{ + "Training": { + "date": "2023-01-01", + "supervisor": { + "family_name": "Stevenson", + "given_name": "Stevie" + } + } +} +``` + +This entry is mapped to an XLSX sheet with the following content: + +| date | `supervisor.family_name` | `supervisor.given_name` | |------------|--------------------------|-------------------------| | 2023-01-01 | Stevenson | Stevie | + +Note that the column names may be renamed.
The mapping of a column to the Properties +of Records is ensured via the content of hidden rows. + +### c. Properties containing lists of primitive data types ### + +```JSON +{ + "Training": { + "url": "www.indiscale.com", + "subjects": ["Math", "Physics"] + } +} +``` + +This entry would be mapped to an XLSX sheet with the following content: + +| url | subjects | |-------------------|--------------| | www.indiscale.com | Math;Physics | + +The list elements are written into the cell, separated by `;`. If an element contains the +separator `;`, it is escaped with `\\`. + +### d. Properties containing lists of references ### + +```JSON +{ + "Training": { + "date": "2023-01-01", + "coach": [ + { + "family_name": "Sky", + "given_name": "Max" + }, + { + "family_name": "Sky", + "given_name": "Min" + } + ] + } +} +``` + +Since the two coaches cannot be represented sensibly in a single cell, this case requires an +additional sheet that contains the coaches' properties. + +In this example, the sheet for the *Trainings* contains only the "date" column: + +| date | |------------| | 2023-01-01 | + +Additionally, there is a *second* sheet in which the coaches are stored. Here it is crucial to +define how the correct element is selected from potentially multiple "Trainings". In this case +this means that the "date" must be unique. + +TODO: Within which scope does this uniqueness hold? Can we check this? + +The second sheet then looks like this: + +| date | `coach.family_name` | `coach.given_name` | |------------|---------------------|--------------------| | 2023-01-01 | Sky | Max | | 2023-01-01 | Sky | Min | + +## Data in XLSX: Hidden automation logic ## + +### First column: Marker for row types ### + +The first column in each sheet will be hidden and it will contain an entry in each row that needs +special treatment. The following values are used: + +- ``IGNORE``: This row is ignored. It can be used for explanatory texts or layout. +- ``COL_TYPE``: Typically the first row that is not `IGNORE`. It indicates the row that defines the + type of columns (`FOREIGN`, `SCALAR`, `LIST`, `IGNORE`). This row may occur only once. +- ``PATH``: Indicates that the row is used to define the path within the JSON. These rows are + typically hidden for users. + +An example table could look like this: + +| `IGNORE` | | Welcome | to this | file! | | | `IGNORE` | | Please | enter your | data here: | | | `COL_TYPE` | `IGNORE` | `SCALAR` | `SCALAR` | `LIST` | `SCALAR` | | `PATH` | | `Training` | `Training` | `Training` | `Training` | | `PATH` | | `url` | `date` | `subjects` | `supervisor` | | `PATH` | | | | | `email` | | `IGNORE` | Please enter one training per line. | Training URL | Training date | Subjects | Supervisor's email | |------------|-------------------------------------|----------------|---------------|--------------|--------------------| | | | example.com/mp | 2024-02-27 | Math;Physics | steve@example.com | | | | example.com/m | 2024-02-27 | Math | stella@example.com | + +### Parsing XLSX data ### + +To extract the value of a given cell, we traverse all path elements (in ``PATH`` rows) from top to +bottom. The final element of the path is the name of the Property to which the value belongs. In +the example above, `steve@example.com` is the value of the `email` Property in the path +`["Training", "supervisor", "email"]`.
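+ +For example, the first data row of the table above corresponds to the following JSON (a sketch +that reconstructs only the columns shown there): + +```JSON +{ + "Training": { + "url": "example.com/mp", + "date": "2024-02-27", + "subjects": ["Math", "Physics"], + "supervisor": { + "email": "steve@example.com" + } + } +} +```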
+ +The path elements are sufficient to identify the object within a JSON, at least if the corresponding +JSON element is a single object. If the JSON element is an array, the appropriate object within the +array needs to be selected. + +For this selection additional ``FOREIGN`` columns are used. The paths in these columns must all have +the same *base* and one additional *unique key* component. For example, two `FOREIGN` columns could +be `["Training", "date"]` and `["Training", "url"]`, where `["Training"]` is the *base path* and +`"date"` and `"url"` are the *unique keys*. + +The base path defines the table (or recordtype) to which the entries belong, and the values of the +unique keys define the actual rows to which data belongs. + +For example, this table defines three coaches for the two trainings from the last table: + +| `COL_TYPE` | `FOREIGN` | `FOREIGN` | `SCALAR` | +| `PATH` | `Training` | `Training` | `Training` | +| `PATH` | `date` | `url` | `coach` | +| `PATH` | | | `given_name` | +| `IGNORE` | Date of training | URL of training | The coach's given name | +| `IGNORE` | from sheet 'Training' | from sheet 'Training' | | +|------------|-----------------------|-----------------------|------------------------| +| | 2024-02-27 | example.com/mp | Ada | +| | 2024-02-27 | example.com/mp | Berta | +| | 2024-02-27 | example.com/m | Chris | diff --git a/unittests/table_json_conversion/__init__.py b/unittests/table_json_conversion/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/unittests/table_json_conversion/create_jsonschema.py b/unittests/table_json_conversion/create_jsonschema.py new file mode 100755 index 0000000000000000000000000000000000000000..9585f5458edf8f9d3f785099295a3e675230932c --- /dev/null +++ b/unittests/table_json_conversion/create_jsonschema.py @@ -0,0 +1,75 @@ +#!/usr/bin/env python3 + +# Copyright (C) 2023 IndiScale GmbH <www.indiscale.com> +# Copyright (C) 2023 Daniel Hornung <d.hornung@indiscale.com> +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <https://www.gnu.org/licenses/>. + +"""Create JSON-Schema according to configuration. 
+ +""" + +import argparse +import json +from typing import List + +import caosadvancedtools.json_schema_exporter as jsex +from caosadvancedtools.models import parser +# import tomli + + +def prepare_datamodel(modelfile, recordtypes: List[str], outfile: str, + do_not_create: List[str] = None): + if do_not_create is None: + do_not_create = [] + model = parser.parse_model_from_yaml(modelfile) + + exporter = jsex.JsonSchemaExporter(additional_properties=False, + # additional_options_for_text_props=additional_text_options, + # name_and_description_in_properties=True, + name_property_for_new_records=True, + do_not_create=do_not_create, + # do_not_retrieve=do_not_retrieve, + no_remote=True, + use_rt_pool=model, + ) + schemas = [] + for recordtype in recordtypes: + schemas.append(exporter.recordtype_to_json_schema(model.get_deep(recordtype))) + merged_schema = jsex.merge_schemas(schemas) + + with open(outfile, mode="w", encoding="utf8") as json_file: + json.dump(merged_schema, json_file, ensure_ascii=False, indent=2) + + +def _parse_arguments(): + """Parse the arguments.""" + arg_parser = argparse.ArgumentParser(description='') + + return arg_parser.parse_args() + + +def main(): + """The main function of this script.""" + _ = _parse_arguments() + prepare_datamodel("data/simple_model.yml", ["Training", "Person"], "data/simple_schema.json", + do_not_create=["Organisation"]) + prepare_datamodel("data/multiple_refs_model.yml", ["Training", "Person"], + "data/multiple_refs_schema.json") + prepare_datamodel("data/indirect_model.yml", ["Wrapper"], + "data/indirect_schema.json") + + +if __name__ == "__main__": + main() diff --git a/unittests/table_json_conversion/data/error_simple_data.json b/unittests/table_json_conversion/data/error_simple_data.json new file mode 100644 index 0000000000000000000000000000000000000000..bfea88b675ab2a6e0c1787fc401afec5c564c006 --- /dev/null +++ b/unittests/table_json_conversion/data/error_simple_data.json @@ -0,0 +1,11 @@ +{ + "Training": { + "duration": 1.0, + "participants": 0.5 + }, + "Person": { + "family_name": "Auric", + "given_name": "Goldfinger", + "Organisation": "Federal Reserve" + } +} diff --git a/unittests/table_json_conversion/data/indirect_data.json b/unittests/table_json_conversion/data/indirect_data.json new file mode 100644 index 0000000000000000000000000000000000000000..c77dd1ff2a703af6b6b2a0db19f450ac10616d9b --- /dev/null +++ b/unittests/table_json_conversion/data/indirect_data.json @@ -0,0 +1,18 @@ +{ + "Wrapper": { + "Results": [ + { + "year": 2022, + "avg_score": 2.4 + }, + { + "year": 2023, + "avg_score": 4.2 + } + ], + "Training": { + "name": "Basic Training", + "url": "www.example.com/training/basic" + } + } +} diff --git a/unittests/table_json_conversion/data/indirect_data.xlsx b/unittests/table_json_conversion/data/indirect_data.xlsx new file mode 100644 index 0000000000000000000000000000000000000000..894ec95f87aa32a618b3b70504727398f2ce2358 Binary files /dev/null and b/unittests/table_json_conversion/data/indirect_data.xlsx differ diff --git a/unittests/table_json_conversion/data/indirect_model.yml b/unittests/table_json_conversion/data/indirect_model.yml new file mode 100644 index 0000000000000000000000000000000000000000..2a7f4f98ff9a46478eb631e6990deceadc9a498c --- /dev/null +++ b/unittests/table_json_conversion/data/indirect_model.yml @@ -0,0 +1,18 @@ +Training: + recommended_properties: + url: + datatype: TEXT + description: 'The URL' +Results: + description: "Results for a training" + recommended_properties: + year: + datatype: INTEGER + 
avg_score: + description: The average score for the linked training. + datatype: DOUBLE +Wrapper: + recommended_properties: + Training: + Results: + datatype: LIST<Results> diff --git a/unittests/table_json_conversion/data/indirect_schema.json b/unittests/table_json_conversion/data/indirect_schema.json new file mode 100644 index 0000000000000000000000000000000000000000..64b6ff279c584456fe1f90454225d144c90e014d --- /dev/null +++ b/unittests/table_json_conversion/data/indirect_schema.json @@ -0,0 +1,63 @@ +{ + "type": "object", + "properties": { + "Wrapper": { + "type": "object", + "required": [], + "additionalProperties": false, + "title": "Wrapper", + "properties": { + "name": { + "type": "string", + "description": "The name of the Record to be created" + }, + "Training": { + "type": "object", + "required": [], + "additionalProperties": false, + "title": "Training", + "properties": { + "name": { + "type": "string", + "description": "The name of the Record to be created" + }, + "url": { + "type": "string", + "description": "The URL" + } + } + }, + "Results": { + "description": "Results for a training", + "type": "array", + "items": { + "type": "object", + "required": [], + "additionalProperties": false, + "description": "Results for a training", + "title": "Results", + "properties": { + "name": { + "type": "string", + "description": "The name of the Record to be created" + }, + "year": { + "type": "integer" + }, + "avg_score": { + "description": "The average score for the linked training.", + "type": "number" + } + } + } + } + }, + "$schema": "https://json-schema.org/draft/2020-12/schema" + } + }, + "required": [ + "Wrapper" + ], + "additionalProperties": false, + "$schema": "https://json-schema.org/draft/2020-12/schema" +} diff --git a/unittests/table_json_conversion/data/indirect_template.xlsx b/unittests/table_json_conversion/data/indirect_template.xlsx new file mode 100644 index 0000000000000000000000000000000000000000..cc614acb75b36e10143a29f28dff9fce7d5e006f Binary files /dev/null and b/unittests/table_json_conversion/data/indirect_template.xlsx differ diff --git a/unittests/table_json_conversion/data/multiple_refs_data.json b/unittests/table_json_conversion/data/multiple_refs_data.json new file mode 100644 index 0000000000000000000000000000000000000000..5b8ce9136635832111abb2206d8afe1bc7c58444 --- /dev/null +++ b/unittests/table_json_conversion/data/multiple_refs_data.json @@ -0,0 +1,48 @@ +{ + "Training": { + "trainer": [], + "participant": [ + { + "full_name": "Petra Participant", + "email": "petra@indiscale.com" + }, + { + "full_name": "Peter", + "email": "peter@getlinkahead.com" + } + ], + "Organisation": [ + { + "Person": [ + { + "full_name": "Henry Henderson", + "email": "henry@organization.org" + }, + { + "full_name": "Harry Hamburg", + "email": "harry@organization.org" + } + ], + "name": "World Training Organization", + "Country": "US" + }, + { + "Person": [ + { + "full_name": "Hermione Harvard", + "email": "hermione@organisation.org.uk" + }, + { + "full_name": "Hazel Harper", + "email": "hazel@organisation.org.uk" + } + ], + "name": "European Training Organisation", + "Country": "UK" + } + ], + "date": "2024-03-21T14:12:00.000Z", + "url": "www.indiscale.com", + "name": "Example training with multiple organizations." 
+ } +} diff --git a/unittests/table_json_conversion/data/multiple_refs_data.xlsx b/unittests/table_json_conversion/data/multiple_refs_data.xlsx new file mode 100644 index 0000000000000000000000000000000000000000..21622ede9515b0cfa9f965f8c2ee89f782c4bf0c Binary files /dev/null and b/unittests/table_json_conversion/data/multiple_refs_data.xlsx differ diff --git a/unittests/table_json_conversion/data/multiple_refs_model.yml b/unittests/table_json_conversion/data/multiple_refs_model.yml new file mode 100644 index 0000000000000000000000000000000000000000..576ff1d59215f893cb5dec1fd5a90c5975713c60 --- /dev/null +++ b/unittests/table_json_conversion/data/multiple_refs_model.yml @@ -0,0 +1,36 @@ +Person: + recommended_properties: + full_name: + datatype: TEXT + email: + datatype: TEXT +Training: + recommended_properties: + date: + datatype: DATETIME + description: 'The date of the training.' + url: + datatype: TEXT + description: 'The URL' + trainer: + datatype: LIST<Person> + participant: + datatype: LIST<Person> + supervisor: + datatype: Person + responsible: + datatype: Person + Organisation: + datatype: LIST<Organisation> + supervisor_inherit: + inherit_from_suggested: + - Person + responsible_inherit: + inherit_from_suggested: + - Person +Organisation: + recommended_properties: + Country: + datatype: TEXT + Person: + datatype: LIST<Person> diff --git a/unittests/table_json_conversion/data/multiple_refs_schema.json b/unittests/table_json_conversion/data/multiple_refs_schema.json new file mode 100644 index 0000000000000000000000000000000000000000..2adec8de92b548ef97129074c7a24e9378118a4f --- /dev/null +++ b/unittests/table_json_conversion/data/multiple_refs_schema.json @@ -0,0 +1,210 @@ +{ + "type": "object", + "properties": { + "Training": { + "type": "object", + "required": [], + "additionalProperties": false, + "title": "Training", + "properties": { + "name": { + "type": "string", + "description": "The name of the Record to be created" + }, + "date": { + "description": "The date of the training.", + "anyOf": [ + { + "type": "string", + "format": "date" + }, + { + "type": "string", + "format": "date-time" + } + ] + }, + "url": { + "type": "string", + "description": "The URL" + }, + "trainer": { + "type": "array", + "items": { + "type": "object", + "required": [], + "additionalProperties": false, + "title": "trainer", + "properties": { + "name": { + "type": "string", + "description": "The name of the Record to be created" + }, + "full_name": { + "type": "string" + }, + "email": { + "type": "string" + } + } + } + }, + "participant": { + "type": "array", + "items": { + "type": "object", + "required": [], + "additionalProperties": false, + "title": "participant", + "properties": { + "name": { + "type": "string", + "description": "The name of the Record to be created" + }, + "full_name": { + "type": "string" + }, + "email": { + "type": "string" + } + } + } + }, + "supervisor": { + "type": "object", + "required": [], + "additionalProperties": false, + "title": "supervisor", + "properties": { + "name": { + "type": "string", + "description": "The name of the Record to be created" + }, + "full_name": { + "type": "string" + }, + "email": { + "type": "string" + } + } + }, + "responsible": { + "type": "object", + "required": [], + "additionalProperties": false, + "title": "responsible", + "properties": { + "name": { + "type": "string", + "description": "The name of the Record to be created" + }, + "full_name": { + "type": "string" + }, + "email": { + "type": "string" + } + } + }, + "Organisation": { + 
"type": "array", + "items": { + "type": "object", + "required": [], + "additionalProperties": false, + "title": "Organisation", + "properties": { + "name": { + "type": "string", + "description": "The name of the Record to be created" + }, + "Country": { + "type": "string" + }, + "Person": { + "type": "array", + "items": { + "type": "object", + "required": [], + "additionalProperties": false, + "title": "Person", + "properties": { + "name": { + "type": "string", + "description": "The name of the Record to be created" + }, + "full_name": { + "type": "string" + }, + "email": { + "type": "string" + } + } + } + } + } + } + }, + "supervisor_inherit": { + "type": "object", + "required": [], + "additionalProperties": false, + "title": "supervisor_inherit", + "properties": { + "name": { + "type": "string", + "description": "The name of the Record to be created" + }, + "full_name": { + "type": "string" + }, + "email": { + "type": "string" + } + } + }, + "responsible_inherit": { + "type": "object", + "required": [], + "additionalProperties": false, + "title": "responsible_inherit", + "properties": { + "name": { + "type": "string", + "description": "The name of the Record to be created" + }, + "full_name": { + "type": "string" + }, + "email": { + "type": "string" + } + } + } + }, + "$schema": "https://json-schema.org/draft/2020-12/schema" + }, + "Person": { + "type": "object", + "required": [], + "additionalProperties": false, + "title": "Person", + "properties": { + "name": { + "type": "string", + "description": "The name of the Record to be created" + }, + "full_name": { + "type": "string" + }, + "email": { + "type": "string" + } + }, + "$schema": "https://json-schema.org/draft/2020-12/schema" + } + }, + "required": [], + "additionalProperties": false, + "$schema": "https://json-schema.org/draft/2020-12/schema" +} diff --git a/unittests/table_json_conversion/data/multiple_refs_template.xlsx b/unittests/table_json_conversion/data/multiple_refs_template.xlsx new file mode 100644 index 0000000000000000000000000000000000000000..cff3dad99a3c296e360d660ed5178a0eee48cd40 Binary files /dev/null and b/unittests/table_json_conversion/data/multiple_refs_template.xlsx differ diff --git a/unittests/table_json_conversion/data/simple_data.json b/unittests/table_json_conversion/data/simple_data.json new file mode 100644 index 0000000000000000000000000000000000000000..9997f17e76a46d5e97d842fdee40626047e7a347 --- /dev/null +++ b/unittests/table_json_conversion/data/simple_data.json @@ -0,0 +1,32 @@ +{ + "Training": { + "date": "2023-01-01", + "url": "www.indiscale.com", + "coach": [ + { + "family_name": "Sky", + "given_name": "Max", + "Organisation": "ECB" + }, + { + "family_name": "Sky", + "given_name": "Min", + "Organisation": "ECB" + } + ], + "supervisor": { + "family_name": "Steve", + "given_name": "Stevie", + "Organisation": "IMF" + }, + "duration": 1.0, + "participants": 1, + "subjects": ["Math", "Physics"], + "remote": false + }, + "Person": { + "family_name": "Steve", + "given_name": "Stevie", + "Organisation": "IMF" + } +} diff --git a/unittests/table_json_conversion/data/simple_data.xlsx b/unittests/table_json_conversion/data/simple_data.xlsx new file mode 100644 index 0000000000000000000000000000000000000000..662a636603944a91cbaeaea8cc0c3bbab12f2f50 Binary files /dev/null and b/unittests/table_json_conversion/data/simple_data.xlsx differ diff --git a/unittests/table_json_conversion/data/simple_data_ascii_chars.json b/unittests/table_json_conversion/data/simple_data_ascii_chars.json new file mode 100644 
index 0000000000000000000000000000000000000000..b1d13ebee5d6e3949fa606a130e6f5819bfc4bc8 --- /dev/null +++ b/unittests/table_json_conversion/data/simple_data_ascii_chars.json @@ -0,0 +1,18 @@ +{ + "Training": { + "date": "2023-01-01", + "url": "char: >\u0001\u0002\u0003\u0004\u0005\u0006\u0007\u0008\u0009<", + "subjects": [ + ">\u000a\u000b\u000c\u000e\u000f<", + ">\u0010\u0011\u0012\u0013\u0014\u0015\u0016\u0017<", + ">\u0018\u0019\u001a\u001b\u001c\u001d\u001e\u001f<", + ">\u0020\u0021\u0022\u0023\u0024\u0025\u0026\u0027<", + ">\u0028\u0029\u002a\u002b\u002c\u002d\u002e\u002f<" + ] + }, + "Person": { + "family_name": "Steve", + "given_name": "Stevie", + "Organisation": "IMF" + } +} diff --git a/unittests/table_json_conversion/data/simple_data_ascii_chars.xlsx b/unittests/table_json_conversion/data/simple_data_ascii_chars.xlsx new file mode 100644 index 0000000000000000000000000000000000000000..bdf60b568bf51726a0a25e599bb6c7e70988d287 Binary files /dev/null and b/unittests/table_json_conversion/data/simple_data_ascii_chars.xlsx differ diff --git a/unittests/table_json_conversion/data/simple_model.yml b/unittests/table_json_conversion/data/simple_model.yml new file mode 100644 index 0000000000000000000000000000000000000000..74fb5bc5dc4251bb3834ea2f6201f991cab510d1 --- /dev/null +++ b/unittests/table_json_conversion/data/simple_model.yml @@ -0,0 +1,36 @@ +Person: + recommended_properties: + family_name: + datatype: TEXT + given_name: + datatype: TEXT + Organisation: +Training: + recommended_properties: + date: + datatype: DATETIME + description: 'The date of the training.' + url: + datatype: TEXT + description: 'The URL' + subjects: + datatype: LIST<TEXT> + coach: + datatype: LIST<Person> + supervisor: + datatype: Person + duration: + datatype: DOUBLE + participants: + datatype: INTEGER + remote: + datatype: BOOLEAN + slides: + datatype: FILE +ProgrammingCourse: + inherit_from_suggested: + - Training +Organisation: + recommended_properties: + Country: + datatype: TEXT diff --git a/unittests/table_json_conversion/data/simple_schema.json b/unittests/table_json_conversion/data/simple_schema.json new file mode 100644 index 0000000000000000000000000000000000000000..01a732d67758ed52a334ff7076778a45f8850f95 --- /dev/null +++ b/unittests/table_json_conversion/data/simple_schema.json @@ -0,0 +1,139 @@ +{ + "type": "object", + "properties": { + "Training": { + "type": "object", + "required": [], + "additionalProperties": false, + "title": "Training", + "properties": { + "name": { + "type": "string", + "description": "The name of the Record to be created" + }, + "date": { + "description": "The date of the training.", + "anyOf": [ + { + "type": "string", + "format": "date" + }, + { + "type": "string", + "format": "date-time" + } + ] + }, + "url": { + "type": "string", + "description": "The URL" + }, + "subjects": { + "type": "array", + "items": { + "type": "string" + } + }, + "coach": { + "type": "array", + "items": { + "type": "object", + "required": [], + "additionalProperties": false, + "title": "coach", + "properties": { + "name": { + "type": "string", + "description": "The name of the Record to be created" + }, + "family_name": { + "type": "string" + }, + "given_name": { + "type": "string" + }, + "Organisation": { + "enum": [ + "Federal Reserve", + "IMF", + "ECB" + ] + } + } + } + }, + "supervisor": { + "type": "object", + "required": [], + "additionalProperties": false, + "title": "supervisor", + "properties": { + "name": { + "type": "string", + "description": "The name of the Record to be 
created" + }, + "family_name": { + "type": "string" + }, + "given_name": { + "type": "string" + }, + "Organisation": { + "enum": [ + "Federal Reserve", + "IMF", + "ECB" + ] + } + } + }, + "duration": { + "type": "number" + }, + "participants": { + "type": "integer" + }, + "remote": { + "type": "boolean" + }, + "slides": { + "type": "string", + "format": "data-url" + } + }, + "$schema": "https://json-schema.org/draft/2020-12/schema" + }, + "Person": { + "type": "object", + "required": [], + "additionalProperties": false, + "title": "Person", + "properties": { + "name": { + "type": "string", + "description": "The name of the Record to be created" + }, + "family_name": { + "type": "string" + }, + "given_name": { + "type": "string" + }, + "Organisation": { + "enum": [ + "Federal Reserve", + "IMF", + "ECB" + ] + } + }, + "$schema": "https://json-schema.org/draft/2020-12/schema" + } + }, + "required": [ + "Training", + "Person" + ], + "additionalProperties": false, + "$schema": "https://json-schema.org/draft/2020-12/schema" +} diff --git a/unittests/table_json_conversion/data/simple_template.xlsx b/unittests/table_json_conversion/data/simple_template.xlsx new file mode 100644 index 0000000000000000000000000000000000000000..1162965bf44642a4523123fa52c58dd240b25e5f Binary files /dev/null and b/unittests/table_json_conversion/data/simple_template.xlsx differ diff --git a/unittests/table_json_conversion/how_to_schema.md b/unittests/table_json_conversion/how_to_schema.md new file mode 100644 index 0000000000000000000000000000000000000000..a7b4e3ca35a1fc9e67ebbb29f316825e89596f4a --- /dev/null +++ b/unittests/table_json_conversion/how_to_schema.md @@ -0,0 +1,19 @@ +Insert the data model into a LinkAhead server. + +Run the following code: +``` + model = parser.parse_model_from_yaml("./model.yml") + + exporter = jsex.JsonSchemaExporter(additional_properties=False, + #additional_options_for_text_props=additional_text_options, + #name_and_description_in_properties=True, + #do_not_create=do_not_create, + #do_not_retrieve=do_not_retrieve, + ) + schema_top = exporter.recordtype_to_json_schema(model.get_deep("Training")) + schema_pers = exporter.recordtype_to_json_schema(model.get_deep("Person")) + merged_schema = jsex.merge_schemas([schema_top, schema_pers]) + + with open("model_schema.json", mode="w", encoding="utf8") as json_file: + json.dump(merged_schema, json_file, ensure_ascii=False, indent=2) +``` diff --git a/unittests/table_json_conversion/test_fill_xlsx.py b/unittests/table_json_conversion/test_fill_xlsx.py new file mode 100644 index 0000000000000000000000000000000000000000..946336da721f7c9affd5c553ccbb38cb46217eef --- /dev/null +++ b/unittests/table_json_conversion/test_fill_xlsx.py @@ -0,0 +1,154 @@ +#!/usr/bin/env python3 +# encoding: utf-8 +# +# This file is a part of the LinkAhead Project. +# +# Copyright (C) 2024 Indiscale GmbH <info@indiscale.com> +# Copyright (C) 2024 Henrik tom Wörden <h.tomwoerden@indiscale.com> +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. 
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <https://www.gnu.org/licenses/>.
+
+import json
+import os
+import re
+import tempfile
+from typing import Optional
+
+import jsonschema.exceptions as schema_exc
+import pytest
+from caosadvancedtools.table_json_conversion.fill_xlsx import (
+    _get_path_rows, _get_row_type_column_index, fill_template)
+from openpyxl import load_workbook
+
+from .utils import compare_workbooks
+
+
+def rfp(*pathcomponents):
+    """
+    Return full path.
+    Shorthand convenience function.
+    """
+    return os.path.join(os.path.dirname(__file__), *pathcomponents)
+
+
+def fill_and_compare(json_file: str, template_file: str, known_good: str,
+                     schema: Optional[str] = None, custom_output: Optional[str] = None):
+    """Fill the data into a template and compare the result to a known-good file.
+
+Parameters
+----------
+schema: str, optional
+  JSON schema to validate the data against.
+custom_output: str, optional
+  If given, write the result to this file instead of a temporary one.  For development only.
+"""
+    with tempfile.TemporaryDirectory() as tmpdir:
+        outfile = os.path.join(tmpdir, 'test.xlsx')
+        assert not os.path.exists(outfile)
+        if custom_output is not None:
+            outfile = custom_output
+        fill_template(data=json_file, template=template_file, result=outfile,
+                      validation_schema=schema)
+        assert os.path.exists(outfile)
+        generated = load_workbook(outfile)  # workbook can be read
+        known_good_wb = load_workbook(known_good)
+        compare_workbooks(generated, known_good_wb)
+
+
+def test_detect():
+    example = load_workbook(rfp("data/simple_template.xlsx"))
+    assert 0 == _get_row_type_column_index(example['Person'])
+    assert [1, 2] == _get_path_rows(example['Person'])
+
+
+def test_temporary():
+    # TODO: remove the following after manual testing
+    di = '/home/henrik/CaosDB/management/external/dimr/eingabemaske/crawler/schemas'
+    dd = '/home/henrik/CaosDB/management/external/dimr/eingabemaske/django/laforms/persistent/'
+    alreadydone = [
+        "Präventionsmaßnahmen",
+        "Beratungsstellen",
+        "Schutzeinrichtungen",
+        "Einzelfallversorgung",
+        "Strategiedokumente",
+        "Kooperationsvereinbarungen",
+        "Gremien",
+        "Verwaltungsvorschriften",
+        "Gewaltschutzkonzepte und -maßnahmen",
+        "Polizeilicher Opferschutz",
+        "Feedback",
+    ]
+    for prefix, _, files in os.walk(dd):
+        for fi in files:
+            match = re.match(r"(?P<teilb>.*)_2024-.*\.json", fi)
+
+            if match:
+                print(match.group('teilb'))
+                tb = match.group('teilb')
+                if tb in alreadydone:
+                    continue
+                # alreadydone.append(tb)
+                template = os.path.join(di, "template_"+tb+".xlsx")
+                schema = os.path.join(di, "schema_"+tb+".json")
+                if not os.path.exists(template):
+                    print(template)
+                    assert False
+                jfi = os.path.join(prefix, fi)
+                print(jfi)
+                if not fi.startswith("Art"):
+                    continue
+                # if jfi != "/home/henrik/CaosDB/management/external/dimr/eingabemaske/django/laforms/persistent/data/datenhalterin_gg/he_gg_2/Art__13_Bewusstseinsbildung_2024-01-11T10:22:26.json":
+                #     continue
+                with open(jfi, encoding="utf-8") as infile:
+                    data = json.load(infile)
+                    data = data["form_data"]
+                    if "__version__" in data:
+                        del data["__version__"]
+                with tempfile.TemporaryDirectory() as tmpdir:
+                    outfile = os.path.join(tmpdir, 'test.xlsx')
+                    fill_template(data=data, template=template, result=outfile,
+                                  validation_schema=schema)
+                    os.system(f'libreoffice {outfile}')
+
+
+def test_fill_xlsx():
+    fill_and_compare(json_file=rfp("data/simple_data.json"),
+                     template_file=rfp("data/simple_template.xlsx"),
+                     known_good=rfp("data/simple_data.xlsx"),
+                     schema=rfp("data/simple_schema.json"))
+    fill_and_compare(json_file=rfp("data/multiple_refs_data.json"),
+                     template_file=rfp("data/multiple_refs_template.xlsx"),
+                     known_good=rfp("data/multiple_refs_data.xlsx"),
+                     schema=rfp("data/multiple_refs_schema.json"))
+    fill_and_compare(json_file=rfp("data/indirect_data.json"),
+                     template_file=rfp("data/indirect_template.xlsx"),
+                     known_good=rfp("data/indirect_data.xlsx"),
+                     schema=rfp("data/indirect_schema.json"))
+    fill_and_compare(json_file=rfp("data/simple_data_ascii_chars.json"),
+                     template_file=rfp("data/simple_template.xlsx"),
+                     known_good=rfp("data/simple_data_ascii_chars.xlsx"),
+                     schema=rfp("data/simple_schema.json"))
+
+
+def test_errors():
+    with pytest.raises(AssertionError) as exc:
+        fill_and_compare(json_file=rfp("data/error_simple_data.json"),
+                         template_file=rfp("data/simple_template.xlsx"),
+                         known_good=rfp("data/simple_data.xlsx"))
+    assert "Auric\nSteve" in str(exc.value)
+    with pytest.raises(schema_exc.ValidationError) as exc:
+        fill_and_compare(json_file=rfp("data/error_simple_data.json"),
+                         template_file=rfp("data/simple_template.xlsx"),
+                         known_good=rfp("data/simple_data.xlsx"),
+                         schema=rfp("data/simple_schema.json"))
+    assert exc.value.message == "0.5 is not of type 'integer'"
diff --git a/unittests/table_json_conversion/test_table_template_generator.py b/unittests/table_json_conversion/test_table_template_generator.py
new file mode 100644
index 0000000000000000000000000000000000000000..8fc7b216d0eb2aa54ece6ace986cbeb227cc3e45
--- /dev/null
+++ b/unittests/table_json_conversion/test_table_template_generator.py
@@ -0,0 +1,280 @@
+#!/usr/bin/env python3
+# encoding: utf-8
+#
+# This file is a part of the LinkAhead Project.
+#
+# Copyright (C) 2024 IndiScale GmbH <info@indiscale.com>
+# Copyright (C) 2024 Henrik tom Wörden <h.tomwoerden@indiscale.com>
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <https://www.gnu.org/licenses/>.
+
+import json
+import os
+import tempfile
+from typing import Optional, Tuple
+
+import pytest
+from caosadvancedtools.table_json_conversion.table_generator import (
+    ColumnType, XLSXTemplateGenerator)
+from openpyxl import load_workbook
+
+from .utils import compare_workbooks
+
+
+def rfp(*pathcomponents):
+    """
+    Return full path.
+    Shorthand convenience function.
+    """
+    return os.path.join(os.path.dirname(__file__), *pathcomponents)
+
+
+def _compare_generated_to_known_good(schema_file: str, known_good: str,
+                                     foreign_keys: Optional[dict] = None,
+                                     outfile: Optional[str] = None) -> Tuple:
+    """Generate an XLSX template from the schema, then compare it to a known-good file.
+
+Returns
+-------
+out: tuple
+    The generated and known good workbook objects.
+ """ + generator = XLSXTemplateGenerator() + if foreign_keys is None: + foreign_keys = {} + with open(schema_file, encoding="utf-8") as schema_input: + schema = json.load(schema_input) + with tempfile.TemporaryDirectory() as tmpdir: + if outfile is None: + outpath = os.path.join(tmpdir, 'generated.xlsx') + else: + outpath = outfile + assert not os.path.exists(outpath) + generator.generate(schema=schema, + foreign_keys=foreign_keys, + filepath=outpath) + assert os.path.exists(outpath) + generated = load_workbook(outpath) + good = load_workbook(known_good) + compare_workbooks(generated, good) + return generated, good + + +def test_generate_sheets_from_schema(): + # trivial case; we do not support this + schema = {} + generator = XLSXTemplateGenerator() + with pytest.raises(ValueError, match="Inappropriate JSON schema:.*"): + generator._generate_sheets_from_schema(schema) + + # top level must be RT with Properties + schema = { + "type": "string" + } + with pytest.raises(ValueError, match="Inappropriate JSON schema:.*"): + generator._generate_sheets_from_schema(schema) + + # bad type + schema = { + "type": "object", + "properties": { + "Training": { + "type": "object", + "properties": { + "name": { + "type": "str", + "description": "The name of the Record to be created" + }, + } + } + } + } + with pytest.raises(ValueError, + match="Inappropriate JSON schema: The following part " + "should define an object.*"): + generator._generate_sheets_from_schema(schema, {'Training': ['a']}) + + # bad schema + schema = { + "type": "object", + "properties": { + "Training": { + "type": "object" + } + } + } + with pytest.raises(ValueError, + match="Inappropriate JSON schema: The following part " + "should define an object.*"): + generator._generate_sheets_from_schema(schema, {'Training': ['a']}) + + # minimal case: one RT with one P + schema = { + "type": "object", + "properties": { + "Training": { + "type": "object", + "properties": { + "name": { + "type": "string", + "description": "The name of the Record to be created" + }, + } + } + } + } + sdef = generator._generate_sheets_from_schema(schema, {'Training': ['a']}) + assert "Training" in sdef + tdef = sdef['Training'] + assert 'name' in tdef + assert tdef['name'] == (ColumnType.SCALAR, "The name of the Record to be created", ["Training", 'name']) + + # example case + with open(rfp("data/simple_schema.json")) as sfi: + schema = json.load(sfi) + with pytest.raises(ValueError, match="A foreign key definition is missing.*"): + generator._generate_sheets_from_schema(schema) + sdef = generator._generate_sheets_from_schema( + schema, + foreign_keys={'Training': {"__this__": ['date', 'url']}}) + assert "Training" in sdef + tdef = sdef['Training'] + assert tdef['date'] == (ColumnType.SCALAR, 'The date of the training.', ["Training", 'date']) + assert tdef['url'] == (ColumnType.SCALAR, 'The URL', ["Training", 'url']) + assert tdef['supervisor.family_name'] == (ColumnType.SCALAR, None, ["Training", 'supervisor', + 'family_name']) + assert tdef['supervisor.given_name'] == (ColumnType.SCALAR, None, ["Training", 'supervisor', + 'given_name']) + assert tdef['supervisor.Organisation'] == (ColumnType.SCALAR, None, ["Training", 'supervisor', + 'Organisation']) + assert tdef['duration'] == (ColumnType.SCALAR, None, ["Training", 'duration']) + assert tdef['participants'] == (ColumnType.SCALAR, None, ["Training", 'participants']) + assert tdef['subjects'] == (ColumnType.LIST, None, ["Training", 'subjects']) + assert tdef['remote'] == (ColumnType.SCALAR, None, ["Training", 
'remote']) + cdef = sdef['Training.coach'] + assert cdef['family_name'] == (ColumnType.SCALAR, None, ["Training", 'coach', 'family_name']) + assert cdef['given_name'] == (ColumnType.SCALAR, None, ["Training", 'coach', 'given_name']) + assert cdef['Organisation'] == (ColumnType.SCALAR, None, ["Training", 'coach', + 'Organisation']) + assert cdef['Training.date'] == (ColumnType.FOREIGN, "see sheet 'Training'", ["Training", 'date']) + assert cdef['Training.url'] == (ColumnType.FOREIGN, "see sheet 'Training'", ["Training", 'url']) + + +def test_get_foreign_keys(): + generator = XLSXTemplateGenerator() + fkd = {"Training": ['a']} + assert ['a'] == generator._get_foreign_keys(fkd, ['Training']) + + fkd = {"Training": {"__this__": ['a']}} + assert ['a'] == generator._get_foreign_keys(fkd, ['Training']) + + fkd = {"Training": {'hallo'}} + with pytest.raises(ValueError, match=r"A foreign key definition is missing for path:\n\[" + r"'Training'\]\nKeys are:\n{'Training': \{'hallo'\}\}"): + generator._get_foreign_keys(fkd, ['Training']) + + fkd = {"Training": {"__this__": ['a'], 'b': ['c']}} + assert ['c'] == generator._get_foreign_keys(fkd, ['Training', 'b']) + + with pytest.raises(ValueError, match=r"A foreign key definition is missing for .*"): + generator._get_foreign_keys({}, ['Training']) + + +def test_get_max_path_length(): + assert 4 == XLSXTemplateGenerator._get_max_path_length({'a': (1, 'desc', [1, 2, 3]), + 'b': (2, 'desc', [1, 2, 3, 4])}) + + +def test_template_generator(): + generated, _ = _compare_generated_to_known_good( + schema_file=rfp("data/simple_schema.json"), known_good=rfp("data/simple_template.xlsx"), + foreign_keys={'Training': {"__this__": ['date', 'url']}}, + outfile=None) + # test some hidden + ws = generated.active + assert ws.row_dimensions[1].hidden is True + assert ws.column_dimensions['A'].hidden is True + + # TODO: remove the following after manual testing + di = '/home/henrik/CaosDB/management/external/dimr/eingabemaske/crawler/schemas' + if not os.path.exists(di): + return + for fi in os.listdir(di): + rp = os.path.join(di, fi) + if not fi.startswith("schema_"): + continue + with open(rp) as sfi: + schema = json.load(sfi) + fk_path = os.path.join(di, "foreign_keys"+fi[len('schema'):]) + path = os.path.join(di, "template"+fi[len('schema'):-4]+"xlsx") + alreadydone = [ + "Präventionsmaßnahmen", + "Beratungsstellen", + "Schutzeinrichtungen", + "Einzelfallversorgung", + "Strategiedokumente", + "Kooperationsvereinbarungen", + "Gremien", + "Verwaltungsvorschriften", + "Gewaltschutzkonzepte und -maßnahmen", + "Polizeilicher Opferschutz", + "Feedback", + ] + if any([path.startswith("template_"+k) for k in alreadydone]): + continue + + if not os.path.exists(fk_path): + print(f"No foreign keys file for:\n{fk_path}") + assert False + with open(fk_path) as sfi: + fk = json.load(sfi) + generator = XLSXTemplateGenerator() + if not os.path.exists(path): + generator.generate(schema=schema, foreign_keys=fk, filepath=path) + os.system(f'libreoffice {path}') + else: + print(f"Not creating template because it exists:\n{path}") + + # TODO test collisions of sheet or colnames + # TODO test escaping of values + + # TODO finish enum example + + +def test_model_with_multiple_refs(): + _compare_generated_to_known_good( + schema_file=rfp("data/multiple_refs_schema.json"), + known_good=rfp("data/multiple_refs_template.xlsx"), + foreign_keys={"Training": {"__this__": ["date", "url"], + "Organisation": ["name"]}}, + outfile=None) + + +def test_model_with_indirect_reference(): + 
_compare_generated_to_known_good( + schema_file=rfp("data/indirect_schema.json"), + known_good=rfp("data/indirect_template.xlsx"), + foreign_keys={"Wrapper": ["Training.name", "Training.url"]}, + outfile=None) + + +def test_exceptions(): + # Foreign keys must be lists + with pytest.raises(ValueError, match="Foreign keys must be a list of strings, but a single " + r"string was given:\n\['Wrapper'\] -> name"): + _compare_generated_to_known_good( + schema_file=rfp("data/indirect_schema.json"), + known_good=rfp("data/multiple_refs_template.xlsx"), + foreign_keys={"Wrapper": {"__this__": "name"}}, + outfile=None) diff --git a/unittests/table_json_conversion/utils.py b/unittests/table_json_conversion/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..6c32117c1296e686290ad75bf5f704a1abfb2547 --- /dev/null +++ b/unittests/table_json_conversion/utils.py @@ -0,0 +1,54 @@ +# This file is a part of the LinkAhead Project. +# +# Copyright (C) 2024 IndiScale GmbH <info@indiscale.com> +# Copyright (C) 2024 Daniel Hornung <d.hornung@indiscale.com> +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <https://www.gnu.org/licenses/>. + +"""Utilities for the tests. +""" + +from openpyxl import Workbook + + +def compare_workbooks(wb1: Workbook, wb2: Workbook, hidden: bool = True): + """Compare two workbooks for equal content. + +Raises an error if differences are found. + +Parameters +---------- + +hidden: bool, optional + Test if the "hidden" status of rows and columns is the same. + """ + assert wb1.sheetnames == wb2.sheetnames, ( + f"Sheet names are different: \n{wb1.sheetnames}\n !=\n{wb2.sheetnames}" + ) + for sheetname in wb2.sheetnames: + sheet_1 = wb1[sheetname] + sheet_2 = wb2[sheetname] + for irow, (row1, row2) in enumerate(zip(sheet_1.iter_rows(), sheet_2.iter_rows())): + if hidden: + assert (sheet_1.row_dimensions[irow].hidden + == sheet_2.row_dimensions[irow].hidden), f"hidden row: {sheetname}, {irow}" + for icol, (cell1, cell2) in enumerate(zip(row1, row2)): + if hidden: + assert (sheet_1.column_dimensions[cell1.column_letter].hidden + == sheet_2.column_dimensions[cell2.column_letter].hidden), ( + f"hidden col: {sheetname}, {icol}") + assert cell1.value == cell2.value, ( + f"Sheet: {sheetname}, cell: {cell1.coordinate}, Values: \n" + f"{cell1.value}\n{cell2.value}" + ) diff --git a/unittests/test_json_schema_exporter.py b/unittests/test_json_schema_exporter.py index f0503385a25eb89e66dd3518d71a32b91d07bf88..b8c1ddd18973fc85a0cd227aac1fafb6708e8d0b 100644 --- a/unittests/test_json_schema_exporter.py +++ b/unittests/test_json_schema_exporter.py @@ -58,6 +58,14 @@ RT21 = GLOBAL_MODEL.get_deep("RT21") RT31 = GLOBAL_MODEL.get_deep("RT31") +def _make_unique(entity, unique: bool): + """Mock the `unique` behavior of execute_query(). 
+ """ + if unique: + return entity + return db.Container().append(entity) + + def _mock_execute_query(query_string, unique=False, **kwargs): """Mock the response to queries for references.""" all_records = db.Container() @@ -83,20 +91,20 @@ def _mock_execute_query(query_string, unique=False, **kwargs): if query_string == "SELECT name, id FROM RECORD 'OtherType'": return other_type_records - elif query_string == "FIND RECORDTYPE WITH name='OtherType'" and unique is True: - return other_type_rt + elif query_string == "FIND RECORDTYPE WITH name='OtherType'": + return _make_unique(other_type_rt, unique) elif query_string == "SELECT name, id FROM RECORD 'ReferencingType'": return referencing_type_records - elif query_string == "FIND RECORDTYPE WITH name='ReferencingType'" and unique is True: - return referencing_type_rt + elif query_string == "FIND RECORDTYPE WITH name='ReferencingType'": + return _make_unique(referencing_type_rt, unique) elif query_string == "SELECT name, id FROM RECORD 'RT1'": return referencing_type_records # wrong types, but who cares for the test? - elif query_string == "FIND RECORDTYPE WITH name='RT1'" and unique is True: - return RT1 - elif query_string == "FIND RECORDTYPE WITH name='RT21'" and unique is True: - return RT21 - elif query_string == "FIND RECORDTYPE WITH name='RT31'" and unique is True: - return RT31 + elif query_string == "FIND RECORDTYPE WITH name='RT1'": + return _make_unique(RT1, unique) + elif query_string == "FIND RECORDTYPE WITH name='RT21'": + return _make_unique(RT21, unique) + elif query_string == "FIND RECORDTYPE WITH name='RT31'": + return _make_unique(RT31, unique) elif query_string == "SELECT name, id FROM RECORD": return all_records elif query_string == "SELECT name, id FROM FILE": @@ -333,6 +341,7 @@ def test_rt_with_list_props(): @patch("linkahead.execute_query", new=Mock(side_effect=_mock_execute_query)) +@patch("linkahead.cached.execute_query", new=Mock(side_effect=_mock_execute_query)) def test_rt_with_references(): rt = db.RecordType() @@ -598,6 +607,38 @@ def test_rt_with_references(): assert items["properties"]["file"]["type"] == "string" assert items["properties"]["file"]["format"] == "data-url" + # Test reference property + model_string = """ +RT1: + description: Some recordtype +RT2: + obligatory_properties: + prop1: + description: Some reference property + datatype: RT1 + """ + model = parse_model_from_string(model_string) + schema = rtjs(model.get_deep("RT2"), no_remote=True) + assert json.dumps(schema, indent=2) == """{ + "type": "object", + "required": [ + "prop1" + ], + "additionalProperties": true, + "title": "RT2", + "properties": { + "prop1": { + "type": "object", + "required": [], + "additionalProperties": true, + "description": "Some reference property", + "title": "prop1", + "properties": {} + } + }, + "$schema": "https://json-schema.org/draft/2020-12/schema" +}""" + def test_broken(): @@ -621,6 +662,7 @@ def test_broken(): @patch("linkahead.execute_query", new=Mock(side_effect=_mock_execute_query)) +@patch("linkahead.cached.execute_query", new=Mock(side_effect=_mock_execute_query)) def test_reference_options(): """Testing miscellaneous options. 
""" @@ -851,6 +893,7 @@ RT5: @patch("linkahead.execute_query", new=Mock(side_effect=_mock_execute_query)) +@patch("linkahead.cached.execute_query", new=Mock(side_effect=_mock_execute_query)) def test_empty_retrieve(): """Special case: ``do_not_retrieve`` is set, or the retrieve result is empty.""" model_str = """ @@ -893,6 +936,7 @@ RT3: @patch("linkahead.execute_query", new=Mock(side_effect=_mock_execute_query)) +@patch("linkahead.cached.execute_query", new=Mock(side_effect=_mock_execute_query)) def test_multiple_choice(): """Multiple choice is mostyly a matter of UI.""" model_str = """ @@ -933,6 +977,7 @@ RT4: @patch("linkahead.execute_query", new=Mock(side_effect=_mock_execute_query)) +@patch("linkahead.cached.execute_query", new=Mock(side_effect=_mock_execute_query)) def test_uischema(): model_str = """ RT1: @@ -983,6 +1028,7 @@ RT3: @patch("linkahead.execute_query", new=Mock(side_effect=_mock_execute_query)) +@patch("linkahead.cached.execute_query", new=Mock(side_effect=_mock_execute_query)) def test_schema_customization_with_dicts(): """Testing the ``additional_json_schema`` and ``additional_ui_schema`` parameters.""" model_str = """ diff --git a/unittests/test_yaml_model_parser.py b/unittests/test_yaml_model_parser.py index a7f6d6b9b292a6dc064e6fa35682c40bd66c07d2..97c3450f654e7b836734335cafac37adc6e700bb 100644 --- a/unittests/test_yaml_model_parser.py +++ b/unittests/test_yaml_model_parser.py @@ -592,3 +592,50 @@ prop2: model = parse_model_from_string(model_string) prop2 = model["prop2"] assert prop2.role == "Property" + + +def test_fancy_yaml(): + """Testing aliases and other fancy YAML features.""" + # Simple aliasing + model_string = """ +foo: + datatype: INTEGER +RT1: + obligatory_properties: &RT1_oblig + foo: +RT2: + obligatory_properties: *RT1_oblig + """ + model = parse_model_from_string(model_string) + assert str(model) == """{'foo': <Property name="foo" datatype="INTEGER"/> +, 'RT1': <RecordType name="RT1"> + <Property name="foo" importance="OBLIGATORY" flag="inheritance:FIX"/> +</RecordType> +, 'RT2': <RecordType name="RT2"> + <Property name="foo" importance="OBLIGATORY" flag="inheritance:FIX"/> +</RecordType> +}""" + + # Aliasing with override + model_string = """ +foo: + datatype: INTEGER +RT1: + obligatory_properties: &RT1_oblig + foo: +RT2: + obligatory_properties: + <<: *RT1_oblig + bar: + """ + model = parse_model_from_string(model_string) + assert str(model) == """{'foo': <Property name="foo" datatype="INTEGER"/> +, 'RT1': <RecordType name="RT1"> + <Property name="foo" importance="OBLIGATORY" flag="inheritance:FIX"/> +</RecordType> +, 'RT2': <RecordType name="RT2"> + <Property name="foo" importance="OBLIGATORY" flag="inheritance:FIX"/> + <Property name="bar" importance="OBLIGATORY" flag="inheritance:FIX"/> +</RecordType> +, 'bar': <RecordType name="bar"/> +}"""