diff --git a/CHANGELOG.md b/CHANGELOG.md index b92ebce6de44057c5166d1a0d8a181a74574c8a0..58ea93cdf96d767600f5ad7e48c47a60677eeb37 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Added ### +- XLSX template generator - Json schema exporter: - has new parameter `use_rt_pool` - propagates more properties in the `make_array` function diff --git a/src/caosadvancedtools/loadFiles.py b/src/caosadvancedtools/loadFiles.py index 405b3d135c8af89e32c74015bd04f76f21828e20..26b6c1c0862f7d74da6bbcdd6f1057f881fb5e1a 100755 --- a/src/caosadvancedtools/loadFiles.py +++ b/src/caosadvancedtools/loadFiles.py @@ -129,7 +129,14 @@ def create_re_for_file_list(files, localroot, remoteroot): def loadpath(path, include, exclude, prefix, dryrun, forceAllowSymlinks, caosdbignore=None, - localpath=None): + localpath=None) -> dict: + """ +Returns +------- + +inserted: dict + A dict with the files to be included for each of the ``include`` elements. 
+ """ if caosdbignore: # create list of files and create regular expression for small chunks @@ -146,6 +153,7 @@ def loadpath(path, include, exclude, prefix, dryrun, forceAllowSymlinks, caosdbi else: includes = [include] + inserted = {} # if no caosdbignore file is used, this iterates over a single include for include in includes: if dryrun: @@ -177,8 +185,9 @@ def loadpath(path, include, exclude, prefix, dryrun, forceAllowSymlinks, caosdbi logger.info( f"Made new files accessible: {len(files)}, combined size: {convert_size(totalsize)} ") + inserted[include] = files - return + return inserted def main(argv=None): diff --git a/src/caosadvancedtools/table_json_conversion/__init__.py b/src/caosadvancedtools/table_json_conversion/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/src/caosadvancedtools/table_json_conversion/fill_xlsx.py b/src/caosadvancedtools/table_json_conversion/fill_xlsx.py new file mode 100644 index 0000000000000000000000000000000000000000..79c7bfea1925adec47cf74af94d99deaf4fabc06 --- /dev/null +++ b/src/caosadvancedtools/table_json_conversion/fill_xlsx.py @@ -0,0 +1,89 @@ +#!/usr/bin/env python3 +# encoding: utf-8 +# +# This file is a part of the LinkAhead Project. +# +# Copyright (C) 2024 Indiscale GmbH <info@indiscale.com> +# Copyright (C) 2024 Henrik tom Wörden <h.tomwoerden@indiscale.com> +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. 
+# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <https://www.gnu.org/licenses/>. + +from openpyxl import load_workbook + +from .table_generator import ColumnType, RowType + + +def _fill_leaves(json_doc: dict, workbook): + for key, value in json_doc: + if not isinstance(value, list): + value = [value] + for el in value: + if isinstance(el, dict): + _fill_leaves(el, workbook) + workbook.cell(1, 2, el) + + +def _get_row_type_column(worksheet): + for col in worksheet.columns: + for cell in col: + if cell.value == RowType.COL_TYPE.name: + return cell.column + raise ValueError("The column which defines row types (COL_TYPE, PATH, ...) is missing") + + +def _get_path_rows(worksheet): + rows = [] + rt_col = _get_row_type_column(worksheet) + for cell in list(worksheet.columns)[rt_col-1]: + print(cell.value) + if cell.value == RowType.PATH.name: + rows.append(cell.row) + return rows + + +def _generate_path_col_mapping(workbook): + rt_col = _get_row_type_column(workbook) + + for col in workbook.columns: + pass + + +def fill_template(template_path: str, json_path: str, result_path: str) -> None: + """ + Fill the contents of the JSON document stored at ``json_path`` into the template stored at + ``template_path`` and store the result under ``result_path``. + """ + template = load_workbook(template_path) + # For each top level key in the json we iterate the values (if it is an array). Those are the + # root elements that belong to a particular sheet. + # After treating a root element, the row index for the corresponding sheet needs to be + # increased + # When we finished treating an object that goes into a lower ranked sheet (see below), we + # increase the row index of that sheet. + # + + # We can generate a hierarchy of sheets in the beginning (using the paths). The lower sheets + # are for objects referenced by objects in higher ranked sheets. 
+ # We can detect the sheet corresponding to a root element by looking at the first path element: + # The first path element must be the root element everywhere. + # Suggestion: + # row indices: Dict[str, int] string is the sheet name + # sheet_hierarchy: List[Tuple[str]] elements are sheet names + # + # Question: + # We can create an internal representation where we assign as sheet_names the same names that + # are used in table generator. Or should we create another special row that contains this + # somehow? + + template.save(result_path) diff --git a/src/caosadvancedtools/table_json_conversion/table_generator.py b/src/caosadvancedtools/table_json_conversion/table_generator.py new file mode 100644 index 0000000000000000000000000000000000000000..5b9c6577b77e4a9c4d2cbfed903d534f4c761f08 --- /dev/null +++ b/src/caosadvancedtools/table_json_conversion/table_generator.py @@ -0,0 +1,356 @@ +#!/usr/bin/env python3 +# encoding: utf-8 +# +# This file is a part of the LinkAhead Project. +# +# Copyright (C) 2024 Indiscale GmbH <info@indiscale.com> +# Copyright (C) 2024 Henrik tom Wörden <h.tomwoerden@indiscale.com> +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <https://www.gnu.org/licenses/>. + +""" +This module allows to generate template tables from JSON schemas.
+""" +import re +from abc import ABC, abstractmethod +from enum import Enum +from typing import Dict, List, Optional, Tuple + +from openpyxl import Workbook +from openpyxl.workbook.child import INVALID_TITLE_REGEX + + +class ColumnType(Enum): + """ column types enum """ + SCALAR = 1 + LIST = 2 + FOREIGN = 3 + IGNORE = 3 + + +class RowType(Enum): + """ row types enum """ + COL_TYPE = 1 + PATH = 2 + IGNORE = 3 + + +class TableTemplateGenerator(ABC): + """ base class for generating tables from json schema """ + + def __init__(self): + pass + + @abstractmethod + def generate(self, schema: dict, foreign_keys: dict, filepath: str): + """Generate a sheet definition from a given JSON schema. + + Parameters: + ----------- + schema: dict + Given JSON schema. + + foreign_keys: dict + A tree-like configuration (nested dict) that defines which attributes shall be used to + create additional columns when a list of references exists. The nested dict is + structured like the data model, its innermost elements are leaves of the path trees + within the JSON, they define the required keys. + + | Suppose we want to distinguish Persons that are referenced by Trainings, then + ``foreign_keys`` must at least contain the following: + | ``{"Training": {"Person": ["name", "email"]}}``. + + Values within the dicts can be either a list representing the keys (as in the example + above) or a dict that allows to set additional foreign keys at higher depths. In the + latter case (dict instead of list) if foreign keys exist at that level (e.g. in the + above example there might be further levels below "Person"), then the foreign keys can + be set using the special ``__this__`` key. + + Example: ``{"Training": {"__this__": ["date"], "Person": ["name", "email"]}}`` + Here, ``date`` is the sole foreign key for Training. 
+ """ + + def _generate_sheets_from_schema(self, schema: dict, foreign_keys: Optional[dict] = None + ) -> Dict[str, Dict[str, + Tuple[ColumnType, Optional[str], list]]]: + """Generate a sheet definition from a given JSON schema. + + Parameters + ---------- + schema: dict + given JSON schema + foreign_keys: dict, optional + a configuration that defines which attributes shall be used to create + additional columns when a list of references exists. See ``foreign_keys`` + argument of TableTemplateGenerator.generate. + + Returns + ------- + sheets: dict + A two-level dict which describes columns of template sheets. + + | The structure of this two-level dict is as follows: + | ``sheets[sheetname][colname]= (<col_type>, <description>, [<path>, ...])`` + + I.e. the outer dict contains sheet names as keys, the inner dict has column names as + keys and tuples as values. These tuples consist of: + - the column type + - the description of the corresponding property + - a list representing the path. 
+ + """ + if not ("type" in schema or "anyOf" in schema): + raise ValueError("Inappropriate JSON schema: The following object must contain the " + f"'type' or 'anyOf' key:\n{schema}\n") + if "properties" not in schema: + raise ValueError("Inappropriate JSON schema: The following object must contain " + f"the 'properties' key:\n{schema}\n") + if "type" in schema: + assert schema["type"] == "object" + if foreign_keys is None: + foreign_keys = {} + # here, we treat the top level + # sheets[sheetname][colname]= (COL_TYPE, description, [path]) + sheets: Dict[str, Dict[str, Tuple[ColumnType, Optional[str], list]]] = {} + for rt_name, rt_def in schema["properties"].items(): + sheets[rt_name] = self._treat_schema_element(schema=rt_def, sheets=sheets, + path=[rt_name], foreign_keys=foreign_keys) + return sheets + + def _get_foreign_keys(self, keys: dict, path: list) -> list: + """Return the foreign keys that are needed at the location to which path points.""" + msg = f"A foreign key definition is missing for path:\n{path}\nKeys are:\n{keys}" + while path: + if keys is None or path[0] not in keys: + raise ValueError(msg) + keys = keys[path[0]] + path = path[1:] + if isinstance(keys, dict) and "__this__" in keys: + return keys["__this__"] + if isinstance(keys, list): + return keys + raise ValueError(msg) + + def _treat_schema_element(self, schema: dict, sheets: dict, path: List[str], + foreign_keys: Optional[dict] = None, level_in_sheet_name: int = 1, + array_paths: Optional[list] = None + ) -> Dict[str, Tuple[ColumnType, Optional[str], list]]: + """Recursively transform elements from the schema into column definitions. + + ``sheets`` is modified in place. + + Parameters + ---------- + schema: dict + Part of the json schema; it must be the level that contains the type definition + (e.g. 'type' or 'oneOf' key) + sheets: dict + All the sheets, indexed by their name. This is typically modified in place by this + method. 
+ path: list[str] + The relevant (sub) path for this schema part? + array_paths: list + A list of path along the way to the current object, where the json contains arrays. + + Returns + ------- + columns: dict + Describing the columns; see doc string of `_generate_sheets_from_schema`_ + """ + if not ("type" in schema or "enum" in schema or "oneOf" in schema or "anyOf" in schema): + raise ValueError("Inappropriate JSON schema: The following schema part must contain " + f"'type', 'enum', 'oneOf' or 'anyOf':\n{schema}\n") + + if array_paths is None: + # if this is not set, we are at top level and the top level element may always be an + # array + array_paths = [path] + if foreign_keys is None: + foreign_keys = {} + + ctype = ColumnType.SCALAR + + # if it is an array, value defs are in 'items' + if schema.get('type') == 'array': + if (schema['items'].get('type') == 'object' + and len(path) > 1): # list of references; special treatment + # we add a new sheet with columns generated from the subtree of the schema + sheetname = ".".join(path) + if sheetname in sheets: + raise ValueError("The schema would lead to two sheets with the same name, " + f"which is forbidden: {sheetname}") + sheets[sheetname] = self._treat_schema_element( + schema=schema['items'], sheets=sheets, path=path, foreign_keys=foreign_keys, + level_in_sheet_name=len(path), + array_paths=array_paths+[path] # since this level is an array extend the list + ) + # and add the foreign keys that are necessary up to this point + for array_path in array_paths: + keys = self._get_foreign_keys(foreign_keys, array_path) + for key in keys: + if key in sheets[sheetname]: + raise ValueError("The schema would lead to two columns with the same " + f"name which is forbidden: {key}") + sheets[sheetname][key] = (ColumnType.FOREIGN, f"see sheet '{path[0]}'", + array_path + [key]) + # Columns are added to the new sheet, thus we do not return any columns for the + # current sheet. 
+ return {} + + # it is a list of primitive types -> semicolon separated list + schema = schema['items'] + ctype = ColumnType.LIST + + # This should only be the case for "new or existing reference". + for el in schema.get('oneOf', []): + if 'type' in el: + schema = el + break + + if "properties" in schema: # recurse for each property, then return + cols = {} + for pname in schema["properties"]: + col_defs = self._treat_schema_element( + schema["properties"][pname], sheets, path+[pname], foreign_keys, + level_in_sheet_name, array_paths=array_paths) + for k in col_defs: + if k in cols: + raise ValueError(f"The schema would lead to two columns with the same " + f"name which is forbidden: {k}") + cols.update(col_defs) + return cols + + # The schema is a leaf. + description = schema['description'] if 'description' in schema else None + # definition of a single column + default_return = {".".join(path[level_in_sheet_name:]): (ctype, description, path)} + if 'type' not in schema and 'enum' in schema: + return default_return + if 'type' not in schema and 'anyOf' in schema: + for d in schema['anyOf']: + # currently the only case where this occurs is date formats + assert d['type'] == 'string' + assert d['format'] == 'date' or d['format'] == 'date-time' + return default_return + if schema["type"] in ['string', 'number', 'integer', 'boolean']: + if 'format' in schema and schema['format'] == 'data-url': + return {} # file; ignore for now + return default_return + raise ValueError("Inappropriate JSON schema: The following part should define an" + f" object with properties or a primitive type:\n{schema}\n") + + +class XLSXTemplateGenerator(TableTemplateGenerator): + """Class for generating XLSX tables from json schema definitions.""" + + def __init__(self): + pass + + def generate(self, schema: dict, foreign_keys: dict, filepath: str) -> None: + """Generate a sheet definition from a given JSON schema. 
+ + Parameters: + ----------- + schema: dict + Given JSON schema + foreign_keys: dict + A configuration that defines which attributes shall be used to create + additional columns when a list of references exists. See ``foreign_keys`` + argument of :ref:`TableTemplateGenerator.generate` . + filepath: str + The XLSX file will be stored under this path. + """ + sheets = self._generate_sheets_from_schema(schema, foreign_keys) + wb = self._create_workbook_from_sheets_def(sheets) + wb.save(filepath) + + @staticmethod + def _get_max_path_length(sheetdef: dict) -> int: + """ returns the length of the longest path contained in the sheet definition + + see TableTemplateGenerator._generate_sheets_from_schema for the structure of the sheets + definition dict + You need to pass the dict of a single sheet to this function. + """ + return max([len(path) for _, _, path in sheetdef.values()]) + + @staticmethod + def _get_ordered_cols(sheetdef: dict) -> list: + """ + creates a list with tuples (colname, column type, path) where the foreign keys are first + """ + ordered_cols = [] + # first foreign cols + for colname, (ct, desc, path) in sheetdef.items(): + if ct == ColumnType.FOREIGN: + ordered_cols.append((colname, ct, desc, path)) + # now the other + for colname, (ct, desc, path) in sheetdef.items(): + if ct != ColumnType.FOREIGN: + ordered_cols.append((colname, ct, desc, path)) + + return ordered_cols + + def _create_workbook_from_sheets_def( + self, sheets: Dict[str, Dict[str, Tuple[ColumnType, Optional[str], list]]]): + """Create and return a nice workbook for the given sheets.""" + wb = Workbook() + assert wb.sheetnames == ["Sheet"] + for sheetname, sheetdef in sheets.items(): + ws = wb.create_sheet(re.sub(INVALID_TITLE_REGEX, '_', sheetname)) + # First row will by the COL_TYPE row. + # First column will be the indicator row with values COL_TYPE, PATH, IGNORE. + # The COL_TYPE row will be followed by as many PATH rows as needed. 
+ + max_path_length = self._get_max_path_length(sheetdef) + header_index = 2 + max_path_length + description_index = 3 + max_path_length + + # create first column + ws.cell(1, 1, RowType.COL_TYPE.name) + for index in range(max_path_length): + ws.cell(2 + index, 1, RowType.PATH.name) + ws.cell(header_index, 1, RowType.IGNORE.name) + ws.cell(description_index, 1, RowType.IGNORE.name) + + ordered_cols = self._get_ordered_cols(sheetdef) + + # create other columns + for index, (colname, ct, desc, path) in enumerate(ordered_cols): + ws.cell(1, 2 + index, ct.name) + for path_index, el in enumerate(path): + ws.cell(2 + path_index, 2 + index, el) + ws.cell(header_index, 2 + index, colname) + if desc: + ws.cell(description_index, 2 + index, desc) + + # hide special rows + for index, row in enumerate(ws.rows): + if not (row[0].value is None or row[0].value == RowType.IGNORE.name): + ws.row_dimensions[index+1].hidden = True + + # hide special column + ws.column_dimensions['A'].hidden = True + + # remove initial sheet + del wb['Sheet'] + + # order sheets + # for index, sheetname in enumerate(sorted(wb.sheetnames)): + # wb.move_sheet(sheetname, index-wb.index(wb[sheetname])) + # reverse sheets + for index, sheetname in enumerate(wb.sheetnames[::-1]): + wb.move_sheet(sheetname, index-wb.index(wb[sheetname])) + + return wb diff --git a/src/doc/table-json-conversion/specs.md b/src/doc/table-json-conversion/specs.md new file mode 100644 index 0000000000000000000000000000000000000000..3a5fcef587d9facd1f0960298226c0f96307beb9 --- /dev/null +++ b/src/doc/table-json-conversion/specs.md @@ -0,0 +1,229 @@ +# Konversion zwischen LinkAhead-Datenmodellen, JSON-Schema und XLSX (und zurück) # + +Top level of json must be a dict. keys of the dict are RT names. + + +Frage: is the first RT never an array? 
+ + +Do not use sheet name, but only content of hidden rows + +## Datenmodelle in JSON-Schema und JSON-Daten + +Das Datenmodell in LinkAhead legt fest, welche Arten von Records es in einer LinkAhead-Instanz gibt +und wie diese aussehen. Dieses Datenmodell kann auch in einem JSON Schema repräsentiert werden, das +die Struktur von JSON Dateien festlegt, die zu dem Datenmodell gehörige Records enthalten. + +Zum Beispiel kann das folgende JSON den Record einer Person beschreiben: + +```JSON +{ + "Person": { + "family_name": "Steve", + "given_name": "Stevie" + } +} +``` + +Ein *JSON Schema* schreibt eine konkrete Struktur vor, und die zugehörigen JSON Dateien können +genutzt werden, um Daten zu bestimmten Record-Strukturen zu repräsentieren. Beispielsweise könnte +man ein JSON Schema erstellen, das es erlaubt "Training" Records mit Informationen zu abgehaltenen +Trainings zu speichern. Dies ist insbesondere wertvoll beim Datenim- und -export. Man +könnte Webformulare aus dem JSON Schema generieren oder es nutzen, um in LinkAhead gespeicherte +Objekte als JSON zu exportieren. + +## Von JSON zu XLSX: Datenrepräsentation ## + +Im Folgenden wird beschrieben, wie JSON Dateien, die LinkAhead-Records repräsentieren, in XLSX +Dateien umgewandelt werden, bzw. wie aus XLSX-Dateien JSON Dateien mit Records erstellt werden. + +Der Attributname (oben "Person") legt den RecordType fest und der Wert dieses Attributs kann entweder +ein Objekt oder eine Liste sein. Ist es ein Objekt (wie im obigen Beispiel), so wird ein einzelner +Record repräsentiert. Bei einer Liste mehrere Records, die den gleichen RecordType als Parent +haben. + +Die *Properties* des Records (oben `family_name` und `given_name`) werden zu *Spalten* im XLSX. Die +Properties haben wiederum einen Attributnamen und einen Wert. Der Wert kann + +a. primitiv (Text, Zahl, Boolean, ...) +b. ein Record +c. eine Liste von primitiven Typen +d. eine Liste von Records + +sein.
+ +In den Fällen *a.* und *c.* wird in XLSX eine Zelle in der zur Property gehörigen Spalte erstellt. +Im Fall *b.* werden prinzipiell für die Properties des Records Spalten erstellt. Tatsächlich wird der +referenzierte Record genauso behandelt wie der ursprüngliche. D.h. die Fälle a.-d. werden wieder +für die einzelnen Properties betrachtet. + +Für den Fall *d.* ist die zweidimensionale Struktur eines XLSX Blatts nicht ausreichend. Daher +werden für solche Fälle *neue* XLSX-Blätter/-Tabellen erstellt. + +In diesen werden die referenzierten Records behandelt wie oben beschrieben. Es gibt jedoch +zusätzliche Spalten, die es erlauben zu erkennen, von welchem "externen" Record diese Records +referenziert werden. + +Wir betrachten diese vier Fälle nun im Detail: + +### a. Properties mit primitiven Datentypen ### + +```JSON +{ + "Training": { + "date": "2023-01-01", + "url": "www.indiscale.com", + "duration": 1.0, + "participants": 1, + "remote": false + } +} +``` +Dieser Eintrag wird in einem XLSX-Blatt mit dem folgenden Inhalt abgebildet: + +| date | url | duration | participants | remote | |------------|-------------------|----------|--------------|--------| | 2023-01-01 | www.indiscale.com | 1.0 | 1 | false | + +### b. Property, die einen Record referenziert ### + +```JSON +{ + "Training": { + "date": "2023-01-01", + "supervisor": { + "family_name": "Stevenson", + "given_name": "Stevie" + } + } +} +``` + +Dieser Eintrag wird in einem XLSX Blatt mit dem folgenden Inhalt abgebildet: + +| date | `supervisor.family_name` | `supervisor.given_name` | |------------|--------------------------|-------------------------| | 2023-01-01 | Stevenson | Stevie | + +Beachten Sie, dass die Spaltennamen umbenannt werden dürfen. Die Zuordnung der Spalte zu Properties +von Records wird über den Inhalt von versteckten Zeilen gewährleistet. + +### c.
Properties, die Listen mit Werten von primitiven Datentypen enthalten ### + +```JSON +{ + "Training": { + "url": "www.indiscale.com", + "subjects": ["Math", "Physics"], + } +} +``` + +Dieser Eintrag würde in einem XLSX Blatt mit dem folgenden Inhalt abgebildet: + +| url | subjects | +|-------------------|--------------| +| www.indiscale.com | Math;Physics | + +Die Listenelemente werden separiert von `;` in die Zelle geschrieben. Wenn die Elemente den +Separator `;` enthalten, dann wird dieser mit einem `\\` escaped. + +### d. Properties, die Listen mit Referenzen enthalten ### + +```JSON +{ + "Training": { + "date": "2023-01-01", + "coach": [ + { + "family_name": "Sky", + "given_name": "Max", + }, + { + "family_name": "Sky", + "given_name": "Min", + } + ] + } +} +``` + +Da die beiden Coaches nicht vernünftig in einer Zelle dargestellt werden können, bedarf es nun eines +weiteren Tabellenblatts, das die Eigenschaften der Coaches enthält. + +Das Blatt zu den *Trainings* enthält in diesem Beispiel nur die "date" Spalte: + +| date | +|------------| +| 2023-01-01 | + +Zusätzlich gibt es ein *weiteres* Blatt in dem die Coaches gespeichert werden. Hier ist nun +entscheidend, dass definiert wird, wie von potentiell mehreren "Trainings" das richtige Element +gewählt wird. In diesem Fall bedeutet dies, dass das "date" eindeutig sein muss. + +TODO: In welchem Scope gilt diese Eindeutigkeit? Können wir dies checken? + +Das zweite Blatt sieht dann wie folgt aus + +| date | `coach.family_name` | `coach.given_name` | +|------------|---------------------|--------------------| +| 2023-01-01 | Sky | Max | +| 2023-01-01 | Sky | Min | + +## Data in XLSX: Hidden automation logic ## + +### First column: Marker for row types ### + +The first column in each sheet will be hidden and it will contain an entry in each row that needs +special treatment. The following values are used: + +- ``IGNORE``: This row is ignored. It can be used for explanatory texts or layout. 
+- ``COL_TYPE``: Typically the first row that is not `IGNORE`. It indicates the row that defines the + type of columns (`FOREIGN`, `SCALAR`, `LIST`, `IGNORE`). This row may occur only once. +- ``PATH``: Indicates that the row is used to define the path within the JSON. These rows are + typically hidden for users. + +An example table could look like this: + +| `IGNORE` | | Welcome | to this | file! | | +| `IGNORE` | | Please | enter your | data here: | | +| `COL_TYPE` | `IGNORE` | `SCALAR` | `SCALAR` | `LIST` | `SCALAR` | +| `PATH` | | `Training` | `Training` | `Training` | `Training` | +| `PATH` | | `url` | `date` | `subjects` | `supervisor` | +| `PATH` | | | | | `email` | +| `IGNORE` | Please enter one training per line. | Training URL | Training date | Subjects | Supervisor's email | +|------------|-------------------------------------|----------------|---------------|--------------|--------------------| +| | | example.com/mp | 2024-02-27 | Math;Physics | steve@example.com | +| | | example.com/m | 2024-02-27 | Math | stella@example.com | + +### Parsing XLSX data ### + +To extract the value of a given cell, we traverse all path elements (in ``PATH`` rows) from top to +bottom. The final element of the path is the name of the Property to which the value belongs. In +the example above, `steve@example.com` is the value of the `email` Property in the path +`["Training", "supervisor", "email"]`. + +The path elements are sufficient to identify the object within a JSON, at least if the corresponding +JSON element is a single object. If the JSON element is an array, the appropriate object within the +array needs to be selected. + +For this selection additional ``FOREIGN`` columns are used. The paths in these columns must all have +the same *base* and one additional *unique key* component. For example, two `FOREIGN` columns could +be `["Training", "date"]` and `["Training", "url"]`, where `["Training"]` is the *base path* and +`"date"` and `"url"` are the *unique keys*. 
+ +The base path defines the table (or recordtype) to which the entries belong, and the values of the +unique keys define the actual rows to which data belongs. + +For example, this table defines three coaches for the two trainings from the last table: + +| `COL_TYPE` | `FOREIGN` | `FOREIGN` | `SCALAR` | +| `PATH` | `Training` | `Training` | `Training` | +| `PATH` | `date` | `url` | `coach` | +| `PATH` | | | `given_name` | +| `IGNORE` | Date of training | URL of training | The coach's given name | +| `IGNORE` | from sheet 'Training' | from sheet 'Training' | | +|------------|-----------------------|-----------------------|------------------------| +| | 2024-02-27 | example.com/mp | Ada | +| | 2024-02-27 | example.com/mp | Berta | +| | 2024-02-27 | example.com/m | Chris | diff --git a/unittests/table_json_conversion/create_jsonschema.py b/unittests/table_json_conversion/create_jsonschema.py new file mode 100755 index 0000000000000000000000000000000000000000..6f556863bf12e1c9001be95cc7213a3e875a766e --- /dev/null +++ b/unittests/table_json_conversion/create_jsonschema.py @@ -0,0 +1,66 @@ +#!/usr/bin/env python3 + +# Copyright (C) 2023 IndiScale GmbH <www.indiscale.com> +# Copyright (C) 2023 Daniel Hornung <d.hornung@indiscale.com> +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <https://www.gnu.org/licenses/>. 
+ +"""Create JSON-Schema according to configuration +""" + +import argparse +import json +import os +import sys + +import caosadvancedtools.json_schema_exporter as jsex +import caosadvancedtools.models.parser as parser +import tomli + +# TODO why do I need a running LA instance? + + +def prepare_datamodel(): + model = parser.parse_model_from_yaml("./model.yml") + + exporter = jsex.JsonSchemaExporter(additional_properties=False, + # additional_options_for_text_props=additional_text_options, + # name_and_description_in_properties=True, + name_property_for_new_records=True, + do_not_create=["Organisation"], + # do_not_retrieve=do_not_retrieve, + ) + schema_top = exporter.recordtype_to_json_schema(model.get_deep("Training")) + schema_pers = exporter.recordtype_to_json_schema(model.get_deep("Person")) + merged_schema = jsex.merge_schemas([schema_top, schema_pers]) + + with open("model_schema.json", mode="w", encoding="utf8") as json_file: + json.dump(merged_schema, json_file, ensure_ascii=False, indent=2) + + +def _parse_arguments(): + """Parse the arguments.""" + parser = argparse.ArgumentParser(description='') + + return parser.parse_args() + + +def main(): + """The main function of this script.""" + args = _parse_arguments() + prepare_datamodel() + + +if __name__ == "__main__": + main() diff --git a/unittests/table_json_conversion/example.json b/unittests/table_json_conversion/example.json new file mode 100644 index 0000000000000000000000000000000000000000..8d08fd09d168a85641d8a9c3cf9776b8d1c866b2 --- /dev/null +++ b/unittests/table_json_conversion/example.json @@ -0,0 +1,30 @@ +{ + "Training": { + "date": "2023-01-01", + "url": "www.indiscale.com", + "coach": [ + { + "family_name": "Sky", + "given_name": "Max", + "Organisation": "ECB" + },{ + "family_name": "Sky", + "given_name": "Min", + "Organisation": "ECB" + }], + "supervisor": { + "family_name": "Steve", + "given_name": "Stevie", + "Organisation": "IMF" + }, + "duration": 1.0, + "participants": 1, + "subjects": 
["Math", "Physics"], + "remote": false + }, + "Person": { + "family_name": "Steve", + "given_name": "Stevie", + "Organisation": "IMF" + } +} diff --git a/unittests/table_json_conversion/example_template.xlsx b/unittests/table_json_conversion/example_template.xlsx new file mode 100644 index 0000000000000000000000000000000000000000..6d9c7627e724144e7748dcfb36f2292be49b036e Binary files /dev/null and b/unittests/table_json_conversion/example_template.xlsx differ diff --git a/unittests/table_json_conversion/how_to_schema.md b/unittests/table_json_conversion/how_to_schema.md new file mode 100644 index 0000000000000000000000000000000000000000..a7b4e3ca35a1fc9e67ebbb29f316825e89596f4a --- /dev/null +++ b/unittests/table_json_conversion/how_to_schema.md @@ -0,0 +1,19 @@ +Insert the data model into a LinkAhead server. + +Run the following code: +``` + model = parser.parse_model_from_yaml("./model.yml") + + exporter = jsex.JsonSchemaExporter(additional_properties=False, + #additional_options_for_text_props=additional_text_options, + #name_and_description_in_properties=True, + #do_not_create=do_not_create, + #do_not_retrieve=do_not_retrieve, + ) + schema_top = exporter.recordtype_to_json_schema(model.get_deep("Training")) + schema_pers = exporter.recordtype_to_json_schema(model.get_deep("Person")) + merged_schema = jsex.merge_schemas([schema_top, schema_pers]) + + with open("model_schema.json", mode="w", encoding="utf8") as json_file: + json.dump(merged_schema, json_file, ensure_ascii=False, indent=2) +``` diff --git a/unittests/table_json_conversion/model.yml b/unittests/table_json_conversion/model.yml new file mode 100644 index 0000000000000000000000000000000000000000..74fb5bc5dc4251bb3834ea2f6201f991cab510d1 --- /dev/null +++ b/unittests/table_json_conversion/model.yml @@ -0,0 +1,36 @@ +Person: + recommended_properties: + family_name: + datatype: TEXT + given_name: + datatype: TEXT + Organisation: +Training: + recommended_properties: + date: + datatype: DATETIME + 
description: 'The date of the training.' + url: + datatype: TEXT + description: 'The URL' + subjects: + datatype: LIST<TEXT> + coach: + datatype: LIST<Person> + supervisor: + datatype: Person + duration: + datatype: DOUBLE + participants: + datatype: INTEGER + remote: + datatype: BOOLEAN + slides: + datatype: FILE +ProgrammingCourse: + inherit_from_suggested: + - Training +Organisation: + recommended_properties: + Country: + datatype: TEXT diff --git a/unittests/table_json_conversion/model_schema.json b/unittests/table_json_conversion/model_schema.json new file mode 100644 index 0000000000000000000000000000000000000000..6f2fffdea938da3e4e3a39397a1ddbfbaa47724c --- /dev/null +++ b/unittests/table_json_conversion/model_schema.json @@ -0,0 +1,129 @@ +{ + "type": "object", + "properties": { + "Training": { + "type": "object", + "required": [], + "additionalProperties": false, + "title": "Training", + "properties": { + "name": { + "type": "string", + "description": "The name of the Record to be created" + }, + "date": { + "description": "The date of the training.", + "anyOf": [ + { + "type": "string", + "format": "date" + }, + { + "type": "string", + "format": "date-time" + } + ] + }, + "url": { + "type": "string", + "description": "The URL" + }, + "subjects": { + "type": "array", + "items": { + "type": "string" + } + }, + "coach": { + "type": "array", + "items": { + "type": "object", + "required": [], + "additionalProperties": false, + "title": "coach", + "properties": { + "name": { + "type": "string", + "description": "The name of the Record to be created" + }, + "family_name": { + "type": "string" + }, + "given_name": { + "type": "string" + }, + "Organisation": { + "enum": [ + "Federal Reserve" + ] + } + } + } + }, + "supervisor": { + "type": "object", + "required": [], + "additionalProperties": false, + "title": "supervisor", + "properties": { + "name": { + "type": "string", + "description": "The name of the Record to be created" + }, + "family_name": { + "type": 
"string" + }, + "given_name": { + "type": "string" + }, + "Organisation": { + "enum": [ + "Federal Reserve" + ] + } + } + }, + "duration": { + "type": "number" + }, + "participants": { + "type": "integer" + }, + "remote": { + "type": "boolean" + } + }, + "$schema": "https://json-schema.org/draft/2020-12/schema" + }, + "Person": { + "type": "object", + "required": [], + "additionalProperties": false, + "title": "Person", + "properties": { + "name": { + "type": "string", + "description": "The name of the Record to be created" + }, + "family_name": { + "type": "string" + }, + "given_name": { + "type": "string" + }, + "Organisation": { + "enum": [ + "Federal Reserve" + ] + } + }, + "$schema": "https://json-schema.org/draft/2020-12/schema" + } + }, + "required": [ + "Training", + "Person" + ], + "additionalProperties": false, + "$schema": "https://json-schema.org/draft/2020-12/schema" +} diff --git a/unittests/table_json_conversion/test_fill_xlsx.py b/unittests/table_json_conversion/test_fill_xlsx.py new file mode 100644 index 0000000000000000000000000000000000000000..a1458fc244f5360e384e4371401cf8a033928797 --- /dev/null +++ b/unittests/table_json_conversion/test_fill_xlsx.py @@ -0,0 +1,49 @@ +#!/usr/bin/env python3 +# encoding: utf-8 +# +# This file is a part of the LinkAhead Project. +# +# Copyright (C) 2024 Indiscale GmbH <info@indiscale.com> +# Copyright (C) 2024 Henrik tom Wörden <h.tomwoerden@indiscale.com> +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. 
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.

import os
import tempfile

from caosadvancedtools.table_json_conversion.fill_xlsx import (
    _get_path_rows, _get_row_type_column, fill_template)
from openpyxl import load_workbook


def rfp(*pathcomponents):
    """
    Return full path.
    Shorthand convenience function; resolves relative to this test module.
    """
    return os.path.join(os.path.dirname(__file__), *pathcomponents)


def test_detect():
    """The row-type column and the path rows are detected in the template."""
    example = load_workbook(rfp("example_template.xlsx"))
    assert 1 == _get_row_type_column(example['Person'])
    assert [2, 3] == _get_path_rows(example['Person'])


def test_fill_xlsx():
    """``fill_template`` creates a readable workbook at the target path."""
    path = os.path.join(tempfile.mkdtemp(), 'test.xlsx')
    assert not os.path.exists(path)
    fill_template(rfp('example_template.xlsx'), rfp('example.json'), path)
    assert os.path.exists(path)
    # The generated workbook must be readable and contain at least one sheet.
    generated = load_workbook(path)
    assert generated.sheetnames
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.

import json
import os
import tempfile

import pytest
from caosadvancedtools.table_json_conversion.table_generator import (
    ColumnType, XLSXTemplateGenerator)
from openpyxl import load_workbook


def rfp(*pathcomponents):
    """
    Return full path.
    Shorthand convenience function; resolves relative to this test module.
    """
    return os.path.join(os.path.dirname(__file__), *pathcomponents)


def test_generate_sheets_from_schema():
    """Sheet definitions are derived correctly from JSON schemas.

    Covers rejection of inappropriate schemas, a minimal one-property schema
    and the full example schema from ``model_schema.json``.
    """
    # trivial case; we do not support this
    schema = {}
    generator = XLSXTemplateGenerator()
    with pytest.raises(ValueError, match="Inappropriate JSON schema:.*"):
        generator._generate_sheets_from_schema(schema)

    # top level must be RT with Properties
    schema = {
        "type": "string"
    }
    with pytest.raises(ValueError, match="Inappropriate JSON schema:.*"):
        generator._generate_sheets_from_schema(schema)

    # bad type
    schema = {
        "type": "object",
        "properties": {
            "Training": {
                "type": "object",
                "properties": {
                    "name": {
                        "type": "str",
                        "description": "The name of the Record to be created"
                    },
                }
            }
        }
    }
    with pytest.raises(ValueError,
                       match="Inappropriate JSON schema: The following part "
                             "should define an object.*"):
        generator._generate_sheets_from_schema(schema, {'Training': ['a']})

    # bad schema
    schema = {
        "type": "object",
        "properties": {
            "Training": {
                "type": "object"
            }
        }
    }
    with pytest.raises(ValueError,
                       match="Inappropriate JSON schema: The following part "
                             "should define an object.*"):
        generator._generate_sheets_from_schema(schema, {'Training': ['a']})

    # minimal case: one RT with one P
    schema = {
        "type": "object",
        "properties": {
            "Training": {
                "type": "object",
                "properties": {
                    "name": {
                        "type": "string",
                        "description": "The name of the Record to be created"
                    },
                }
            }
        }
    }
    sdef = generator._generate_sheets_from_schema(schema, {'Training': ['a']})
    assert "Training" in sdef
    tdef = sdef['Training']
    assert 'name' in tdef
    assert tdef['name'] == (ColumnType.SCALAR, "The name of the Record to be created",
                            ["Training", 'name'])

    # example case
    with open(rfp("model_schema.json"), encoding="utf8") as sfi:
        schema = json.load(sfi)
    with pytest.raises(ValueError, match="A foreign key definition is missing.*"):
        generator._generate_sheets_from_schema(schema)
    sdef = generator._generate_sheets_from_schema(
        schema,
        foreign_keys={'Training': {"__this__": ['date', 'url']}})
    assert "Training" in sdef
    tdef = sdef['Training']
    assert tdef['date'] == (ColumnType.SCALAR, 'The date of the training.', ["Training", 'date'])
    assert tdef['url'] == (ColumnType.SCALAR, 'The URL', ["Training", 'url'])
    assert tdef['supervisor.family_name'] == (ColumnType.SCALAR, None, ["Training", 'supervisor',
                                                                        'family_name'])
    assert tdef['supervisor.given_name'] == (ColumnType.SCALAR, None, ["Training", 'supervisor',
                                                                       'given_name'])
    assert tdef['supervisor.Organisation'] == (ColumnType.SCALAR, None, ["Training", 'supervisor',
                                                                         'Organisation'])
    assert tdef['duration'] == (ColumnType.SCALAR, None, ["Training", 'duration'])
    assert tdef['participants'] == (ColumnType.SCALAR, None, ["Training", 'participants'])
    assert tdef['subjects'] == (ColumnType.LIST, None, ["Training", 'subjects'])
    assert tdef['remote'] == (ColumnType.SCALAR, None, ["Training", 'remote'])
    cdef = sdef['Training.coach']
    assert cdef['family_name'] == (ColumnType.SCALAR, None, ["Training", 'coach', 'family_name'])
    assert cdef['given_name'] == (ColumnType.SCALAR, None, ["Training", 'coach', 'given_name'])
    assert cdef['Organisation'] == (ColumnType.SCALAR, None, ["Training", 'coach',
                                                              'Organisation'])
    # Foreign-key columns of the coach sheet point back to the Training sheet.
    assert cdef['date'] == (ColumnType.FOREIGN, "see sheet 'Training'", ["Training", 'date'])
    assert cdef['url'] == (ColumnType.FOREIGN, "see sheet 'Training'", ["Training", 'url'])


def test_get_foreign_keys():
    """Foreign-key definitions are resolved for plain and nested layouts."""
    generator = XLSXTemplateGenerator()
    fkd = {"Training": ['a']}
    assert ['a'] == generator._get_foreign_keys(fkd, ['Training'])

    fkd = {"Training": {"__this__": ['a']}}
    assert ['a'] == generator._get_foreign_keys(fkd, ['Training'])

    fkd = {"Training": {'hallo'}}
    with pytest.raises(ValueError, match=r"A foreign key definition is missing for path:\n\["
                                         r"'Training'\]\nKeys are:\n{'Training': \{'hallo'\}\}"):
        generator._get_foreign_keys(fkd, ['Training'])

    fkd = {"Training": {"__this__": ['a'], 'b': ['c']}}
    assert ['c'] == generator._get_foreign_keys(fkd, ['Training', 'b'])

    with pytest.raises(ValueError, match=r"A foreign key definition is missing for .*"):
        generator._get_foreign_keys({}, ['Training'])


def test_get_max_path_length():
    """The maximum path length over all column definitions is returned."""
    assert 4 == XLSXTemplateGenerator._get_max_path_length({'a': (1, 'desc', [1, 2, 3]),
                                                            'b': (2, 'desc', [1, 2, 3, 4])})


def test_template_generator():
    """A generated template matches the checked-in example workbook."""
    generator = XLSXTemplateGenerator()
    with open(rfp("model_schema.json"), encoding="utf8") as sfi:
        schema = json.load(sfi)
    path = os.path.join(tempfile.mkdtemp(), 'test.xlsx')
    assert not os.path.exists(path)
    generator.generate(schema=schema,
                       foreign_keys={'Training': {"__this__": ['date', 'url']}},
                       filepath=path)
    assert os.path.exists(path)
    generated = load_workbook(path)  # workbook can be read
    example = load_workbook(rfp("example_template.xlsx"))
    assert generated.sheetnames == example.sheetnames
    # Compare cell values and hidden-state of rows/columns sheet by sheet.
    for sheetname in example.sheetnames:
        gen_sheet = generated[sheetname]
        ex_sheet = example[sheetname]
        for irow, (erow, grow) in enumerate(zip(ex_sheet.iter_rows(), gen_sheet.iter_rows())):
            assert ex_sheet.row_dimensions[irow].hidden == gen_sheet.row_dimensions[irow].hidden
            for icol, (ecol, gcol) in enumerate(zip(erow, grow)):
                assert (ex_sheet.column_dimensions[ecol.column_letter].hidden
                        == gen_sheet.column_dimensions[ecol.column_letter].hidden)
                cell = gen_sheet.cell(irow+1, icol+1)
                assert ecol.value == gcol.value, f"Sheet: {sheetname}, cell: {cell.coordinate}"

    # test some hidden
    ws = generated.active
    assert ws.row_dimensions[1].hidden is True
    assert ws.column_dimensions['A'].hidden is True

    # TODO test collisions of sheet or colnames
    # TODO test escaping of values

    # TODO finish enum example