Skip to content
Snippets Groups Projects
Verified Commit de1b92a7 authored by Daniel Hornung's avatar Daniel Hornung
Browse files

Merge branch 'f-table-json-data-schema' into f-convert-xlsx-to-json

parents f85fcabe 67d94135
No related branches found
No related tags found
2 merge requests!107Release v0.11.0,!102ENH: XLSX reader
Pipeline #50432 failed
Showing
with 325 additions and 66 deletions
......@@ -33,6 +33,7 @@ from openpyxl import load_workbook, Workbook
from openpyxl.cell.cell import ILLEGAL_CHARACTERS_RE
from .xlsx_utils import (
array_schema_from_model_schema,
get_foreign_key_columns,
get_row_type_column_index,
is_exploded_sheet,
......@@ -332,7 +333,7 @@ validation_schema: dict, optional
# Validation
if validation_schema is not None:
validation_schema = read_or_dict(validation_schema)
validation_schema = array_schema_from_model_schema(read_or_dict(validation_schema))
try:
validate(data, validation_schema, format_checker=FormatChecker())
except ValidationError as verr:
......
......@@ -18,13 +18,25 @@
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
"""General utilities to work with XLSX files with (hidden) column and row annotations and typing."""
"""General utilities to work with XLSX files with (hidden) column and row annotations and typing.
The most prominent functions are:
- ``p2s``: Path to string: ``["some", "path"] -> "some.path"``
- ``read_or_dict``: Load JSON object from path, file or dict.
This module also defines these enums:
- ColumnType
- RowType
"""
from __future__ import annotations
import json
from collections import OrderedDict
from copy import deepcopy
from enum import Enum
from types import SimpleNamespace
from typing import Dict, List, TextIO, Union
......@@ -49,27 +61,36 @@ class RowType(Enum):
IGNORE = 3
def p2s(path: List[str]) -> str:
    """Join the components of ``path`` into one dot-separated string.

    Example: ``["some", "path"] -> "some.path"``
    """
    separator = "."
    return separator.join(path)
def array_schema_from_model_schema(model_schema: dict) -> dict:
    """Convert a *data model* schema to a *data array* schema.

    Practically, this means that the top level properties are converted into lists.  In a
    simplified notation, this can be expressed as:

    ``array_schema = { elem: [elem typed data...] for elem in model_schema }``

    Parameters
    ----------
    model_schema: dict
        The schema description of the data model.  Must be a json schema *object*, with a number
        of *object* typed properties.

    Returns
    -------
    array_schema: dict
        A corresponding json schema, where the properties are arrays with the types of the
        input's top-level properties.  The input is not modified.

    Raises
    ------
    ValueError
        If ``model_schema`` or one of its top-level properties is not of type "object".
    """
    # Explicit checks instead of ``assert``: assertions are removed when running with ``-O``.
    if model_schema.get("type") != "object":
        raise ValueError(
            f"The model schema must be of type 'object', but is: {model_schema.get('type')!r}")

    result = deepcopy(model_schema)  # Never touch the caller's schema.
    properties = result.get("properties", {})
    for name, prop in properties.items():
        if not isinstance(prop, dict) or prop.get("type") != "object":
            raise ValueError(f"The top-level property '{name}' must be of type 'object'.")
        # Only the value of an existing key is replaced, so the dict size stays constant and
        # reassigning during iteration is safe.
        properties[name] = {
            "type": "array",
            "items": prop,
        }
    return result
def get_defining_paths(workbook: Workbook) -> dict[str, list[list[str]]]:
......@@ -243,6 +264,16 @@ def get_worksheet_for_path(path: list[str], defining_path_index: dict[str, list[
raise KeyError(f"Could not find defining worksheet for path: {path}")
def is_exploded_sheet(sheet: Worksheet) -> bool:
    """Return True if this is an "exploded" sheet.

    An exploded sheet is a sheet whose data entries are LIST valued properties of entries in
    another sheet.  A sheet is detected as exploded iff it has FOREIGN columns.
    """
    types_in_sheet = _get_column_types(sheet).values()
    return ColumnType.FOREIGN.name in types_in_sheet
def next_row_index(sheet: Worksheet) -> int:
"""Return the index for the next data row.
......@@ -251,14 +282,27 @@ def next_row_index(sheet: Worksheet) -> int:
return sheet.max_row
def p2s(path: List[str]) -> str:
    """Path to string: dot-separated.

    Example: ``["some", "path"] -> "some.path"``
    """
    return ".".join(path)
def read_or_dict(data: Union[dict, str, TextIO]) -> dict:
    """Return ``data`` as a dict, loading it from JSON first if necessary.

    Parameters
    ----------
    data: Union[dict, str, TextIO]
        Either a dict (returned as is, without copying), the name of a JSON file (opened with
        UTF-8 encoding), or an object with a ``read`` method (e.g. an open file) to read JSON from.

    Returns
    -------
    out: dict
        The given or the loaded dict.

    Raises
    ------
    ValueError
        If ``data`` has an unsupported type, or if the loaded JSON document is not an object.
    """
    if isinstance(data, dict):
        return data

    if isinstance(data, str):
        with open(data, encoding="utf-8") as infile:
            loaded = json.load(infile)
    elif hasattr(data, "read"):
        loaded = json.load(data)
    else:
        raise ValueError(f"I don't know how to handle the datatype of `data`: {type(data)}")

    # Explicit check instead of ``assert``: assertions are removed when running with ``-O``, which
    # would let lists, strings or numbers slip through as "dict".
    if not isinstance(loaded, dict):
        raise ValueError(f"Expected a JSON object at the top level, but found: {type(loaded)}")
    return loaded
def _get_column_types(sheet: Worksheet) -> OrderedDict:
......
......@@ -13,14 +13,16 @@ The data model in LinkAhead defines the types of records present in a LinkAhead
structure. This data model can also be represented in a JSON Schema, which defines the structure of
JSON files containing records pertaining to the data model.
For example, the following JSON can describe a "Person" Record:
For example, the following JSON can describe a single "Person" Record:
```JSON
{
"Person": {
"family_name": "Steve",
"given_name": "Stevie"
}
"Person": [
{
"family_name": "Steve",
"given_name": "Stevie"
}
]
}
```
......@@ -30,6 +32,43 @@ the storage of "Training" Records containing information about conducted trainin
particularly valuable for data import and export. One could generate web forms from the JSON Schema
or use it to export objects stored in LinkAhead as JSON.
### Note: Data models and data arrays ###
The schema as created by ``json_schema_exporter.recordtype_to_json_schema(...)`` is, from a broad
view, a dict with all the top level recordtypes (the recordtype names are the keys). While this is
appropriate for the generation of user input forms, data often consists of multiple entries of the
same type. XLSX files are no exception, users expect that they may enter multiple rows of data.
Since the data model schema does not match multiple data sets, there is a utility function which
creates a *data array* schema out of the *data model* schema: It basically replaces the top-level
entries of the data model by lists which may contain data.
A **short example** illustrates this well. Consider a *data model* schema which fits to this data
content:
```JSON
{
"Person": {
"name": "Charly"
}
}
```
Now the automatically generated *data array* schema would accept the following data:
```JSON
{
"Person": [
{
"name": "Charly"
},
{
"name": "Sam"
}
]
}
```
## From JSON to XLSX: Data Representation ##
The following describes how JSON files representing LinkAhead records are converted into XLSX files,
......@@ -67,33 +106,45 @@ Let's now consider these four cases in detail and with examples:
```JSON
{
"Training": {
"Training": [
{
"date": "2023-01-01",
"url": "www.indiscale.com",
"duration": 1.0,
"participants": 1,
"remote": false
}
},
{
"date": "2023-06-15",
"url": "www.indiscale.com/next",
"duration": 2.5,
"participants": null,
"remote": true
}
]
}
```
These entries will be represented in an XLSX sheet with the following content:
| date | url | duration | participants | remote |
|------------|-------------------|----------|--------------|--------|
| 2023-01-01 | www.indiscale.com | 1.0 | 1 | false |
| date | url | duration | participants | remote |
|------------|------------------------|----------|--------------|--------|
| 2023-01-01 | www.indiscale.com | 1.0 | 1 | false |
| 2023-06-15 | www.indiscale.com/next | 2.5 | | true |
### b. Property referencing a record ###
```JSON
{
"Training": {
"Training": [
{
"date": "2023-01-01",
"supervisor": {
"family_name": "Stevenson",
"given_name": "Stevie"
}
}
}
]
}
```
......@@ -110,10 +161,12 @@ through the content of hidden rows. (See below for the definition of hidden row
```JSON
{
"Training": {
"Training": [
{
"url": "www.indiscale.com",
"subjects": ["Math", "Physics"]
}
}
]
}
```
......@@ -130,13 +183,15 @@ the separator `;`, it is escaped with `\\`.
```JSON
{
"Training": {
"Training": [
{
"date": "2024-04-17",
"skills": [
"Planning",
"Evaluation"
]
}
}
]
}
```
......@@ -154,7 +209,8 @@ Note that this example assumes that the list of possible choices, as given in th
```JSON
{
"Training": {
"Training": [
{
"date": "2023-01-01",
"coach": [
{
......@@ -166,7 +222,8 @@ Note that this example assumes that the list of possible choices, as given in th
"given_name": "Min"
}
]
}
}
]
}
```
......@@ -281,6 +338,4 @@ These rows correspond to:
The current implementation still lacks the following:
- Lists of enum references are not yet implemented as columns where matching cell can simply be
ticked/crossed.
- Files handling is not implemented yet.
{
"Training": {
"Training": [{
"duration": 1.0,
"participants": 0.5
},
"Person": {
}],
"Person": [{
"family_name": "Auric",
"given_name": "Goldfinger",
"Organisation": "Federal Reserve"
}
}]
}
{
"Wrapper": {
"Wrapper": [{
"Results": [
{
"year": 2022,
......@@ -14,5 +14,5 @@
"name": "Basic Training",
"url": "www.example.com/training/basic"
}
}
}]
}
{
"Training": {
"Training": [{
"name": "Super Skill Training",
"date": "2024-04-17",
"skills": [
......@@ -7,5 +7,5 @@
"Evaluation"
],
"exam_types": []
}
}]
}
{
"Training": {
"Training": [{
"trainer": [],
"participant": [
{
......@@ -44,5 +44,5 @@
"date": "2024-03-21T14:12:00.000Z",
"url": "www.indiscale.com",
"name": "Example training with multiple organizations."
}
}]
}
{
"Training": {
"Training": [{
"date": "2023-01-01",
"url": "www.indiscale.com",
"coach": [
......@@ -23,10 +23,10 @@
"participants": 1,
"subjects": ["Math", "Physics"],
"remote": false
},
"Person": {
}],
"Person": [{
"family_name": "Steve",
"given_name": "Stevie",
"Organisation": "IMF"
}
}]
}
{
"Training": {
"Training": [{
"date": "2023-01-01",
"url": "char: >\u0001\u0002\u0003\u0004\u0005\u0006\u0007\u0008\u0009<",
"subjects": [
......@@ -9,10 +9,10 @@
">\u0020\u0021\u0022\u0023\u0024\u0025\u0026\u0027<",
">\u0028\u0029\u002a\u002b\u002c\u002d\u002e\u002f<"
]
},
"Person": {
}],
"Person": [{
"family_name": "Steve",
"given_name": "Stevie",
"Organisation": "IMF"
}
}]
}
{
"type": "object",
"properties": {
"Training": {
"type": "array",
"items": {
"type": "object",
"required": [],
"additionalProperties": false,
"title": "Training",
"properties": {
"name": {
"type": "string",
"description": "The name of the Record to be created"
},
"date": {
"description": "The date of the training.",
"anyOf": [
{
"type": "string",
"format": "date"
},
{
"type": "string",
"format": "date-time"
}
]
},
"url": {
"type": "string",
"description": "The URL"
},
"subjects": {
"type": "array",
"items": {
"type": "string"
}
},
"coach": {
"type": "array",
"items": {
"type": "object",
"required": [],
"additionalProperties": false,
"title": "coach",
"properties": {
"name": {
"type": "string",
"description": "The name of the Record to be created"
},
"family_name": {
"type": "string"
},
"given_name": {
"type": "string"
},
"Organisation": {
"enum": [
"Federal Reserve",
"IMF",
"ECB"
]
}
}
}
},
"supervisor": {
"type": "object",
"required": [],
"additionalProperties": false,
"title": "supervisor",
"properties": {
"name": {
"type": "string",
"description": "The name of the Record to be created"
},
"family_name": {
"type": "string"
},
"given_name": {
"type": "string"
},
"Organisation": {
"enum": [
"Federal Reserve",
"IMF",
"ECB"
]
}
}
},
"duration": {
"type": "number"
},
"participants": {
"type": "integer"
},
"remote": {
"type": "boolean"
},
"slides": {
"type": "string",
"format": "data-url"
}
},
"$schema": "https://json-schema.org/draft/2020-12/schema"
}
},
"Person": {
"type": "array",
"items": {
"type": "object",
"required": [],
"additionalProperties": false,
"title": "Person",
"properties": {
"name": {
"type": "string",
"description": "The name of the Record to be created"
},
"family_name": {
"type": "string"
},
"given_name": {
"type": "string"
},
"Organisation": {
"enum": [
"Federal Reserve",
"IMF",
"ECB"
]
}
},
"$schema": "https://json-schema.org/draft/2020-12/schema"
}
}
},
"required": [
"Training",
"Person"
],
"additionalProperties": false,
"$schema": "https://json-schema.org/draft/2020-12/schema"
}
#!/usr/bin/env python3
# encoding: utf-8
#
# This file is a part of the LinkAhead Project.
#
# Copyright (C) 2024 Indiscale GmbH <info@indiscale.com>
# Copyright (C) 2024 Henrik tom Wörden <h.tomwoerden@indiscale.com>
# Copyright (C) 2024 Daniel Hornung <d.hornung@indiscale.com>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as
......@@ -33,6 +33,13 @@ from caosadvancedtools.table_json_conversion.xlsx_utils import (
get_path_rows,
)
from caosadvancedtools.table_json_conversion import xlsx_utils
from caosadvancedtools.table_json_conversion.fill_xlsx import fill_template
from caosadvancedtools.table_json_conversion.xlsx_utils import (
get_row_type_column_index,
get_path_rows,
)
from .utils import compare_workbooks
......@@ -159,3 +166,10 @@ def test_errors():
known_good=rfp("data/simple_data.xlsx"),
schema=rfp("data/simple_schema.json"))
assert exc.value.message == "0.5 is not of type 'integer'"
def test_data_schema_generation():
    """The array schema generated from the model schema must equal the stored reference schema."""
    generated = xlsx_utils.array_schema_from_model_schema(
        xlsx_utils.read_or_dict(rfp("data/simple_schema.json")))
    reference = xlsx_utils.read_or_dict(rfp("data/simple_data_schema.json"))
    assert generated == reference
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment