Merge branch 'f-table-json-data-schema' into f-convert-xlsx-to-json

de1b92a7 · Daniel Hornung · f85fcabe · 67d94135 · de1b92a7 · de1b92a7
Verified Commit de1b92a7 authored 1 year ago by Daniel Hornung
--- a/src/caosadvancedtools/table_json_conversion/fill_xlsx.py
+++ b/src/caosadvancedtools/table_json_conversion/fill_xlsx.py
@@ -33,6 +33,7 @@ from openpyxl import load_workbook, Workbook
 from openpyxl.cell.cell import ILLEGAL_CHARACTERS_RE

 from .xlsx_utils import (
+    array_schema_from_model_schema,
    get_foreign_key_columns,
    get_row_type_column_index,
    is_exploded_sheet,
@@ -332,7 +333,7 @@ validation_schema: dict, optional

    # Validation
    if validation_schema is not None:
-        validation_schema = read_or_dict(validation_schema)
+        validation_schema = array_schema_from_model_schema(read_or_dict(validation_schema))
        try:
            validate(data, validation_schema, format_checker=FormatChecker())
        except ValidationError as verr:

--- a/src/caosadvancedtools/table_json_conversion/xlsx_utils.py
+++ b/src/caosadvancedtools/table_json_conversion/xlsx_utils.py
@@ -18,13 +18,25 @@
 # You should have received a copy of the GNU Affero General Public License
 # along with this program. If not, see <https://www.gnu.org/licenses/>.

-"""General utilities to work with XLSX files with (hidden) column and row annotations and typing."""
+"""General utilities to work with XLSX files with (hidden) column and row annotations and typing.
+
+The most prominent functions are:
+
+- ``p2s``: Path to string: ``["some", "path"] -> "some.path"``
+- ``read_or_dict``: Load JSON object from path, file or dict.
+
+This module also defines these enums:
+
+- ColumnType
+- RowType
+"""

 from __future__ import annotations

 import json

 from collections import OrderedDict
+from copy import deepcopy
 from enum import Enum
 from types import SimpleNamespace
 from typing import Dict, List, TextIO, Union
@@ -49,27 +61,36 @@ class RowType(Enum):
    IGNORE = 3


-def p2s(path: List[str]) -> str:
-    """Path to string: dot-separated.
-    """
-    return ".".join(path)
+def array_schema_from_model_schema(model_schema: dict) -> dict:
+    """Convert a *data model* schema to a *data array* schema.

+Practically, this means that the top level properties are converted into lists.  In a simplified
+notation, this can be expressed as:

-def read_or_dict(data: Union[dict, str, TextIO]) -> dict:
-    """If data is a json file name or input stream, read data from there.
-If it is a dict already, just return it."""
-    if isinstance(data, dict):
-        return data
+``array_schema = { elem: [elem typed data...] for elem in model_schema }``

-    if isinstance(data, str):
-        with open(data, encoding="utf-8") as infile:
-            data = json.load(infile)
-    elif hasattr(data, "read"):
-        data = json.load(data)
-    else:
-        raise ValueError(f"I don't know how to handle the datatype of `data`: {type(data)}")
-    assert isinstance(data, dict)
-    return data
+Parameters
+----------
+model_schema: dict
+  The schema description of the data model.  Must be a json schema *object*, with a number of
+  *object* typed properties.
+
+Returns
+-------
+array_schema: dict
+  A corresponding json schema, where the properties are arrays with the types of the input's
+  top-level properties.
+    """
+    assert model_schema["type"] == "object"
+    result = deepcopy(model_schema)
+    for name, prop in result["properties"].items():
+        assert prop["type"] == "object"
+        new_prop = {
+            "type": "array",
+            "items": prop
+        }
+        result["properties"][name] = new_prop
+    return result


 def get_defining_paths(workbook: Workbook) -> dict[str, list[list[str]]]:
@@ -243,6 +264,16 @@ def get_worksheet_for_path(path: list[str], defining_path_index: dict[str, list[
    raise KeyError(f"Could not find defining worksheet for path: {path}")


+def is_exploded_sheet(sheet: Worksheet) -> bool:
+    """Return True if this is an "exploded" sheet.
+
+    An exploded sheet is a sheet whose data entries are LIST valued properties of entries in another
+    sheet.  A sheet is detected as exploded iff it has FOREIGN columns.
+    """
+    column_types = _get_column_types(sheet)
+    return ColumnType.FOREIGN.name in column_types.values()
+
+
 def next_row_index(sheet: Worksheet) -> int:
    """Return the index for the next data row.

@@ -251,14 +282,27 @@ def next_row_index(sheet: Worksheet) -> int:
    return sheet.max_row


-def is_exploded_sheet(sheet: Worksheet) -> bool:
-    """Return True if this is a an "exploded" sheet.
-
-    An exploded sheet is a sheet whose data entries are LIST valued properties of entries in another
-    sheet.  A sheet is detected as exploded iff it has FOREIGN columns.
+def p2s(path: List[str]) -> str:
+    """Path to string: dot-separated.
    """
-    column_types = _get_column_types(sheet)
-    return ColumnType.FOREIGN.name in column_types.values()
+    return ".".join(path)
+
+
+def read_or_dict(data: Union[dict, str, TextIO]) -> dict:
+    """If data is a json file name or input stream, read data from there.
+If it is a dict already, just return it."""
+    if isinstance(data, dict):
+        return data
+
+    if isinstance(data, str):
+        with open(data, encoding="utf-8") as infile:
+            data = json.load(infile)
+    elif hasattr(data, "read"):
+        data = json.load(data)
+    else:
+        raise ValueError(f"I don't know how to handle the datatype of `data`: {type(data)}")
+    assert isinstance(data, dict)
+    return data


 def _get_column_types(sheet: Worksheet) -> OrderedDict:

--- a/src/doc/table-json-conversion/specs.md
+++ b/src/doc/table-json-conversion/specs.md
@@ -13,14 +13,16 @@ The data model in LinkAhead defines the types of records present in a LinkAhead
 structure. This data model can also be represented in a JSON Schema, which defines the structure of
 JSON files containing records pertaining to the data model.

-For example, the following JSON can describe a "Person" Record:
+For example, the following JSON can describe a single "Person" Record:

 ```JSON
 {
-    "Person": {
+    "Person": [
+        {
            "family_name": "Steve",
            "given_name": "Stevie"
        }
+    ]
 }
 ```

@@ -30,6 +32,43 @@ the storage of "Training" Records containing information about conducted trainin
 particularly valuable for data import and export. One could generate web forms from the JSON Schema
 or use it to export objects stored in LinkAhead as JSON.

+### Note: Data models and data arrays ###
+
+The schema as created by ``json_schema_exporter.recordtype_to_json_schema(...)`` is, from a broad
+view, a dict with all the top level recordtypes (the recordtype names are the keys).  While this is
+appropriate for the generation of user input forms, data often consists of multiple entries of the
+same type.  XLSX files are no exception: users expect that they may enter multiple rows of data.
+
+Since the data model schema does not match multiple data sets, there is a utility function which
+creates a *data array* schema out of the *data model* schema: It basically replaces the top-level
+entries of the data model by lists which may contain data.
+
+A **short example** illustrates this well.  Consider a *data model* schema which matches this data
+content:
+
+```JSON
+{
+  "Person": {
+    "name": "Charly"
+  }
+}
+```
+
+Now the automatically generated *data array* schema would accept the following data:
+
+```JSON
+{
+  "Person": [
+    {
+      "name": "Charly"
+    },
+    {
+      "name": "Sam"
+    }
+  ]
+}
+```
+
 ## From JSON to XLSX: Data Representation ##

 The following describes how JSON files representing LinkAhead records are converted into XLSX files,
@@ -67,33 +106,45 @@ Let's now consider these four cases in detail and with examples:

 ```JSON
 {
-    "Training": {
+    "Training": [
+      {
        "date": "2023-01-01",
        "url": "www.indiscale.com",
        "duration": 1.0,
        "participants": 1,
        "remote": false
+      },
+      {
+        "date": "2023-06-15",
+        "url": "www.indiscale.com/next",
+        "duration": 2.5,
+        "participants": null,
+        "remote": true
      }
+    ]
 }
 ```

 These entries will be represented in an XLSX sheet with the following content:

 | date       | url                    | duration | participants | remote |
-|------------|-------------------|----------|--------------|--------|
+|------------|------------------------|----------|--------------|--------|
 | 2023-01-01 | www.indiscale.com      | 1.0      | 1            | false  |
+| 2023-06-15 | www.indiscale.com/next | 2.5      |              | true   |

 ### b. Property referencing a record ###

 ```JSON
 {
-    "Training": {
+    "Training": [
+      {
        "date": "2023-01-01",
        "supervisor": {
            "family_name": "Stevenson",
            "given_name": "Stevie",
        }
      }
+    ]
 }
 ```

@@ -110,10 +161,12 @@ through the content of hidden rows.  (See below for the definition of hidden row

 ```JSON
 {
-    "Training": {
+    "Training": [
+      {
        "url": "www.indiscale.com",
        "subjects": ["Math", "Physics"],
      }
+    ]
 }
 ```

@@ -130,13 +183,15 @@ the separator `;`, it is escaped with `\\`.

 ```JSON
 {
-    "Training": {
+    "Training": [
+      {
        "date": "2024-04-17",
        "skills": [
              "Planning",
              "Evaluation"
        ]
      }
+    ]
 }
 ```

@@ -154,7 +209,8 @@ Note that this example assumes that the list of possible choices, as given in th

 ```JSON
 {
-    "Training": {
+    "Training": [
+      {
        "date": "2023-01-01",
        "coach": [
            {
@@ -167,6 +223,7 @@ Note that this example assumes that the list of possible choices, as given in th
            }
        ]
      }
+    ]
 }
 ```

@@ -281,6 +338,4 @@ These rows correspond to:

 The current implementation still lacks the following:

- Lists of enum references are not yet implemented as columns where matching cell can simply be
-  ticked/crossed.
 - Files handling is not implemented yet.
--- a/unittests/table_json_conversion/data/error_simple_data.json
+++ b/unittests/table_json_conversion/data/error_simple_data.json
 {
-  "Training": {
+  "Training": [{
    "duration": 1.0,
    "participants": 0.5
-  },
-  "Person": {
+  }],
+  "Person": [{
    "family_name": "Auric",
    "given_name": "Goldfinger",
    "Organisation": "Federal Reserve"
-  }
+  }]
 }
--- a/unittests/table_json_conversion/data/indirect_data.json
+++ b/unittests/table_json_conversion/data/indirect_data.json
 {
-  "Wrapper": {
+  "Wrapper": [{
    "Results": [
      {
        "year": 2022,
@@ -14,5 +14,5 @@
      "name": "Basic Training",
      "url": "www.example.com/training/basic"
    }
-  }
+  }]
 }
--- a/unittests/table_json_conversion/data/multiple_choice_data.json
+++ b/unittests/table_json_conversion/data/multiple_choice_data.json
 {
-  "Training": {
+  "Training": [{
    "name": "Super Skill Training",
    "date": "2024-04-17",
    "skills": [
@@ -7,5 +7,5 @@
      "Evaluation"
    ],
    "exam_types": []
-  }
+  }]
 }
--- a/unittests/table_json_conversion/data/multiple_refs_data.json
+++ b/unittests/table_json_conversion/data/multiple_refs_data.json
 {
-  "Training": {
+  "Training": [{
    "trainer": [],
    "participant": [
      {
@@ -44,5 +44,5 @@
    "date": "2024-03-21T14:12:00.000Z",
    "url": "www.indiscale.com",
    "name": "Example training with multiple organizations."
-  }
+  }]
 }
--- a/unittests/table_json_conversion/data/simple_data.json
+++ b/unittests/table_json_conversion/data/simple_data.json
 {
-  "Training": {
+  "Training": [{
    "date": "2023-01-01",
    "url": "www.indiscale.com",
    "coach": [
@@ -23,10 +23,10 @@
    "participants": 1,
    "subjects": ["Math", "Physics"],
    "remote": false
-  },
-  "Person": {
+  }],
+  "Person": [{
    "family_name": "Steve",
    "given_name": "Stevie",
    "Organisation": "IMF"
-  }
+  }]
 }
--- a/unittests/table_json_conversion/data/simple_data_ascii_chars.json
+++ b/unittests/table_json_conversion/data/simple_data_ascii_chars.json
 {
-  "Training": {
+  "Training": [{
    "date": "2023-01-01",
    "url": "char: >\u0001\u0002\u0003\u0004\u0005\u0006\u0007\u0008\u0009<",
    "subjects": [
@@ -9,10 +9,10 @@
      ">\u0020\u0021\u0022\u0023\u0024\u0025\u0026\u0027<",
      ">\u0028\u0029\u002a\u002b\u002c\u002d\u002e\u002f<"
    ]
-  },
-  "Person": {
+  }],
+  "Person": [{
    "family_name": "Steve",
    "given_name": "Stevie",
    "Organisation": "IMF"
-  }
+  }]
 }
--- a/unittests/table_json_conversion/data/simple_data_schema.json
+++ b/unittests/table_json_conversion/data/simple_data_schema.json
+{
+  "type": "object",
+  "properties": {
+    "Training": {
+      "type": "array",
+      "items": {
+        "type": "object",
+        "required": [],
+        "additionalProperties": false,
+        "title": "Training",
+        "properties": {
+          "name": {
+            "type": "string",
+            "description": "The name of the Record to be created"
+          },
+          "date": {
+            "description": "The date of the training.",
+            "anyOf": [
+              {
+                "type": "string",
+                "format": "date"
+              },
+              {
+                "type": "string",
+                "format": "date-time"
+              }
+            ]
+          },
+          "url": {
+            "type": "string",
+            "description": "The URL"
+          },
+          "subjects": {
+            "type": "array",
+            "items": {
+              "type": "string"
+            }
+          },
+          "coach": {
+            "type": "array",
+            "items": {
+              "type": "object",
+              "required": [],
+              "additionalProperties": false,
+              "title": "coach",
+              "properties": {
+                "name": {
+                  "type": "string",
+                  "description": "The name of the Record to be created"
+                },
+                "family_name": {
+                  "type": "string"
+                },
+                "given_name": {
+                  "type": "string"
+                },
+                "Organisation": {
+                  "enum": [
+                    "Federal Reserve",
+                    "IMF",
+                    "ECB"
+                  ]
+                }
+              }
+            }
+          },
+          "supervisor": {
+            "type": "object",
+            "required": [],
+            "additionalProperties": false,
+            "title": "supervisor",
+            "properties": {
+              "name": {
+                "type": "string",
+                "description": "The name of the Record to be created"
+              },
+              "family_name": {
+                "type": "string"
+              },
+              "given_name": {
+                "type": "string"
+              },
+              "Organisation": {
+                "enum": [
+                  "Federal Reserve",
+                  "IMF",
+                  "ECB"
+                ]
+              }
+            }
+          },
+          "duration": {
+            "type": "number"
+          },
+          "participants": {
+            "type": "integer"
+          },
+          "remote": {
+            "type": "boolean"
+          },
+          "slides": {
+            "type": "string",
+            "format": "data-url"
+          }
+        },
+        "$schema": "https://json-schema.org/draft/2020-12/schema"
+      }
+    },
+    "Person": {
+      "type": "array",
+      "items": {
+        "type": "object",
+        "required": [],
+        "additionalProperties": false,
+        "title": "Person",
+        "properties": {
+          "name": {
+            "type": "string",
+            "description": "The name of the Record to be created"
+          },
+          "family_name": {
+            "type": "string"
+          },
+          "given_name": {
+            "type": "string"
+          },
+          "Organisation": {
+            "enum": [
+              "Federal Reserve",
+              "IMF",
+              "ECB"
+            ]
+          }
+        },
+        "$schema": "https://json-schema.org/draft/2020-12/schema"
+      }
+    }
+  },
+  "required": [
+    "Training",
+    "Person"
+  ],
+  "additionalProperties": false,
+  "$schema": "https://json-schema.org/draft/2020-12/schema"
+}
--- a/unittests/table_json_conversion/test_fill_xlsx.py
+++ b/unittests/table_json_conversion/test_fill_xlsx.py
-#!/usr/bin/env python3
 # encoding: utf-8
 #
 # This file is a part of the LinkAhead Project.
 #
 # Copyright (C) 2024 Indiscale GmbH <info@indiscale.com>
 # Copyright (C) 2024 Henrik tom Wörden <h.tomwoerden@indiscale.com>
+# Copyright (C) 2024 Daniel Hornung <d.hornung@indiscale.com>
 #
 # This program is free software: you can redistribute it and/or modify
 # it under the terms of the GNU Affero General Public License as
@@ -33,6 +33,13 @@ from caosadvancedtools.table_json_conversion.xlsx_utils import (
    get_path_rows,
 )

+from caosadvancedtools.table_json_conversion import xlsx_utils
+from caosadvancedtools.table_json_conversion.fill_xlsx import fill_template
+from caosadvancedtools.table_json_conversion.xlsx_utils import (
+    get_row_type_column_index,
+    get_path_rows,
+)
+
 from .utils import compare_workbooks


@@ -159,3 +166,10 @@ def test_errors():
                         known_good=rfp("data/simple_data.xlsx"),
                         schema=rfp("data/simple_schema.json"))
    assert exc.value.message == "0.5 is not of type 'integer'"
+
+
+def test_data_schema_generation():
+    model_schema = xlsx_utils.read_or_dict(rfp("data/simple_schema.json"))
+    array_schema = xlsx_utils.array_schema_from_model_schema(model_schema)
+    expected = xlsx_utils.read_or_dict(rfp("data/simple_data_schema.json"))
+    assert array_schema == expected