From 734b70b4738e41775fca9f601b7dcf4b9a7f69e8 Mon Sep 17 00:00:00 2001
From: Daniel <d.hornung@indiscale.com>
Date: Fri, 8 Mar 2024 11:05:19 +0100
Subject: [PATCH] WIP: Filling XLSX: Schema validation and tests.

---
 .../table_json_conversion/fill_xlsx.py        | 38 ++++++++++++++-----
 .../data/multiple_refs_schema.json            |  7 +---
 .../data/simple_schema.json                   | 12 ++++--
 .../table_json_conversion/test_fill_xlsx.py   | 14 ++++---
 4 files changed, 48 insertions(+), 23 deletions(-)

diff --git a/src/caosadvancedtools/table_json_conversion/fill_xlsx.py b/src/caosadvancedtools/table_json_conversion/fill_xlsx.py
index 5a620d67..977b4e90 100644
--- a/src/caosadvancedtools/table_json_conversion/fill_xlsx.py
+++ b/src/caosadvancedtools/table_json_conversion/fill_xlsx.py
@@ -27,6 +27,7 @@ from collections import OrderedDict
 from types import SimpleNamespace
 from typing import Any, Dict, List, Optional, Union, TextIO
 
+from jsonschema import FormatChecker, validate
 from openpyxl import load_workbook, Workbook
 from openpyxl.worksheet.worksheet import Worksheet
 
@@ -122,6 +123,21 @@ def _next_row_index(sheet: Worksheet) -> int:
     return sheet.max_row
 
 
+def _read_or_dict(data: Union[dict, str, TextIO]) -> dict:
+    """If data is a json file name or input stream, read data from there."""
+    if isinstance(data, dict):
+        pass  # already a dict: returned unchanged
+    elif isinstance(data, str):
+        with open(data, encoding="utf-8") as infile:  # a str is treated as a JSON file path
+            data = json.load(infile)
+    elif hasattr(data, "read"):
+        data = json.load(data)  # any object with a ``read`` attribute is parsed as a JSON stream
+    else:
+        raise ValueError(f"I don't know how to handle the datatype of `data`: {type(data)}")
+    assert isinstance(data, dict)  # parsed JSON must be an object at top level (e.g. not a list)
+    return data
+
+
 class TemplateFiller:
     """Class to fill XLSX templates.  Has an index for all relevant columns."""
 
@@ -319,7 +335,8 @@ out: union[dict, None]
         return None
 
 
-def fill_template(data: Union[dict, str, TextIO], template: str, result: str) -> None:
+def fill_template(data: Union[dict, str, TextIO], template: str, result: str,
+                  validation_schema: Union[dict, str, TextIO] = None) -> None:
     """Insert json data into an xlsx file, according to a template.
 
 This function fills the json data into the template stored at ``template`` and stores the result as
@@ -333,18 +350,19 @@ template: str
   Path to the XLSX template.
 result: str
   Path for the result XLSX.
+validation_schema: dict or str or TextIO, optional
+  If given, validate the data against this schema first.  This raises an exception if the validation
+  fails.
 """
-    if isinstance(data, dict):
-        pass
-    elif isinstance(data, str):
-        with open(data, encoding="utf-8") as infile:
-            data = json.load(infile)
-    elif hasattr(data, "read"):
-        data = json.load(data)
-    else:
-        raise ValueError(f"I don't know how to handle the datatype of `data`: {type(data)}")
+    data = _read_or_dict(data)
     assert isinstance(data, dict)
 
+    # Validation
+    if validation_schema is not None:
+        validation_schema = _read_or_dict(validation_schema)
+        validate(data, validation_schema, format_checker=FormatChecker())
+
+    # Filling the data
     result_wb = load_workbook(template)
     template_filler = TemplateFiller(result_wb)
     template_filler.fill_data(data=data)
diff --git a/unittests/table_json_conversion/data/multiple_refs_schema.json b/unittests/table_json_conversion/data/multiple_refs_schema.json
index 7acf5e0d..2adec8de 100644
--- a/unittests/table_json_conversion/data/multiple_refs_schema.json
+++ b/unittests/table_json_conversion/data/multiple_refs_schema.json
@@ -204,10 +204,7 @@
       "$schema": "https://json-schema.org/draft/2020-12/schema"
     }
   },
-  "required": [
-    "Training",
-    "Person"
-  ],
+  "required": [],
   "additionalProperties": false,
   "$schema": "https://json-schema.org/draft/2020-12/schema"
-}
\ No newline at end of file
+}
diff --git a/unittests/table_json_conversion/data/simple_schema.json b/unittests/table_json_conversion/data/simple_schema.json
index f18fd6af..01a732d6 100644
--- a/unittests/table_json_conversion/data/simple_schema.json
+++ b/unittests/table_json_conversion/data/simple_schema.json
@@ -54,7 +54,9 @@
               },
               "Organisation": {
                 "enum": [
-                  "Federal Reserve"
+                  "Federal Reserve",
+                  "IMF",
+                  "ECB"
                 ]
               }
             }
@@ -78,7 +80,9 @@
             },
             "Organisation": {
               "enum": [
-                "Federal Reserve"
+                "Federal Reserve",
+                "IMF",
+                "ECB"
               ]
             }
           }
@@ -117,7 +121,9 @@
         },
         "Organisation": {
           "enum": [
-            "Federal Reserve"
+            "Federal Reserve",
+            "IMF",
+            "ECB"
           ]
         }
       },
diff --git a/unittests/table_json_conversion/test_fill_xlsx.py b/unittests/table_json_conversion/test_fill_xlsx.py
index fcc9ad23..0353eaa3 100644
--- a/unittests/table_json_conversion/test_fill_xlsx.py
+++ b/unittests/table_json_conversion/test_fill_xlsx.py
@@ -38,12 +38,13 @@ def rfp(*pathcomponents):
 
 
 def fill_and_compare(json_file: str, template_file: str, known_good: str,
-                     custom_output: str = None):
+                     schema: str = None, custom_output: str = None):
     """Fill the data into a template and compare to a known good.
 
 Parameters:
 -----------
-
+schema: str, optional
+  Path to the JSON schema file to validate the data against.
 custom_output: str, optional
   If given, write to this file and drop into an IPython shell.  For development only.
     """
@@ -52,7 +53,8 @@ custom_output: str, optional
         assert not os.path.exists(outfile)
         if custom_output is not None:
             outfile = custom_output
-        fill_template(data=json_file, template=template_file, result=outfile)
+        fill_template(data=json_file, template=template_file, result=outfile,
+                      validation_schema=schema)
         assert os.path.exists(outfile)
         generated = load_workbook(outfile)  # workbook can be read
     known_good_wb = load_workbook(known_good)
@@ -68,7 +70,9 @@ def test_detect():
 def test_fill_xlsx():
     fill_and_compare(json_file=rfp("data/simple_data.json"),
                      template_file=rfp("data/simple_template.xlsx"),
-                     known_good=rfp("data/simple_data.xlsx"))
+                     known_good=rfp("data/simple_data.xlsx"),
+                     schema=rfp("data/simple_schema.json"))
     fill_and_compare(json_file=rfp("data/multiple_refs_data.json"),
                      template_file=rfp("data/multiple_refs_template.xlsx"),
-                     known_good=rfp("data/multiple_refs_data.xlsx"))
+                     known_good=rfp("data/multiple_refs_data.xlsx"),
+                     schema=rfp("data/multiple_refs_schema.json"))
-- 
GitLab