From 2ca967384eeb55eac509b3e43f037905fbee9d2d Mon Sep 17 00:00:00 2001
From: Daniel <d.hornung@indiscale.com>
Date: Mon, 29 Apr 2024 16:49:08 +0200
Subject: [PATCH] WIP: Reading XLSX.

---
 .../table_json_conversion/convert.py          | 78 ++++++++++++++++---
 .../table_json_conversion/test_read_data.py   | 16 ++--
 2 files changed, 75 insertions(+), 19 deletions(-)

diff --git a/src/caosadvancedtools/table_json_conversion/convert.py b/src/caosadvancedtools/table_json_conversion/convert.py
index 0d7be9ef..fe0e20eb 100644
--- a/src/caosadvancedtools/table_json_conversion/convert.py
+++ b/src/caosadvancedtools/table_json_conversion/convert.py
@@ -22,19 +22,24 @@
 
 from __future__ import annotations
 
-from collections import OrderedDict
 from functools import reduce
 from operator import getitem
 from types import SimpleNamespace
-from typing import Any, BinaryIO, Optional, TextIO, Union
+from typing import Any, BinaryIO, Callable, TextIO, Union
 
-import caosadvancedtools.table_json_conversion.xlsx_utils as xlsx_utils
-
-from openpyxl import load_workbook, Workbook
+from jsonschema import validate, ValidationError
+from openpyxl import load_workbook
 from openpyxl.worksheet.worksheet import Worksheet
 
-from . import fill_xlsx
-from .fill_xlsx import read_or_dict
+from caosadvancedtools.table_json_conversion import xlsx_utils
+from caosadvancedtools.table_json_conversion.fill_xlsx import read_or_dict
+
+
+def _strict_bool(value: Any) -> bool:
+    """Return ``value`` unchanged if it is a ``bool``; raise ``TypeError`` otherwise."""
+    if not isinstance(value, bool):
+        raise TypeError(f"Not a good boolean: {repr(value)}")
+    return value
 
 
 class XLSXConverter:
@@ -44,6 +49,13 @@ For a detailed description of the required formatting of the XLSX files, see ``s
 documentation.
     """
 
+    PARSER: dict[str, Callable] = {  # Maps schema "type" names to converters for cell values.
+        "string": str,
+        "number": float,
+        "integer": int,
+        "boolean": _strict_bool,  # No truthiness coercion: anything but a real bool raises.
+    }
+
     def __init__(self, xlsx: Union[str, BinaryIO], schema: Union[dict, str, TextIO]):
         """
 Parameters
@@ -98,11 +110,11 @@ Look at ``xlsx_utils.get_path_position`` for the specification of the "proper na
             if parent_sheetname not in self._handled_sheets:
                 self._handle_sheet(self._workbook.get_sheet_by_name(parent_sheetname))
 
-        # We save single entries in lists, indexed by their foreign key contents.  Each entry
-        # consists of:
-        # - foreign: Dict with path -> value for the foreign columns
-        # - data: The actual data of this entry, a dict.
-        entries: dict[str, list[SimpleNamespace]] = {}
+        # # We save single entries in lists, indexed by their foreign key contents.  Each entry
+        # # consists of:
+        # # - foreign: Dict with path -> value for the foreign columns
+        # # - data: The actual data of this entry, a dict.
+        # entries: dict[str, list[SimpleNamespace]] = {}
 
         for row in sheet.iter_rows(values_only=True):
             # Skip non-data rows.
@@ -119,6 +131,7 @@ Look at ``xlsx_utils.get_path_position`` for the specification of the "proper na
                     continue
 
                 if col_idx in data_column_paths:
+                    value = self._validate_and_convert(value=value, path=data_column_paths[col_idx])
                     _set_in_nested(mydict=data, path=data_column_paths[col_idx], value=value,
                                    prefix=parent, skip=1)
                     continue
@@ -162,6 +175,47 @@ Look at ``xlsx_utils.get_path_position`` for the specification of the "proper na
         assert isinstance(current_object, dict)
         return current_object
 
+    def _validate_and_convert(self, value: Any, path: list[str]):
+        """Validate ``value`` against the sub schema at ``path`` and convert it to that type.
+
+``None`` is returned as is.  If the sub schema is an array, strings are split at semicolons
+(other values form a single element) and a list of converted elements is returned.  Otherwise
+jsonschema's ``ValidationError`` is raised (after printing it and ``path``) for invalid values.
+        """
+        if value is None:
+            return value
+        subschema = self._get_subschema(path)
+        # Array handling only if schema says it's an array.
+        if subschema.get("type") == "array":
+            parse = self.PARSER[subschema["items"]["type"]]
+            # A cell without semicolon (e.g. a number) is a list with a single element.
+            elements = value.split(";") if isinstance(value, str) else [value]
+            return [parse(element) for element in elements]
+        try:
+            validate(value, subschema)
+        except ValidationError as verr:
+            print(verr)
+            print(path)
+            raise
+
+        # Finally: convert to target type
+        return self.PARSER[subschema.get("type", "string")](value)
+
+    def _get_subschema(self, path: list[str], schema: Union[dict, list] = None) -> dict:
+        """Return the sub schema at ``path`` below ``schema`` (default: ``self._schema``)."""
+        if schema is None:  # No schema given: start at the converter's own schema.
+            schema = self._schema
+            assert schema is not None
+        assert isinstance(schema, dict)
+        if path:  # Consume one path component per recursion step.
+            if schema["type"] == "object":
+                next_schema = schema["properties"][path[0]]
+                return self._get_subschema(path=path[1:], schema=next_schema)
+            if schema["type"] == "array":  # Descend into the properties of the array's items.
+                next_schema = schema["items"]["properties"][path[0]]
+                return self._get_subschema(path=path[1:], schema=next_schema)
+        return schema  # Path is used up, or the schema type is neither "object" nor "array".
+
 
 def _group_foreign_paths(foreign: list[list], common: list[str]) -> list[SimpleNamespace]:
     """Group the foreign keys by their base paths.
diff --git a/unittests/table_json_conversion/test_read_data.py b/unittests/table_json_conversion/test_read_data.py
index 24700b55..3fbdf3dd 100644
--- a/unittests/table_json_conversion/test_read_data.py
+++ b/unittests/table_json_conversion/test_read_data.py
@@ -83,8 +83,10 @@ Raise an assertion exception if they are not equal."""
                 el2 = json2[key]
                 assert type(el1) is type(el2), f"Type mismatch, path: {this_path}"
                 if isinstance(el1, (dict, list)):
+                    # Iterables: Recursion
                     _assert_equal_jsons(el1, el2, allow_none=allow_none, allow_empty=allow_empty,
                                         path=this_path)
+                    continue
                 assert el1 == el2, f"Values at path {this_path} are not equal:\n{el1},\n{el2}"
                 continue
             # Case 2: only one exists
@@ -93,15 +95,15 @@ Raise an assertion exception if they are not equal."""
                 f"Element at path {this_path} is None or empty in one json and does not exist in "
                 "the other.")
         return
-
     assert isinstance(json1, list) and isinstance(json2, list), f"Is not a list, path: {path}"
     assert len(json1) == len(json2), f"Lists must have equal length, path: {path}"
     for idx, (el1, el2) in enumerate(zip(json1, json2)):
         this_path = path + [idx]
-        assert isinstance(el1, dict) and isinstance(el2, dict), (
-            f"List elements must be dicts: path: {this_path}")
-        _assert_equal_jsons(el1, el2, allow_none=allow_none, allow_empty=allow_empty,
-                            path=this_path)
+        if isinstance(el1, dict):
+            _assert_equal_jsons(el1, el2, allow_none=allow_none, allow_empty=allow_empty,
+                                path=this_path)
+        else:
+            assert el1 == el2
 
 
 def test_conversions():
@@ -110,9 +112,9 @@ def test_conversions():
     # result = convert.to_dict(xlsx=rfp("data/multiple_refs_data.xlsx"),
     #                          schema=rfp("data/multiple_refs_schema.json"))
     # expected = json.load(open(rfp("data/multiple_refs_data.json")))
-    breakpoint()
+    # breakpoint()
     _assert_equal_jsons(result, expected)
-    breakpoint()
+    # breakpoint()
     # conv = XLSXConverter(schema=rfp("data/simple_schema.json"))
     # result = conv.to_dict(rfp("data/simple_template.xlsx"))
 
-- 
GitLab