From 9ed052b100a0ab949e30846cf4b6a0862862b6ac Mon Sep 17 00:00:00 2001
From: Daniel <d.hornung@indiscale.com>
Date: Thu, 2 May 2024 10:46:39 +0200
Subject: [PATCH] ENH: XLSX reader works now, including multiple choice columns.

Still TODO:

- Early: Foreign column checking
- Result validation with schema
- Warnings: Additional or missing columns
---
 .../table_json_conversion/convert.py          | 66 +++++++++++++++----
 .../table_json_conversion/xlsx_utils.py       | 48 +++++++++++++-
 .../table_json_conversion/test_read_data.py   |  8 ++-
 unittests/table_json_conversion/utils.py      |  2 +-
 4 files changed, 106 insertions(+), 18 deletions(-)

diff --git a/src/caosadvancedtools/table_json_conversion/convert.py b/src/caosadvancedtools/table_json_conversion/convert.py
index 1ba31324..02c46bd1 100644
--- a/src/caosadvancedtools/table_json_conversion/convert.py
+++ b/src/caosadvancedtools/table_json_conversion/convert.py
@@ -131,9 +131,22 @@ Look at ``xlsx_utils.get_path_position`` for the specification of the "proper na
                     continue
 
                 if col_idx in data_column_paths:
-                    value = self._validate_and_convert(value=value, path=data_column_paths[col_idx])
-                    _set_in_nested(mydict=data, path=data_column_paths[col_idx], value=value,
-                                   prefix=parent, skip=1)
+                    path = data_column_paths[col_idx]
+                    if self._is_multiple_choice(path):
+                        real_value = path.pop()  # Last component is the enum value, insert above
+                        # set up list
+                        try:
+                            _set_in_nested(mydict=data, path=path, value=[], prefix=parent, skip=1)
+                        except ValueError as err:
+                            if not str(err).startswith("There is already some value at"):
+                                raise
+                        if not xlsx_utils.parse_multiple_choice(value):
+                            continue
+                        _set_in_nested(mydict=data, path=path, value=real_value, prefix=parent,
+                                       skip=1, append_to_list=True)
+                    else:
+                        value = self._validate_and_convert(value, path)
+                        _set_in_nested(mydict=data, path=path, value=value, prefix=parent, skip=1)
                     continue
                 continue
 
@@ -150,6 +163,17 @@ Look at ``xlsx_utils.get_path_position`` for the specification of the "proper na
         self._handled_sheets.add(sheet.title)
         # print(f"Added sheet: {sheet.title}")
 
+    def _is_multiple_choice(self, path: list[str]) -> bool:
+        """Test if ``path`` points into a multiple choice array (unique items, enum values)."""
+        if not path:
+            return False
+        subschema = self._get_subschema(path[:-1])  # Schema for ``path`` minus its last component
+        if (subschema["type"] == "array"
+                and subschema.get("uniqueItems") is True
+                and "enum" in subschema["items"]):
+            return True
+        return False
+
     def _get_parent_dict(self, parent_path: list[str], foreign: list[list]) -> dict:
         """Return the dict into which values can be inserted.
 
@@ -211,7 +235,10 @@ This includes:
                 next_schema = schema["properties"][path[0]]
                 return self._get_subschema(path=path[1:], schema=next_schema)
             if schema["type"] == "array":
-                next_schema = schema["items"]["properties"][path[0]]
+                items = schema["items"]
+                if "enum" in items:
+                    return schema
+                next_schema = items["properties"][path[0]]
                 return self._get_subschema(path=path[1:], schema=next_schema)
         return schema
 
@@ -261,8 +288,7 @@ out: list[dict[str, list[list]]]
             results[stringpath].definitions.append(definition)
 
     # Then sort by stringpath and calculate subpath.
-    stringpaths = list(results.keys())
-    stringpaths.sort()
+    stringpaths = sorted(results.keys())
 
     resultlist = []
     last_level = 0
@@ -280,9 +306,9 @@ out: list[dict[str, list[list]]]
     return resultlist
 
 
-# pylint: disable-next=dangerous-default-value
+# pylint: disable-next=dangerous-default-value,too-many-arguments
 def _set_in_nested(mydict: dict, path: list, value: Any, prefix: list = [], skip: int = 0,
-                   overwrite: bool = False) -> dict:
+                   overwrite: bool = False, append_to_list: bool = False) -> dict:
     """Set a value in a nested dict.
 
 Parameters
@@ -301,6 +327,10 @@ skip: int = 0
 overwrite: bool = False
   If True, allow overwriting existing content. Otherwise, attempting to overwrite existing values
   leads to an exception.
+append_to_list: bool = False
+  If True, assume that the element at ``path`` is a list and append the value to it.  If the list
+  does not exist, create it.  If there is a non-list at ``path`` already, overwrite it with a new
+  list, if ``overwrite`` is True, otherwise raise a ValueError.
 
 Returns
 -------
@@ -327,11 +357,21 @@ mydict: dict
                 raise ValueError(f"There is already some value at {path}")
         tmp_dict = tmp_dict[key]
     key = path.pop()
-    if key in tmp_dict and not overwrite:
-        raise ValueError(f"There is already some value at [{key}]")
-    if key not in tmp_dict:
-        tmp_dict[key] = {}
-    tmp_dict[key] = value
+    if append_to_list:
+        if key not in tmp_dict:
+            tmp_dict[key] = []
+        if not isinstance(tmp_dict[key], list):
+            if overwrite:
+                tmp_dict[key] = []
+            else:
+                raise ValueError(f"There is already some non-list value at [{key}]")
+        tmp_dict[key].append(value)
+    else:
+        if key in tmp_dict and not overwrite:
+            raise ValueError(f"There is already some value at [{key}]")
+        if key not in tmp_dict:
+            tmp_dict[key] = {}
+        tmp_dict[key] = value
     return mydict
 
 
diff --git a/src/caosadvancedtools/table_json_conversion/xlsx_utils.py b/src/caosadvancedtools/table_json_conversion/xlsx_utils.py
index 465e7c90..32ed8552 100644
--- a/src/caosadvancedtools/table_json_conversion/xlsx_utils.py
+++ b/src/caosadvancedtools/table_json_conversion/xlsx_utils.py
@@ -39,12 +39,16 @@ from collections import OrderedDict
 from copy import deepcopy
 from enum import Enum
 from types import SimpleNamespace
-from typing import TextIO, Union
+from typing import Any, TextIO, Union
 
 from openpyxl import Workbook
 from openpyxl.worksheet.worksheet import Worksheet
 
 
+TRUTHY = {"true", "wahr", "x", "√", "yes", "ja", "y", "j"}  # For multiple choice columns
+FALSY = {"false", "falsch", "-", "no", "nein", "n"}  # For multiple choice columns
+
+
 class ColumnType(Enum):
     """ column types enum """
     SCALAR = 1
@@ -288,6 +292,48 @@ def p2s(path: list[str]) -> str:
     return ".".join(path)
 
 
+def parse_multiple_choice(value: Any) -> bool:
+    """Interpret ``value`` as the content of a multiple choice cell.
+
+*Truthy* values are:
+- The boolean ``True``.
+- The number ``1`` (compared with ``==``, so ``1.0`` counts as well).
+- The (case-insensitive) strings ``true``, ``wahr``, ``x``, ``√``, ``yes``, ``ja``, ``y``, ``j``.
+
+*Falsy* values are:
+- The boolean ``False``.
+- ``None``, the empty string, the empty list and the empty dict.
+- The number ``0`` (compared with ``==``, so ``0.0`` counts as well).
+- The (case-insensitive) strings ``false``, ``falsch``, ``-``, ``no``, ``nein``, ``n``.
+- Everything else, including unrecognized strings and any other non-string object.
+
+Returns
+-------
+out: bool
+  ``True`` if ``value`` is truthy as defined above, ``False`` otherwise.
+    """
+    # Non-string cases first:
+    # pylint: disable-next=too-many-boolean-expressions
+    if (value is None or value is False or value == 0
+            or value == [] or value == {} or value == ""):
+        return False
+    if (value is True or value == 1):
+        return True
+
+    # String cases follow, any remaining non-string object is falsy:
+    if not isinstance(value, str):
+        return False
+    value = value.lower()
+
+    if value in TRUTHY:
+        return True
+
+    # Redundant with the final return; kept so that the known falsy strings are explicit.
+    if value in FALSY:
+        return False
+    return False
+
+
 def read_or_dict(data: Union[dict, str, TextIO]) -> dict:
     """If data is a json file name or input stream, read data from there.
 If it is a dict already, just return it."""
diff --git a/unittests/table_json_conversion/test_read_data.py b/unittests/table_json_conversion/test_read_data.py
index 76f9c03f..eef9a9d8 100644
--- a/unittests/table_json_conversion/test_read_data.py
+++ b/unittests/table_json_conversion/test_read_data.py
@@ -38,7 +38,8 @@ def rfp(*pathcomponents):
     return os.path.join(os.path.dirname(__file__), *pathcomponents)
 
 
-def convert_and_compare(xlsx_file: str, schema_file: str, known_good_file: str) -> dict:
+def convert_and_compare(xlsx_file: str, schema_file: str, known_good_file: str,
+                        strict: bool = False) -> dict:
     """Convert an XLSX file and compare to a known result.
 
 Returns
@@ -49,7 +50,7 @@ json: dict
     result = convert.to_dict(xlsx=xlsx_file, schema=schema_file)
     with open(known_good_file, encoding="utf-8") as myfile:
         expected = json.load(myfile)
-    assert_equal_jsons(result, expected)
+    assert_equal_jsons(result, expected, allow_none=not strict, allow_empty=not strict)
     return result
 
 
@@ -66,7 +67,8 @@ def test_conversions():
                         known_good_file=rfp("data/indirect_data.json"))
     convert_and_compare(xlsx_file=rfp("data/multiple_choice_data.xlsx"),
                         schema_file=rfp("data/multiple_choice_schema.json"),
-                        known_good_file=rfp("data/multiple_choice_data.json"))
+                        known_good_file=rfp("data/multiple_choice_data.json"),
+                        strict=True)
 
     # Data loss when saving as xlsx
     with pytest.raises(AssertionError) as err:
diff --git a/unittests/table_json_conversion/utils.py b/unittests/table_json_conversion/utils.py
index 0fcdbf39..b95715f7 100644
--- a/unittests/table_json_conversion/utils.py
+++ b/unittests/table_json_conversion/utils.py
@@ -42,7 +42,7 @@ Raise an assertion exception if they are not equal."""
             if key in json1 and key in json2:
                 el1 = json1[key]
                 el2 = json2[key]
-                assert type(el1) is type(el2), f"Type mismatch, path: {this_path}"
+                assert isinstance(el1, type(el2)), f"Type mismatch, path: {this_path}"
                 if isinstance(el1, (dict, list)):
                     # Iterables: Recursion
                     assert_equal_jsons(el1, el2, allow_none=allow_none, allow_empty=allow_empty,
-- 
GitLab