From 2ca967384eeb55eac509b3e43f037905fbee9d2d Mon Sep 17 00:00:00 2001 From: Daniel <d.hornung@indiscale.com> Date: Mon, 29 Apr 2024 16:49:08 +0200 Subject: [PATCH] WIP: Reading XLSX. --- .../table_json_conversion/convert.py | 78 ++++++++++++++++--- .../table_json_conversion/test_read_data.py | 16 ++-- 2 files changed, 75 insertions(+), 19 deletions(-) diff --git a/src/caosadvancedtools/table_json_conversion/convert.py b/src/caosadvancedtools/table_json_conversion/convert.py index 0d7be9ef..fe0e20eb 100644 --- a/src/caosadvancedtools/table_json_conversion/convert.py +++ b/src/caosadvancedtools/table_json_conversion/convert.py @@ -22,19 +22,24 @@ from __future__ import annotations -from collections import OrderedDict from functools import reduce from operator import getitem from types import SimpleNamespace -from typing import Any, BinaryIO, Optional, TextIO, Union +from typing import Any, BinaryIO, Callable, TextIO, Union -import caosadvancedtools.table_json_conversion.xlsx_utils as xlsx_utils - -from openpyxl import load_workbook, Workbook +from jsonschema import validate, ValidationError +from openpyxl import load_workbook from openpyxl.worksheet.worksheet import Worksheet -from . import fill_xlsx -from .fill_xlsx import read_or_dict +from caosadvancedtools.table_json_conversion import xlsx_utils +from caosadvancedtools.table_json_conversion.fill_xlsx import read_or_dict + + +def _strict_bool(value: Any) -> bool: + """Convert value to bool, but only if it really is a valid XLSX bool.""" + if isinstance(value, bool): + return value + raise TypeError(f"Not a good boolean: {repr(value)}") class XLSXConverter: @@ -44,6 +49,13 @@ For a detailed description of the required formatting of the XLSX files, see ``s documentation. """ + PARSER: dict[str, Callable] = { + "string": str, + "number": float, + "integer": int, + "boolean": _strict_bool, + } + def __init__(self, xlsx: Union[str, BinaryIO], schema: Union[dict, str, TextIO]): """ Parameters @@ -98,11 +110,11 @@ Look at ``xlsx_utils.get_path_position`` for the specification of the "proper na if parent_sheetname not in self._handled_sheets: self._handle_sheet(self._workbook.get_sheet_by_name(parent_sheetname)) - # We save single entries in lists, indexed by their foreign key contents. Each entry - # consists of: - # - foreign: Dict with path -> value for the foreign columns - # - data: The actual data of this entry, a dict. - entries: dict[str, list[SimpleNamespace]] = {} + # # We save single entries in lists, indexed by their foreign key contents. Each entry + # # consists of: + # # - foreign: Dict with path -> value for the foreign columns + # # - data: The actual data of this entry, a dict. + # entries: dict[str, list[SimpleNamespace]] = {} for row in sheet.iter_rows(values_only=True): # Skip non-data rows. @@ -119,6 +131,7 @@ Look at ``xlsx_utils.get_path_position`` for the specification of the "proper na continue if col_idx in data_column_paths: + value = self._validate_and_convert(value=value, path=data_column_paths[col_idx]) _set_in_nested(mydict=data, path=data_column_paths[col_idx], value=value, prefix=parent, skip=1) continue @@ -162,6 +175,47 @@ Look at ``xlsx_utils.get_path_position`` for the specification of the "proper na assert isinstance(current_object, dict) return current_object + def _validate_and_convert(self, value: Any, path: list[str]): + """Apply some basic validation and conversion steps. + +This includes: +- Validation against the type given in the schema +- List typed values are split at semicolons and validated individually + """ + if value is None: + return value + subschema = self._get_subschema(path) + # Array handling only if schema says it's an array. + if subschema.get("type") == "array": + array_type = subschema["items"]["type"] + if isinstance(value, str) and ";" in value: + values = [self.PARSER[array_type](v) for v in value.split(";")] + return values + try: + validate(value, subschema) + except ValidationError as verr: + print(verr) + print(path) + raise + + # Finally: convert to target type + return self.PARSER[subschema.get("type", "string")](value) + + def _get_subschema(self, path: list[str], schema: Union[dict, list] = None) -> dict: + """Return the sub schema at ``path``.""" + if schema is None: + schema = self._schema + assert schema is not None + assert isinstance(schema, dict) + if path: + if schema["type"] == "object": + next_schema = schema["properties"][path[0]] + return self._get_subschema(path=path[1:], schema=next_schema) + if schema["type"] == "array": + next_schema = schema["items"]["properties"][path[0]] + return self._get_subschema(path=path[1:], schema=next_schema) + return schema + def _group_foreign_paths(foreign: list[list], common: list[str]) -> list[SimpleNamespace]: """Group the foreign keys by their base paths. diff --git a/unittests/table_json_conversion/test_read_data.py b/unittests/table_json_conversion/test_read_data.py index 24700b55..3fbdf3dd 100644 --- a/unittests/table_json_conversion/test_read_data.py +++ b/unittests/table_json_conversion/test_read_data.py @@ -83,8 +83,10 @@ Raise an assertion exception if they are not equal.""" el2 = json2[key] assert type(el1) is type(el2), f"Type mismatch, path: {this_path}" if isinstance(el1, (dict, list)): + # Iterables: Recursion _assert_equal_jsons(el1, el2, allow_none=allow_none, allow_empty=allow_empty, path=this_path) + continue assert el1 == el2, f"Values at path {this_path} are not equal:\n{el1},\n{el2}" continue # Case 2: only one exists @@ -93,15 +95,15 @@ Raise an assertion exception if they are not equal.""" f"Element at path {this_path} is None or empty in one json and does not exist in " "the other.") return - assert isinstance(json1, list) and isinstance(json2, list), f"Is not a list, path: {path}" assert len(json1) == len(json2), f"Lists must have equal length, path: {path}" for idx, (el1, el2) in enumerate(zip(json1, json2)): this_path = path + [idx] - assert isinstance(el1, dict) and isinstance(el2, dict), ( - f"List elements must be dicts: path: {this_path}") - _assert_equal_jsons(el1, el2, allow_none=allow_none, allow_empty=allow_empty, - path=this_path) + if isinstance(el1, dict): + _assert_equal_jsons(el1, el2, allow_none=allow_none, allow_empty=allow_empty, + path=this_path) + else: + assert el1 == el2 def test_conversions(): @@ -110,9 +112,9 @@ def test_conversions(): # result = convert.to_dict(xlsx=rfp("data/multiple_refs_data.xlsx"), # schema=rfp("data/multiple_refs_schema.json")) # expected = json.load(open(rfp("data/multiple_refs_data.json"))) - breakpoint() + # breakpoint() _assert_equal_jsons(result, expected) - breakpoint() + # breakpoint() # conv = XLSXConverter(schema=rfp("data/simple_schema.json")) # result = conv.to_dict(rfp("data/simple_template.xlsx")) -- GitLab