diff --git a/src/caosadvancedtools/table_json_conversion/convert.py b/src/caosadvancedtools/table_json_conversion/convert.py
index 586def7aae81b1f65b6132a1026a9de95f919987..09882f963fd976583d4acbc8f2dd3b67ef510ac8 100644
--- a/src/caosadvancedtools/table_json_conversion/convert.py
+++ b/src/caosadvancedtools/table_json_conversion/convert.py
@@ -22,12 +22,16 @@
 from __future__ import annotations
 
+import datetime
+import itertools
+import sys
 from functools import reduce
 from operator import getitem
 from types import SimpleNamespace
 from typing import Any, BinaryIO, Callable, TextIO, Union
+from warnings import warn
 
-from jsonschema import validate, ValidationError
+import jsonschema
 from openpyxl import load_workbook
 from openpyxl.worksheet.worksheet import Worksheet
 
@@ -42,6 +46,12 @@ def _strict_bool(value: Any) -> bool:
     raise TypeError(f"Not a good boolean: {repr(value)}")
 
 
+class ForeignError(KeyError):
+    def __init__(self, *args, definitions: list, message: str = ""):
+        super().__init__(message, *args)
+        self.definitions = definitions
+
+
 class XLSXConverter:
     """Class for conversion from XLSX to JSON.
 
@@ -56,7 +66,8 @@ documentation.
         "boolean": _strict_bool,
     }
 
-    def __init__(self, xlsx: Union[str, BinaryIO], schema: Union[dict, str, TextIO]):
+    def __init__(self, xlsx: Union[str, BinaryIO], schema: Union[dict, str, TextIO],
+                 strict: bool = False):
         """
         Parameters
         ----------
@@ -65,16 +76,31 @@ xlsx: Union[str, BinaryIO]
 
 schema: Union[dict, str, TextIO]
     Schema for validation of XLSX content.
+
+strict: bool, optional
+    If True, fail early: raise a ValueError for columns missing from the XLSX file instead of
+    only emitting a warning.
         """
         self._workbook = load_workbook(xlsx)
         self._schema = read_or_dict(schema)
         self._defining_path_index = xlsx_utils.get_defining_paths(self._workbook)
+        self._check_columns(fail_fast=strict)
         self._handled_sheets: set[str] = set()
         self._result: dict = {}
+        self._errors: dict = {}
 
-    def to_dict(self) -> dict:
+    def to_dict(self, validate: bool = False, collect_errors: bool = True) -> dict:
         """Convert the xlsx contents to a dict.
 
+Parameters
+----------
+validate: bool, optional
+    If True, validate the result against the schema.
+
+collect_errors: bool, optional
+    If True, do not fail at the first error, but try to collect as many errors as possible.  After
+    an exception has been raised, the collected errors can be retrieved with ``get_errors()`` and
+    pretty-printed with ``get_error_str()``.
+
 Returns
 -------
 out: dict
@@ -82,12 +108,73 @@ out: dict
         """
         self._handled_sheets = set()
         self._result = {}
+        self._errors = {}
         for sheetname in self._workbook.sheetnames:
             if sheetname not in self._handled_sheets:
-                self._handle_sheet(self._workbook[sheetname])
+                self._handle_sheet(self._workbook[sheetname], fail_later=collect_errors)
+        if validate:
+            jsonschema.validate(self._result, self._schema)
+        if self._errors:
+            raise RuntimeError("There were errors while handling the XLSX file.")
         return self._result
 
-    def _handle_sheet(self, sheet: Worksheet) -> None:
+    def get_errors(self) -> dict:
+        """Return a dict with the collected errors."""
+        return self._errors
+
+    def get_error_str(self) -> str:
+        """Return a human-readable string listing the collected errors."""
+        result = ""
+        for loc, value in self._errors.items():
+            result += f"Sheet: {loc[0]}\tRow: {loc[1] + 1}\n"
+            for item in value:
+                result += f"\t\t{item[:-1]}:\t{item[-1]}\n"
+        return result
+
+    def _check_columns(self, fail_fast: bool = False):
+        """Check whether the columns correspond to the schema."""
+        def missing(path):
+            message = f"Missing column: {xlsx_utils.p2s(path)}"
+            if fail_fast:
+                raise ValueError(message)
+            else:
+                warn(message)
+        for sheetname in self._workbook.sheetnames:
+            sheet = self._workbook[sheetname]
+            parents: dict = {}
+            col_paths = []
+            for col in xlsx_utils.get_data_columns(sheet).values():
+                parents[xlsx_utils.p2s(col.path[:-1])] = col.path[:-1]
+                col_paths.append(col.path)
+            for path in parents.values():
+                subschema = xlsx_utils.get_subschema(path, self._schema)
+
+                # Unfortunately, there are a lot of special cases to handle here.
+                if subschema.get("type") == "array":
+                    subschema = subschema["items"]
+                if "enum" in subschema:  # Was handled on the parent level already.
+                    continue
+                for child, content in subschema["properties"].items():
+                    child_path = path + [child]
+                    if content == {'type': 'string', 'format': 'data-url'}:
+                        continue  # skip files
+                    if content.get("type") == "array" and (
+                            content.get("items", {}).get("type") == "object"):
+                        if child_path not in itertools.chain(*self._defining_path_index.values()):
+                            missing(child_path)
+                    elif content.get("type") == "array" and "enum" in content.get("items", []) and (
+                            content.get("uniqueItems") is True):
+                        # multiple choice
+                        for choice in content["items"]["enum"]:
+                            if child_path + [choice] not in col_paths:
+                                missing(child_path + [choice])
+                    elif content.get("type") == "object":
+                        pass
+                    else:
+                        if child_path not in col_paths:
+                            missing(child_path)
+
+    def _handle_sheet(self, sheet: Worksheet, fail_later: bool = False) -> None:
         """Add the contents of the sheet to the result (stored in ``self._result``).
 
 Each row in the sheet corresponds to one entry in an array in the result.  Which array exactly is
@@ -95,6 +182,11 @@ defined by the sheet's "proper name" and the content of the foreign columns.
 
 Look at ``xlsx_utils.get_path_position`` for the specification of the "proper name".
+
+Parameters
+----------
+fail_later: bool, optional
+    If True, do not fail on unresolvable foreign definitions, but collect all errors.
""" row_type_column = xlsx_utils.get_row_type_column_index(sheet) foreign_columns = xlsx_utils.get_foreign_key_columns(sheet) @@ -106,7 +198,7 @@ Look at ``xlsx_utils.get_path_position`` for the specification of the "proper na if parent: parent_sheetname = xlsx_utils.get_worksheet_for_path(parent, self._defining_path_index) if parent_sheetname not in self._handled_sheets: - self._handle_sheet(self._workbook[parent_sheetname]) + self._handle_sheet(self._workbook[parent_sheetname], fail_later=fail_later) # # We save single entries in lists, indexed by their foreign key contents. Each entry # # consists of: @@ -114,7 +206,7 @@ Look at ``xlsx_utils.get_path_position`` for the specification of the "proper na # # - data: The actual data of this entry, a dict. # entries: dict[str, list[SimpleNamespace]] = {} - for row in sheet.iter_rows(values_only=True): + for row_idx, row in enumerate(sheet.iter_rows(values_only=True)): # Skip non-data rows. if row[row_type_column] is not None: continue @@ -147,13 +239,18 @@ Look at ``xlsx_utils.get_path_position`` for the specification of the "proper na _set_in_nested(mydict=data, path=path, value=value, prefix=parent, skip=1) continue - # Find current position in tree - parent_dict = self._get_parent_dict(parent_path=parent, foreign=foreign) - - # Append data to current position's list - if proper_name not in parent_dict: - parent_dict[proper_name] = [] - parent_dict[proper_name].append(data) + try: + # Find current position in tree + parent_dict = self._get_parent_dict(parent_path=parent, foreign=foreign) + + # Append data to current position's list + if proper_name not in parent_dict: + parent_dict[proper_name] = [] + parent_dict[proper_name].append(data) + except ForeignError as kerr: + if not fail_later: + raise + self._errors[(sheet.title, row_idx)] = kerr.definitions self._handled_sheets.add(sheet.title) def _is_multiple_choice(self, path: list[str]) -> bool: @@ -187,7 +284,12 @@ the values given in the ``foreign`` specification. 
                        current_object = cand
                        break
                else:
-                    raise KeyError("Cannot find an element which matches the foreign definitions")
+                    message = f"Cannot find an element at {parent_path} for these foreign defs:\n"
+                    for name, value in group.definitions:
+                        message += f"    {name}: {value}\n"
+                    print(message, file=sys.stderr)
+                    error = ForeignError(definitions=group.definitions, message=message)
+                    raise error
 
         assert isinstance(current_object, dict)
         return current_object
@@ -208,8 +310,16 @@ This includes:
             values = [self.PARSER[array_type](v) for v in value.split(";")]
             return values
         try:
-            validate(value, subschema)
-        except ValidationError as verr:
+            # special case: datetime or date
+            if "anyOf" in subschema:
+                if isinstance(value, datetime.datetime) and (
+                        {'type': 'string', 'format': 'date-time'} in subschema["anyOf"]):
+                    return value
+                if isinstance(value, datetime.date) and (
+                        {'type': 'string', 'format': 'date'} in subschema["anyOf"]):
+                    return value
+            jsonschema.validate(value, subschema)
+        except jsonschema.ValidationError as verr:
             print(verr)
             print(path)
             raise
@@ -217,23 +327,14 @@ This includes:
         # Finally: convert to target type
         return self.PARSER[subschema.get("type", "string")](value)
 
-    def _get_subschema(self, path: list[str], schema: Union[dict, list] = None) -> dict:
+    def _get_subschema(self, path: list[str], schema: dict = None) -> dict:
         """Return the sub schema at ``path``."""
         if schema is None:
             schema = self._schema
         assert schema is not None
         assert isinstance(schema, dict)
-        if path:
-            if schema["type"] == "object":
-                next_schema = schema["properties"][path[0]]
-                return self._get_subschema(path=path[1:], schema=next_schema)
-            if schema["type"] == "array":
-                items = schema["items"]
-                if "enum" in items:
-                    return schema
-                next_schema = items["properties"][path[0]]
-                return self._get_subschema(path=path[1:], schema=next_schema)
-        return schema
+
+        return xlsx_utils.get_subschema(path, schema)
 
 
 def _group_foreign_paths(foreign: list[list], common: list[str]) -> list[SimpleNamespace]:
@@ -368,7 +469,8 @@ mydict: dict
     return mydict
 
 
-def to_dict(xlsx: Union[str, BinaryIO], schema: Union[dict, str, TextIO]) -> dict:
+def to_dict(xlsx: Union[str, BinaryIO], schema: Union[dict, str, TextIO],
+            validate: bool = False, strict: bool = False) -> dict:
     """Convert the xlsx contents to a dict, it must follow a schema.
 
 Parameters
 ----------
@@ -379,10 +481,17 @@ xlsx: Union[str, BinaryIO]
 
 schema: Union[dict, str, TextIO]
     Schema for validation of XLSX content.
 
+validate: bool, optional
+    If True, validate the result against the schema.
+
+strict: bool, optional
+    If True, fail early: raise a ValueError for columns missing from the XLSX file instead of
+    only emitting a warning.
+
 Returns
 -------
 out: dict
     A dict representing the JSON with the extracted data.
""" - converter = XLSXConverter(xlsx, schema) + converter = XLSXConverter(xlsx, schema, strict=strict) return converter.to_dict() diff --git a/src/caosadvancedtools/table_json_conversion/fill_xlsx.py b/src/caosadvancedtools/table_json_conversion/fill_xlsx.py index 66495584f611d76a2c2e7a661e046dee68681d2d..f2e0abc3fc684172065d683c99c1c4309c80d6c0 100644 --- a/src/caosadvancedtools/table_json_conversion/fill_xlsx.py +++ b/src/caosadvancedtools/table_json_conversion/fill_xlsx.py @@ -22,6 +22,7 @@ from __future__ import annotations +import datetime import pathlib from types import SimpleNamespace from typing import Any, Optional, TextIO, Union @@ -155,6 +156,7 @@ class TemplateFiller: def _handle_data(self, data: dict, current_path: list[str] = None, context: TemplateFiller.Context = None, only_collect_insertables: bool = False, + utc: bool = False, ) -> Optional[dict[str, Any]]: """Handle the data and write it into ``workbook``. @@ -176,6 +178,8 @@ context: TemplateFiller.Context, optional only_collect_insertables: bool, optional If True, do not insert anything on this level, but return a dict with entries to be inserted. +utc: bool, optional + If True, store times as UTC. Else store as local time on a best-effort base. Returns ------- @@ -205,10 +209,19 @@ out: union[dict, None] assert len(set(type(entry) for entry in content)) == 1 if isinstance(content[0], dict): # all elements are dicts - # An array of objects: must go into exploded sheet - for entry in content: - self._handle_data(data=entry, current_path=path, context=next_context) - continue + # Heuristic to detect enum entries (only id and name): + if all(set(entry.keys()) == {"id", "name"} for entry in content): + # Convert to list of names, do not recurse + content = [entry["name"] for entry in content] + else: + # An array of objects: must go into exploded sheet + for entry in content: + self._handle_data(data=entry, current_path=path, context=next_context) + continue + # Heuristic to detect enum entries (dict with only id and name): + elif isinstance(content, dict) and set(content.keys()) == {"id", "name"}: + content = [content["name"]] + # "Normal" dicts elif isinstance(content, dict): # we recurse and simply use the result if not current_path: # Special handling for top level self._handle_data(content, current_path=path, context=next_context) @@ -233,6 +246,13 @@ out: union[dict, None] value = content[0] if isinstance(value, str): value = ILLEGAL_CHARACTERS_RE.sub("", value) + if isinstance(value, datetime.datetime): + if value.tzinfo is not None: + if utc: + value = value.astimezone(datetime.timezone.utc).replace(tzinfo=None) + else: + # Remove timezone, store in local timezone. 
+                            value = value.astimezone().replace(tzinfo=None)
 
                 path_str = p2s(path)
                 assert path_str not in insertables
@@ -248,7 +268,8 @@ out: union[dict, None]
         sheet = None
         for path_str, value in insertables.items():
             if self._graceful and path_str not in self._sheet_index:
-                warn(f"Ignoring path with missing sheet index: {path_str}")
+                if not (value is None or path_str.endswith(".id") or path_str.endswith(".name")):
+                    warn(f"Ignoring path with missing sheet index: {path_str}")
                 continue
             sheet_meta = self._sheet_index[path_str]
             if sheet is None:
@@ -335,6 +356,7 @@ validation_schema: dict, optional
     if validation_schema is not None:
         validation_schema = array_schema_from_model_schema(read_or_dict(validation_schema))
         try:
+            # FIXME redefine checker for datetime
            validate(data, validation_schema, format_checker=FormatChecker())
        except ValidationError as verr:
            print(verr.message)
diff --git a/src/caosadvancedtools/table_json_conversion/xlsx_utils.py b/src/caosadvancedtools/table_json_conversion/xlsx_utils.py
index 32ed8552ce26f0c2245f9739721ba82ed95c12bf..5002f3ac7fe4bd78accffe0697cd7ecc7273dc27 100644
--- a/src/caosadvancedtools/table_json_conversion/xlsx_utils.py
+++ b/src/caosadvancedtools/table_json_conversion/xlsx_utils.py
@@ -235,8 +235,6 @@ proper_name: str
         if ii > len(parent):
             parent = foreign_path[:ii]
 
-    # print(data_paths, ii)
-    # breakpoint()
     return parent, data_paths[0][ii]
 
 
@@ -260,6 +258,21 @@ def get_row_type_column_index(sheet: Worksheet):
     raise ValueError("The column which defines row types (COL_TYPE, PATH, ...) is missing")
 
 
+def get_subschema(path: list[str], schema: dict) -> dict:
+    """Return the sub schema at ``path``."""
+    if path:
+        if schema["type"] == "object":
+            next_schema = schema["properties"][path[0]]
+            return get_subschema(path=path[1:], schema=next_schema)
+        if schema["type"] == "array":
+            items = schema["items"]
+            if "enum" in items:
+                return schema
+            next_schema = items["properties"][path[0]]
+            return get_subschema(path=path[1:], schema=next_schema)
+    return schema
+
+
 def get_worksheet_for_path(path: list[str], defining_path_index: dict[str, list[list[str]]]) -> str:
     """Find the sheet name which corresponds to the given path."""
     for sheetname, paths in defining_path_index.items():
diff --git a/unittests/table_json_conversion/data/multiple_choice_data_missing.xlsx b/unittests/table_json_conversion/data/multiple_choice_data_missing.xlsx
new file mode 100644
index 0000000000000000000000000000000000000000..10d76529ab11d2e79ca0651313dd50f0cc326341
Binary files /dev/null and b/unittests/table_json_conversion/data/multiple_choice_data_missing.xlsx differ
diff --git a/unittests/table_json_conversion/data/multiple_refs_data_wrong_foreign.xlsx b/unittests/table_json_conversion/data/multiple_refs_data_wrong_foreign.xlsx
new file mode 100644
index 0000000000000000000000000000000000000000..f6e9d9f9f2024708a0b70d8ed4660cc97e04ff27
Binary files /dev/null and b/unittests/table_json_conversion/data/multiple_refs_data_wrong_foreign.xlsx differ
diff --git a/unittests/table_json_conversion/data/simple_data_datetime.xlsx b/unittests/table_json_conversion/data/simple_data_datetime.xlsx
new file mode 100644
index 0000000000000000000000000000000000000000..c6752614e700ecfd6040c087ff3254a68fd5e158
Binary files /dev/null and b/unittests/table_json_conversion/data/simple_data_datetime.xlsx differ
diff --git a/unittests/table_json_conversion/data/simple_data_missing.xlsx b/unittests/table_json_conversion/data/simple_data_missing.xlsx
new file mode 100644
index 0000000000000000000000000000000000000000..6e6d3a39d965de81236f2ad14bd2116ac4d7669b
Binary files /dev/null and b/unittests/table_json_conversion/data/simple_data_missing.xlsx differ
diff --git a/unittests/table_json_conversion/data/simple_data_wrong_foreign.xlsx b/unittests/table_json_conversion/data/simple_data_wrong_foreign.xlsx
new file mode 100644
index 0000000000000000000000000000000000000000..dbd91d29947ec673a704a762f474111223484e56
Binary files /dev/null and b/unittests/table_json_conversion/data/simple_data_wrong_foreign.xlsx differ
diff --git a/unittests/table_json_conversion/test_fill_xlsx.py b/unittests/table_json_conversion/test_fill_xlsx.py
index f580fdbf867f08db0d72ade3537d4f2c1e8301d6..899bb81ef1f91f3326f214f49f135a55b97d299f 100644
--- a/unittests/table_json_conversion/test_fill_xlsx.py
+++ b/unittests/table_json_conversion/test_fill_xlsx.py
@@ -19,6 +19,7 @@
 # You should have received a copy of the GNU Affero General Public License
 # along with this program. If not, see <https://www.gnu.org/licenses/>.
 
+import datetime
 import json
 import os
 import re
@@ -33,6 +34,7 @@ from caosadvancedtools.table_json_conversion.fill_xlsx import fill_template
 from caosadvancedtools.table_json_conversion.xlsx_utils import (
     get_row_type_column_index,
     get_path_rows,
+    read_or_dict,
 )
 from .utils import compare_workbooks
 
@@ -149,6 +151,32 @@ def test_fill_xlsx():
                      schema=rfp("data/multiple_choice_schema.json"))
 
 
+def test_datetime():
+    """Datetime values from LinkAhead are not serialized as strings."""
+    json_file = rfp("data/simple_data.json")
+    template_file = rfp("data/simple_template.xlsx")
+    known_good = rfp("data/simple_data_datetime.xlsx")
+    # TODO Implement checker for datetime
+    # schema = rfp("data/simple_schema.json")
+
+    # Set datetime explicitly
+    json_data = read_or_dict(json_file)
+    json_data["Training"][0]["date"] = datetime.datetime(2023, 1, 1)
+
+    # Code copied mostly from `fill_and_compare(...)`
+    with tempfile.TemporaryDirectory() as tmpdir:
+        outfile = os.path.join(tmpdir, 'test.xlsx')
+        assert not os.path.exists(outfile)
+        fill_template(data=json_data, template=template_file, result=outfile,
+                      # validation_schema=schema
+                      )
+        assert os.path.exists(outfile)
+        generated = load_workbook(outfile)  # workbook can be read
+
+        known_good_wb = load_workbook(known_good)
+        compare_workbooks(generated, known_good_wb)
+
+
 def test_errors():
     with pytest.raises(AssertionError) as exc:
         fill_and_compare(json_file=rfp("data/error_simple_data.json"),
diff --git a/unittests/table_json_conversion/test_read_data.py b/unittests/table_json_conversion/test_read_xlsx.py
similarity index 59%
rename from unittests/table_json_conversion/test_read_data.py
rename to unittests/table_json_conversion/test_read_xlsx.py
index eef9a9d88ddbf26fbf658ac5c3753e5133a831be..0eec2e9caa1f800ad86ab43057b8c512dc09881f 100644
--- a/unittests/table_json_conversion/test_read_data.py
+++ b/unittests/table_json_conversion/test_read_xlsx.py
@@ -20,6 +20,7 @@ """Testing the conversion from XLSX to JSON"""
 
+import datetime
 import json
 import os
 import re
@@ -39,17 +40,23 @@ def rfp(*pathcomponents):
 
 
 def convert_and_compare(xlsx_file: str, schema_file: str, known_good_file: str,
-                        strict: bool = False) -> dict:
+                        known_good_data: dict = None, strict: bool = False,
+                        validate: bool = True) -> dict:
     """Convert an XLSX file and compare to a known result.
 
+Exactly one of ``known_good_file`` and ``known_good_data`` should be non-empty.
+
 Returns
 -------
 json: dict
     The result of the conversion.
""" - result = convert.to_dict(xlsx=xlsx_file, schema=schema_file) - with open(known_good_file, encoding="utf-8") as myfile: - expected = json.load(myfile) + result = convert.to_dict(xlsx=xlsx_file, schema=schema_file, validate=validate) + if known_good_file: + with open(known_good_file, encoding="utf-8") as myfile: + expected = json.load(myfile) + else: + expected = known_good_data assert_equal_jsons(result, expected, allow_none=not strict, allow_empty=not strict) return result @@ -70,6 +77,13 @@ def test_conversions(): known_good_file=rfp("data/multiple_choice_data.json"), strict=True) + with open(rfp("data/simple_data.json"), encoding="utf-8") as myfile: + expected_datetime = json.load(myfile) + expected_datetime["Training"][0]["date"] = datetime.datetime(2023, 1, 1, 0, 0) + convert_and_compare(xlsx_file=rfp("data/simple_data_datetime.xlsx"), + schema_file=rfp("data/simple_schema.json"), + known_good_file="", known_good_data=expected_datetime) + # Data loss when saving as xlsx with pytest.raises(AssertionError) as err: convert_and_compare(xlsx_file=rfp("data/simple_data_ascii_chars.xlsx"), @@ -78,6 +92,71 @@ def test_conversions(): assert str(err.value).startswith("Values at path ['Training', 0, ") +def test_missing_columns(): + with pytest.raises(ValueError) as caught: + convert.to_dict(xlsx=rfp("data/simple_data_missing.xlsx"), + schema=rfp("data/simple_schema.json"), strict=True) + assert str(caught.value) == "Missing column: Training.coach.given_name" + with pytest.warns(UserWarning) as caught: + convert.to_dict(xlsx=rfp("data/simple_data_missing.xlsx"), + schema=rfp("data/simple_schema.json")) + assert str(caught.pop().message) == "Missing column: Training.coach.given_name" + with pytest.warns(UserWarning) as caught: + convert.to_dict(xlsx=rfp("data/multiple_choice_data_missing.xlsx"), + schema=rfp("data/multiple_choice_schema.json")) + messages = {str(w.message) for w in caught} + for expected in [ + "Missing column: Training.skills.Communication", + "Missing column: Training.exam_types.Oral", + ]: + assert expected in messages + + +def test_faulty_foreign(): + # Simple wrong foreign key + converter = convert.XLSXConverter(xlsx=rfp("data/simple_data_wrong_foreign.xlsx"), + schema=rfp("data/simple_schema.json")) + with pytest.raises(RuntimeError): + converter.to_dict() + errors = converter.get_errors() + assert errors == {('Training.coach', 6): [['date', datetime.datetime(2023, 1, 2, 0, 0)], + ['url', 'www.indiscale.com']]} + + # More extensive example + converter = convert.XLSXConverter(xlsx=rfp("data/multiple_refs_data_wrong_foreign.xlsx"), + schema=rfp("data/multiple_refs_schema.json")) + with pytest.raises(RuntimeError): + converter.to_dict() + errors = converter.get_errors() + assert errors == { + ('Training.Organisation.Person', 8): [ + ['name', 'World Training Organization 2']], + ('Training.Organisation.Person', 9): [ + ['date', '2024-03-21T14:12:00.000Z'], + ['url', 'www.getlinkahead.com']], + ('Training.participant', 6): [ + ['date', '2024-03-21T14:12:00.000Z'], + ['url', None]], + ('Training.participant', 7): [ + ['date', '2024-03-21T14:12:00.000Z'], + ['url', None]], + } + + error_str = converter.get_error_str() + assert error_str == """Sheet: Training.Organisation.Person\tRow: 9 +\t\t['name']:\tWorld Training Organization 2 +Sheet: Training.Organisation.Person\tRow: 10 +\t\t['date']:\t2024-03-21T14:12:00.000Z +\t\t['url']:\twww.getlinkahead.com +Sheet: Training.participant\tRow: 7 +\t\t['date']:\t2024-03-21T14:12:00.000Z +\t\t['url']:\tNone +Sheet: 
+\t\t['date']:\t2024-03-21T14:12:00.000Z
+\t\t['url']:\tNone
+"""
+
+
 def test_set_in_nested():
     """Test the ``_set_in_nested`` function."""
     set_in_nested = convert._set_in_nested  # pylint: disable=protected-access
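
Reviewer notes with usage sketches follow; none of this is part of the patch itself.

A minimal sketch of the error-collection API added to convert.py above. The input file names
here are hypothetical; ``strict`` and ``collect_errors`` behave as documented in the new
docstrings.

    from caosadvancedtools.table_json_conversion import convert

    converter = convert.XLSXConverter(xlsx="example_data.xlsx",      # hypothetical file
                                      schema="example_schema.json",  # hypothetical file
                                      strict=False)  # only warn about missing columns
    try:
        result = converter.to_dict(collect_errors=True)
    except RuntimeError:
        # With collect_errors=True, all rows with unresolvable foreign keys are
        # gathered before the conversion fails, so they can be reported in one go.
        print(converter.get_error_str())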
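The timezone handling added to fill_xlsx.py can be exercised in isolation. This sketch
replicates the astimezone/replace logic with plain stdlib calls; it is an illustration, not the
module's API.

    import datetime

    def normalize(value: datetime.datetime, utc: bool = False) -> datetime.datetime:
        # Excel (and therefore openpyxl) does not accept timezone-aware datetimes,
        # so aware values are converted -- to UTC, or to local time on a
        # best-effort basis -- and then stripped of their tzinfo.
        if value.tzinfo is None:
            return value
        if utc:
            return value.astimezone(datetime.timezone.utc).replace(tzinfo=None)
        return value.astimezone().replace(tzinfo=None)

    aware = datetime.datetime(2024, 3, 21, 14, 12, tzinfo=datetime.timezone.utc)
    assert normalize(aware, utc=True) == datetime.datetime(2024, 3, 21, 14, 12)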
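How the new xlsx_utils.get_subschema descends into a schema, shown on a toy schema (the
property names here are made up for illustration):

    from caosadvancedtools.table_json_conversion.xlsx_utils import get_subschema

    toy_schema = {
        "type": "object",
        "properties": {
            "Training": {
                "type": "array",
                "items": {
                    "type": "object",
                    "properties": {
                        "date": {"type": "string", "format": "date"},
                    },
                },
            },
        },
    }

    # Objects are entered via "properties", arrays via "items"; an enum inside
    # "items" stops the descent and returns the array schema itself.
    assert get_subschema(["Training", "date"], toy_schema) == {
        "type": "string", "format": "date"}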
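The ``anyOf`` special case added to convert.py lets cell values through that openpyxl has
already parsed into datetime objects, since jsonschema would reject them against a string-typed
subschema. A standalone sketch of that check:

    import datetime

    subschema = {"anyOf": [{"type": "string", "format": "date-time"},
                           {"type": "string", "format": "date"}]}
    value = datetime.datetime(2023, 1, 1)

    # A datetime object is not a string, so string validation is skipped for
    # values that already have the matching Python type.
    if "anyOf" in subschema and isinstance(value, datetime.datetime) and (
            {"type": "string", "format": "date-time"} in subschema["anyOf"]):
        print("accepted without string validation:", value)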