diff --git a/src/caosadvancedtools/table_json_conversion/convert.py b/src/caosadvancedtools/table_json_conversion/convert.py
index 37c39aae91da39a9e7f027050b6bff97f85efc16..3bc255562f115963f84d207d37421c11f2243ef0 100644
--- a/src/caosadvancedtools/table_json_conversion/convert.py
+++ b/src/caosadvancedtools/table_json_conversion/convert.py
@@ -25,6 +25,7 @@ from __future__ import annotations
 import datetime
 import itertools
 import sys
+import textwrap
 from functools import reduce
 from operator import getitem
 from types import SimpleNamespace
@@ -95,65 +96,52 @@ def _format_exception_table(exceptions: list[tuple], worksheet_title: str,
     exceptions.sort(key=lambda tup: tup[1])
     for row_i, col_i, excep in exceptions:
         if column_names is not None:
-            # Update Names
+            # Add a line with information about the current column
             if current_column != col_i:
                 current_column = col_i
                 new_data.append({
                     "loc": f"\nErrors in column '{column_names[col_i]}':",
                     "type": "", "mess": [""]
                 })
-        # Setup
-        row = {}
-        new_data.append(row)
-        # Field
+        # Setup for current Exception
+        curr_err_data = {}
+        new_data.append(curr_err_data)
+        # Get field
         if isinstance(row_i, int):
-            row["loc"] = f"Cell {_column_id_to_chars(col_i)}{row_i + 1}"
+            curr_err_data["loc"] = f"Cell {_column_id_to_chars(col_i)}{row_i + 1}"
         else:
-            row["loc"] = f"Column {_column_id_to_chars(col_i)}"
-        lengths["loc"] = max(lengths["loc"], len(row["loc"]))
-        # Code
-        row["type"] = type(excep).__name__
-        lengths["type"] = max(lengths["type"], len(row["type"]))
-        # Message
+            curr_err_data["loc"] = f"Column {_column_id_to_chars(col_i)}"
+        lengths["loc"] = max(lengths["loc"], len(curr_err_data["loc"]))
+        # Add error code
+        curr_err_data["type"] = type(excep).__name__
+        lengths["type"] = max(lengths["type"], len(curr_err_data["type"]))
+        # Format message - split into lines
         lines = str(excep).split('\n')
         new_lines = []
         for line in lines:
-            if len(line) > max_line_length:
-                words = line.split(' ')
-                current = ""
-                for word, next_word in zip(words, words[1:] + [""]):
-                    if current != "":
-                        current += " "
-                    current += word
-                    if len(current + next_word) > max_line_length:
-                        lengths["mess"] = max(lengths["mess"], len(current))
-                        new_lines.append(current)
-                        current = ""
-                if current != "":
-                    lengths["mess"] = max(lengths["mess"], len(current))
-                    new_lines.append(current)
-            elif len(line) > 0:
-                lengths["mess"] = max(lengths["mess"], len(line))
-                new_lines.append(line)
+            new_lines += textwrap.wrap(line, max_line_length, break_long_words=False)
+        for line in new_lines:
+            lengths["mess"] = max(lengths["mess"], len(line))
         if new_lines == []:
             new_lines = [""]
-        row["mess"] = new_lines
+        curr_err_data["mess"] = new_lines
 
+    # Generate underline for each header
     dividers = {key: '–' * l for key, l in lengths.items()}
     dividers["mess"] = [dividers["mess"]]
-
-    # Fill for the messages is set to 0, if we want another column or align
-    # right we need to use lengths["mess"]
+    # Fill with spaces for alignment
     string_rep = f"There were errors during the validation of worksheet '{worksheet_title}':\n\n"
-    for row in [headers, dividers] + new_data:
-        string_rep += ' {loc: <{fill}} '.format(loc=row["loc"],
+    for curr_err_data in [headers, dividers] + new_data:
+        string_rep += ' {loc: <{fill}} '.format(loc=curr_err_data["loc"],
                                                 fill=lengths["loc"])
-        string_rep += ' {typ: <{fill}} '.format(typ=row["type"],
+        string_rep += ' {typ: <{fill}} '.format(typ=curr_err_data["type"],
                                                 fill=lengths["type"])
-        string_rep += ' {mes: <{fill}}\n'.format(mes=row["mess"][0], fill=0)
-        for line in row["mess"][1:]:
-            # Front padding
-            string_rep += ' ' * (lengths["loc"] + lengths["type"] + 7)
+        # Fill for the messages is set to 0, if we want another column or align
+        # right we need to use lengths["mess"]
+        string_rep += ' {mes: <{fill}}\n'.format(mes=curr_err_data["mess"][0], fill=0)
+        for line in curr_err_data["mess"][1:]:
+            # Front padding for lines without location and error type
+            string_rep += ' ' * (lengths["loc"] + lengths["type"] + 6)
             string_rep += ' {mes: <{fill}}\n'.format(mes=line, fill=0)
     return string_rep
 
@@ -194,7 +182,11 @@ class XLSXConverter:
         self._workbook = load_workbook(xlsx)
         self._schema = read_or_dict(schema)
         self._defining_path_index = xlsx_utils.get_defining_paths(self._workbook)
-        self._check_columns(fail_fast=strict)
+        try:
+            self._check_columns(fail_fast=strict)
+        except KeyError as e:
+            raise jsonschema.ValidationError(f"Malformed metadata: Cannot parse paths. "
+                                             f"Unknown path: {e}") from e
         self._handled_sheets: set[str] = set()
         self._result: dict = {}
         self._errors: dict = {}
@@ -220,9 +212,29 @@ class XLSXConverter:
         self._handled_sheets = set()
         self._result = {}
         self._errors = {}
-        for sheetname in self._workbook.sheetnames:
-            if sheetname not in self._handled_sheets:
-                self._handle_sheet(self._workbook[sheetname], fail_later=collect_errors)
+        if not collect_errors:
+            for sheetname in self._workbook.sheetnames:
+                if sheetname not in self._handled_sheets:
+                    self._handle_sheet(self._workbook[sheetname], fail_later=collect_errors)
+        else:
+            # Collect errors from converting
+            exceptions = []
+            for sheetname in self._workbook.sheetnames:
+                if sheetname not in self._handled_sheets:
+                    try:
+                        self._handle_sheet(self._workbook[sheetname], fail_later=collect_errors)
+                    except jsonschema.ValidationError as e:
+                        exceptions.append(e)
+                    # do not collect errors from sheet again
+                    self._handled_sheets.add(sheetname)
+            if len(exceptions) == 1:
+                raise exceptions[0]
+            elif len(exceptions) > 1:
+                mess = "There were errors during the validation of several worksheets:\n\n"
+                mess += '\n\n'.join([str(e).replace("There were errors during the validation of worksheet",
+                                                    "In worksheet")
+                                     for e in exceptions])
+                raise jsonschema.ValidationError(mess)
         if validate:
             jsonschema.validate(self._result, self._schema)
         if self._errors:
@@ -323,6 +335,7 @@ class XLSXConverter:
         # entries: dict[str, list[SimpleNamespace]] = {}
 
         exceptions = []
+        warns = []
         col_names = {}
         for row_idx, row in enumerate(sheet.iter_rows(values_only=True)):
             # Skip non-data rows
@@ -359,7 +372,12 @@ class XLSXConverter:
                         _set_in_nested(mydict=data, path=path, value=value, prefix=parent, skip=1)
                         continue
                     elif sheet.cell(col_type_row+1, col_idx+1).value is None:
-                        warn(f"No metadata configured for column {_column_id_to_chars(col_idx)}.")
+                        mess = (f"\nNo metadata configured for column "
+                                f"'{_column_id_to_chars(col_idx)}' in worksheet "
+                                f"'{sheet.title}'.\n")
+                        if mess not in warns:
+                            print(mess, file=sys.stderr)
+                            warns.append(mess)  # Prevent multiple instances of same warning
                 except (ValueError, KeyError, jsonschema.ValidationError) as e:
                     # Append error for entire column only once
                     if isinstance(e, KeyError) and 'column' in str(e):
diff --git a/unittests/table_json_conversion/data/simple_data_broken.xlsx b/unittests/table_json_conversion/data/simple_data_broken.xlsx
index 0221570c942fc28f2b59a282f751781ff4a504fa..a65d464a53459de73e41fd20d807899c44728cda 100644
Binary files a/unittests/table_json_conversion/data/simple_data_broken.xlsx and b/unittests/table_json_conversion/data/simple_data_broken.xlsx differ
diff --git a/unittests/table_json_conversion/data/simple_data_broken_paths.xlsx b/unittests/table_json_conversion/data/simple_data_broken_paths.xlsx
new file mode 100644
index 0000000000000000000000000000000000000000..0221570c942fc28f2b59a282f751781ff4a504fa
Binary files /dev/null and b/unittests/table_json_conversion/data/simple_data_broken_paths.xlsx differ
diff --git a/unittests/table_json_conversion/test_read_xlsx.py b/unittests/table_json_conversion/test_read_xlsx.py
index 8fbf8a2ab504d91ed0389243bf1f15f0bf20eea4..f51e114f7ab526f0b366e48934dbb1f9573f1666 100644
--- a/unittests/table_json_conversion/test_read_xlsx.py
+++ b/unittests/table_json_conversion/test_read_xlsx.py
@@ -113,27 +113,47 @@ def test_missing_columns():
     assert expected in messages
 
 
-def test_wrong_datatype():
+def test_error_table():
     with pytest.raises(jsonschema.ValidationError) as caught:
         convert.to_dict(xlsx=rfp("data/simple_data_broken.xlsx"),
                         schema=rfp("data/simple_schema.json"))
     # Correct Errors
+    assert "Malformed metadata: Cannot parse paths in worksheet 'Person'." in str(caught.value)
     assert "'Not a num' is not of type 'number'" in str(caught.value)
+    assert "'Yes a number?' is not of type 'number'" in str(caught.value)
     assert "1.5 is not of type 'integer'" in str(caught.value)
+    assert "1.2345 is not of type 'integer'" in str(caught.value)
+    assert "'There is no entry in the schema" in str(caught.value)
+    assert "'Not an enum' is not one of [" in str(caught.value)
     # Correct Locations
     for line in str(caught.value).split('\n'):
         if "'Not a num' is not of type 'number'" in line:
             assert "J7" in line
+        if "'Yes a number?' is not of type 'number'" in line:
+            assert "J8" in line
         if "1.5 is not of type 'integer'" in line:
             assert "K7" in line
         if "1.2345 is not of type 'integer'" in line:
             assert "K8" in line
-    # No additional type errors
-    if "is not of type 'boolean'" in str(caught.value):  # ToDo: Remove when boolean is fixed
-        assert str(caught.value).count("is not of type") == 3
+        if "'There is no entry in the schema" in line:
+            assert "Column M" in line
+        if "'Not an enum' is not one of [" in line:
+            assert "G8" in line
+    # No additional errors
+    assert str(caught.value).count("Malformed metadata: Cannot parse paths in worksheet") == 1
+    assert str(caught.value).count("There is no entry in the schema") == 1
+    assert str(caught.value).count("is not one of") == 1
+    # FIXME ToDo: Remove when boolean is fixed / when everything works as
+    # expected, set correct number.
+    if "is not of type 'boolean'" in str(caught.value):
+        assert str(caught.value).count("is not of type") == 6
     else:
-        assert str(caught.value).count("is not of type") == 2  # FIXME when everything works as
-        #                                                      # expected, set correct number.
+        assert str(caught.value).count("is not of type") == 4
+    # Check correct error message for completely unknown path
+    with pytest.raises(jsonschema.ValidationError) as caught:
+        convert.to_dict(xlsx=rfp("data/simple_data_broken_paths.xlsx"),
+                        schema=rfp("data/simple_schema.json"))
+    assert "Malformed metadata: Cannot parse paths" in str(caught.value)
 
 
 def test_additional_column():