Skip to content
Snippets Groups Projects
Commit d0f4f3f5 authored by I. Nüske
Browse files

MNT: Implement review feedback, extend test, some more comments

parent 22dac5d1
No related branches found
No related tags found
2 merge requests!128MNT: Added a warning when column metadata is not configured, and a better...,!120XLSX-Konverter: Bessere Fehlermeldung bei inkorrektem Typ in Spalte, zusätzlicher Spalte
Pipeline #57714 passed
This commit is part of merge request !120. Comments created here will be created in the context of that merge request.
......@@ -25,6 +25,7 @@ from __future__ import annotations
import datetime
import itertools
import sys
import textwrap
from functools import reduce
from operator import getitem
from types import SimpleNamespace
......@@ -95,65 +96,52 @@ def _format_exception_table(exceptions: list[tuple], worksheet_title: str,
exceptions.sort(key=lambda tup: tup[1])
for row_i, col_i, excep in exceptions:
if column_names is not None:
# Update Names
# Add a line with information about the current column
if current_column != col_i:
current_column = col_i
new_data.append({
"loc": f"\nErrors in column '{column_names[col_i]}':",
"type": "", "mess": [""]
})
# Setup
row = {}
new_data.append(row)
# Field
# Setup for current Exception
curr_err_data = {}
new_data.append(curr_err_data)
# Get field
if isinstance(row_i, int):
row["loc"] = f"Cell {_column_id_to_chars(col_i)}{row_i + 1}"
curr_err_data["loc"] = f"Cell {_column_id_to_chars(col_i)}{row_i + 1}"
else:
row["loc"] = f"Column {_column_id_to_chars(col_i)}"
lengths["loc"] = max(lengths["loc"], len(row["loc"]))
# Code
row["type"] = type(excep).__name__
lengths["type"] = max(lengths["type"], len(row["type"]))
# Message
curr_err_data["loc"] = f"Column {_column_id_to_chars(col_i)}"
lengths["loc"] = max(lengths["loc"], len(curr_err_data["loc"]))
# Add error code
curr_err_data["type"] = type(excep).__name__
lengths["type"] = max(lengths["type"], len(curr_err_data["type"]))
# Format message - split into lines
lines = str(excep).split('\n')
new_lines = []
for line in lines:
if len(line) > max_line_length:
words = line.split(' ')
current = ""
for word, next_word in zip(words, words[1:] + [""]):
if current != "":
current += " "
current += word
if len(current + next_word) > max_line_length:
lengths["mess"] = max(lengths["mess"], len(current))
new_lines.append(current)
current = ""
if current != "":
lengths["mess"] = max(lengths["mess"], len(current))
new_lines.append(current)
elif len(line) > 0:
lengths["mess"] = max(lengths["mess"], len(line))
new_lines.append(line)
new_lines += textwrap.wrap(line, max_line_length, break_long_words=False)
for line in new_lines:
lengths["mess"] = max(lengths["mess"], len(line))
if new_lines == []:
new_lines = [""]
row["mess"] = new_lines
curr_err_data["mess"] = new_lines
# Generate underline for each header
dividers = {key: '' * l for key, l in lengths.items()}
dividers["mess"] = [dividers["mess"]]
# Fill for the messages is set to 0, if we want another column or align
# right we need to use lengths["mess"]
# Fill with spaces for alignment
string_rep = f"There were errors during the validation of worksheet '{worksheet_title}':\n\n"
for row in [headers, dividers] + new_data:
string_rep += ' {loc: <{fill}} '.format(loc=row["loc"],
for curr_err_data in [headers, dividers] + new_data:
string_rep += ' {loc: <{fill}} '.format(loc=curr_err_data["loc"],
fill=lengths["loc"])
string_rep += ' {typ: <{fill}} '.format(typ=row["type"],
string_rep += ' {typ: <{fill}} '.format(typ=curr_err_data["type"],
fill=lengths["type"])
string_rep += ' {mes: <{fill}}\n'.format(mes=row["mess"][0], fill=0)
for line in row["mess"][1:]:
# Front padding
string_rep += ' ' * (lengths["loc"] + lengths["type"] + 7)
# Fill for the messages is set to 0, if we want another column or align
# right we need to use lengths["mess"]
string_rep += ' {mes: <{fill}}\n'.format(mes=curr_err_data["mess"][0], fill=0)
for line in curr_err_data["mess"][1:]:
# Front padding for lines without location and error type
string_rep += ' ' * (lengths["loc"] + lengths["type"] + 6)
string_rep += ' {mes: <{fill}}\n'.format(mes=line, fill=0)
return string_rep
......@@ -194,7 +182,11 @@ class XLSXConverter:
self._workbook = load_workbook(xlsx)
self._schema = read_or_dict(schema)
self._defining_path_index = xlsx_utils.get_defining_paths(self._workbook)
self._check_columns(fail_fast=strict)
try:
self._check_columns(fail_fast=strict)
except KeyError as e:
raise jsonschema.ValidationError(f"Malformed metadata: Cannot parse paths. "
f"Unknown path: {e}") from e
self._handled_sheets: set[str] = set()
self._result: dict = {}
self._errors: dict = {}
......@@ -220,9 +212,29 @@ class XLSXConverter:
self._handled_sheets = set()
self._result = {}
self._errors = {}
for sheetname in self._workbook.sheetnames:
if sheetname not in self._handled_sheets:
self._handle_sheet(self._workbook[sheetname], fail_later=collect_errors)
if not collect_errors:
for sheetname in self._workbook.sheetnames:
if sheetname not in self._handled_sheets:
self._handle_sheet(self._workbook[sheetname], fail_later=collect_errors)
else:
# Collect errors from converting
exceptions = []
for sheetname in self._workbook.sheetnames:
if sheetname not in self._handled_sheets:
try:
self._handle_sheet(self._workbook[sheetname], fail_later=collect_errors)
except jsonschema.ValidationError as e:
exceptions.append(e)
# do not collect errors from sheet again
self._handled_sheets.add(sheetname)
if len(exceptions) == 1:
raise exceptions[0]
elif len(exceptions) > 1:
mess = "There were errors during the validation of several worksheets:\n\n"
mess += '\n\n'.join([str(e).replace("There were errors during the validation of worksheet",
"In worksheet")
for e in exceptions])
raise jsonschema.ValidationError(mess)
if validate:
jsonschema.validate(self._result, self._schema)
if self._errors:
......@@ -323,6 +335,7 @@ class XLSXConverter:
# entries: dict[str, list[SimpleNamespace]] = {}
exceptions = []
warns = []
col_names = {}
for row_idx, row in enumerate(sheet.iter_rows(values_only=True)):
# Skip non-data rows
......@@ -359,7 +372,12 @@ class XLSXConverter:
_set_in_nested(mydict=data, path=path, value=value, prefix=parent, skip=1)
continue
elif sheet.cell(col_type_row+1, col_idx+1).value is None:
warn(f"No metadata configured for column {_column_id_to_chars(col_idx)}.")
mess = (f"\nNo metadata configured for column "
f"'{_column_id_to_chars(col_idx)}' in worksheet "
f"'{sheet.title}'.\n")
if mess not in warns:
print(mess, file=sys.stderr)
warns.append(mess) # Prevent multiple instances of same warning
except (ValueError, KeyError, jsonschema.ValidationError) as e:
# Append error for entire column only once
if isinstance(e, KeyError) and 'column' in str(e):
......
No preview for this file type
File added
......@@ -113,27 +113,47 @@ def test_missing_columns():
assert expected in messages
def test_wrong_datatype():
def test_error_table():
with pytest.raises(jsonschema.ValidationError) as caught:
convert.to_dict(xlsx=rfp("data/simple_data_broken.xlsx"),
schema=rfp("data/simple_schema.json"))
# Correct Errors
assert "Malformed metadata: Cannot parse paths in worksheet 'Person'." in str(caught.value)
assert "'Not a num' is not of type 'number'" in str(caught.value)
assert "'Yes a number?' is not of type 'number'" in str(caught.value)
assert "1.5 is not of type 'integer'" in str(caught.value)
assert "1.2345 is not of type 'integer'" in str(caught.value)
assert "'There is no entry in the schema" in str(caught.value)
assert "'Not an enum' is not one of [" in str(caught.value)
# Correct Locations
for line in str(caught.value).split('\n'):
if "'Not a num' is not of type 'number'" in line:
assert "J7" in line
if "'Yes a number?' is not of type 'number'" in line:
assert "J8" in line
if "1.5 is not of type 'integer'" in line:
assert "K7" in line
if "1.2345 is not of type 'integer'" in line:
assert "K8" in line
# No additional type errors
if "is not of type 'boolean'" in str(caught.value): # ToDo: Remove when boolean is fixed
assert str(caught.value).count("is not of type") == 3
if "'There is no entry in the schema" in line:
assert "Column M" in line
if "'Not an enum' is not one of [" in line:
assert "G8" in line
# No additional errors
assert str(caught.value).count("Malformed metadata: Cannot parse paths in worksheet") == 1
assert str(caught.value).count("There is no entry in the schema") == 1
assert str(caught.value).count("is not one of") == 1
# FIXME ToDo: Remove when boolean is fixed / when everything works as
# expected, set correct number.
if "is not of type 'boolean'" in str(caught.value):
assert str(caught.value).count("is not of type") == 6
else:
assert str(caught.value).count("is not of type") == 2 # FIXME when everything works as
# # expected, set correct number.
assert str(caught.value).count("is not of type") == 4
# Check correct error message for completely unknown path
with pytest.raises(jsonschema.ValidationError) as caught:
convert.to_dict(xlsx=rfp("data/simple_data_broken_paths.xlsx"),
schema=rfp("data/simple_schema.json"))
assert "Malformed metadata: Cannot parse paths" in str(caught.value)
def test_additional_column():
......
0% loaded — Loading, or try reloading the page.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment