Skip to content
Snippets Groups Projects
Commit d0f4f3f5 authored by I. Nüske's avatar I. Nüske
Browse files

MNT: Implement review feedback, extend test, some more comments

parent 22dac5d1
Branches
Tags
2 merge requests!128MNT: Added a warning when column metadata is not configured, and a better...,!120XLSX-Konverter: Bessere Fehlermeldung bei inkorrektem Typ in Spalte, zusätzlicher Spalte
Pipeline #57714 passed
...@@ -25,6 +25,7 @@ from __future__ import annotations ...@@ -25,6 +25,7 @@ from __future__ import annotations
import datetime import datetime
import itertools import itertools
import sys import sys
import textwrap
from functools import reduce from functools import reduce
from operator import getitem from operator import getitem
from types import SimpleNamespace from types import SimpleNamespace
...@@ -95,65 +96,52 @@ def _format_exception_table(exceptions: list[tuple], worksheet_title: str, ...@@ -95,65 +96,52 @@ def _format_exception_table(exceptions: list[tuple], worksheet_title: str,
exceptions.sort(key=lambda tup: tup[1]) exceptions.sort(key=lambda tup: tup[1])
for row_i, col_i, excep in exceptions: for row_i, col_i, excep in exceptions:
if column_names is not None: if column_names is not None:
# Update Names # Add a line with information about the current column
if current_column != col_i: if current_column != col_i:
current_column = col_i current_column = col_i
new_data.append({ new_data.append({
"loc": f"\nErrors in column '{column_names[col_i]}':", "loc": f"\nErrors in column '{column_names[col_i]}':",
"type": "", "mess": [""] "type": "", "mess": [""]
}) })
# Setup # Setup for current Exception
row = {} curr_err_data = {}
new_data.append(row) new_data.append(curr_err_data)
# Field # Get field
if isinstance(row_i, int): if isinstance(row_i, int):
row["loc"] = f"Cell {_column_id_to_chars(col_i)}{row_i + 1}" curr_err_data["loc"] = f"Cell {_column_id_to_chars(col_i)}{row_i + 1}"
else: else:
row["loc"] = f"Column {_column_id_to_chars(col_i)}" curr_err_data["loc"] = f"Column {_column_id_to_chars(col_i)}"
lengths["loc"] = max(lengths["loc"], len(row["loc"])) lengths["loc"] = max(lengths["loc"], len(curr_err_data["loc"]))
# Code # Add error code
row["type"] = type(excep).__name__ curr_err_data["type"] = type(excep).__name__
lengths["type"] = max(lengths["type"], len(row["type"])) lengths["type"] = max(lengths["type"], len(curr_err_data["type"]))
# Message # Format message - split into lines
lines = str(excep).split('\n') lines = str(excep).split('\n')
new_lines = [] new_lines = []
for line in lines: for line in lines:
if len(line) > max_line_length: new_lines += textwrap.wrap(line, max_line_length, break_long_words=False)
words = line.split(' ') for line in new_lines:
current = ""
for word, next_word in zip(words, words[1:] + [""]):
if current != "":
current += " "
current += word
if len(current + next_word) > max_line_length:
lengths["mess"] = max(lengths["mess"], len(current))
new_lines.append(current)
current = ""
if current != "":
lengths["mess"] = max(lengths["mess"], len(current))
new_lines.append(current)
elif len(line) > 0:
lengths["mess"] = max(lengths["mess"], len(line)) lengths["mess"] = max(lengths["mess"], len(line))
new_lines.append(line)
if new_lines == []: if new_lines == []:
new_lines = [""] new_lines = [""]
row["mess"] = new_lines curr_err_data["mess"] = new_lines
# Generate underline for each header
dividers = {key: '' * l for key, l in lengths.items()} dividers = {key: '' * l for key, l in lengths.items()}
dividers["mess"] = [dividers["mess"]] dividers["mess"] = [dividers["mess"]]
# Fill with spaces for alignment
# Fill for the messages is set to 0, if we want another column or align
# right we need to use lengths["mess"]
string_rep = f"There were errors during the validation of worksheet '{worksheet_title}':\n\n" string_rep = f"There were errors during the validation of worksheet '{worksheet_title}':\n\n"
for row in [headers, dividers] + new_data: for curr_err_data in [headers, dividers] + new_data:
string_rep += ' {loc: <{fill}} '.format(loc=row["loc"], string_rep += ' {loc: <{fill}} '.format(loc=curr_err_data["loc"],
fill=lengths["loc"]) fill=lengths["loc"])
string_rep += ' {typ: <{fill}} '.format(typ=row["type"], string_rep += ' {typ: <{fill}} '.format(typ=curr_err_data["type"],
fill=lengths["type"]) fill=lengths["type"])
string_rep += ' {mes: <{fill}}\n'.format(mes=row["mess"][0], fill=0) # Fill for the messages is set to 0, if we want another column or align
for line in row["mess"][1:]: # right we need to use lengths["mess"]
# Front padding string_rep += ' {mes: <{fill}}\n'.format(mes=curr_err_data["mess"][0], fill=0)
string_rep += ' ' * (lengths["loc"] + lengths["type"] + 7) for line in curr_err_data["mess"][1:]:
# Front padding for lines without location and error type
string_rep += ' ' * (lengths["loc"] + lengths["type"] + 6)
string_rep += ' {mes: <{fill}}\n'.format(mes=line, fill=0) string_rep += ' {mes: <{fill}}\n'.format(mes=line, fill=0)
return string_rep return string_rep
...@@ -194,7 +182,11 @@ class XLSXConverter: ...@@ -194,7 +182,11 @@ class XLSXConverter:
self._workbook = load_workbook(xlsx) self._workbook = load_workbook(xlsx)
self._schema = read_or_dict(schema) self._schema = read_or_dict(schema)
self._defining_path_index = xlsx_utils.get_defining_paths(self._workbook) self._defining_path_index = xlsx_utils.get_defining_paths(self._workbook)
try:
self._check_columns(fail_fast=strict) self._check_columns(fail_fast=strict)
except KeyError as e:
raise jsonschema.ValidationError(f"Malformed metadata: Cannot parse paths. "
f"Unknown path: {e}") from e
self._handled_sheets: set[str] = set() self._handled_sheets: set[str] = set()
self._result: dict = {} self._result: dict = {}
self._errors: dict = {} self._errors: dict = {}
...@@ -220,9 +212,29 @@ class XLSXConverter: ...@@ -220,9 +212,29 @@ class XLSXConverter:
self._handled_sheets = set() self._handled_sheets = set()
self._result = {} self._result = {}
self._errors = {} self._errors = {}
if not collect_errors:
for sheetname in self._workbook.sheetnames: for sheetname in self._workbook.sheetnames:
if sheetname not in self._handled_sheets: if sheetname not in self._handled_sheets:
self._handle_sheet(self._workbook[sheetname], fail_later=collect_errors) self._handle_sheet(self._workbook[sheetname], fail_later=collect_errors)
else:
# Collect errors from converting
exceptions = []
for sheetname in self._workbook.sheetnames:
if sheetname not in self._handled_sheets:
try:
self._handle_sheet(self._workbook[sheetname], fail_later=collect_errors)
except jsonschema.ValidationError as e:
exceptions.append(e)
# do not collect errors from sheet again
self._handled_sheets.add(sheetname)
if len(exceptions) == 1:
raise exceptions[0]
elif len(exceptions) > 1:
mess = "There were errors during the validation of several worksheets:\n\n"
mess += '\n\n'.join([str(e).replace("There were errors during the validation of worksheet",
"In worksheet")
for e in exceptions])
raise jsonschema.ValidationError(mess)
if validate: if validate:
jsonschema.validate(self._result, self._schema) jsonschema.validate(self._result, self._schema)
if self._errors: if self._errors:
...@@ -323,6 +335,7 @@ class XLSXConverter: ...@@ -323,6 +335,7 @@ class XLSXConverter:
# entries: dict[str, list[SimpleNamespace]] = {} # entries: dict[str, list[SimpleNamespace]] = {}
exceptions = [] exceptions = []
warns = []
col_names = {} col_names = {}
for row_idx, row in enumerate(sheet.iter_rows(values_only=True)): for row_idx, row in enumerate(sheet.iter_rows(values_only=True)):
# Skip non-data rows # Skip non-data rows
...@@ -359,7 +372,12 @@ class XLSXConverter: ...@@ -359,7 +372,12 @@ class XLSXConverter:
_set_in_nested(mydict=data, path=path, value=value, prefix=parent, skip=1) _set_in_nested(mydict=data, path=path, value=value, prefix=parent, skip=1)
continue continue
elif sheet.cell(col_type_row+1, col_idx+1).value is None: elif sheet.cell(col_type_row+1, col_idx+1).value is None:
warn(f"No metadata configured for column {_column_id_to_chars(col_idx)}.") mess = (f"\nNo metadata configured for column "
f"'{_column_id_to_chars(col_idx)}' in worksheet "
f"'{sheet.title}'.\n")
if mess not in warns:
print(mess, file=sys.stderr)
warns.append(mess) # Prevent multiple instances of same warning
except (ValueError, KeyError, jsonschema.ValidationError) as e: except (ValueError, KeyError, jsonschema.ValidationError) as e:
# Append error for entire column only once # Append error for entire column only once
if isinstance(e, KeyError) and 'column' in str(e): if isinstance(e, KeyError) and 'column' in str(e):
......
No preview for this file type
File added
...@@ -113,27 +113,47 @@ def test_missing_columns(): ...@@ -113,27 +113,47 @@ def test_missing_columns():
assert expected in messages assert expected in messages
def test_wrong_datatype(): def test_error_table():
with pytest.raises(jsonschema.ValidationError) as caught: with pytest.raises(jsonschema.ValidationError) as caught:
convert.to_dict(xlsx=rfp("data/simple_data_broken.xlsx"), convert.to_dict(xlsx=rfp("data/simple_data_broken.xlsx"),
schema=rfp("data/simple_schema.json")) schema=rfp("data/simple_schema.json"))
# Correct Errors # Correct Errors
assert "Malformed metadata: Cannot parse paths in worksheet 'Person'." in str(caught.value)
assert "'Not a num' is not of type 'number'" in str(caught.value) assert "'Not a num' is not of type 'number'" in str(caught.value)
assert "'Yes a number?' is not of type 'number'" in str(caught.value)
assert "1.5 is not of type 'integer'" in str(caught.value) assert "1.5 is not of type 'integer'" in str(caught.value)
assert "1.2345 is not of type 'integer'" in str(caught.value)
assert "'There is no entry in the schema" in str(caught.value)
assert "'Not an enum' is not one of [" in str(caught.value)
# Correct Locations # Correct Locations
for line in str(caught.value).split('\n'): for line in str(caught.value).split('\n'):
if "'Not a num' is not of type 'number'" in line: if "'Not a num' is not of type 'number'" in line:
assert "J7" in line assert "J7" in line
if "'Yes a number?' is not of type 'number'" in line:
assert "J8" in line
if "1.5 is not of type 'integer'" in line: if "1.5 is not of type 'integer'" in line:
assert "K7" in line assert "K7" in line
if "1.2345 is not of type 'integer'" in line: if "1.2345 is not of type 'integer'" in line:
assert "K8" in line assert "K8" in line
# No additional type errors if "'There is no entry in the schema" in line:
if "is not of type 'boolean'" in str(caught.value): # ToDo: Remove when boolean is fixed assert "Column M" in line
assert str(caught.value).count("is not of type") == 3 if "'Not an enum' is not one of [" in line:
assert "G8" in line
# No additional errors
assert str(caught.value).count("Malformed metadata: Cannot parse paths in worksheet") == 1
assert str(caught.value).count("There is no entry in the schema") == 1
assert str(caught.value).count("is not one of") == 1
# FIXME ToDo: Remove when boolean is fixed / when everything works as
# expected, set correct number.
if "is not of type 'boolean'" in str(caught.value):
assert str(caught.value).count("is not of type") == 6
else: else:
assert str(caught.value).count("is not of type") == 2 # FIXME when everything works as assert str(caught.value).count("is not of type") == 4
# # expected, set correct number. # Check correct error message for completely unknown path
with pytest.raises(jsonschema.ValidationError) as caught:
convert.to_dict(xlsx=rfp("data/simple_data_broken_paths.xlsx"),
schema=rfp("data/simple_schema.json"))
assert "Malformed metadata: Cannot parse paths" in str(caught.value)
def test_additional_column(): def test_additional_column():
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment