Skip to content
Snippets Groups Projects
Commit 5ceee474 authored by I. Nüske's avatar I. Nüske
Browse files

ENH: convert.to_dict() now outputs encountered type errors in a table

parent 26079795
No related branches found
No related tags found
2 merge requests: !128 "MNT: Added a warning when column metadata is not configured, and a better...", !120 "XLSX-Konverter: Bessere Fehlermeldung bei inkorrektem Typ in Spalte, zusätzlicher Spalte"
......@@ -46,6 +46,114 @@ def _strict_bool(value: Any) -> bool:
raise TypeError(f"Not a good boolean: {repr(value)}")
def format_exception_table(exceptions: list[tuple], worksheet_title: str,
                           column_names: Optional[Any] = None,
                           max_line_length: Optional[int] = 120) -> str:
    """
    Given a list of tuples containing a row and column number as well as an
    exception in that order, and the title of the current worksheet, returns
    a formatted table of the exceptions.

    Optionally takes a lookup of column names, if given a header will be
    generated for each column and exceptions will be clustered by column.

    Default line length is 120 and can be overwritten by max_line_length.

    Params
    ------
    exceptions:         list of tuples containing row, column, and exception
                        Data to be formatted.  The list itself is not modified.
                        If the row is not an int, only the column letter is
                        shown in the "Field" column.
    worksheet_title:    str
                        Name of the current worksheet
    column_names:       dict, list or tuple, optional
                        column_names[column_num] should return the name of
                        column column_num.
                        If given, a header line is emitted per column.
    max_line_length:    int or None
                        Soft cap for the line length of the resulting table.
                        40 characters are reserved as an estimate for the
                        "Field" and "Error Type" columns.  None disables the
                        wrapping of long messages.

    Return
    ------
    string_rep:         str
                        Table containing the given exceptions
    """
    def to_char(num: int) -> str:
        # Zero-based column index to spreadsheet letters: 0 -> A, 25 -> Z,
        # 26 -> AA.  Negative numbers yield an empty string.
        num = int(num)
        letters = ""
        while num >= 0:
            letters = chr(num % 26 + 65) + letters
            num = num // 26 - 1
        return letters

    def wrap(message: str) -> list:
        # Split the message into lines, breaking overlong lines at spaces.
        # Empty lines are dropped; the result always has at least one entry.
        wrapped = []
        for line in message.split('\n'):
            if wrap_width is not None and len(line) > wrap_width:
                words = line.split(' ')
                current = ""
                for word, next_word in zip(words, words[1:] + [""]):
                    if current:
                        current += " "
                    current += word
                    # Look ahead: close the line if the next word would not fit
                    if len(current + next_word) > wrap_width:
                        wrapped.append(current)
                        current = ""
                if current:
                    wrapped.append(current)
            elif line:
                wrapped.append(line)
        return wrapped or [""]

    # Estimate of Field + Type space use
    wrap_width = None if max_line_length is None else max_line_length - 40

    headers = {"loc": "Field", "type": "Error Type", "mess": ["Message"]}
    lengths = {"loc": len(headers["loc"]),
               "type": len(headers["type"]),
               "mess": len(headers["mess"][0])}
    new_data = []

    current_column = None
    # sorted() is stable (rows keep their relative order within a column) and
    # leaves the list of the caller untouched.
    for row_i, col_i, excep in sorted(exceptions, key=lambda tup: tup[1]):
        # Emit a header line whenever a new column starts
        if column_names is not None and current_column != col_i:
            current_column = col_i
            new_data.append({
                "loc": f"\nErrors in column '{column_names[col_i]}':",
                "type": "", "mess": [""]
            })
        # Field
        if isinstance(row_i, int):
            loc = f"{to_char(col_i)}{row_i + 1}"
        else:
            loc = f"{to_char(col_i)}"
        # Code
        type_name = type(excep).__name__
        # Message
        message_lines = wrap(str(excep))

        lengths["loc"] = max(lengths["loc"], len(loc))
        lengths["type"] = max(lengths["type"], len(type_name))
        lengths["mess"] = max(lengths["mess"], *(len(line) for line in message_lines))
        new_data.append({"loc": loc, "type": type_name, "mess": message_lines})

    dividers = {"loc": '-' * lengths["loc"],
                "type": '-' * lengths["type"],
                "mess": ['-' * lengths["mess"]]}

    # Width of the two padded leading cells (one space on either side each), so
    # that continuation lines of a message start below its first line.
    front_padding = ' ' * (lengths["loc"] + lengths["type"] + 4)

    # The messages are not padded, if we want another column or align right we
    # need to use lengths["mess"]
    parts = [f"There were failures during validation of worksheet '{worksheet_title}':\n\n"]
    for row in [headers, dividers] + new_data:
        parts.append(' {loc: <{fill}} '.format(loc=row["loc"], fill=lengths["loc"]))
        parts.append(' {typ: <{fill}} '.format(typ=row["type"], fill=lengths["type"]))
        parts.append(' {mes}\n'.format(mes=row["mess"][0]))
        for line in row["mess"][1:]:
            parts.append(front_padding + ' {mes}\n'.format(mes=line))
    return "".join(parts)
class ForeignError(KeyError):
def __init__(self, *args, definitions: list, message: str = ""):
super().__init__(message, *args)
......@@ -205,9 +313,13 @@ class XLSXConverter:
# # - data: The actual data of this entry, a dict.
# entries: dict[str, list[SimpleNamespace]] = {}
exceptions = []
col_names = None
for row_idx, row in enumerate(sheet.iter_rows(values_only=True)):
# Skip non-data rows.
# Skip non-data rows and save the row containing column names
if row[row_type_column] is not None:
if row[row_type_column] == "IGNORE" and col_names is None:
col_names = row
continue
foreign_repr = ""
foreign = [] # A list of lists, each of which is: [path1, path2, ..., leaf, value]
......@@ -219,24 +331,27 @@ class XLSXConverter:
foreign.append(foreign_column_paths[col_idx] + [value])
continue
if col_idx in data_column_paths:
path = data_column_paths[col_idx]
if self._is_multiple_choice(path):
real_value = path.pop() # Last component is the enum value, insert above
# set up list
try:
_set_in_nested(mydict=data, path=path, value=[], prefix=parent, skip=1)
except ValueError as err:
if not str(err).startswith("There is already some value at"):
raise
if not xlsx_utils.parse_multiple_choice(value):
continue
_set_in_nested(mydict=data, path=path, value=real_value, prefix=parent,
skip=1, append_to_list=True)
else:
value = self._validate_and_convert(value, path)
_set_in_nested(mydict=data, path=path, value=value, prefix=parent, skip=1)
continue
try:
if col_idx in data_column_paths:
path = data_column_paths[col_idx]
if self._is_multiple_choice(path):
real_value = path.pop() # Last component is the enum value, insert above
# set up list
try:
_set_in_nested(mydict=data, path=path, value=[], prefix=parent, skip=1)
except ValueError as err:
if not str(err).startswith("There is already some value at"):
raise
if not xlsx_utils.parse_multiple_choice(value):
continue
_set_in_nested(mydict=data, path=path, value=real_value, prefix=parent,
skip=1, append_to_list=True)
else:
value = self._validate_and_convert(value, path)
_set_in_nested(mydict=data, path=path, value=value, prefix=parent, skip=1)
continue
except (ValueError, jsonschema.ValidationError) as e:
exceptions.append((row_idx, col_idx, e))
try:
# Find current position in tree
......@@ -250,6 +365,12 @@ class XLSXConverter:
if not fail_later:
raise
self._errors[(sheet.title, row_idx)] = kerr.definitions
if exceptions != []:
exception_table = format_exception_table(exceptions, sheet.title,
col_names)
raise jsonschema.ValidationError(exception_table)
self._handled_sheets.add(sheet.title)
def _is_multiple_choice(self, path: list[str]) -> bool:
......@@ -308,20 +429,15 @@ class XLSXConverter:
if isinstance(value, str) and ";" in value:
values = [self.PARSER[array_type](v) for v in value.split(";")]
return values
try:
# special case: datetime or date
if ("anyOf" in subschema):
if isinstance(value, datetime.datetime) and (
{'type': 'string', 'format': 'date-time'} in subschema["anyOf"]):
return value
if isinstance(value, datetime.date) and (
{'type': 'string', 'format': 'date'} in subschema["anyOf"]):
return value
jsonschema.validate(value, subschema)
except jsonschema.ValidationError as verr:
print(verr)
print(path)
raise
# special case: datetime or date
if ("anyOf" in subschema):
if isinstance(value, datetime.datetime) and (
{'type': 'string', 'format': 'date-time'} in subschema["anyOf"]):
return value
if isinstance(value, datetime.date) and (
{'type': 'string', 'format': 'date'} in subschema["anyOf"]):
return value
jsonschema.validate(value, subschema)
# Finally: convert to target type
return self.PARSER[subschema.get("type", "string")](value)
......
File added
......@@ -27,6 +27,7 @@ import re
from types import SimpleNamespace
import jsonschema
import pytest
from caosadvancedtools.table_json_conversion import convert
......@@ -112,6 +113,26 @@ def test_missing_columns():
assert expected in messages
def test_wrong_datatype():
    """Converting the broken sample workbook must report both type errors.

    The raised ``jsonschema.ValidationError`` has to mention the faulty number
    and integer values, each on a line that also names the offending cell, and
    must not contain any further "is not of type" complaints.
    """
    number_error = "'Not a num' is not of type 'number'"
    integer_error = "1.5 is not of type 'integer'"
    expected_cells = ((number_error, "J7"), (integer_error, "K7"))

    with pytest.raises(jsonschema.ValidationError) as caught:
        convert.to_dict(xlsx=rfp("data/simple_data_broken.xlsx"),
                        schema=rfp("data/simple_schema.json"))
    report = str(caught.value)

    # Correct Errors
    assert number_error in report
    assert integer_error in report

    # Correct Locations: the cell name shares a line with its error message
    for report_line in report.split('\n'):
        for error_text, cell in expected_cells:
            if error_text in report_line:
                assert cell in report_line

    # No additional type errors
    # ToDo: Remove the boolean special case when boolean is fixed
    expected_count = 3 if "is not of type 'boolean'" in report else 2
    assert report.count("is not of type") == expected_count
def test_faulty_foreign():
# Simple wrong foreign key
converter = convert.XLSXConverter(xlsx=rfp("data/simple_data_wrong_foreign.xlsx"),
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment