def format_exception_table(exceptions: list[tuple], worksheet_title: str,
                           column_names: Union[dict, list, None] = None,
                           max_line_length: int = 120) -> str:
    """Return a formatted table of validation errors for one worksheet.

    Given a list of tuples containing a row and column number as well as an
    exception in that order, and the title of the current worksheet, returns
    a formatted table of the exceptions.

    Optionally takes a dict of column names; if given, a header will be
    generated for each column and exceptions will be clustered by column.

    Params
    ------
    exceptions: list of tuples containing row, column, and exception
        Data to be formatted.  ``row`` may be a non-int (e.g. ``None``) when
        the error is not tied to a single row.
    worksheet_title: str
        Name of the current worksheet
    column_names: dict or list, optional
        column_names[column_num] should return the name of column column_num.
        If given, exceptions will be clustered by column.
    max_line_length: int
        Soft cap for the line length of the resulting table (default 120)

    Return
    ------
    string_rep: str
        Table containing the given exceptions
    """
    def to_char(num: int) -> str:
        # 0-based column index -> spreadsheet letters (0 -> "A", 26 -> "AA").
        if num < 0:
            return ""
        return to_char(int(num / 26) - 1) + chr(int(num % 26) + 65)

    max_line_length -= 40  # Estimate of Field + Type space use

    headers = {"loc": "Field", "type": "Error Type", "mess": ["Message"]}
    # NOTE: "mess" holds a list of lines; its starting width is the header
    # text length, not the list length (the original used len(["Message"])==1).
    lengths = {"loc": len(headers["loc"]), "type": len(headers["type"]),
               "mess": len(headers["mess"][0])}
    new_data = []

    current_column = None
    # Sort a copy by column so the caller's list is not mutated.
    for row_i, col_i, excep in sorted(exceptions, key=lambda tup: tup[1]):
        if column_names is not None:
            # Emit a cluster header whenever the column changes.
            if current_column != col_i:
                current_column = col_i
                new_data.append({
                    "loc": f"\nErrors in column '{column_names[col_i]}':",
                    "type": "", "mess": [""]
                })
        # Setup
        row = {}
        new_data.append(row)
        # Field: spreadsheet-style cell reference, e.g. "J7"
        if isinstance(row_i, int):
            row["loc"] = f"{to_char(col_i)}{row_i + 1}"
        else:
            row["loc"] = f"{to_char(col_i)}"
        lengths["loc"] = max(lengths["loc"], len(row["loc"]))
        # Code
        row["type"] = type(excep).__name__
        lengths["type"] = max(lengths["type"], len(row["type"]))
        # Message: soft-wrap long lines at word boundaries
        lines = str(excep).split('\n')
        new_lines = []
        for line in lines:
            if len(line) > max_line_length:
                words = line.split(' ')
                current = ""
                for word, next_word in zip(words, words[1:] + [""]):
                    if current != "":
                        current += " "
                    current += word
                    if len(current + next_word) > max_line_length:
                        lengths["mess"] = max(lengths["mess"], len(current))
                        new_lines.append(current)
                        current = ""
                if current != "":
                    lengths["mess"] = max(lengths["mess"], len(current))
                    new_lines.append(current)
            elif len(line) > 0:
                lengths["mess"] = max(lengths["mess"], len(line))
                new_lines.append(line)
        if new_lines == []:
            new_lines = [""]
        row["mess"] = new_lines

    dividers = {key: '–' * length for key, length in lengths.items()}
    dividers["mess"] = [dividers["mess"]]

    # Fill for the messages is set to 0; if we want another column or align
    # right we need to use lengths["mess"]
    string_rep = f"There were failures during validation of worksheet '{worksheet_title}':\n\n"
    for row in [headers, dividers] + new_data:
        string_rep += ' {loc: <{fill}} '.format(loc=row["loc"],
                                                fill=lengths["loc"])
        string_rep += ' {typ: <{fill}} '.format(typ=row["type"],
                                                fill=lengths["type"])
        string_rep += ' {mes: <{fill}}\n'.format(mes=row["mess"][0], fill=0)
        for line in row["mess"][1:]:
            # Front padding to align continuation lines under "Message"
            string_rep += ' ' * (lengths["loc"] + lengths["type"] + 7)
            string_rep += ' {mes: <{fill}}\n'.format(mes=line, fill=0)
    return string_rep
- _set_in_nested(mydict=data, path=path, value=value, prefix=parent, skip=1) - continue + try: + if col_idx in data_column_paths: + path = data_column_paths[col_idx] + if self._is_multiple_choice(path): + real_value = path.pop() # Last component is the enum value, insert above + # set up list + try: + _set_in_nested(mydict=data, path=path, value=[], prefix=parent, skip=1) + except ValueError as err: + if not str(err).startswith("There is already some value at"): + raise + if not xlsx_utils.parse_multiple_choice(value): + continue + _set_in_nested(mydict=data, path=path, value=real_value, prefix=parent, + skip=1, append_to_list=True) + else: + value = self._validate_and_convert(value, path) + _set_in_nested(mydict=data, path=path, value=value, prefix=parent, skip=1) + continue + except (ValueError, jsonschema.ValidationError) as e: + exceptions.append((row_idx, col_idx, e)) try: # Find current position in tree @@ -250,6 +365,12 @@ class XLSXConverter: if not fail_later: raise self._errors[(sheet.title, row_idx)] = kerr.definitions + + if exceptions != []: + exception_table = format_exception_table(exceptions, sheet.title, + col_names) + raise jsonschema.ValidationError(exception_table) + self._handled_sheets.add(sheet.title) def _is_multiple_choice(self, path: list[str]) -> bool: @@ -308,20 +429,15 @@ class XLSXConverter: if isinstance(value, str) and ";" in value: values = [self.PARSER[array_type](v) for v in value.split(";")] return values - try: - # special case: datetime or date - if ("anyOf" in subschema): - if isinstance(value, datetime.datetime) and ( - {'type': 'string', 'format': 'date-time'} in subschema["anyOf"]): - return value - if isinstance(value, datetime.date) and ( - {'type': 'string', 'format': 'date'} in subschema["anyOf"]): - return value - jsonschema.validate(value, subschema) - except jsonschema.ValidationError as verr: - print(verr) - print(path) - raise + # special case: datetime or date + if ("anyOf" in subschema): + if isinstance(value, 
def test_wrong_datatype():
    """Reading a sheet with badly typed cells raises one ValidationError
    whose message lists every offending cell together with its location."""
    with pytest.raises(jsonschema.ValidationError) as caught:
        convert.to_dict(xlsx=rfp("data/simple_data_broken.xlsx"),
                        schema=rfp("data/simple_schema.json"))
    message = str(caught.value)
    expected_locations = {
        "'Not a num' is not of type 'number'": "J7",
        "1.5 is not of type 'integer'": "K7",
    }
    # Both expected errors are reported ...
    for expected in expected_locations:
        assert expected in message
    # ... and each appears on a line naming the correct cell.
    for line in message.split('\n'):
        for expected, cell in expected_locations.items():
            if expected in line:
                assert cell in line
    # No additional type errors
    if "is not of type 'boolean'" in message:  # ToDo: Remove when boolean is fixed
        assert message.count("is not of type") == 3
    else:
        assert message.count("is not of type") == 2