Skip to content
Snippets Groups Projects
Commit d0f4f3f5 authored by I. Nüske
Browse files

MNT: Implement review feedback, extend test, some more comments

parent 22dac5d1
No related branches found
No related tags found
2 merge requests!128MNT: Added a warning when column metadata is not configured, and a better...,!120XLSX-Konverter: Bessere Fehlermeldung bei inkorrektem Typ in Spalte, zusätzlicher Spalte
Pipeline #57714 passed
This commit is part of merge request !120. Comments created here will be created in the context of that merge request.
......@@ -25,6 +25,7 @@ from __future__ import annotations
import datetime
import itertools
import sys
import textwrap
from functools import reduce
from operator import getitem
from types import SimpleNamespace
......@@ -95,65 +96,52 @@ def _format_exception_table(exceptions: list[tuple], worksheet_title: str,
exceptions.sort(key=lambda tup: tup[1])
for row_i, col_i, excep in exceptions:
if column_names is not None:
# Update Names
# Add a line with information about the current column
if current_column != col_i:
current_column = col_i
new_data.append({
"loc": f"\nErrors in column '{column_names[col_i]}':",
"type": "", "mess": [""]
})
# Setup
row = {}
new_data.append(row)
# Field
# Setup for current Exception
curr_err_data = {}
new_data.append(curr_err_data)
# Get field
if isinstance(row_i, int):
row["loc"] = f"Cell {_column_id_to_chars(col_i)}{row_i + 1}"
curr_err_data["loc"] = f"Cell {_column_id_to_chars(col_i)}{row_i + 1}"
else:
row["loc"] = f"Column {_column_id_to_chars(col_i)}"
lengths["loc"] = max(lengths["loc"], len(row["loc"]))
# Code
row["type"] = type(excep).__name__
lengths["type"] = max(lengths["type"], len(row["type"]))
# Message
curr_err_data["loc"] = f"Column {_column_id_to_chars(col_i)}"
lengths["loc"] = max(lengths["loc"], len(curr_err_data["loc"]))
# Add error code
curr_err_data["type"] = type(excep).__name__
lengths["type"] = max(lengths["type"], len(curr_err_data["type"]))
# Format message - split into lines
lines = str(excep).split('\n')
new_lines = []
for line in lines:
if len(line) > max_line_length:
words = line.split(' ')
current = ""
for word, next_word in zip(words, words[1:] + [""]):
if current != "":
current += " "
current += word
if len(current + next_word) > max_line_length:
lengths["mess"] = max(lengths["mess"], len(current))
new_lines.append(current)
current = ""
if current != "":
lengths["mess"] = max(lengths["mess"], len(current))
new_lines.append(current)
elif len(line) > 0:
lengths["mess"] = max(lengths["mess"], len(line))
new_lines.append(line)
new_lines += textwrap.wrap(line, max_line_length, break_long_words=False)
for line in new_lines:
lengths["mess"] = max(lengths["mess"], len(line))
if new_lines == []:
new_lines = [""]
row["mess"] = new_lines
curr_err_data["mess"] = new_lines
# Generate underline for each header
dividers = {key: '' * l for key, l in lengths.items()}
dividers["mess"] = [dividers["mess"]]
# Fill for the messages is set to 0, if we want another column or align
# right we need to use lengths["mess"]
# Fill with spaces for alignment
string_rep = f"There were errors during the validation of worksheet '{worksheet_title}':\n\n"
for row in [headers, dividers] + new_data:
string_rep += ' {loc: <{fill}} '.format(loc=row["loc"],
for curr_err_data in [headers, dividers] + new_data:
string_rep += ' {loc: <{fill}} '.format(loc=curr_err_data["loc"],
fill=lengths["loc"])
string_rep += ' {typ: <{fill}} '.format(typ=row["type"],
string_rep += ' {typ: <{fill}} '.format(typ=curr_err_data["type"],
fill=lengths["type"])
string_rep += ' {mes: <{fill}}\n'.format(mes=row["mess"][0], fill=0)
for line in row["mess"][1:]:
# Front padding
string_rep += ' ' * (lengths["loc"] + lengths["type"] + 7)
# Fill for the messages is set to 0, if we want another column or align
# right we need to use lengths["mess"]
string_rep += ' {mes: <{fill}}\n'.format(mes=curr_err_data["mess"][0], fill=0)
for line in curr_err_data["mess"][1:]:
# Front padding for lines without location and error type
string_rep += ' ' * (lengths["loc"] + lengths["type"] + 6)
string_rep += ' {mes: <{fill}}\n'.format(mes=line, fill=0)
return string_rep
......@@ -194,7 +182,11 @@ class XLSXConverter:
self._workbook = load_workbook(xlsx)
self._schema = read_or_dict(schema)
self._defining_path_index = xlsx_utils.get_defining_paths(self._workbook)
self._check_columns(fail_fast=strict)
try:
self._check_columns(fail_fast=strict)
except KeyError as e:
raise jsonschema.ValidationError(f"Malformed metadata: Cannot parse paths. "
f"Unknown path: {e}") from e
self._handled_sheets: set[str] = set()
self._result: dict = {}
self._errors: dict = {}
......@@ -220,9 +212,29 @@ class XLSXConverter:
self._handled_sheets = set()
self._result = {}
self._errors = {}
for sheetname in self._workbook.sheetnames:
if sheetname not in self._handled_sheets:
self._handle_sheet(self._workbook[sheetname], fail_later=collect_errors)
if not collect_errors:
for sheetname in self._workbook.sheetnames:
if sheetname not in self._handled_sheets:
self._handle_sheet(self._workbook[sheetname], fail_later=collect_errors)
else:
# Collect errors from converting
exceptions = []
for sheetname in self._workbook.sheetnames:
if sheetname not in self._handled_sheets:
try:
self._handle_sheet(self._workbook[sheetname], fail_later=collect_errors)
except jsonschema.ValidationError as e:
exceptions.append(e)
# do not collect errors from sheet again
self._handled_sheets.add(sheetname)
if len(exceptions) == 1:
raise exceptions[0]
elif len(exceptions) > 1:
mess = "There were errors during the validation of several worksheets:\n\n"
mess += '\n\n'.join([str(e).replace("There were errors during the validation of worksheet",
"In worksheet")
for e in exceptions])
raise jsonschema.ValidationError(mess)
if validate:
jsonschema.validate(self._result, self._schema)
if self._errors:
......@@ -323,6 +335,7 @@ class XLSXConverter:
# entries: dict[str, list[SimpleNamespace]] = {}
exceptions = []
warns = []
col_names = {}
for row_idx, row in enumerate(sheet.iter_rows(values_only=True)):
# Skip non-data rows
......@@ -359,7 +372,12 @@ class XLSXConverter:
_set_in_nested(mydict=data, path=path, value=value, prefix=parent, skip=1)
continue
elif sheet.cell(col_type_row+1, col_idx+1).value is None:
warn(f"No metadata configured for column {_column_id_to_chars(col_idx)}.")
mess = (f"\nNo metadata configured for column "
f"'{_column_id_to_chars(col_idx)}' in worksheet "
f"'{sheet.title}'.\n")
if mess not in warns:
print(mess, file=sys.stderr)
warns.append(mess) # Prevent multiple instances of same warning
except (ValueError, KeyError, jsonschema.ValidationError) as e:
# Append error for entire column only once
if isinstance(e, KeyError) and 'column' in str(e):
......
No preview for this file type
File added
......@@ -113,27 +113,47 @@ def test_missing_columns():
assert expected in messages
def test_wrong_datatype():
def test_error_table():
with pytest.raises(jsonschema.ValidationError) as caught:
convert.to_dict(xlsx=rfp("data/simple_data_broken.xlsx"),
schema=rfp("data/simple_schema.json"))
# Correct Errors
assert "Malformed metadata: Cannot parse paths in worksheet 'Person'." in str(caught.value)
assert "'Not a num' is not of type 'number'" in str(caught.value)
assert "'Yes a number?' is not of type 'number'" in str(caught.value)
assert "1.5 is not of type 'integer'" in str(caught.value)
assert "1.2345 is not of type 'integer'" in str(caught.value)
assert "'There is no entry in the schema" in str(caught.value)
assert "'Not an enum' is not one of [" in str(caught.value)
# Correct Locations
for line in str(caught.value).split('\n'):
if "'Not a num' is not of type 'number'" in line:
assert "J7" in line
if "'Yes a number?' is not of type 'number'" in line:
assert "J8" in line
if "1.5 is not of type 'integer'" in line:
assert "K7" in line
if "1.2345 is not of type 'integer'" in line:
assert "K8" in line
# No additional type errors
if "is not of type 'boolean'" in str(caught.value): # ToDo: Remove when boolean is fixed
assert str(caught.value).count("is not of type") == 3
if "'There is no entry in the schema" in line:
assert "Column M" in line
if "'Not an enum' is not one of [" in line:
assert "G8" in line
# No additional errors
assert str(caught.value).count("Malformed metadata: Cannot parse paths in worksheet") == 1
assert str(caught.value).count("There is no entry in the schema") == 1
assert str(caught.value).count("is not one of") == 1
# FIXME ToDo: Remove when boolean is fixed / when everything works as
# expected, set correct number.
if "is not of type 'boolean'" in str(caught.value):
assert str(caught.value).count("is not of type") == 6
else:
assert str(caught.value).count("is not of type") == 2 # FIXME when everything works as
# # expected, set correct number.
assert str(caught.value).count("is not of type") == 4
# Check correct error message for completely unknown path
with pytest.raises(jsonschema.ValidationError) as caught:
convert.to_dict(xlsx=rfp("data/simple_data_broken_paths.xlsx"),
schema=rfp("data/simple_schema.json"))
assert "Malformed metadata: Cannot parse paths" in str(caught.value)
def test_additional_column():
......
0% loaded — Loading, or try reloading the page.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment