ENH: Added error message to convert.to_dict() when trying to parse a column not in the schema

3b55e553 · I. Nüske · 5ceee474 · 3b55e553 · 3b55e553
Commit 3b55e553 authored 5 months ago by I. Nüske
--- a/src/caosadvancedtools/table_json_conversion/convert.py
+++ b/src/caosadvancedtools/table_json_conversion/convert.py
@@ -83,7 +83,7 @@ def format_exception_table(exceptions: list(tuple), worksheet_title: str,
        return to_char(int(num / 26) - 1) + chr(int(num % 26) + 65)
    max_line_length -= 40             # Estimate of Field + Type space use

-    headers = {"loc": "Field", "type": "Error Type", "mess": ["Message"]}
+    headers = {"loc": "Location", "type": "Error Type", "mess": ["Message"]}
    lengths = {key: len(headers[key]) for key in headers}
    new_data = []

@@ -103,9 +103,9 @@ def format_exception_table(exceptions: list(tuple), worksheet_title: str,
        new_data.append(row)
        # Field
        if isinstance(row_i, int):
-            row["loc"] = f"{to_char(col_i)}{row_i + 1}"
+            row["loc"] = f"Cell {to_char(col_i)}{row_i + 1}"
        else:
-            row["loc"] = f"{to_char(col_i)}"
+            row["loc"] = f"Column {to_char(col_i)}"
        lengths["loc"] = max(lengths["loc"], len(row["loc"]))
        # Code
        row["type"] = type(excep).__name__
@@ -140,7 +140,7 @@ def format_exception_table(exceptions: list(tuple), worksheet_title: str,

    # Fill for the messages is set to 0, if we want another column or align
    # right we need to use lengths["mess"]
-    string_rep = f"There were failures during validation of worksheet '{worksheet_title}':\n\n"
+    string_rep = f"There were errors during the validation of worksheet '{worksheet_title}':\n\n"
    for row in [headers, dividers] + new_data:
        string_rep += ' {loc: <{fill}}  '.format(loc=row["loc"],
                                                 fill=lengths["loc"])
@@ -314,12 +314,10 @@ class XLSXConverter:
        # entries: dict[str, list[SimpleNamespace]] = {}

        exceptions = []
-        col_names = None
+        col_names = {}
        for row_idx, row in enumerate(sheet.iter_rows(values_only=True)):
-            # Skip non-data rows and save the row containing column names
+            # Skip non-data rows
            if row[row_type_column] is not None:
-                if row[row_type_column] == "IGNORE" and col_names is None:
-                    col_names = row
                continue
            foreign_repr = ""
            foreign = []  # A list of lists, each of which is: [path1, path2, ..., leaf, value]
@@ -334,6 +332,7 @@ class XLSXConverter:
                try:
                    if col_idx in data_column_paths:
                        path = data_column_paths[col_idx]
+                        col_names[col_idx] = '.'.join(path)
                        if self._is_multiple_choice(path):
                            real_value = path.pop()  # Last component is the enum value, insert above
                            # set up list
@@ -350,8 +349,14 @@ class XLSXConverter:
                            value = self._validate_and_convert(value, path)
                            _set_in_nested(mydict=data, path=path, value=value, prefix=parent, skip=1)
                        continue
-                except (ValueError, jsonschema.ValidationError) as e:
-                    exceptions.append((row_idx, col_idx, e))
+                except (ValueError, KeyError, jsonschema.ValidationError) as e:
+                    # Append error for entire column only once
+                    if isinstance(e, KeyError) and 'column' in str(e):
+                        if len([err for ri, ci, err in exceptions
+                                if ci == col_idx and isinstance(err, KeyError)]) == 0:
+                            exceptions.append((None, col_idx, e))
+                    else:
+                        exceptions.append((row_idx, col_idx, e))

            try:
                # Find current position in tree
@@ -422,7 +427,10 @@ class XLSXConverter:
        """
        if value is None:
            return value
-        subschema = self._get_subschema(path)
+        try:
+            subschema = self._get_subschema(path)
+        except KeyError as e:
+            raise KeyError("There is no entry in the schema that corresponds to this column.")
        # Array handling only if schema says it's an array.
        if subschema.get("type") == "array":
            array_type = subschema["items"]["type"]

--- a/unittests/table_json_conversion/test_read_xlsx.py
+++ b/unittests/table_json_conversion/test_read_xlsx.py
@@ -133,6 +133,20 @@ def test_wrong_datatype():
        assert str(caught.value).count("is not of type") == 2


+def test_additional_column():
+    with pytest.raises(jsonschema.ValidationError) as caught:
+        convert.to_dict(xlsx=rfp("data/simple_data_broken.xlsx"),
+                        schema=rfp("data/simple_schema.json"))
+    # The raised error text must mention the missing schema entry
+    assert "no entry in the schema that corresponds to this column" in str(caught.value)
+    # Every line carrying that message must also name column M
+    for line in str(caught.value).split('\n'):
+        if "no entry in the schema that corresponds to this column" in line:
+            assert " M " in line
+    # The message must occur exactly once in the whole error text
+    assert str(caught.value).count("no entry in the schema that corresponds to this column") == 1
+
+
 def test_faulty_foreign():
    # Simple wrong foreign key
    converter = convert.XLSXConverter(xlsx=rfp("data/simple_data_wrong_foreign.xlsx"),