Skip to content
Snippets Groups Projects

XLSXConverter paths validation

Merged I. Nüske requested to merge f-converter-unified-path-errors into dev
5 files
+ 149
44
Compare changes
  • Side-by-side
  • Inline

Files

+ 98
19
@@ -182,16 +182,107 @@ class XLSXConverter:
self._workbook = load_workbook(xlsx)
self._schema = read_or_dict(schema)
self._defining_path_index = xlsx_utils.get_defining_paths(self._workbook)
try:
self._check_columns(fail_fast=strict)
except KeyError as e:
raise jsonschema.ValidationError(f"Malformed metadata: Cannot parse paths. "
f"Unknown path: '{e.args[1]}' in sheet '{e.args[0]}'."
) from e
self._check_path_validity()
self._check_columns(fail_fast=strict)
self._handled_sheets: set[str] = set()
self._result: dict = {}
self._errors: dict = {}
def _check_path_validity(self):
    """Check the workbook's column paths for completeness and correctness.

    Iterates over every sheet and every column, verifying that each data
    column has both a column type and a path, and that each path can be
    resolved within the schema.  All findings are collected first; then a
    single warning is emitted (if only warnings were found) or a single
    ``jsonschema.ValidationError`` is raised (if at least one hard error
    was found), containing information on all faulty paths.

    If this method does not raise an error, this does not mean the workbook
    is formatted correctly, only that the contained paths are complete and
    can be found in the schema.

    Raises:
        jsonschema.ValidationError: If at least one column has a missing
            column type, a missing path, or a path that cannot be resolved
            in the schema.
    """
    # Setup: the first entry is the report header; if nothing else is ever
    # appended, validation passed and nothing is raised or warned.
    error_message = ["There were errors during path validation:"]
    only_warnings = True
    for sheetname in self._workbook.sheetnames:
        sheet = self._workbook[sheetname]
        error_message.append(f"\nIn sheet {sheetname}:")
        # Collect path information and filter out the information column
        row_i_col_type = xlsx_utils.get_column_type_row_index(sheet)
        path_rows = xlsx_utils.get_path_rows(sheet)
        paths = []
        for col_i, col in enumerate(sheet.iter_cols()):
            col_type = col[row_i_col_type].value
            path = [col[row_i].value for row_i in path_rows
                    if col[row_i].value not in [None, '']]
            if col_type == 'COL_TYPE':
                continue
            paths.append((col_type, path, col_i, col))
        # Check paths
        for col_type, path, col_i, col in paths:
            # No column type set
            if col_type in [None, '']:
                if len(path) == 0:  # Likely a comment column
                    # Check whether the column has any visible content,
                    # i.e. at least one cell with non-whitespace text.
                    # (str(...).strip() is the actual whitespace filter;
                    # any() stops at the first hit.)
                    content_in_column = any(
                        cell.value is not None
                        and str(cell.value).strip() != ''
                        for cell in col)
                    # If yes - might be an error but is not forbidden, so warn
                    if content_in_column:
                        m = (f"Warning:\tIn column {_column_id_to_chars(col_i)} "
                             f"there is no column metadata set. This column "
                             f"will be ignored during parsing.")
                        error_message.append(m)
                    continue
                else:  # Path is set but no column type
                    only_warnings = False
                    m = (f"ERROR:\t\tIn column {_column_id_to_chars(col_i)} "
                         f"the column type is missing.")
                    error_message.append(m)
                    # No continue - even if the column type is missing, we
                    # can still check the path below.
            if len(path) == 0:  # Column type is set but no path
                only_warnings = False
                m = (f"ERROR:\t\tIn column {_column_id_to_chars(col_i)} "
                     f"the path is missing.")
                error_message.append(m)
                continue
            # Check that the path can be resolved in the schema
            try:
                subschema = xlsx_utils.get_subschema(path, self._schema)
                schema_type = subschema.get('type', None)
                if schema_type is None and 'enum' in subschema:
                    schema_type = 'enum'
                if schema_type is None and 'anyOf' in subschema:
                    schema_type = 'anyOf'
                if schema_type == 'array':  # Check item type instead
                    schema_type = subschema.get('items', {}).get('type', None)
                if schema_type in ['object', 'array', None]:
                    # A complete path should end on a scalar / enum / anyOf
                    # entry; ending on an object or array suggests the path
                    # was truncated.
                    m = (f"Warning:\tIn column {_column_id_to_chars(col_i)} "
                         f"the path may be incomplete.")
                    error_message.append(m)
            except KeyError as e:
                only_warnings = False
                m = (f"ERROR:\t\tIn column {_column_id_to_chars(col_i)} "
                     f"parsing of the path '{'.'.join(path)}' fails "
                     f"on the path component {str(e)}.\n\t\t\t"
                     f"This likely means the path is incomplete or not "
                     f"present in the schema.")
                error_message.append(m)
        # Drop the sheet header again if no findings were added for it
        if error_message[-1] == f"\nIn sheet {sheetname}:":
            error_message.pop(-1)
    # Determine whether error / warning / nothing should be raised
    if error_message == ["There were errors during path validation:"]:
        return
    error_message = '\n'.join(error_message)
    if only_warnings:
        warn(error_message)
    else:
        raise jsonschema.ValidationError(error_message)
def to_dict(self, validate: bool = False, collect_errors: bool = True) -> dict:
"""Convert the xlsx contents to a dict.
@@ -316,7 +407,6 @@ class XLSXConverter:
If True, do not fail with unresolvable foreign definitions, but collect all errors.
"""
row_type_column = xlsx_utils.get_row_type_column_index(sheet)
col_type_row = xlsx_utils.get_column_type_row_index(sheet)
foreign_columns = xlsx_utils.get_foreign_key_columns(sheet)
foreign_column_paths = {col.index: col.path for col in foreign_columns.values()}
data_columns = xlsx_utils.get_data_columns(sheet)
@@ -339,7 +429,6 @@ class XLSXConverter:
# entries: dict[str, list[SimpleNamespace]] = {}
exceptions = []
warns = []
col_names = {}
for row_idx, row in enumerate(sheet.iter_rows(values_only=True)):
# Skip non-data rows
@@ -375,13 +464,6 @@ class XLSXConverter:
value = self._validate_and_convert(value, path)
_set_in_nested(mydict=data, path=path, value=value, prefix=parent, skip=1)
continue
elif sheet.cell(col_type_row+1, col_idx+1).value is None:
mess = (f"\nNo metadata configured for column "
f"'{_column_id_to_chars(col_idx)}' in worksheet "
f"'{sheet.title}'.\n")
if mess not in warns:
print(mess, file=sys.stderr)
warns.append(mess) # Prevent multiple instances of same warning
except (ValueError, KeyError, jsonschema.ValidationError) as e:
# Append error for entire column only once
if isinstance(e, KeyError) and 'column' in str(e):
@@ -460,10 +542,7 @@ class XLSXConverter:
"""
if value is None:
return value
try:
subschema = self._get_subschema(path)
except KeyError as e:
raise KeyError("There is no entry in the schema that corresponds to this column.") from e
subschema = self._get_subschema(path)
# Array handling only if schema says it's an array.
if subschema.get("type") == "array":
array_type = subschema["items"]["type"]
Loading