diff --git a/CHANGELOG.md b/CHANGELOG.md
index 6a118d98343a2fbee4324c4a8afd0807c9582eed..6628f1a7cd50c8709c7f8ba9b08a5560113de7f1 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -33,6 +33,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 ### Fixed ###
 
 - Yaml data model parser adds data types of properties of record types and other attributes which fixes https://gitlab.indiscale.com/caosdb/customers/f-fit/management/-/issues/58
+- `XLSXConverter` now checks path validity before parsing the worksheets.
 
 ### Security ###
 
diff --git a/src/caosadvancedtools/table_json_conversion/convert.py b/src/caosadvancedtools/table_json_conversion/convert.py
index b416fc29764fae73176f8c0177523a37dd353618..370dc85d953f44c302e2d3e04bbb0384a2b8b222 100644
--- a/src/caosadvancedtools/table_json_conversion/convert.py
+++ b/src/caosadvancedtools/table_json_conversion/convert.py
@@ -182,16 +182,107 @@ class XLSXConverter:
         self._workbook = load_workbook(xlsx)
         self._schema = read_or_dict(schema)
         self._defining_path_index = xlsx_utils.get_defining_paths(self._workbook)
-        try:
-            self._check_columns(fail_fast=strict)
-        except KeyError as e:
-            raise jsonschema.ValidationError(f"Malformed metadata: Cannot parse paths. "
-                                             f"Unknown path: '{e.args[1]}' in sheet '{e.args[0]}'."
-                                             ) from e
+        self._check_path_validity()
+        self._check_columns(fail_fast=strict)
         self._handled_sheets: set[str] = set()
         self._result: dict = {}
         self._errors: dict = {}
 
+    def _check_path_validity(self):
+        """
+        Check the workbook paths for completeness and correctness, and raise a
+        jsonschema.ValidationError containing information on all faulty paths
+        if any are found.
+
+        If this method does not raise an error, that does not mean the workbook
+        is formatted correctly, only that the contained paths are complete and
+        can be found in the schema.
+        """
+        # Setup
+        error_message = ["There were errors during path validation:"]
+        only_warnings = True
+        for sheetname in self._workbook.sheetnames:
+            sheet = self._workbook[sheetname]
+            error_message.append(f"\nIn sheet {sheetname}:")
+
+            # Collect path information and skip the COL_TYPE marker column
+            row_i_col_type = xlsx_utils.get_column_type_row_index(sheet)
+            path_rows = xlsx_utils.get_path_rows(sheet)
+            paths = []
+            for col_i, col in enumerate(sheet.iter_cols()):
+                col_type = col[row_i_col_type].value
+                path = [col[row_i].value for row_i in path_rows
+                        if col[row_i].value not in [None, '']]
+                if col_type == 'COL_TYPE':
+                    continue
+                paths.append((col_type, path, col_i, col))
+
+            # Check paths
+            for col_type, path, col_i, col in paths:
+                # No column type set
+                if col_type in [None, '']:
+                    if len(path) == 0:  # Likely a comment column
+                        # Check whether the column has any visible content
+                        content_in_column = False
+                        for cell in col:
+                            visible_content = str(cell.value).strip()
+                            if cell.value is not None and visible_content != '':
+                                content_in_column = True
+                        # If yes - might be an error but is not forbidden, so warn
+                        if content_in_column:
+                            m = (f"Warning:\tIn column {_column_id_to_chars(col_i)} "
+                                 f"there is no column metadata set. This column "
+                                 f"will be ignored during parsing.")
+                            error_message.append(m)
+                        continue
+                    else:  # Path is set but no column type
+                        only_warnings = False
+                        m = (f"ERROR:\t\tIn column {_column_id_to_chars(col_i)} "
+                             f"the column type is missing.")
+                        error_message.append(m)
+                        # No continue - even if column type is missing, we can check path
+                if len(path) == 0:  # Column type is set but no path
+                    only_warnings = False
+                    m = (f"ERROR:\t\tIn column {_column_id_to_chars(col_i)} "
+                         f"the path is missing.")
+                    error_message.append(m)
+                    continue
+                # Check path is in schema
+                try:
+                    subschema = xlsx_utils.get_subschema(path, self._schema)
+                    schema_type = subschema.get('type', None)
+                    if schema_type is None and 'enum' in subschema:
+                        schema_type = 'enum'
+                    if schema_type is None and 'anyOf' in subschema:
+                        schema_type = 'anyOf'
+                    if schema_type == 'array':  # Check item type instead
+                        schema_type = subschema.get('items', {}).get('type', None)
+                    if schema_type in ['object', 'array', None]:
+                        m = (f"Warning:\tIn column {_column_id_to_chars(col_i)} "
+                             f"the path may be incomplete.")
+                        error_message.append(m)
+                except KeyError as e:
+                    only_warnings = False
+                    m = (f"ERROR:\t\tIn column {_column_id_to_chars(col_i)} "
+                         f"parsing of the path '{'.'.join(path)}' fails "
+                         f"on the path component {str(e)}.\n\t\t\t"
+                         f"This likely means the path is incomplete or not "
+                         f"present in the schema.")
+                    error_message.append(m)
+            # Cleanup if no errors were found
+            if error_message[-1] == f"\nIn sheet {sheetname}:":
+                error_message.pop(-1)
+
+        # Determine whether error / warning / nothing should be raised
+        if error_message == ["There were errors during path validation:"]:
+            return
+        error_message = '\n'.join(error_message)
+        if only_warnings:
+            warn(error_message)
+        else:
+            raise jsonschema.ValidationError(error_message)
+
     def to_dict(self, validate: bool = False, collect_errors: bool = True) -> dict:
         """Convert the xlsx contents to a dict.
 
@@ -375,13 +466,6 @@ class XLSXConverter:
                             value = self._validate_and_convert(value, path)
                             _set_in_nested(mydict=data, path=path, value=value, prefix=parent, skip=1)
                         continue
-                    elif sheet.cell(col_type_row+1, col_idx+1).value is None:
-                        mess = (f"\nNo metadata configured for column "
-                                f"'{_column_id_to_chars(col_idx)}' in worksheet "
-                                f"'{sheet.title}'.\n")
-                        if mess not in warns:
-                            print(mess, file=sys.stderr)
-                            warns.append(mess)  # Prevent multiple instances of same warning
                 except (ValueError, KeyError, jsonschema.ValidationError) as e:
                     # Append error for entire column only once
                     if isinstance(e, KeyError) and 'column' in str(e):
@@ -460,10 +544,7 @@ class XLSXConverter:
         """
         if value is None:
             return value
-        try:
-            subschema = self._get_subschema(path)
-        except KeyError as e:
-            raise KeyError("There is no entry in the schema that corresponds to this column.") from e
+        subschema = self._get_subschema(path)
         # Array handling only if schema says it's an array.
         if subschema.get("type") == "array":
             array_type = subschema["items"]["type"]
diff --git a/unittests/table_json_conversion/data/simple_data_broken.xlsx b/unittests/table_json_conversion/data/simple_data_broken.xlsx
index c75da9faaefcb7610d84e16dd6ff17dcd055b008..1e61cf108365d8e825c97742924c8ffc7b9643c1 100644
Binary files a/unittests/table_json_conversion/data/simple_data_broken.xlsx and b/unittests/table_json_conversion/data/simple_data_broken.xlsx differ
diff --git a/unittests/table_json_conversion/data/simple_data_broken_paths_2.xlsx b/unittests/table_json_conversion/data/simple_data_broken_paths_2.xlsx
new file mode 100644
index 0000000000000000000000000000000000000000..14bdbaaa5bcdf3e5541e8cfbddc68505e35f11f2
Binary files /dev/null and b/unittests/table_json_conversion/data/simple_data_broken_paths_2.xlsx differ
diff --git a/unittests/table_json_conversion/test_read_xlsx.py b/unittests/table_json_conversion/test_read_xlsx.py
index a34c046f95887f0de6a4a40b7102b1ad85a0de3e..2a81cdc828300401684f14aef535500b6fe2c99a 100644
--- a/unittests/table_json_conversion/test_read_xlsx.py
+++ b/unittests/table_json_conversion/test_read_xlsx.py
@@ -105,7 +105,8 @@ def test_missing_columns():
     with pytest.warns(UserWarning) as caught:
         convert.to_dict(xlsx=rfp("data/simple_data_missing.xlsx"),
                         schema=rfp("data/simple_schema.json"))
-    assert str(caught.pop().message) == "Missing column: Training.coach.given_name"
+    messages = {str(w.message) for w in caught}
+    assert "Missing column: Training.coach.given_name" in messages
     with pytest.warns(UserWarning) as caught:
         convert.to_dict(xlsx=rfp("data/multiple_choice_data_missing.xlsx"),
                         schema=rfp("data/multiple_choice_schema.json"))
@@ -122,12 +123,10 @@ def test_error_table():
         convert.to_dict(xlsx=rfp("data/simple_data_broken.xlsx"),
                         schema=rfp("data/simple_schema.json"))
     # Correct Errors
-    assert "Malformed metadata: Cannot parse paths in worksheet 'Person'." in str(caught.value)
     assert "'Not a num' is not of type 'number'" in str(caught.value)
     assert "'Yes a number?' is not of type 'number'" in str(caught.value)
     assert "1.5 is not of type 'integer'" in str(caught.value)
     assert "1.2345 is not of type 'integer'" in str(caught.value)
-    assert "'There is no entry in the schema" in str(caught.value)
     assert "'Not an enum' is not one of [" in str(caught.value)
     # Correct Locations
     matches = set()
@@ -144,9 +143,6 @@ def test_error_table():
         if "1.2345 is not of type 'integer'" in line:
             assert "K8" in line
             matches.add("K8")
-        if "'There is no entry in the schema" in line:
-            assert "Column M" in line
-            matches.add("Col M")
         if "'Not an enum' is not one of [" in line:
             assert "G8" in line
             matches.add("K8")
@@ -157,33 +153,62 @@ def test_error_table():
         if "'=NOT(TRUE())' is not of type 'boolean'" in line:
             assert "L10" in line
             matches.add("L10")
-    assert matches == {"J7", "J8", "K7", "K8", "Col M", "K8", "L9", "L10"}
+    assert matches == {"J7", "J8", "K7", "K8", "K8", "L9", "L10"}
     # No additional errors
-    assert str(caught.value).count("Malformed metadata: Cannot parse paths in worksheet") == 1
-    assert str(caught.value).count("There is no entry in the schema") == 1
     assert str(caught.value).count("is not one of") == 1
     assert str(caught.value).count("is not of type") == 6
-    # Check correct error message for completely unknown path
-    with pytest.raises(jsonschema.ValidationError) as caught:
-        convert.to_dict(xlsx=rfp("data/simple_data_broken_paths.xlsx"),
-                        schema=rfp("data/simple_schema.json"))
-    assert ("Malformed metadata: Cannot parse paths. "
-            "Unknown path: 'There' in sheet 'Person'."
-            == str(caught.value))
 
 
-def test_additional_column():
+def test_malformed_paths():
     with pytest.raises(jsonschema.ValidationError) as caught:
-        convert.to_dict(xlsx=rfp("data/simple_data_broken.xlsx"),
+        convert.to_dict(xlsx=rfp("data/simple_data_broken_paths_2.xlsx"),
                         schema=rfp("data/simple_schema.json"))
-    # Correct Error
-    assert "no entry in the schema that corresponds to this column" in str(caught.value)
-    # Correct Location
-    for line in str(caught.value).split('\n'):
-        if "no entry in the schema that corresponds to this column" in line:
-            assert " M " in line
-    # No additional column errors
-    assert str(caught.value).count("no entry in the schema that corresponds to this column") == 1
+    message_lines = str(caught.value).lower().split('\n')
+    expected_errors = {
+        'person': {'c': "column type is missing",
+                   'd': "parsing of the path",
+                   'e': "path may be incomplete"},
+        'training': {'c': "path is missing",
+                     'd': "column type is missing",
+                     'e': "path may be incomplete",
+                     'f': "parsing of the path",
+                     'g': "path may be incomplete",
+                     'h': "parsing of the path",
+                     'i': "parsing of the path"},
+        'training.coach': {'f': "no column metadata set"}}
+
+    current_sheet = None
+    for line in message_lines:
+        if 'in sheet' in line:
+            current_sheet = line.replace('in sheet ', '').replace(':', '')
+            continue
+        if 'in column' in line:
+            for column, expected_error in expected_errors[current_sheet].items():
+                if f'in column {column}' in line:
+                    assert expected_error in line
+                    expected_errors[current_sheet].pop(column)
+                    break
+    for _, errors_left in expected_errors.items():
+        assert len(errors_left) == 0
+
+
+def test_empty_columns():
+    with pytest.warns(UserWarning) as caught:
+        try:
+            convert.to_dict(xlsx=rfp("data/simple_data_broken.xlsx"),
+                            schema=rfp("data/simple_schema.json"))
+        except jsonschema.ValidationError:
+            pass  # Errors are checked in test_error_table
+    messages = {str(w.message).lower() for w in caught}
+    expected_warnings = {"column h": "no column metadata"}
+    for message in messages:
+        for column, warning in list(expected_warnings.items()):
+            if column in message:
+                assert warning in message
+                expected_warnings.pop(column)
+            else:
+                assert warning not in message
+    assert len(expected_warnings) == 0
 
 
 def test_faulty_foreign():