Skip to content
Snippets Groups Projects

XLSXConverter paths validation

Merged I. Nüske requested to merge f-converter-unified-path-errors into dev
5 files
+ 149
44
Compare changes
  • Side-by-side
  • Inline

Files

+ 98
19
@@ -182,16 +182,107 @@ class XLSXConverter:
self._workbook = load_workbook(xlsx)
self._schema = read_or_dict(schema)
self._defining_path_index = xlsx_utils.get_defining_paths(self._workbook)
try:
self._check_columns(fail_fast=strict)
except KeyError as e:
raise jsonschema.ValidationError(f"Malformed metadata: Cannot parse paths. "
f"Unknown path: '{e.args[1]}' in sheet '{e.args[0]}'."
) from e
self._check_path_validity()
self._check_columns(fail_fast=strict)
self._handled_sheets: set[str] = set()
self._result: dict = {}
self._errors: dict = {}
def _check_path_validity(self):
    """Check the workbook's column paths for completeness and correctness.

    Iterates over every sheet and every column, verifying that each data
    column has both a column type and a path, and that each path can be
    resolved within the schema.  All findings are collected first; then a
    single warning is emitted (if only warnings were found) or a single
    ``jsonschema.ValidationError`` is raised (if at least one hard error
    was found), containing information on all faulty paths.

    If this method does not raise an error, this does not mean the workbook
    is formatted correctly, only that the contained paths are complete and
    can be found in the schema.

    Raises:
        jsonschema.ValidationError: If at least one column has a missing
            column type, a missing path, or a path that cannot be resolved
            in the schema.
    """
    # Setup: the first entry is the report header; if nothing else is ever
    # appended, validation passed and nothing is raised or warned.
    error_message = ["There were errors during path validation:"]
    only_warnings = True
    for sheetname in self._workbook.sheetnames:
        sheet = self._workbook[sheetname]
        error_message.append(f"\nIn sheet {sheetname}:")
        # Collect path information and filter out the information column
        row_i_col_type = xlsx_utils.get_column_type_row_index(sheet)
        path_rows = xlsx_utils.get_path_rows(sheet)
        paths = []
        for col_i, col in enumerate(sheet.iter_cols()):
            col_type = col[row_i_col_type].value
            path = [col[row_i].value for row_i in path_rows
                    if col[row_i].value not in [None, '']]
            if col_type == 'COL_TYPE':
                continue
            paths.append((col_type, path, col_i, col))
        # Check paths
        for col_type, path, col_i, col in paths:
            # No column type set
            if col_type in [None, '']:
                if len(path) == 0:  # Likely a comment column
                    # Check whether the column has any visible content,
                    # i.e. at least one cell with non-whitespace text.
                    # (str(...).strip() is the actual whitespace filter;
                    # any() stops at the first hit.)
                    content_in_column = any(
                        cell.value is not None
                        and str(cell.value).strip() != ''
                        for cell in col)
                    # If yes - might be an error but is not forbidden, so warn
                    if content_in_column:
                        m = (f"Warning:\tIn column {_column_id_to_chars(col_i)} "
                             f"there is no column metadata set. This column "
                             f"will be ignored during parsing.")
                        error_message.append(m)
                    continue
                else:  # Path is set but no column type
                    only_warnings = False
                    m = (f"ERROR:\t\tIn column {_column_id_to_chars(col_i)} "
                         f"the column type is missing.")
                    error_message.append(m)
                    # No continue - even if the column type is missing, we
                    # can still check the path below.
            if len(path) == 0:  # Column type is set but no path
                only_warnings = False
                m = (f"ERROR:\t\tIn column {_column_id_to_chars(col_i)} "
                     f"the path is missing.")
                error_message.append(m)
                continue
            # Check that the path can be resolved in the schema
            try:
                subschema = xlsx_utils.get_subschema(path, self._schema)
                schema_type = subschema.get('type', None)
                if schema_type is None and 'enum' in subschema:
                    schema_type = 'enum'
                if schema_type is None and 'anyOf' in subschema:
                    schema_type = 'anyOf'
                if schema_type == 'array':  # Check item type instead
                    schema_type = subschema.get('items', {}).get('type', None)
                if schema_type in ['object', 'array', None]:
                    # A complete path should end on a scalar / enum / anyOf
                    # entry; ending on an object or array suggests the path
                    # was truncated.
                    m = (f"Warning:\tIn column {_column_id_to_chars(col_i)} "
                         f"the path may be incomplete.")
                    error_message.append(m)
            except KeyError as e:
                only_warnings = False
                m = (f"ERROR:\t\tIn column {_column_id_to_chars(col_i)} "
                     f"parsing of the path '{'.'.join(path)}' fails "
                     f"on the path component {str(e)}.\n\t\t\t"
                     f"This likely means the path is incomplete or not "
                     f"present in the schema.")
                error_message.append(m)
        # Drop the sheet header again if no findings were added for it
        if error_message[-1] == f"\nIn sheet {sheetname}:":
            error_message.pop(-1)
    # Determine whether error / warning / nothing should be raised
    if error_message == ["There were errors during path validation:"]:
        return
    error_message = '\n'.join(error_message)
    if only_warnings:
        warn(error_message)
    else:
        raise jsonschema.ValidationError(error_message)
def to_dict(self, validate: bool = False, collect_errors: bool = True) -> dict:
"""Convert the xlsx contents to a dict.
@@ -316,7 +407,6 @@ class XLSXConverter:
If True, do not fail with unresolvable foreign definitions, but collect all errors.
"""
row_type_column = xlsx_utils.get_row_type_column_index(sheet)
col_type_row = xlsx_utils.get_column_type_row_index(sheet)
foreign_columns = xlsx_utils.get_foreign_key_columns(sheet)
foreign_column_paths = {col.index: col.path for col in foreign_columns.values()}
data_columns = xlsx_utils.get_data_columns(sheet)
@@ -339,7 +429,6 @@ class XLSXConverter:
# entries: dict[str, list[SimpleNamespace]] = {}
exceptions = []
warns = []
col_names = {}
for row_idx, row in enumerate(sheet.iter_rows(values_only=True)):
# Skip non-data rows
@@ -375,13 +464,6 @@ class XLSXConverter:
value = self._validate_and_convert(value, path)
_set_in_nested(mydict=data, path=path, value=value, prefix=parent, skip=1)
continue
elif sheet.cell(col_type_row+1, col_idx+1).value is None:
mess = (f"\nNo metadata configured for column "
f"'{_column_id_to_chars(col_idx)}' in worksheet "
f"'{sheet.title}'.\n")
if mess not in warns:
print(mess, file=sys.stderr)
warns.append(mess) # Prevent multiple instances of same warning
except (ValueError, KeyError, jsonschema.ValidationError) as e:
# Append error for entire column only once
if isinstance(e, KeyError) and 'column' in str(e):
@@ -460,10 +542,7 @@ class XLSXConverter:
"""
if value is None:
return value
try:
subschema = self._get_subschema(path)
except KeyError as e:
raise KeyError("There is no entry in the schema that corresponds to this column.") from e
subschema = self._get_subschema(path)
# Array handling only if schema says it's an array.
if subschema.get("type") == "array":
array_type = subschema["items"]["type"]
Loading