diff --git a/src/caosadvancedtools/table_json_conversion/fill_xlsx.py b/src/caosadvancedtools/table_json_conversion/fill_xlsx.py index 60b5c96c7de141b1ecb12254e6928252fe4a9f5c..585bc6bf283083b343c85cdbe3d0d9dbdeb9decc 100644 --- a/src/caosadvancedtools/table_json_conversion/fill_xlsx.py +++ b/src/caosadvancedtools/table_json_conversion/fill_xlsx.py @@ -206,6 +206,7 @@ class TemplateFiller: Index the sheets by all path arrays leading to them. Also create a simple column index by column type and path. + This method creates and populates the dict ``self._sheet_index``. """ self._sheet_index = {} for sheetname in self._workbook.sheetnames: @@ -227,7 +228,8 @@ class TemplateFiller: path.append(col[path_idx].value) # col_key = p2s([col[coltype_idx].value] + path) # col_index[col_key] = SimpleNamespace(column=col, col_index=col_idx) - if col[coltype_idx].value not in [ColumnType.SCALAR.name, ColumnType.LIST.name]: + if col[coltype_idx].value not in [ColumnType.SCALAR.name, ColumnType.LIST.name, + ColumnType.MULTIPLE_CHOICE.name]: continue path_str = p2s(path) @@ -308,16 +310,20 @@ out: union[dict, None] # collecting the data assert isinstance(content, list) - if len(content) > 1: - content = [ILLEGAL_CHARACTERS_RE.sub("", str(x)) for x in content] - value = ";".join(content) # TODO we need escaping of values - else: - value = content[0] - if isinstance(value, str): - value = ILLEGAL_CHARACTERS_RE.sub("", value) - path_str = p2s(path) - assert path_str not in insertables - insertables[path_str] = value + to_insert = self._try_multiple_choice(path, values=content) + if not to_insert: + if len(content) > 1: + content = [ILLEGAL_CHARACTERS_RE.sub("", str(x)) for x in content] + value = ";".join(content) # TODO we need escaping of values + else: + value = content[0] + if isinstance(value, str): + value = ILLEGAL_CHARACTERS_RE.sub("", value) + + path_str = p2s(path) + assert path_str not in insertables + to_insert = {path_str: value} + insertables.update(to_insert) if 
only_collect_insertables: return insertables if not current_path: # Top level returns, because there are only sheets for the children. @@ -353,6 +359,40 @@ out: union[dict, None] return None + def _try_multiple_choice(self, path: list[str], values: list[str]) -> Optional[dict[str, str]]: + """Try to create sheet content for a multiple choice property. + +Parameters +---------- +path: list[str] + The Path to this property. +values: list[str] + A list of possible choices, should be unique. + +Returns +------- +to_insert: Optional[dict[str, str]] + A path-value dict. None if this doesn't seem to be a multiple choice data set. + """ + try: + assert len(set(values)) == len(values) + to_insert = {} + found_sheet = None + for value in values: + assert isinstance(value, str) + path_str = p2s(path + [value]) + assert path_str in self._sheet_index + sheet_meta = self._sheet_index[path_str] + # All matches shall be on the same sheet + assert found_sheet is None or found_sheet == sheet_meta.sheetname + found_sheet = sheet_meta.sheetname + # Correct type + assert sheet_meta.col_type == ColumnType.MULTIPLE_CHOICE.name + to_insert[path_str] = "x" + except AssertionError: + return None + return to_insert + def fill_template(data: Union[dict, str, TextIO], template: str, result: str, validation_schema: Union[dict, str, TextIO] = None) -> None: diff --git a/src/caosadvancedtools/table_json_conversion/table_generator.py b/src/caosadvancedtools/table_json_conversion/table_generator.py index 8ca026a8758361b658e98cabbcff42b849bb07fe..857100ef2d1a36eccedcf118d2c59343c4c674c2 100644 --- a/src/caosadvancedtools/table_json_conversion/table_generator.py +++ b/src/caosadvancedtools/table_json_conversion/table_generator.py @@ -42,7 +42,8 @@ class ColumnType(Enum): SCALAR = 1 LIST = 2 FOREIGN = 3 - IGNORE = 4 + MULTIPLE_CHOICE = 4 + IGNORE = 5 class RowType(Enum): @@ -189,15 +190,16 @@ class TableTemplateGenerator(ABC): # if it is an array, value defs are in 'items' if schema.get('type') == 
'array': - if (schema['items'].get('type') == 'object' - and len(path) > 1): # list of references; special treatment + items = schema['items'] + # list of references; special treatment + if (items.get('type') == 'object' and len(path) > 1): # we add a new sheet with columns generated from the subtree of the schema sheetname = p2s(path) if sheetname in sheets: raise ValueError("The schema would lead to two sheets with the same name, " f"which is forbidden: {sheetname}") col_def = self._treat_schema_element( - schema=schema['items'], sheets=sheets, path=path, foreign_keys=foreign_keys, + schema=items, sheets=sheets, path=path, foreign_keys=foreign_keys, level_in_sheet_name=len(path), array_paths=array_paths+[path] # since this level is an array extend the list ) @@ -223,8 +225,23 @@ class TableTemplateGenerator(ABC): # current sheet. return {} + # List of enums: represent as checkbox columns + if (schema.get("uniqueItems") is True and "enum" in items and len(items) == 1): + choices = items["enum"] + assert len(path) >= 1 + prop_name = path[-1] + result = {} + for choice in choices: + name = f"{prop_name}.{choice}" + result[name] = ( + ColumnType.MULTIPLE_CHOICE, + schema.get('description'), + path + [str(choice)], + ) + return result + # it is a list of primitive types -> semicolon separated list - schema = schema['items'] + schema = items ctype = ColumnType.LIST # This should only be the case for "new or existing reference". @@ -247,9 +264,8 @@ class TableTemplateGenerator(ABC): return cols # The schema is a leaf. 
- description = schema['description'] if 'description' in schema else None # definition of a single column - default_return = {p2s(path[level_in_sheet_name:]): (ctype, description, path)} + default_return = {p2s(path[level_in_sheet_name:]): (ctype, schema.get('description'), path)} if 'type' not in schema and 'enum' in schema: return default_return if 'type' not in schema and 'anyOf' in schema: @@ -350,12 +366,12 @@ class XLSXTemplateGenerator(TableTemplateGenerator): ordered_cols = self._get_ordered_cols(sheetdef) # create other columns - for index, (colname, ct, desc, path) in enumerate(ordered_cols): - ws.cell(1, 2 + index, ct.name) + for index, (colname, coltype, desc, path) in enumerate(ordered_cols): + ws.cell(1, 2 + index, coltype.name) for path_index, el in enumerate(path): ws.cell(2 + path_index, 2 + index, el) ws.cell(header_index, 2 + index, colname) - if ct == ColumnType.FOREIGN: + if coltype == ColumnType.FOREIGN: # Visual highlighting ws.cell(header_index, 2 + index).fill = yellowfill if desc: diff --git a/src/doc/table-json-conversion/specs.md b/src/doc/table-json-conversion/specs.md index 73e480440eccb923fd0979dc2adb653146667951..b4fe5b7152e7b9b9c9273d4e829bd189dd2e4f50 100644 --- a/src/doc/table-json-conversion/specs.md +++ b/src/doc/table-json-conversion/specs.md @@ -46,13 +46,15 @@ XLSX file. These properties have an attribute name and a value. The value can be a. A primitive (text, number, boolean, ...) b. A record c. A list of primitive types -d. A list of records +d. A list of unique enums (multiple choice) +e. A list of records In cases *a.* and *c.*, a cell is created in the column corresponding to the property in the XLSX file. In case *b.*, columns are created for the Properties of the record, where for each of the -Properties the cases *a.* - *d.* are considered recursively. +Properties the cases *a.* - *e.* are considered recursively. Case *d.* leads to a number of +columns, one for each of the possible choices. 
-For case *d.* however, the two-dimensional structure of an XLSX sheet is not sufficient. Therefore, +For case *e.* however, the two-dimensional structure of an XLSX sheet is not sufficient. Therefore, for such cases, *new* XLSX sheets/tables are created. In these sheets/tables, the referenced records are treated as described above (new columns for the @@ -61,26 +63,7 @@ these records are referenced. Let's now consider these four cases in detail and with examples: -### a. Properties with Primitive Data Types ### - -```JSON -{ - "Training": { - "date": "2023-01-01", - "url": "www.indiscale.com", - "duration": 1.0, - "participants": 1, - "remote": false - } -} -``` - -This entry is represented in an XLSX sheet with the following content: -date url duration participants remote -2023-01-01 www.indiscale.com 1.0 1 false - - -### a. Properties mit primitiven Datentypen ### +### a. Properties with primitive data types ### ```JSON { @@ -143,7 +126,31 @@ This entry would be represented in an XLSX sheet with the following content: The list elements are written into the cell separated by `;` (semicolon). If the elements contain the separator `;`, it is escaped with `\\`. -### d. Properties containing lists with references ### +### d. Multiple choice properties ### + +```JSON +{ + "Training": { + "date": "2024-04-17", + "skills": [ + "Planning", + "Evaluation" + ] + } +} +``` + +If the `skills` list is denoted as an `enum` array with `"uniqueItems": true` in the json schema, +this entry would be represented like this in an XLSX: + +| date | skills.Planning | skills.Communication | skills.Evaluation | +|------------|-----------------|----------------------|-------------------| +| 2024-04-17 | x | | x | + +Note that this example assumes that the list of possible choices, as given in the json schema, was +"Planning, Communication, Evaluation". + +### e. Properties containing lists with references ### ```JSON { @@ -195,7 +202,8 @@ special treatment. 
The following values are used: - ``IGNORE``: This row is ignored. It can be used for explanatory texts or layout. - ``COL_TYPE``: Typically the first row that is not `IGNORE`. It indicates the row that defines the - type of columns (`FOREIGN`, `SCALAR`, `LIST`, `IGNORE`). This row may occur only once. + type of columns (`FOREIGN`, `SCALAR`, `LIST`, `MULTIPLE_CHOICE`, `IGNORE`). This row must occur + exactly once per sheet. - ``PATH``: Indicates that the row is used to define the path within the JSON. These rows are typically hidden for users. @@ -244,6 +252,31 @@ For example, this table defines three coaches for the two trainings from the las | | 2024-02-27 | example.com/mp | Berta | | | 2024-02-27 | example.com/m | Chris | +#### Special case: multiple choice "checkboxes" #### + +As a special case, enum arrays with `"uniqueItems": true` can be represented as multiple columns, +with one column per choice. The choices are denoted as the last `PATH` component, the column type +must be `MULTIPLE_CHOICE`. + +Stored data is denoted as an "x" character in the respective cell, empty cells denote that the item +was not selected. Additionally, the implementation also allows `TRUE` or `1` for selected items, +and `FALSE`, `0` or cells with only whitespace characters for deselected items: + +| `COL_TYPE` | `MULTIPLE_CHOICE` | `MULTIPLE_CHOICE` | `MULTIPLE_CHOICE` | +| `PATH` | `skills` | `skills` | `skills` | +| `PATH` | `Planning` | `Communication` | `Evaluation` | +| `IGNORE` | skills.Planning | skills.Communication | skills.Evaluation | +|------------|-------------------|----------------------|-------------------| +| | x | | X | +| | `" "` | `TRUE` | `FALSE` | +| | 0 | x | 1 | + +These rows correspond to: + +1. Planning, Evaluation +2. Communication +3. 
Communication, Evaluation + ## Current limitations ## The current implementation still lacks the following: diff --git a/unittests/table_json_conversion/data/multiple_choice_data.json b/unittests/table_json_conversion/data/multiple_choice_data.json new file mode 100644 index 0000000000000000000000000000000000000000..1f14911ea79e8d78a452bb221f693d1a01cce744 --- /dev/null +++ b/unittests/table_json_conversion/data/multiple_choice_data.json @@ -0,0 +1,11 @@ +{ + "Training": { + "name": "Super Skill Training", + "date": "2024-04-17", + "skills": [ + "Planning", + "Evaluation" + ], + "exam_types": [] + } +} diff --git a/unittests/table_json_conversion/data/multiple_choice_data.xlsx b/unittests/table_json_conversion/data/multiple_choice_data.xlsx new file mode 100644 index 0000000000000000000000000000000000000000..28cf4007d8a1a061235863d12e5bdc5b5747f386 Binary files /dev/null and b/unittests/table_json_conversion/data/multiple_choice_data.xlsx differ diff --git a/unittests/table_json_conversion/data/multiple_choice_schema.json b/unittests/table_json_conversion/data/multiple_choice_schema.json new file mode 100644 index 0000000000000000000000000000000000000000..71bf0379aba4ad6f8510581ba0defadb81a66609 --- /dev/null +++ b/unittests/table_json_conversion/data/multiple_choice_schema.json @@ -0,0 +1,57 @@ +{ + "type": "object", + "properties": { + "Training": { + "type": "object", + "required": [], + "additionalProperties": false, + "title": "Training", + "properties": { + "name": { + "type": "string", + "description": "The name of the Record to be created" + }, + "date": { + "description": "The date of the training.", + "anyOf": [ + { + "type": "string", + "format": "date" + }, + { + "type": "string", + "format": "date-time" + } + ] + }, + "skills": { + "description": "Skills that are trained.", + "type": "array", + "items": { + "enum": [ + "Planning", + "Communication", + "Evaluation" + ] + }, + "uniqueItems": true + }, + "exam_types": { + "type": "array", + "items": { + 
"enum": [ + "Oral", + "Written" + ] + }, + "uniqueItems": true + } + } + } + }, + "required": [ + "Training" + ], + "additionalProperties": false, + "$schema": "https://json-schema.org/draft/2020-12/schema" +} diff --git a/unittests/table_json_conversion/data/multiple_choice_template.xlsx b/unittests/table_json_conversion/data/multiple_choice_template.xlsx new file mode 100644 index 0000000000000000000000000000000000000000..e523506201ee7301dfa6f814e0315c01b95b08ee Binary files /dev/null and b/unittests/table_json_conversion/data/multiple_choice_template.xlsx differ diff --git a/unittests/table_json_conversion/test_fill_xlsx.py b/unittests/table_json_conversion/test_fill_xlsx.py index 946336da721f7c9affd5c553ccbb38cb46217eef..1315bd9fe06196ba5df31d34182293887d5a2bb1 100644 --- a/unittests/table_json_conversion/test_fill_xlsx.py +++ b/unittests/table_json_conversion/test_fill_xlsx.py @@ -138,6 +138,10 @@ def test_fill_xlsx(): template_file=rfp("data/simple_template.xlsx"), known_good=rfp("data/simple_data_ascii_chars.xlsx"), schema=rfp("data/simple_schema.json")) + fill_and_compare(json_file=rfp("data/multiple_choice_data.json"), + template_file=rfp("data/multiple_choice_template.xlsx"), + known_good=rfp("data/multiple_choice_data.xlsx"), + schema=rfp("data/multiple_choice_schema.json")) def test_errors(): diff --git a/unittests/table_json_conversion/test_table_template_generator.py b/unittests/table_json_conversion/test_table_template_generator.py index 8fc7b216d0eb2aa54ece6ace986cbeb227cc3e45..61da2142d1965adf63fda54c1fbbde1ce1e2a060 100644 --- a/unittests/table_json_conversion/test_table_template_generator.py +++ b/unittests/table_json_conversion/test_table_template_generator.py @@ -268,6 +268,13 @@ def test_model_with_indirect_reference(): outfile=None) +def test_model_with_multiple_choice(): + _compare_generated_to_known_good( + schema_file=rfp("data/multiple_choice_schema.json"), + known_good=rfp("data/multiple_choice_template.xlsx"), + outfile=None) + + def 
test_exceptions(): # Foreign keys must be lists with pytest.raises(ValueError, match="Foreign keys must be a list of strings, but a single "