From c14af58f874c9d37e6a633f681a31c7af9148cba Mon Sep 17 00:00:00 2001 From: Daniel <d.hornung@indiscale.com> Date: Tue, 5 Mar 2024 16:08:15 +0100 Subject: [PATCH] STY: Documentation andcode styled. --- .../table_json_conversion/table_generator.py | 254 ++++++++++-------- .../test_table_template_generator.py | 5 +- 2 files changed, 143 insertions(+), 116 deletions(-) diff --git a/src/caosadvancedtools/table_json_conversion/table_generator.py b/src/caosadvancedtools/table_json_conversion/table_generator.py index fb2a5149..9424c0e3 100644 --- a/src/caosadvancedtools/table_json_conversion/table_generator.py +++ b/src/caosadvancedtools/table_json_conversion/table_generator.py @@ -54,69 +54,83 @@ class TableTemplateGenerator(ABC): @abstractmethod def generate(self, schema: dict, foreign_keys: dict, filepath: str): - """ generates a sheet definition from a given JSON schema + """Generate a sheet definition from a given JSON schema. Parameters: ----------- - schema: dict, given JSON schema - foreign_keys: dict, a configuration that defines which attributes shall be used to create - additional columns when a list of references exists. Keys of the dict are - elements of the path within the JSON where the key is needed. Suppose we want - to distinguis Persons that are referenced by Trainings, foreign_keys must at - least contain the following: {"Training": {"Person": [key1, key2]}}. - Values wihtin the dicts can be a list representing the keys or - a dict that allows to set further foreign keys at lower depth. - In latter case if foreign keys exist at that level ( - e.g. in the above example there might be further levels below "Person".), - then the keys can be set using the special "__this__" key. - Example: {"Training": {"__this__": ["date"], "Person": [key1, key2]}} - Here, "date" is the sole key for Training. + schema: dict + Given JSON schema. + + foreign_keys: dict + A tree-like configuration (nested dict) that defines which attributes shall be used to + create additional columns when a list of references exists. The nested dict is + structured like the data model, its innermost elements are leaves of the path trees + within the JSON, they define the required keys. + + | Suppose we want to distinguish Persons that are referenced by Trainings, then + ``foreign_keys`` must at least contain the following: + | ``{"Training": {"Person": ["name", "email"]}}``. + + Values within the dicts can be either a list representing the keys (as in the example + above) or a dict that allows to set additional foreign keys at higher depths. In the + latter case (dict instead of list) if foreign keys exist at that level (e.g. in the + above example there might be further levels below "Person"), then the foreign keys can + be set using the special ``__this__`` key. + + Example: ``{"Training": {"__this__": ["date"], "Person": ["name", "email"]}}`` + Here, ``date`` is the sole foreign key for Training. """ def _generate_sheets_from_schema(self, schema: dict, foreign_keys: Optional[dict] = None ) -> Dict[str, Dict[str, Tuple[ColumnType, Optional[str], list]]]: - """ generates a sheet definition from a given JSON schema + """Generate a sheet definition from a given JSON schema. Parameters ---------- schema: dict given JSON schema - foreign_keys: dict + foreign_keys: dict, optional a configuration that defines which attributes shall be used to create - additional columns when a list of references exists. See foreign_keys + additional columns when a list of references exists. See ``foreign_keys`` argument of TableTemplateGenerator.generate. Returns ------- - dict - The structure of the dict, lets call it ``sheets`` is as follows: - ``sheets[sheetname][colname]= (COL_TYPE, description, [path])`` - I.e. the top level dict contains sheet names as keys and dicts as values. - The inner dict has column names as keys and tuples as values. The tuples have the - column type as the first element, the description of the corresponding property as - second and the path as a list as last. + sheets: dict + A two-level dict which describes columns of template sheets. + + | The structure of this two-level dict is as follows: + | ``sheets[sheetname][colname]= (<col_type>, <description>, [<path>, ...])`` + + I.e. the outer dict contains sheet names as keys, the inner dict has column names as + keys and tuples as values. These tuples consist of: + - the column type + - the description of the corresponding property + - a list representing the path. """ if not ("type" in schema or "anyOf" in schema): - raise ValueError("Inappropriate JSON schema: The following part should contain the " - f"'type' key:\n{schema}\n") + raise ValueError("Inappropriate JSON schema: The following object must contain the " + f"'type' or 'anyOf' key:\n{schema}\n") + if "properties" not in schema: + raise ValueError("Inappropriate JSON schema: The following object must contain " + f"the 'properties' key:\n{schema}\n") + if "type" in schema: + assert schema["type"] == "object" if foreign_keys is None: foreign_keys = {} # here, we treat the top level # sheets[sheetname][colname]= (COL_TYPE, description, [path]) sheets: Dict[str, Dict[str, Tuple[ColumnType, Optional[str], list]]] = {} - if "properties" not in schema: - raise ValueError("Inappropriate JSON schema: The following part should contain " - f"the 'properties' key:\n{schema}\n") - for rt_name in schema["properties"].keys(): - sheets[rt_name] = self._treat_schema_element(schema["properties"][rt_name], - sheets, [rt_name], foreign_keys) + for rt_name, rt_def in schema["properties"].items(): + sheets[rt_name] = self._treat_schema_element(schema=rt_def, sheets=sheets, + path=[rt_name], foreign_keys=foreign_keys) return sheets def _get_foreign_keys(self, keys: dict, path: list) -> list: - """ returns the foreign keys that are needed at the location to which path points """ - msg = f"A foreign key definition is missing for path:\n{path}\n{keys}" + """Return the foreign keys that are needed at the location to which path points.""" + msg = f"A foreign key definition is missing for path:\n{path}\nKeys are:\n{keys}" while path: if keys is None or path[0] not in keys: raise ValueError(msg) @@ -128,30 +142,35 @@ class TableTemplateGenerator(ABC): return keys raise ValueError(msg) - def _treat_schema_element(self, schema: dict, sheets: dict, path: list, + def _treat_schema_element(self, schema: dict, sheets: dict, path: list[str], foreign_keys: Optional[dict] = None, level_in_sheet_name: int = 1, array_paths: Optional[list] = None ) -> Dict[str, Tuple[ColumnType, Optional[str], list]]: - """ recursively transforms elements from the schema into column definitions + """Recursively transform elements from the schema into column definitions. + + ``sheets`` is modified in place. - sheets is modified in place. Parameters ---------- - array_paths: list - a list of path along the way to the current object, where the json contains arrays - schema: dict - part of the json schema; it must be the level that contains the type definition - (e.g. 'type' or 'oneOf' key) + schema: dict + Part of the json schema; it must be the level that contains the type definition + (e.g. 'type' or 'oneOf' key) + sheets: dict + All the sheets, indexed by their name. This is typically modified in place by this + method. + path: list[str] + The relevant (sub) path for this schema part? + array_paths: list + A list of path along the way to the current object, where the json contains arrays. Returns ------- - - dict - describing the columns; see doc string of _generate_sheets_from_schema + columns: dict + Describing the columns; see doc string of `_generate_sheets_from_schema`_ """ if not ("type" in schema or "enum" in schema or "oneOf" in schema or "anyOf" in schema): - raise ValueError("Inappropriate JSON schema: The following part should contain the " - f"'type' key:\n{schema}\n") + raise ValueError("Inappropriate JSON schema: The following schema part must contain " + f"'type', 'enum', 'oneOf' or 'anyOf':\n{schema}\n") if array_paths is None: # if this is not set, we are at top level and the top level element may always be an @@ -163,88 +182,94 @@ class TableTemplateGenerator(ABC): ctype = ColumnType.SCALAR # if it is an array, value defs are in 'items' - if 'type' in schema and schema['type'] == 'array': - if ('type' in schema['items'] and schema['items']['type'] == 'object' + if schema.get('type') == 'array': + if (schema['items'].get('type') == 'object' and len(path) > 1): # list of references; special treatment # we add a new sheet with columns generated from the subtree of the schema sheetname = ".".join(path) if sheetname in sheets: - raise ValueError(f"The shema would lead to two sheets with the same name which" - f" is forbidden:{sheetname}") + raise ValueError("The schema would lead to two sheets with the same name, " + f"which is forbidden: {sheetname}") sheets[sheetname] = self._treat_schema_element( - schema['items'], sheets, path, foreign_keys, len(path), + schema=schema['items'], sheets=sheets, path=path, foreign_keys=foreign_keys, + level_in_sheet_name=len(path), array_paths=array_paths+[path] # since this level is an array extend the list ) # and add the foreign keys that are necessary up to this point - for pa in array_paths: - keys = self._get_foreign_keys(foreign_keys, pa) - for ke in keys: - if ke in sheets[sheetname]: - raise ValueError(f"The shema would lead to two columns with the same " - f"name which is forbidden:{ke}") - sheets[sheetname][ke] = (ColumnType.FOREIGN, f"see sheet '{path[0]}'", - pa+[ke]) - # columns are added to the new sheet, thus we do not return columns + for array_path in array_paths: + keys = self._get_foreign_keys(foreign_keys, array_path) + for key in keys: + if key in sheets[sheetname]: + raise ValueError("The schema would lead to two columns with the same " + f"name which is forbidden: {key}") + sheets[sheetname][key] = (ColumnType.FOREIGN, f"see sheet '{path[0]}'", + array_path + [key]) + # Columns are added to the new sheet, thus we do not return any columns for the + # current sheet. return {} - # it is a list of primitive types -> semi colon separated list + # it is a list of primitive types -> semicolon separated list schema = schema['items'] ctype = ColumnType.LIST - if 'oneOf' in schema: - for el in schema['oneOf']: - if 'type' in el: - schema = el + # This should only be the case for "new or existing reference". + for el in schema.get('oneOf', []): + if 'type' in el: + schema = el + break - if "properties" in schema: # recurse for each property + if "properties" in schema: # recurse for each property, then return cols = {} for pname in schema["properties"]: col_defs = self._treat_schema_element( schema["properties"][pname], sheets, path+[pname], foreign_keys, level_in_sheet_name, array_paths=array_paths) - for k in col_defs.keys(): + for k in col_defs: if k in cols: - raise ValueError(f"The shema would lead to two columns with the same " - f"name which is forbidden:{k}") + raise ValueError(f"The schema would lead to two columns with the same " + f"name which is forbidden: {k}") cols.update(col_defs) return cols - else: # those are the leaves - description = schema['description'] if 'description' in schema else None - # definition of a single column - default_return = {".".join(path[level_in_sheet_name:]): (ctype, description, path)} - if 'type' not in schema and 'enum' in schema: - return default_return - if 'type' not in schema and 'anyOf' in schema: - for d in schema['anyOf']: - # currently the only case where this occurs is date formats - assert d['type'] == 'string' - assert d['format'] == 'date' or d['format'] == 'date-time' - return default_return - if schema["type"] in ['string', 'number', 'integer', 'boolean']: - if 'format' in schema and schema['format'] == 'data-url': - return {} # file; ignore for now - return default_return - raise ValueError("Inappropriate JSON schema: The following part should define an" - f" object with properties or a primitive type:\n{schema}\n") - raise RuntimeError("This should not be reached. Implementation error.") + + # The schema is a leaf. + description = schema['description'] if 'description' in schema else None + # definition of a single column + default_return = {".".join(path[level_in_sheet_name:]): (ctype, description, path)} + if 'type' not in schema and 'enum' in schema: + return default_return + if 'type' not in schema and 'anyOf' in schema: + for d in schema['anyOf']: + # currently the only case where this occurs is date formats + assert d['type'] == 'string' + assert d['format'] == 'date' or d['format'] == 'date-time' + return default_return + if schema["type"] in ['string', 'number', 'integer', 'boolean']: + if 'format' in schema and schema['format'] == 'data-url': + return {} # file; ignore for now + return default_return + raise ValueError("Inappropriate JSON schema: The following part should define an" + f" object with properties or a primitive type:\n{schema}\n") class XLSXTemplateGenerator(TableTemplateGenerator): - """ class for generating XLSX tables from json schema """ + """Class for generating XLSX tables from json schema definitions.""" def __init__(self): pass - def generate(self, schema: dict, foreign_keys: dict, filepath: str): - """ generates a sheet definition from a given JSON schema + def generate(self, schema: dict, foreign_keys: dict, filepath: str) -> None: + """Generate a sheet definition from a given JSON schema. Parameters: ----------- - schema: dict, given JSON schema - foreign_keys: dict, a configuration that defines which attributes shall be used to create - additional columns when a list of references exists. See foreign_keys - argument of TableTemplateGenerator.generate. - filepath: str, the XLSX file will be stored under this path. + schema: dict + Given JSON schema + foreign_keys: dict + A configuration that defines which attributes shall be used to create + additional columns when a list of references exists. See ``foreign_keys`` + argument of :ref:`TableTemplateGenerator.generate` . + filepath: str + The XLSX file will be stored under this path. """ sheets = self._generate_sheets_from_schema(schema, foreign_keys) wb = self._create_workbook_from_sheets_def(sheets) @@ -279,22 +304,23 @@ class XLSXTemplateGenerator(TableTemplateGenerator): def _create_workbook_from_sheets_def( self, sheets: Dict[str, Dict[str, Tuple[ColumnType, Optional[str], list]]]): + """Create and return a nice workbook for the given sheets.""" wb = Workbook() assert wb.sheetnames == ["Sheet"] - for sn, sheetdef in sheets.items(): - ws = wb.create_sheet(re.sub(INVALID_TITLE_REGEX, '_', sn)) - # first row will by the COL_TYPE row - # first column will be the indicator row with values COL_TYPE, PATH, IGNORE - # the COL_TYPE row will be followed by as many PATH rows as needed + for sheetname, sheetdef in sheets.items(): + ws = wb.create_sheet(re.sub(INVALID_TITLE_REGEX, '_', sheetname)) + # First row will by the COL_TYPE row. + # First column will be the indicator row with values COL_TYPE, PATH, IGNORE. + # The COL_TYPE row will be followed by as many PATH rows as needed. max_path_length = self._get_max_path_length(sheetdef) - header_index = 2+max_path_length - description_index = 3+max_path_length + header_index = 2 + max_path_length + description_index = 3 + max_path_length # create first column ws.cell(1, 1, RowType.COL_TYPE.name) for index in range(max_path_length): - ws.cell(2+index, 1, RowType.PATH.name) + ws.cell(2 + index, 1, RowType.PATH.name) ws.cell(header_index, 1, RowType.IGNORE.name) ws.cell(description_index, 1, RowType.IGNORE.name) @@ -302,12 +328,12 @@ class XLSXTemplateGenerator(TableTemplateGenerator): # create other columns for index, (colname, ct, desc, path) in enumerate(ordered_cols): - ws.cell(1, 2+index, ct.name) - for pi, el in enumerate(path): - ws.cell(2+pi, 2+index, el) - ws.cell(header_index, 2+index, colname) + ws.cell(1, 2 + index, ct.name) + for path_index, el in enumerate(path): + ws.cell(2 + path_index, 2 + index, el) + ws.cell(header_index, 2 + index, colname) if desc: - ws.cell(description_index, 2+index, desc) + ws.cell(description_index, 2 + index, desc) # hide special rows for index, row in enumerate(ws.rows): @@ -321,10 +347,10 @@ class XLSXTemplateGenerator(TableTemplateGenerator): del wb['Sheet'] # order sheets - # for index, sn in enumerate(sorted(wb.sheetnames)): - # wb.move_sheet(sn, index-wb.index(wb[sn])) + # for index, sheetname in enumerate(sorted(wb.sheetnames)): + # wb.move_sheet(sheetname, index-wb.index(wb[sheetname])) # reverse sheets - for index, sn in enumerate(wb.sheetnames[::-1]): - wb.move_sheet(sn, index-wb.index(wb[sn])) + for index, sheetname in enumerate(wb.sheetnames[::-1]): + wb.move_sheet(sheetname, index-wb.index(wb[sheetname])) return wb diff --git a/unittests/table_json_conversion/test_table_template_generator.py b/unittests/table_json_conversion/test_table_template_generator.py index 30e4f245..af3b3a2d 100644 --- a/unittests/table_json_conversion/test_table_template_generator.py +++ b/unittests/table_json_conversion/test_table_template_generator.py @@ -146,13 +146,14 @@ def test_get_foreign_keys(): assert ['a'] == generator._get_foreign_keys(fkd, ['Training']) fkd = {"Training": {'hallo'}} - with pytest.raises(ValueError, match=r"A foreign key definition is missing for path:\n\['Training'\]\n{'Training': \{'hallo'\}\}"): + with pytest.raises(ValueError, match=r"A foreign key definition is missing for path:\n\[" + r"'Training'\]\nKeys are:\n{'Training': \{'hallo'\}\}"): generator._get_foreign_keys(fkd, ['Training']) fkd = {"Training": {"__this__": ['a'], 'b': ['c']}} assert ['c'] == generator._get_foreign_keys(fkd, ['Training', 'b']) - with pytest.raises(ValueError, match=r"A foreign key definition is missing for.*"): + with pytest.raises(ValueError, match=r"A foreign key definition is missing for .*"): generator._get_foreign_keys({}, ['Training']) -- GitLab