Skip to content
Snippets Groups Projects
Verified Commit c14af58f authored by Daniel Hornung's avatar Daniel Hornung
Browse files

STY: Documentation andcode styled.

parent e1b5fd4a
No related branches found
No related tags found
4 merge requests!100WIP: Filling XLSX: Seems to be working.,!94ENH: add framework for converting json schema into table templates,!93Filling XLSX: Everything except multiple choice.,!92ENH: xlsx template generator
Pipeline #48136 failed
......@@ -54,69 +54,83 @@ class TableTemplateGenerator(ABC):
@abstractmethod
def generate(self, schema: dict, foreign_keys: dict, filepath: str):
""" generates a sheet definition from a given JSON schema
"""Generate a sheet definition from a given JSON schema.
Parameters:
-----------
schema: dict, given JSON schema
foreign_keys: dict, a configuration that defines which attributes shall be used to create
additional columns when a list of references exists. Keys of the dict are
elements of the path within the JSON where the key is needed. Suppose we want
to distinguis Persons that are referenced by Trainings, foreign_keys must at
least contain the following: {"Training": {"Person": [key1, key2]}}.
Values wihtin the dicts can be a list representing the keys or
a dict that allows to set further foreign keys at lower depth.
In latter case if foreign keys exist at that level (
e.g. in the above example there might be further levels below "Person".),
then the keys can be set using the special "__this__" key.
Example: {"Training": {"__this__": ["date"], "Person": [key1, key2]}}
Here, "date" is the sole key for Training.
schema: dict
Given JSON schema.
foreign_keys: dict
A tree-like configuration (nested dict) that defines which attributes shall be used to
create additional columns when a list of references exists. The nested dict is
structured like the data model, its innermost elements are leaves of the path trees
within the JSON, they define the required keys.
| Suppose we want to distinguish Persons that are referenced by Trainings, then
``foreign_keys`` must at least contain the following:
| ``{"Training": {"Person": ["name", "email"]}}``.
Values within the dicts can be either a list representing the keys (as in the example
above) or a dict that allows to set additional foreign keys at higher depths. In the
latter case (dict instead of list) if foreign keys exist at that level (e.g. in the
above example there might be further levels below "Person"), then the foreign keys can
be set using the special ``__this__`` key.
Example: ``{"Training": {"__this__": ["date"], "Person": ["name", "email"]}}``
Here, ``date`` is the sole foreign key for Training.
"""
def _generate_sheets_from_schema(self, schema: dict, foreign_keys: Optional[dict] = None
) -> Dict[str, Dict[str,
Tuple[ColumnType, Optional[str], list]]]:
""" generates a sheet definition from a given JSON schema
"""Generate a sheet definition from a given JSON schema.
Parameters
----------
schema: dict
given JSON schema
foreign_keys: dict
foreign_keys: dict, optional
a configuration that defines which attributes shall be used to create
additional columns when a list of references exists. See foreign_keys
additional columns when a list of references exists. See ``foreign_keys``
argument of TableTemplateGenerator.generate.
Returns
-------
dict
The structure of the dict, lets call it ``sheets`` is as follows:
``sheets[sheetname][colname]= (COL_TYPE, description, [path])``
I.e. the top level dict contains sheet names as keys and dicts as values.
The inner dict has column names as keys and tuples as values. The tuples have the
column type as the first element, the description of the corresponding property as
second and the path as a list as last.
sheets: dict
A two-level dict which describes columns of template sheets.
| The structure of this two-level dict is as follows:
| ``sheets[sheetname][colname]= (<col_type>, <description>, [<path>, ...])``
I.e. the outer dict contains sheet names as keys, the inner dict has column names as
keys and tuples as values. These tuples consist of:
- the column type
- the description of the corresponding property
- a list representing the path.
"""
if not ("type" in schema or "anyOf" in schema):
raise ValueError("Inappropriate JSON schema: The following part should contain the "
f"'type' key:\n{schema}\n")
raise ValueError("Inappropriate JSON schema: The following object must contain the "
f"'type' or 'anyOf' key:\n{schema}\n")
if "properties" not in schema:
raise ValueError("Inappropriate JSON schema: The following object must contain "
f"the 'properties' key:\n{schema}\n")
if "type" in schema:
assert schema["type"] == "object"
if foreign_keys is None:
foreign_keys = {}
# here, we treat the top level
# sheets[sheetname][colname]= (COL_TYPE, description, [path])
sheets: Dict[str, Dict[str, Tuple[ColumnType, Optional[str], list]]] = {}
if "properties" not in schema:
raise ValueError("Inappropriate JSON schema: The following part should contain "
f"the 'properties' key:\n{schema}\n")
for rt_name in schema["properties"].keys():
sheets[rt_name] = self._treat_schema_element(schema["properties"][rt_name],
sheets, [rt_name], foreign_keys)
for rt_name, rt_def in schema["properties"].items():
sheets[rt_name] = self._treat_schema_element(schema=rt_def, sheets=sheets,
path=[rt_name], foreign_keys=foreign_keys)
return sheets
def _get_foreign_keys(self, keys: dict, path: list) -> list:
""" returns the foreign keys that are needed at the location to which path points """
msg = f"A foreign key definition is missing for path:\n{path}\n{keys}"
"""Return the foreign keys that are needed at the location to which path points."""
msg = f"A foreign key definition is missing for path:\n{path}\nKeys are:\n{keys}"
while path:
if keys is None or path[0] not in keys:
raise ValueError(msg)
......@@ -128,30 +142,35 @@ class TableTemplateGenerator(ABC):
return keys
raise ValueError(msg)
def _treat_schema_element(self, schema: dict, sheets: dict, path: list,
def _treat_schema_element(self, schema: dict, sheets: dict, path: list[str],
foreign_keys: Optional[dict] = None, level_in_sheet_name: int = 1,
array_paths: Optional[list] = None
) -> Dict[str, Tuple[ColumnType, Optional[str], list]]:
""" recursively transforms elements from the schema into column definitions
"""Recursively transform elements from the schema into column definitions.
``sheets`` is modified in place.
sheets is modified in place.
Parameters
----------
array_paths: list
a list of path along the way to the current object, where the json contains arrays
schema: dict
part of the json schema; it must be the level that contains the type definition
(e.g. 'type' or 'oneOf' key)
schema: dict
Part of the json schema; it must be the level that contains the type definition
(e.g. 'type' or 'oneOf' key)
sheets: dict
All the sheets, indexed by their name. This is typically modified in place by this
method.
path: list[str]
The relevant (sub) path for this schema part?
array_paths: list
A list of path along the way to the current object, where the json contains arrays.
Returns
-------
dict
describing the columns; see doc string of _generate_sheets_from_schema
columns: dict
Describing the columns; see doc string of `_generate_sheets_from_schema`_
"""
if not ("type" in schema or "enum" in schema or "oneOf" in schema or "anyOf" in schema):
raise ValueError("Inappropriate JSON schema: The following part should contain the "
f"'type' key:\n{schema}\n")
raise ValueError("Inappropriate JSON schema: The following schema part must contain "
f"'type', 'enum', 'oneOf' or 'anyOf':\n{schema}\n")
if array_paths is None:
# if this is not set, we are at top level and the top level element may always be an
......@@ -163,88 +182,94 @@ class TableTemplateGenerator(ABC):
ctype = ColumnType.SCALAR
# if it is an array, value defs are in 'items'
if 'type' in schema and schema['type'] == 'array':
if ('type' in schema['items'] and schema['items']['type'] == 'object'
if schema.get('type') == 'array':
if (schema['items'].get('type') == 'object'
and len(path) > 1): # list of references; special treatment
# we add a new sheet with columns generated from the subtree of the schema
sheetname = ".".join(path)
if sheetname in sheets:
raise ValueError(f"The shema would lead to two sheets with the same name which"
f" is forbidden:{sheetname}")
raise ValueError("The schema would lead to two sheets with the same name, "
f"which is forbidden: {sheetname}")
sheets[sheetname] = self._treat_schema_element(
schema['items'], sheets, path, foreign_keys, len(path),
schema=schema['items'], sheets=sheets, path=path, foreign_keys=foreign_keys,
level_in_sheet_name=len(path),
array_paths=array_paths+[path] # since this level is an array extend the list
)
# and add the foreign keys that are necessary up to this point
for pa in array_paths:
keys = self._get_foreign_keys(foreign_keys, pa)
for ke in keys:
if ke in sheets[sheetname]:
raise ValueError(f"The shema would lead to two columns with the same "
f"name which is forbidden:{ke}")
sheets[sheetname][ke] = (ColumnType.FOREIGN, f"see sheet '{path[0]}'",
pa+[ke])
# columns are added to the new sheet, thus we do not return columns
for array_path in array_paths:
keys = self._get_foreign_keys(foreign_keys, array_path)
for key in keys:
if key in sheets[sheetname]:
raise ValueError("The schema would lead to two columns with the same "
f"name which is forbidden: {key}")
sheets[sheetname][key] = (ColumnType.FOREIGN, f"see sheet '{path[0]}'",
array_path + [key])
# Columns are added to the new sheet, thus we do not return any columns for the
# current sheet.
return {}
# it is a list of primitive types -> semi colon separated list
# it is a list of primitive types -> semicolon separated list
schema = schema['items']
ctype = ColumnType.LIST
if 'oneOf' in schema:
for el in schema['oneOf']:
if 'type' in el:
schema = el
# This should only be the case for "new or existing reference".
for el in schema.get('oneOf', []):
if 'type' in el:
schema = el
break
if "properties" in schema: # recurse for each property
if "properties" in schema: # recurse for each property, then return
cols = {}
for pname in schema["properties"]:
col_defs = self._treat_schema_element(
schema["properties"][pname], sheets, path+[pname], foreign_keys,
level_in_sheet_name, array_paths=array_paths)
for k in col_defs.keys():
for k in col_defs:
if k in cols:
raise ValueError(f"The shema would lead to two columns with the same "
f"name which is forbidden:{k}")
raise ValueError(f"The schema would lead to two columns with the same "
f"name which is forbidden: {k}")
cols.update(col_defs)
return cols
else: # those are the leaves
description = schema['description'] if 'description' in schema else None
# definition of a single column
default_return = {".".join(path[level_in_sheet_name:]): (ctype, description, path)}
if 'type' not in schema and 'enum' in schema:
return default_return
if 'type' not in schema and 'anyOf' in schema:
for d in schema['anyOf']:
# currently the only case where this occurs is date formats
assert d['type'] == 'string'
assert d['format'] == 'date' or d['format'] == 'date-time'
return default_return
if schema["type"] in ['string', 'number', 'integer', 'boolean']:
if 'format' in schema and schema['format'] == 'data-url':
return {} # file; ignore for now
return default_return
raise ValueError("Inappropriate JSON schema: The following part should define an"
f" object with properties or a primitive type:\n{schema}\n")
raise RuntimeError("This should not be reached. Implementation error.")
# The schema is a leaf.
description = schema['description'] if 'description' in schema else None
# definition of a single column
default_return = {".".join(path[level_in_sheet_name:]): (ctype, description, path)}
if 'type' not in schema and 'enum' in schema:
return default_return
if 'type' not in schema and 'anyOf' in schema:
for d in schema['anyOf']:
# currently the only case where this occurs is date formats
assert d['type'] == 'string'
assert d['format'] == 'date' or d['format'] == 'date-time'
return default_return
if schema["type"] in ['string', 'number', 'integer', 'boolean']:
if 'format' in schema and schema['format'] == 'data-url':
return {} # file; ignore for now
return default_return
raise ValueError("Inappropriate JSON schema: The following part should define an"
f" object with properties or a primitive type:\n{schema}\n")
class XLSXTemplateGenerator(TableTemplateGenerator):
""" class for generating XLSX tables from json schema """
"""Class for generating XLSX tables from json schema definitions."""
def __init__(self):
pass
def generate(self, schema: dict, foreign_keys: dict, filepath: str):
""" generates a sheet definition from a given JSON schema
def generate(self, schema: dict, foreign_keys: dict, filepath: str) -> None:
"""Generate a sheet definition from a given JSON schema.
Parameters:
-----------
schema: dict, given JSON schema
foreign_keys: dict, a configuration that defines which attributes shall be used to create
additional columns when a list of references exists. See foreign_keys
argument of TableTemplateGenerator.generate.
filepath: str, the XLSX file will be stored under this path.
schema: dict
Given JSON schema
foreign_keys: dict
A configuration that defines which attributes shall be used to create
additional columns when a list of references exists. See ``foreign_keys``
argument of :ref:`TableTemplateGenerator.generate` .
filepath: str
The XLSX file will be stored under this path.
"""
sheets = self._generate_sheets_from_schema(schema, foreign_keys)
wb = self._create_workbook_from_sheets_def(sheets)
......@@ -279,22 +304,23 @@ class XLSXTemplateGenerator(TableTemplateGenerator):
def _create_workbook_from_sheets_def(
self, sheets: Dict[str, Dict[str, Tuple[ColumnType, Optional[str], list]]]):
"""Create and return a nice workbook for the given sheets."""
wb = Workbook()
assert wb.sheetnames == ["Sheet"]
for sn, sheetdef in sheets.items():
ws = wb.create_sheet(re.sub(INVALID_TITLE_REGEX, '_', sn))
# first row will by the COL_TYPE row
# first column will be the indicator row with values COL_TYPE, PATH, IGNORE
# the COL_TYPE row will be followed by as many PATH rows as needed
for sheetname, sheetdef in sheets.items():
ws = wb.create_sheet(re.sub(INVALID_TITLE_REGEX, '_', sheetname))
# First row will by the COL_TYPE row.
# First column will be the indicator row with values COL_TYPE, PATH, IGNORE.
# The COL_TYPE row will be followed by as many PATH rows as needed.
max_path_length = self._get_max_path_length(sheetdef)
header_index = 2+max_path_length
description_index = 3+max_path_length
header_index = 2 + max_path_length
description_index = 3 + max_path_length
# create first column
ws.cell(1, 1, RowType.COL_TYPE.name)
for index in range(max_path_length):
ws.cell(2+index, 1, RowType.PATH.name)
ws.cell(2 + index, 1, RowType.PATH.name)
ws.cell(header_index, 1, RowType.IGNORE.name)
ws.cell(description_index, 1, RowType.IGNORE.name)
......@@ -302,12 +328,12 @@ class XLSXTemplateGenerator(TableTemplateGenerator):
# create other columns
for index, (colname, ct, desc, path) in enumerate(ordered_cols):
ws.cell(1, 2+index, ct.name)
for pi, el in enumerate(path):
ws.cell(2+pi, 2+index, el)
ws.cell(header_index, 2+index, colname)
ws.cell(1, 2 + index, ct.name)
for path_index, el in enumerate(path):
ws.cell(2 + path_index, 2 + index, el)
ws.cell(header_index, 2 + index, colname)
if desc:
ws.cell(description_index, 2+index, desc)
ws.cell(description_index, 2 + index, desc)
# hide special rows
for index, row in enumerate(ws.rows):
......@@ -321,10 +347,10 @@ class XLSXTemplateGenerator(TableTemplateGenerator):
del wb['Sheet']
# order sheets
# for index, sn in enumerate(sorted(wb.sheetnames)):
# wb.move_sheet(sn, index-wb.index(wb[sn]))
# for index, sheetname in enumerate(sorted(wb.sheetnames)):
# wb.move_sheet(sheetname, index-wb.index(wb[sheetname]))
# reverse sheets
for index, sn in enumerate(wb.sheetnames[::-1]):
wb.move_sheet(sn, index-wb.index(wb[sn]))
for index, sheetname in enumerate(wb.sheetnames[::-1]):
wb.move_sheet(sheetname, index-wb.index(wb[sheetname]))
return wb
......@@ -146,13 +146,14 @@ def test_get_foreign_keys():
assert ['a'] == generator._get_foreign_keys(fkd, ['Training'])
fkd = {"Training": {'hallo'}}
with pytest.raises(ValueError, match=r"A foreign key definition is missing for path:\n\['Training'\]\n{'Training': \{'hallo'\}\}"):
with pytest.raises(ValueError, match=r"A foreign key definition is missing for path:\n\["
r"'Training'\]\nKeys are:\n{'Training': \{'hallo'\}\}"):
generator._get_foreign_keys(fkd, ['Training'])
fkd = {"Training": {"__this__": ['a'], 'b': ['c']}}
assert ['c'] == generator._get_foreign_keys(fkd, ['Training', 'b'])
with pytest.raises(ValueError, match=r"A foreign key definition is missing for.*"):
with pytest.raises(ValueError, match=r"A foreign key definition is missing for .*"):
generator._get_foreign_keys({}, ['Training'])
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment