Skip to content
Snippets Groups Projects

Extend json-schema model parser

Merged Florian Spreckelsen requested to merge f-enhance-json-parser into dev
Files
10
@@ -77,7 +77,8 @@ JSON_SCHEMA_ATOMIC_TYPES = [
"string",
"boolean",
"integer",
"number"
"number",
"null"
]
@@ -153,7 +154,12 @@ def parse_model_from_string(string):
return parser.parse_model_from_string(string)
def parse_model_from_json_schema(filename: str, top_level_recordtype: bool = True):
def parse_model_from_json_schema(
filename: str,
top_level_recordtype: bool = True,
types_for_missing_array_items: dict = {},
ignore_unspecified_array_items: bool = False
):
"""Return a datamodel parsed from a json schema definition.
Parameters
@@ -163,7 +169,14 @@ def parse_model_from_json_schema(filename: str, top_level_recordtype: bool = Tru
top_level_recordtype : bool, optional
Whether there is a record type defined at the top level of the
schema. Default is true.
types_for_missing_array_items : dict, optional
dictionary containing fall-back types for json entries with `type:
array` but without `items` specification. Default is an empty dict.
ignore_unspecified_array_items : bool, optional
Whether to ignore `type: array` entries the type of which is not
specified by their `items` property or given in
`types_for_missing_array_items`. An error is raised if they are not
ignored. Default is False.
Returns
-------
@@ -179,8 +192,8 @@ def parse_model_from_json_schema(filename: str, top_level_recordtype: bool = Tru
"""
# @author Florian Spreckelsen
# @date 2022-02-17
# @review Daniel Hornung 2022-02-18
parser = JsonSchemaParser()
# @review Timm Fitschen 2023-05-25
parser = JsonSchemaParser(types_for_missing_array_items, ignore_unspecified_array_items)
return parser.parse_model_from_json_schema(filename, top_level_recordtype)
@@ -605,14 +618,13 @@ class Parser(object):
class JsonSchemaParser(Parser):
"""Extends the yaml parser to read in datamodels defined in a json schema.
**EXPERIMENTAL:** While this calss can already be used to create data models
**EXPERIMENTAL:** While this class can already be used to create data models
from basic json schemas, there are the following limitations and missing
features:
* Due to limitations of json-schema itself, we currently do not support
inheritance in the imported data models
* The same goes for suggested properties of RecordTypes
* Currently, ``$defs`` and ``$ref`` in the input schema are not resolved.
* Already defined RecordTypes and (scalar) Properties can't be re-used as
list properties
* Reference properties that are different from the referenced RT. (Although
@@ -620,13 +632,16 @@ class JsonSchemaParser(Parser):
* Values
* Roles
* The extern keyword from the yaml parser
* Currently, a json-schema cannot be transformed into a data model if its
root element isn't a RecordType (or Property) with ``title`` and ``type``.
"""
# @author Florian Spreckelsen
# @date 2022-02-17
# @review Timm Fitschen 2022-02-30
# @review Timm Fitschen 2023-05-25
def __init__(self, types_for_missing_array_items=None, ignore_unspecified_array_items=False):
    """Set up a parser for json-schema data models.

    Parameters
    ----------
    types_for_missing_array_items : dict, optional
        Fall-back types, keyed by property name, for ``type: array``
        entries that have no ``items`` specification. ``None`` (the
        default) is treated as an empty dict.
    ignore_unspecified_array_items : bool, optional
        Whether to skip ``type: array`` entries whose item type is given
        neither by ``items`` nor by `types_for_missing_array_items`
        instead of raising an error. Default is False.
    """
    super().__init__()
    # A literal ``{}`` default would be one dict object shared by every
    # parser created without this argument, so a fresh dict is built here.
    if types_for_missing_array_items is None:
        types_for_missing_array_items = {}
    self.types_for_missing_array_items = types_for_missing_array_items
    self.ignore_unspecified_array_items = ignore_unspecified_array_items
def parse_model_from_json_schema(self, filename: str, top_level_recordtype: bool = True):
"""Return a datamodel created from the definition in the json schema in
@@ -647,7 +662,7 @@ class JsonSchemaParser(Parser):
"""
# @author Florian Spreckelsen
# @date 2022-02-17
# @review Timm Fitschen 2022-02-30
# @review Timm Fitschen 2023-05-25
with open(filename, 'r') as schema_file:
model_dict = jsonref.load(schema_file)
@@ -671,7 +686,7 @@ class JsonSchemaParser(Parser):
out : DataModel
The datamodel defined in `model_dict`
"""
# @review Timm Fitschen 2022-02-30
# @review Timm Fitschen 2023-05-25
if isinstance(model_dict, dict):
model_dict = [model_dict]
@@ -693,10 +708,19 @@ class JsonSchemaParser(Parser):
# Check if this is a valid Json Schema
name = self._stringify(elt["title"], context=elt)
self._treat_element(elt, name)
elif "properties" in elt:
for key, prop in elt["properties"].items():
name = self._get_name_from_property(key, prop)
self._treat_element(prop, name)
elif "properties" in elt or "patternProperties" in elt:
# No top-level type but there are entities
if "properties" in elt:
for key, prop in elt["properties"].items():
name = self._get_name_from_property(key, prop)
self._treat_element(prop, name)
if "patternProperties" in elt:
# See also treatment in ``_treat_record_type``. Since here,
# there is no top-level RT we use the prefix `__Pattern`,
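# i.e., the resulting Record Types will be called
# `__PatternEntry` (with a numeric suffix if there are several
# patterns), unless the pattern element has its own `title`.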
self._treat_pattern_properties(
elt["patternProperties"], name_prefix="__Pattern")
else:
# Neither RecordType itself, nor further properties in schema,
# so nothing to do here. Maybe add something in the future.
@@ -705,7 +729,7 @@ class JsonSchemaParser(Parser):
return DataModel(self.model.values())
def _get_name_from_property(self, key: str, prop: dict):
# @review Timm Fitschen 2023-05-25
if "title" in prop:
name = self._stringify(prop["title"])
else:
@@ -714,7 +738,7 @@ class JsonSchemaParser(Parser):
return name
def _get_atomic_datatype(self, elt):
# @review Timm Fitschen 2022-02-30
# @review Timm Fitschen 2023-05-25
if elt["type"] == "string":
if "format" in elt and elt["format"] in ["date", "date-time"]:
return db.DATETIME
@@ -726,11 +750,15 @@ class JsonSchemaParser(Parser):
return db.DOUBLE
elif elt["type"] == "boolean":
return db.BOOLEAN
elif elt["type"] == "null":
# This could be any datatype since a valid json will never have a
# value in a null property. We use TEXT for convenience.
return db.TEXT
else:
raise JsonSchemaDefinitionError(f"Unkown atomic type in {elt}.")
def _treat_element(self, elt: dict, name: str):
# @review Timm Fitschen 2022-02-30
# @review Timm Fitschen 2023-05-25
force_list = False
if name in self.model:
return self.model[name], force_list
@@ -741,7 +769,7 @@ class JsonSchemaParser(Parser):
if name == "name":
# This is identified with the CaosDB name property as long as the
# type is correct.
if not elt["type"] == "string":
if not elt["type"] == "string" and "string" not in elt["type"]:
raise JsonSchemaDefinitionError(
"The 'name' property must be string-typed, otherwise it cannot "
"be identified with CaosDB's name property."
@@ -769,11 +797,12 @@ class JsonSchemaParser(Parser):
# treat_something function
ent.description = elt["description"]
self.model[name] = ent
if ent is not None:
self.model[name] = ent
return ent, force_list
def _treat_record_type(self, elt: dict, name: str):
# @review Timm Fitschen 2022-02-30
# @review Timm Fitschen 2023-05-25
rt = db.RecordType(name=name)
if "required" in elt:
required = elt["required"]
@@ -795,6 +824,17 @@ class JsonSchemaParser(Parser):
rt.add_property(prop_ent, importance=importance,
datatype=db.LIST(prop_ent))
if "patternProperties" in elt:
pattern_property_rts = self._treat_pattern_properties(
elt["patternProperties"], name_prefix=name)
for ppr in pattern_property_rts:
# add reference to pattern property type. These can never be
# obligatory since pattern properties cannot be required in the
# original schema (since their actual names are not known a
# priori).
rt.add_property(ppr)
if "description" in elt:
rt.description = elt["description"]
return rt
@@ -816,28 +856,96 @@ class JsonSchemaParser(Parser):
return rt
def _treat_list(self, elt: dict, name: str):
    """Translate a ``type: array`` schema element into a data model entity.

    Parameters
    ----------
    elt : dict
        The json-schema element describing the array.
    name : str
        Name of the resulting entity. It is also the key used to look up a
        fall-back type in ``self.types_for_missing_array_items`` when
        `elt` has no ``items``.

    Returns
    -------
    tuple
        ``(entity, flag)``. ``flag`` is True when ``entity`` was built
        from the list *items* themselves (an enum or a record type named
        like the property) and False when ``entity`` is a list-typed
        Property, or None for an ignored array.
        NOTE(review): presumably the flag tells the caller to apply the
        LIST datatype itself -- confirm against ``_treat_element``.

    Raises
    ------
    JsonSchemaDefinitionError
        If `elt` has no ``items``, no fall-back type is configured for
        `name`, and unspecified array items are not ignored.
    """
    # @review Timm Fitschen 2023-05-25
    if "items" not in elt and name not in self.types_for_missing_array_items:
        if self.ignore_unspecified_array_items:
            return None, False
        raise JsonSchemaDefinitionError(
            f"The definition of the list items is missing in {elt}.")
    if "items" in elt:
        # An explicit ``items`` specification wins over a configured
        # fall-back type.
        items = elt["items"]
        if "enum" in items:
            return self._treat_enum(items, name), True
        if items["type"] in JSON_SCHEMA_ATOMIC_TYPES:
            datatype = db.LIST(self._get_atomic_datatype(items))
            return db.Property(name=name, datatype=datatype), False
        if items["type"] == "object":
            if "title" not in items or self._stringify(items["title"]) == name:
                # Property is RecordType
                return self._treat_record_type(items, name), True
            else:
                # List property will be an entity of its own with a name
                # different from the referenced RT
                ref_rt = self._treat_record_type(
                    items, self._stringify(items["title"]))
                self.model[ref_rt.name] = ref_rt
                return db.Property(name=name, datatype=db.LIST(ref_rt)), False
        # NOTE(review): any other item type (e.g. nested arrays) falls
        # through here and the method returns None instead of a tuple --
        # confirm that callers can cope with that.
    else:
        # Use predefined type:
        datatype = db.LIST(self.types_for_missing_array_items[name])
        return db.Property(name=name, datatype=datatype), False
def _get_pattern_prop(self):
    """Return the shared ``__matched_pattern`` TEXT property.

    The property is created on first use and stored in ``self.model``
    under a dedicated key, so every later call returns the same object.
    NOTE(review): presumably it is meant to hold the concrete key that
    matched a pattern -- confirm against the code that fills records.
    """
    # @review Timm Fitschen 2023-05-25
    cache_key = "__pattern_property_pattern_property"
    if cache_key not in self.model:
        self.model[cache_key] = db.Property(name="__matched_pattern", datatype=db.TEXT)
    return self.model[cache_key]
def _treat_pattern_properties(self, pattern_elements, name_prefix=""):
    """Special Treatment for pattern properties: A RecordType is created for
    each pattern property. In case of a `type: object` PatternProperty, the
    remaining properties of the JSON entry are appended to the new
    RecordType; in case of an atomic type PatternProperty, a single value
    Property is added to the RecordType.

    Parameters
    ----------
    pattern_elements : dict
        The ``patternProperties`` mapping of a schema element: pattern
        string -> schema element for entries matching that pattern.
    name_prefix : str, optional
        Prefix for the generated RecordType names. An element without
        ``title`` is named ``<name_prefix>Entry``, with a ``_<n>`` suffix
        if there is more than one pattern. Default is the empty string.

    Returns
    -------
    list
        The RecordTypes created for the patterns, in iteration order of
        `pattern_elements`. Each one is also stored in ``self.model``.

    Raises
    ------
    NotImplementedError
        In case of patternProperties with non-object, non-atomic type, e.g.,
        array.
    """
    # @review Timm Fitschen 2023-05-25
    num_patterns = len(pattern_elements)
    pattern_prop = self._get_pattern_prop()
    returns = []
    for ii, (key, element) in enumerate(pattern_elements.items()):
        if "title" not in element:
            # Only number the generated names if they would clash otherwise.
            name_suffix = f"_{ii+1}" if num_patterns > 1 else ""
            name = name_prefix + "Entry" + name_suffix
        else:
            name = element["title"]
        if element["type"] == "object":
            # simple, is already an object, so can be treated like any other
            # record type.
            pattern_type = self._treat_record_type(element, name)
        elif element["type"] in JSON_SCHEMA_ATOMIC_TYPES:
            # create a property that stores the actual value of the pattern
            # property.
            propname = f"{name}_value"
            prop = db.Property(name=propname, datatype=self._get_atomic_datatype(element))
            self.model[propname] = prop
            pattern_type = db.RecordType(name=name)
            pattern_type.add_property(prop)
        else:
            raise NotImplementedError(
                "Pattern properties are currently only supported for types " +
                ", ".join(JSON_SCHEMA_ATOMIC_TYPES) + ", and object.")

        # Add pattern property and description
        pattern_type.add_property(pattern_prop, importance=db.OBLIGATORY)
        if pattern_type.description:
            pattern_type.description += f"\n\npattern: {key}"
        else:
            pattern_type.description = f"pattern: {key}"

        self.model[name] = pattern_type
        returns.append(pattern_type)

    return returns
if __name__ == "__main__":
Loading