Skip to content
Snippets Groups Projects

Extend json-schema model parser

Merged Florian Spreckelsen requested to merge f-enhance-json-parser into dev
Files
6
@@ -35,8 +35,9 @@ not defined, simply the name can be supplied with no value.
@@ -35,8 +35,9 @@ not defined, simply the name can be supplied with no value.
Parents can be provided under the 'inherit_from_xxxx' keywords. The value needs
Parents can be provided under the 'inherit_from_xxxx' keywords. The value needs
to be a list with the names. Here, NO NEW entities can be defined.
to be a list with the names. Here, NO NEW entities can be defined.
"""
"""
import json
import argparse
import argparse
 
import json
 
import jsonref
import re
import re
import sys
import sys
import yaml
import yaml
@@ -76,7 +77,8 @@ JSON_SCHEMA_ATOMIC_TYPES = [
@@ -76,7 +77,8 @@ JSON_SCHEMA_ATOMIC_TYPES = [
"string",
"string",
"boolean",
"boolean",
"integer",
"integer",
"number"
"number",
 
"null"
]
]
@@ -152,13 +154,29 @@ def parse_model_from_string(string):
@@ -152,13 +154,29 @@ def parse_model_from_string(string):
return parser.parse_model_from_string(string)
return parser.parse_model_from_string(string)
def parse_model_from_json_schema(filename: str):
def parse_model_from_json_schema(
 
filename: str,
 
top_level_recordtype: bool = True,
 
types_for_missing_array_items: dict = {},
 
ignore_unspecified_array_items: bool = False
 
):
"""Return a datamodel parsed from a json schema definition.
"""Return a datamodel parsed from a json schema definition.
Parameters
Parameters
----------
----------
filename : str
filename : str
The path of the json schema file that is to be parsed
The path of the json schema file that is to be parsed
 
top_level_recordtype : bool, optional
 
Whether there is a record type defined at the top level of the
 
schema. Default is true.
 
types_for_missing_array_items : dict, optional
 
dictionary containing fall-back types for json entries with `type:
 
array` but without `items` specification. Default is an empty dict.
 
ignore_unspecified_array_items : bool, optional
 
Whether to ignore `type: array` entries the type of which is not
 
specified by their `items` property or given in
 
`types_for_missing_array_items`. An error is raised if they are not
 
ignored. Default is False.
Returns
Returns
-------
-------
@@ -174,10 +192,10 @@ def parse_model_from_json_schema(filename: str):
@@ -174,10 +192,10 @@ def parse_model_from_json_schema(filename: str):
"""
"""
# @author Florian Spreckelsen
# @author Florian Spreckelsen
# @date 2022-02-17
# @date 2022-02-17
# @review Daniel Hornung 2022-02-18
# @review Timm Fitschen 2023-05-25
parser = JsonSchemaParser()
parser = JsonSchemaParser(types_for_missing_array_items, ignore_unspecified_array_items)
return parser.parse_model_from_json_schema(filename)
return parser.parse_model_from_json_schema(filename, top_level_recordtype)
class Parser(object):
class Parser(object):
@@ -600,14 +618,13 @@ class Parser(object):
@@ -600,14 +618,13 @@ class Parser(object):
class JsonSchemaParser(Parser):
class JsonSchemaParser(Parser):
"""Extends the yaml parser to read in datamodels defined in a json schema.
"""Extends the yaml parser to read in datamodels defined in a json schema.
**EXPERIMENTAL:** While this calss can already be used to create data models
**EXPERIMENTAL:** While this class can already be used to create data models
from basic json schemas, there are the following limitations and missing
from basic json schemas, there are the following limitations and missing
features:
features:
* Due to limitations of json-schema itself, we currently do not support
* Due to limitations of json-schema itself, we currently do not support
inheritance in the imported data models
inheritance in the imported data models
* The same goes for suggested properties of RecordTypes
* The same goes for suggested properties of RecordTypes
* Currently, ``$defs`` and ``$ref`` in the input schema are not resolved.
* Already defined RecordTypes and (scalar) Properties can't be re-used as
* Already defined RecordTypes and (scalar) Properties can't be re-used as
list properties
list properties
* Reference properties that are different from the referenced RT. (Although
* Reference properties that are different from the referenced RT. (Although
@@ -615,15 +632,18 @@ class JsonSchemaParser(Parser):
@@ -615,15 +632,18 @@ class JsonSchemaParser(Parser):
* Values
* Values
* Roles
* Roles
* The extern keyword from the yaml parser
* The extern keyword from the yaml parser
* Currently, a json-schema cannot be transformed into a data model if its
root element isn't a RecordType (or Property) with ``title`` and ``type``.
"""
"""
# @author Florian Spreckelsen
# @author Florian Spreckelsen
# @date 2022-02-17
# @date 2022-02-17
# @review Timm Fitschen 2022-02-30
# @review Timm Fitschen 2023-05-25
 
 
def __init__(self, types_for_missing_array_items={}, ignore_unspecified_array_items=False):
 
super().__init__()
 
self.types_for_missing_array_items = types_for_missing_array_items
 
self.ignore_unspecified_array_items = ignore_unspecified_array_items
def parse_model_from_json_schema(self, filename: str):
def parse_model_from_json_schema(self, filename: str, top_level_recordtype: bool = True):
"""Return a datamodel created from the definition in the json schema in
"""Return a datamodel created from the definition in the json schema in
`filename`.
`filename`.
@@ -631,6 +651,9 @@ class JsonSchemaParser(Parser):
@@ -631,6 +651,9 @@ class JsonSchemaParser(Parser):
----------
----------
filename : str
filename : str
The path to the json-schema file containing the datamodel definition
The path to the json-schema file containing the datamodel definition
 
top_level_recordtype : bool, optional
 
Whether there is a record type defined at the top level of the
 
schema. Default is true.
Returns
Returns
-------
-------
@@ -639,13 +662,13 @@ class JsonSchemaParser(Parser):
@@ -639,13 +662,13 @@ class JsonSchemaParser(Parser):
"""
"""
# @author Florian Spreckelsen
# @author Florian Spreckelsen
# @date 2022-02-17
# @date 2022-02-17
# @review Timm Fitschen 2022-02-30
# @review Timm Fitschen 2023-05-25
with open(filename, 'r') as schema_file:
with open(filename, 'r') as schema_file:
model_dict = json.load(schema_file)
model_dict = jsonref.load(schema_file)
return self._create_model_from_dict(model_dict)
return self._create_model_from_dict(model_dict, top_level_recordtype=top_level_recordtype)
def _create_model_from_dict(self, model_dict: [dict, List[dict]]):
def _create_model_from_dict(self, model_dict: [dict, List[dict]], top_level_recordtype: bool = True):
"""Parse a dictionary and return the Datamodel created from it.
"""Parse a dictionary and return the Datamodel created from it.
The dictionary was typically created from the model definition in a json schema file.
The dictionary was typically created from the model definition in a json schema file.
@@ -654,36 +677,68 @@ class JsonSchemaParser(Parser):
@@ -654,36 +677,68 @@ class JsonSchemaParser(Parser):
----------
----------
model_dict : dict or list[dict]
model_dict : dict or list[dict]
One or several dictionaries read in from a json-schema file
One or several dictionaries read in from a json-schema file
 
top_level_recordtype : bool, optional
 
Whether there is a record type defined at the top level of the
 
schema. Default is true.
Returns
Returns
-------
-------
out : DataModel
out : DataModel
The datamodel defined in `model_dict`
The datamodel defined in `model_dict`
"""
"""
# @review Timm Fitschen 2022-02-30
# @review Timm Fitschen 2023-05-25
if isinstance(model_dict, dict):
if isinstance(model_dict, dict):
model_dict = [model_dict]
model_dict = [model_dict]
for ii, elt in enumerate(model_dict):
for ii, elt in enumerate(model_dict):
if "title" not in elt:
raise JsonSchemaDefinitionError(
f"Object {ii+1} is lacking the `title` key word")
if "type" not in elt:
raise JsonSchemaDefinitionError(
f"Object {ii+1} is lacking the `type` key word")
# Check if this is a valid Json Schema
try:
try:
jsonschema.Draft202012Validator.check_schema(elt)
jsonschema.Draft202012Validator.check_schema(elt)
except jsonschema.SchemaError as err:
except jsonschema.SchemaError as err:
 
key = elt["title"] if "title" in elt else f"element {ii}"
raise JsonSchemaDefinitionError(
raise JsonSchemaDefinitionError(
f"Json Schema error in {elt['title']}:\n{str(err)}") from err
f"Json Schema error in {key}:\n{str(err)}") from err
name = self._stringify(elt["title"], context=elt)
self._treat_element(elt, name)
if top_level_recordtype:
 
if "title" not in elt:
 
raise JsonSchemaDefinitionError(
 
f"Object {ii+1} is lacking the `title` key word")
 
if "type" not in elt:
 
raise JsonSchemaDefinitionError(
 
f"Object {ii+1} is lacking the `type` key word")
 
# Check if this is a valid Json Schema
 
name = self._stringify(elt["title"], context=elt)
 
self._treat_element(elt, name)
 
elif "properties" in elt or "patternProperties" in elt:
 
# No top-level type but there are entities
 
if "properties" in elt:
 
for key, prop in elt["properties"].items():
 
name = self._get_name_from_property(key, prop)
 
self._treat_element(prop, name)
 
if "patternProperties" in elt:
 
# See also treatment in ``_treat_record_type``. Since here,
 
# there is no top-level RT we use the prefix `__Pattern`,
 
# i.e., the resulting Record Types will be called
 
# `__PatternEntry`.
 
self._treat_pattern_properties(
 
elt["patternProperties"], name_prefix="__Pattern")
 
else:
 
# Neither RecordType itself, nor further properties in schema,
 
# so nothing to do here. Maybe add something in the future.
 
continue
return DataModel(self.model.values())
return DataModel(self.model.values())
 
def _get_name_from_property(self, key: str, prop: dict):
 
# @review Timm Fitschen 2023-05-25
 
if "title" in prop:
 
name = self._stringify(prop["title"])
 
else:
 
name = self._stringify(key)
 
 
return name
 
def _get_atomic_datatype(self, elt):
def _get_atomic_datatype(self, elt):
# @review Timm Fitschen 2022-02-30
# @review Timm Fitschen 2023-05-25
if elt["type"] == "string":
if elt["type"] == "string":
if "format" in elt and elt["format"] in ["date", "date-time"]:
if "format" in elt and elt["format"] in ["date", "date-time"]:
return db.DATETIME
return db.DATETIME
@@ -695,11 +750,15 @@ class JsonSchemaParser(Parser):
@@ -695,11 +750,15 @@ class JsonSchemaParser(Parser):
return db.DOUBLE
return db.DOUBLE
elif elt["type"] == "boolean":
elif elt["type"] == "boolean":
return db.BOOLEAN
return db.BOOLEAN
 
elif elt["type"] == "null":
 
# This could be any datatype since a valid json will never have a
 
# value in a null property. We use TEXT for convenience.
 
return db.TEXT
else:
else:
raise JsonSchemaDefinitionError(f"Unkown atomic type in {elt}.")
raise JsonSchemaDefinitionError(f"Unkown atomic type in {elt}.")
def _treat_element(self, elt: dict, name: str):
def _treat_element(self, elt: dict, name: str):
# @review Timm Fitschen 2022-02-30
# @review Timm Fitschen 2023-05-25
force_list = False
force_list = False
if name in self.model:
if name in self.model:
return self.model[name], force_list
return self.model[name], force_list
@@ -710,12 +769,17 @@ class JsonSchemaParser(Parser):
@@ -710,12 +769,17 @@ class JsonSchemaParser(Parser):
if name == "name":
if name == "name":
# This is identified with the CaosDB name property as long as the
# This is identified with the CaosDB name property as long as the
# type is correct.
# type is correct.
if not elt["type"] == "string":
if not elt["type"] == "string" and "string" not in elt["type"]:
raise JsonSchemaDefinitionError(
raise JsonSchemaDefinitionError(
"The 'name' property must be string-typed, otherwise it cannot "
"The 'name' property must be string-typed, otherwise it cannot "
"be identified with CaosDB's name property."
"be identified with CaosDB's name property."
)
)
return None, force_list
return None, force_list
 
# LinkAhead supports null for all types, so in the very special case of
 
# `"type": ["null", "<other_type>"]`, only consider the other type:
 
if isinstance(elt["type"], list) and len(elt["type"]) == 2 and "null" in elt["type"]:
 
elt["type"].remove("null")
 
elt["type"] = elt["type"][0]
if "enum" in elt:
if "enum" in elt:
ent = self._treat_enum(elt, name)
ent = self._treat_enum(elt, name)
elif elt["type"] in JSON_SCHEMA_ATOMIC_TYPES:
elif elt["type"] in JSON_SCHEMA_ATOMIC_TYPES:
@@ -733,11 +797,12 @@ class JsonSchemaParser(Parser):
@@ -733,11 +797,12 @@ class JsonSchemaParser(Parser):
# treat_something function
# treat_something function
ent.description = elt["description"]
ent.description = elt["description"]
self.model[name] = ent
if ent is not None:
 
self.model[name] = ent
return ent, force_list
return ent, force_list
def _treat_record_type(self, elt: dict, name: str):
def _treat_record_type(self, elt: dict, name: str):
# @review Timm Fitschen 2022-02-30
# @review Timm Fitschen 2023-05-25
rt = db.RecordType(name=name)
rt = db.RecordType(name=name)
if "required" in elt:
if "required" in elt:
required = elt["required"]
required = elt["required"]
@@ -745,10 +810,7 @@ class JsonSchemaParser(Parser):
@@ -745,10 +810,7 @@ class JsonSchemaParser(Parser):
required = []
required = []
if "properties" in elt:
if "properties" in elt:
for key, prop in elt["properties"].items():
for key, prop in elt["properties"].items():
if "title" in prop:
name = self._get_name_from_property(key, prop)
name = self._stringify(prop["title"])
else:
name = self._stringify(key)
prop_ent, force_list = self._treat_element(prop, name)
prop_ent, force_list = self._treat_element(prop, name)
if prop_ent is None:
if prop_ent is None:
# Nothing to be appended since the property has to be
# Nothing to be appended since the property has to be
@@ -762,6 +824,17 @@ class JsonSchemaParser(Parser):
@@ -762,6 +824,17 @@ class JsonSchemaParser(Parser):
rt.add_property(prop_ent, importance=importance,
rt.add_property(prop_ent, importance=importance,
datatype=db.LIST(prop_ent))
datatype=db.LIST(prop_ent))
 
if "patternProperties" in elt:
 
 
pattern_property_rts = self._treat_pattern_properties(
 
elt["patternProperties"], name_prefix=name)
 
for ppr in pattern_property_rts:
 
# add reference to pattern property type. These can never be
 
# obligatory since pattern properties cannot be required in the
 
# original schema (since their actual names are not known a
 
# priori).
 
rt.add_property(ppr)
 
if "description" in elt:
if "description" in elt:
rt.description = elt["description"]
rt.description = elt["description"]
return rt
return rt
@@ -783,28 +856,96 @@ class JsonSchemaParser(Parser):
@@ -783,28 +856,96 @@ class JsonSchemaParser(Parser):
return rt
return rt
def _treat_list(self, elt: dict, name: str):
def _treat_list(self, elt: dict, name: str):
# @review Timm Fitschen 2022-02-30
# @review Timm Fitschen 2023-05-25
if "items" not in elt:
if "items" not in elt and name not in self.types_for_missing_array_items:
 
if self.ignore_unspecified_array_items:
 
return None, False
raise JsonSchemaDefinitionError(
raise JsonSchemaDefinitionError(
f"The definition of the list items is missing in {elt}.")
f"The definition of the list items is missing in {elt}.")
items = elt["items"]
if "items" in elt:
if "enum" in items:
items = elt["items"]
return self._treat_enum(items, name), True
if "enum" in items:
if items["type"] in JSON_SCHEMA_ATOMIC_TYPES:
return self._treat_enum(items, name), True
datatype = db.LIST(self._get_atomic_datatype(items))
if items["type"] in JSON_SCHEMA_ATOMIC_TYPES:
 
datatype = db.LIST(self._get_atomic_datatype(items))
 
return db.Property(name=name, datatype=datatype), False
 
if items["type"] == "object":
 
if "title" not in items or self._stringify(items["title"]) == name:
 
# Property is RecordType
 
return self._treat_record_type(items, name), True
 
else:
 
# List property will be an entity of its own with a name
 
# different from the referenced RT
 
ref_rt = self._treat_record_type(
 
items, self._stringify(items["title"]))
 
self.model[ref_rt.name] = ref_rt
 
return db.Property(name=name, datatype=db.LIST(ref_rt)), False
 
else:
 
# Use predefined type:
 
datatype = db.LIST(self.types_for_missing_array_items[name])
return db.Property(name=name, datatype=datatype), False
return db.Property(name=name, datatype=datatype), False
if items["type"] == "object":
if "title" not in items or self._stringify(items["title"]) == name:
def _get_pattern_prop(self):
    • This is a property that is added automatically to the datamodel as soon as there is at least one pattern property in the schema. See the new documentation.

Please register or sign in to reply
# Property is RecordType
# @review Timm Fitschen 2023-05-25
return self._treat_record_type(items, name), True
if "__pattern_property_pattern_property" in self.model:
 
return self.model["__pattern_property_pattern_property"]
 
pp = db.Property(name="__matched_pattern", datatype=db.TEXT)
 
self.model["__pattern_property_pattern_property"] = pp
 
return pp
 
 
def _treat_pattern_properties(self, pattern_elements, name_prefix=""):
 
"""Special Treatment for pattern properties: A RecordType is created for
 
each pattern property. In case of a `type: object` PatternProperty, the
 
remaining properties of the JSON entry are appended to the new
 
RecordType; in case of an atomic type PatternProperty, a single value
 
Property is added to the RecordType.
 
 
Raises
 
------
 
NotImplementedError
 
In case of patternProperties with non-object, non-atomic type, e.g.,
 
array.
 
 
"""
 
# @review Timm Fitschen 2023-05-25
 
num_patterns = len(pattern_elements)
 
pattern_prop = self._get_pattern_prop()
 
returns = []
 
for ii, (key, element) in enumerate(pattern_elements.items()):
 
if "title" not in element:
 
name_suffix = f"_{ii+1}" if num_patterns > 1 else ""
 
name = name_prefix + "Entry" + name_suffix
 
else:
 
name = element["title"]
 
if element["type"] == "object":
 
# simple, is already an object, so can be treated like any other
 
# record type.
 
pattern_type = self._treat_record_type(element, name)
 
elif element["type"] in JSON_SCHEMA_ATOMIC_TYPES:
Please register or sign in to reply
 
# create a property that stores the actual value of the pattern
 
# property.
 
propname = f"{name}_value"
 
prop = db.Property(name=propname, datatype=self._get_atomic_datatype(element))
 
self.model[propname] = prop
 
pattern_type = db.RecordType(name=name)
 
pattern_type.add_property(prop)
 
else:
 
raise NotImplementedError(
 
"Pattern properties are currently only supported for types " +
 
", ".join(JSON_SCHEMA_ATOMIC_TYPES) + ", and object.")
 
 
# Add pattern property and description
 
pattern_type.add_property(pattern_prop, importance=db.OBLIGATORY)
 
if pattern_type.description:
 
pattern_type.description += f"\n\npattern: {key}"
else:
else:
# List property will be an entity of its own with a name
pattern_type.description = f"pattern: {key}"
# different from the referenced RT
ref_rt = self._treat_record_type(
self.model[name] = pattern_type
items, self._stringify(items["title"]))
returns.append(pattern_type)
self.model[ref_rt.name] = ref_rt
return db.Property(name=name, datatype=db.LIST(ref_rt)), False
return returns
if __name__ == "__main__":
if __name__ == "__main__":
Loading