diff --git a/CHANGELOG.md b/CHANGELOG.md index 018d8876d941b6cc9fe05f33c375033512ddb5ce..62105323a81f22594c92601a405e287dc76106ee 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,6 +11,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - CFood that creates a Record for each line in a csv file - `generic_analysis.py` allows to easily call scripts to perform analyses in server side scripting [EXPERIMENTAL] +- **EXPERIMENTAL:** Models parser can import from Json Schema files now: + `models.parser.parse_model_from_json_schema(...)`. See the documentation of + `models.parser.JsonSchemaParser` for the limitations of the current + implementation. - New keyword "role" in yaml data model that allows creation of Records and Files. - It is now possible to set values of properties and default values of properties directly in the yaml model. @@ -35,7 +39,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Added ### - `check_reference_field` function to check whether entities with provided ids exits (for example when importing data from a table) -- added the `datatypes` argument to `TableImporter` for columns that do not +- added the `datatypes` argument to `TableImporter` for columns that do not need a special conversion function ## [0.3.0] - 2021-11-02 ## @@ -49,14 +53,14 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - New class to collect possible problems with the data model - New class for checking and importing tables - Function to get a file path to a shared resource directory -- Function to setup logging appropriate for server side scripts with webui +- Function to setup logging appropriate for server side scripts with webui output - New class for collecting information for exporting tables, e.g., to metadata repositories - new name parsing - new test for software folder structure - new assure_name_is function -- two utility functions when working with 
files: NameCollector and +- two utility functions when working with files: NameCollector and get_file_via_download - Automated documentation builds: `make doc` - Crawler documentation @@ -69,8 +73,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Changed ### -- identifiables of single CFoods are now treated one after the other. This - allows them to have dependencies among each other if they are ordered +- identifiables of single CFoods are now treated one after the other. This + allows them to have dependencies among each other if they are ordered correctly - identifiables must have at least one property or a name - `caosadvancedtools.serverside.helper.init_data_model` also checks the role @@ -98,9 +102,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 cause an `sqlite3.IntegrityError` if more than one change was cached for the same entity. - #40 Insertion of identifiables with missing obligatory properties -- Before, a Property with the datatype "LIST(TEXT)" would lead to the creation +- Before, a Property with the datatype "LIST(TEXT)" would lead to the creation of a RecordType. This is fixed now. -- #52 `XLSimporter.read_xls` throwed a wrong error when reading from a file with a wrong ending. +- #52 `XLSimporter.read_xls` throwed a wrong error when reading from a file with a wrong ending. Now, a `DataInconsistencyError` is raised instead of a ValueError. - List properties are no longer updated unnecessarily by the crawler. 
diff --git a/integrationtests/test.sh b/integrationtests/test.sh index aebec016e62187f1f790a9bd4a3a31d77f934d50..a142d917215eb7469faab9c66a581539ce867e4e 100755 --- a/integrationtests/test.sh +++ b/integrationtests/test.sh @@ -83,6 +83,9 @@ python3 -m pytest test_crawl_with_datamodel_problems.py echo "Testing table export" python3 -m pytest test_base_table_exporter_integration.py +echo "Testing json-schema datamodel parser" +python3 -m pytest test_json_schema_datamodel_parser.py + echo "Testing yaml datamodel parser" python3 -m pytest test_yaml_parser.py diff --git a/integrationtests/test_datamodel.schema.json b/integrationtests/test_datamodel.schema.json new file mode 100644 index 0000000000000000000000000000000000000000..356964702dd83a8c81edf1e8d72bf4a30468e6f2 --- /dev/null +++ b/integrationtests/test_datamodel.schema.json @@ -0,0 +1,85 @@ +[ + { + "title": "TestTypeWithAtomicProps", + "description": "RecordType with scalar atomic properties", + "type": "object", + "properties": { + "simple_text_prop": { "type": "string" }, + "int_prop_with_name": { "type": "integer", "title": "IntegerProperty" }, + "double_prop": { + "type": "number", + "description": "Some generic double-valued property" + }, + "bool_prop": { "type": "boolean" }, + "datetime_prop": { "type": "string", "format": "date-time" }, + "date_prop": { "type": "string", "format": "date" } + }, + "required": [ "simple_text_prop", "double_prop" ] + }, + { + "title": "TestTypeWithReferencesAndEnum", + "type": "object", + "properties": { + "TestTypeWithAtomicProps": {}, + "OtherReference": { + "type": "object", + "description": "Some generic refernced RecordType", + "properties": {} + }, + "named_refernce": { + "type": "object", + "title": "NamedReference", + "properties": { + "simple_text_prop": {} + } + }, + "string_enum": { + "type": "string", + "enum": [ "StringEnumA", "StringEnumB", "StringEnumC" ] + }, + "named_enum": { + "type": "string", + "title": "NamedEnum", + "enum": [ "NameA", "NameB", "NameC" 
] + } + } + }, + { + "title": "TestTypeWithLists", + "type": "object", + "properties": { + "string_list": { + "type": "array", + "description": "A list of words", + "items": { "type": "string" } + }, + "named_int_list": { + "type": "array", + "title": "NamedIntList", + "items": { "type": "integer" } + }, + "ListRecordType": { + "type": "array", + "items": { "type": "object", "properties": {} } + }, + "NamedReferenceList": { + "type": "array", + "items": { + "title": "ReferencedListTypeWithName", + "type": "object", + "description": "Referenced by a named list-of-references property", + "properties": { + "double_prop": {} + } + } + }, + "ListNumberEnum": { + "type": "array", + "items": { + "type": "number", + "enum": [ 1.1, 2.2, 3.3 ] + } + } + } + } +] diff --git a/integrationtests/test_json_schema_datamodel_parser.py b/integrationtests/test_json_schema_datamodel_parser.py new file mode 100644 index 0000000000000000000000000000000000000000..21ae8d2d7bad5527a7a314220b38af8ff816475f --- /dev/null +++ b/integrationtests/test_json_schema_datamodel_parser.py @@ -0,0 +1,174 @@ +# +# This file is a part of the CaosDB Project. +# +# Copyright (C) 2022 IndiScale GmbH <info@indiscale.com> +# Copyright (C) 2022 Florian Spreckelsen <f.spreckelsen@indiscale.com> +# +# This program is free software: you can redistribute it and/or modify it under +# the terms of the GNU Affero General Public License as published by the Free +# Software Foundation, either version 3 of the License, or (at your option) any +# later version. +# +# This program is distributed in the hope that it will be useful, but WITHOUT +# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +# FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more +# details. +# +# You should have received a copy of the GNU Affero General Public License along +# with this program. If not, see <https://www.gnu.org/licenses/>. 
+# + +import os + +import caosdb as db +from caosadvancedtools.models.parser import parse_model_from_json_schema + + +def _clear_db(): + ents = db.execute_query("FIND ENTITY WITH ID>99") + if ents: + ents.delete() + + +def setup_module(): + _clear_db() + + +def teardown_module(): + _clear_db() + + +def _load_and_sync(fname): + """Load datamodel from json schema in fname and synchronize it without asking. + + """ + # @author Florian Spreckelsen + # @date 2022-03-23 + fpath = os.path.join(os.path.dirname(os.path.abspath(__file__)), fname) + model = parse_model_from_json_schema(fpath) + model.sync_data_model(noquestion=True) + + +def test_json_parsed_datamodel(): + # @author Florian Spreckelsen + # @date 2022-03-23 + + _load_and_sync("test_datamodel.schema.json") + + # RecordType with atomic properties + rt1 = db.execute_query( + "FIND RECORDTYPE TestTypeWithAtomicProps", unique=True) + assert rt1.description == "RecordType with scalar atomic properties" + assert rt1.get_property("simple_text_prop") is not None + assert rt1.get_property("simple_text_prop").datatype == db.TEXT + assert rt1.get_importance("simple_text_prop") == db.OBLIGATORY + + assert rt1.get_property("IntegerProperty") is not None + assert rt1.get_property("IntegerProperty").datatype == db.INTEGER + assert rt1.get_importance("IntegerProperty") == db.RECOMMENDED + + assert rt1.get_property("double_prop") is not None + assert rt1.get_property("double_prop").datatype == db.DOUBLE + assert rt1.get_importance("double_prop") == db.OBLIGATORY + assert (db.Property(name="double_prop").retrieve().description == + "Some generic double-valued property") + + further_props = [ + ("bool_prop", db.BOOLEAN), + ("datetime_prop", db.DATETIME), + ("date_prop", db.DATETIME) + ] + for name, dtype in further_props: + assert rt1.get_property(name) is not None + assert rt1.get_property(name).datatype == dtype + assert rt1.get_importance(name) == db.RECOMMENDED + + # RecordType with references and enums + rt2 = 
db.execute_query( + "FIND RECORDTYPE TestTypeWithReferencesAndEnum", unique=True) + assert rt2.get_property(rt1.name) is not None + assert rt2.get_property(rt1.name).is_reference() + assert rt2.get_property(rt1.name).name == rt1.name + assert rt2.get_property(rt1.name).id == rt1.id + + other_ref_type = db.execute_query( + "FIND RECORDTYPE OtherReference", unique=True) + assert rt2.get_property(other_ref_type.name) is not None + assert rt2.get_property(other_ref_type.name).is_reference() + assert rt2.get_property(other_ref_type.name).name == other_ref_type.name + assert rt2.get_property(other_ref_type.name).id == other_ref_type.id + assert other_ref_type.description == "Some generic refernced RecordType" + assert len(other_ref_type.properties) == 0 + + named_ref_type = db.execute_query( + "FIND RECORDTYPE NamedReference", unique=True) + assert rt2.get_property(named_ref_type.name) is not None + assert rt2.get_property(named_ref_type.name).is_reference() + assert rt2.get_property(named_ref_type.name).name == named_ref_type.name + assert rt2.get_property(named_ref_type.name).id == named_ref_type.id + assert named_ref_type.get_property("simple_text_prop") is not None + assert (named_ref_type.get_property("simple_text_prop").id == + rt1.get_property("simple_text_prop").id) + assert (named_ref_type.get_property("simple_text_prop").datatype == + rt1.get_property("simple_text_prop").datatype) + + enums = { + "string_enum": ["StringEnumA", "StringEnumB", "StringEnumC"], + "NamedEnum": ["NameA", "NameB", "NameC"] + } + for enum_type_name, enum_names in enums.items(): + enum_type = db.execute_query( + f"FIND RECORDTYPE {enum_type_name}", unique=True) + assert len(enum_type.properties) == 0 + enum_records = db.execute_query(f"FIND RECORD {enum_type_name}") + assert len(enum_records) == len(enum_names) + for rec in enum_records: + assert rec.name in enum_names + assert rt2.get_property(enum_type_name) is not None + assert rt2.get_property(enum_type_name).is_reference() + assert 
rt2.get_property(enum_type_name).name == enum_type.name + assert rt2.get_property(enum_type_name).id == enum_type.id + + # Recordtype with lists + rt3 = db.execute_query("FIND RECORDTYPE TestTypeWithLists", unique=True) + assert rt3.get_property("string_list") is not None + assert rt3.get_property("string_list").datatype == db.LIST(db.TEXT) + string_list_prop = db.Property(name="string_list").retrieve() + assert string_list_prop.description == "A list of words" + assert string_list_prop.datatype == db.LIST(db.TEXT) + assert string_list_prop.id == rt3.get_property("string_list").id + + assert rt3.get_property("NamedIntList") is not None + assert rt3.get_property("NamedIntList").datatype == db.LIST(db.INTEGER) + + # This is a list of a plain references to a specific type + list_rt = db.execute_query("FIND RECORDTYPE ListRecordType", unique=True) + assert len(list_rt.properties) == 0 + assert rt3.get_property(list_rt.name) is not None + assert rt3.get_property(list_rt.name).is_reference() + assert rt3.get_property(list_rt.name).datatype == db.LIST(list_rt) + assert rt3.get_property(list_rt.name).id == list_rt.id + + # This is a list property of its own, referencing another separate RT + referenced_list_rt = db.execute_query( + "FIND RECORDTYPE ReferencedListTypeWithName", unique=True) + assert referenced_list_rt.description == "Referenced by a named list-of-references property" + assert referenced_list_rt.get_property("double_prop") is not None + assert (referenced_list_rt.get_property("double_prop").id == + rt1.get_property("double_prop").id) + assert rt3.get_property("NamedReferenceList") is not None + assert rt3.get_property("NamedReferenceList").is_reference() + assert rt3.get_property( + "NamedReferenceList").datatype == db.LIST(referenced_list_rt) + assert rt3.get_property("NamedReferenceList").id != referenced_list_rt.id + + enum_type = db.execute_query("FIND RECORDTYPE ListNumberEnum", unique=True) + assert len(enum_type.properties) == 0 + enum_names = ["1.1", 
"2.2", "3.3"] + enum_records = db.execute_query("FIND RECORD ListNumberEnum") + assert len(enum_records) == len(enum_names) + for rec in enum_records: + assert rec.name in enum_names + assert rt3.get_property(enum_type.name) is not None + assert rt3.get_property(enum_type.name).datatype == db.LIST(enum_type) + assert rt3.get_property(enum_type.name).id == enum_type.id diff --git a/pylintrc b/pylintrc new file mode 100644 index 0000000000000000000000000000000000000000..8a12125d4b71d3df5f7866277c41ee15401a4a93 --- /dev/null +++ b/pylintrc @@ -0,0 +1,9 @@ +# -*- mode:conf; -*- + +[FORMAT] +# Good variable names which should always be accepted, separated by a comma +good-names=ii,rt + + +[TYPECHECK] +ignored-modules=etree diff --git a/setup.py b/setup.py index 411a5c3dcd6ba362e7e7c8e6015e103acdf5bd31..98599d9a5ead13520726546c23cbe59c57242fc0 100755 --- a/setup.py +++ b/setup.py @@ -155,6 +155,7 @@ def setup_package(): author='Henrik tom Wörden', author_email='h.tomwoerden@indiscale.com', install_requires=["caosdb>=0.7.0", + "jsonschema>=4.4.0", "numpy>=1.17.3", "openpyxl>=3.0.0", "pandas>=1.2.0", diff --git a/src/caosadvancedtools/models/parser.py b/src/caosadvancedtools/models/parser.py index 5770bb483df6bdb00743dd63cc01fe598e26514e..40a61c6c9dbf3273c0287827cc68974d7be716cf 100644 --- a/src/caosadvancedtools/models/parser.py +++ b/src/caosadvancedtools/models/parser.py @@ -1,3 +1,22 @@ +# This file is a part of the CaosDB Project. +# +# Copyright (C) 2022 IndiScale GmbH <info@indiscale.com> +# Copyright (C) 2022 Florian Spreckelsen <f.spreckelsen@indiscale.com> +# Copyright (C) 2022 Daniel Hornung <d.hornung@indiscale.com> +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. 
+# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <https://www.gnu.org/licenses/>. + """ This module (and script) provides methods to read a DataModel from a YAML file. @@ -16,11 +35,15 @@ not defined, simply the name can be supplied with no value. Parents can be provided under the 'inherit_from_xxxx' keywords. The value needs to be a list with the names. Here, NO NEW entities can be defined. """ +import json import re import sys +import yaml + +from typing import List +import jsonschema import caosdb as db -import yaml from .data_model import CAOSDB_INTERNAL_PROPERTIES, DataModel @@ -47,6 +70,13 @@ KEYWORDS_IGNORED = [ "unit", ] +JSON_SCHEMA_ATOMIC_TYPES = [ + "string", + "boolean", + "integer", + "number" +] + def _get_listdatatype(dtype): """matches a string to check whether the type definition is a list @@ -98,6 +128,14 @@ class YamlDefinitionError(RuntimeError): super().__init__(template.format(line)) +class JsonSchemaDefinitionError(RuntimeError): + # @author Florian Spreckelsen + # @date 2022-02-17 + # @review Daniel Hornung 2022-02-18 + def __init__(self, msg): + super().__init__(msg) + + def parse_model_from_yaml(filename): """Shortcut if the Parser object is not needed.""" parser = Parser() @@ -112,6 +150,34 @@ def parse_model_from_string(string): return parser.parse_model_from_string(string) +def parse_model_from_json_schema(filename: str): + """Return a datamodel parsed from a json schema definition. 
+ + Parameters + ---------- + filename : str + The path of the json schema file that is to be parsed + + Returns + ------- + out : DataModel + The datamodel generated from the input schema which then can be used for + synchronizing with CaosDB. + + Note + ---- + This is an experimental feature, see ``JsonSchemaParser`` for information + about the limitations of the current implementation. + + """ + # @author Florian Spreckelsen + # @date 2022-02-17 + # @review Daniel Hornung 2022-02-18 + parser = JsonSchemaParser() + + return parser.parse_model_from_json_schema(filename) + + class Parser(object): def __init__(self): """Initialize an empty parser object and initialize the dictionary of entities and the list of @@ -517,6 +583,202 @@ class Parser(object): self.model[key] = db.RecordType(name=key) + +class JsonSchemaParser(Parser): + """Extends the yaml parser to read in datamodels defined in a json schema. + + **EXPERIMENTAL:** While this class can already be used to create data models + from basic json schemas, there are the following limitations and missing + features: + + * Due to limitations of json-schema itself, we currently do not support + inheritance in the imported data models + * The same goes for suggested properties of RecordTypes + * Currently, ``$defs`` and ``$ref`` in the input schema are not resolved. + * Already defined RecordTypes and (scalar) Properties can't be re-used as + list properties + * Reference properties that are different from the referenced RT. (Although + this is possible for list of references) + * Values + * Roles + * The extern keyword from the yaml parser + * Currently, a json-schema cannot be transformed into a data model if its + root element isn't a RecordType (or Property) with ``title`` and ``type``. 
+ + """ + # @author Florian Spreckelsen + # @date 2022-02-17 + # @review Timm Fitschen 2022-02-30 + + def parse_model_from_json_schema(self, filename: str): + """Return a datamodel created from the definition in the json schema in + `filename`. + + Parameters + ---------- + filename : str + The path to the json-schema file containing the datamodel definition + + Returns + ------- + out : DataModel + The created DataModel + """ + # @author Florian Spreckelsen + # @date 2022-02-17 + # @review Timm Fitschen 2022-02-30 + with open(filename, 'r') as schema_file: + model_dict = json.load(schema_file) + + return self._create_model_from_dict(model_dict) + + def _create_model_from_dict(self, model_dict: [dict, List[dict]]): + """Parse a dictionary read in from the model definition in a json schema and + return the Datamodel created from it. + + Parameters + ---------- + model_dict : dict or list[dict] + One or several dictionaries read in from a json-schema file + + Returns + ------- + out : DataModel + The datamodel defined in `model_dict` + """ + # @review Timm Fitschen 2022-02-30 + if isinstance(model_dict, dict): + model_dict = [model_dict] + + for ii, elt in enumerate(model_dict): + if "title" not in elt: + raise JsonSchemaDefinitionError( + f"Object {ii+1} is lacking the `title` key word") + if "type" not in elt: + raise JsonSchemaDefinitionError( + f"Object {ii+1} is lacking the `type` key word") + # Check if this is a valid Json Schema + try: + jsonschema.Draft202012Validator.check_schema(elt) + except jsonschema.SchemaError as err: + raise JsonSchemaDefinitionError( + f"Json Schema error in {elt['title']}:\n{str(err)}") from err + name = self._stringify(elt["title"], context=elt) + self._treat_element(elt, name) + + return DataModel(self.model.values()) + + def _get_atomic_datatype(self, elt): + # @review Timm Fitschen 2022-02-30 + if elt["type"] == "string": + if "format" in elt and elt["format"] in ["date", "date-time"]: + return db.DATETIME + else: + return 
db.TEXT + elif elt["type"] == "integer": + return db.INTEGER + elif elt["type"] == "number": + return db.DOUBLE + elif elt["type"] == "boolean": + return db.BOOLEAN + else: + raise JsonSchemaDefinitionError(f"Unknown atomic type in {elt}.") + + def _treat_element(self, elt: dict, name: str): + # @review Timm Fitschen 2022-02-30 + force_list = False + if name in self.model: + return self.model[name], force_list + if "type" not in elt: + # Each element must have a specific type + raise JsonSchemaDefinitionError( + f"`type` is missing in element {name}.") + if "enum" in elt: + ent = self._treat_enum(elt, name) + elif elt["type"] in JSON_SCHEMA_ATOMIC_TYPES: + ent = db.Property( + name=name, datatype=self._get_atomic_datatype(elt)) + elif elt["type"] == "object": + ent = self._treat_record_type(elt, name) + elif elt["type"] == "array": + ent, force_list = self._treat_list(elt, name) + else: + raise NotImplementedError( + f"Cannot parse items of type '{elt['type']}' (yet).") + if "description" in elt and ent.description is None: + # There is a description and it hasn't been set by another + # treat_something function + ent.description = elt["description"] + + self.model[name] = ent + return ent, force_list + + def _treat_record_type(self, elt: dict, name: str): + # @review Timm Fitschen 2022-02-30 + rt = db.RecordType(name=name) + if "required" in elt: + required = elt["required"] + else: + required = [] + if "properties" in elt: + for key, prop in elt["properties"].items(): + if "title" in prop: + name = self._stringify(prop["title"]) + else: + name = self._stringify(key) + prop_ent, force_list = self._treat_element(prop, name) + importance = db.OBLIGATORY if key in required else db.RECOMMENDED + if not force_list: + rt.add_property(prop_ent, importance=importance) + else: + # Special case of rt used as a list property + rt.add_property(prop_ent, importance=importance, + datatype=db.LIST(prop_ent)) + + if "description" in elt: + rt.description = elt["description"] + 
return rt + + def _treat_enum(self, elt: dict, name: str): + # @review Timm Fitschen 2022-02-30 + if "type" in elt and elt["type"] == "integer": + raise NotImplementedError( + "Integer-enums are not allowed until " + "https://gitlab.indiscale.com/caosdb/src/caosdb-server/-/issues/224 " + "has been fixed." + ) + rt = db.RecordType(name=name) + for enum_elt in elt["enum"]: + rec = db.Record(name=self._stringify(enum_elt)) + rec.add_parent(rt) + self.model[enum_elt] = rec + + return rt + + def _treat_list(self, elt: dict, name: str): + # @review Timm Fitschen 2022-02-30 + + if not "items" in elt: + raise JsonSchemaDefinitionError( + f"The definition of the list items is missing in {elt}.") + items = elt["items"] + if "enum" in items: + return self._treat_enum(items, name), True + if items["type"] in JSON_SCHEMA_ATOMIC_TYPES: + datatype = db.LIST(self._get_atomic_datatype(items)) + return db.Property(name=name, datatype=datatype), False + if items["type"] == "object": + if not "title" in items or self._stringify(items["title"]) == name: + # Property is RecordType + return self._treat_record_type(items, name), True + else: + # List property will be an entity of its own with a name + # different from the referenced RT + ref_rt = self._treat_record_type( + items, self._stringify(items["title"])) + self.model[ref_rt.name] = ref_rt + return db.Property(name=name, datatype=db.LIST(ref_rt)), False + + if __name__ == "__main__": model = parse_model_from_yaml('data_model.yml') print(model) diff --git a/unittests/json-schema-models/datamodel_atomic_properties.schema.json b/unittests/json-schema-models/datamodel_atomic_properties.schema.json new file mode 100644 index 0000000000000000000000000000000000000000..3828f131180a839d5c9b8bc5aa1a1285717da723 --- /dev/null +++ b/unittests/json-schema-models/datamodel_atomic_properties.schema.json @@ -0,0 +1,24 @@ +[ + { + "title": "Dataset1", + "description": "Some description", + "type": "object", + "properties": { + "title": { "type": 
"string", "description": "full dataset title" }, + "campaign": { "type": "string", "description": "FIXME" }, + "number_prop": { "type": "number", "description": "Some float property" } + }, + "required": [ "title", "number_prop" ] + }, + { + "title": "Dataset2", + "type": "object", + "properties": { + "date_time": { "type": "string", "format": "date-time" }, + "date": { "type": "string", "format": "date" }, + "integer": { "type": "integer", "description": "Some integer property" }, + "boolean": { "type": "boolean" }, + "number_prop": { "type": "number", "description": "Some float property" } + } + } +] diff --git a/unittests/json-schema-models/datamodel_enum_prop.schema.json b/unittests/json-schema-models/datamodel_enum_prop.schema.json new file mode 100644 index 0000000000000000000000000000000000000000..a14008d141606368519c0caadc30b16a1dc9d16d --- /dev/null +++ b/unittests/json-schema-models/datamodel_enum_prop.schema.json @@ -0,0 +1,16 @@ +{ + "title": "Dataset", + "description": "Some description", + "type": "object", + "properties": { + "license": { + "type": "string", + "enum": ["CC-BY", "CC-BY-SA", "CC0", "restricted access"] + }, + "number_enum": { + "type": "number", + "enum": [1.1, 2.2, 3.3] + } + }, + "required": ["license"] +} diff --git a/unittests/json-schema-models/datamodel_int_enum_broken.schema.json b/unittests/json-schema-models/datamodel_int_enum_broken.schema.json new file mode 100644 index 0000000000000000000000000000000000000000..159b84ac36c26325b59cdd25d2830152c4acdaaa --- /dev/null +++ b/unittests/json-schema-models/datamodel_int_enum_broken.schema.json @@ -0,0 +1,11 @@ +{ + "title": "Dataset", + "description": "Some description", + "type": "object", + "properties": { + "int_enum": { + "type": "integer", + "enum": [1, 2, 3] + } + } +} diff --git a/unittests/json-schema-models/datamodel_list_properties.schema.json b/unittests/json-schema-models/datamodel_list_properties.schema.json new file mode 100644 index 
0000000000000000000000000000000000000000..b95f468a1c13f1912266e65f029654077ce6a14e --- /dev/null +++ b/unittests/json-schema-models/datamodel_list_properties.schema.json @@ -0,0 +1,46 @@ +{ + "title": "Dataset", + "description": "Dataset with list (array) properties", + "type": "object", + "properties": { + "keywords": { + "type": "array", + "items": { "type": "string" } + }, + "booleans": { + "type": "array", + "items": { "type": "boolean" } + }, + "integers": { + "type": "array", + "items": { "type": "integer" } + }, + "floats": { + "type": "array", + "items": { "type": "number" } + }, + "datetimes": { + "type": "array", + "items": { "type": "string", "format": "date-time" } + }, + "dates": { + "type": "array", + "items": { "type": "string", "format": "date" } + }, + "reference": { + "type": "array", + "items": { "type": "object", "properties": {} } + }, + "reference_with_name": { + "type": "array", + "items": { "type": "object", "title": "event", "properties": {} } + }, + "license": { + "type": "array", + "items": { + "type": "string", + "enum": ["CC-BY", "CC-BY-SA", "CC0", "restricted access"] + } + } + } +} diff --git a/unittests/json-schema-models/datamodel_missing_property_type.schema.json b/unittests/json-schema-models/datamodel_missing_property_type.schema.json new file mode 100644 index 0000000000000000000000000000000000000000..eac3cc563df587568c4e9610d72618610566beef --- /dev/null +++ b/unittests/json-schema-models/datamodel_missing_property_type.schema.json @@ -0,0 +1,7 @@ +{ + "title": "Dataset", + "type": "object", + "properties": { + "method": { "description": "Missing property type" } + } +} diff --git a/unittests/json-schema-models/datamodel_references.schema.json b/unittests/json-schema-models/datamodel_references.schema.json new file mode 100644 index 0000000000000000000000000000000000000000..6b79a9bcdbbd8beaf9974a600e9c5ff30cb513f4 --- /dev/null +++ b/unittests/json-schema-models/datamodel_references.schema.json @@ -0,0 +1,24 @@ +{ + "title": 
"Dataset", + "description": "", + "type": "object", + "properties": { + "event": { + "type": "object", + "properties": { + "longitude": { + "type": "number" + }, + "latitude": { + "type": "number" + }, + "location": { + "type": "string", + "description": "geographical location (e.g., North Sea; Espoo, Finland)" + } + }, + "required": ["longitude", "latitude"] + } + }, + "required": ["event"] +} diff --git a/unittests/json-schema-models/datamodel_required_no_list.schema.json b/unittests/json-schema-models/datamodel_required_no_list.schema.json new file mode 100644 index 0000000000000000000000000000000000000000..f3697a71320bc8baf05156bec2c71f3915378654 --- /dev/null +++ b/unittests/json-schema-models/datamodel_required_no_list.schema.json @@ -0,0 +1,7 @@ +{ + "title": "Dataset", + "description": "", + "type": "object", + + "required": "Dataset" +} diff --git a/unittests/json-schema-models/datamodel_string_properties.schema.json b/unittests/json-schema-models/datamodel_string_properties.schema.json new file mode 100644 index 0000000000000000000000000000000000000000..62bc0a2a4250050e5433038bf61e7c9692bb0200 --- /dev/null +++ b/unittests/json-schema-models/datamodel_string_properties.schema.json @@ -0,0 +1,14 @@ +{ + "title": "Dataset", + "description": "", + "type": "object", + + "properties": { + "title": { "type": "string", "description": "full dataset title" }, + "campaign": { "type": "string", "description": "FIXME" }, + "method": { "type": "string", "description": "FIXME" }, + "titled": { "title": "The title", "type": "string", "description": "None" } + }, + + "required": ["title"] +} diff --git a/unittests/test_json_schema_model_parser.py b/unittests/test_json_schema_model_parser.py new file mode 100644 index 0000000000000000000000000000000000000000..4b44f6efa1cda19c04ee13a6a50b04cefbff9177 --- /dev/null +++ b/unittests/test_json_schema_model_parser.py @@ -0,0 +1,342 @@ +# +# This file is a part of the CaosDB Project. 
+# +# Copyright (C) 2022 IndiScale GmbH <info@indiscale.com> +# Copyright (C) 2022 Florian Spreckelsen <f.spreckelsen@indiscale.com> +# Copyright (C) 2022 Daniel Hornung <d.hornung@indiscale.com> +# +# This program is free software: you can redistribute it and/or modify it under +# the terms of the GNU Affero General Public License as published by the Free +# Software Foundation, either version 3 of the License, or (at your option) any +# later version. +# +# This program is distributed in the hope that it will be useful, but WITHOUT +# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +# FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more +# details. +# +# You should have received a copy of the GNU Affero General Public License along +# with this program. If not, see <https://www.gnu.org/licenses/>. +# + +# @review Daniel Hornung 2022-02-18 + +import os +import pytest + +import caosdb as db +from caosadvancedtools.models.parser import (parse_model_from_json_schema, + JsonSchemaDefinitionError) + +FILEPATH = os.path.join(os.path.dirname( + os.path.abspath(__file__)), 'json-schema-models') + + +def test_rt_with_string_properties(): + """Test datamodel parsing of datamodel_string_properties.schema.json""" + # @author Florian Spreckelsen + # @date 2022-02-17 + + model = parse_model_from_json_schema( + os.path.join(FILEPATH, + "datamodel_string_properties.schema.json")) + assert "Dataset" in model + dataset_rt = model["Dataset"] + assert isinstance(dataset_rt, db.RecordType) + assert dataset_rt.name == "Dataset" + assert dataset_rt.description == "" + assert len(dataset_rt.get_properties()) == 4 + + assert dataset_rt.get_property("title") is not None + assert dataset_rt.get_property("campaign") is not None + assert dataset_rt.get_property("method") is not None + + assert dataset_rt.get_property("The title") is not None + assert dataset_rt.get_property("titled") is None + + title_prop = 
dataset_rt.get_property("title")
+    assert title_prop.datatype == db.TEXT
+    assert dataset_rt.get_importance(title_prop.name) == db.OBLIGATORY
+
+    campaign_prop = dataset_rt.get_property("campaign")
+    assert campaign_prop.datatype == db.TEXT
+    assert dataset_rt.get_importance(campaign_prop.name) == db.RECOMMENDED
+
+    method_prop = dataset_rt.get_property("method")
+    assert method_prop.datatype == db.TEXT
+    assert dataset_rt.get_importance(method_prop.name) == db.RECOMMENDED
+
+
+def test_datamodel_with_atomic_properties():
+    """Test read-in of two separate record types with atomic-typed properties."""
+    # @author Florian Spreckelsen
+    # @date 2022-02-18
+
+    model = parse_model_from_json_schema(os.path.join(
+        FILEPATH, "datamodel_atomic_properties.schema.json"))
+    assert "Dataset1" in model
+    assert "Dataset2" in model
+
+    rt1 = model["Dataset1"]
+    assert isinstance(rt1, db.RecordType)
+    assert rt1.name == "Dataset1"
+    assert rt1.description == "Some description"
+    assert len(rt1.get_properties()) == 3
+
+    assert rt1.get_property("title") is not None
+    assert rt1.get_property("campaign") is not None
+    assert rt1.get_property("number_prop") is not None
+
+    title_prop = rt1.get_property("title")
+    assert title_prop.datatype == db.TEXT
+    assert rt1.get_importance(title_prop.name) == db.OBLIGATORY
+
+    campaign_prop = rt1.get_property("campaign")
+    assert campaign_prop.datatype == db.TEXT
+    assert rt1.get_importance(campaign_prop.name) == db.RECOMMENDED
+
+    float_prop = rt1.get_property("number_prop")
+    assert float_prop.datatype == db.DOUBLE
+    assert rt1.get_importance(float_prop.name) == db.OBLIGATORY
+
+    rt2 = model["Dataset2"]
+    assert isinstance(rt2, db.RecordType)
+    assert rt2.name == "Dataset2"
+    assert not rt2.description
+    assert len(rt2.get_properties()) == 5
+
+    date_prop = rt2.get_property("date")
+    assert date_prop.datatype == db.DATETIME
+
+    datetime_prop = rt2.get_property("date_time")
+    assert datetime_prop.datatype == db.DATETIME
+
+    int_prop = 
rt2.get_property("integer") + assert int_prop.datatype == db.INTEGER + assert int_prop.description == "Some integer property" + + bool_prop = rt2.get_property("boolean") + assert bool_prop.datatype == db.BOOLEAN + + float_prop2 = rt2.get_property("number_prop") + assert float_prop.datatype == float_prop2.datatype + + +def test_required_no_list(): + """Exception must be raised when "required" is not a list.""" + # @author Daniel Hornung + # @date 2022-02-18 + + with pytest.raises(JsonSchemaDefinitionError) as err: + parse_model_from_json_schema( + os.path.join(FILEPATH, + "datamodel_required_no_list.schema.json")) + assert "'Dataset' is not of type 'array'" in str(err.value) + + +def test_missing_property_type(): + """Exception must be raised when "type" is missing.""" + with pytest.raises(JsonSchemaDefinitionError) as err: + parse_model_from_json_schema( + os.path.join(FILEPATH, + "datamodel_missing_property_type.schema.json")) + assert "`type` is missing" in str(err.value) + + +def test_enum(): + """Enums are represented in references to records of a specific type.""" + # @author Florian Spreckelsen + # @date 2022-03-16 + + model = parse_model_from_json_schema(os.path.join( + FILEPATH, "datamodel_enum_prop.schema.json")) + licenses = ["CC-BY", "CC-BY-SA", "CC0", "restricted access"] + for name in ["Dataset", "license"] + licenses: + assert name in model + + assert isinstance(model["Dataset"], db.RecordType) + assert model["Dataset"].get_property("license") is not None + assert model["Dataset"].get_property("license").is_reference() + assert model["Dataset"].get_property("license").datatype.name == "license" + assert isinstance(model["license"], db.RecordType) + + for name in licenses: + assert isinstance(model[name], db.Record) + assert model[name].name == name + assert len(model[name].parents) == 1 + assert model[name].has_parent(model["license"]) + + # Also allow enums with non-string types + number_enums = ["1.1", "2.2", "3.3"] + for name in ["number_enum"] + 
number_enums:
+        assert name in model
+
+    assert isinstance(model["number_enum"], db.RecordType)
+    assert model["Dataset"].get_property("number_enum") is not None
+    assert model["Dataset"].get_property("number_enum").is_reference()
+    assert model["Dataset"].get_property(
+        "number_enum").datatype.name == "number_enum"
+
+    for name in number_enums:
+        assert isinstance(model[name], db.Record)
+        assert model[name].name == name
+        assert len(model[name].parents) == 1
+        assert model[name].has_parent(model["number_enum"])
+
+
+@pytest.mark.xfail(reason="Don't allow integer enums until https://gitlab.indiscale.com/caosdb/src/caosdb-server/-/issues/224 has been fixed")
+def test_int_enum():
+    """Check an enum property with type: integer"""
+    # @author Florian Spreckelsen
+    # @date 2022-03-22
+
+    model = parse_model_from_json_schema(os.path.join(
+        FILEPATH, "datamodel_int_enum_broken.schema.json"))
+    int_enums = ["1", "2", "3"]
+    for name in ["Dataset", "int_enum"] + int_enums:
+        assert name in model
+
+    assert isinstance(model["Dataset"], db.RecordType)
+    assert model["Dataset"].get_property("int_enum") is not None
+    assert model["Dataset"].get_property("int_enum").is_reference()
+    assert model["Dataset"].get_property(
+        "int_enum").datatype.name == "int_enum"
+    assert isinstance(model["int_enum"], db.RecordType)
+
+    for name in int_enums:
+        assert isinstance(model[name], db.Record)
+        assert model[name].name == name
+        assert len(model[name].parents) == 1
+        assert model[name].has_parent(model["int_enum"])
+
+
+def test_references():
+    """Test reference properties"""
+    # @author Florian Spreckelsen
+    # @date 2022-03-17
+
+    model = parse_model_from_json_schema(os.path.join(
+        FILEPATH, "datamodel_references.schema.json"))
+    for name in ["Dataset", "event", "longitude", "latitude", "location"]:
+        assert name in model
+
+    assert isinstance(model["Dataset"], db.RecordType)
+    assert model["Dataset"].get_property("event") is not None
+    assert model["Dataset"].get_importance("event") 
== db.OBLIGATORY + assert model["Dataset"].get_property("event").is_reference() + assert model["Dataset"].get_property("event").datatype.name == "event" + + assert isinstance(model["event"], db.RecordType) + assert model["event"].get_property("longitude") is not None + assert model["event"].get_importance("longitude") == db.OBLIGATORY + assert model["event"].get_property("longitude").datatype == db.DOUBLE + + assert model["event"].get_property("latitude") is not None + assert model["event"].get_importance("latitude") == db.OBLIGATORY + assert model["event"].get_property("latitude").datatype == db.DOUBLE + + assert model["event"].get_property("location") is not None + assert model["event"].get_importance("location") == db.RECOMMENDED + assert model["event"].get_property("location").datatype == db.TEXT + + assert isinstance(model["longitude"], db.Property) + assert model["longitude"].datatype == db.DOUBLE + + assert isinstance(model["latitude"], db.Property) + assert model["latitude"].datatype == db.DOUBLE + + assert isinstance(model["location"], db.Property) + assert model["location"].datatype == db.TEXT + assert model["location"].description == "geographical location (e.g., North Sea; Espoo, Finland)" + + +def test_list(): + """Test list properties with all possible datatypes.""" + # @author Florian Spreckelsen + # @date 2022-03-17 + + model = parse_model_from_json_schema(os.path.join( + FILEPATH, "datamodel_list_properties.schema.json")) + licenses = ["CC-BY", "CC-BY-SA", "CC0", "restricted access"] + names = ["Dataset", "keywords", "booleans", "integers", "floats", + "datetimes", "dates", "reference", "reference_with_name", "event", + "license"] + for name in names + licenses: + assert name in model + + dataset_rt = model["Dataset"] + assert dataset_rt.get_property("keywords") is not None + assert dataset_rt.get_property("keywords").datatype == db.LIST(db.TEXT) + assert isinstance(model["keywords"], db.Property) + assert model["keywords"].name == "keywords" + 
assert model["keywords"].datatype == db.LIST(db.TEXT) + + assert dataset_rt.get_property("booleans") is not None + assert dataset_rt.get_property("booleans").datatype == db.LIST(db.BOOLEAN) + assert isinstance(model["booleans"], db.Property) + assert model["booleans"].name == "booleans" + assert model["booleans"].datatype == db.LIST(db.BOOLEAN) + + assert dataset_rt.get_property("integers") is not None + assert dataset_rt.get_property("integers").datatype == db.LIST(db.INTEGER) + assert isinstance(model["integers"], db.Property) + assert model["integers"].name == "integers" + assert model["integers"].datatype == db.LIST(db.INTEGER) + + assert dataset_rt.get_property("floats") is not None + assert dataset_rt.get_property("floats").datatype == db.LIST(db.DOUBLE) + assert isinstance(model["floats"], db.Property) + assert model["floats"].name == "floats" + assert model["floats"].datatype == db.LIST(db.DOUBLE) + + assert dataset_rt.get_property("datetimes") is not None + assert dataset_rt.get_property( + "datetimes").datatype == db.LIST(db.DATETIME) + assert isinstance(model["datetimes"], db.Property) + assert model["datetimes"].name == "datetimes" + assert model["datetimes"].datatype == db.LIST(db.DATETIME) + + assert dataset_rt.get_property("dates") is not None + assert dataset_rt.get_property( + "dates").datatype == db.LIST(db.DATETIME) + assert isinstance(model["dates"], db.Property) + assert model["dates"].name == "dates" + assert model["dates"].datatype == db.LIST(db.DATETIME) + + # Simple reference list property + assert dataset_rt.get_property("reference") is not None + assert dataset_rt.get_property("reference").is_reference() + assert dataset_rt.get_property( + "reference").datatype == db.LIST("reference") + assert isinstance(model["reference"], db.RecordType) + assert model["reference"].name == "reference" + assert dataset_rt.get_property( + "reference").datatype == db.LIST(model["reference"]) + + # Reference list with name + assert 
dataset_rt.get_property("reference_with_name") is not None + assert dataset_rt.get_property("reference_with_name").is_reference() + assert dataset_rt.get_property( + "reference_with_name").datatype == db.LIST("event") + assert isinstance(model["event"], db.RecordType) + assert model["event"].name == "event" + assert dataset_rt.get_property( + "reference_with_name").datatype == db.LIST(model["event"]) + assert isinstance(model["reference_with_name"], db.Property) + assert model["reference_with_name"].name == "reference_with_name" + assert model["reference_with_name"].datatype == db.LIST(model["event"]) + + # References to enum types + assert dataset_rt.get_property("license") is not None + assert dataset_rt.get_property("license").is_reference() + assert dataset_rt.get_property("license").datatype == db.LIST("license") + assert isinstance(model["license"], db.RecordType) + assert model["license"].name == "license" + assert dataset_rt.get_property( + "license").datatype == db.LIST(model["license"]) + + for name in licenses: + assert isinstance(model[name], db.Record) + assert model[name].name == name + assert len(model[name].parents) == 1 + assert model[name].has_parent(model["license"]) diff --git a/unittests/test_parser.py b/unittests/test_yaml_model_parser.py similarity index 100% rename from unittests/test_parser.py rename to unittests/test_yaml_model_parser.py