Skip to content
Snippets Groups Projects
Commit 59c84e36 authored by Alexander Schlemmer's avatar Alexander Schlemmer
Browse files

Merge branch 'f-json-validator' into 'dev'

Validator that checks created records using a json schema

See merge request !201
parents 02487638 4c70a0a6
No related branches found
No related tags found
2 merge requests: !217 "TST: Make NamedTemporaryFiles Windows-compatible", !201 "Validator that checks created records using a json schema"
Pipeline #58466 passed
......@@ -9,6 +9,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
### Added ###
- Validation module for checking a list of generated records against a list of json schemas
that can be generated from a yaml data model file.
- DictElementConverters can now make use of `match_properties`, which
works analogously to `match_properties` in ROCrateEntityConverter and
`match_attrib` in XMLConverter.
......
#!/usr/bin/env python3
# encoding: utf-8
#
# This file is a part of the LinkAhead Project.
#
# Copyright (C) 2024 Indiscale GmbH <info@indiscale.com>
# Copyright (C) 2024 Alexander Schlemmer
#
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
#
# ** end header
#
"""
This module contains functions to validate the output of a scanner run with a
json schema.
"""
import jsonschema
import linkahead as db
# from caosadvancedtools.models.parser import parse_model_from_string
from caosadvancedtools.json_schema_exporter import recordtype_to_json_schema
from caosadvancedtools.models.parser import parse_model_from_yaml
from jsonschema import ValidationError
from linkahead.high_level_api import convert_to_python_object
def load_json_schema_from_datamodel_yaml(filename: str) -> dict[str, dict]:
    """
    Build one json schema per record type defined in a yaml data model.

    The data model file is parsed with ``parse_model_from_yaml`` (caosadvancedtools).
    Every entry of the parsed model that is a ``db.RecordType`` is converted with
    ``recordtype_to_json_schema``; entries of any other kind are skipped.

    Arguments
    ---------
    filename: str
        The filename of the yaml file to load.

    Returns
    -------
    A dict of json schema objects. The keys are the data model keys of the record
    types for which the schemas were generated.
    """
    data_model = parse_model_from_yaml(filename)
    return {
        name: recordtype_to_json_schema(entity)
        for name, entity in data_model.items()
        if isinstance(entity, db.RecordType)
    }
def representer_ordereddict(dumper, data):
    """
    Yaml representer that emits OrderedDict objects as plain dict objects.

    This is a helper to represent the converted json schema objects correctly as yaml:
    ``data`` is copied into a simple dict, which is then handed to
    ``dumper.represent_data``.

    Since Python 3.7 dicts are ordered by default, see e.g.:
    https://softwaremaniacs.org/blog/2020/02/05/dicts-ordered/en/

    Example how to use the representer:

    ```python
    yaml.add_representer(OrderedDict, caoscrawler.validator.representer_ordereddict)
    ```
    """
    plain_mapping = dict(data)
    return dumper.represent_data(plain_mapping)
def _apply_schema_patches(pobj: dict):
"""
Changes applied:
- properties are moved vom subitem "proeprties" to top-level.
- The following keys are deleted: parents, role, name, description, metadata, properties
"""
if "properties" not in pobj:
# this is probably a file
return pobj
for prop in pobj["properties"]:
if isinstance(pobj["properties"][prop], dict):
pobj[prop] = _apply_schema_patches(pobj["properties"][prop])
else:
pobj[prop] = pobj["properties"][prop]
for keyd in ("parents", "role", "name",
"description", "metadata", "properties"):
if keyd in pobj:
del pobj[keyd]
return pobj
def convert_record(record: db.Record):
    """
    Bring a record into a form that can be validated with jsonschema.

    The record is first turned into a high level object using
    `high_level_api.convert_to_python_object` and serialized. The serialized
    dictionary is then passed to `_apply_schema_patches`, which works recursively
    and refactors it to match the current form of the jsonschema.

    Arguments:
    ----------
    record: db.Record
        The record that is supposed to be converted.

    Returns:
    --------
    The result of `_apply_schema_patches` for the serialized record.
    """
    high_level_object = convert_to_python_object(record)
    serialized = high_level_object.serialize()
    return _apply_schema_patches(serialized)
def validate(records: list[db.Record], schemas: dict[str, dict]) -> list[tuple]:
    """
    Check every record of a list against the json schema of its record type.

    ``schemas`` maps record type names to the json schemata associated with them.
    The schema for a record is looked up using the name of the first (and only)
    parent of the record. The current implementation assumes that each checked
    record has exactly one parent and raises an error if that is not the case.

    Arguments:
    ----------
    records: list[db.Record]
        List of records that will be validated.

    schemas: dict[str, dict]
        A dictionary of JSON schemas generated using `load_json_schema_from_datamodel_yaml`.

    Returns:
    --------
    A list of tuples, one element for each record:

    - Index 0: A boolean that determines whether the schema belonging to the record type of the
               record matched.
    - Index 1: A validation error if the schema did not match or None otherwise.

    Raises:
    -------
    NotImplementedError
        If a record does not have exactly one parent.
    RuntimeError
        If ``schemas`` has no entry for the parent name of a record.
    """
    results = []
    for record in records:
        parents = record.parents
        if len(parents) != 1:
            raise NotImplementedError(
                "Schema validation is only supported if records have exactly one parent.")

        parent_name = parents[0].name
        if parent_name not in schemas:
            raise RuntimeError(
                "No schema for record type {} in schema dictionary.".format(parent_name))

        try:
            jsonschema.validate(convert_record(record), schemas[parent_name])
        except ValidationError as error:
            # A schema mismatch is reported to the caller instead of being raised.
            results.append((False, error))
        else:
            results.append((True, None))
    return results
Dataset:
  obligatory_properties:
    keywords:
      datatype: TEXT
    dateModified:
      datatype: DATETIME
#!/usr/bin/env python3
# encoding: utf-8
#
# This file is a part of the LinkAhead Project.
#
# Copyright (C) 2024 Indiscale GmbH <info@indiscale.com>
# Copyright (C) 2024 Alexander Schlemmer <a.schlemmer@indiscale.com>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
#
"""
test validation
"""
from os.path import join
from pathlib import Path
import jsonschema
import linkahead as db
import pytest
from caoscrawler.validator import (convert_record,
load_json_schema_from_datamodel_yaml,
validate)
from jsonschema import ValidationError
# Directory that contains this test module; the tests build fixture paths relative to it.
UNITTESTDIR = Path(__file__).parent
def test_create_json_schema():
    """
    Test schema generation from the data model together with `convert_record`.

    A record carrying both obligatory properties must validate against the
    "Dataset" schema; a record with a misspelled property name must not.
    """
    schemas = load_json_schema_from_datamodel_yaml(
        join(UNITTESTDIR, "datamodels", "datamodel.yaml"))

    valid_record = db.Record()
    valid_record.add_parent(name="Dataset")
    valid_record.add_property(name="keywords", value="jakdlfjakdf")
    valid_record.add_property(name="dateModified", value="2024-11-16")
    valid_obj = convert_record(valid_record)

    assert "Dataset" in schemas
    jsonschema.validate(valid_obj, schemas["Dataset"])

    # Negative case: "keywordss" is a typo, so the obligatory "keywords" is missing.
    invalid_record = db.Record()
    invalid_record.add_parent(name="Dataset")
    invalid_record.add_property(name="keywordss", value="jakdlfjakdf")
    invalid_record.add_property(name="dateModified", value="2024-11-16")
    invalid_obj = convert_record(invalid_record)

    with pytest.raises(ValidationError, match=".*'keywords' is a required property.*"):
        jsonschema.validate(invalid_obj, schemas["Dataset"])
def test_validation():
    """
    Test for the main validation API function `validate`.

    Two records are validated in one call: the first one matches the "Dataset"
    schema, the second one lacks the obligatory "keywords" property.
    """
    schemas = load_json_schema_from_datamodel_yaml(
        join(UNITTESTDIR, "datamodels", "datamodel.yaml"))

    matching = db.Record()
    matching.add_parent(name="Dataset")
    matching.add_property(name="keywords", value="jakdlfjakdf")
    matching.add_property(name="dateModified", value="2024-11-16")

    # "keywordss" is a deliberate typo of the obligatory property name.
    mismatching = db.Record()
    mismatching.add_parent(name="Dataset")
    mismatching.add_property(name="keywordss", value="jakdlfjakdf")
    mismatching.add_property(name="dateModified", value="2024-11-16")

    results = validate([matching, mismatching], schemas)
    first, second = results[0], results[1]

    assert first[0] is True
    assert first[1] is None

    assert not second[0]
    assert isinstance(second[1], ValidationError)
    assert second[1].message == "'keywords' is a required property"
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment