diff --git a/CHANGELOG.md b/CHANGELOG.md index 7c50fb1b69907ffb23b05fa8829d01a34b6c9165..77ab1a2bbd479a18a883ec210236e48170513e28 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Added ### +- Validation module for checking a list of generated records against a list of json schemas + that can be generated from a yaml data model file. - DictElementConverters can now make use of `match_properties` which works analogous to `match_properties` in ROCrateEntityConverter and `match_attrib` in XMLConverter. diff --git a/src/caoscrawler/validator.py b/src/caoscrawler/validator.py new file mode 100644 index 0000000000000000000000000000000000000000..33e29b02db429e3382248bbd80d2d00cd7b07c6b --- /dev/null +++ b/src/caoscrawler/validator.py @@ -0,0 +1,163 @@ +#!/usr/bin/env python3 +# encoding: utf-8 +# +# This file is a part of the LinkAhead Project. +# +# Copyright (C) 2024 Indiscale GmbH <info@indiscale.com> +# Copyright (C) 2024 Alexander Schlemmer +# +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <https://www.gnu.org/licenses/>. +# +# ** end header +# + +""" +This module contains functions to validate the output of a scanner run with a +json schema. 
+""" + +import jsonschema +import linkahead as db +# from caosadvancedtools.models.parser import parse_model_from_string +from caosadvancedtools.json_schema_exporter import recordtype_to_json_schema +from caosadvancedtools.models.parser import parse_model_from_yaml +from jsonschema import ValidationError +from linkahead.high_level_api import convert_to_python_object + + +def load_json_schema_from_datamodel_yaml(filename: str) -> dict[str, dict]: + """ + Load a data model yaml file (using caosadvancedtools) and convert + all record types into a json schema using the json_schema_exporter module. + + Arguments + --------- + filename: str + The filename of the yaml file to load. + + Returns + ------- + A dict of json schema objects. The keys are the record types for which the schemas + are generated. + """ + + model = parse_model_from_yaml(filename) + + rt_schemas = {} + for el_key, el in model.items(): + if isinstance(el, db.RecordType): + rt_schemas[el_key] = recordtype_to_json_schema(el) + + return rt_schemas + + +def representer_ordereddict(dumper, data): + """ + Helper function to be able to represent the converted json schema objects correctly as yaml. + This representer essentially replaced OrderedDict objects with simple dict objects. + + Since Python 3.7 dicts are ordered by default, see e.g.: + https://softwaremaniacs.org/blog/2020/02/05/dicts-ordered/en/ + + Example how to use the representer: + ```python + yaml.add_representer(OrderedDict, caoscrawler.validator.representer_ordereddict) + ``` + """ + return dumper.represent_data(dict(data)) + + +def _apply_schema_patches(pobj: dict): + """ + Changes applied: + - properties are moved vom subitem "proeprties" to top-level. 
+ - The following keys are deleted: parents, role, name, description, metadata, properties + """ + if "properties" not in pobj: + # this is probably a file + return pobj + for prop in pobj["properties"]: + if isinstance(pobj["properties"][prop], dict): + pobj[prop] = _apply_schema_patches(pobj["properties"][prop]) + else: + pobj[prop] = pobj["properties"][prop] + + for keyd in ("parents", "role", "name", + "description", "metadata", "properties"): + if keyd in pobj: + del pobj[keyd] + + return pobj + + +def convert_record(record: db.Record): + """ + Convert a record into a form suitable for validation with jsonschema. + + Uses `high_level_api.convert_to_python_object` + Afterwards `_apply_schema_patches` is called recursively to refactor the dictionary + to match the current form of the jsonschema. + + Arguments: + ---------- + record: db.Record + The record that is supposed to be converted. + """ + pobj = convert_to_python_object(record).serialize() + return _apply_schema_patches(pobj) + + +def validate(records: list[db.Record], schemas: dict[str, dict]) -> list[tuple]: + """ + Validate a list of records against a dictionary of schemas. + The keys of the dictionary are record types and the corresponding values are json schemata + associated with that record type. The current implementation assumes that each record that is + checked has exactly one parent and raises an error if that is not the case. + The schema belonging to a record is identified using the name of the first (and only) parent + of the record. + + Arguments: + ---------- + + records: list[db.Record] + List of records that will be validated. + + schemas: dict[str, dict] + A dictionary of JSON schemas generated using `load_json_schema_from_datamodel_yaml`. + + Returns: + -------- + A list of tuples, one element for each record: + + - Index 0: A boolean that determines whether the schema belonging to the record type of the + record matched. 
+ - Index 1: A validation error if the schema did not match or None otherwise. + """ + + retval = [] + for r in records: + if len(r.parents) != 1: + raise NotImplementedError( + "Schema validation is only supported if records have exactly one parent.") + parname = r.parents[0].name + if parname not in schemas: + raise RuntimeError( + "No schema for record type {} in schema dictionary.".format(parname)) + try: + jsonschema.validate(convert_record(r), schemas[parname]) + retval.append((True, None)) + except ValidationError as ex: + retval.append((False, ex)) + return retval diff --git a/unittests/datamodels/datamodel.yaml b/unittests/datamodels/datamodel.yaml new file mode 100644 index 0000000000000000000000000000000000000000..2759ecba7f2967062937d9b2f4805a9b501ab6c4 --- /dev/null +++ b/unittests/datamodels/datamodel.yaml @@ -0,0 +1,6 @@ +Dataset: + obligatory_properties: + keywords: + datatype: TEXT + dateModified: + datatype: DATETIME diff --git a/unittests/test_validation.py b/unittests/test_validation.py new file mode 100644 index 0000000000000000000000000000000000000000..a3215963f67b61241b321a0eb7345f9fe6fde1f2 --- /dev/null +++ b/unittests/test_validation.py @@ -0,0 +1,86 @@ +#!/usr/bin/env python3 +# encoding: utf-8 +# +# This file is a part of the LinkAhead Project. +# +# Copyright (C) 2024 Indiscale GmbH <info@indiscale.com> +# Copyright (C) 2024 Alexander Schlemmer <a.schlemmer@indiscale.com> +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. 
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
#

"""
test validation
"""
from os.path import join
from pathlib import Path

import jsonschema
import linkahead as db
import pytest
from caoscrawler.validator import (convert_record,
                                   load_json_schema_from_datamodel_yaml,
                                   validate)
from jsonschema import ValidationError

UNITTESTDIR = Path(__file__).parent


def test_create_json_schema():
    """Schema generation plus direct jsonschema validation of converted records."""
    schemas = load_json_schema_from_datamodel_yaml(
        join(UNITTESTDIR, "datamodels", "datamodel.yaml"))

    valid_record = db.Record()
    valid_record.add_parent(name="Dataset")
    valid_record.add_property(name="keywords", value="jakdlfjakdf")
    valid_record.add_property(name="dateModified", value="2024-11-16")

    converted = convert_record(valid_record)
    assert "Dataset" in schemas
    jsonschema.validate(converted, schemas["Dataset"])

    # Failing test: a misspelled property means "keywords" is missing.
    invalid_record = db.Record()
    invalid_record.add_parent(name="Dataset")
    invalid_record.add_property(name="keywordss", value="jakdlfjakdf")
    invalid_record.add_property(name="dateModified", value="2024-11-16")

    converted = convert_record(invalid_record)

    with pytest.raises(ValidationError, match=".*'keywords' is a required property.*"):
        jsonschema.validate(converted, schemas["Dataset"])


def test_validation():
    """
    Test for the main validation API function `validate`
    """
    schemas = load_json_schema_from_datamodel_yaml(
        join(UNITTESTDIR, "datamodels", "datamodel.yaml"))

    good_record = db.Record()
    good_record.add_parent(name="Dataset")
    good_record.add_property(name="keywords", value="jakdlfjakdf")
    good_record.add_property(name="dateModified", value="2024-11-16")

    bad_record = db.Record()
    bad_record.add_parent(name="Dataset")
    bad_record.add_property(name="keywordss", value="jakdlfjakdf")
    bad_record.add_property(name="dateModified", value="2024-11-16")

    results = validate([good_record, bad_record], schemas)

    ok, error = results[0]
    assert ok is True
    assert error is None

    ok, error = results[1]
    assert not ok
    assert isinstance(error, ValidationError)
    assert error.message == "'keywords' is a required property"