diff --git a/src/caoscrawler/validator.py b/src/caoscrawler/validator.py index 1663aa460ba817d63dce5aa2e5d814935f622be5..8e0efd94ef348e67b991708a3d5d7917e9dd32a9 100644 --- a/src/caoscrawler/validator.py +++ b/src/caoscrawler/validator.py @@ -41,7 +41,7 @@ from caoscrawler import scanner # from collections import OrderedDict -def load_json_schema_from_datamodel_yaml(filename: str) -> list: +def load_json_schema_from_datamodel_yaml(filename: str) -> dict[str, dict]: """ Load a data model yaml file (using caosadvancedtools) and convert all record types into a json schema using the json_schema_exporter module. @@ -53,43 +53,50 @@ def load_json_schema_from_datamodel_yaml(filename: str) -> list: Returns ------- - A list of json schema objects. + A dict of json schema objects. The keys are the record types for which the schemas + are generated. """ model = parse_model_from_yaml(filename) - rt_schemas = [] + rt_schemas = {} for el_key, el in model.items(): if isinstance(el, db.RecordType): - rt_schemas.append(recordtype_to_json_schema(el)) + rt_schemas[el_key] = recordtype_to_json_schema(el) return rt_schemas def representer_ordereddict(dumper, data): - # yaml.add_representer(OrderedDict, caoscrawler.validator.representer_ordereddict) - return dumper.represent_data(dict(data)) + """ + Helper function to be able to represent the converted json schema objects correctly as yaml. + This representer essentially replaces OrderedDict objects with simple dict objects. + Since Python 3.7 dicts are ordered by default, see e.g.: + https://softwaremaniacs.org/blog/2020/02/05/dicts-ordered/en/ -def convert_record(record: db.Record): + Example of how to use the representer: + ```python + yaml.add_representer(OrderedDict, caoscrawler.validator.representer_ordereddict) + ``` """ - Convert a record into a form suitable for validation with jsonschema. 
+ return dumper.represent_data(dict(data)) - Uses high_level_api.convert_to_python_object +def _apply_schema_patches(pobj: dict): + """ Changes applied: - properties are moved vom subitem "proeprties" to top-level. - The following keys are deleted: parents, role, name, description, metadata, properties - - Arguments: - ---------- - record: db.Record - The record that is supposed to be converted. """ - pobj = convert_to_python_object(record).serialize() - + if "properties" not in pobj: + # this is probably a file + return pobj for prop in pobj["properties"]: - pobj[prop] = pobj["properties"][prop] + if isinstance(pobj["properties"][prop], dict): + pobj[prop] = _apply_schema_patches(pobj["properties"][prop]) + else: + pobj[prop] = pobj["properties"][prop] for keyd in ("parents", "role", "name", "description", "metadata", "properties"): @@ -99,12 +106,31 @@ def convert_record(record: db.Record): return pobj -def validate(records: list[db.Record], schemas: list[dict]) -> list[tuple[bool, list]]: +def convert_record(record: db.Record): """ - Validate a list of records against a list of possible JSON schemas. + Convert a record into a form suitable for validation with jsonschema. - It is tried to validate each schema from the list of schemas. If none of them validates - without error, it is assumed that it does not match at all. + Uses `high_level_api.convert_to_python_object` + Afterwards `_apply_schema_patches` is called recursively to refactor the dictionary + to match the current form of the jsonschema. + + Arguments: + ---------- + record: db.Record + The record that is supposed to be converted. + """ + pobj = convert_to_python_object(record).serialize() + return _apply_schema_patches(pobj) + + +def validate(records: list[db.Record], schemas: dict[str, dict]) -> list[tuple]: + """ + Validate a list of records against a dictionary of schemas. 
+ The keys of the dictionary are record types and the corresponding values are json schemata + associated with that record type. The current implementation assumes that each record that is + checked has exactly one parent and raises an error if that is not the case. + The schema belonging to a record is identified using the name of the first (and only) parent + of the record. Arguments: ---------- @@ -112,24 +138,30 @@ def validate(records: list[db.Record], schemas: list[dict]) -> list[tuple[bool, records: list[db.Record] List of records that will be validated. - schemas: list[dict] - A list of JSON schemas generated using `load_json_schema_from_datamodel_yaml`. + schemas: dict[str, dict] + A dictionary of JSON schemas generated using `load_json_schema_from_datamodel_yaml`. Returns: -------- A list of tuples, one element for each record: - - Index 0: A boolean that determines whether at least one schema matched for this record. - - Index 1: A list of schemas matching the record at this position of the list `records`. + - Index 0: A boolean that determines whether the schema belonging to the record type of the + record matched. + - Index 1: A validation error if the schema did not match or None otherwise. 
""" + retval = [] for r in records: - matching_schemas = [] - for schema in schemas: - try: - jsonschema.validate(convert_record(r), schema) - matching_schemas.append(schema) - except ValidationError: - pass - retval.append((len(matching_schemas) > 0, matching_schemas)) + if len(r.parents) != 1: + raise RuntimeError( + "Schema validation is only supported if records have exactly one parent.") + parname = r.parents[0].name + if parname not in schemas: + raise RuntimeError( + "No schema for record type {} in schema dictionary.".format(parname)) + try: + jsonschema.validate(convert_record(r), schemas[parname]) + retval.append((True, None)) + except ValidationError as ex: + retval.append((False, ex)) return retval diff --git a/unittests/test_validation.py b/unittests/test_validation.py index 42bf33baa648670c932e92367e49c8a98dabd07f..a3215963f67b61241b321a0eb7345f9fe6fde1f2 100644 --- a/unittests/test_validation.py +++ b/unittests/test_validation.py @@ -23,16 +23,12 @@ """ test validation """ -import importlib -import os from os.path import join from pathlib import Path -import caoscrawler import jsonschema import linkahead as db import pytest -import yaml from caoscrawler.validator import (convert_record, load_json_schema_from_datamodel_yaml, validate) @@ -51,7 +47,8 @@ def test_create_json_schema(): pobj = convert_record(r) # print(yaml.dump(pobj)) # print(yaml.dump(json[0])) - jsonschema.validate(pobj, json[0]) + assert "Dataset" in json + jsonschema.validate(pobj, json["Dataset"]) # Failing test: r = db.Record() @@ -62,7 +59,7 @@ def test_create_json_schema(): pobj = convert_record(r) with pytest.raises(ValidationError, match=".*'keywords' is a required property.*"): - jsonschema.validate(pobj, json[0]) + jsonschema.validate(pobj, json["Dataset"]) def test_validation(): @@ -82,7 +79,8 @@ def test_validation(): r2.add_property(name="dateModified", value="2024-11-16") valres = validate([r1, r2], json) - assert valres[0][0] - assert len(valres[0][1]) == 1 - assert 
valres[0][1][0] == json[0] - assert len(valres[1][1]) == 0 + assert valres[0][0] is True + assert valres[0][1] is None + assert not valres[1][0] + assert isinstance(valres[1][1], ValidationError) + assert valres[1][1].message == "'keywords' is a required property"