diff --git a/src/caoscrawler/validator.py b/src/caoscrawler/validator.py index 5610470389580d8f11c9782a63023da1c98f5ef8..3cd57cd5842e3de31d59b5e4f489009a009ee1cc 100644 --- a/src/caoscrawler/validator.py +++ b/src/caoscrawler/validator.py @@ -41,7 +41,7 @@ from caoscrawler import scanner # from collections import OrderedDict -def load_json_schema_from_datamodel_yaml(filename: str) -> list: +def load_json_schema_from_datamodel_yaml(filename: str) -> dict[str, dict]: """ Load a data model yaml file (using caosadvancedtools) and convert all record types into a json schema using the json_schema_exporter module. @@ -53,25 +53,37 @@ def load_json_schema_from_datamodel_yaml(filename: str) -> list: Returns ------- - A list of json schema objects. + A dict of json schema objects. The keys are the names of the record types for which the schemas + are generated. """ model = parse_model_from_yaml(filename) - rt_schemas = [] + rt_schemas = {} for el_key, el in model.items(): if isinstance(el, db.RecordType): - rt_schemas.append(recordtype_to_json_schema(el)) + rt_schemas[el_key] = recordtype_to_json_schema(el) return rt_schemas def representer_ordereddict(dumper, data): - # yaml.add_representer(OrderedDict, caoscrawler.validator.representer_ordereddict) + """ + Helper function to be able to represent the converted json schema objects correctly as yaml. + This representer essentially replaces OrderedDict objects with simple dict objects. + + Since Python 3.7 dicts are ordered by default, see e.g.: + https://softwaremaniacs.org/blog/2020/02/05/dicts-ordered/en/ + + Example of how to use the representer: + ```python + yaml.add_representer(OrderedDict, caoscrawler.validator.representer_ordereddict) + ``` + """ return dumper.represent_data(dict(data)) -def apply_schema_patches(pobj: dict): +def _apply_schema_patches(pobj: dict): """ Changes applied: - properties are moved vom subitem "proeprties" to top-level. 
@@ -82,7 +94,7 @@ def apply_schema_patches(pobj: dict): return pobj for prop in pobj["properties"]: if isinstance(pobj["properties"][prop], dict): - pobj[prop] = apply_schema_patches(pobj["properties"][prop]) + pobj[prop] = _apply_schema_patches(pobj["properties"][prop]) else: pobj[prop] = pobj["properties"][prop] @@ -98,8 +110,8 @@ def convert_record(record: db.Record): """ Convert a record into a form suitable for validation with jsonschema. - Uses high_level_api.convert_to_python_object - Afterwards apply_schema_patches is called recursively to refactor the dictionary + Uses `high_level_api.convert_to_python_object`. + Afterwards, `_apply_schema_patches` is called recursively to refactor the dictionary to match the current form of the jsonschema. Arguments: @@ -108,10 +120,10 @@ def convert_record(record: db.Record): The record that is supposed to be converted. """ pobj = convert_to_python_object(record).serialize() - return apply_schema_patches(pobj) + return _apply_schema_patches(pobj) -def validate(records: list[db.Record], schemas: list[dict]) -> list[tuple[bool, list]]: +def validate(records: list[db.Record], schemas: dict[str, dict]) -> list[tuple[bool, list]]: """ Validate a list of records against a list of possible JSON schemas. @@ -134,14 +146,19 @@ def validate(records: list[db.Record], schemas: list[dict]) -> list[tuple[bool, - Index 0: A boolean that determines whether at least one schema matched for this record. - Index 1: A list of schemas matching the record at this position of the list `records`. 
""" + retval = [] for r in records: - matching_schemas = [] - for schema in schemas: - try: - jsonschema.validate(convert_record(r), schema) - matching_schemas.append(schema) - except ValidationError: - pass - retval.append((len(matching_schemas) > 0, matching_schemas)) + if len(r.parents) != 1: + raise RuntimeError( + "Schema validation is only supported if records have exactly one parent.") + parname = r.parents[0].name + if parname not in schemas: + raise RuntimeError( + "No schema for record type {} in schema dictionary.".format(parname)) + try: + jsonschema.validate(convert_record(r), schemas[parname]) + retval.append((True, None)) + except ValidationError as ex: + retval.append((False, ex)) return retval diff --git a/unittests/test_validation.py b/unittests/test_validation.py index 42bf33baa648670c932e92367e49c8a98dabd07f..a3215963f67b61241b321a0eb7345f9fe6fde1f2 100644 --- a/unittests/test_validation.py +++ b/unittests/test_validation.py @@ -23,16 +23,12 @@ """ test validation """ -import importlib -import os from os.path import join from pathlib import Path -import caoscrawler import jsonschema import linkahead as db import pytest -import yaml from caoscrawler.validator import (convert_record, load_json_schema_from_datamodel_yaml, validate) @@ -51,7 +47,8 @@ def test_create_json_schema(): pobj = convert_record(r) # print(yaml.dump(pobj)) # print(yaml.dump(json[0])) - jsonschema.validate(pobj, json[0]) + assert "Dataset" in json + jsonschema.validate(pobj, json["Dataset"]) # Failing test: r = db.Record() @@ -62,7 +59,7 @@ def test_create_json_schema(): pobj = convert_record(r) with pytest.raises(ValidationError, match=".*'keywords' is a required property.*"): - jsonschema.validate(pobj, json[0]) + jsonschema.validate(pobj, json["Dataset"]) def test_validation(): @@ -82,7 +79,8 @@ def test_validation(): r2.add_property(name="dateModified", value="2024-11-16") valres = validate([r1, r2], json) - assert valres[0][0] - assert len(valres[0][1]) == 1 - assert 
valres[0][1][0] == json[0] - assert len(valres[1][1]) == 0 + assert valres[0][0] is True + assert valres[0][1] is None + assert not valres[1][0] + assert isinstance(valres[1][1], ValidationError) + assert valres[1][1].message == "'keywords' is a required property"