diff --git a/CHANGELOG.md b/CHANGELOG.md index 175dc1b2a8f1a9ff3258e7ef03d8bb0640357918..404424ded153081a1b16d7e5b0923d9284695949 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -16,6 +16,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Fixed ### +- Validation against schema in table json converter fixed. + ### Security ### ### Documentation ### diff --git a/src/caosadvancedtools/json_schema_exporter.py b/src/caosadvancedtools/json_schema_exporter.py index 7a92eaad8aeb55ae5c34e529de4848ce4a12b0f9..56568ca18eb10f501fa13bc766613367050c034d 100644 --- a/src/caosadvancedtools/json_schema_exporter.py +++ b/src/caosadvancedtools/json_schema_exporter.py @@ -112,7 +112,7 @@ class JsonSchemaExporter: to create them. Instead, only the choice of existing elements should be given. do_not_retrieve : list[str], optional - A list of RedcordType names, for which no Records shall be retrieved. Instead, only an + A list of RecordType names, for which no Records shall be retrieved. Instead, only an object description should be given. If this list overlaps with the `do_not_create` parameter, the behavior is undefined. 
def _validate_jsonschema(instance, schema):
    """Validate ``instance`` against ``schema``, ignoring values jsonschema cannot check.

    Before validation, a deep copy of ``instance`` is cleaned:

    - ``key: None`` entries are removed from dicts, because an empty value is
      currently considered valid (there is no "obligatory with value").
    - ``datetime.date`` / ``datetime.datetime`` values of dict entries are
      removed when the corresponding subschema declares ``format: date`` or
      ``format: date-time``, because jsonschema cannot check such objects.

    Parameters
    ----------
    instance : Any
        The data to validate.  Only a deep copy is modified, never the
        caller's object.
    schema : dict
        The JSON schema to validate against.

    Raises
    ------
    jsonschema.ValidationError
        If the cleaned instance does not conform to ``schema``.
    """

    def in_schema(key, val, subschema):
        """Return True if ``key: val`` is in ``subschema`` or satisfied via anyOf/allOf/oneOf."""
        if not isinstance(subschema, dict):
            # Boolean schemas and malformed entries carry no keywords.
            return False
        if subschema.get(key) == val:
            return True
        if 'anyOf' in subschema:
            return any(in_schema(key, val, sub) for sub in subschema['anyOf'])
        if 'allOf' in subschema:
            return all(in_schema(key, val, sub) for sub in subschema['allOf'])
        if 'oneOf' in subschema:
            return sum(in_schema(key, val, sub) for sub in subschema['oneOf']) == 1
        return False

    def remove_incompatible_values(it, subschema):
        """Strip unverifiable entries from ``it`` in place (any nesting depth) and return it."""
        if not isinstance(subschema, dict):
            # E.g. ``True``/``False`` schemas or tuple-style ``items`` lists:
            # nothing can be looked up, but ``None`` entries are still removed.
            subschema = {}
        if isinstance(it, list):
            # NOTE: dates that are direct list elements are not removed, only
            # dates stored as dict values are.
            item_schema = subschema.get('items', subschema)
            for elem in it:
                remove_incompatible_values(elem, item_schema)
        elif isinstance(it, dict):
            # Without a "properties" keyword, the schema itself is used as the
            # property lookup table.
            properties = subschema.get('properties', subschema)
            if not isinstance(properties, dict):
                properties = {}
            # Iterate over a snapshot because entries are popped while looping.
            for key, elem in list(it.items()):
                # Properties not declared in the schema get an empty subschema
                # instead of raising KeyError; jsonschema decides about them.
                prop_schema = properties.get(key, {})
                if elem is None:
                    it.pop(key)
                elif isinstance(elem, datetime.date):
                    # datetime.datetime is a subclass of datetime.date.
                    if (in_schema('format', 'date', prop_schema)
                            or in_schema('format', 'date-time', prop_schema)):
                        it.pop(key)
                elif isinstance(elem, (dict, list)):
                    remove_incompatible_values(elem, prop_schema)
        return it

    cleaned = deepcopy(instance)
    if isinstance(cleaned, dict):
        remove_incompatible_values(cleaned, schema)
    else:
        # A bare value cannot be "removed" from itself, so wrap it in a dict:
        # if the wrapper ends up empty, the value was None or an accepted date
        # and there is nothing left for jsonschema to check.
        wrapper = {'key': cleaned}
        remove_incompatible_values(wrapper, {'key': schema})
        if not wrapper:
            return
    jsonschema.validate(cleaned, schema=schema)
@@ -328,7 +374,7 @@ class XLSXConverter: for e in exceptions]) raise jsonschema.ValidationError(mess) if validate: - jsonschema.validate(self._result, self._schema) + _validate_jsonschema(self._result, self._schema) if self._errors: raise RuntimeError("There were error while handling the XLSX file.") return self._result @@ -563,7 +609,7 @@ class XLSXConverter: value = False if value == 1 or isinstance(value, str) and '=true()' == value.lower(): value = True - jsonschema.validate(value, subschema) + _validate_jsonschema(value, subschema) # Finally: convert to target type return self.PARSER[subschema.get("type", "string")](value) @@ -707,8 +753,6 @@ def _set_in_nested(mydict: dict, path: list, value: Any, prefix: list = [], skip return mydict -# ToDo: Fix https://gitlab.indiscale.com/caosdb/src/caosdb-advanced-user-tools/-/issues/138 -# and remove pylint disable def to_dict(xlsx: Union[str, BinaryIO], schema: Union[dict, str, TextIO], validate: Optional[bool] = None, strict: bool = False) -> dict: """Convert the xlsx contents to a dict, it must follow a schema. @@ -733,9 +777,5 @@ def to_dict(xlsx: Union[str, BinaryIO], schema: Union[dict, str, TextIO], out: dict A dict representing the JSON with the extracted data. 
""" - if validate: - raise NotImplementedError( - "For input validation implement " - "https://gitlab.indiscale.com/caosdb/src/caosdb-advanced-user-tools/-/issues/138") converter = XLSXConverter(xlsx, schema, strict=strict) - return converter.to_dict() + return converter.to_dict(validate=validate) diff --git a/unittests/table_json_conversion/test_read_xlsx.py b/unittests/table_json_conversion/test_read_xlsx.py index ff32c6b1203112c6931e98e55de5e4c981167452..d453ab3593ec36aa1197727f5ed51d1fb6fea10f 100644 --- a/unittests/table_json_conversion/test_read_xlsx.py +++ b/unittests/table_json_conversion/test_read_xlsx.py @@ -30,7 +30,7 @@ from typing import Optional import jsonschema import pytest -from caosadvancedtools.table_json_conversion import convert +from caosadvancedtools.table_json_conversion import convert, xlsx_utils from .utils import assert_equal_jsons @@ -53,9 +53,11 @@ Returns json: dict The result of the conversion. """ - # FIXME Set default "validate" back to True, after implementation of - # https://gitlab.indiscale.com/caosdb/src/caosdb-advanced-user-tools/-/issues/138 - result = convert.to_dict(xlsx=xlsx_file, schema=schema_file, validate=validate) + with open(schema_file, encoding="utf8", mode="r") as sch_f: + model_schema = json.load(sch_f) + data_schema = xlsx_utils.array_schema_from_model_schema(model_schema) + + result = convert.to_dict(xlsx=xlsx_file, schema=data_schema, validate=True) if known_good_file: with open(known_good_file, encoding="utf-8") as myfile: expected = json.load(myfile)