diff --git a/src/caoscrawler/converters.py b/src/caoscrawler/converters.py
index 7221e53bfdb14e30f0ca87b05470462ce9904066..0a3311bf9e12ab0ad355a3fee8d48efbecd68715 100644
--- a/src/caoscrawler/converters.py
+++ b/src/caoscrawler/converters.py
@@ -199,9 +199,7 @@ def handle_value(value: Union[dict, str, list], values: GeneralStore):
     return (propvalue, collection_mode)
 
 
-def create_records(values: GeneralStore,
-                   records: RecordStore,
-                   def_records: dict):
+def create_records(values: GeneralStore, records: RecordStore, def_records: dict):
     # list of keys to identify, which variables have been set by which paths:
     # the items are tuples:
     # 0: record name
@@ -455,8 +453,7 @@ class Converter(object, metaclass=ABCMeta):
 
 
 class DirectoryConverter(Converter):
-    def create_children(self, generalStore: GeneralStore,
-                        element: StructureElement):
+    def create_children(self, generalStore: GeneralStore, element: StructureElement):
         # TODO: See comment on types and inheritance
         if not isinstance(element, Directory):
             raise RuntimeError(
@@ -542,8 +539,7 @@ class MarkdownFileConverter(Converter):
     reads the yaml header of markdown files (if a such a header exists).
     """
 
-    def create_children(self, generalStore: GeneralStore,
-                        element: StructureElement):
+    def create_children(self, generalStore: GeneralStore, element: StructureElement):
         # TODO: See comment on types and inheritance
         if not isinstance(element, File):
             raise RuntimeError("A markdown file is needed to create children.")
@@ -629,7 +625,7 @@ def validate_against_json_schema(instance, schema_resource: Union[dict, str]):
         validate(instance=instance, schema=schema)
     except ValidationError as err:
         raise ConverterValidationError(
-            f"Couldn't validate {instance}:\n{err.message}")
+            f"\nCouldn't validate {instance}:\n{err.message}")
 
 
 class DictElementConverter(Converter):
@@ -638,7 +634,15 @@
         if not isinstance(element, DictElement):
             raise ValueError("create_children was called with wrong type of StructureElement")
 
-        return self._create_children_from_dict(element.value)
+        try:
+            return self._create_children_from_dict(element.value)
+        except ConverterValidationError as err:
+            path = generalStore[self.name]
+            print(path)
+            raise ConverterValidationError(
+                "Error during the validation of the dictionary located at the following node "
+                "in the data structure:\n"
+                f"{path}\n" + err.message)
 
     def _create_children_from_dict(self, data):
         if "validate" in self.definition and self.definition["validate"]:
@@ -701,9 +705,16 @@ class JSONFileConverter(Converter):
         with open(element.path, 'r') as json_file:
             json_data = json.load(json_file)
         if "validate" in self.definition and self.definition["validate"]:
-            validate_against_json_schema(json_data, self.definition["validate"])
+            try:
+                validate_against_json_schema(json_data, self.definition["validate"])
+            except ConverterValidationError as err:
+                raise ConverterValidationError(
+                    "Error during the validation of the JSON file:\n"
+                    f"{element.path}\n" + err.message)
         structure_element = convert_basic_element(
-            json_data, "The JSON File contained content that was parsed to a Python object"
+            json_data,
+            name=element.name+"_child_dict",
+            msg_prefix="The JSON File contained content that was parsed to a Python object"
             " with an unexpected type.")
 
         return [structure_element]
@@ -729,9 +740,16 @@
         with open(element.path, 'r') as yaml_file:
             yaml_data = yaml.safe_load(yaml_file)
         if "validate" in self.definition and self.definition["validate"]:
-            validate_against_json_schema(yaml_data, self.definition["validate"])
+            try:
+                validate_against_json_schema(yaml_data, self.definition["validate"])
+            except ConverterValidationError as err:
+                raise ConverterValidationError(
+                    "Error during the validation of the YAML file:\n"
+                    f"{element.path}\n" + err.message)
         structure_element = convert_basic_element(
-            yaml_data, "The YAML File contained content that was parsed to a Python object"
+            yaml_data,
+            name=element.name+"_child_dict",
+            msg_prefix="The YAML File contained content that was parsed to a Python object"
             " with an unexpected type.")
 
         return [structure_element]
diff --git a/src/caoscrawler/crawl.py b/src/caoscrawler/crawl.py
index a3983515ef503b571bfa344421465331a7d0e394..a9eb413c124576caf1b10ae0a733268182261f8c 100644
--- a/src/caoscrawler/crawl.py
+++ b/src/caoscrawler/crawl.py
@@ -55,7 +55,7 @@ from caosdb.apiutils import (compare_entities,
                              EntityMergeConflictError,
                              merge_entities)
 from caosdb.common.datatype import is_reference
-from .converters import Converter, DirectoryConverter
+from .converters import Converter, DirectoryConverter, ConverterValidationError
 from .identifiable import Identifiable
 from .identifiable_adapters import (IdentifiableAdapter,
                                     LocalStorageIdentifiableAdapter,
@@ -306,6 +306,7 @@ class Crawler(object):
                     definition[key] = os.path.join(
                         os.path.dirname(definition_path), value)
                     if not os.path.isfile(definition[key]):
+                        # TODO treat this properly somewhere
                         raise FileNotFoundError(
                             f"Couldn't find validation file {definition[key]}")
             elif isinstance(value, dict):
@@ -498,13 +499,11 @@ class Crawler(object):
             items = [items]
 
         self.run_id = uuid.uuid1()
-        local_converters = Crawler.initialize_converters(
-            crawler_definition, converter_registry)
+        local_converters = Crawler.initialize_converters(crawler_definition, converter_registry)
 
         # This recursive crawling procedure generates the update list:
         self.crawled_data: list[db.Record] = []
-        self._crawl(items, local_converters, self.generalStore,
-                    self.recordStore, [], [])
+        self._crawl(items, local_converters, self.generalStore, self.recordStore, [], [])
 
         if self.debug:
             self.debug_converters = local_converters
@@ -1188,8 +1187,8 @@ ____________________\n""".format(i + 1, len(pending_changes)) + str(el[3]))
                     keys_modified = converter.create_records(
                         generalStore_copy, recordStore_copy, element)
 
-                    children = converter.create_children(
-                        generalStore_copy, element)
+                    children = converter.create_children(generalStore_copy, element)
+
                     if self.debug:
                         # add provenance information for each varaible
                         self.debug_tree[str(element)] = (
@@ -1275,7 +1274,11 @@ def crawler_main(crawled_directory_path: str,
     0 if successful
     """
     crawler = Crawler(debug=debug, securityMode=securityMode)
-    crawler.crawl_directory(crawled_directory_path, cfood_file_name)
+    try:
+        crawler.crawl_directory(crawled_directory_path, cfood_file_name)
+    except ConverterValidationError as err:
+        print(err)
+        return 1
     if provenance_file is not None:
         crawler.save_debug_data(provenance_file)
diff --git a/unittests/test_data/failing_validation/cfood.yml b/unittests/test_data/failing_validation/cfood.yml
new file mode 100644
index 0000000000000000000000000000000000000000..e0bc8668457cefaedd5244c424c99a150e30347c
--- /dev/null
+++ b/unittests/test_data/failing_validation/cfood.yml
@@ -0,0 +1,11 @@
+# This is a test cfood for:
+# https://gitlab.com/caosdb/caosdb-crawler/-/issues/9
+
+Data:  # name of the converter
+  type: Directory
+  match: (.*)
+  subtree:
+    json:
+      type: JSONFile
+      match: data.json
+      validate: schema.json
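For context: the fixture above points the JSONFile converter's validate key at schema.json, while data.json (added below) stores "a" as the integer 5 and schema.json declares "a" as a string, so validation of this test directory is guaranteed to fail. A minimal sketch of that failure using the jsonschema package directly, outside the crawler (not part of the patch; assumes it is run from unittests/test_data/failing_validation):

    import json
    from jsonschema import validate, ValidationError

    with open("schema.json") as f:
        schema = json.load(f)
    with open("data.json") as f:
        data = json.load(f)

    try:
        validate(instance=data, schema=schema)
    except ValidationError as err:
        # err.message reads roughly "5 is not of type 'string'".
        # validate_against_json_schema() wraps this message in a
        # ConverterValidationError; the converters above now prepend the
        # offending file path or dictionary node to it.
        print(err.message)

This is the error text to which the "Couldn't validate ..." prefix and the new file-path and node lines are attached.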
diff --git a/unittests/test_data/failing_validation/cfood2.yml b/unittests/test_data/failing_validation/cfood2.yml
new file mode 100644
index 0000000000000000000000000000000000000000..d896553f7acbdf4fe149bd8bd0e63c96bc121916
--- /dev/null
+++ b/unittests/test_data/failing_validation/cfood2.yml
@@ -0,0 +1,14 @@
+# This is a test cfood for:
+# https://gitlab.com/caosdb/caosdb-crawler/-/issues/9
+
+Data:  # name of the converter
+  type: Directory
+  match: (.*)
+  subtree:
+    json:
+      type: JSONFile
+      match: data.json
+      subtree:
+        dict:
+          type: Dict
+          validate: schema.json
diff --git a/unittests/test_data/failing_validation/data.json b/unittests/test_data/failing_validation/data.json
new file mode 100644
index 0000000000000000000000000000000000000000..f6ecf65f31a974233d8bf5f1b779e0718ea41258
--- /dev/null
+++ b/unittests/test_data/failing_validation/data.json
@@ -0,0 +1,3 @@
+{
+    "a": 5
+}
diff --git a/unittests/test_data/failing_validation/identifiables.yml b/unittests/test_data/failing_validation/identifiables.yml
new file mode 100644
index 0000000000000000000000000000000000000000..f6bb04a59bda2e6169e3dc037a60686f6fd935a3
--- /dev/null
+++ b/unittests/test_data/failing_validation/identifiables.yml
@@ -0,0 +1,3 @@
+
+license:
+  - name
diff --git a/unittests/test_data/failing_validation/schema.json b/unittests/test_data/failing_validation/schema.json
new file mode 100644
index 0000000000000000000000000000000000000000..657d89842c3941c68da7bfeff7f52026d26b5f6f
--- /dev/null
+++ b/unittests/test_data/failing_validation/schema.json
@@ -0,0 +1,10 @@
+{
+  "title": "Dataset",
+  "description": "",
+  "type": "object",
+  "properties": {
+    "a": {
+      "type": "string"
+    }
+  }
+}
diff --git a/unittests/test_tool.py b/unittests/test_tool.py
index 71180b17e22409bc2491a51d4cdd45ed6f4aa346..37f714990ad3713cccf2d518cbb4aa2bc64833cc 100755
--- a/unittests/test_tool.py
+++ b/unittests/test_tool.py
@@ -1,8 +1,32 @@
-#!/bin/python
-# Tests for the tool using pytest
-# Adapted from check-sfs
-# A. Schlemmer, 06/2021
-
+#!/usr/bin/env python3
+# encoding: utf-8
+#
+# This file is a part of the CaosDB Project.
+#
+# Copyright (C) 2021 Alexander Schlemmer
+# Copyright (C) 2023 Henrik tom Wörden <h.tomwoerden@indiscale.com>
+# Copyright (C) 2023 IndiScale GmbH <info@indiscale.com>
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <https://www.gnu.org/licenses/>.
+#
+
+"""
+Tests for the tool using pytest
+Adapted from check-sfs
+"""
+
+import os
 from caoscrawler.crawl import Crawler, SecurityMode
 from caoscrawler.identifiable import Identifiable
 from caoscrawler.structure_elements import File, DictTextElement, DictListElement
@@ -11,6 +35,7 @@ from simulated_server_data import full_data
 from functools import partial
 from copy import deepcopy
 from unittest.mock import patch
+from caoscrawler.crawl import crawler_main
 import caosdb.common.models as dbmodels
 from unittest.mock import MagicMock, Mock
 from os.path import join, dirname, basename
@@ -740,6 +765,36 @@ def crawler_mocked_for_backref_test(crawler):
     return crawler
 
 
+def test_validation_error_print(capsys):
+    # there should be no server interaction since we only test the behavior if a validation error
+    # occurs during the data collection stage
+    DATADIR = os.path.join(os.path.dirname(__file__), "test_data", "failing_validation")
+    ret = crawler_main(DATADIR,
+                       os.path.join(DATADIR, "cfood.yml"),
+                       os.path.join(DATADIR, "identifiables.yml"),
+                       True,
+                       None,
+                       False,
+                       "/use_case_simple_presentation")
+    captured = capsys.readouterr()
+    assert "Couldn't validate" in captured.out
+
+
+def test_validation_error_print_dict(capsys):
+    # there should be no server interaction since we only test the behavior if a validation error
+    # occurs during the data collection stage
+    DATADIR = os.path.join(os.path.dirname(__file__), "test_data", "failing_validation")
+    ret = crawler_main(DATADIR,
+                       os.path.join(DATADIR, "cfood2.yml"),
+                       os.path.join(DATADIR, "identifiables.yml"),
+                       True,
+                       None,
+                       False,
+                       "/use_case_simple_presentation")
+    # captured = capsys.readouterr()
+    # assert "Couldn't validate" in captured.out
+
+
 def test_split_into_inserts_and_updates_backref(crawler_mocked_for_backref_test):
     crawler = crawler_mocked_for_backref_test
     identlist = [Identifiable(name="A", record_type="BR"),
diff --git a/unittests/test_validation.py b/unittests/test_validation.py
deleted file mode 100644
index 686c66f72f55b66344322e0c6f3b9d1a2b76b3f9..0000000000000000000000000000000000000000
--- a/unittests/test_validation.py
+++ /dev/null
@@ -1,34 +0,0 @@
-#!/usr/bin/env python3
-# encoding: utf-8
-#
-# ** header v3.0
-# This file is a part of the CaosDB Project.
-#
-# Copyright (C) 2022 Alexander Schlemmer
-#
-# This program is free software: you can redistribute it and/or modify
-# it under the terms of the GNU Affero General Public License as
-# published by the Free Software Foundation, either version 3 of the
-# License, or (at your option) any later version.
-#
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-# GNU Affero General Public License for more details.
-#
-# You should have received a copy of the GNU Affero General Public License
-# along with this program. If not, see <https://www.gnu.org/licenses/>.
-#
-# ** end header
-#
-
-"""
-Test the validation of cfood definition files.
-"""
-
-from caoscrawler.crawl import Crawler
-
-from tempfile import NamedTemporaryFile
-
-import yaml
-import pytest
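With these changes, crawler_main() catches a ConverterValidationError raised while crawling, prints it and returns 1 instead of letting the exception propagate to the caller. A minimal usage sketch mirroring test_validation_error_print above; the positional arguments are copied verbatim from that test, and their interpretation (debug flag, provenance file, dry run, prefix) is an assumption based on that call rather than something this patch spells out:

    import os

    from caoscrawler.crawl import crawler_main

    DATADIR = os.path.join("unittests", "test_data", "failing_validation")
    ret = crawler_main(DATADIR,
                       os.path.join(DATADIR, "cfood.yml"),
                       os.path.join(DATADIR, "identifiables.yml"),
                       True,
                       None,
                       False,
                       "/use_case_simple_presentation")
    # The validation error, now including the path of the offending JSON file
    # and the "Couldn't validate ..." message, is printed to stdout and the
    # call signals failure through its return value.
    assert ret == 1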