Report JSON schema violations as ConverterValidationError

JSONFileConverter.create_children now catches jsonschema's ValidationError
and re-raises it as ConverterValidationError, which gains a ``message``
attribute.  Several over-long lines are wrapped.  The JSON converter test
validates against testjson.schema.json and checks that brokenjson.json
(which lacks the required 'name' property) is rejected.

Review notes:
  * Line structure and leading whitespace were reconstructed; verify the
    context lines against the target tree before applying.
  * The new raise uses explicit chaining (``from err``) and the
    DictDictElementConverter message names DictDictElement, so the
    post-image blob id on the converters.py index line is stale.

diff --git a/src/newcrawler/converters.py b/src/newcrawler/converters.py
index 7e079aa3d345345d102cd4fcf763e4a09f2200fc..c269cde96f61e80a95eb22d19735acf0e604b36b 100644
--- a/src/newcrawler/converters.py
+++ b/src/newcrawler/converters.py
@@ -23,7 +23,7 @@
 # ** end header
 #
 
-from jsonschema import validate
+from jsonschema import validate, ValidationError
 import os
 import re
 import caosdb as db
@@ -50,6 +50,9 @@ SPECIAL_PROPERTIES = ("description", "name", "id", "path",
 class ConverterValidationError(Exception):
     """To be raised if contents of an element to be converted are invalid."""
 
+    def __init__(self, msg):
+        self.message = msg
+
 
 def handle_value(value: Union[dict, str], values: GeneralStore):
     """
@@ -201,7 +204,8 @@ class Converter(object):
                           converter_registry: dict):
 
         if "type" not in definition:
-            raise RuntimeError("Type is mandatory for converter entries in CFood definition.")
+            raise RuntimeError(
+                "Type is mandatory for converter entries in CFood definition.")
 
         if definition["type"] not in converter_registry:
             raise RuntimeError("Unknown Type: {}".format(definition["type"]))
@@ -268,7 +272,8 @@ class DirectoryConverter(Converter):
     def create_children(self, generalStore: GeneralStore,
                         element: StructureElement):
         if not isinstance(element, Directory):
-            raise RuntimeError("Directory converters can only create children from directories.")
+            raise RuntimeError(
+                "Directory converters can only create children from directories.")
 
         return self.create_children_from_directory(element)
 
@@ -338,7 +343,8 @@ class MarkdownFileConverter(Converter):
         if not isinstance(element, File):
             raise RuntimeError("A markdown file is needed to create children.")
 
-        header = yaml_header_tools.get_header_from_file(element.path, clean=False)
+        header = yaml_header_tools.get_header_from_file(
+            element.path, clean=False)
         children: list[StructureElement] = []
 
         for name, entry in header.items():
@@ -347,7 +353,8 @@ class MarkdownFileConverter(Converter):
             elif type(entry) == str:
                 children.append(DictTextElement(name, entry))
             else:
-                raise RuntimeError("Header entry {} has incompatible type.".format(name))
+                raise RuntimeError(
+                    "Header entry {} has incompatible type.".format(name))
         return children
 
     def typecheck(self, element: StructureElement):
@@ -451,15 +458,19 @@ class JSONFileConverter(DictConverter):
             if isinstance(self.definition["validate"], dict):
                 schema = self.definition["validate"]
             elif isinstance(self.definition["validate"], str):
+
                 with open(self.definition["validate"], 'r') as json_file:
-                    # TODO path shall be relative to cfood yml. Adjust the path here!
                     schema = json.load(json_file)
             else:
                 raise ValueError("The value of 'validate' has to be a string describing the path "
                                  "to the json schema file (relative to the cfood yml) "
                                  "or a dict containing the schema.")
             # Validate the json content
-            validate(instance=json_data, schema=schema)
+            try:
+                validate(instance=json_data, schema=schema)
+            except ValidationError as err:
+                raise ConverterValidationError(
+                    f"Couldn't validate {json_data}:\n{err.message}") from err
 
         return self._create_children_from_dict(json_data)
 
@@ -498,7 +509,8 @@ class DictListElementConverter(Converter):
     def create_children(self, generalStore: GeneralStore,
                         element: StructureElement):
         if not isinstance(element, DictListElement):
-            raise RuntimeError("This converter can only process DictListElements.")
+            raise RuntimeError(
+                "This converter can only process DictListElements.")
         return [TextElement(str(index), list_element) for index, list_element in enumerate(element.value)]
 
     def typecheck(self, element: StructureElement):
@@ -511,7 +523,8 @@ class DictListElementConverter(Converter):
         if m is None:
             return None
         if "match" in self.definition:
-            raise NotImplementedError("Match is not implemented for DictListElement.")
+            raise NotImplementedError(
+                "Match is not implemented for DictListElement.")
         return m.groupdict()
 
 
@@ -532,7 +545,8 @@ class DictDictElementConverter(Dict):
         if m is None:
             return None
         if "match" in self.definition:
-            raise NotImplementedError("Match is not implemented for DictListElement.")
+            raise NotImplementedError(
+                "Match is not implemented for DictDictElement.")
         return m.groupdict()
 
 
diff --git a/unittests/test_converters.py b/unittests/test_converters.py
index 2fa3eeb7b5d88f13ff3b2b8fc951f6e9302964b6..100b10062916fb992d2bb19241d1cf8ea543e44c 100644
--- a/unittests/test_converters.py
+++ b/unittests/test_converters.py
@@ -29,14 +29,19 @@ test the converters module
 
 from newcrawler.converters import Converter
 from newcrawler.stores import GeneralStore
-from newcrawler.converters import MarkdownFileConverter, JSONFileConverter, DictConverter
+from newcrawler.converters import (ConverterValidationError,
+                                   MarkdownFileConverter, JSONFileConverter,
+                                   DictConverter)
 from newcrawler.structure_elements import Directory
-from newcrawler.structure_elements import (File, DictTextElement, DictListElement, DictElement,
-                                           DictBooleanElement, DictDictElement, DictIntegerElement, DictFloatElement)
+from newcrawler.structure_elements import (File, DictTextElement,
+                                           DictListElement, DictElement,
+                                           DictBooleanElement, DictDictElement,
+                                           DictIntegerElement, DictFloatElement)
 
 from test_tool import rfp
 
 import pytest
+import os
 import importlib
 
 
@@ -170,10 +175,11 @@ def test_markdown_converter(converter_registry):
 
 def test_json_converter(converter_registry):
     test_json = File("testjson.json", rfp(
-        "test_directories", "single_file_test_data", "testjson.json"))
+        "test_directories", "examples_json", "testjson.json"))
+    schema_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "test_directories", "examples_json", "testjson.schema.json")
 
     jsonconverter = JSONFileConverter(
-        definition={"match": "(.*)"},
+        definition={"match": "(.*)", "validate": schema_path},
         name="TestJSONFileConverter",
         converter_registry=converter_registry)
 
@@ -189,7 +195,7 @@ def test_json_converter(converter_registry):
     assert children[0].value == "DEMO"
 
     assert children[1].__class__ == DictIntegerElement
-    assert children[1].name == "id"
+    assert children[1].name == "projectId"
     assert children[1].value.__class__ == int
     assert children[1].value == 10002
 
@@ -217,3 +223,13 @@ def test_json_converter(converter_registry):
     assert children[7].__class__ == DictTextElement
     assert children[7].name == "url"
     assert children[7].value.__class__ == str
+
+    broken_json = File("brokenjson.json", rfp(
+        "test_directories", "examples_json", "brokenjson.json"))
+    m = jsonconverter.match(broken_json)
+
+    # Doesn't validate because of missing required 'name' property
+    with pytest.raises(ConverterValidationError) as err:
+        children = jsonconverter.create_children(None, broken_json)
+
+    assert err.value.message.startswith("Couldn't validate")
diff --git a/unittests/test_directories/examples_json/brokenjson.json b/unittests/test_directories/examples_json/brokenjson.json
new file mode 100644
index 0000000000000000000000000000000000000000..9c012bf062264014278fc2df7be6cf33b65c7469
--- /dev/null
+++ b/unittests/test_directories/examples_json/brokenjson.json
@@ -0,0 +1,13 @@
+{
+    "projectId": 10002,
+    "archived": false,
+    "coordinator": {
+        "firstname": "Miri",
+        "lastname": "Mueller",
+        "email": "miri.mueller@science.de"
+    },
+    "start_date": "2022-03-01",
+    "candidates": ["Mouse", "Penguine"],
+    "rvalue": 0.4444,
+    "url": "https://site.de/index.php/"
+}