class YAMLFileConverter(Converter):
    """Converter that parses a YAML file into a single structure element.

    The file name is matched against the regular expression stored under the
    ``match`` key of the converter definition.  When the definition carries a
    truthy ``validate`` entry, the parsed content is handed to
    ``validate_against_json_schema`` together with that entry before it is
    converted.
    """

    def typecheck(self, element: StructureElement):
        """Return True if ``element`` is a ``File``, False otherwise."""
        return isinstance(element, File)

    def match(self, element: StructureElement):
        """Match the name of ``element`` against ``definition["match"]``.

        Returns the dictionary of named groups of the match, or None if the
        name does not match.

        Raises RuntimeError if ``element`` is not a ``File``.
        """
        # TODO: See comment on types and inheritance
        if not self.typecheck(element):
            raise RuntimeError("Element must be a file")
        m = re.match(self.definition["match"], element.name)
        if m is None:
            return None
        return m.groupdict()

    def create_children(self, generalStore: GeneralStore, element: StructureElement):
        """Parse the YAML file behind ``element`` and wrap the result.

        ``generalStore`` is not used by this converter.

        Returns a list with exactly one structure element, created by
        ``convert_basic_element`` from the parsed YAML content.

        Raises ValueError if ``element`` is not a ``File``.  Errors raised by
        the YAML parser or by the schema validation are not caught here.
        """
        # TODO: See comment on types and inheritance
        if not isinstance(element, File):
            raise ValueError("create_children was called with wrong type of StructureElement")
        # Open in binary mode so that PyYAML determines the encoding itself
        # (BOM check, UTF-8 if there is none) instead of relying on the
        # platform dependent default encoding of text mode.
        with open(element.path, 'rb') as yaml_file:
            yaml_data = yaml.safe_load(yaml_file)
        schema = self.definition.get("validate")
        if schema:
            validate_against_json_schema(yaml_data, schema)
        structure_element = convert_basic_element(
            yaml_data, "The YAML File contained content that was parsed to a Python object"
            " with an unexpected type.")
        return [structure_element]
argument and applies regular expressiion to name diff --git a/src/caoscrawler/crawl.py b/src/caoscrawler/crawl.py index 644782d78cd4060677e04cb039f7a3f679e3ccb6..a3983515ef503b571bfa344421465331a7d0e394 100644 --- a/src/caoscrawler/crawl.py +++ b/src/caoscrawler/crawl.py @@ -345,6 +345,9 @@ class Crawler(object): "JSONFile": { "converter": "JSONFileConverter", "package": "caoscrawler.converters"}, + "YAMLFile": { + "converter": "YAMLFileConverter", + "package": "caoscrawler.converters"}, "CSVTableConverter": { "converter": "CSVTableConverter", "package": "caoscrawler.converters"}, diff --git a/unittests/test_converters.py b/unittests/test_converters.py index 1178f187ad6ec28afbdeb200b071d8b9694b951a..71cb42cf1841aa95e901047cc5037a1a123e3a72 100644 --- a/unittests/test_converters.py +++ b/unittests/test_converters.py @@ -23,6 +23,8 @@ """ test the converters module """ +import json +import yaml import importlib import os import pytest @@ -32,7 +34,7 @@ from caoscrawler.converters import (Converter, ConverterValidationError, DictEle DirectoryConverter, DictIntegerElementConverter, handle_value, MarkdownFileConverter, FloatElementConverter, IntegerElementConverter, - JSONFileConverter) + JSONFileConverter, YAMLFileConverter) from caoscrawler.converters import _AbstractScalarValueElementConverter from caoscrawler.crawl import Crawler from caoscrawler.stores import GeneralStore @@ -235,17 +237,96 @@ def test_json_converter(converter_registry): else: raise ValueError() + invalid_json = File( + "invalidjson.json", + rfp("test_directories", "examples_json", "invalidjson.json") + ) + # Doesn't validate because of missing required 'name' property + with pytest.raises(ConverterValidationError) as err: + jsonconverter.create_children(None, invalid_json) + assert err.value.message.startswith("Couldn't validate") + broken_json = File( "brokenjson.json", rfp("test_directories", "examples_json", "brokenjson.json") ) - m = jsonconverter.match(broken_json) + with 
def test_yaml_converter(converter_registry):
    """Check matching, child creation, schema validation and parse errors
    of the YAMLFileConverter using the files in ``test_yamls``.
    """
    yaml_dir = ("test_directories", "test_yamls")
    test_yaml = File("testyaml.yml", rfp(*yaml_dir, "testyaml.yml"))

    here = os.path.dirname(os.path.abspath(__file__))
    schema_path = os.path.join(here, *yaml_dir, "testyaml.schema.json")
    yamlconverter = YAMLFileConverter(
        definition={"match": "(.*)", "validate": schema_path},
        name="TestYAMLFileConverter",
        converter_registry=converter_registry)

    # The pattern has no named groups, so a successful match is an empty dict.
    match_result = yamlconverter.match(test_yaml)
    assert match_result is not None
    assert len(match_result) == 0

    dict_el = yamlconverter.create_children(None, test_yaml)
    assert len(dict_el) == 1

    dictconverter = DictElementConverter(
        definition={"match_name": "(.*)"},
        name="dictconv",
        converter_registry=converter_registry)

    # child name -> (element class, value class, optional check of the value)
    expectations = {
        "name": (TextElement, str, lambda value: value == "DEMO"),
        "projectId": (IntegerElement, int, lambda value: value == 10002),
        "archived": (BooleanElement, bool, lambda value: value is False),
        "Person": (ListElement, list, lambda value: len(value) == 2),
        "start_date": (TextElement, str, lambda value: value == '2022-03-01'),
        "candidates": (ListElement, list,
                       lambda value: value == ["Mouse", "Penguine"]),
        "rvalue": (FloatElement, float, None),
        "url": (TextElement, str, None),
    }
    for child in dictconverter.create_children(None, dict_el[0]):
        if child.name not in expectations:
            raise ValueError()
        element_class, value_class, value_check = expectations[child.name]
        assert isinstance(child, element_class)
        assert isinstance(child.value, value_class)
        if value_check is not None:
            assert value_check(child.value)

    # Doesn't validate because of missing required 'name' property
    invalid_yaml = File("invalidyaml.yml", rfp(*yaml_dir, "invalidyaml.yml"))
    with pytest.raises(ConverterValidationError) as err:
        yamlconverter.create_children(None, invalid_yaml)
    assert err.value.message.startswith("Couldn't validate")

    # brokenyaml.yml consists of a single "}", which is not parseable YAML
    broken_yaml = File("brokenyaml.yml", rfp(*yaml_dir, "brokenyaml.yml"))
    with pytest.raises(yaml.parser.ParserError):
        yamlconverter.create_children(None, broken_yaml)
0.4444, + "url": "https://site.de/index.php/" +} diff --git a/unittests/test_directories/test_yamls/brokenyaml.yml b/unittests/test_directories/test_yamls/brokenyaml.yml new file mode 100644 index 0000000000000000000000000000000000000000..5c34318c2147fb2285de5a1513f46168e33e6baf --- /dev/null +++ b/unittests/test_directories/test_yamls/brokenyaml.yml @@ -0,0 +1 @@ +} diff --git a/unittests/test_directories/test_yamls/invalidyaml.yml b/unittests/test_directories/test_yamls/invalidyaml.yml new file mode 100644 index 0000000000000000000000000000000000000000..0528ebea1740947f7e675cf77f906fd86beaa6f9 --- /dev/null +++ b/unittests/test_directories/test_yamls/invalidyaml.yml @@ -0,0 +1,14 @@ +"projectId": 10002 +"archived": false +"Person": + - "firstname": "Miri" + "lastname": "Mueller" + "other": null + "email": "miri.mueller@science.de" + - "firstname": "Mara" + "lastname": "Mueller" + "email": "mara.mueller@science.de" +"start_date": "2022-03-01" +"candidates": ["Mouse", "Penguine"] +"rvalue": 0.4444 +"url": "https://site.de/index.php/" diff --git a/unittests/test_directories/test_yamls/testyaml.schema.json b/unittests/test_directories/test_yamls/testyaml.schema.json new file mode 100644 index 0000000000000000000000000000000000000000..fc784a61079e4737f1a0176fe4240133f5d1b5d0 --- /dev/null +++ b/unittests/test_directories/test_yamls/testyaml.schema.json @@ -0,0 +1,60 @@ +{ + "title": "Dataset", + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "projectId": { + "type": "integer" + }, + "archived": { + "type": "boolean" + }, + "Person": { + "type": "array", + "items": { + "type": "object", + "properties": { + "firstname": { + "type": "string" + }, + "lastname": { + "type": "string" + }, + "email": { + "type": "string" + } + }, + "required": [ + "firstname", + "lastname", + "email" + ], + "additionalProperties": true + } + }, + "start_date": { + "type": "string", + "format": "date" + }, + "candidates": { + "type": "array", + "items": { + "type": 
"string" + } + }, + "rvalue": { + "type": "number" + }, + "url": { + "type": "string" + } + }, + "required": [ + "name", + "projectId", + "Person" + ], + "additionalProperties": false +} diff --git a/unittests/test_directories/test_yamls/testyaml.yml b/unittests/test_directories/test_yamls/testyaml.yml new file mode 100644 index 0000000000000000000000000000000000000000..d658e187f62b1b66521be385ae34386273b3b98a --- /dev/null +++ b/unittests/test_directories/test_yamls/testyaml.yml @@ -0,0 +1,15 @@ +"name": "DEMO" +"projectId": 10002 +"archived": false +"Person": + - "firstname": "Miri" + "lastname": "Mueller" + "other": null + "email": "miri.mueller@science.de" + - "firstname": "Mara" + "lastname": "Mueller" + "email": "mara.mueller@science.de" +"start_date": "2022-03-01" +"candidates": ["Mouse", "Penguine"] +"rvalue": 0.4444 +"url": "https://site.de/index.php/"