diff --git a/src/caoscrawler/crawl.py b/src/caoscrawler/crawl.py index 06059ed1eb532948419bfadd98473333a108203e..d62d00860b531fbc4e7b7b69452f118c5d677bd8 100644 --- a/src/caoscrawler/crawl.py +++ b/src/caoscrawler/crawl.py @@ -229,14 +229,27 @@ class Crawler(object): with open(crawler_definition_path, "r") as f: crawler_definitions = list(yaml.safe_load_all(f)) - if len(crawler_definitions) == 1: - # Simple case, just one document: - crawler_definition = crawler_definitions[0] - elif len(crawler_definitions) == 2: - crawler_definition = crawler_definitions[1] - else: - raise RuntimeError( - "Crawler definition must not contain more than two documents.") + crawler_definition = self._load_definition_from_yaml_dict( + crawler_definitions) + + return self._resolve_validator_paths(crawler_definition, crawler_definition_path) + + def _load_definition_from_yaml_dict(self, crawler_definitions: List[Dict]): + """Load crawler definitions from a list of (yaml) dicts `crawler_definitions` which + contains either one or two documents. + + Doesn't resolve the validator paths in the cfood definition, so for + internal and testing use only. + + """ + if len(crawler_definitions) == 1: + # Simple case, just one document: + crawler_definition = crawler_definitions[0] + elif len(crawler_definitions) == 2: + crawler_definition = crawler_definitions[1] + else: + raise RuntimeError( + "Crawler definition must not contain more than two documents.") # TODO: at this point this function can already load the cfood schema extensions # from the crawler definition and add them to the yaml schema that will be @@ -251,11 +264,16 @@ class Crawler(object): for key in crawler_definition["Converters"]: schema["cfood"]["$defs"]["converter"]["properties"]["type"]["enum"].append( key) + if len(crawler_definitions) == 2: + if "Converters" in crawler_definitions[0]["metadata"]: + for key in crawler_definitions[0]["metadata"]["Converters"]: + schema["cfood"]["$defs"]["converter"]["properties"]["type"]["enum"].append( + key) # Validate the cfood schema: validate(instance=crawler_definition, schema=schema["cfood"]) - return self._resolve_validator_paths(crawler_definition, crawler_definition_path) + return crawler_definition def _resolve_validator_paths(self, definition: dict, definition_path: str): """Resolve path to validation files with respect to the file in which @@ -405,7 +423,8 @@ class Crawler(object): continue elif key == "Converters": continue - converters.append(Converter.converter_factory(value, key, converter_registry)) + converters.append(Converter.converter_factory( + value, key, converter_registry)) return converters @@ -1089,7 +1108,8 @@ def crawler_main(crawled_directory_path: str, # correct the file path: # elem.file = os.path.join(args.path, elem.file) if prefix is None: - raise RuntimeError("No prefix set. Prefix must be set if files are used.") + raise RuntimeError( + "No prefix set. Prefix must be set if files are used.") if elem.path.startswith(prefix): elem.path = elem.path[len(prefix):] elem.file = None diff --git a/unittests/test_converters.py b/unittests/test_converters.py index 15a39b72e75c8f26f3df80c24d48a4c5c2585029..30c5972c4f006aaf9923dfc058c3b861d8b5123b 100644 --- a/unittests/test_converters.py +++ b/unittests/test_converters.py @@ -23,12 +23,17 @@ """ test the converters module """ +import importlib +import os +import pytest +import yaml -from caoscrawler.converters import Converter +from caoscrawler.converters import (Converter, ConverterValidationError, + DictConverter, DirectoryConverter, + handle_value, MarkdownFileConverter, + JSONFileConverter) +from caoscrawler.crawl import Crawler from caoscrawler.stores import GeneralStore -from caoscrawler.converters import (ConverterValidationError, DictConverter, - DirectoryConverter, handle_value, - MarkdownFileConverter, JSONFileConverter) from caoscrawler.structure_elements import (File, DictTextElement, DictListElement, DictElement, DictBooleanElement, DictDictElement, @@ -37,10 +42,6 @@ from caoscrawler.structure_elements import (File, DictTextElement, from test_tool import rfp -import pytest -import os -import importlib - @pytest.fixture def converter_registry(): @@ -348,3 +349,39 @@ def test_filter_children_of_directory(converter_registry): with pytest.raises(RuntimeError): children = dc.create_children(None, test_dir) + + +def test_validate_custom_converters(): + one_doc_yaml = """ +Converters: + MyNewType: + converter: MyNewTypeConverter + package: some_package.my_converters +MyElement: + type: MyNewType + match: something + """ + crawler1 = Crawler() + one_doc_definitions = crawler1._load_definition_from_yaml_dict( + [yaml.load(one_doc_yaml, Loader=yaml.SafeLoader)]) + assert "MyElement" in one_doc_definitions + assert one_doc_definitions["MyElement"]["type"] == "MyNewType" + + # this has to be equivalent + two_doc_yaml = """ +--- +metadata: + Converters: + MyNewType: + converter: MyNewTypeConverter + package: some_package.my_converters +--- +MyElement: + type: MyNewType + match: something + """ + crawler2 = Crawler() + two_doc_definitions = crawler2._load_definition_from_yaml_dict( + list(yaml.safe_load_all(two_doc_yaml))) + assert "MyElement" in two_doc_definitions + assert two_doc_definitions["MyElement"]["type"] == one_doc_definitions["MyElement"]["type"]