diff --git a/src/caoscrawler/crawl.py b/src/caoscrawler/crawl.py index 06059ed1eb532948419bfadd98473333a108203e..f8edec54cefec42964db8252f5cadb55dd813b35 100644 --- a/src/caoscrawler/crawl.py +++ b/src/caoscrawler/crawl.py @@ -229,14 +229,27 @@ class Crawler(object): with open(crawler_definition_path, "r") as f: crawler_definitions = list(yaml.safe_load_all(f)) - if len(crawler_definitions) == 1: - # Simple case, just one document: - crawler_definition = crawler_definitions[0] - elif len(crawler_definitions) == 2: - crawler_definition = crawler_definitions[1] - else: - raise RuntimeError( - "Crawler definition must not contain more than two documents.") + crawler_definition = self._load_definition_from_yaml_dict( + crawler_definitions) + + return self._resolve_validator_paths(crawler_definition, crawler_definition_path) + + def _load_definition_from_yaml_dict(self, crawler_definitions: List[Dict]): + """Load crawler definitions from a list of (yaml) dicts `crawler_definitions` which + contains either one or two documents. + + Doesn't resolve the validator paths in the cfood definition, so for + internal and testing use only. 
+ + """ + if len(crawler_definitions) == 1: + # Simple case, just one document: + crawler_definition = crawler_definitions[0] + elif len(crawler_definitions) == 2: + crawler_definition = crawler_definitions[1] + else: + raise RuntimeError( + "Crawler definition must not contain more than two documents.") # TODO: at this point this function can already load the cfood schema extensions # from the crawler definition and add them to the yaml schema that will be @@ -251,11 +264,16 @@ class Crawler(object): for key in crawler_definition["Converters"]: schema["cfood"]["$defs"]["converter"]["properties"]["type"]["enum"].append( key) + if len(crawler_definitions) == 2: + if "Converters" in crawler_definitions[0]["metadata"]: + for key in crawler_definitions[0]["metadata"]["Converters"]: + schema["cfood"]["$defs"]["converter"]["properties"]["type"]["enum"].append( + key) # Validate the cfood schema: validate(instance=crawler_definition, schema=schema["cfood"]) - return self._resolve_validator_paths(crawler_definition, crawler_definition_path) + return crawler_definition def _resolve_validator_paths(self, definition: dict, definition_path: str): """Resolve path to validation files with respect to the file in which @@ -405,7 +423,8 @@ class Crawler(object): continue elif key == "Converters": continue - converters.append(Converter.converter_factory(value, key, converter_registry)) + converters.append(Converter.converter_factory( + value, key, converter_registry)) return converters @@ -1089,7 +1108,8 @@ def crawler_main(crawled_directory_path: str, # correct the file path: # elem.file = os.path.join(args.path, elem.file) if prefix is None: - raise RuntimeError("No prefix set. Prefix must be set if files are used.") + raise RuntimeError( + "No prefix set. Prefix must be set if files are used.") if elem.path.startswith(prefix): elem.path = elem.path[len(prefix):] elem.file = None