diff --git a/src/newcrawler/crawl.py b/src/newcrawler/crawl.py
index d40dcdfde56a45c7ec088347c87a59b5d3872c44..64a8e457a4f8e94ef49df8ccd59b9a3f20c85ea5 100644
--- a/src/newcrawler/crawl.py
+++ b/src/newcrawler/crawl.py
@@ -200,7 +200,28 @@ class Crawler(object):
         # Validate the cfood schema:
         validate(instance=crawler_definition, schema=schema["cfood"])
 
-        return crawler_definition
+        return self._resolve_validator_paths(crawler_definition, crawler_definition_path)
+
+    def _resolve_validator_paths(self, definition: dict, definition_path: str):
+        """Resolve path to validation files with respect to the file in which
+        the crawler was defined.
+
+        """
+
+        for key, value in definition.items():
+
+            if key == "validate" and isinstance(value, str):
+                # Validator is given by a path
+                if not value.startswith('/'):
+                    # Not an absolute path
+                    definition[key] = os.path.join(
+                        os.path.dirname(definition_path), value)
+            elif isinstance(value, dict):
+                # Recursively resolve all validators
+                definition[key] = self._resolve_validator_paths(
+                    value, definition_path)
+
+        return definition
 
     def load_converters(self, definition: dict):
         """
@@ -294,7 +315,8 @@ class Crawler(object):
                 continue
             elif key == "Converters":
                 continue
-            local_converters.append(Converter.converter_factory(value, key, converter_registry))
+            local_converters.append(Converter.converter_factory(
+                value, key, converter_registry))
 
         return local_converters
 
@@ -384,11 +406,13 @@ class Crawler(object):
                         if isinstance(el, db.Entity):
                             if el not in flat:
                                 flat.append(el)
-                            self.create_flat_list([el], flat)  # TODO: move inside if block?
+                            # TODO: move inside if block?
+                            self.create_flat_list([el], flat)
                 elif isinstance(p.value, db.Entity):
                     if p.value not in flat:
                         flat.append(p.value)
-                    self.create_flat_list([p.value], flat)  # TODO: move inside if block?
+                    # TODO: move inside if block?
+                    self.create_flat_list([p.value], flat)
 
     def all_references_are_existing_already(self, record: db.Record):
         """
@@ -420,7 +444,8 @@ class Crawler(object):
                 for el in p.value:
                     lst = []
                     if (isinstance(el, db.Entity) and el.id is None):
-                        cached = self.get_identified_record_from_local_cache(el)
+                        cached = self.get_identified_record_from_local_cache(
+                            el)
                         if cached is None:
                             raise RuntimeError("Not in cache.")
                         if not check_identical(cached, el, True):
@@ -530,7 +555,8 @@ class Crawler(object):
                     # This record is a duplicate that can be removed. Make sure we do not lose
                     # information
                     # Update an (local) identified record that will be inserted
-                    newrecord = self.get_identified_record_from_local_cache(record)
+                    newrecord = self.get_identified_record_from_local_cache(
+                        record)
                     self.copy_attributes(fro=record, to=newrecord)
                     # Bend references to the other object
                     # TODO refactor this
@@ -612,7 +638,8 @@ class Crawler(object):
                 resolved_references = True
 
         if len(flat) > 0:
-            raise RuntimeError("Could not resolve all Entity references. Circular Dependency?")
+            raise RuntimeError(
+                "Could not resolve all Entity references. Circular Dependency?")
 
         return to_be_inserted, to_be_updated
 
@@ -698,7 +725,8 @@ class Crawler(object):
         if self.identifiableAdapter is None:
             raise RuntimeError("Should not happen.")
 
-        to_be_inserted, to_be_updated = self.split_into_inserts_and_updates(updateList)
+        to_be_inserted, to_be_updated = self.split_into_inserts_and_updates(
+            updateList)
 
         # remove unnecessary updates from list
         for el in to_be_updated:
@@ -727,7 +755,8 @@ class Crawler(object):
             d = Crawler.debug_build_usage_tree(subconv)
             k = list(d.keys())
             if len(k) != 1:
-                raise RuntimeError("Unkonwn error during building of usage tree.")
+                raise RuntimeError(
+                    "Unkonwn error during building of usage tree.")
             res[converter.name]["subtree"][k[0]] = d[k[0]]
 
         return res
@@ -794,7 +823,8 @@ class Crawler(object):
                     keys_modified = converter.create_records(
                         generalStore_copy, recordStore_copy, element)
 
-                    children = converter.create_children(generalStore_copy, element)
+                    children = converter.create_children(
+                        generalStore_copy, element)
                     if self.debug:
                         # add provenance information for each varaible
                         self.debug_tree[str(element)] = (
@@ -806,8 +836,10 @@ class Crawler(object):
                         mod_info = self.debug_metadata["provenance"]
                         for record_name, prop_name in keys_modified:
                             # TODO: check
-                            internal_id = recordStore_copy.get_internal_id(record_name)
-                            record_identifier = record_name + "_" + str(internal_id)
+                            internal_id = recordStore_copy.get_internal_id(
+                                record_name)
+                            record_identifier = record_name + \
+                                "_" + str(internal_id)
                             converter.metadata["usage"].add(record_identifier)
                             mod_info[record_identifier][prop_name] = (structure_elements_path + [element.get_name()],
                                                                       converters_path + [converter.name])
diff --git a/unittests/test_json.py b/unittests/test_json.py
index 85635f32d7d5e5044d3f23844782c7cafafa82aa..12498b0f58ffbfc7d303d59ea84506e4d214744c 100644
--- a/unittests/test_json.py
+++ b/unittests/test_json.py
@@ -38,9 +38,9 @@ from test_tool import rfp, dircheckstr
 
 
 def test_json():
-    crawler_definition_path = rfp("test_directories", "single_file_test_data",
+    crawler_definition_path = rfp("test_directories", "examples_json",
                                   "jsontest_cfood.yml")
-    json_file_path = rfp("test_directories", "single_file_test_data", "testjson.json")
+    json_file_path = rfp("test_directories", "examples_json", "testjson.json")
 
     crawler = Crawler(debug=True)
     crawler_definition = crawler.load_definition(crawler_definition_path)
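
Note on the new Crawler._resolve_validator_paths method: relative "validate" paths in a
cfood definition are now resolved against the directory that contains the cfood file,
while absolute paths are left untouched. Below is a minimal standalone sketch of that
behavior; the free function name, the cfood structure, and the file paths are invented
for illustration only (in the patch the logic lives as a method on Crawler).

    import os

    def resolve_validator_paths(definition: dict, definition_path: str) -> dict:
        # Walk the definition recursively and prefix relative "validate" entries
        # with the directory of the cfood file, mirroring the logic added in crawl.py.
        for key, value in definition.items():
            if key == "validate" and isinstance(value, str) and not value.startswith('/'):
                definition[key] = os.path.join(os.path.dirname(definition_path), value)
            elif isinstance(value, dict):
                definition[key] = resolve_validator_paths(value, definition_path)
        return definition

    cfood = {"Experiment": {"validate": "schemas/experiment.schema.json"}}
    print(resolve_validator_paths(cfood, "/home/user/cfoods/my_cfood.yml"))
    # {'Experiment': {'validate': '/home/user/cfoods/schemas/experiment.schema.json'}}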