diff --git a/end-to-end-tests/test_crawler.py b/end-to-end-tests/test_crawler.py
index 5090f5d7e33891a9d8426c0fd15631f7235c36e6..258e503dd3d5bbd3cc8ad6119c309aebf1f0faef 100644
--- a/end-to-end-tests/test_crawler.py
+++ b/end-to-end-tests/test_crawler.py
@@ -30,5 +30,5 @@ def test_crawl():
     crawl a directory as it would be created by export from kadi and running a data quality check
     """
     print(os.listdir(DATADIR))
-    trigger_crawler(os.fspath(DATADIR))
-    raise NotImplementedError("Test not implemented.")
+    retval = trigger_crawler(os.fspath(DATADIR))
+    assert retval
diff --git a/pylinkahead.ini b/pylinkahead.ini
index 51a601e22879416d0f96305ce86b7a690fab2b0f..326f8b6c0ad5f4fb25ea31cd9fa4e738c358ecaf 100644
--- a/pylinkahead.ini
+++ b/pylinkahead.ini
@@ -20,6 +20,7 @@
 username=admin
 password_method=plain
 password=caosdb
+
 
 ## OR: `input`: username is optional, password is entered by the user directly
 # password_method=input
@@ -32,5 +33,5 @@ password=caosdb
 
 ## pip install keyring
 # password_method=keyring
 
-timeout=10000
\ No newline at end of file
+timeout=10000
diff --git a/src/ruqad/crawler.py b/src/ruqad/crawler.py
index bd6a6c49dc80b697d60226a5eb2162deb52a88e3..1e5a0334ad05550bd86cb72a7a92fcba4ddd4689 100644
--- a/src/ruqad/crawler.py
+++ b/src/ruqad/crawler.py
@@ -18,13 +18,15 @@ from caoscrawler.validator import (load_json_schema_from_datamodel_yaml,
 
 ruqad_crawler_settings = resources.files('ruqad').joinpath('resources/crawler-settings')
 
-def trigger_crawler(target_dir: str):
+def trigger_crawler(target_dir: str) -> bool:
     """
     Trigger a standard crawler run equivalent to the command line:
 
     ```
     caosdb-crawler -i crawler/identifiables.yaml -s update crawler/cfood.yaml <target_dir>
     ```
+
+    Return False in case of unsuccessful metadata validation and True otherwise.
     """
 
     # insert all .zip and .eln files, if they do not yet exist
@@ -52,12 +54,14 @@ def trigger_crawler(target_dir: str):
 
     # Remove files from entities:
     records = [r for r in entities if r.role == "Record"]
-    # breakpoint()
     validation = validate(records, schemas)
-    # breakpoint()
+
     if not all([i[0] for i in validation]):
-        print("Metadata validation failed.")
-        sys.exit(1)
+        print("Metadata validation failed. Validation errors:")
+        for v, recordtype in zip(validation, schemas.keys()):
+            if not v[0]:
+                print("{}: {}".format(recordtype, v[1]))
+        return False
 
     print("crawl", target_dir)
     crawler_main(crawled_directory_path=target_dir,
@@ -65,3 +69,5 @@
                  identifiables_definition_file=ruqad_crawler_settings.joinpath(
                      'identifiables.yaml'),
                  remove_prefix="/"+os.path.basename(target_dir))
+
+    return True
diff --git a/src/ruqad/resources/crawler-settings/cfood.yaml b/src/ruqad/resources/crawler-settings/cfood.yaml
index bc5c2635570e7f7c7717ede5dfbaadd879637869..cf3bc7d91f51f723db363348d42bc3a5188b0f65 100644
--- a/src/ruqad/resources/crawler-settings/cfood.yaml
+++ b/src/ruqad/resources/crawler-settings/cfood.yaml
@@ -187,7 +187,7 @@ DataDir:
         AuthorID:
           match_name: "@id"
           type: TextElement
-          match_value: ^(?P<url>(?P<repo>https://.*?)(/users/)(?P<unr>[0-9]+))$
+          match_value: ^(?P<url>(?P<repo>https?://.*?)(/users/)(?P<unr>[0-9]+))$
           transform:
             cast_nr_type:
               in: $unr