diff --git a/end-to-end-tests/test_crawler.py b/end-to-end-tests/test_crawler.py index 258e503dd3d5bbd3cc8ad6119c309aebf1f0faef..23d40379e88368922875f5890de2bae1fffcce72 100644 --- a/end-to-end-tests/test_crawler.py +++ b/end-to-end-tests/test_crawler.py @@ -18,6 +18,7 @@ tests the crawling of ELN files """ import os +import re from pathlib import Path from ruqad.crawler import trigger_crawler @@ -30,5 +31,18 @@ def test_crawl(): crawl a directory as it would be created by export from kadi and running a data quality check """ print(os.listdir(DATADIR)) - retval = trigger_crawler(os.fspath(DATADIR)) + retval, ent_qc = trigger_crawler(os.fspath(DATADIR)) + + # Check that validation of metadata was successful: assert retval + + # Check that license was present in 1223 and absent in 1222: + qc = {} + for ent in ent_qc: + pth = ent.get_property("ELNFile").value.path + match = re.match("/.*/.*/(?P<folder>[0-9]+)/.*\.eln", pth) + assert match is not None + qc[match.group("folder")] = ent + + assert qc["1223"].get_property("FAIRLicenseCheck").value + assert not qc["1222"].get_property("FAIRLicenseCheck").value diff --git a/src/ruqad/crawler.py b/src/ruqad/crawler.py index b01ff9fc80175ff08c68c01f0df5a987702e3146..2dc803a4404499f115df207796bc789d01b7618b 100644 --- a/src/ruqad/crawler.py +++ b/src/ruqad/crawler.py @@ -18,7 +18,7 @@ from caoscrawler.validator import (load_json_schema_from_datamodel_yaml, ruqad_crawler_settings = resources.files('ruqad').joinpath('resources/crawler-settings') -def trigger_crawler(target_dir: str) -> bool: +def trigger_crawler(target_dir: str) -> tuple[bool, list[db.Entity]]: """ Trigger a standard crawler run equivalent to the command line: @@ -26,7 +26,10 @@ def trigger_crawler(target_dir: str) -> bool: caosdb-crawler -i crawler/identifiables.yaml -s update crawler/cfood.yaml <target_dir> ``` - Return False in case of unsuccessful metadata validation and True otherwise. + A tuple: + - 1st element of tuple: Return False in case of unsuccessful metadata validation + and True otherwise. + - 2nd element of tuple: list of quality check records. """ # insert all .zip and .eln files, if they do not yet exist @@ -53,6 +56,17 @@ def trigger_crawler(target_dir: str) -> bool: entities = scan_directory(target_dir, ruqad_crawler_settings.joinpath('cfood.yaml')) + ent_qc = [] + + # Show warning if license is not present in an eln file: + for ent in entities: + if not (len(ent.parents) == 1 and ent.parents[0].name == "QualityCheck"): + continue + ent_qc.append(ent) + + if not ent.get_property("FAIRLicenseCheck").value: + print("{} does not contain a license.".format(ent.get_property("ELNFile").value.path)) + # Remove files from entities: records = [r for r in entities if r.role == "Record"] validation = validate(records, schemas) @@ -62,7 +76,7 @@ def trigger_crawler(target_dir: str) -> bool: for v, recordtype in zip(validation, schemas.keys()): if not v[0]: print("{}: {}".format(recordtype, v[1])) - return False + return (False, ent_qc) print("crawl", target_dir) crawler_main(crawled_directory_path=target_dir, @@ -71,4 +85,4 @@ def trigger_crawler(target_dir: str) -> bool: 'identifiables.yaml'), remove_prefix="/"+os.path.basename(target_dir)) - return True + return (True, ent_qc)