diff --git a/README.md b/README.md index a09687e4877ba90c9216ac23f611285ecc911976..7f35de5582cc7b602d4379ec2a2928ee5274a462 100644 --- a/README.md +++ b/README.md @@ -40,6 +40,16 @@ analysis. For more information, look at the section "SCA" below. - Then run `make unittest` or `pytest unittests/`. ### E2E Tests + +#### LinkAhead setup #### + +- Start an (empty) LinkAhead instance. +- Set up `pylinkahead.ini` to use this instance. +- Call `./insert_datamodel.sh` in the `linkahead-setup` directory to set up the datamodel in the + LinkAhead instance. + +#### KADI token #### + In order to run the E2E test, you need to create a personal access token (pat) in the public [demo instance](https://demo-kadi4mat.iam.kit.edu). You can then run the test as follows: `KADITOKEN=<token> python -m pytest end-to-end-tests/test_kadi.py` diff --git a/end-to-end-tests/data/crawler_data/ruqad/1222/report.zip b/end-to-end-tests/data/crawler_data/ruqad/1222/report.zip index 24b60a48f33cd9499ddbcd797cab3ad4be1b9746..56dc42703c5bdf8c1936ae6e86f2e981e86770cc 100644 Binary files a/end-to-end-tests/data/crawler_data/ruqad/1222/report.zip and b/end-to-end-tests/data/crawler_data/ruqad/1222/report.zip differ diff --git a/end-to-end-tests/data/crawler_data/ruqad/1222/report.zip_old b/end-to-end-tests/data/crawler_data/ruqad/1222/report.zip_old new file mode 100644 index 0000000000000000000000000000000000000000..24b60a48f33cd9499ddbcd797cab3ad4be1b9746 Binary files /dev/null and b/end-to-end-tests/data/crawler_data/ruqad/1222/report.zip_old differ diff --git a/end-to-end-tests/data/crawler_data/ruqad/1223/export.eln b/end-to-end-tests/data/crawler_data/ruqad/1223/export.eln new file mode 100644 index 0000000000000000000000000000000000000000..7ac35e47d62ce4a5c4c68eb0056cc8b447faa2e8 Binary files /dev/null and b/end-to-end-tests/data/crawler_data/ruqad/1223/export.eln differ diff --git a/end-to-end-tests/data/crawler_data/ruqad/1223/report.zip 
b/end-to-end-tests/data/crawler_data/ruqad/1223/report.zip new file mode 100644 index 0000000000000000000000000000000000000000..56dc42703c5bdf8c1936ae6e86f2e981e86770cc Binary files /dev/null and b/end-to-end-tests/data/crawler_data/ruqad/1223/report.zip differ diff --git a/end-to-end-tests/test_crawler.py b/end-to-end-tests/test_crawler.py index 258e503dd3d5bbd3cc8ad6119c309aebf1f0faef..66e73a6b77acc7249d25cd96147ab053f3422139 100644 --- a/end-to-end-tests/test_crawler.py +++ b/end-to-end-tests/test_crawler.py @@ -18,6 +18,7 @@ tests the crawling of ELN files """ import os +import re from pathlib import Path from ruqad.crawler import trigger_crawler @@ -25,10 +26,40 @@ from ruqad.crawler import trigger_crawler DATADIR = Path(__file__).parent / "data" / "crawler_data" -def test_crawl(): +def test_crawl(capsys): """ crawl a directory as it would be created by export from kadi and running a data quality check """ + print(f"\nData directory: {DATADIR}") print(os.listdir(DATADIR)) - retval = trigger_crawler(os.fspath(DATADIR)) + + retval, ent_qc = trigger_crawler(os.fspath(DATADIR)) + + stdout, stderr = capsys.readouterr() + + # Check whether the warning is displayed for the license check: + assert "/1222/export.eln does not contain a license." in stdout + assert "/1223/export.eln does not contain a license." 
not in stdout + + # Check that validation of metadata was successful: assert retval + + # Check that license was present in 1223 and absent in 1222: + qc = {} + for ent in ent_qc: + pth = ent.get_property("ELNFile").value.path + # Get folder name ("1222" or "1223") + match = re.match("/.*/.*/(?P<folder>[0-9]+)/.*\\.eln", pth) + assert match is not None + qc[match.group("folder")] = ent + + assert qc["1223"].get_property("FAIRLicenseCheck").value + assert not qc["1222"].get_property("FAIRLicenseCheck").value + + # Check whether the information from "report.zip" is present: + for total, passed, d in ((20, 18, "1222"), + (20, 18, "1223")): + assert type(qc[d].get_property("numTotalChecks").value) == int + assert type(qc[d].get_property("numPassingChecks").value) == int + assert qc[d].get_property("numTotalChecks").value == total + assert qc[d].get_property("numPassingChecks").value == passed diff --git a/linkahead-setup/datamodel.yaml b/linkahead-setup/datamodel.yaml deleted file mode 100644 index f63e1dc21de0cd227b84b49ec3361d31bc858599..0000000000000000000000000000000000000000 --- a/linkahead-setup/datamodel.yaml +++ /dev/null @@ -1,44 +0,0 @@ -Dataset: - obligatory_properties: - Author: - Repository: - dateModified: - datatype: DATETIME - dateCreated: - datatype: DATETIME - recommended_properties: - MetaData: - datatype: LIST<MetaData> - notes: - datatype: TEXT - rating: - datatype: INTEGER - voltage: - datatype: DOUBLE - unit: V - - -MetaData: - obligatory_properties: - v: - datatype: TEXT - -Author: - obligatory_properties: - url: - datatype: TEXT - recommended_properties: - nr: - datatype: INTEGER - -Repository: - obligatory_properties: - url: - -ELNFile: - recommended_properties: - QualityReportFile: - -QualityReportFile: - recommended_properties: - ELNFile: diff --git a/linkahead-setup/datamodel.yaml b/linkahead-setup/datamodel.yaml new file mode 120000 index 0000000000000000000000000000000000000000..72b936d5f2eeb8fbf98fe4bd1fbf12e11393a06f --- /dev/null +++ 
b/linkahead-setup/datamodel.yaml @@ -0,0 +1 @@ +../src/ruqad/resources/crawler-settings/datamodel.yaml \ No newline at end of file diff --git a/pylinkahead.ini b/pylinkahead.ini index 326f8b6c0ad5f4fb25ea31cd9fa4e738c358ecaf..fb782ac7944295c18eff64df8698a497adf2ef1d 100644 --- a/pylinkahead.ini +++ b/pylinkahead.ini @@ -4,8 +4,8 @@ # - the location given in the env variable PYLINKAHEADINI [Connection] -url=https://demo.indiscale.com/ -#url=https://localhost:10443/ +# url=https://demo.indiscale.com/ +url=https://localhost:10443/ ## If this option is set, the SSL certificate will be ignored. Use with care! ssl_insecure=1 @@ -17,8 +17,8 @@ username=admin ## ## DEFAULT: the password method is `plain`, now the password must be saved as ## plain text. -password_method=plain -password=caosdb +password_method=input +# password=caosdb ## OR: `input`: username is optional, password is entered by the user directly diff --git a/src/ruqad/crawler.py b/src/ruqad/crawler.py index 1e5a0334ad05550bd86cb72a7a92fcba4ddd4689..cf3284f506e84657faaca2be1fad016893c384a1 100644 --- a/src/ruqad/crawler.py +++ b/src/ruqad/crawler.py @@ -18,7 +18,7 @@ from caoscrawler.validator import (load_json_schema_from_datamodel_yaml, ruqad_crawler_settings = resources.files('ruqad').joinpath('resources/crawler-settings') -def trigger_crawler(target_dir: str) -> bool: +def trigger_crawler(target_dir: str) -> tuple[bool, list[db.Entity]]: """ Trigger a standard crawler run equivalent to the command line: @@ -26,7 +26,13 @@ def trigger_crawler(target_dir: str) -> bool: caosdb-crawler -i crawler/identifiables.yaml -s update crawler/cfood.yaml <target_dir> ``` - Return False in case of unsuccessful metadata validation and True otherwise. + Returns + ------- + + out: tuple[bool, list[db.Entity]] + - 1st element of tuple: ``False`` in case of unsuccessful metadata validation + and ``True`` otherwise. + - 2nd element of tuple: list of quality check records. 
""" # insert all .zip and .eln files, if they do not yet exist @@ -38,6 +44,7 @@ def trigger_crawler(target_dir: str) -> bool: file_entity = join(fp[len(target_dir):], fn) file_ent = db.File(file=file_path, path=file_entity) + print(f"retrieve {join(fp, fn)}") file_ent.retrieve() if file_ent.id is None: @@ -52,6 +59,17 @@ def trigger_crawler(target_dir: str) -> bool: entities = scan_directory(target_dir, ruqad_crawler_settings.joinpath('cfood.yaml')) + ent_qc = [] # Quality check result records + + # Show warning if license is not present in an eln file: + for ent in entities: + if not (len(ent.parents) == 1 and ent.parents[0].name == "QualityCheck"): + continue + ent_qc.append(ent) + + if not ent.get_property("FAIRLicenseCheck").value: + print("{} does not contain a license.".format(ent.get_property("ELNFile").value.path)) + # Remove files from entities: records = [r for r in entities if r.role == "Record"] validation = validate(records, schemas) @@ -61,7 +79,7 @@ def trigger_crawler(target_dir: str) -> bool: for v, recordtype in zip(validation, schemas.keys()): if not v[0]: print("{}: {}".format(recordtype, v[1])) - return False + return (False, ent_qc) print("crawl", target_dir) crawler_main(crawled_directory_path=target_dir, @@ -70,4 +88,4 @@ def trigger_crawler(target_dir: str) -> bool: 'identifiables.yaml'), remove_prefix="/"+os.path.basename(target_dir)) - return True + return (True, ent_qc) diff --git a/src/ruqad/resources/crawler-settings/cfood.yaml b/src/ruqad/resources/crawler-settings/cfood.yaml index cf3bc7d91f51f723db363348d42bc3a5188b0f65..3590556fe7a91c2ce4388fef2e219773116bbc1a 100644 --- a/src/ruqad/resources/crawler-settings/cfood.yaml +++ b/src/ruqad/resources/crawler-settings/cfood.yaml @@ -1,6 +1,6 @@ --- metadata: - crawler-version: 0.9.2 + crawler-version: 0.10.2 macros: --- Converters: @@ -10,6 +10,12 @@ Converters: ROCrateEntity: converter: ROCrateEntityConverter package: caoscrawler.converters + ZipFileConverter: + converter: 
ZipFileConverter + package: caoscrawler.converters + JSONFileConverter: + converter: JSONFileConverter + package: caoscrawler.converters Transformers: cast_metadata_type: function: cast_metadata_type @@ -28,9 +34,15 @@ DataDir: DataDir: type: Directory match: ^[0-9]+$ + records: + QualityCheck: + FAIRMetadataCheck: true + FAIRPIDCheck: false + FAIRLicenseCheck: false + FAIRProvenanceCheck: false subtree: QualityReportFile: - type: SimpleFile + type: ZipFileConverter match: ^report\.zip$ transform: elnfilename: @@ -40,6 +52,36 @@ DataDir: - replace: remove: report.zip insert: export.eln + subtree: + SummaryFile: + type: JSONFileConverter + match: ^qc_summary.json$ + subtree: + SubTree: + type: Dict + match: .* + subtree: + CheckCounts: + match: check_counts + type: Dict + match_properties: + num_total_checks: ^(?P<num_total>[0-9]+)$ + num_passing_checks: ^(?P<num_passing>[0-9]+)$ + transform: + cast_num_total_to_int: + in: $num_total + out: $num_total + functions: + - cast_to_int: + cast_num_passing_to_int: + in: $num_passing + out: $num_passing + functions: + - cast_to_int: + records: + QualityCheck: + numTotalChecks: $num_total + numPassingChecks: $num_passing records: ELNFileElement: parents: @@ -53,32 +95,26 @@ DataDir: role: File file: $QualityReportFile path: $QualityReportFile + QualityCheck: ELNFile: $ELNFileElement + QualityReportFile: $QualityReportFileElement ELNFile: type: ELNFile - transform: - qualityfilename: - in: $ELNFile - out: $QualityReportFile - functions: - - replace: - insert: report.zip - remove: export.eln match: ^.*\.eln$ records: - QualityReportFileElement: - parents: - - QualityReportFile - role: File - file: $QualityReportFile - path: $QualityReportFile ELNFileElement: parents: - ELNFile role: File file: $ELNFile path: $ELNFile - QualityReportFile: $QualityReportFileElement + + QualityCheck: + ELNFile: $ELNFileElement + + Dataset: + ELNFile: $ELNFileElement + QualityCheck: $QualityCheck subtree: AuthorDataset: type: ROCrateEntity @@ 
-96,6 +132,16 @@ DataDir: match_properties: "@id": \./$ subtree: + + License: + type: TextElement + match_name: license + match_value: ^(?P<license>.*)$ + records: + QualityCheck: + FAIRLicenseCheck: true + Dataset: + license: $license Dataset: type: ROCrateEntity @@ -111,9 +157,11 @@ DataDir: description: $description dateModified: $dateModified dateCreated: $dateCreated - ELNFile: $ELNFileElement - QualityReportFile: $QualityReportFileElement + + # QualityReportFile: $QualityReportFileElement # reference not possible on this level + subtree: + Description: type: DictElement match_name: description @@ -156,28 +204,6 @@ DataDir: records: Dataset: $propid: $propvalue - - # MetaData: - # type: DictElement - # records: - # MetaData: - # Dataset: - # MetaData: +$MetaData - # subtree: - # PropertyID: - # type: TextElement - # match_name: propertyID - # match_value: (?P<propid>.*)$ - # records: - # MetaData: - # name: $propid - # PropertyValue: - # type: TextElement - # match_name: value - # match_value: (?P<propvalue>.*)$ - # records: - # MetaData: - # v: $propvalue Author: # breakpoint: true type: DictElement @@ -203,4 +229,6 @@ DataDir: Dataset: Author: $Author Repository: $Repository + QualityCheck: + FAIRProvenanceCheck: true diff --git a/src/ruqad/resources/crawler-settings/datamodel.yaml b/src/ruqad/resources/crawler-settings/datamodel.yaml index f101a76cb4290755840fbd98e4da05375f863dc0..f16f0cf70cfdc199d172650a512ba1846f457753 100644 --- a/src/ruqad/resources/crawler-settings/datamodel.yaml +++ b/src/ruqad/resources/crawler-settings/datamodel.yaml @@ -16,6 +16,9 @@ Dataset: voltage: datatype: DOUBLE unit: V + QualityCheck: + license: + datatype: TEXT MetaData: @@ -35,10 +38,24 @@ Repository: obligatory_properties: url: -# ELNFile: -# recommended_properties: -# QualityReportFile: -# -# QualityReportFile: -# recommended_properties: -# ELNFile: +ELNFile: + +QualityReportFile: + +QualityCheck: + obligatory_properties: + ELNFile: + recommended_properties: + 
QualityReportFile: + numTotalChecks: + datatype: INTEGER + numPassingChecks: + datatype: INTEGER + FAIRLicenseCheck: + datatype: BOOLEAN + FAIRMetadataCheck: + datatype: BOOLEAN + FAIRPIDCheck: + datatype: BOOLEAN + FAIRProvenanceCheck: + datatype: BOOLEAN diff --git a/src/ruqad/resources/crawler-settings/identifiables.yaml b/src/ruqad/resources/crawler-settings/identifiables.yaml index c6d31894ea25350bda5aa3018062d3d737a763a7..6d61d506b620ee78183f479ee6036f23a4bad90e 100644 --- a/src/ruqad/resources/crawler-settings/identifiables.yaml +++ b/src/ruqad/resources/crawler-settings/identifiables.yaml @@ -7,3 +7,5 @@ Author: - url Repository: - url +QualityCheck: +- ELNFile