From 51522d707a48a91011e21e8f2cd42238eb1c0011 Mon Sep 17 00:00:00 2001 From: Alexander Schlemmer <a.schlemmer@indiscale.com> Date: Thu, 5 Dec 2024 10:49:03 +0100 Subject: [PATCH] ENH: add numbers from quality check zip report to quality check record --- src/ruqad/crawler.py | 1 + .../resources/crawler-settings/cfood.yaml | 50 ++++++++++++++----- .../crawler-settings/identifiables.yaml | 2 + 3 files changed, 41 insertions(+), 12 deletions(-) diff --git a/src/ruqad/crawler.py b/src/ruqad/crawler.py index 1e5a033..b01ff9f 100644 --- a/src/ruqad/crawler.py +++ b/src/ruqad/crawler.py @@ -38,6 +38,7 @@ def trigger_crawler(target_dir: str) -> bool: file_entity = join(fp[len(target_dir):], fn) file_ent = db.File(file=file_path, path=file_entity) + print(f"retrieve {join(fp, fn)}") file_ent.retrieve() if file_ent.id is None: diff --git a/src/ruqad/resources/crawler-settings/cfood.yaml b/src/ruqad/resources/crawler-settings/cfood.yaml index 5c93338..2821470 100644 --- a/src/ruqad/resources/crawler-settings/cfood.yaml +++ b/src/ruqad/resources/crawler-settings/cfood.yaml @@ -10,6 +10,12 @@ Converters: ROCrateEntity: converter: ROCrateEntityConverter package: caoscrawler.converters + ZipFileConverter: + converter: ZipFileConverter + package: caoscrawler.converters + JSONFileConverter: + converter: JSONFileConverter + package: caoscrawler.converters Transformers: cast_metadata_type: function: cast_metadata_type @@ -28,6 +34,8 @@ DataDir: DataDir: type: Directory match: ^[0-9]+$ + records: + QualityCheck: subtree: QualityReportFile: type: ZipFileConverter @@ -45,15 +53,31 @@ DataDir: type: JSONFileConverter match: ^qc_summary.json$ subtree: - CheckCounts: + SubTree: type: Dict - match_properties: - num_total_checks: ^(?P<num_total>[0-9]+)$ - num_passing_checks: ^(?P<num_passing>[0-9]+)$ - records: - QualityCheck: - numTotalChecks: $num_total - numPassingChecks: $num_passing + match: .* + subtree: + CheckCounts: + match: check_counts + type: Dict + match_properties: + num_total_checks: ^(?P<num_total>[0-9]+)$ + num_passing_checks: ^(?P<num_passing>[0-9]+)$ + transform: + cast_num_total_to_int: + in: $num_total + out: $num_total + functions: + - cast_to_int: + cast_num_passing_to_int: + in: $num_passing + out: $num_passing + functions: + - cast_to_int: + records: + QualityCheck: + numTotalChecks: $num_total + numPassingChecks: $num_passing records: ELNFileElement: parents: @@ -61,15 +85,15 @@ DataDir: role: File file: $ELNFile path: $ELNFile - QualityCheck: - ELNFile: $ELNFileElement - FAIRMetadataCheck: true QualityReportFileElement: parents: - QualityReportFile role: File file: $QualityReportFile path: $QualityReportFile + QualityCheck: + ELNFile: $ELNFileElement + QualityReportFile: $QualityReportFileElement ELNFile: type: ELNFile match: ^.*\.eln$ @@ -82,6 +106,7 @@ DataDir: path: $ELNFile QualityCheck: ELNFile: $ELNFileElement + FAIRMetadataCheck: true subtree: AuthorDataset: type: ROCrateEntity @@ -115,7 +140,8 @@ DataDir: dateModified: $dateModified dateCreated: $dateCreated ELNFile: $ELNFileElement - QualityReportFile: $QualityReportFileElement + # QualityReportFile: $QualityReportFileElement # reference not possible on this level + QualityCheck: $QualityCheck subtree: Description: type: DictElement diff --git a/src/ruqad/resources/crawler-settings/identifiables.yaml b/src/ruqad/resources/crawler-settings/identifiables.yaml index c6d3189..6d61d50 100644 --- a/src/ruqad/resources/crawler-settings/identifiables.yaml +++ b/src/ruqad/resources/crawler-settings/identifiables.yaml @@ -7,3 +7,5 @@ Author: - url Repository: - url +QualityCheck: +- ELNFile -- GitLab