diff --git a/src/ruqad/crawler.py b/src/ruqad/crawler.py index 1e5a0334ad05550bd86cb72a7a92fcba4ddd4689..b01ff9fc80175ff08c68c01f0df5a987702e3146 100644 --- a/src/ruqad/crawler.py +++ b/src/ruqad/crawler.py @@ -38,6 +38,7 @@ def trigger_crawler(target_dir: str) -> bool: file_entity = join(fp[len(target_dir):], fn) file_ent = db.File(file=file_path, path=file_entity) + print(f"retrieve {join(fp, fn)}") file_ent.retrieve() if file_ent.id is None: diff --git a/src/ruqad/resources/crawler-settings/cfood.yaml b/src/ruqad/resources/crawler-settings/cfood.yaml index 5c933380b71afd5853ebc662b351aa21594f99a4..28214703b105c90efef519b5638dd8bbeec7b105 100644 --- a/src/ruqad/resources/crawler-settings/cfood.yaml +++ b/src/ruqad/resources/crawler-settings/cfood.yaml @@ -10,6 +10,12 @@ Converters: ROCrateEntity: converter: ROCrateEntityConverter package: caoscrawler.converters + ZipFileConverter: + converter: ZipFileConverter + package: caoscrawler.converters + JSONFileConverter: + converter: JSONFileConverter + package: caoscrawler.converters Transformers: cast_metadata_type: function: cast_metadata_type @@ -28,6 +34,8 @@ DataDir: DataDir: type: Directory match: ^[0-9]+$ + records: + QualityCheck: subtree: QualityReportFile: type: ZipFileConverter @@ -45,15 +53,31 @@ DataDir: type: JSONFileConverter match: ^qc_summary.json$ subtree: - CheckCounts: + SubTree: type: Dict - match_properties: - num_total_checks: ^(?P<num_total>[0-9]+)$ - num_passing_checks: ^(?P<num_passing>[0-9]+)$ - records: - QualityCheck: - numTotalChecks: $num_total - numPassingChecks: $num_passing + match: .* + subtree: + CheckCounts: + match: check_counts + type: Dict + match_properties: + num_total_checks: ^(?P<num_total>[0-9]+)$ + num_passing_checks: ^(?P<num_passing>[0-9]+)$ + transform: + cast_num_total_to_int: + in: $num_total + out: $num_total + functions: + - cast_to_int: + cast_num_passing_to_int: + in: $num_passing + out: $num_passing + functions: + - cast_to_int: + records: + QualityCheck: + numTotalChecks: $num_total + numPassingChecks: $num_passing records: ELNFileElement: parents: @@ -61,15 +85,15 @@ DataDir: role: File file: $ELNFile path: $ELNFile - QualityCheck: - ELNFile: $ELNFileElement - FAIRMetadataCheck: true QualityReportFileElement: parents: - QualityReportFile role: File file: $QualityReportFile path: $QualityReportFile + QualityCheck: + ELNFile: $ELNFileElement + QualityReportFile: $QualityReportFileElement ELNFile: type: ELNFile match: ^.*\.eln$ @@ -82,6 +106,7 @@ DataDir: path: $ELNFile QualityCheck: ELNFile: $ELNFileElement + FAIRMetadataCheck: true subtree: AuthorDataset: type: ROCrateEntity @@ -115,7 +140,8 @@ DataDir: dateModified: $dateModified dateCreated: $dateCreated ELNFile: $ELNFileElement - QualityReportFile: $QualityReportFileElement + # QualityReportFile: $QualityReportFileElement # reference not possible on this level + QualityCheck: $QualityCheck subtree: Description: type: DictElement diff --git a/src/ruqad/resources/crawler-settings/identifiables.yaml b/src/ruqad/resources/crawler-settings/identifiables.yaml index c6d31894ea25350bda5aa3018062d3d737a763a7..6d61d506b620ee78183f479ee6036f23a4bad90e 100644 --- a/src/ruqad/resources/crawler-settings/identifiables.yaml +++ b/src/ruqad/resources/crawler-settings/identifiables.yaml @@ -7,3 +7,5 @@ Author: - url Repository: - url +QualityCheck: +- ELNFile