From 51522d707a48a91011e21e8f2cd42238eb1c0011 Mon Sep 17 00:00:00 2001
From: Alexander Schlemmer <a.schlemmer@indiscale.com>
Date: Thu, 5 Dec 2024 10:49:03 +0100
Subject: [PATCH] ENH: add numbers from quality check zip report to quality
 check record

---
 src/ruqad/crawler.py                          |  1 +
 .../resources/crawler-settings/cfood.yaml     | 50 ++++++++++++++-----
 .../crawler-settings/identifiables.yaml       |  2 +
 3 files changed, 41 insertions(+), 12 deletions(-)

diff --git a/src/ruqad/crawler.py b/src/ruqad/crawler.py
index 1e5a033..b01ff9f 100644
--- a/src/ruqad/crawler.py
+++ b/src/ruqad/crawler.py
@@ -38,6 +38,7 @@ def trigger_crawler(target_dir: str) -> bool:
                 file_entity = join(fp[len(target_dir):], fn)
                 file_ent = db.File(file=file_path,
                                    path=file_entity)
+
                 print(f"retrieve {join(fp, fn)}")
                 file_ent.retrieve()
                 if file_ent.id is None:
diff --git a/src/ruqad/resources/crawler-settings/cfood.yaml b/src/ruqad/resources/crawler-settings/cfood.yaml
index 5c93338..2821470 100644
--- a/src/ruqad/resources/crawler-settings/cfood.yaml
+++ b/src/ruqad/resources/crawler-settings/cfood.yaml
@@ -10,6 +10,12 @@ Converters:
   ROCrateEntity:
     converter: ROCrateEntityConverter
     package: caoscrawler.converters
+  ZipFileConverter:
+    converter: ZipFileConverter
+    package: caoscrawler.converters
+  JSONFileConverter:
+    converter: JSONFileConverter
+    package: caoscrawler.converters
 Transformers:
   cast_metadata_type:
     function: cast_metadata_type
@@ -28,6 +34,8 @@ DataDir:
         DataDir:
           type: Directory
           match: ^[0-9]+$
+          records:
+            QualityCheck:
           subtree:
             QualityReportFile:
               type: ZipFileConverter
@@ -45,15 +53,31 @@ DataDir:
                   type: JSONFileConverter
                   match: ^qc_summary.json$
                   subtree:
-                    CheckCounts:
+                    SubTree:
                       type: Dict
-                      match_properties:
-                        num_total_checks: ^(?P<num_total>[0-9]+)$
-                        num_passing_checks: ^(?P<num_passing>[0-9]+)$
-                      records:
-                        QualityCheck:
-                          numTotalChecks: $num_total
-                          numPassingChecks: $num_passing
+                      match: .*
+                      subtree:
+                        CheckCounts:
+                          match: check_counts
+                          type: Dict
+                          match_properties:
+                            num_total_checks: ^(?P<num_total>[0-9]+)$
+                            num_passing_checks: ^(?P<num_passing>[0-9]+)$
+                          transform:
+                            cast_num_total_to_int:
+                              in: $num_total
+                              out: $num_total
+                              functions:
+                                - cast_to_int:
+                            cast_num_passing_to_int:
+                              in: $num_passing
+                              out: $num_passing
+                              functions:
+                                - cast_to_int:
+                          records:
+                            QualityCheck:
+                              numTotalChecks: $num_total
+                              numPassingChecks: $num_passing
               records:
                 ELNFileElement:
                   parents:
@@ -61,15 +85,15 @@ DataDir:
                   role: File
                   file: $ELNFile
                   path: $ELNFile
-                QualityCheck:
-                  ELNFile: $ELNFileElement
-                  FAIRMetadataCheck: true
                 QualityReportFileElement:
                   parents:
                   - QualityReportFile
                   role: File
                   file: $QualityReportFile
                   path: $QualityReportFile
+                QualityCheck:
+                  ELNFile: $ELNFileElement
+                  QualityReportFile: $QualityReportFileElement
             ELNFile:
               type: ELNFile
               match: ^.*\.eln$
@@ -82,6 +106,7 @@ DataDir:
                   path: $ELNFile
                 QualityCheck:
                   ELNFile: $ELNFileElement
+                  FAIRMetadataCheck: true
               subtree:
                 AuthorDataset:
                   type: ROCrateEntity
@@ -115,7 +140,8 @@ DataDir:
                           dateModified: $dateModified
                           dateCreated: $dateCreated
                           ELNFile: $ELNFileElement
-                          QualityReportFile: $QualityReportFileElement
+                          # NOTE: QualityReportFile ($QualityReportFileElement) cannot be referenced at this level; the report file is linked through the QualityCheck record instead.
+                          QualityCheck: $QualityCheck
                       subtree:
                         Description:
                           type: DictElement
diff --git a/src/ruqad/resources/crawler-settings/identifiables.yaml b/src/ruqad/resources/crawler-settings/identifiables.yaml
index c6d3189..6d61d50 100644
--- a/src/ruqad/resources/crawler-settings/identifiables.yaml
+++ b/src/ruqad/resources/crawler-settings/identifiables.yaml
@@ -7,3 +7,5 @@ Author:
 - url
 Repository:
 - url
+QualityCheck:
+- ELNFile
-- 
GitLab