Commit 6a07f736 authored by Daniel Hornung

Merge branch 'f-deploy-in-docker' into 'main'

Deploy ruqad in docker

See merge request caosdb/customers/f-fit/ruqad!4
parents 4916e4e9 55eb267f
2 merge requests: !5 (Draft: trigger crawler), !4 (Deploy ruqad in docker)
Pipeline #58630 failed
Showing with 324 additions and 84 deletions
@@ -4,34 +4,31 @@ RUN apt-get update && \
curl \
git \
openjdk-17-jdk-headless \
python3-autopep8 \
python3-pip \
python3-pytest \
python3-sphinx \
-y
RUN pip3 install --break-system-packages pylint recommonmark sphinx-rtd-theme tox
COPY .docker/wait-for-it.sh /wait-for-it.sh
ARG PYLIB
ARG ADVANCED
# Version files
ADD https://gitlab.indiscale.com/api/v4/projects/97/repository/commits/${PYLIB} \
pylib_version.json
RUN git clone https://gitlab.indiscale.com/caosdb/src/caosdb-pylib.git && \
cd caosdb-pylib && git checkout ${PYLIB} && pip3 install --break-system-packages .
ARG ADVANCED
ADD https://gitlab.indiscale.com/api/v4/projects/104/repository/commits/${ADVANCED} \
advanced_version.json
RUN git clone https://gitlab.indiscale.com/caosdb/src/caosdb-advanced-user-tools.git && \
cd caosdb-advanced-user-tools && git checkout ${ADVANCED} && pip3 install --break-system-packages .[h5-crawler]
COPY . /git
# Install pylib and advanced user tools
RUN pip install --break-system-packages -U \
git+https://gitlab.indiscale.com/caosdb/src/caosdb-pylib.git@${PYLIB} \
git+https://gitlab.indiscale.com/caosdb/src/caosdb-advanced-user-tools.git@${ADVANCED}
# Delete .git because it is huge.
# Install Ruqad
COPY . /git
RUN rm -r /git/.git
RUN cd /git/ && pip3 install --break-system-packages .
RUN cd /git/ && pip3 install --break-system-packages .[all]
WORKDIR /git/integrationtests
# wait for server,
CMD /wait-for-it.sh caosdb-server:10443 -t 500 -- \
CMD /wait-for-it.sh caosdb-server:10443 -t 120 --strict -- \
# ... install pycaosdb.ini for the server-side scripts
cp /git/.docker/sss_pycaosdb.ini /scripting/home/.pycaosdb.ini && \
cp /git/.docker/sss_pylinkahead.ini /scripting/home/.pylinkahead.ini && \
# ... and run tests
pytest-3 .
File moved
# -*- mode:conf; -*-
# auto saves, caches
*~
__pycache__
.coverage
.tox
# development artifacts
venv
.venv
.env
# configurations
# TODO Exclude later, for the time being this will be hardcoded into the image.
# qualitycheck_config.toml
# build artifacts
*.egg-info
build
/src/doc/_apidoc/
@@ -7,7 +7,8 @@ __pycache__
.tox
# development artifacts
venv/
venv
.venv
/.env/
# configurations
......
@@ -53,23 +53,17 @@ e2e_tests:
- python -m pytest end-to-end-tests/test_kadi.py
unittest_py3.8:
unittest_py3.9:
tags: [cached-dind]
stage: test
image: python:3.8
image: python:3.9
script: &python_test_script
# install dependencies
- pip install pytest pytest-cov
- pip install .
# TODO Remove this manual crawler installation after the crawler has been released in version 0.10.2
- pip install --root-user-action=ignore git+https://gitlab.indiscale.com/caosdb/src/caosdb-crawler.git@dev
- pip install .[all]
# actual test
- pytest --cov=ruqad -vv ./unittests
unittest_py3.9:
tags: [cached-dind]
stage: test
image: python:3.9
script: *python_test_script
unittest_py3.10:
tags: [cached-dind]
stage: test
@@ -109,6 +103,11 @@ build-testenv:
- command -v wget
- docker login -u gitlab-ci-token -p $CI_JOB_TOKEN $CI_REGISTRY
# use here general latest or specific branch latest...
- |-
cat > qualitycheck_config.toml << EOF
s3_endpoint = "https://s3.computational.bio.uni-giessen.de"
s3_bucket = "ruqad"
EOF
- docker build
--file docker/Dockerfile
-t $CI_REGISTRY_IMAGE .
......
@@ -11,6 +11,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- LinkAhead crawler for the metadata check.
- Triggers the quality checker.
- Docker build file and instructions.
### Changed ###
......
@@ -43,6 +43,6 @@ lint:
pylint --unsafe-load-any-extension=y -d all -e E,F src/linkahead_python_package_template
.PHONY: lint
unittest:
unittests:
tox -r
.PHONY: unittest
.PHONY: unittests
# FAIR Dataspaces RuQaD
TODO
RuQaD (Reuse Quality-assured Data) is a demonstrator for connecting and populating FAIR data spaces.
Ruqad connects to [Kadi4Mat](https://kadi.iam.kit.edu/) instances, runs [quality checks](https://git.rwth-aachen.de/fair-ds/ap-4-2-demonstrator/ap-4.2-data-validation-and-quality-assurance-demonstrator) on the data, stores the results
in a [LinkAhead](https://getlinkahead.com) instance and makes the data available via an [EDC (Eclipse Dataspace
Components)](https://projects.eclipse.org/projects/technology.edc) instance.
## Usage
TODO
### Installation ###
### Unit Tests
Simply install with:
Run `pytest unittests/`.
`pip install .`
### Run locally ###
- Make sure that `qualitycheck_config.toml` and `secrets.sh` are filled with valid values.
- Run `(set -a && . secrets.sh && rq_monitor)`, a short explanation follows:
- `(...)`: The parentheses prevent pollution of your shell with the variables defined in `secrets.sh`.
- `set -a`: This makes the shell automatically export every variable that is set afterwards.
- `. secrets.sh`: (Mind the dot `.`) This sources the variables defined in the file.
- `rq_monitor`: This starts the actual monitor, using the variables defined before.
- To run the service on data, insert new data into the Kadi4Mat instance:
- Log in to the Kadi server with an account whose records are visible to the configured token.
- Create a new record.
- Quickly append a file (for example `abalone.csv` from the *demonstrator4.2* example repo) to the record.
- Wait for the new record with the file to be digested by the Ruqad monitor.
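For illustration, the `set -a && . secrets.sh` idiom above can be mimicked in Python. This is a minimal sketch, not part of Ruqad: `load_env_file` and the demo file path are hypothetical, and `shlex` only approximates real shell sourcing (no variable expansion or command substitution).

```python
import os
import shlex

def load_env_file(path: str) -> dict:
    """Parse shell-style KEY=value lines ('#' comments, optional 'export')."""
    env = {}
    with open(path) as fh:
        for line in fh:
            line = line.strip()
            if not line or line.startswith("#"):
                continue
            if line.startswith("export "):
                line = line[len("export "):]
            key, _, value = line.partition("=")
            # shlex.split strips shell quoting, as sourcing the file would.
            env[key.strip()] = shlex.split(value)[0] if value else ""
    return env

# Demo: write a tiny secrets file, then export its variables like `set -a` does.
with open("/tmp/secrets_demo.sh", "w") as fh:
    fh.write('export KADIHOST="https://demo-kadi4mat.iam.kit.edu"\n')
    fh.write("# a comment\n")
    fh.write("KADITOKEN=pat_KADI123456789\n")
os.environ.update(load_env_file("/tmp/secrets_demo.sh"))
print(os.environ["KADIHOST"])  # → https://demo-kadi4mat.iam.kit.edu
```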
### Unit Tests
- For testing, install with the `test` extra, for example like so: `pip install .[test]`
- Then run `make unittest` or `pytest unittests/`.
### E2E Tests
In order to run the E2E test, you need to create a personal access token (pat) in the public
@@ -17,23 +41,19 @@ In order to run the E2E test, you need to create a personal access token (pat) i
### Code style and linting
Run `make style lint` after installing the dependencies listed below.
### Documentation
Run `make doc` after installing the dependencies listed below.
Run `make style lint` after installing with the `dev` extra. (The `dev` extra includes the `test`
extra.)
## Dependencies
<!-- ### Documentation -->
Package and optional dependencies are declared in the `pyproject.toml`;
additional dependencies for testing are listed in the `tox.ini`.
<!-- Run `make doc` after installing the dependencies listed below. -->
For linting and code-style we additionally require
<!-- For building the documentation we require -->
- `pylint`
<!-- - `sphinx` -->
<!-- - `recommonmark` -->
<!-- - `sphinx-rtd-theme` -->
For building the documentation we require
## Docker deployment ##
- `sphinx`
- `recommonmark`
- `sphinx-rtd-theme`
Ruqad can also be deployed as a container. More documentation on this is in `docker/`.
FROM python:3.13
RUN pip install pytest autopep8 pylint
# development root: /ruqad
COPY ./src /ruqad/src
COPY ./unittests /ruqad/unittests
COPY ./end-to-end-tests /ruqad/end-to-end-tests
COPY ./pyproject.toml /ruqad/
RUN cd /ruqad && pip install .
CMD python -m ruqad.monitor
# static configuration
COPY ./qualitycheck_config.toml /ruqad/
COPY ./pylinkahead.ini /ruqad/
# Installing the package
WORKDIR /ruqad/
# TODO Remove this manual crawler installation after the crawler has been released in version 0.10.2
RUN pip install --root-user-action=ignore \
git+https://gitlab.indiscale.com/caosdb/src/caosdb-crawler.git@dev
RUN pip install --root-user-action=ignore .[all]
CMD rq_monitor
# Build the Docker image #
## Build configuration ##
- Make sure that your `qualitycheck_config.toml` is up to date.
- Update `pylinkahead.ini` if necessary.
## Actual building ##
Building the Docker image from within this `docker/` directory:
```sh
docker build -t ruqad:dev -f Dockerfile ..
```
# Runtime configuration #
Copy `../secrets.example.sh` to `secrets.sh`, then fill in all the required configuration.
# Run the image #
You can start the container with
`docker run --env-file=../secrets.sh ruqad:dev`
## Add data ##
1. Log into the configured Kadi instance.
2. Create a new record with the access token's user, then attach a file.
3. When the monitor finds the file, it should [trigger the pipeline](https://gitlab.indiscale.com/caosdb/customers/f-fit/demonstrator4.2-example-data/-/pipelines/) for the quality check.
4. After the quality check has completed, the crawler should create a LinkAhead record and insert it
into the specified LinkAhead instance.
@@ -18,21 +18,17 @@
tests the crawling of ELN files
"""
import os
import zipfile
from datetime import datetime
from pathlib import Path
from tempfile import NamedTemporaryFile
from time import sleep
from uuid import uuid1
from ruqad.crawler import trigger_crawler
DATADIR = Path(__file__).parent / "data" / "crawler_data"
def test_crawl():
"""
Crawl a directory as it would be created by a Kadi export followed by a data quality check run.
"""
print(os.listdir(DATADIR))
trigger_crawler(os.fspath(DATADIR))
klsdjf
retval = trigger_crawler(os.fspath(DATADIR))
assert retval
# The INI file must be located either in
# - $CWD/pylinkahead.ini
# - $HOME/.pylinkahead.ini
# - the location given in the env variable PYLINKAHEADINI
[Connection]
url=https://demo.indiscale.com/
#url=https://localhost:10443/
## If this option is set, the SSL certificate will be ignored. Use with care!
ssl_insecure=1
username=admin
## The password input method can be chosen with the `password_method` setting,
## which by default is set to `plain`.
##
## DEFAULT: the password method is `plain`, now the password must be saved as
## plain text.
password_method=plain
password=caosdb
## OR: `input`: username is optional, password is entered by the user directly
# password_method=input
## OR: `pass`: password is retrieved from the "pass" password manager
# password_method=pass
# password_identifier=...
## OR: `keyring`: using the system keyring/wallet (macOS, GNOME, KDE, Windows)
## requires installation of the keyring python package:
## pip install keyring
# password_method=keyring
timeout=10000
@@ -24,7 +24,7 @@ classifiers = [
requires-python = ">= 3.8"
dependencies = [
"linkahead",
"caoscrawler[rocrate]",
"caoscrawler[rocrate] >= 0.10.2",
"kadi-apy",
"boto3>=1.35",
"toml>=0.10",
@@ -39,10 +39,21 @@ Changelog = "https://gitlab.indiscale.com/caosdb/src/linkahead-python-package-te
[project.optional-dependencies]
dev = [
"autopep8",
"pylint",
"ruqad[test]",
]
test = [
"pytest",
"pytest-cov",
]
all = [
"ruqad[dev]",
]
[project.scripts]
rq_monitor = "ruqad.monitor:monitor"
rq_qualitycheck = "ruqad.qualitycheck:main"
[tool.setuptools.package-data]
ruqad = ["src/ruqad/resources/crawler-settings"]
ruqad = ["resources/**/*"]
# Insert your secrets here, save as `secrets.sh`, then source the file with
# . secrets.sh
# Do not add this file to your version control!
# Save this file as `secrets.sh`, insert your secrets here, then source the file with
# set -a && . secrets.sh
#
# !!! Do not add this file to your version control !!!
#
#
# Note: Do not quote the value if you plan to use the file as a Docker env-file
## Kadi
# Host and token to retrieve data
export KADIHOST="https://demo-kadi4mat.iam.kit.edu"
export KADITOKEN="pat_KADI123456789"
KADIHOST=https://demo-kadi4mat.iam.kit.edu
KADITOKEN=pat_KADI123456789
## S3
# Key ID and secret to access the S3 bucket defined in `qualitycheck_config.toml`.
export S3_ACCESS_KEY_ID="456S3S3S3654"
export S3_SECRET_ACCESS_KEY="123S3S3S3987"
S3_ACCESS_KEY_ID=456S3S3S3654
S3_SECRET_ACCESS_KEY=123S3S3S3987
## Gitlab
# Tokens to trigger a pipeline run and to get pipeline status and result via the API.
export GITLAB_PIPELINE_TOKEN="glptt-123456789"
export GITLAB_API_TOKEN="glpat-987654321"
GITLAB_PIPELINE_TOKEN=glptt-123456789
GITLAB_API_TOKEN=glpat-987654321
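The "do not quote" note above matters because Docker's `--env-file` parser performs no shell processing: quote characters become part of the value. A minimal sketch of the difference, using `shlex` to approximate what sourcing the file in a shell would do (the host name is the example value from above):

```python
import shlex

line = 'KADIHOST="https://demo-kadi4mat.iam.kit.edu"'
key, _, raw = line.partition("=")

docker_value = raw                 # --env-file keeps the quotes literally
shell_value = shlex.split(raw)[0]  # sourcing in a shell strips them

print(docker_value)  # → "https://demo-kadi4mat.iam.kit.edu"  (quotes included)
print(shell_value)   # → https://demo-kadi4mat.iam.kit.edu
```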
@@ -4,23 +4,29 @@
import os
import sys
from importlib import resources
from os import walk
from os.path import join
import linkahead as db
from caoscrawler.crawl import crawler_main
from caoscrawler.scanner import scan_directory
from caoscrawler.validator import (load_json_schema_from_datamodel_yaml,
validate)
ruqad_crawler_settings = resources.files('ruqad').joinpath('resources/crawler-settings')
def trigger_crawler(target_dir: str):
def trigger_crawler(target_dir: str) -> bool:
"""
Trigger a standard crawler run equivalent to the command line:
```
caosdb-crawler -i crawler/identifiables.yaml -s update crawler/cfood.yaml <target_dir>
```
Return False in case of unsuccessful metadata validation and True otherwise.
"""
# insert all .zip and .eln files, if they do not yet exist
@@ -40,6 +46,22 @@ def trigger_crawler(target_dir: str):
else:
print(f"update {join(fp, fn)}")
file_ent.update()
print("metadata check")
datamodel_yaml_file = ruqad_crawler_settings.joinpath('datamodel.yaml')
schemas = load_json_schema_from_datamodel_yaml(datamodel_yaml_file)
entities = scan_directory(target_dir,
ruqad_crawler_settings.joinpath('cfood.yaml'))
# Remove files from entities:
records = [r for r in entities if r.role == "Record"]
validation = validate(records, schemas)
if not all([i[0] for i in validation]):
print("Metadata validation failed. Validation errors:")
for v, recordtype in zip(validation, schemas.keys()):
if not v[0]:
print("{}: {}".format(recordtype, v[1]))
return False
print("crawl", target_dir)
crawler_main(crawled_directory_path=target_dir,
@@ -47,3 +69,5 @@ def trigger_crawler(target_dir: str):
identifiables_definition_file=ruqad_crawler_settings.joinpath(
'identifiables.yaml'),
remove_prefix="/"+os.path.basename(target_dir))
return True
# encoding: utf-8
#
# This file is a part of the LinkAhead Project.
#
# Copyright (C) 2024 Indiscale GmbH <info@indiscale.com>
# Copyright (C) 2024 Alexander Schlemmer <a.schlemmer@indiscale.com>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
# encoding: utf-8
#
# This file is a part of the LinkAhead Project.
#
# Copyright (C) 2024 Indiscale GmbH <info@indiscale.com>
# Copyright (C) 2024 Alexander Schlemmer <a.schlemmer@indiscale.com>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
from typing import Any
def cast_metadata_type(in_value: Any, in_parameters: dict) -> Any:
"""
Cast the type of `in_value` to the type given by in_parameters["out_type"]
if the varname given by in_parameters["var_name"] equals in_parameters["var_value"].
Just return the in_value otherwise.
The type can be one of:
float, int, bool, str
"""
if "out_type" not in in_parameters:
raise RuntimeError("Parameter `out_type` missing.")
if "var_name" not in in_parameters:
raise RuntimeError("Parameter `var_name` missing.")
if "var_value" not in in_parameters:
raise RuntimeError("Parameter `var_value` missing.")
typedict = {
"float": float, "int": int, "bool": bool, "str": str
}
out_type = in_parameters["out_type"]
if out_type not in typedict.keys():
raise RuntimeError("Parameter `out_type` can only be one of float, int, bool or str.")
if in_parameters["var_name"] != in_parameters["var_value"]:
return in_value
return typedict[out_type](in_value)
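A self-contained usage sketch of `cast_metadata_type` (the definition is condensed and repeated so the snippet runs on its own; the parameter values are illustrative). Note that the guard compares `var_name` to `var_value` directly, exactly as the function above does:

```python
from typing import Any

def cast_metadata_type(in_value: Any, in_parameters: dict) -> Any:
    """Cast in_value to in_parameters["out_type"] iff var_name equals var_value."""
    for key in ("out_type", "var_name", "var_value"):
        if key not in in_parameters:
            raise RuntimeError(f"Parameter `{key}` missing.")
    typedict = {"float": float, "int": int, "bool": bool, "str": str}
    if in_parameters["out_type"] not in typedict:
        raise RuntimeError("Parameter `out_type` can only be one of float, int, bool or str.")
    if in_parameters["var_name"] != in_parameters["var_value"]:
        return in_value
    return typedict[in_parameters["out_type"]](in_value)

# Guard matches ("mass" == "mass"): the string is cast to float.
print(cast_metadata_type("3.14", {"out_type": "float", "var_name": "mass", "var_value": "mass"}))

# Guard differs: the value passes through unchanged as a string.
print(cast_metadata_type("3.14", {"out_type": "float", "var_name": "mass", "var_value": "time"}))
```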
@@ -25,27 +25,32 @@
import traceback
import shutil
import os
import argparse
import sys
from time import sleep
from tempfile import TemporaryDirectory
from datetime import datetime, timezone
sys.path.append(os.path.dirname(__file__))
from .qualitycheck import QualityChecker # NOQA
from .kadi import collect_records_created_after, download_eln_for # NOQA
from .crawler import trigger_crawler # NOQA
from kadi_apy import KadiManager # NOQA
from ruqad.qualitycheck import QualityChecker
from ruqad.kadi import collect_records_created_after, download_eln_for
from ruqad.crawler import trigger_crawler
from kadi_apy import KadiManager
KADIARGS = {
"host": os.environ['KADIHOST'],
"pat": os.environ['KADITOKEN']
"pat": os.environ['KADITOKEN'],
}
if __name__ == "__main__":
def monitor():
"""Continuously monitor the Kadi instance given in the environment variables.
For each new item found, the following steps are performed:
- Download the eln-format wrapped item.
- Run the quality check.
- Run the crawler on the item and the quality check result.
"""
cut_off_date = datetime.fromisoformat("1990-01-01 02:34:42.484312+00:00")
while True:
try:
@@ -84,3 +89,7 @@ if __name__ == "__main__":
print("ERROR")
print(traceback.format_exc())
print(e)
if __name__ == "__main__":
monitor()
@@ -62,7 +62,9 @@ out: dict
class QualityChecker:
class CheckFailed(RuntimeError):
pass
def __init__(self, reason: dict):
super().__init__()
self.reason = reason
def __init__(self):
"""The QualityChecker can do quality checks for content.
@@ -116,7 +118,7 @@ out : bool
job_id = self._wait_for_check(pipeline_id=pipeline_id)
self._download_result(job_id=job_id, target_dir=target_dir)
except self.CheckFailed as cfe:
print("Check failed")
print(f"Check failed:\nStatus: {cfe.reason['status']}")
breakpoint()
check_ok = False
@@ -226,7 +228,7 @@ remove_prefix : Optional[str]
while True:
cmd_result = run(cmd, check=True, capture_output=True)
result = json.loads(cmd_result.stdout)
if result["finished_at"] is not None:
if result["status"] != "running" and result["finished_at"] is not None:
break
time.sleep(1)
if not result["status"] == "success":
@@ -247,7 +249,7 @@ remove_prefix : Optional[str]
result = json.loads(cmd_result.stdout)
evaluate_job = [job for job in result if job["name"] == "evaluate"][0]
if not evaluate_job["status"] == "success":
raise self.CheckFailed()
raise self.CheckFailed(result)
report_job = [job for job in result if job["name"] == "report"][0]
return report_job["id"]
......