diff --git a/.docker/Dockerfile b/.docker/Dockerfile index 97b8fdf2b5ae43dc96726e16ea21a2c6a1883fdb..abd249d68ca701588290ab6242bc8b510f91021d 100644 --- a/.docker/Dockerfile +++ b/.docker/Dockerfile @@ -4,34 +4,31 @@ RUN apt-get update && \ curl \ git \ openjdk-17-jdk-headless \ - python3-autopep8 \ python3-pip \ - python3-pytest \ - python3-sphinx \ -y -RUN pip3 install --break-system-packages pylint recommonmark sphinx-rtd-theme tox + COPY .docker/wait-for-it.sh /wait-for-it.sh ARG PYLIB +ARG ADVANCED +# Version files ADD https://gitlab.indiscale.com/api/v4/projects/97/repository/commits/${PYLIB} \ pylib_version.json -RUN git clone https://gitlab.indiscale.com/caosdb/src/caosdb-pylib.git && \ - cd caosdb-pylib && git checkout ${PYLIB} && pip3 install --break-system-packages . -ARG ADVANCED ADD https://gitlab.indiscale.com/api/v4/projects/104/repository/commits/${ADVANCED} \ - advanced_version.json -RUN git clone https://gitlab.indiscale.com/caosdb/src/caosdb-advanced-user-tools.git && \ - cd caosdb-advanced-user-tools && git checkout ${ADVANCED} && pip3 install --break-system-packages .[h5-crawler] -COPY . /git + advanced_version.json +# Install pylib and advanced user tools +RUN pip install --break-system-packages -U \ + git+https://gitlab.indiscale.com/caosdb/src/caosdb-pylib.git@${PYLIB} \ + git+https://gitlab.indiscale.com/caosdb/src/caosdb-advanced-user-tools.git@${ADVANCED} -# Delete .git because it is huge. +# Install Ruqad +COPY . /git RUN rm -r /git/.git - -RUN cd /git/ && pip3 install --break-system-packages . +RUN cd /git/ && pip3 install --break-system-packages .[all] WORKDIR /git/integrationtests # wait for server, -CMD /wait-for-it.sh caosdb-server:10443 -t 500 -- \ +CMD /wait-for-it.sh caosdb-server:10443 -t 120 --strict -- \ # ... install pycaosdb.ini the server-side scripts - cp /git/.docker/sss_pycaosdb.ini /scripting/home/.pycaosdb.ini && \ + cp /git/.docker/sss_pylinkahead.ini /scripting/home/.pylinkahead.ini && \ # ... 
and run tests pytest-3 . diff --git a/.docker/sss_pycaosdb.ini b/.docker/sss_pylinkahead.ini similarity index 100% rename from .docker/sss_pycaosdb.ini rename to .docker/sss_pylinkahead.ini diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000000000000000000000000000000000000..3bd2ef9f9c7d14203de62dca01f7e939a12756b9 --- /dev/null +++ b/.dockerignore @@ -0,0 +1,22 @@ +# -*- mode:conf; -*- + +# auto saves, caches +*~ +__pycache__ +.coverage +.tox + +# development artifacts +venv +.venv +.env + +# configurations + +# TODO Exclude later, for the time being this will be hardcoded into the image. +# qualitycheck_config.toml + +# build artifacts +*.egg-info +build +/src/doc/_apidoc/ diff --git a/.gitignore b/.gitignore index 02ddac7b25f95509cd4122423c25276e221bd939..658f3bc6d629d280013e781a9ff2241155ff9b58 100644 --- a/.gitignore +++ b/.gitignore @@ -7,7 +7,8 @@ __pycache__ .tox # development artifacts -venv/ +venv +.venv /.env/ # configurations diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index d299aa7f830e460c3925c34db8a21187e6db69fc..ea7a39c0e07d2997a09a6b713c2b49e97a9d1e78 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -53,23 +53,17 @@ e2e_tests: - python -m pytest end-to-end-tests/test_kadi.py -unittest_py3.8: +unittest_py3.9: tags: [cached-dind] stage: test - image: python:3.8 + image: python:3.9 script: &python_test_script - # install dependencies - - pip install pytest pytest-cov - - pip install . 
+ # TODO Remove this manual crawler installation after the crawler has been released in version 0.10.2 + - pip install --root-user-action=ignore git+https://gitlab.indiscale.com/caosdb/src/caosdb-crawler.git@dev + - pip install .[all] # actual test - pytest --cov=ruqad -vv ./unittests -unittest_py3.9: - tags: [cached-dind] - stage: test - image: python:3.9 - script: *python_test_script - unittest_py3.10: tags: [cached-dind] stage: test @@ -109,6 +103,12 @@ build-testenv: - command -v wget - docker login -u gitlab-ci-token -p $CI_JOB_TOKEN $CI_REGISTRY # use here general latest or specific branch latest... + - |- + # Write the static config with `cat`: `echo` ignores stdin, so the here-document would be dropped and the file left empty. + cat > qualitycheck_config.toml << EOF + s3_endpoint = "https://s3.computational.bio.uni-giessen.de" + s3_bucket = "ruqad" + EOF - docker build --file docker/Dockerfile -t $CI_REGISTRY_IMAGE . diff --git a/CHANGELOG.md b/CHANGELOG.md index dd4609543d8988a0a2bf0f35846a1cc80beb55f1..45541164eb3d04203fe8152780608ba6960ac5a1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Added ### +- LinkAhead crawler for the metadata check. +- Triggers the quality checker. +- Docker build file and instructions. + ### Changed ### ### Deprecated ### diff --git a/Makefile b/Makefile index b8bc521c7f9046dbcbea56c08a26903ba924acab..aac219654c7180b28c014c78a5fbe5cfffaf60dc 100644 --- a/Makefile +++ b/Makefile @@ -43,6 +43,6 @@ lint: pylint --unsafe-load-any-extension=y -d all -e E,F src/linkahead_python_package_template .PHONY: lint -unittest: +unittests: tox -r -.PHONY: unittest +.PHONY: unittests diff --git a/README.md b/README.md index 217f4c6fb014cae406295e3781b6066b80f590eb..8aa2b7b6d47be00f291f5bfe94abb82827974587 100644 --- a/README.md +++ b/README.md @@ -1,14 +1,38 @@ # FAIR Dataspaces RuQaD -TODO +RuQaD (Reuse Quality-assured Data) is a demonstrator for connecting and populating FAIR data spaces.
+Ruqad connects to [Kadi4Mat](https://kadi.iam.kit.edu/) instances, runs [quality checks](https://git.rwth-aachen.de/fair-ds/ap-4-2-demonstrator/ap-4.2-data-validation-and-quality-assurance-demonstrator) on the data, stores the results +in a [LinkAhead](https://getlinkahead.com) instance and makes the data available via an [EDC (Eclipse Dataspace +Components)](https://projects.eclipse.org/projects/technology.edc) instance. ## Usage -TODO +### Installation ### -### Unit Tests +Simply install with: -Run `pytest unittests/`. +`pip install .` + +### Run locally ### + +- Make sure that `qualitycheck_config.toml` and `secrets.sh` are filled with valid values. +- Run `(set -a && . secrets.sh && rq_monitor)`, a short explanation follows: + - `(...)`: Putting the parentheses prevents pollution of your shell with the variables defined in + `secrets.sh`. + - `set -a`: This automatically exports all set variables in the shell. + - `. secrets.sh`: (Mind the dot `.`) This sources the variables defined in the file. + - `rq_monitor`: This starts the actual monitor, using the variables defined before. +- To run the service on data, insert new data to the Kadi4Mat instance: + - Log in to the Kadi server, with an account whose records are visible with the configured token. + - Create a new record. + - Quickly append a file (for example `abalone.csv` from the *demonstrator4.2* example repo) to the + record. + - Wait for the new record with the file to be digested by the Ruqad monitor. + +### Unit Tests + +- For testing, install with the `test` extra, for example like so: `pip install .[test]` +- Then run `make unittests` or `pytest unittests/`. ### E2E Tests In order to run the E2E test, you need to create a personal access token (pat) in the public @@ -17,23 +41,19 @@ In order to run the E2E test, you need to create a personal access token (pat) i ### Code style and liniting -Run `make style lint` after installing the dependencies listed below.
- -### Documentation - -Run `make doc` after installing the dependencies listed below. +Run `make style lint` after installing with the `dev` extra. (The `dev` extra includes the `test` +extra.) -## Dependencies +<!-- ### Documentation --> -Package and optional dependencies are declared in the `pyproject.toml`; -additional dependencies for testing are listed in the `tox.ini`. +<!-- Run `make doc` after installing the dependencies listed below. --> -For linting and code-style we additionally require +<!-- For building the documentation we require --> -- `pylint` +<!-- - `sphinx` --> +<!-- - `recommonmark` --> +<!-- - `sphinx-rtd-theme` --> -For building the documentation we require +## Docker deployment ## -- `sphinx` -- `recommonmark` -- `sphinx-rtd-theme` +Ruqad can also be deployed as a container. More documentation on this is in `docker/`. diff --git a/dependencies.txt b/dependencies.txt deleted file mode 100644 index 66d8840d3881cc4efcee2c8a66105567712a10b8..0000000000000000000000000000000000000000 --- a/dependencies.txt +++ /dev/null @@ -1,3 +0,0 @@ -# For the quality checker. -boto3>=1.35 -toml>=0.10 diff --git a/docker/Dockerfile b/docker/Dockerfile index cab341deb1c6d513ed9d290aa2bd36bc7ee7c3e7..638ffac54845811cdb761d827e80c57e7eb4b584 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -1,9 +1,21 @@ FROM python:3.13 -RUN pip install pytest autopep8 pylint +# development root: /ruqad COPY ./src /ruqad/src COPY ./unittests /ruqad/unittests COPY ./end-to-end-tests /ruqad/end-to-end-tests COPY ./pyproject.toml /ruqad/ -RUN cd /ruqad && pip install . 
-CMD python -m ruqad.monitor + +# static configuration +COPY ./qualitycheck_config.toml /ruqad/ +COPY ./pylinkahead.ini /ruqad/ + +# Installing the package +WORKDIR /ruqad/ +# TODO Remove this manual crawler installation after the crawler has been released in version 0.10.2 +RUN pip install --root-user-action=ignore \ + git+https://gitlab.indiscale.com/caosdb/src/caosdb-crawler.git@dev + +RUN pip install --root-user-action=ignore .[all] + +CMD rq_monitor diff --git a/docker/readme.md b/docker/readme.md new file mode 100644 index 0000000000000000000000000000000000000000..4cef20b21681848b42df9a47a13d892ea8ff0a65 --- /dev/null +++ b/docker/readme.md @@ -0,0 +1,33 @@ +# Build the Docker image # + +## Build configuration ## + +- Make sure that your `qualitycheck_config.toml` is up to date. +- Update `pylinkahead.ini` if necessary. + +## Actual building ## + +Building the Docker image from within this `docker/` directory: + +```sh +docker build -t ruqad:dev -f Dockerfile .. +``` + +# Runtime configuration # + +Open `../secrets.example.sh` and save it as `secrets.sh`, then fill in all the required +configuration. + +# Run the image # + +You can start Docker with + +`docker run --env-file=../secrets.sh ruqad:dev` + +## Add data ## + +1. Log into the configured Kadi instance. +2. Create new record with the access token's user, then attach a file. +3. When the monitor finds the file, it should [trigger the pipeline](https://gitlab.indiscale.com/caosdb/customers/f-fit/demonstrator4.2-example-data/-/pipelines/) for the quality check. +4. After the quality check has completed, the crawler should create a LinkAhead record and insert it + into the specified LinkAhead instance. 
diff --git a/pylinkahead.ini b/pylinkahead.ini index c09c71c477be0deb3d6986846862724e97cad0d8..326f8b6c0ad5f4fb25ea31cd9fa4e738c358ecaf 100644 --- a/pylinkahead.ini +++ b/pylinkahead.ini @@ -4,8 +4,8 @@ # - the location given in the env variable PYLINKAHEADINI [Connection] -# url=https://demo.indiscale.com/ -url=https://localhost:10443/ +url=https://demo.indiscale.com/ +#url=https://localhost:10443/ ## If this option is set, the SSL certificate will be ignored. Use with care! ssl_insecure=1 @@ -17,8 +17,9 @@ username=admin ## ## DEFAULT: the password method is `plain`, now the password must be saved as ## plain text. -# password_method=plain -# password=caosdb +password_method=plain +password=caosdb + ## OR: `input`: username is optional, password is entered by the user directly # password_method=input @@ -32,5 +33,5 @@ username=admin ## pip install keyring # password_method=keyring - timeout=10000 + diff --git a/pyproject.toml b/pyproject.toml index 14b5e70f0ac1d0716712e99a5bc33f117b8c13f7..40a1f04374735ec49d46948ac804125d4304f548 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -24,9 +24,10 @@ classifiers = [ requires-python = ">= 3.8" dependencies = [ "linkahead", - "caoscrawler[rocrate]", - "kadi-apy" - + "caoscrawler[rocrate] >= 0.10.2", + "kadi-apy", + "boto3>=1.35", + "toml>=0.10", ] [project.urls] @@ -38,10 +39,21 @@ Changelog = "https://gitlab.indiscale.com/caosdb/src/linkahead-python-package-te [project.optional-dependencies] dev = [ + "autopep8", + "pylint", + "ruqad[test]", ] test = [ "pytest", + "pytest-cov", +] +all = [ + "ruqad[dev]", ] +[project.scripts] +rq_monitor = "ruqad.monitor:monitor" +rq_qualitycheck = "ruqad.qualitycheck:main" + [tool.setuptools.package-data] -ruqad = ["src/ruqad/resources/crawler-settings"] +ruqad = ["resources/**/*"] diff --git a/secrets.example.sh b/secrets.example.sh index 8b260b014726581ebc82afe4135b6b94cc5d6bb3..9dd0047fa8444caa8c915aa5fe74618be921f0ca 100644 --- a/secrets.example.sh +++ b/secrets.example.sh @@ 
-1,13 +1,22 @@ -# Insert your secrets here, save as `secrets.sh`, then source the file with -# . secrets.sh -# Do not add this file to your version control! +# Save this file as `secrets.sh`, insert your secrets here, then source the file with +# set -a && . secrets.sh +# +# !!! Do not add this file to your version control !!! +# +# +# Note: Do not quote the value if you plan to use the file as a Docker env-file -# Token to trigger a pipeline run. -export GITLAB_PIPELINE_TOKEN="glptt-123456789" - -# Token to get pipeline status and result via the API. -export GITLAB_API_TOKEN="glpat-987654321" +## Kadi +# Host and token to retrieve data +KADIHOST=https://demo-kadi4mat.iam.kit.edu +KADITOKEN=pat_KADI123456789 +## S3 # Key ID and secret to access the S3 bucket defined in `qualitycheck_config.toml`. -export S3_ACCESS_KEY_ID="456S3S3S3654" -export S3_SECRET_ACCESS_KEY="123S3S3S3987" +S3_ACCESS_KEY_ID=456S3S3S3654 +S3_SECRET_ACCESS_KEY=123S3S3S3987 + +## Gitlab +# Tokens to trigger a pipeline run and to get pipeline status and result via the API. +GITLAB_PIPELINE_TOKEN=glptt-123456789 +GITLAB_API_TOKEN=glpat-987654321 diff --git a/src/ruqad/monitor.py b/src/ruqad/monitor.py old mode 100644 new mode 100755 index 66f27fd1d565462650c6e8d797b3541985b407cb..70a463c5c70a3f7e8acde06988f40d8fe8c06990 --- a/src/ruqad/monitor.py +++ b/src/ruqad/monitor.py @@ -1,26 +1,56 @@ +#!/usr/bin/env python3 + +# This file is a part of the RuQaD project. +# +# Copyright (C) 2024 IndiScale GmbH <www.indiscale.com> +# Copyright (C) 2024 Henrik tom Wörden <h.tomwoerden@indiscale.com> +# Copyright (C) 2024 Daniel Hornung <d.hornung@indiscale.com> +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. 
+# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <https://www.gnu.org/licenses/>. + +"""Daemon like script which monitors the Kadi4Mat server for new items. """ -monitor the kadi instance -""" -import os -import shutil + import traceback -from datetime import datetime, timezone -from tempfile import TemporaryDirectory +import shutil +import os + from time import sleep +from tempfile import TemporaryDirectory +from datetime import datetime, timezone +from ruqad.qualitycheck import QualityChecker +from ruqad.kadi import collect_records_created_after, download_eln_for +from ruqad.crawler import trigger_crawler from kadi_apy import KadiManager -from .crawler import trigger_crawler -from .kadi import collect_records_created_after, download_eln_for -from .qualitycheck import QualityChecker KADIARGS = { "host": os.environ['KADIHOST'], - "pat": os.environ['KADITOKEN'] + "pat": os.environ['KADITOKEN'], } -if __name__ == "__main__": +def monitor(): + """Continuously monitor the Kadi instance given in the environment variables. + + For each new item found, the following steps are performed: + + - Download the eln-format wrapped item. + - Run the quality check. + - Run the crawler on the item and the quality check result. + """ cut_off_date = datetime.fromisoformat("1990-01-01 02:34:42.484312+00:00") while True: try: @@ -44,7 +74,7 @@ if __name__ == "__main__": qc.check(filename=eln_file, target_dir=cdir) print(f"Quality check done. 
{os.listdir(cdir)}") # trigger crawler on dir - remote_dir_path= os.path.join(cdir, "ruqad", str(rid)) + remote_dir_path = os.path.join(cdir, "ruqad", str(rid)) os.makedirs(remote_dir_path) shutil.move(os.path.join(cdir, "artifacts.zip"), os.path.join(remote_dir_path, "report.zip")) @@ -59,3 +89,7 @@ if __name__ == "__main__": print("ERROR") print(traceback.format_exc()) print(e) + + +if __name__ == "__main__": + monitor() diff --git a/src/ruqad/qualitycheck.py b/src/ruqad/qualitycheck.py index a0419b7fe18313c820f254fd4b9b3520afbb5878..9ba034590f94b12f9d2756be0eafb8fcc38fbb66 100755 --- a/src/ruqad/qualitycheck.py +++ b/src/ruqad/qualitycheck.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 -# This file is a part of the RuQaD Project. +# This file is a part of the RuQaD project. # # Copyright (C) 2024 IndiScale GmbH <www.indiscale.com> # Copyright (C) 2024 Daniel Hornung <d.hornung@indiscale.com> @@ -27,7 +27,11 @@ import argparse import json import os import time +from pathlib import Path from subprocess import run +from tempfile import TemporaryDirectory +from typing import Optional +from zipfile import ZipFile import boto3 import toml @@ -58,7 +62,9 @@ out: dict class QualityChecker: class CheckFailed(RuntimeError): - pass + def __init__(self, reason: dict): + super().__init__() + self.reason = reason def __init__(self): """The QualityChecker can do quality checks for content. @@ -112,9 +118,8 @@ out : bool job_id = self._wait_for_check(pipeline_id=pipeline_id) self._download_result(job_id=job_id, target_dir=target_dir) except self.CheckFailed as cfe: - print("Check failed") - from IPython import embed - embed() + print(f"Check failed:\nStatus: {cfe.reason['status']}") + breakpoint() check_ok = False @@ -135,15 +140,51 @@ This deletes all the objects in the bucket. 
for obj in objects: self._s3_client.delete_object(Bucket=self._bucketname, Key=obj["Key"]) - def _upload(self, filename: str): + def _extract_content(self, filename: str, upload: bool = False): + """Extract content from the archive. May also upload to S3. + + Parameters + ---------- + filename : str + + upload : bool, default=False + """ + with TemporaryDirectory() as tmp: + if not tmp.endswith(os.path.sep): + tmp = tmp + os.path.sep + zipf = ZipFile(filename) + zipf.extractall(path=tmp) # TODO Zip bomb detection and prevention. + for name in zipf.namelist(): + if name.endswith(".json"): + continue + if upload: + self._upload(os.path.join(tmp, name), remove_prefix=tmp) + + def _upload(self, filename: str, remove_prefix: Optional[str] = None): """Upload the file to the S3 bucket. +Compressed files (with suffix .zip or .eln) will be extracted first. + Parameters ---------- filename : str The file to be checked. + +remove_prefix : Optional[str] + If given, remove this prefix from the filename when storing into the bucket. """ - self._s3_client.upload_file(filename, self._bucketname, filename) + # Check file type first. + if Path(filename).suffix in [".eln", ".zip"]: + self._extract_content(filename, upload=True) + return + + target_filename = filename + if remove_prefix: + if not filename.startswith(remove_prefix): + raise ValueError(f"{filename} was expected to start with {remove_prefix}") + target_filename = filename[len(remove_prefix):] + self._s3_client.upload_file(filename, self._bucketname, + os.path.join("data", target_filename)) def _trigger_check(self) -> str: """Trigger a new pipeline to start quality checks. 
@@ -158,7 +199,7 @@ filename : str "-X", "POST", "--fail", "-F", f"token={self._config['gitlab_pipeline_token']}", - "-F", "ref=main", + "-F", "ref=ruqad", "https://gitlab.indiscale.com/api/v4/projects/268/trigger/pipeline" ] cmd_result = run(cmd, check=False, capture_output=True) @@ -187,7 +228,7 @@ filename : str while True: cmd_result = run(cmd, check=True, capture_output=True) result = json.loads(cmd_result.stdout) - if result["finished_at"] is not None: + if result["status"] != "running" and result["finished_at"] is not None: break time.sleep(1) if not result["status"] == "success": @@ -195,6 +236,10 @@ filename : str raise self.CheckFailed(result) # Get jobs. + # We expect that these jobs are run by the pipeline: + # - evaluate: run the quality check + # - report: build the report + # - pages: publish the report (not relevant for us) cmd = [ "curl", "--header", f"PRIVATE-TOKEN: {self._config['gitlab_api_token']}", @@ -204,7 +249,7 @@ filename : str result = json.loads(cmd_result.stdout) evaluate_job = [job for job in result if job["name"] == "evaluate"][0] if not evaluate_job["status"] == "success": - raise self.CheckFailed() + raise self.CheckFailed(result) report_job = [job for job in result if job["name"] == "report"][0] return report_job["id"] diff --git a/tox.ini b/tox.ini index 09fa08a0c811f4b251f09f50d481fc9486c94658..d88b4b08b3a4698d20e8ab5a35c7fddf6d4b1910 100644 --- a/tox.ini +++ b/tox.ini @@ -3,7 +3,7 @@ envlist = py38, py39, py310, py311, py312, py313 skip_missing_interpreters = true [testenv] -deps = . +deps = .[all] [flake8] max-line-length = 100