diff --git a/.docker/Dockerfile b/.docker/Dockerfile index 97b8fdf2b5ae43dc96726e16ea21a2c6a1883fdb..abd249d68ca701588290ab6242bc8b510f91021d 100644 --- a/.docker/Dockerfile +++ b/.docker/Dockerfile @@ -4,34 +4,31 @@ RUN apt-get update && \ curl \ git \ openjdk-17-jdk-headless \ - python3-autopep8 \ python3-pip \ - python3-pytest \ - python3-sphinx \ -y -RUN pip3 install --break-system-packages pylint recommonmark sphinx-rtd-theme tox + COPY .docker/wait-for-it.sh /wait-for-it.sh ARG PYLIB +ARG ADVANCED +# Version files ADD https://gitlab.indiscale.com/api/v4/projects/97/repository/commits/${PYLIB} \ pylib_version.json -RUN git clone https://gitlab.indiscale.com/caosdb/src/caosdb-pylib.git && \ - cd caosdb-pylib && git checkout ${PYLIB} && pip3 install --break-system-packages . -ARG ADVANCED ADD https://gitlab.indiscale.com/api/v4/projects/104/repository/commits/${ADVANCED} \ - advanced_version.json -RUN git clone https://gitlab.indiscale.com/caosdb/src/caosdb-advanced-user-tools.git && \ - cd caosdb-advanced-user-tools && git checkout ${ADVANCED} && pip3 install --break-system-packages .[h5-crawler] -COPY . /git + advanced_version.json +# Install pylib and advanced user tools +RUN pip install --break-system-packages -U \ + git+https://gitlab.indiscale.com/caosdb/src/caosdb-pylib.git@${PYLIB} \ + git+https://gitlab.indiscale.com/caosdb/src/caosdb-advanced-user-tools.git@${ADVANCED} -# Delete .git because it is huge. +# Install Ruqad +COPY . /git RUN rm -r /git/.git - -RUN cd /git/ && pip3 install --break-system-packages . +RUN cd /git/ && pip3 install --break-system-packages .[all] WORKDIR /git/integrationtests # wait for server, -CMD /wait-for-it.sh caosdb-server:10443 -t 500 -- \ +CMD /wait-for-it.sh caosdb-server:10443 -t 120 --strict -- \ # ... install pycaosdb.ini the server-side scripts - cp /git/.docker/sss_pycaosdb.ini /scripting/home/.pycaosdb.ini && \ + cp /git/.docker/sss_pylinkahead.ini /scripting/home/.pylinkahead.ini && \ # ... 
and run tests pytest-3 . diff --git a/.docker/sss_pycaosdb.ini b/.docker/sss_pylinkahead.ini similarity index 100% rename from .docker/sss_pycaosdb.ini rename to .docker/sss_pylinkahead.ini diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000000000000000000000000000000000000..3bd2ef9f9c7d14203de62dca01f7e939a12756b9 --- /dev/null +++ b/.dockerignore @@ -0,0 +1,22 @@ +# -*- mode:conf; -*- + +# auto saves, caches +*~ +__pycache__ +.coverage +.tox + +# development artifacts +venv +.venv +.env + +# configurations + +# TODO Exclude later, for the time being this will be hardcoded into the image. +# qualitycheck_config.toml + +# build artifacts +*.egg-info +build +/src/doc/_apidoc/ diff --git a/.gitignore b/.gitignore index 02ddac7b25f95509cd4122423c25276e221bd939..658f3bc6d629d280013e781a9ff2241155ff9b58 100644 --- a/.gitignore +++ b/.gitignore @@ -7,7 +7,8 @@ __pycache__ .tox # development artifacts -venv/ +venv +.venv /.env/ # configurations diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index d299aa7f830e460c3925c34db8a21187e6db69fc..ea7a39c0e07d2997a09a6b713c2b49e97a9d1e78 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -53,23 +53,17 @@ e2e_tests: - python -m pytest end-to-end-tests/test_kadi.py -unittest_py3.8: +unittest_py3.9: tags: [cached-dind] stage: test - image: python:3.8 + image: python:3.9 script: &python_test_script - # install dependencies - - pip install pytest pytest-cov - - pip install . 
+ # TODO Remove this manual crawler installation after the crawler has been released in version 0.10.2 + - pip install --root-user-action=ignore git+https://gitlab.indiscale.com/caosdb/src/caosdb-crawler.git@dev + - pip install .[all] # actual test - pytest --cov=ruqad -vv ./unittests -unittest_py3.9: - tags: [cached-dind] - stage: test - image: python:3.9 - script: *python_test_script - unittest_py3.10: tags: [cached-dind] stage: test @@ -109,6 +103,12 @@ build-testenv: - command -v wget - docker login -u gitlab-ci-token -p $CI_JOB_TOKEN $CI_REGISTRY # use here general latest or specific branch latest... + # Write the static quality check configuration that docker/Dockerfile copies into the image (cat, not echo: echo ignores the here-document). + - |- + cat > qualitycheck_config.toml << EOF + s3_endpoint = "https://s3.computational.bio.uni-giessen.de" + s3_bucket = "ruqad" + EOF - docker build --file docker/Dockerfile -t $CI_REGISTRY_IMAGE . diff --git a/CHANGELOG.md b/CHANGELOG.md index 1d16808360dc31193b7983f0d8578373694a518f..45541164eb3d04203fe8152780608ba6960ac5a1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,6 +11,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - LinkAhead crawler for the metadata check. - Triggers the quality checker. +- Docker build file and instructions. ### Changed ### diff --git a/Makefile b/Makefile index b8bc521c7f9046dbcbea56c08a26903ba924acab..aac219654c7180b28c014c78a5fbe5cfffaf60dc 100644 --- a/Makefile +++ b/Makefile @@ -43,6 +43,6 @@ lint: pylint --unsafe-load-any-extension=y -d all -e E,F src/linkahead_python_package_template .PHONY: lint -unittest: +unittests: tox -r -.PHONY: unittest +.PHONY: unittests diff --git a/README.md b/README.md index 217f4c6fb014cae406295e3781b6066b80f590eb..8aa2b7b6d47be00f291f5bfe94abb82827974587 100644 --- a/README.md +++ b/README.md @@ -1,14 +1,38 @@ # FAIR Dataspaces RuQaD -TODO +RuQaD (Reuse Quality-assured Data) is a demonstrator for connecting and populating FAIR data spaces. 
+Ruqad connects to [Kadi4Mat](https://kadi.iam.kit.edu/) instances, runs [quality checks](https://git.rwth-aachen.de/fair-ds/ap-4-2-demonstrator/ap-4.2-data-validation-and-quality-assurance-demonstrator) on the data, stores the results +in a [LinkAhead](https://getlinkahead.com) instance and makes the data available via an [EDC (Eclipse Dataspace +Components)](https://projects.eclipse.org/projects/technology.edc) instance. ## Usage -TODO +### Installation ### -### Unit Tests +Simply install with: -Run `pytest unittests/`. +`pip install .` + +### Run locally ### + +- Make sure that `qualitycheck_config.toml` and `secrets.sh` are filled with valid values. +- Run `(set -a && . secrets.sh && rq_monitor)`, a short explanation follows: + - `(...)`: Putting the parentheses prevents pollution of your shell with the variables defined in + `secrets.sh`. + - `set -a`: This automatically exports all set variables in the shell. + - `. secrets.sh`: (Mind the dot `.`) This sources the variables defined in the file. + - `rq_monitor`: This starts the actual monitor, using the variables defined before. +- To run the service on data, insert new data to the Kadi4Mat instance: + - Log in to the Kadi server, with an account whose records are visible with the configured token. + - Create new record. + - Quickly append a file (for example `abalone.csv` from the *demonstrator4.2* example repo) to the + record. + - Wait for the new record with the file to be digested by the Ruqad monitor. + +### Unit Tests + +- For testing, install with the `test` extra, for example like so: `pip install .[test]` +- Then run `make unittests` or `pytest unittests/`. ### E2E Tests In order to run the E2E test, you need to create a personal access token (pat) in the public @@ -17,23 +41,19 @@ In order to run the E2E test, you need to create a personal access token (pat) i ### Code style and liniting -Run `make style lint` after installing the dependencies listed below. 
- -### Documentation - -Run `make doc` after installing the dependencies listed below. +Run `make style lint` after installing with the `dev` extra. (The `dev` extra includes the `test` +extra.) -## Dependencies +<!-- ### Documentation --> -Package and optional dependencies are declared in the `pyproject.toml`; -additional dependencies for testing are listed in the `tox.ini`. +<!-- Run `make doc` after installing the dependencies listed below. --> -For linting and code-style we additionally require +<!-- For building the documentation we require --> -- `pylint` +<!-- - `sphinx` --> +<!-- - `recommonmark` --> +<!-- - `sphinx-rtd-theme` --> -For building the documentation we require +## Docker deployment ## -- `sphinx` -- `recommonmark` -- `sphinx-rtd-theme` +Ruqad can also be deployed as a container. More documentation on this is in `docker/`. diff --git a/docker/Dockerfile b/docker/Dockerfile index cab341deb1c6d513ed9d290aa2bd36bc7ee7c3e7..638ffac54845811cdb761d827e80c57e7eb4b584 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -1,9 +1,21 @@ FROM python:3.13 -RUN pip install pytest autopep8 pylint +# development root: /ruqad COPY ./src /ruqad/src COPY ./unittests /ruqad/unittests COPY ./end-to-end-tests /ruqad/end-to-end-tests COPY ./pyproject.toml /ruqad/ -RUN cd /ruqad && pip install . 
-CMD python -m ruqad.monitor + +# static configuration +COPY ./qualitycheck_config.toml /ruqad/ +COPY ./pylinkahead.ini /ruqad/ + +# Installing the package +WORKDIR /ruqad/ +# TODO Remove this manual crawler installation after the crawler has been released in version 0.10.2 +RUN pip install --root-user-action=ignore \ + git+https://gitlab.indiscale.com/caosdb/src/caosdb-crawler.git@dev + +RUN pip install --root-user-action=ignore .[all] + +CMD rq_monitor diff --git a/docker/readme.md b/docker/readme.md new file mode 100644 index 0000000000000000000000000000000000000000..4cef20b21681848b42df9a47a13d892ea8ff0a65 --- /dev/null +++ b/docker/readme.md @@ -0,0 +1,33 @@ +# Build the Docker image # + +## Build configuration ## + +- Make sure that your `qualitycheck_config.toml` is up to date. +- Update `pylinkahead.ini` if necessary. + +## Actual building ## + +Building the Docker image from within this `docker/` directory: + +```sh +docker build -t ruqad:dev -f Dockerfile .. +``` + +# Runtime configuration # + +Open `../secrets.example.sh` and save it as `secrets.sh`, then fill in all the required +configuration. + +# Run the image # + +You can start Docker with + +`docker run --env-file=../secrets.sh ruqad:dev` + +## Add data ## + +1. Log into the configured Kadi instance. +2. Create new record with the access token's user, then attach a file. +3. When the monitor finds the file, it should [trigger the pipeline](https://gitlab.indiscale.com/caosdb/customers/f-fit/demonstrator4.2-example-data/-/pipelines/) for the quality check. +4. After the quality check has completed, the crawler should create a LinkAhead record and insert it + into the specified LinkAhead instance. 
diff --git a/end-to-end-tests/test_crawler.py b/end-to-end-tests/test_crawler.py index 1ac2eeb94c8d997539c74a846f8258f1e595e474..258e503dd3d5bbd3cc8ad6119c309aebf1f0faef 100644 --- a/end-to-end-tests/test_crawler.py +++ b/end-to-end-tests/test_crawler.py @@ -18,21 +18,17 @@ tests the crawling of ELN files """ import os -import zipfile -from datetime import datetime from pathlib import Path -from tempfile import NamedTemporaryFile -from time import sleep -from uuid import uuid1 from ruqad.crawler import trigger_crawler DATADIR = Path(__file__).parent / "data" / "crawler_data" + def test_crawl(): """ crawl a directory as it would be created by export from kadi and running a data quality check """ print(os.listdir(DATADIR)) - trigger_crawler(os.fspath(DATADIR)) - klsdjf + retval = trigger_crawler(os.fspath(DATADIR)) + assert retval diff --git a/pylinkahead.ini b/pylinkahead.ini new file mode 100644 index 0000000000000000000000000000000000000000..326f8b6c0ad5f4fb25ea31cd9fa4e738c358ecaf --- /dev/null +++ b/pylinkahead.ini @@ -0,0 +1,37 @@ +# The INI file must be located either in +# - $CWD/pylinkahead.ini +# - $HOME/.pylinkahead.ini +# - the location given in the env variable PYLINKAHEADINI + +[Connection] +url=https://demo.indiscale.com/ +#url=https://localhost:10443/ + +## If this option is set, the SSL certificate will be ignored. Use with care! +ssl_insecure=1 + +username=admin + +## The password input method can be chosen with the `password_method` setting, +## which by default is set to `plain`. +## +## DEFAULT: the password method is `plain`, now the password must be saved as +## plain text. +password_method=plain +password=caosdb + + +## OR: `input`: username is optional, password is entered by the user directly +# password_method=input + +## OR: `pass`: password is retrieved from the "pass" password manager +# password_method=pass +# password_identifier=... 
+ +## OR: `keyring`: using the system keyring/wallet (macOS, GNOME, KDE, Windows) +## requires installation of the keyring python package: +## pip install keyring +# password_method=keyring + +timeout=10000 + diff --git a/pyproject.toml b/pyproject.toml index d22cef83a0fe3ddfc00688da5fffa024732a3ff7..40a1f04374735ec49d46948ac804125d4304f548 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -24,7 +24,7 @@ classifiers = [ requires-python = ">= 3.8" dependencies = [ "linkahead", - "caoscrawler[rocrate]", + "caoscrawler[rocrate] >= 0.10.2", "kadi-apy", "boto3>=1.35", "toml>=0.10", @@ -39,10 +39,21 @@ Changelog = "https://gitlab.indiscale.com/caosdb/src/linkahead-python-package-te [project.optional-dependencies] dev = [ + "autopep8", + "pylint", + "ruqad[test]", ] test = [ "pytest", + "pytest-cov", ] +all = [ + "ruqad[dev]", +] + +[project.scripts] +rq_monitor = "ruqad.monitor:monitor" +rq_qualitycheck = "ruqad.qualitycheck:main" [tool.setuptools.package-data] -ruqad = ["src/ruqad/resources/crawler-settings"] +ruqad = ["resources/**/*"] diff --git a/secrets.example.sh b/secrets.example.sh index 507718f943cc0e44d694b7824f17866b595b7ce6..9dd0047fa8444caa8c915aa5fe74618be921f0ca 100644 --- a/secrets.example.sh +++ b/secrets.example.sh @@ -1,18 +1,22 @@ -# Insert your secrets here, save as `secrets.sh`, then source the file with -# . secrets.sh -# Do not add this file to your version control! +# Save this file as `secrets.sh`, insert your secrets here, then source the file with +# set -a && . secrets.sh +# +# !!! Do not add this file to your version control !!! +# +# +# Note: Do not quote the value if you plan to use the file as a Docker env-file ## Kadi # Host and token to retrieve data -export KADIHOST="https://demo-kadi4mat.iam.kit.edu" -export KADITOKEN="pat_KADI123456789" +KADIHOST=https://demo-kadi4mat.iam.kit.edu +KADITOKEN=pat_KADI123456789 ## S3 # Key ID and secret to access the S3 bucket defined in `qualitycheck_config.toml`. 
-export S3_ACCESS_KEY_ID="456S3S3S3654" -export S3_SECRET_ACCESS_KEY="123S3S3S3987" +S3_ACCESS_KEY_ID=456S3S3S3654 +S3_SECRET_ACCESS_KEY=123S3S3S3987 ## Gitlab # Tokens to trigger a pipeline run and to get pipeline status and result via the API. -export GITLAB_PIPELINE_TOKEN="glptt-123456789" -export GITLAB_API_TOKEN="glpat-987654321" +GITLAB_PIPELINE_TOKEN=glptt-123456789 +GITLAB_API_TOKEN=glpat-987654321 diff --git a/src/ruqad/crawler.py b/src/ruqad/crawler.py index f790b928c63b74ea924fd6058ebe619fb624998c..1e5a0334ad05550bd86cb72a7a92fcba4ddd4689 100644 --- a/src/ruqad/crawler.py +++ b/src/ruqad/crawler.py @@ -4,23 +4,29 @@ import os +import sys from importlib import resources from os import walk from os.path import join import linkahead as db from caoscrawler.crawl import crawler_main +from caoscrawler.scanner import scan_directory +from caoscrawler.validator import (load_json_schema_from_datamodel_yaml, + validate) ruqad_crawler_settings = resources.files('ruqad').joinpath('resources/crawler-settings') -def trigger_crawler(target_dir: str): +def trigger_crawler(target_dir: str) -> bool: """ Trigger a standard crawler run equivalent to the command line: ``` caosdb-crawler -i crawler/identifiables.yaml -s update crawler/cfood.yaml <target_dir> ``` + + Return False in case of unsuccessful metadata validation and True otherwise. 
""" # insert all .zip and .eln files, if they do not yet exist @@ -40,6 +46,22 @@ def trigger_crawler(target_dir: str): else: print(f"update {join(fp, fn)}") file_ent.update() + print("meta data check") + datamodel_yaml_file = ruqad_crawler_settings.joinpath('datamodel.yaml') + schemas = load_json_schema_from_datamodel_yaml(datamodel_yaml_file) + entities = scan_directory(target_dir, + ruqad_crawler_settings.joinpath('cfood.yaml')) + + # Remove files from entities: + records = [r for r in entities if r.role == "Record"] + validation = validate(records, schemas) + + if not all([i[0] for i in validation]): + print("Metadata validation failed. Validation errors:") + for v, recordtype in zip(validation, schemas.keys()): + if not v[0]: + print("{}: {}".format(recordtype, v[1])) + return False print("crawl", target_dir) crawler_main(crawled_directory_path=target_dir, @@ -47,3 +69,5 @@ def trigger_crawler(target_dir: str): identifiables_definition_file=ruqad_crawler_settings.joinpath( 'identifiables.yaml'), remove_prefix="/"+os.path.basename(target_dir)) + + return True diff --git a/src/ruqad/crawler_extensions/__init__.py b/src/ruqad/crawler_extensions/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/src/ruqad/crawler_extensions/converters.py b/src/ruqad/crawler_extensions/converters.py new file mode 100644 index 0000000000000000000000000000000000000000..8ed4646a7be8b209ffa58d8887375063bb191770 --- /dev/null +++ b/src/ruqad/crawler_extensions/converters.py @@ -0,0 +1,19 @@ +# encoding: utf-8 +# +# This file is a part of the LinkAhead Project. 
+# +# Copyright (C) 2024 Indiscale GmbH <info@indiscale.com> +# Copyright (C) 2024 Alexander Schlemmer <a.schlemmer@indiscale.com> +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <https://www.gnu.org/licenses/>. diff --git a/src/ruqad/crawler_extensions/transformers.py b/src/ruqad/crawler_extensions/transformers.py new file mode 100644 index 0000000000000000000000000000000000000000..f30ebc76d68d561101de5e2d53ddd428fe00fa9d --- /dev/null +++ b/src/ruqad/crawler_extensions/transformers.py @@ -0,0 +1,53 @@ +# encoding: utf-8 +# +# This file is a part of the LinkAhead Project. +# +# Copyright (C) 2024 Indiscale GmbH <info@indiscale.com> +# Copyright (C) 2024 Alexander Schlemmer <a.schlemmer@indiscale.com> +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <https://www.gnu.org/licenses/>. 
+ +from typing import Any + + +def cast_metadata_type(in_value: Any, in_parameters: dict) -> Any: + """ + Cast the type of `in_value` to the type given by in_parameters["out_type"] + if the varname given by in_parameters["var_name"] equals in_parameters["var_value"]. + Just return the in_value otherwise. + + The type can be one of: + float, int, bool, str + """ + if "out_type" not in in_parameters: + raise RuntimeError("Parameter `out_type` missing.") + + if "var_name" not in in_parameters: + raise RuntimeError("Parameter `var_name` missing.") + + if "var_value" not in in_parameters: + raise RuntimeError("Parameter `var_value` missing.") + + typedict = { + "float": float, "int": int, "bool": bool, "str": str + } + + out_type = in_parameters["out_type"] + if out_type not in typedict.keys(): + raise RuntimeError("Parameter `out_type` can only be one of float, int, bool or str.") + + if in_parameters["var_name"] != in_parameters["var_value"]: + return in_value + + return typedict[out_type](in_value) diff --git a/src/ruqad/monitor.py b/src/ruqad/monitor.py index d25d1d4eb57097b4da4dd73616b9a65ef9b43244..70a463c5c70a3f7e8acde06988f40d8fe8c06990 100755 --- a/src/ruqad/monitor.py +++ b/src/ruqad/monitor.py @@ -25,27 +25,32 @@ import traceback import shutil import os -import argparse -import sys + from time import sleep from tempfile import TemporaryDirectory from datetime import datetime, timezone -sys.path.append(os.path.dirname(__file__)) - -from .qualitycheck import QualityChecker # NOQA -from .kadi import collect_records_created_after, download_eln_for # NOQA -from .crawler import trigger_crawler # NOQA -from kadi_apy import KadiManager # NOQA +from ruqad.qualitycheck import QualityChecker +from ruqad.kadi import collect_records_created_after, download_eln_for +from ruqad.crawler import trigger_crawler +from kadi_apy import KadiManager KADIARGS = { "host": os.environ['KADIHOST'], - "pat": os.environ['KADITOKEN'] + "pat": os.environ['KADITOKEN'], } -if __name__ == 
"__main__": +def monitor(): + """Continuously monitor the Kadi instance given in the environment variables. + + For each new item found, the following steps are performed: + + - Download the eln-format wrapped item. + - Run the quality check. + - Run the crawler on the item and the quality check result. + """ cut_off_date = datetime.fromisoformat("1990-01-01 02:34:42.484312+00:00") while True: try: @@ -84,3 +89,7 @@ if __name__ == "__main__": print("ERROR") print(traceback.format_exc()) print(e) + + +if __name__ == "__main__": + monitor() diff --git a/src/ruqad/qualitycheck.py b/src/ruqad/qualitycheck.py index 2b17bfc5ecab1c6eaeded49e941bf2fc49c08360..9ba034590f94b12f9d2756be0eafb8fcc38fbb66 100755 --- a/src/ruqad/qualitycheck.py +++ b/src/ruqad/qualitycheck.py @@ -62,7 +62,9 @@ out: dict class QualityChecker: class CheckFailed(RuntimeError): - pass + def __init__(self, reason: dict): + super().__init__() + self.reason = reason def __init__(self): """The QualityChecker can do quality checks for content. 
@@ -116,7 +118,7 @@ out : bool job_id = self._wait_for_check(pipeline_id=pipeline_id) self._download_result(job_id=job_id, target_dir=target_dir) except self.CheckFailed as cfe: - print("Check failed") + print(f"Check failed:\nStatus: {cfe.reason['status']}") breakpoint() check_ok = False @@ -226,7 +228,7 @@ remove_prefix : Optional[str] while True: cmd_result = run(cmd, check=True, capture_output=True) result = json.loads(cmd_result.stdout) - if result["finished_at"] is not None: + if result["status"] != "running" and result["finished_at"] is not None: break time.sleep(1) if not result["status"] == "success": @@ -247,7 +249,7 @@ remove_prefix : Optional[str] result = json.loads(cmd_result.stdout) evaluate_job = [job for job in result if job["name"] == "evaluate"][0] if not evaluate_job["status"] == "success": - raise self.CheckFailed() + raise self.CheckFailed(result) report_job = [job for job in result if job["name"] == "report"][0] return report_job["id"] diff --git a/src/ruqad/resources/crawler-settings/cfood.yaml b/src/ruqad/resources/crawler-settings/cfood.yaml index 55cc78dd73913de374cbb652ba05e46d489b4b4b..cf3bc7d91f51f723db363348d42bc3a5188b0f65 100644 --- a/src/ruqad/resources/crawler-settings/cfood.yaml +++ b/src/ruqad/resources/crawler-settings/cfood.yaml @@ -10,6 +10,12 @@ Converters: ROCrateEntity: converter: ROCrateEntityConverter package: caoscrawler.converters +Transformers: + cast_metadata_type: + function: cast_metadata_type + package: ruqad.crawler_extensions.transformers + + DataDir: type: Directory @@ -129,6 +135,24 @@ DataDir: match_properties: propertyID: (?P<propid>.*)$ value: (?P<propvalue>.*)$ + + transform: + cast_voltage_type: + in: $propvalue + out: $propvalue + functions: + - cast_metadata_type: + out_type: float + var_name: $propid + var_value: voltage + cast_rating_type: + in: $propvalue + out: $propvalue + functions: + - cast_metadata_type: + out_type: int + var_name: $propid + var_value: rating records: Dataset: $propid: 
$propvalue @@ -163,7 +187,13 @@ DataDir: AuthorID: match_name: "@id" type: TextElement - match_value: ^(?P<url>(?P<repo>https://.*?)(/users/)(?P<unr>[0-9]+))$ + match_value: ^(?P<url>(?P<repo>https?://.*?)(/users/)(?P<unr>[0-9]+))$ + transform: + cast_nr_type: + in: $unr + out: $unr + functions: + - cast_to_int: records: Author: nr: $unr diff --git a/src/ruqad/resources/crawler-settings/datamodel.yaml b/src/ruqad/resources/crawler-settings/datamodel.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f101a76cb4290755840fbd98e4da05375f863dc0 --- /dev/null +++ b/src/ruqad/resources/crawler-settings/datamodel.yaml @@ -0,0 +1,44 @@ +Dataset: + obligatory_properties: + Author: + Repository: + dateModified: + datatype: DATETIME + dateCreated: + datatype: DATETIME + recommended_properties: + MetaData: + datatype: LIST<MetaData> + notes: + datatype: TEXT + rating: + datatype: INTEGER + voltage: + datatype: DOUBLE + unit: V + + +MetaData: + obligatory_properties: + v: + datatype: TEXT + +Author: + obligatory_properties: + url: + datatype: TEXT + recommended_properties: + nr: + datatype: INTEGER + +Repository: + obligatory_properties: + url: + +# ELNFile: +# recommended_properties: +# QualityReportFile: +# +# QualityReportFile: +# recommended_properties: +# ELNFile: diff --git a/tox.ini b/tox.ini index 09fa08a0c811f4b251f09f50d481fc9486c94658..d88b4b08b3a4698d20e8ab5a35c7fddf6d4b1910 100644 --- a/tox.ini +++ b/tox.ini @@ -3,7 +3,7 @@ envlist = py38, py39, py310, py311, py312, py313 skip_missing_interpreters = true [testenv] -deps = . +deps = .[all] [flake8] max-line-length = 100