diff --git a/.docker/Dockerfile b/.docker/Dockerfile
new file mode 100644
index 0000000000000000000000000000000000000000..b300a1a97aa22b3eafc91ef89c01bbd7111edd62
--- /dev/null
+++ b/.docker/Dockerfile
@@ -0,0 +1,39 @@
+FROM debian:latest
+RUN apt-get update && \
+    apt-get install \
+    curl \
+    git \
+    openjdk-11-jdk-headless \
+    python3-autopep8 \
+    python3-pip \
+    python3-pytest \
+    tox \
+    -y
+COPY .docker/wait-for-it.sh /wait-for-it.sh
+ARG PYLIB=dev
+ADD https://gitlab.indiscale.com/api/v4/projects/97/repository/commits/${PYLIB} \
+    pylib_version.json
+RUN git clone https://gitlab.indiscale.com/caosdb/src/caosdb-pylib.git && \
+    cd caosdb-pylib && git checkout ${PYLIB} && pip3 install .
+ARG ADVANCED=dev
+ADD https://gitlab.indiscale.com/api/v4/projects/104/repository/commits/${ADVANCED} \
+    advanced_version.json
+RUN git clone https://gitlab.indiscale.com/caosdb/src/caosdb-advanced-user-tools.git && \
+    cd caosdb-advanced-user-tools && git checkout ${ADVANCED} && pip3 install .
+COPY . /git
+
+# Delete .git because it is huge.
+RUN rm -r /git/.git
+
+# Install pycaosdb.ini for the tests
+RUN mv /git/.docker/tester_pycaosdb.ini /git/integrationtests/pycaosdb.ini
+
+RUN cd /git/ && pip3 install .
+
+WORKDIR /git/integrationtests
+# wait for the server, ...
+CMD /wait-for-it.sh caosdb-server:10443 -t 500 -- \
+    # ... install pycaosdb.ini for the server-side scripts
+    cp /git/.docker/sss_pycaosdb.ini /scripting/home/.pycaosdb.ini && \
+    # ... and run the tests
+    pytest-3 .
diff --git a/.docker/cert.sh b/.docker/cert.sh
new file mode 100755
index 0000000000000000000000000000000000000000..e22cfba2995b5fd9d812232f562b7254233fe5b0
--- /dev/null
+++ b/.docker/cert.sh
@@ -0,0 +1,62 @@
+#!/bin/bash
+
+# ** header v3.0
+# This file is a part of the CaosDB Project.
+#
+# Copyright (C) 2019 Daniel Hornung, Göttingen
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <https://www.gnu.org/licenses/>.
+#
+# ** end header
+
+
+# Creates a directory `cert` and certificates in this directory.
+#
+# The hostname for which the certificate is created can be changed by setting
+# the environment variable CAOSHOSTNAME.
+#
+# ## Overview of variables ##
+#
+# - CAOSHOSTNAME :: Hostname for the key (default: localhost)
+# - KEYPW :: Password for the key (default is CaosDBSecret)
+# - KEYSTOREPW :: Password for the key store (same as KEYPW)
+function cert() {
+  mkdir -p cert
+  cd cert
+  KEYPW="${KEYPW:-CaosDBSecret}"
+  CAOSHOSTNAME="${CAOSHOSTNAME:-localhost}"
+  KEYSTOREPW="${KEYPW:-}"
+  # NOTE: KEYPW and KEYSTOREPW are the same, due to Java limitations.
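+  # The commands below (1) generate an AES-256-encrypted RSA key, (2) create a
+  # self-signed certificate for ${CAOSHOSTNAME}, (3) bundle key and certificate
+  # into a PKCS12 file and (4) import that bundle into the Java keystore
+  # caosdb.jks used by the CaosDB server.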
+  KEYPW="${KEYPW}" openssl genrsa -aes256 -out caosdb.key.pem \
+    -passout env:KEYPW 2048
+  # Certificate is for ${CAOSHOSTNAME} (default: localhost)
+  KEYPW="${KEYPW}" openssl req -new -x509 -key caosdb.key.pem \
+    -out caosdb.cert.pem -passin env:KEYPW \
+    -subj "/C=/ST=/L=/O=/OU=/CN=${CAOSHOSTNAME}"
+  KEYPW="${KEYPW}" KEYSTOREPW="$KEYSTOREPW" openssl pkcs12 -export \
+    -inkey caosdb.key.pem -in caosdb.cert.pem -out all-certs.pkcs12 \
+    -passin env:KEYPW -passout env:KEYPW
+
+  keytool -importkeystore -srckeystore all-certs.pkcs12 -srcstoretype PKCS12 \
+    -deststoretype pkcs12 -destkeystore caosdb.jks \
+    -srcstorepass "${KEYPW}" \
+    -destkeypass "${KEYPW}" -deststorepass "$KEYSTOREPW"
+  echo "Certificates successfully created."
+}
+
+cert
diff --git a/.docker/docker-compose.yml b/.docker/docker-compose.yml
new file mode 100644
index 0000000000000000000000000000000000000000..bbee24fbd8c898c479a0fafa13000ddf506d00eb
--- /dev/null
+++ b/.docker/docker-compose.yml
@@ -0,0 +1,43 @@
+version: '3.7'
+services:
+  sqldb:
+    image: mariadb:10.4
+    environment:
+      MYSQL_ROOT_PASSWORD: caosdb1234
+    networks:
+      - caosnet
+  caosdb-server:
+    image: "$CI_REGISTRY/caosdb/src/caosdb-deploy:$CAOSDB_TAG"
+    user: 999:999
+    depends_on:
+      - sqldb
+    networks:
+      - caosnet
+    volumes:
+      - type: bind
+        source: ./cert
+        target: /opt/caosdb/cert
+      - type: volume
+        source: extroot
+        target: /opt/caosdb/mnt/extroot
+      - type: volume
+        source: scripting
+        target: /opt/caosdb/git/caosdb-server/scripting
+      - type: volume
+        source: authtoken
+        target: /opt/caosdb/git/caosdb-server/authtoken
+    ports:
+      # - "from_outside:from_inside"
+      - "10443:10443"
+      - "10080:10080"
+    environment:
+      DEBUG: 1
+      CAOSDB_CONFIG_AUTHTOKEN_CONFIG: "conf/core/authtoken.example.yaml"
+      CAOSDB_CONFIG_TRANSACTION_BENCHMARK_ENABLED: "TRUE"
+volumes:
+  scripting:
+  extroot:
+  authtoken:
+networks:
+  caosnet:
+    driver: bridge
diff --git a/.docker/run.sh b/.docker/run.sh
new file mode 100755
index 0000000000000000000000000000000000000000..b0e1a716f28516b83043fb3fdb6594515a0bafd4
--- /dev/null
+++ b/.docker/run.sh
@@ -0,0 +1,5 @@
+#!/bin/sh
+
+docker-compose -f tester.yml run tester
+rv=$?
+echo $rv > result
diff --git a/.docker/sss_pycaosdb.ini b/.docker/sss_pycaosdb.ini
new file mode 100644
index 0000000000000000000000000000000000000000..de2867f8dc66b3e81f10f35e40c36f9cb8591604
--- /dev/null
+++ b/.docker/sss_pycaosdb.ini
@@ -0,0 +1,9 @@
+; this is the pycaosdb.ini for the server-side-scripting home.
+[Connection]
+url = https://caosdb-server:10443
+cacert = /opt/caosdb/cert/caosdb.cert.pem
+debug = 0
+timeout = 5000
+
+[Misc]
+sendmail = /usr/local/bin/sendmail_to_file
diff --git a/.docker/tester.yml b/.docker/tester.yml
new file mode 100644
index 0000000000000000000000000000000000000000..83db879c6072bfdea7b3212c833116b96bb54d0c
--- /dev/null
+++ b/.docker/tester.yml
@@ -0,0 +1,26 @@
+version: '3.7'
+services:
+  tester:
+    image: "$CI_REGISTRY_IMAGE"
+    networks:
+      - docker_caosnet
+    volumes:
+      - type: bind
+        source: ./cert
+        target: /cert
+      - type: volume
+        source: extroot
+        target: /extroot
+      - type: volume
+        source: scripting
+        target: /scripting
+      - type: volume
+        source: authtoken
+        target: /authtoken
+networks:
+  docker_caosnet:
+    external: true
+volumes:
+  scripting:
+  extroot:
+  authtoken:
diff --git a/.docker/tester_pycaosdb.ini b/.docker/tester_pycaosdb.ini
new file mode 100644
index 0000000000000000000000000000000000000000..2159dec250b3dcb2f16043d12bdbe73675e4d75c
--- /dev/null
+++ b/.docker/tester_pycaosdb.ini
@@ -0,0 +1,32 @@
+; pycaosdb.ini for pytest test suites.
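+; (the Dockerfile installs this file as /git/integrationtests/pycaosdb.ini)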
+
+[IntegrationTests]
+; location of the scripting bin dir which is used for the test scripts from the
+; server's perspective.
+test_server_side_scripting.bin_dir.server = scripting/bin-debug/
+; location of the scripting bin dir which is used for the test scripts from the
+; pyinttest's perspective.
+test_server_side_scripting.bin_dir.local = /scripting/bin-debug/
+
+; location of the files from the pyinttest's perspective
+test_files.test_insert_files_in_dir.local = /extroot/test_insert_files_in_dir/
+; location of the files from the caosdb server's perspective
+test_files.test_insert_files_in_dir.server = /opt/caosdb/mnt/extroot/test_insert_files_in_dir/
+
+; location of the one-time tokens from the pyinttest's perspective
+test_authentication.admin_token_crud = /authtoken/admin_token_crud.txt
+test_authentication.admin_token_expired = /authtoken/admin_token_expired.txt
+test_authentication.admin_token_3_attempts = /authtoken/admin_token_3_attempts.txt
+
+
+[Connection]
+url = https://caosdb-server:10443/
+username = admin
+cacert = /cert/caosdb.cert.pem
+debug = 0
+
+password_method = plain
+password = caosdb
+
+timeout = 500
diff --git a/.docker/wait-for-it.sh b/.docker/wait-for-it.sh
new file mode 100755
index 0000000000000000000000000000000000000000..d69e99f1f13257b559dce2433de0515379663efa
--- /dev/null
+++ b/.docker/wait-for-it.sh
@@ -0,0 +1,182 @@
+#!/usr/bin/env bash
+# License:
+# From https://github.com/vishnubob/wait-for-it
+# The MIT License (MIT)
+# Use this script to test if a given TCP host/port are available
+
+WAITFORIT_cmdname=${0##*/}
+
+echoerr() { if [[ $WAITFORIT_QUIET -ne 1 ]]; then echo "$@" 1>&2; fi }
+
+usage()
+{
+    cat << USAGE >&2
+Usage:
+    $WAITFORIT_cmdname host:port [-s] [-t timeout] [-- command args]
+    -h HOST | --host=HOST       Host or IP under test
+    -p PORT | --port=PORT       TCP port under test
+                                Alternatively, you specify the host and port as host:port
+    -s | --strict               Only execute subcommand if the test succeeds
+    -q | --quiet                Don't output any status messages
+    -t TIMEOUT | --timeout=TIMEOUT
+                                Timeout in seconds, zero for no timeout
+    -- COMMAND ARGS             Execute command with args after the test finishes
+USAGE
+    exit 1
+}
+
+wait_for()
+{
+    if [[ $WAITFORIT_TIMEOUT -gt 0 ]]; then
+        echoerr "$WAITFORIT_cmdname: waiting $WAITFORIT_TIMEOUT seconds for $WAITFORIT_HOST:$WAITFORIT_PORT"
+    else
+        echoerr "$WAITFORIT_cmdname: waiting for $WAITFORIT_HOST:$WAITFORIT_PORT without a timeout"
+    fi
+    WAITFORIT_start_ts=$(date +%s)
+    while :
+    do
+        if [[ $WAITFORIT_ISBUSY -eq 1 ]]; then
+            nc -z $WAITFORIT_HOST $WAITFORIT_PORT
+            WAITFORIT_result=$?
+        else
+            (echo > /dev/tcp/$WAITFORIT_HOST/$WAITFORIT_PORT) >/dev/null 2>&1
+            WAITFORIT_result=$?
+        fi
+        if [[ $WAITFORIT_result -eq 0 ]]; then
+            WAITFORIT_end_ts=$(date +%s)
+            echoerr "$WAITFORIT_cmdname: $WAITFORIT_HOST:$WAITFORIT_PORT is available after $((WAITFORIT_end_ts - WAITFORIT_start_ts)) seconds"
+            break
+        fi
+        sleep 1
+    done
+    return $WAITFORIT_result
+}
+
+wait_for_wrapper()
+{
+    # In order to support SIGINT during timeout: http://unix.stackexchange.com/a/57692
+    if [[ $WAITFORIT_QUIET -eq 1 ]]; then
+        timeout $WAITFORIT_BUSYTIMEFLAG $WAITFORIT_TIMEOUT $0 --quiet --child --host=$WAITFORIT_HOST --port=$WAITFORIT_PORT --timeout=$WAITFORIT_TIMEOUT &
+    else
+        timeout $WAITFORIT_BUSYTIMEFLAG $WAITFORIT_TIMEOUT $0 --child --host=$WAITFORIT_HOST --port=$WAITFORIT_PORT --timeout=$WAITFORIT_TIMEOUT &
+    fi
+    WAITFORIT_PID=$!
+    trap "kill -INT -$WAITFORIT_PID" INT
+    wait $WAITFORIT_PID
+    WAITFORIT_RESULT=$?
+ if [[ $WAITFORIT_RESULT -ne 0 ]]; then + echoerr "$WAITFORIT_cmdname: timeout occurred after waiting $WAITFORIT_TIMEOUT seconds for $WAITFORIT_HOST:$WAITFORIT_PORT" + fi + return $WAITFORIT_RESULT +} + +# process arguments +while [[ $# -gt 0 ]] +do + case "$1" in + *:* ) + WAITFORIT_hostport=(${1//:/ }) + WAITFORIT_HOST=${WAITFORIT_hostport[0]} + WAITFORIT_PORT=${WAITFORIT_hostport[1]} + shift 1 + ;; + --child) + WAITFORIT_CHILD=1 + shift 1 + ;; + -q | --quiet) + WAITFORIT_QUIET=1 + shift 1 + ;; + -s | --strict) + WAITFORIT_STRICT=1 + shift 1 + ;; + -h) + WAITFORIT_HOST="$2" + if [[ $WAITFORIT_HOST == "" ]]; then break; fi + shift 2 + ;; + --host=*) + WAITFORIT_HOST="${1#*=}" + shift 1 + ;; + -p) + WAITFORIT_PORT="$2" + if [[ $WAITFORIT_PORT == "" ]]; then break; fi + shift 2 + ;; + --port=*) + WAITFORIT_PORT="${1#*=}" + shift 1 + ;; + -t) + WAITFORIT_TIMEOUT="$2" + if [[ $WAITFORIT_TIMEOUT == "" ]]; then break; fi + shift 2 + ;; + --timeout=*) + WAITFORIT_TIMEOUT="${1#*=}" + shift 1 + ;; + --) + shift + WAITFORIT_CLI=("$@") + break + ;; + --help) + usage + ;; + *) + echoerr "Unknown argument: $1" + usage + ;; + esac +done + +if [[ "$WAITFORIT_HOST" == "" || "$WAITFORIT_PORT" == "" ]]; then + echoerr "Error: you need to provide a host and port to test." + usage +fi + +WAITFORIT_TIMEOUT=${WAITFORIT_TIMEOUT:-15} +WAITFORIT_STRICT=${WAITFORIT_STRICT:-0} +WAITFORIT_CHILD=${WAITFORIT_CHILD:-0} +WAITFORIT_QUIET=${WAITFORIT_QUIET:-0} + +# check to see if timeout is from busybox? +WAITFORIT_TIMEOUT_PATH=$(type -p timeout) +WAITFORIT_TIMEOUT_PATH=$(realpath $WAITFORIT_TIMEOUT_PATH 2>/dev/null || readlink -f $WAITFORIT_TIMEOUT_PATH) +if [[ $WAITFORIT_TIMEOUT_PATH =~ "busybox" ]]; then + WAITFORIT_ISBUSY=1 + WAITFORIT_BUSYTIMEFLAG="-t" + +else + WAITFORIT_ISBUSY=0 + WAITFORIT_BUSYTIMEFLAG="" +fi + +if [[ $WAITFORIT_CHILD -gt 0 ]]; then + wait_for + WAITFORIT_RESULT=$? + exit $WAITFORIT_RESULT +else + if [[ $WAITFORIT_TIMEOUT -gt 0 ]]; then + wait_for_wrapper + WAITFORIT_RESULT=$? + else + wait_for + WAITFORIT_RESULT=$? + fi +fi + +if [[ $WAITFORIT_CLI != "" ]]; then + if [[ $WAITFORIT_RESULT -ne 0 && $WAITFORIT_STRICT -eq 1 ]]; then + echoerr "$WAITFORIT_cmdname: strict mode, refusing to execute subprocess" + exit $WAITFORIT_RESULT + fi + exec "${WAITFORIT_CLI[@]}" +else + exit $WAITFORIT_RESULT +fi + diff --git a/.gitignore b/.gitignore index 6df7e28419776d5976ed34c11a69b39a3cbd3dec..11c17317428964b82b47d55399a4dde1a9e698a9 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,4 @@ -src/newcrawler.egg-info/ +src/caoscrawler.egg-info/ .coverage __pycache__ .tox diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml new file mode 100644 index 0000000000000000000000000000000000000000..a30140e684b465d40b964f1bfb9b97959b29834d --- /dev/null +++ b/.gitlab-ci.yml @@ -0,0 +1,248 @@ +# +# This file is a part of the CaosDB Project. +# +# Copyright (C) 2018 Research Group Biomedical Physics, +# Max-Planck-Institute for Dynamics and Self-Organization Göttingen +# Copyright (C) 2019 Henrik tom Wörden +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. 
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <https://www.gnu.org/licenses/>.
+
+variables:
+  CI_REGISTRY_IMAGE: $CI_REGISTRY/caosdb/src/caosdb-crawler/testenv:$CI_COMMIT_REF_NAME
+  CI_REGISTRY_IMAGE_BASE: $CI_REGISTRY/caosdb/src/caosdb-pyinttest/base:latest
+
+stages:
+  - info
+  - setup
+  - cert
+  - style
+  - test
+  - deploy
+
+
+# During the test stage the CI pipeline (which runs in a "root" docker) starts
+# two docker containers with docker-compose (one for the caosdb-server, another
+# for the mysql-backend). Then a third docker container is started which
+# contains the test suite and executes it.
+#
+# +-------------(root docker)-----------------------------+
+# |                                                        |
+# |           +-(caosdb_mysqlbackend)-------------+        |
+# |           |                                   |        |
+# |           +-----------------------------------+        |
+# |           +-(caosdb-server)-------------------+        |
+# |           |                                   |        |
+# |           | /opt/caosdb                       |        |
+# | .-------->| + /git/caosdb-server/scripting/   |        |
+# | | .------>| + /git/caosdb-server/authtoken/   |        |
+# | | | .---->| + /mnt/extroot                    |        |
+# | | | | .-->| + /cert                           |        |
+# | | | | |   |                                   |        |
+# | | | | |   +-----------------------------------+        |
+# | | | | |                                                |
+# | | | | |   filesystem:                                  |
+# | | | | *--- /cert --------------.                       |
+# | | | |                          |                       |
+# | | | |  volumes:                |                       |
+# | | | *----- extroot ----------. |                       |
+# | | *------- scripting ------. | |                       |
+# | *--------- authtoken ----. | | |                       |
+# |                          | | | |                       |
+# | +-(crawler tests)---+    | | | |                       |
+# | |                   |    | | | |                       |
+# | | /authtoken        |<---* | | |                       |
+# | | /scripting        |<-----* | |                       |
+# | | /extroot          |<-------* |                       |
+# | | /cert             |<---------*                       |
+# | |                   |                                  |
+# | +-------------------+                                  |
+# +--------------------------------------------------------+
+#
+# In the root docker, the directory /cert is mounted to .docker/cert relative
+# to this repository. The directory is created during the cert stage of this
+# pipeline and a certificate is created in there. The certificate is then
+# available in mounted directories in the server and crawler containers.
+#
+# Additional volumes in the root docker are shared by the caosdb-server and the crawler
+# containers. These volumes are intended to be used for testing server-side scripting and
+# file-system features.
+#

.env: &env
+  - echo "Pipeline triggered by $TRIGGERED_BY_REPO@$TRIGGERED_BY_REF ($TRIGGERED_BY_HASH)"
+  - echo "CI_REGISTRY_IMAGE_BASE = $CI_REGISTRY_IMAGE_BASE"
+  - echo "CI_REGISTRY_IMAGE = $CI_REGISTRY_IMAGE"
+  - echo "CAOSDB_TAG = $CAOSDB_TAG"
+  - echo "REFTAG = $REFTAG"
+  - echo "F_BRANCH = $F_BRANCH"
+  - echo "CI_COMMIT_REF_NAME = $CI_COMMIT_REF_NAME"
+  - ls -lah /image-cache/
+
+  - F_BRANCH=${F_BRANCH:-$CI_COMMIT_REF_NAME}
+  - echo $F_BRANCH
+  - if [[ "$REFTAG" == "" ]] ; then
+      if [[ "$F_BRANCH" == "dev" ]] ; then
+        REFTAG=dev;
+      fi;
+    fi
+  - REFTAG=${REFTAG:-dev_F_${F_BRANCH}}
+
+  - echo $F_BRANCH
+
+  - if [[ "$CAOSDB_TAG" == "" ]]; then
+      CAOSDB_TAG=${REFTAG};
+    fi
+  - echo $CAOSDB_TAG
+
+info:
+  tags: [cached-dind]
+  image: docker:20.10
+  stage: info
+  needs: []
+  script:
+    - *env
+
+unittest:
+  tags: [cached-dind]
+  stage: test
+  image: $CI_REGISTRY_IMAGE
+  script:
+    - tox
+
+inttest:
+  tags: [docker]
+  services:
+    - docker:20.10-dind
+  variables:
+    # This is a workaround for the gitlab-runner health check mechanism when
+    # using docker-dind service. The runner will otherwise guess the port
+    # wrong and the health check will time out.
+    SERVICE_PORT_2376_TCP_PORT: 2375
+  stage: test
+  image: $CI_REGISTRY_IMAGE_BASE
+  needs: [cert]
+  script:
+    - *env
+    - docker login -u gitlab-ci-token -p $CI_JOB_TOKEN $CI_REGISTRY
+    - echo $CAOSDB_TAG
+
+    - cd .docker
+    # Store mariadb version
+    - MARIADBVERSION=$(grep mariadb docker-compose.yml | awk '{print $2}')
+    - echo "mariadb image:"$MARIADBVERSION
+    - time docker load < /image-cache/caosdb-crawler-testenv-${CI_COMMIT_REF_NAME}.tar || true
+    - time docker load < /image-cache/caosdb-${REFTAG}.tar || time docker load < /image-cache/caosdb-dev.tar || true
+    - time docker load < /image-cache/$MARIADBVERSION.tar || true
+    - docker pull $CI_REGISTRY/caosdb/src/caosdb-deploy:$CAOSDB_TAG || CAOSDB_TAG=dev
+    - docker pull $CI_REGISTRY_IMAGE
+
+    # Here, the server and the mysql backend docker are being started
+    - CAOSDB_TAG=$CAOSDB_TAG docker-compose up -d
+
+    # Store versions of CaosDB parts
+    - docker exec -u 0 -t docker_caosdb-server_1 cat /opt/caosdb/git/caosdb_pylib_commit > hash_pylib
+    - docker exec -u 0 -t docker_caosdb-server_1 cat /opt/caosdb/git/caosdb_webui_commit > hash_webui
+    - docker exec -u 0 -t docker_caosdb-server_1 cat /opt/caosdb/git/caosdb_server_commit > hash_server
+    - docker exec -u 0 -t docker_caosdb-server_1 cat /opt/caosdb/git/caosdb_mysqlbackend_commit > hash_mysql
+    - docker exec -u 0 -t docker_caosdb-server_1 cat /opt/caosdb/git/caosdb_proto_commit > hash_proto
+    - cat hash_server
+    - cat hash_proto
+    - cat hash_mysql
+    - cat hash_webui
+    - cat hash_pylib
+    # Run the actual tests. This starts a new docker container within which
+    # the tests run. The return value is stored in .docker/result
+    - /bin/sh ./run.sh
+
+    # Save logs
+    - docker logs docker_caosdb-server_1 &> ../caosdb_log.txt
+    - docker logs docker_sqldb_1 &> ../mariadb_log.txt
+    - cd ..
+
+    # Stop the server
+    - docker-compose -f .docker/docker-compose.yml down
+
+    # the crawler docker writes the return value of the tests into the
+    # file result
+    - rc=`cat .docker/result`
+    - exit $rc
+  dependencies: [cert]
+  timeout: 3h
+  artifacts:
+    paths:
+      - caosdb_log.txt
+      - mariadb_log.txt
+      - .docker/hash_*
+    expire_in: 1 week

+build-testenv:
+  tags: [cached-dind]
+  image: docker:20.10
+  stage: setup
+  timeout: 2h
+  only:
+    - schedules
+    - web
+    - pushes
+  needs: []
+  script:
+    - df -h
+    - command -v wget
+    - if [ -z "$PYLIB" ]; then
+        if echo "$CI_COMMIT_REF_NAME" | grep -c "^f-" ; then
+          echo "Check if pylib has branch $CI_COMMIT_REF_NAME" ;
+          if wget https://gitlab.indiscale.com/api/v4/projects/97/repository/branches/${CI_COMMIT_REF_NAME} ; then
+            PYLIB=$CI_COMMIT_REF_NAME ;
+          fi;
+        fi;
+      fi;
+    - PYLIB=${PYLIB:-dev}
+    - echo $PYLIB
+
+    - docker login -u gitlab-ci-token -p $CI_JOB_TOKEN $CI_REGISTRY
+    # use the general latest image here, or the latest one of the specific branch
+    - docker build
+      --build-arg PYLIB=${PYLIB}
+      --build-arg ADVANCED=${ADVANCED:-dev}
+      --file .docker/Dockerfile
+      -t $CI_REGISTRY_IMAGE .
+    - docker push $CI_REGISTRY_IMAGE
+    - docker save $CI_REGISTRY_IMAGE > /image-cache/caosdb-crawler-testenv-${CI_COMMIT_REF_NAME}.tar
+
+cert:
+  tags: [docker]
+  stage: cert
+  image: $CI_REGISTRY_IMAGE
+  needs:
+    - job: build-testenv
+      optional: true
+  artifacts:
+    paths:
+      - .docker/cert/
+    expire_in: 1 week
+  script:
+    - cd .docker
+    - CAOSHOSTNAME=caosdb-server ./cert.sh
+
+style:
+  tags: [docker]
+  stage: style
+  image: $CI_REGISTRY_IMAGE
+  needs:
+    - job: build-testenv
+      optional: true
+  script:
+    - autopep8 -r --diff --exit-code .
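+    # --diff/--exit-code: only report needed changes and return non-zero if any; allow_failure below keeps this job advisory.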
+ allow_failure: true diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 0000000000000000000000000000000000000000..d0a2883005d6651f0ba3ef22b9fa5fe0d03349aa --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,24 @@ +# Changelog + +All notable changes to this project will be documented in this file. + +The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/), +and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). + +## [Unreleased] + +### Added + +* Everything + +### Changed + +* Renamed module from `newcrawler` to `caoscrawler` + +### Deprecated + +### Removed + +### Fixed + +### Security diff --git a/README.md b/README.md index 88d8a6d9965e67ec268bff979ceb709dbf650129..59b88aaa36ed97d8c2cc9e4474820e3dad4a478b 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# newcrawler +# caoscrawler A new crawler for CaosDB. diff --git a/integrationtests/README.md b/integrationtests/README.md index 5c308f51a332d5a930f91eb30f0d93032ae47627..96789ed9f02036a0c7cc25ca1a60d9f0042a5557 100644 --- a/integrationtests/README.md +++ b/integrationtests/README.md @@ -1,3 +1,2 @@ -1. Clear database (see clear_database.py) -2. Insert model (see insert_model.py) -3. Run test.py +1. Mount test_data/extroot as extroot folder in the CaosDB server +2. use an empty server diff --git a/integrationtests/model.yml b/integrationtests/basic_example/model.yml similarity index 100% rename from integrationtests/model.yml rename to integrationtests/basic_example/model.yml diff --git a/integrationtests/test.py b/integrationtests/basic_example/test.py similarity index 91% rename from integrationtests/test.py rename to integrationtests/basic_example/test.py index efff64305bbc9dd24ebf7817fb9d10d0523c9f5b..6e35f7f2e4532acb5a2c3c80d06d9faeabd0fe0a 100755 --- a/integrationtests/test.py +++ b/integrationtests/basic_example/test.py @@ -28,21 +28,22 @@ module description """ +import os from caosdb import EmptyUniqueQueryError import argparse import sys from argparse import RawTextHelpFormatter -from newcrawler import Crawler +from caoscrawler import Crawler import caosdb as db -from newcrawler.identifiable_adapters import CaosDBIdentifiableAdapter +from caoscrawler.identifiable_adapters import CaosDBIdentifiableAdapter import pytest from caosadvancedtools.models.parser import parse_model_from_yaml import yaml +# TODO is not yet merged in caosadvancedtools from caosadvancedtools.testutils import clear_database, set_test_key set_test_key("10b128cf8a1372f30aa3697466bb55e76974e0c16a599bb44ace88f19c8f61e2") -import os def rfp(*pathcomponents): """ @@ -52,14 +53,12 @@ def rfp(*pathcomponents): return os.path.join(os.path.dirname(__file__), *pathcomponents) - - @pytest.fixture def usemodel(): model = parse_model_from_yaml(rfp("model.yml")) model.sync_data_model(noquestion=True, verbose=False) - + @pytest.fixture def ident(): ident = CaosDBIdentifiableAdapter() @@ -68,12 +67,12 @@ def ident(): ident.register_identifiable( "Person", db.RecordType() .add_parent(name="Person") - #.add_property(name="first_name") + # .add_property(name="first_name") .add_property(name="last_name")) ident.register_identifiable( "Measurement", db.RecordType() .add_parent(name="Measurement") - #.add_property(name="identifier") + # .add_property(name="identifier") .add_property(name="date") .add_property(name="project")) ident.register_identifiable( @@ -106,7 +105,8 @@ def crawler_extended(ident): updateList = cr.updateList fileList = [r for r in updateList if r.role == "File"] for f in fileList: - 
f.file = rfp("..", "unittests", "test_directories", "examples_article", f.file) + f.file = rfp("..", "unittests", "test_directories", + "examples_article", f.file) return cr @@ -150,6 +150,7 @@ def test_multiple_insertions(clear_database, usemodel, ident, crawler): assert len(ins) == 0 assert len(ups) == 0 + def test_insertion(clear_database, usemodel, ident, crawler): ins, ups = crawler.synchronize() @@ -169,6 +170,7 @@ def test_insertion(clear_database, usemodel, ident, crawler): assert len(ins) == 0 assert len(ups) == 0 + def test_insertion_and_update(clear_database, usemodel, ident, crawler): ins, ups = crawler.synchronize() @@ -184,7 +186,8 @@ def test_insertion_and_update(clear_database, usemodel, ident, crawler): ins, ups = cr.synchronize() assert len(ins) == 0 assert len(ups) == 1 - + + def test_identifiable_update(clear_database, usemodel, ident, crawler): ins, ups = crawler.synchronize() @@ -197,23 +200,23 @@ def test_identifiable_update(clear_database, usemodel, ident, crawler): l = cr.updateList for record in l: if (record.parents[0].name == "Measurement" and - record.get_property("date").value == "2020-01-03"): + record.get_property("date").value == "2020-01-03"): # maybe a bit weird, but add an email address to a measurement - record.add_property(name="email", value="testperson@testaccount.test") + record.add_property( + name="email", value="testperson@testaccount.test") print("one change") break ins, ups = cr.synchronize() assert len(ins) == 0 assert len(ups) == 1 - # Test the change within one property: cr = Crawler(debug=True, identifiableAdapter=ident) crawl_standard_test_directory(cr) l = cr.updateList for record in l: if (record.parents[0].name == "Measurement" and - record.get_property("date").value == "2020-01-03"): + record.get_property("date").value == "2020-01-03"): record.add_property(name="email", value="testperson@coolmail.test") print("one change") break @@ -227,7 +230,7 @@ def test_identifiable_update(clear_database, usemodel, ident, crawler): l = cr.updateList for record in l: if (record.parents[0].name == "Measurement" and - record.get_property("date").value == "2020-01-03"): + record.get_property("date").value == "2020-01-03"): record.add_property(name="email", value="testperson@coolmail.test") record.get_property("date").value = "2012-01-02" print("one change") @@ -239,7 +242,8 @@ def test_identifiable_update(clear_database, usemodel, ident, crawler): def test_file_insertion_dry(clear_database, usemodel, ident): crawler_extended = Crawler(debug=True, identifiableAdapter=ident) - crawl_standard_test_directory(crawler_extended, cfood="scifolder_extended.yml") + crawl_standard_test_directory( + crawler_extended, cfood="scifolder_extended.yml") updateList = crawler_extended.updateList fileList = [r for r in updateList if r.role == "File"] assert len(fileList) == 11 @@ -269,6 +273,7 @@ def test_file_insertion(clear_database, usemodel, ident, crawler_extended): assert len(r) == 1 assert r[0].get_property("ReadmeFile").value == f.id + def test_file_update(clear_database, usemodel, ident, crawler_extended): ins1, ups1 = crawler_extended.synchronize(commit_changes=True) fileList_ins = [r for r in ins1 if r.role == "File"] @@ -279,7 +284,8 @@ def test_file_update(clear_database, usemodel, ident, crawler_extended): updateList = cr.updateList fileList = [r for r in updateList if r.role == "File"] for f in fileList: - f.file = rfp("..", "unittests", "test_directories", "examples_article", f.file) + f.file = rfp("..", "unittests", "test_directories", + 
"examples_article", f.file) ins2, ups2 = cr.synchronize(commit_changes=True) assert len(ups1) == 0 assert len(ups2) == 0 @@ -288,21 +294,21 @@ def test_file_update(clear_database, usemodel, ident, crawler_extended): res = db.execute_query("Find File") assert len(res) == 11 assert len(res[0].parents) == 0 - + cr2 = Crawler(debug=True, identifiableAdapter=ident) crawl_standard_test_directory(cr2, cfood="scifolder_extended2.yml") updateList = cr2.updateList fileList = [r for r in updateList if r.role == "File"] for f in fileList: - f.file = rfp("..", "unittests", "test_directories", "examples_article", f.file) + f.file = rfp("..", "unittests", "test_directories", + "examples_article", f.file) ins3, ups3 = cr2.synchronize(commit_changes=True) assert len(ups3) == 11 res = db.execute_query("Find File") assert len(res) == 11 assert res[0].parents[0].name == "ProjectMarkdownReadme" - # TODO: Implement file update checks (based on checksum) # Add test with actual file update: diff --git a/integrationtests/realworld_example/crawl.sh b/integrationtests/realworld_example/crawl.sh new file mode 100755 index 0000000000000000000000000000000000000000..55a2a331fe517a539e2dd937ac35605c72b496c9 --- /dev/null +++ b/integrationtests/realworld_example/crawl.sh @@ -0,0 +1,4 @@ +#!/bin/bash +python -m caosadvancedtools.loadFiles /opt/caosdb/mnt/extroot/data +python load_and_insert_json_models.py +python test_dataset_crawler.py diff --git a/integrationtests/realworld_example/load_and_insert_json_models.py b/integrationtests/realworld_example/load_and_insert_json_models.py new file mode 100644 index 0000000000000000000000000000000000000000..682fd9c77531e63ed18dd13417399ad0d18a8de2 --- /dev/null +++ b/integrationtests/realworld_example/load_and_insert_json_models.py @@ -0,0 +1,47 @@ +#!/usr/bin/env python3 +# encoding: utf-8 +# +# This file is a part of the CaosDB Project. +# +# Copyright (C) 2022 Indiscale GmbH <info@indiscale.com> +# Copyright (C) 2022 Henrik tom Wörden <h.tomwoerden@indiscale.com> +# Copyright (C) 2022 Florian Spreckelsen <f.spreckelsen@indiscale.com> +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <https://www.gnu.org/licenses/>. 
+# +import sys + +from caosadvancedtools.models.parser import parse_model_from_json_schema, parse_model_from_yaml + + +def main(): + # First load dataspace data model + dataspace_definitions = parse_model_from_json_schema( + "schema/dataspace.schema.json") + dataspace_definitions.sync_data_model(noquestion=True) + + # Then general dataset definitions + dataset_definitions = parse_model_from_json_schema( + "schema/dataset.schema.json") + dataset_definitions.sync_data_model(noquestion=True) + + # Finally, add inheritances as defined in yaml + dataset_inherits = parse_model_from_yaml( + "schema/dataset-inheritance.yml") + dataset_inherits.sync_data_model(noquestion=True) + + +if __name__ == "__main__": + + sys.exit(main()) diff --git a/integrationtests/realworld_example/test_dataset_crawler.py b/integrationtests/realworld_example/test_dataset_crawler.py new file mode 100644 index 0000000000000000000000000000000000000000..8713f490399471dc324c542f5d0e96bfe161b60a --- /dev/null +++ b/integrationtests/realworld_example/test_dataset_crawler.py @@ -0,0 +1,123 @@ +#!/usr/bin/env python3 +# encoding: utf-8 +# +# This file is a part of the CaosDB Project. +# +# Copyright (C) 2022 Indiscale GmbH <info@indiscale.com> +# Copyright (C) 2022 Henrik tom Wörden <h.tomwoerden@indiscale.com> +# Copyright (C) 2022 Florian Spreckelsen <f.spreckelsen@indiscale.com> +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <https://www.gnu.org/licenses/>. +# + +""" +module description +""" +import json +import os + +import caosdb as db + +from caoscrawler.crawl import Crawler +from caoscrawler.converters import JSONFileConverter, DictConverter +from caoscrawler.identifiable_adapters import CaosDBIdentifiableAdapter +from caoscrawler.structure_elements import File, JSONFile, Directory +import pytest +from caosadvancedtools.models.parser import parse_model_from_json_schema, parse_model_from_yaml + +#from caosadvancedtools.testutils import clear_database, set_test_key +import sys + +# TODO is not yet merged in caosadvancedtools +# set_test_key("10b128cf8a1372f30aa3697466bb55e76974e0c16a599bb44ace88f19c8f61e2") + + +def rfp(*pathcomponents): + """ + Return full path. + Shorthand convenience function. 
+ """ + return os.path.join(os.path.dirname(__file__), *pathcomponents) + + +DATADIR = rfp("..", "test_data", "extroot", "realworld_example") + + +@pytest.fixture +def usemodel(): + # First load dataspace data model + dataspace_definitions = parse_model_from_json_schema( + os.path.join(DATADIR, "schema", "dataspace.schema.json")) + dataspace_definitions.sync_data_model(noquestion=True) + + # Then general dataset definitions + dataset_definitions = parse_model_from_json_schema( + os.path.join(DATADIR, "schema", "dataset.schema.json")) + dataset_definitions.sync_data_model(noquestion=True) + + # Finally, add inheritances as defined in yaml + dataset_inherits = parse_model_from_yaml( + os.path.join(DATADIR, "schema", "dataset-inheritance.yml")) + dataset_inherits.sync_data_model(noquestion=True) + + +def test_dataset( + # clear_database, + usemodel): + # json_file_path = rfp("test_directories", "single_file_test_data", "testjson.json") + + ident = CaosDBIdentifiableAdapter() + ident.register_identifiable( + "license", db.RecordType().add_parent("license").add_property("name")) + ident.register_identifiable("project_type", db.RecordType( + ).add_parent("project_type").add_property("name")) + ident.register_identifiable("Person", db.RecordType( + ).add_parent("Person").add_property("full_name")) + + crawler = Crawler(debug=True, identifiableAdapter=ident) + crawler_definition = crawler.load_definition( + os.path.join(DATADIR, "dataset_cfoods.yml")) + # print(json.dumps(crawler_definition, indent=3)) + # Load and register converter packages: + converter_registry = crawler.load_converters(crawler_definition) + # print("DictIntegerElement" in converter_registry) + + records = crawler.start_crawling( + Directory("data", os.path.join(DATADIR, 'data')), + crawler_definition, + converter_registry + ) + subd = crawler.debug_tree + subc = crawler.debug_metadata + # print(json.dumps(subc, indent=3)) + # print(subd) + # print(subc) + # print(records) + ins, ups = crawler.synchronize() + + dataspace = db.execute_query("FIND RECORD Dataspace WITH name=35 AND dataspace_id=20002 AND " + "archived=FALSE AND url='https://datacloud.de/index.php/f/7679'" + " AND Person", unique=True) + assert dataspace.get_property("start_date").value == "2022-03-01" + db.execute_query("FIND RECORD Person with full_name='Max Schmitt' AND" + " given_name='Max'", unique=True) + + dataset = db.execute_query(f"FIND RECORD Dataset with Dataspace={dataspace.id} AND title=" + "'Random numbers created on a random autumn day in a random office'" + "", unique=True) + assert db.execute_query(f"COUNT RECORD with id={dataset.id} AND WHICH REFERENCES Person WITH full_name=" + "'Alexa Nozone' AND WHICH REFERENCES Person WITH full_name='Max Schmitt'" + "") == 1 + assert db.execute_query(f"COUNT RECORD with id={dataset.id} AND WHICH REFERENCES Event WITH " + "start_datetime='2022-02-10T16:36:48+01:00'") == 1 diff --git a/integrationtests/test_data/extroot/realworld_example/data/35/.dataspace.json b/integrationtests/test_data/extroot/realworld_example/data/35/.dataspace.json new file mode 100644 index 0000000000000000000000000000000000000000..26e11e4e16081b8b5b64a83889bc1f4d160ef0e7 --- /dev/null +++ b/integrationtests/test_data/extroot/realworld_example/data/35/.dataspace.json @@ -0,0 +1,15 @@ +{ + "name": "DEMO", + "dataspace_id": 20002, + "archived": false, + "coordinator": { + "full_name": "Max Schmitt", + "given_name": "Max", + "family_name": "Schmitt", + "email": "max.schmitt@email.de" + }, + "start_date": "2022-03-01", + "end_date": 
"2032-02-28", + "comment": "Demonstration data space for DataCloud", + "url": "https://datacloud.de/index.php/f/7679" +} diff --git a/integrationtests/test_data/extroot/realworld_example/data/35/03_raw_data/001_dataset1/demo-dataset.csv b/integrationtests/test_data/extroot/realworld_example/data/35/03_raw_data/001_dataset1/demo-dataset.csv new file mode 100644 index 0000000000000000000000000000000000000000..7a4d684e50cf4fa0699c66d27661d0d54055ec8b --- /dev/null +++ b/integrationtests/test_data/extroot/realworld_example/data/35/03_raw_data/001_dataset1/demo-dataset.csv @@ -0,0 +1,101 @@ +index,A[kg],B[s],pH,Temp.[C] +0,2.1209183975976957,-0.5658499891692009,-0.8391639362482752,0.6210332089995103 +1,-1.2155508955759597,-1.0141121577750831,0.2503340429095144,0.7560156296323594 +2,-1.0191141299527218,-1.5870495901656396,0.6811842117478961,-0.25776671384531147 +3,-0.8235788683146266,1.1688759819188137,-0.15841036014621737,0.24351773490785233 +4,-2.028210212186099,-0.15000944869896093,0.7344551834722798,-1.0594635581726441 +5,0.8578345931586077,-1.0478958942647336,-0.5059960285526023,0.6141193812881873 +6,-0.7585068400011461,-0.45812334415522366,-0.6299981228519985,-0.072295788065162 +7,-0.34875455645064296,-0.49936600901639105,0.08492189470338947,0.24398792231786676 +8,-0.491473523786921,-1.1815449374689073,-0.23631388788457763,0.8801868915647684 +9,-1.291852196630842,0.4956544058017087,1.7176555991727498,1.8889309443940632 +10,-0.974327795079914,-0.6002779223325445,1.4950878953418667,-0.4750187681874636 +11,0.863708396863823,0.4867513929363103,-1.2500529683835453,2.1711592870838112 +12,-1.0518542498779602,-0.6800136223939168,-0.5593377295003794,-0.23451862458342732 +13,0.013421028872223972,-1.7652967848993042,0.302518679323854,1.124258888392337 +14,1.1387734213591119,-0.5602347718731282,-0.6908747870526222,0.905906598269778 +15,-1.8032949181114486,0.18858416406523845,1.0083249532267977,0.6969475009127225 +16,-0.42755813629599176,-1.0354063212247375,-0.24666198541039489,-1.2245102779938972 +17,-0.558268266895522,-1.4564784210249142,1.6162446783371565,-0.6109432350045504 +18,-0.9759505344957924,-2.780175134826593,3.039543722358096,-1.258487109407404 +19,-0.042261223623348665,0.7827311969447484,0.8902139085357877,0.33130889065513175 +20,-0.43764310886282315,-0.8338864816830261,0.8545198929035823,-0.8330242660029193 +21,0.2910454990578444,0.40786200750721635,-0.8115126892604917,0.7108997766944964 +22,0.41446462010439317,-1.0965365861313923,-0.1816041240266455,-0.18304466068648742 +23,-0.5187715545823834,-0.46490147833949275,-0.5059346590831783,0.6998562249774912 +24,2.4491154744839005,-0.3554192977203785,-0.6604902675826654,-0.9434392815439072 +25,-0.5083188860395834,0.2781724921583019,-0.4340136020292349,0.02629089617543565 +26,-0.9854213292611846,-1.398313530263303,0.05552818415139104,-0.20282242071816114 +27,1.0808664341388348,-0.681501179909626,0.6492258431774035,-0.41774069067997716 +28,-1.1767497846165254,1.0817469159915034,-1.524089495721789,0.703812702135731 +29,0.19626402088297137,-1.731421126100085,0.33753714074823216,1.167207071332792 +30,-1.1808345594828473,-0.2820078693924212,-0.8720833031493173,0.8643708946275879 +31,0.8284163458216123,0.20722015645321426,0.29071068457985955,2.6180265991342315 +32,-0.08203655784081282,0.060308831720906446,0.9519485488783623,0.7350446746473065 +33,-0.9071581669506105,0.6088044300190749,1.0099718941738625,0.002799079788086574 +34,-0.42977850177492904,1.2900375327057412,0.32028642454115197,0.8301665482611077 
+35,1.0852695299159272,-0.7040390830488096,0.7964627034904589,0.5892571532287761 +36,-1.5667114288837196,0.19932071915614016,-1.0037399027933205,0.5715977614420107 +37,1.3367378436097728,-0.4584285824179284,-0.4435084312392094,-1.3448283883056802 +38,-0.03788754387000687,-0.37288494267798383,-0.5643391975832854,0.43485956543590193 +39,1.0634390535750102,1.0881233131592658,1.2921865320956318,-0.07293734130819148 +40,1.6786504380461766,-0.03043290400609124,2.66472625811549,-0.016638240963738466 +41,-1.657581538683817,0.8240214695327108,0.32914391919723984,0.08007211199118686 +42,0.04171224685709963,-0.9854865121535178,-0.3195510216437891,-0.42540430453161987 +43,0.6506526831507736,-1.159358101323352,-1.2789107289085737,0.10499609768025624 +44,0.7402635450212406,-0.44202303578095753,-0.5748164371395315,0.5600113473434154 +45,-0.9809738202025933,0.16868168368656386,-1.5883259666916558,-2.6955712214488345 +46,-1.8495816486925372,-1.6954982682847552,1.387648046113911,0.8455399256972358 +47,1.0442607146494682,0.44438084748213075,-0.6547675875380801,-0.5557546828614935 +48,0.32905474147201974,-0.7323591467714324,0.8555098512789541,2.4647371835928196 +49,-2.5131333956577557,1.4005121469909907,-2.162216422615549,-0.3797761578463441 +50,-1.406182674849582,-0.33037485118390236,-0.30720520090625775,0.3765108318500068 +51,1.4315461764484496,0.4490657382715407,0.14688708820540236,-1.3363710028523919 +52,-1.3546100448551868,0.35309094153560083,1.1314974294901488,-0.8299500202669722 +53,-0.7668372422803534,1.3427856896905794,0.11144680945543838,0.5488627384438831 +54,2.6377507721791997,1.86851303077989,0.1358347611054535,0.0021631807468969044 +55,-0.2814604476092987,-0.8110890245372547,0.2914246407211869,1.3009776744589592 +56,-0.08220515064674763,0.06131679740379093,1.2240755323078314,1.6651435947789437 +57,-1.5833977101974248,-1.0390852809695386,0.9472604405151627,-1.1238493594739791 +58,0.060801913229076375,-1.1395369395697963,-0.6773504352336591,-0.7823844332031786 +59,0.3719151864023754,-2.6589573541115885,0.9138765623088898,1.9179285751965107 +60,0.16875501543121765,-0.21075290840365637,-0.15712808326461272,-1.095263810678978 +61,-0.6676220651512094,-2.392512574657398,-0.1970135407082481,1.1303688380560521 +62,-0.3508037371211798,0.37103055819752395,0.1319143246551687,-0.8442765717512588 +63,0.5744187610995448,0.2824163982139891,-0.23250484081352427,-0.009480528299369923 +64,-1.033847039653939,-0.6062251775571341,0.8745680740327043,0.10611431160660695 +65,0.8616095853453857,-0.7902852788672261,0.9924735544245377,-0.39017477285341734 +66,-0.25797403501959537,0.9776756368066659,-0.1774701795502288,0.8457628045096433 +67,0.1879011473947124,0.4880410431165719,0.33941695573743247,-0.3098695458944371 +68,0.12908240475251795,-0.3929831705571321,-0.9815115481276334,-0.6596680503662373 +69,0.47271005121390686,-0.27585706457228726,0.659750762879994,-1.621655291178758 +70,1.2805576221047092,1.255690982276119,0.8893506172744224,0.36843763617254915 +71,-1.8222077377715007,-1.2618097663744718,-1.2393746501949852,0.22742537143827415 +72,-0.7670935921671362,0.6632357605887813,-1.8652052380554516,-0.3566398262186697 +73,0.368513682832951,0.22484190975093934,0.7207761550523548,-0.4607733151206031 +74,-1.6353304746550132,-1.0835890398703607,0.6240782484796151,1.497716990815385 +75,1.2631082191418077,1.9388688317848526,0.43069457351954177,-0.1240852286700612 +76,1.4229945541316606,1.685287372911636,0.282616738427184,1.6075806781661712 +77,0.15907038463344916,-1.1862747951875707,-2.162241163696355,0.9048269906929861 
+78,0.8724544719304812,-0.06423147646568356,0.28403221059939265,0.7315950326908587 +79,-0.5099002924982818,0.8674753935115029,0.0015306969822590103,-0.793334121698815 +80,0.16756755106838742,-0.8374595440291756,1.871547652925694,-0.019948470822079158 +81,0.5333319586985659,-1.6076411272904392,0.4676478392958759,0.35245743045221734 +82,-0.5292514883314576,-1.2708056558247538,-1.7043012586370947,0.3391676901971921 +83,1.8042184317880245,1.2058943020996364,-2.3228385290614084,1.2008461670776127 +84,0.8671835774935015,0.9953640415286719,-1.4439272409362103,0.9410085688802767 +85,-0.118043369635042,0.41649838899300184,-1.2993225013700294,1.9232397286356342 +86,-0.32517525711392864,0.062481999278585824,-0.27679161049236684,0.06555334954413516 +87,-0.39336711632154264,0.0790516633124132,-0.600204351381406,1.321653482130525 +88,-0.9789171222367312,0.30688902979967303,0.10346158693798674,0.3160642853129814 +89,0.4332454768673768,-0.620828990252391,-1.0710192139922268,0.15027972939295933 +90,3.1092106995021096,0.354640404873306,1.8164064530643516,1.8911595405760606 +91,0.7027212216033006,-1.9367414347582559,-0.26797308254438235,1.1063820286927997 +92,0.6665636818250888,0.7953561614160027,1.8164132351496374,1.5760380002772454 +93,-1.4931006068027144,0.2680846074746922,-0.30697269318261355,-0.5300118028948997 +94,0.9258476710590248,0.15464742730214845,0.5847769923450901,-0.8405562302565793 +95,0.3015957125126854,2.9697978560379323,2.2793789547159338,0.13951152352691706 +96,0.4109127837045091,0.04501972229381512,0.5969781411176205,1.6443498245829686 +97,0.07956221270863263,0.009072464866011773,-0.6905847540574735,-0.9639714900867246 +98,2.9172401959670817,0.43571229891911717,-0.903738601954934,0.08343820441617454 +99,0.5501333973314503,-0.2511364474548299,1.4945524498890597,-1.1608586317841827 diff --git a/integrationtests/test_data/extroot/realworld_example/data/35/03_raw_data/001_dataset1/metadata.json b/integrationtests/test_data/extroot/realworld_example/data/35/03_raw_data/001_dataset1/metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..64df90e55eff065b1cc249a634444a72f9fd00d2 --- /dev/null +++ b/integrationtests/test_data/extroot/realworld_example/data/35/03_raw_data/001_dataset1/metadata.json @@ -0,0 +1,50 @@ +{ + "title": "Random numbers created on a random autumn day in a random office", + "abstract": "For demonstration purposes we created random numbers on a computer in an office of the CLOUD. 
This demonstration dataset is used in the DataCloud, a curated cloud storage for scientific data.",
+  "Event": [
+    {
+      "longitude": 18.445078548041533,
+      "start_datetime": "2022-02-10T16:36:48+01:00",
+      "latitude": 53.10833068997861,
+      "elevation": 2,
+      "location": "Bremen, Germany"
+    }
+  ],
+  "license": "CC-BY",
+  "authors": [
+    {
+      "firstname": "Max",
+      "lastname": "Schmitt",
+      "full_name": "Max Schmitt",
+      "affiliation": "CLOUD",
+      "ORCID": "0000-0001-6233-1866",
+      "email": "max.schmitt@email.de"
+    },
+    {
+      "firstname": "Alexa",
+      "lastname": "Nozone",
+      "full_name": "Alexa Nozone",
+      "affiliation": "CLOUD",
+      "email": "alexa.nozone@email.de"
+    }
+  ],
+  "comment": "For questions about the DataCloud or this demonstration dataset, contact research-data@email.de",
+  "project": {
+    "name": "Demonstration of Extremely important Metadata in Folders",
+    "full_name": "Project",
+    "project_acronym": "DEMO",
+    "project_type": "national",
+    "institute": "CLOUD",
+    "start_date": "2021-10-01",
+    "end_date": "2031-12-31",
+    "url": "https://www.cloud.de/de/forschung-infrastruktur/forschungsdaten-services.html",
+    "coordinator": {
+      "firstname": "Max",
+      "lastname": "Schmitt",
+      "email": "max.schmitt@email.de"
+    }
+  },
+  "method": {
+    "name": "Random Number Generator"
+  }
+}
diff --git a/integrationtests/test_data/extroot/realworld_example/data/35/03_raw_data/README_RawData.md b/integrationtests/test_data/extroot/realworld_example/data/35/03_raw_data/README_RawData.md
new file mode 100644
index 0000000000000000000000000000000000000000..2317ff8616c43e75f52637ff581017bf4a50d468
--- /dev/null
+++ b/integrationtests/test_data/extroot/realworld_example/data/35/03_raw_data/README_RawData.md
@@ -0,0 +1,25 @@
+# Raw Data
+
+The `03_raw_data` folder is here to store all raw data of each dataset
+associated with the project – the data that has not been edited by you yet but
+which you plan to use in your research. It can be e.g. your unprocessed field
+sampling records, or useful data from an online repository. Organize your data
+in this folder in the following way:
+
+- Each dataset should reside inside a subfolder. It is recommended to number and name these folders clearly, e.g. `03_raw_data/001_precipitationgermany2017`.
+
+- **IMPORTANT**: provide the folder with information about your raw data by
+  filling out a metadata form for each of your datasets! For this,
+
+  - either copy the `metadata-template.json` file and put it into your dataset
+    folder. Open the copy with a text editor and fill out the fields.
+  - or use the metadata editor in the DataCloud web client (press the "+" button
+    and use "New metadata.json" file)
+
+  If you can’t find information about your data to fill in here, you should
+  reconsider using it - it is important to be able to trace your data sources to
+  ensure a FAIR scientific process!
+
+- For processing any of the data, make a copy of the dataset and paste it into
+  the `04_data_processing` folder. This way, you make sure to keep your raw data
+  in its original state.
\ No newline at end of file
diff --git a/integrationtests/test_data/extroot/realworld_example/data/35/03_raw_data/metadata-template.json b/integrationtests/test_data/extroot/realworld_example/data/35/03_raw_data/metadata-template.json
new file mode 100644
index 0000000000000000000000000000000000000000..7f457d239321b232fb2db7d46f4e1576c85911b0
--- /dev/null
+++ b/integrationtests/test_data/extroot/realworld_example/data/35/03_raw_data/metadata-template.json
@@ -0,0 +1,52 @@
+{
+  "dataset": {
+    "title": "",
+    "abstract": "See https://github.com/CLOUD/metadata-schema for schema specification",
+    "license": "CC-BY",
+    "authors": [
+      {
+        "firstname": "",
+        "lastname": "",
+        "affiliation": "",
+        "ORCID": "XXXX-XXXX-XXXX-XXXX",
+        "email": "name@domain.de"
+      },
+      {
+        "firstname": "",
+        "lastname": "",
+        "affiliation": "",
+        "email": "name@domain.de",
+        "ORCID": "XXXX-XXXX-XXXX-XXXX"
+      }
+    ],
+    "project": {
+      "name": "",
+      "acronym": "",
+      "type": "DFG/",
+      "institute": "CLOUD",
+      "start_date": "YYYY-MM-DD",
+      "end_date": "YYYY-MM-DD",
+      "url": "",
+      "coordinator": {
+        "lastname": "",
+        "email": "",
+        "firstname": ""
+      }
+    },
+    "events_in_data": false,
+    "events": [
+      {
+        "longitude": 0,
+        "latitude": 0,
+        "elevation": 0,
+        "location": "",
+        "datetime": "YYYY-MM-DDTHH:mm:ss"
+      }
+    ],
+    "method": {
+      "name": "",
+      "url": ""
+    },
+    "max_files": 100
+  }
+}
diff --git a/integrationtests/test_data/extroot/realworld_example/data/35/04_data_processing/README_ProcessedData.md b/integrationtests/test_data/extroot/realworld_example/data/35/04_data_processing/README_ProcessedData.md
new file mode 100644
index 0000000000000000000000000000000000000000..ce1b002b18772b85f4bba3a222574f438a6ed0e3
--- /dev/null
+++ b/integrationtests/test_data/extroot/realworld_example/data/35/04_data_processing/README_ProcessedData.md
@@ -0,0 +1,10 @@
+# Data Processing
+
+The actual work is done in this `04_data_processing` folder. Depending on your
+field and the type and size of your project, you can organize this folder in
+the way that fits your process best. Here, a bit of chaos can happen ;) Keep in
+mind to document your processing steps in the `02_materials_and_methods` folder
+and to put your final results into the `05_results` folder. At the end of your
+project, it should be possible to delete everything in this folder and
+reconstruct the working process using the documentation and raw data from
+previous folders.
diff --git a/integrationtests/test_data/extroot/realworld_example/data/35/04_data_processing/metadata-template.json b/integrationtests/test_data/extroot/realworld_example/data/35/04_data_processing/metadata-template.json
new file mode 100644
index 0000000000000000000000000000000000000000..05f9394dfbfa9a0b2b4844c7080a340585a9050f
--- /dev/null
+++ b/integrationtests/test_data/extroot/realworld_example/data/35/04_data_processing/metadata-template.json
@@ -0,0 +1,52 @@
+{
+  "dataset": {
+    "title": "",
+    "abstract": "See https://github.com/cloud/metadata-schema for schema specification",
+    "license": "CC-BY",
+    "authors": [
+      {
+        "firstname": "",
+        "lastname": "",
+        "affiliation": "",
+        "ORCID": "XXXX-XXXX-XXXX-XXXX",
+        "email": "name@domain.de"
+      },
+      {
+        "firstname": "",
+        "lastname": "",
+        "affiliation": "",
+        "email": "name@domain.de",
+        "ORCID": "XXXX-XXXX-XXXX-XXXX"
+      }
+    ],
+    "project": {
+      "name": "",
+      "acronym": "",
+      "type": "DFG/",
+      "institute": "CLOUD",
+      "start_date": "YYYY-MM-DD",
+      "end_date": "YYYY-MM-DD",
+      "url": "",
+      "coordinator": {
+        "lastname": "",
+        "email": "",
+        "firstname": ""
+      }
+    },
+    "events_in_data": false,
+    "events": [
+      {
+        "longitude": 0,
+        "latitude": 0,
+        "elevation": 0,
+        "location": "",
+        "datetime": "YYYY-MM-DDTHH:mm:ss"
+      }
+    ],
+    "method": {
+      "name": "",
+      "url": ""
+    },
+    "max_files": 100
+  }
+}
diff --git a/integrationtests/test_data/extroot/realworld_example/data/35/05_results/README_Results.md b/integrationtests/test_data/extroot/realworld_example/data/35/05_results/README_Results.md
new file mode 100644
index 0000000000000000000000000000000000000000..ae0ab6571c52c0ec9a1cdc8aba27b31fd3be6fcc
--- /dev/null
+++ b/integrationtests/test_data/extroot/realworld_example/data/35/05_results/README_Results.md
@@ -0,0 +1,7 @@
+# Results
+
+All the results that are final versions of your data analysis or processing
+should be copied into this `05_results` folder. Organize your results folder in
+the way most fitting to your project.
+
+Provide metadata for your results files.
diff --git a/integrationtests/test_data/extroot/realworld_example/data/35/README.md b/integrationtests/test_data/extroot/realworld_example/data/35/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..809d699c462d064ff5193add8e23677bec84b0e0
--- /dev/null
+++ b/integrationtests/test_data/extroot/realworld_example/data/35/README.md
@@ -0,0 +1,5 @@
+# Dataspace: DEMO
+
+This is a Dataspace in the CLOUD DataCloud providing safe, curated cloud storage
+for all of CLOUD's research data.
+
diff --git a/integrationtests/test_data/extroot/realworld_example/dataset_cfoods.yml b/integrationtests/test_data/extroot/realworld_example/dataset_cfoods.yml
new file mode 100644
index 0000000000000000000000000000000000000000..1589cba2b44afc3e2645b0ee72f91bf83b327032
--- /dev/null
+++ b/integrationtests/test_data/extroot/realworld_example/dataset_cfoods.yml
@@ -0,0 +1,528 @@
+# This file is a part of the CaosDB Project.
+#
+# Copyright (C) 2022 IndiScale GmbH <info@indiscale.com>
+# Copyright (C) 2022 Florian Spreckelsen <f.spreckelsen@indiscale.com>
+#
+# This program is free software: you can redistribute it and/or modify it under
+# the terms of the GNU Affero General Public License as published by the Free
+# Software Foundation, either version 3 of the License, or (at your option) any
+# later version.
+# +# This program is distributed in the hope that it will be useful, but WITHOUT +# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +# FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more +# details. +# +# You should have received a copy of the GNU Affero General Public License along +# with this program. If not, see <https://www.gnu.org/licenses/>. +# +Data: + type: Directory + match: data + subtree: + dataspace_dir: + type: Directory + match: (?P<dataspace_dir_number>[0-9]+) + records: + Dataspace: + name: $dataspace_dir_number + subtree: + dataspace_json: + type: JSONFile + match: .dataspace.json + validate: schema/dataspace.schema.json + subtree: + dataspace_id_element: + type: DictIntegerElement + match_name: "dataspace_id" + match_value: "(?P<id>[0-9]+)" + records: + Dataspace: + dataspace_id: $id + archived_element: + type: DictBooleanElement + match_name: "archived" + match_value: "(?P<archived>.*)" + records: + Dataspace: + archived: $archived + url_element: + type: DictTextElement + match_name: "url" + match_value: "(?P<url>.*)" + records: + Dataspace: + url: $url + coordinator_element: + type: DictDictElement + match_name: "coordinator" + records: + Person: + parents: + - Person + Dataspace: + Person: $Person + subtree: &person_subtree + full_name_element: + type: DictTextElement + match_name: "full_name" + match_value: "(?P<full_name>.*)" + records: + Person: + full_name: $full_name + full_name_nonlatin_element: + type: DictTextElement + match_name: "full_name_nonlatin" + match_value: "(?P<full_name_nonlatin>.*)" + records: + Person: + full_name_nonlatin: $full_name_nonlatin + family_name_element: + type: DictTextElement + match_name: "family_name" + match_value: "(?P<family_name>.*)" + records: + Person: + family_name: $family_name + given_name_element: + type: DictTextElement + match_name: "given_name" + match_value: "(?P<given_name>.*)" + records: + Person: + given_name: $given_name + email_element: + type: DictTextElement + match_name: "email" + match_value: "(?P<email>.*)" + records: + Person: + email: $email + affiliation_element: + type: DictTextElement + match_name: "affiliation" + match_value: "(?P<affiliation>.*)" + records: + Person: + affiliation: $affiliation + ORCID_element: + type: DictTextElement + match_name: "ORCID" + match_value: "(?P<ORCID>.*)" + records: + Person: + ORCID: $ORCID + start_date_element: + type: DictTextElement + match_name: "start_date" + match_value: "(?P<start_date>.*)" + records: + Dataspace: + start_date: $start_date + end_date_element: + type: DictTextElement + match_name: "end_date" + match_value: "(?P<end_date>.*)" + records: + Dataspace: + end_date: $end_date + comment: + type: DictTextElement + match_name: "comment" + match_value: "(?P<comment>.*)" + records: + Dataspace: + comment: $comment + raw_data_dir: + type: Directory + match: 03_raw_data + subtree: &template + # TODO collect info from metadata.json and look into sub-directories + # (only one level) for metadata.json + dataset_dir: + match: (?P<dataset_dir_name>.*) + type: Directory + records: + Dataset: + Dataspace: $Dataspace + subtree: + metadata_json: &metadata_json_template + type: JSONFile + match: metadata.json + validate: schema/dataset.schema.json + subtree: + title_element: + type: DictTextElement + match_name: "title" + match_value: "(?P<title>.*)" + records: + Dataset: + title: $title + authors_element: + type: DictListElement + match_name: "authors" + subtree: + author_element: + type: Dict + records: + 
Person: + parents: + - Person + Dataset: + authors: +$Person + subtree: *person_subtree + abstract_element: + type: DictTextElement + match_name: "abstract" + match_value: "(?P<abstract>.*)" + records: + Dataset: + abstract: $abstract + comment_element: + type: DictTextElement + match_name: "comment" + match_value: "(?P<comment>.*)" + records: + Dataset: + comment: $comment + license_element: + type: DictTextElement + match_name: "license" + match_value: "(?P<license_name>.*)" + records: + license: + # TODO: As soon as such things can be validated, a + # creation of a new license has to be forbidden here + # (although this is effectively done already by + # validating against the above schema.) + name: $license_name + Dataset: + license: $license + dataset_doi_element: + type: DictTextElement + match_name: "dataset_doi" + match_value: "(?P<dataset_doi>.*)" + records: + Dataset: + dataset_doi: $dataset_doi + related_to_dois_element: + type: DictListElement + match_name: "related_to_dois" + subtree: + related_to_doi_element: + type: TextElement + match: "(?P<related_to_doi>.*)" + records: + Dataset: + related_to_dois: +$related_to_doi + Keywords_element: + type: DictListElement + match_name: "Keyword" + Events_element: + type: DictListElement + match_name: "Event" + subtree: + Event_element: + type: Dict + records: + Event: + parents: + - Event + Dataset: + Event: +$Event + subtree: + label_element: + type: DictTextElement + match_name: "label" + match_value: "(?P<label>.*)" + records: + Event: + label: $label + comment_element: + type: DictTextElement + match_name: "comment" + match_value: "(?P<comment>.*)" + records: + Event: + comment: $comment + start_datetime_element: + type: DictTextElement + match_name: start_datetime + match_value: "(?P<start_datetime>.*)" + records: + Event: + start_datetime: $start_datetime + end_datetime_element: + type: DictTextElement + match_name: end_datetime + match_value: "(?P<end_datetime>.*)" + records: + Event: + end_datetime: $end_datetime + longitude_element: + type: DictFloatElement + match_name: "longitude" + match_value: "(?P<longitude>.*)" + records: + Event: + longitude: $longitude + latitude_element: + type: DictFloatElement + match_name: "latitude" + match_value: "(?P<latitude>.*)" + records: + Event: + latitude: $latitude + elevation_element: + type: DictFloatElement + match_name: "elevation" + match_value: "(?P<elevation>.*)" + records: + Event: + elevation: $elevation + location_element: + type: DictTextElement + match_name: location + match_value: "(?P<location>.*)" + records: + Event: + location: $location + igsn_element: + type: DictTextElement + match_name: igsn + match_value: "(?P<igsn>.*)" + records: + Event: + igsn: $igsn + events_in_data_element: + type: DictBooleanElement + match_name: "events_in_data" + match_value: "(?P<events_in_data>.*)" + records: + Dataset: + events_in_data: $events_in_data + geojson_element: + type: DictTextElement + match_name: "geojson" + match_value: "(?P<geojson>.*)" + records: + Dataset: + geojson: $geojson + project_element: + type: DictDictElement + match_name: "project" + records: + Project: + parents: + - Project + Dataset: + Project: $Project + subtree: + full_name_element: + type: DictTextElement + match_name: "full_name" + match_value: "(?P<full_name>.*)" + records: + Project: + full_name: $full_name + project_id_element: + type: DictTextElement + match_name: "project_id" + match_value: "(?P<project_id>.*)" + records: + Project: + project_id: $project_id + project_type_element: + type: 
DictTextElement + match_name: "project_type" + match_value: "(?P<project_type_name>.*)" + records: + project_type: + name: $project_type_name + Project: + project_type: $project_type + institute_element: + type: DictTextElement + match_name: "institute" + match_value: "(?P<institute>.*)" + records: + Project: + institute: $institute + start_date_element: + type: DictTextElement + match_name: "start_date" + match_value: "(?P<start_date>.*)" + records: + Project: + start_date: $start_date + end_date_element: + type: DictTextElement + match_name: "end_date" + match_value: "(?P<end_date>.*)" + records: + Project: + end_date: $end_date + url_element: + type: DictTextElement + match_name: "url" + match_value: "(?P<url>.*)" + records: + Project: + url: $url + coordinators_element: + type: DictListElement + match_name: "coordinators" + subtree: + coordinator_element: + type: Dict + records: + Person: + parents: + - Person + Project: + coordinators: +$Person + subtree: *person_subtree + campaign_element: + type: DictDictElement + match_name: "campaign" + records: + Campaign: + parents: + - Campaign + Dataset: + Campaign: $Campaign + subtree: + label_element: + type: DictTextElement + match_name: "label" + match_value: "(?P<label>.*)" + records: + Campaign: + label: $label + optional_label_element: + type: DictTextElement + match_name: "optional_label" + match_value: "(?P<optional_label>.*)" + records: + Campaign: + optional_label: $optional_label + start_date_element: + type: DictTextElement + match_name: "start_date" + match_value: "(?P<start_date>.*)" + records: + Campaign: + start_date: $start_date + end_date_element: + type: DictTextElement + match_name: "end_date" + match_value: "(?P<end_date>.*)" + records: + Campaign: + end_date: $end_date + responsible_scientists_element: + type: DictListElement + match_name: "responsible_scientists" + subtree: + responsible_scientist_element: + type: Dict + records: + Person: + parents: + - Person + Campaign: + responsible_scientists: +$Person + subtree: *person_subtree + Methods_element: + type: DictListElement + match_name: "Method" + subtree: + Method_element: + type: Dict + records: + Method: + parents: + - Method + Dataset: + Method: +$Method + subtree: + method_name_element: + type: DictTextElement + match_name: "method_name" + match_value: "(?P<method_name>.*)" + records: + Method: + name: $method_name + abbreviation_element: + type: DictTextElement + match_name: "abbreviation" + match_value: "(?P<abbreviation>.*)" + records: + Method: + abbreviation: $abbreviation + url_element: + type: DictTextElement + match_name: "url" + match_value: "(?P<url>.*)" + records: + Method: + url: $url + Taxa_element: + type: DictListElement + match_name: "Taxon" + subtree: + Taxon_element: + type: Dict + records: + Taxon: + parents: + - Taxon + Dataset: + Taxon: +$Taxon + subtree: + taxon_name_element: + type: DictTextElement + match_name: "taxon_name" + match_value: "(?P<taxon_name>.*)" + records: + Taxon: + name: $taxon_name + archived_element: + type: DictBooleanElement + match_name: "archived" + match_value: "(?P<archived>.*)" + records: + Dataset: + archived: $archived + publication_date_element: + type: DictTextElement + match_name: "publication_date" + match_value: "(?P<publication_date>.*)" + records: + Dataset: + publication_date: $publication_date + max_files_element: + type: DictIntegerElement + match_name: "max_files" + match_value: "(?P<max_files>.*)" + records: + Dataset: + max_files: $max_files + auxiliary_file: &aux_file_template + type: File + match: 
"(?P<aux_file_name>(?!metadata.json).*)" + # TODO File, path and reference dataset in file record + child_dataset_dir: + type: Directory + match: (?P<child_dataset_dir_name>.*) + subtree: + metadata_json: *metadata_json_template + auxiliary_file: *aux_file_template + data_processing_dir: + type: Directory + match: 04_data_processing + subtree: *template + results_dir: + type: Directory + match: 05_results + subtree: *template diff --git a/integrationtests/test_data/extroot/realworld_example/schema/README.md b/integrationtests/test_data/extroot/realworld_example/schema/README.md new file mode 100644 index 0000000000000000000000000000000000000000..e0bb95f8b844374bba72c7c6989ac57cfa5fc305 --- /dev/null +++ b/integrationtests/test_data/extroot/realworld_example/schema/README.md @@ -0,0 +1,37 @@ +# Dataset Schemas + +These schema's are derived from the [metadata +schemas](https://github.com/leibniz-zmt/zmt-metadata-schema) used at the Leibniz +Center for Tropical Marine Research (Leibniz ZMT). + +# Copyright + +BSD 3-Clause License + +Copyright (c) 2022 ZMT +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +* Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + +* Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +* Neither the name of the copyright holder nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
diff --git a/integrationtests/test_data/extroot/realworld_example/schema/dataset-inheritance.yml b/integrationtests/test_data/extroot/realworld_example/schema/dataset-inheritance.yml new file mode 100644 index 0000000000000000000000000000000000000000..3d12053a0007cdea1005e7673db69f46b35a063d --- /dev/null +++ b/integrationtests/test_data/extroot/realworld_example/schema/dataset-inheritance.yml @@ -0,0 +1,18 @@ +extern: +- Keyword +- Taxon +- full_name +- full_name_nonlatin +- name + +full_name: + inherit_from_obligatory: + - name + +full_name_nonlatin: + inherit_from_obligatory: + - name + +Taxon: + inherit_from_obligatory: + - Keyword diff --git a/integrationtests/test_data/extroot/realworld_example/schema/dataset.schema.json b/integrationtests/test_data/extroot/realworld_example/schema/dataset.schema.json new file mode 100644 index 0000000000000000000000000000000000000000..83d6a60d857349772c960af637671cb21c8abd5d --- /dev/null +++ b/integrationtests/test_data/extroot/realworld_example/schema/dataset.schema.json @@ -0,0 +1,365 @@ +{ + "title": "Dataset", + "description": "", + "type": "object", + "properties": { + "title": { + "type": "string", + "description": "full dataset title" + }, + "authors": { + "type": "array", + "items": { + "type": "object", + "title": "Person", + "properties": { + "full_name": { + "type": "string", + "description": "Full name (latin transcription, all UTF-8 characters allowed)" + }, + "full_name_nonlatin": { + "type": "string", + "description": "Full name (non-latin alphabet)" + }, + "family_name": { + "type": "string", + "description": "Family name (latin transcription)" + }, + "given_name": { + "type": "string", + "description": "Given/other names (latin transcription)" + }, + "affiliation": { + "type": "string" + }, + "ORCID": { + "type": "string", + "description": "ORCID identifier as 16-digit number, e.g. 0000-0001-6233-1866", + "pattern": "^\\d{4}-\\d{4}-\\d{4}-\\d{4}$" + }, + "email": { + "type": "string", + "format": "email" + } + }, + "required": [ + "full_name", + "email" + ] + }, + "minItems": 1, + "uniqueItems": true + }, + "abstract": { + "type": "string", + "minLength": 80, + "maxLength": 1000, + "description": "Abstract with at least 80 characters" + }, + "comment": { + "type": "string" + }, + "license": { + "type": "string", + "enum": [ + "CC-BY", + "CC-BY-SA", + "CC0", + "restricted access" + ] + }, + "dataset_doi": { + "type": "string", + "pattern": "(10[.][0-9]{4,}(?:[.][0-9]+)*/(?:(?![%\"#? ])\\S)+)", + "description": "Dataset DOI, e.g. 10.1594/PANGAEA.938740" + }, + "related_to_dois": { + "type": "array", + "items": { + "type": "string", + "pattern": "(10[.][0-9]{4,}(?:[.][0-9]+)*/(?:(?![%\"#? ])\\S)+)" + }, + "description": "DOIs of related publications and/or datasets, e.g. 
10.1000/182" + }, + "Keyword": { + "type": "array", + "items": { + "type": "object", + "properties": { + "name": { + "type": "string" + } + } + } + }, + "Event": { + "type": "array", + "description": "https://wiki.pangaea.de/wiki/Event", + "items": { + "type": "object", + "properties": { + "label": { + "type": "string" + }, + "comment": { + "type": "string" + }, + "start_datetime": { + "type": "string", + "format": "date-time" + }, + "end_datetime": { + "type": "string", + "format": "date-time" + }, + "longitude": { + "type": "number", + "minimum": -180, + "maximum": 180, + "description": "longitude (W/E) in decimal degree (-180 to 180)" + }, + "latitude": { + "type": "number", + "minimum": -90, + "maximum": 90, + "description": "latitude (N/S) in decimal degree (-90 to 90)" + }, + "elevation": { + "type": "number", + "minimum": -10000, + "maximum": 20000, + "description": "elevation in m" + }, + "location": { + "type": "string", + "description": "geographical location as text (e.g., North Sea; Espoo, Finland)" + }, + "igsn": { + "type": "string", + "description": "International Geo Sample Number (http://www.geosamples.org/aboutigsn)" + } + }, + "required": [ + "longitude", + "latitude", + "start_datetime" + ] + } + }, + "events_in_data": { + "type": "boolean", + "description": "Does the data contain additional information about timepoints and locations?" + }, + "geojson": { + "type": "string", + "pattern": "", + "description": "GeoJSON for complex geographic structures" + }, + "project": { + "title": "Project", + "description": "https://wiki.pangaea.de/wiki/Project", + "type": "object", + "properties": { + "name": { + "type": "string", + "description": "short name of project" + }, + "full_name": { + "type": "string", + "description": "Full name (latin transcription, all UTF-8 characters allowed)" + }, + "project_id": { + "type": "string", + "description": "Project ID" + }, + "project_type": { + "type": "string", + "enum": [ + "DFG", + "EU", + "BMBF", + "national", + "international" + ] + }, + "institute": { + "type": "string", + "description": "place of coordination or project office", + "default": "Centre for Research" + }, + "start_date": { + "type": "string", + "format": "date" + }, + "end_date": { + "type": "string", + "format": "date" + }, + "url": { + "type": "string", + "format": "uri" + }, + "coordinators": { + "type": "array", + "items": { + "type": "object", + "title": "Person", + "properties": { + "full_name": { + "type": "string", + "description": "Full name (latin transcription, all UTF-8 characters allowed)" + }, + "full_name_nonlatin": { + "type": "string", + "description": "Full name (non-latin alphabet)" + }, + "family_name": { + "type": "string", + "description": "Family name (latin transcription)" + }, + "given_name": { + "type": "string", + "description": "Given/other names (latin transcription)" + }, + "affiliation": { + "type": "string" + }, + "ORCID": { + "type": "string", + "description": "ORCID identifier as 16-digit number, e.g. 
0000-0001-6233-1866", + "pattern": "^\\d{4}-\\d{4}-\\d{4}-\\d{4}$" + }, + "email": { + "type": "string", + "format": "email" + } + }, + "required": [ + "full_name", + "email" + ] + }, + "minItems": 1, + "uniqueItems": true + } + }, + "required": ["name", "full_name"] + }, + "campaign": { + "title": "Campaign", + "description": "https://wiki.pangaea.de/wiki/Campaign, synonyms: cruise, expedition, leg, ", + "type": "object", + "properties": { + "label": { + "type": "string", + "description": "is unique and does not contain blanks; uses abbreviations instead of full names" + }, + "optional_label": { + "type": "string" + }, + "start_date": { + "type": "string", + "format": "date" + }, + "end_date": { + "type": "string", + "format": "date" + }, + "responsible_scientists": { + "type": "array", + "items": { + "type": "object", + "title": "Person", + "properties": { + "full_name": { + "type": "string", + "description": "Full name (latin transcription, all UTF-8 characters allowed)" + }, + "full_name_nonlatin": { + "type": "string", + "description": "Full name (non-latin alphabet)" + }, + "family_name": { + "type": "string", + "description": "Family name (latin transcription)" + }, + "given_name": { + "type": "string", + "description": "Given/other names (latin transcription)" + }, + "affiliation": { + "type": "string" + }, + "ORCID": { + "type": "string", + "description": "ORCID identifier as 16-digit number, e.g. 0000-0001-6233-1866", + "pattern": "^\\d{4}-\\d{4}-\\d{4}-\\d{4}$" + }, + "email": { + "type": "string", + "format": "email" + } + }, + "required": [ + "full_name", + "email" + ] + }, + "minItems": 1, + "uniqueItems": true + } + } + }, + "Method": { + "type": "array", + "items": { + "type": "object", + "description": "https://wiki.pangaea.de/wiki/Method", + "properties": { + "method_name": { + "type": "string", + "description": "full official name of tool/instrument/device/gear" + }, + "abbreviation": { + "type": "string", + "description": "may be used for import in an event list to avoid misspellings" + }, + "url": { + "type": "string", + "description": "should contain a web address where an official description of the device can be found" + } + } + } + }, + "Taxon": { + "type": "array", + "items": { + "type": "object", + "properties": { + "name": { + "type": "string" + } + } + } + }, + "archived": { + "type": "boolean", + "description": "Has the dataset been archived?", + "default": false + }, + "publication_date": { + "type": "string", + "format": "date" + }, + "max_files": { + "type": "integer", + "description": "Maximum number of files to be included by the CaosDB crawler", + "default": 100 + } + }, + "required": [ + "title", + "authors", + "abstract" + ] +} diff --git a/integrationtests/test_data/extroot/realworld_example/schema/dataspace.schema.json b/integrationtests/test_data/extroot/realworld_example/schema/dataspace.schema.json new file mode 100644 index 0000000000000000000000000000000000000000..01653bfa821e0a0acbb5a481bfd458e2ed784fb9 --- /dev/null +++ b/integrationtests/test_data/extroot/realworld_example/schema/dataspace.schema.json @@ -0,0 +1,45 @@ +{ + "title": "Dataspace", + "description": "A Dataspace is a folder in the DataCloud with a pre-defined structure", + "type": "object", + "properties": { + "dataspace_id": { + "type": "integer", + "description": "Integer ID of Dataspace (matches LDAP GID)", + "minimum": 20000 + }, + "archived": { "type": "boolean" }, + "url": { + "type": "string", + "description": "link to folder on file system (CaosDB or cloud folder)" + }, + 
"coordinator": { + "type": "object", + "title": "Person", + "properties": { + "full_name": { + "type": "string", + "description": "Full name (latin transcription, all UFT-8 characters allowed)" + }, + "full_name_nonlatin": { + "type": "string", + "description": "Full name (non-latin alphabet)" + }, + "family_name": { + "type": "string", + "description": "Family name (latin transcription)" + }, + "given_name": { + "type": "string", + "description": "Given/other names (latin transcription)" + }, + "email": { "type": "string", "format": "email" } + }, + "required": ["full_name", "email"] + }, + "start_date": { "type": "string", "format": "date" }, + "end_date": { "type": "string", "format": "date" }, + "comment": { "type": "string" } + }, + "required": ["dataspace_id", "url", "coordinator"] +} diff --git a/integrationtests/test_data/extroot/realworld_example/schema/zmt-organisation.yml b/integrationtests/test_data/extroot/realworld_example/schema/zmt-organisation.yml new file mode 100644 index 0000000000000000000000000000000000000000..7e251eeced7bf626e77364fc5555b1cb10dd3afb --- /dev/null +++ b/integrationtests/test_data/extroot/realworld_example/schema/zmt-organisation.yml @@ -0,0 +1,26 @@ +extern: +- name +- url +- Dataset + +german_name: + datatype: TEXT + inherit_from_obligatory: + - name + +Department: + recommended_properties: + url: + german_name: + + +WorkingGroup: + recommended_properties: + Department: + german_name: + url: + +Dataset: + recommended_properties: + WorkingGroup: + diff --git a/setup.cfg b/setup.cfg index b89b07543d91dd35e2238aaddd363e85dd45f2d2..2f8d46b30ee04d68adc6aef69e1a04115bbc44d8 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,5 +1,5 @@ [metadata] -name = newcrawler +name = caoscrawler version = 0.1 author = Alexander Schlemmer author_email = alexander.schlemmer@ds.mpg.de @@ -16,7 +16,6 @@ classifiers = [options] package_dir = = src - packages = find: python_requires = >=3.6 install_requires = @@ -24,6 +23,7 @@ install_requires = caosdb caosadvancedtools yaml-header-tools + pyyaml [options.packages.find] where = src @@ -35,4 +35,4 @@ per-file-ignores = __init__.py:F401 [options.entry_points] console_scripts = - crawler2.0 = newcrawler.crawl:main + caosdb-crawler = caoscrawler.crawl:main diff --git a/src/newcrawler/__init__.py b/src/caoscrawler/__init__.py similarity index 100% rename from src/newcrawler/__init__.py rename to src/caoscrawler/__init__.py diff --git a/src/newcrawler/cfood-schema.yml b/src/caoscrawler/cfood-schema.yml similarity index 100% rename from src/newcrawler/cfood-schema.yml rename to src/caoscrawler/cfood-schema.yml diff --git a/src/newcrawler/converters.py b/src/caoscrawler/converters.py similarity index 96% rename from src/newcrawler/converters.py rename to src/caoscrawler/converters.py index ebc3ab19ceb0f8c18cba5cb1bc3f86d5e31bfb84..b8b9bd2ce7bff206d1233953f05c795a45a5b4ca 100644 --- a/src/newcrawler/converters.py +++ b/src/caoscrawler/converters.py @@ -37,6 +37,7 @@ from .structure_elements import (StructureElement, Directory, File, Dict, JSONFi TextElement, DictTextElement, DictElement, DictListElement) from typing import Optional, Union from abc import abstractmethod +from string import Template import yaml_header_tools import yaml @@ -63,6 +64,7 @@ def handle_value(value: Union[dict, str], values: GeneralStore): - the final value of the property - the collection mode (can be single, list or multiproperty) """ + # @review Florian Spreckelsen 2022-05-13 if type(value) == dict: if "value" not in value: @@ -90,12 +92,20 @@ def 
handle_value(value: Union[dict, str], values: GeneralStore): propvalue = value return (propvalue, collection_mode) - if propvalue.startswith("$"): - propvalue = values[propvalue[1:]] - # Allow the insertion of $ signs at the beginning - if type(propvalue) == str and propvalue.startswith("$$"): - propvalue = propvalue[1:] - + # Check if the replacement is a single variable containing a record: + match = re.match(r"^\$(\{)?(?P<varname>[0-9a-zA-Z_]+)(\})?$", propvalue) + if match is not None: + varname = match.group("varname") + if varname in values: + if values[varname] is None: + propvalue = None + return (propvalue, collection_mode) + if isinstance(values[varname], db.Entity): + propvalue = values[varname] + return (propvalue, collection_mode) + + propvalue_template = Template(propvalue) + propvalue = propvalue_template.safe_substitute(**values.get_storage()) return (propvalue, collection_mode) diff --git a/src/newcrawler/crawl.py b/src/caoscrawler/crawl.py similarity index 96% rename from src/newcrawler/crawl.py rename to src/caoscrawler/crawl.py index 605f1463d9853a100443ea8ed698e4169266fa13..b0f576a2c73342cc1301ff0f27b74bb519768541 100644 --- a/src/newcrawler/crawl.py +++ b/src/caoscrawler/crawl.py @@ -122,6 +122,23 @@ def check_identical(record1: db.Entity, record2: db.Entity, ignore_id=False): return True +def _resolve_datatype(prop: db.Property, remote_entity: db.Entity): + + if remote_entity.role == "Property": + datatype = remote_entity.datatype + elif remote_entity.role == "RecordType": + datatype = remote_entity.name + else: + raise RuntimeError("Cannot set datatype.") + + # Treat lists separately + if isinstance(prop.value, list) and not datatype.startswith("LIST"): + datatype = db.LIST(datatype) + + prop.datatype = datatype + return prop + + class Crawler(object): """ Crawler class that encapsulates crawling functions. 
@@ -188,7 +205,7 @@ class Crawler(object): # tested in the next lines of code: # Load the cfood schema: - with open(files('newcrawler').joinpath('cfood-schema.yml'), "r") as f: + with open(files('caoscrawler').joinpath('cfood-schema.yml'), "r") as f: schema = yaml.safe_load(f) # Add custom converters to converter enum in schema: @@ -243,43 +260,43 @@ class Crawler(object): converter_registry: dict[str, dict[str, str]] = { "Directory": { "converter": "DirectoryConverter", - "package": "newcrawler.converters"}, + "package": "caoscrawler.converters"}, "SimpleFile": { "converter": "SimpleFileConverter", - "package": "newcrawler.converters"}, + "package": "caoscrawler.converters"}, "MarkdownFile": { "converter": "MarkdownFileConverter", - "package": "newcrawler.converters"}, + "package": "caoscrawler.converters"}, "File": { "converter": "FileConverter", - "package": "newcrawler.converters"}, + "package": "caoscrawler.converters"}, "JSONFile": { "converter": "JSONFileConverter", - "package": "newcrawler.converters"}, + "package": "caoscrawler.converters"}, "Dict": { "converter": "DictConverter", - "package": "newcrawler.converters"}, + "package": "caoscrawler.converters"}, "DictBooleanElement": { "converter": "DictBooleanElementConverter", - "package": "newcrawler.converters"}, + "package": "caoscrawler.converters"}, "DictFloatElement": { "converter": "DictFloatElementConverter", - "package": "newcrawler.converters"}, + "package": "caoscrawler.converters"}, "DictTextElement": { "converter": "DictTextElementConverter", - "package": "newcrawler.converters"}, + "package": "caoscrawler.converters"}, "DictIntegerElement": { "converter": "DictIntegerElementConverter", - "package": "newcrawler.converters"}, + "package": "caoscrawler.converters"}, "DictListElement": { "converter": "DictListElementConverter", - "package": "newcrawler.converters"}, + "package": "caoscrawler.converters"}, "DictDictElement": { "converter": "DictDictElementConverter", - "package": "newcrawler.converters"}, + "package": "caoscrawler.converters"}, "TextElement": { "converter": "TextElementConverter", - "package": "newcrawler.converters"} + "package": "caoscrawler.converters"} } # More converters from definition file: @@ -465,8 +482,8 @@ class Crawler(object): """ for p in record.properties: if (isinstance(p.value, list)): + lst = [] for el in p.value: - lst = [] if (isinstance(el, db.Entity) and el.id is None): cached = self.get_identified_record_from_local_cache( el) @@ -481,7 +498,7 @@ class Crawler(object): lst.append(cached) else: lst.append(el) - p.value = lst + p.value = lst if (isinstance(p.value, db.Entity) and p.value.id is None): cached = self.get_identified_record_from_local_cache(p.value) if cached is None: @@ -703,6 +720,10 @@ class Crawler(object): @staticmethod def execute_inserts_in_list(to_be_inserted): + for record in to_be_inserted: + for prop in record.properties: + entity = db.Entity(name=prop.name).retrieve() + prop = _resolve_datatype(prop, entity) print("INSERT") print(to_be_inserted) if len(to_be_inserted) > 0: @@ -719,12 +740,7 @@ class Crawler(object): if prop.id is None: entity = db.Entity(name=prop.name).retrieve() prop.id = entity.id - if entity.role == "Property": - prop.datatype = entity.datatype - elif entity.role == "RecordType": - prop.datatype = entity.name - else: - raise RuntimeError("Cannot set datatype.") + prop = _resolve_datatype(prop, entity) print("UPDATE") print(to_be_updated) if len(to_be_updated) > 0: @@ -753,6 +769,7 @@ class Crawler(object): updateList) # remove unnecessary 
updates from list + # TODO: refactoring of typo for el in to_be_updated: self.replace_entities_by_ids(el) @@ -907,19 +924,11 @@ def crawler_main(args_path, crawler.save_debug_data(args_provenance) if args_load_identifiables is not None: - with open(args_load_identifiables, "r") as f: - identifiable_data = yaml.safe_load(f) ident = CaosDBIdentifiableAdapter() + ident.load_from_yaml_definition(args_load_identifiables) crawler.identifiableAdapter = ident - for k, v in identifiable_data.items(): - rt = db.RecordType() - rt.add_parent(k) - for pn in v: - rt.add_property(name=pn) - ident.register_identifiable(k, rt) - if args_dry_sync: ins, upd = crawler.synchronize(commit_changes=False) inserts = [str(i) for i in ins] @@ -996,6 +1005,7 @@ def parse_args(): return parser.parse_args() + def main(): args = parse_args() return crawler_main( @@ -1009,5 +1019,6 @@ def main(): args.prefix ) + if __name__ == "__main__": sys.exit(main()) diff --git a/src/newcrawler/extension-converters-config-schema.yml b/src/caoscrawler/extension-converters-config-schema.yml similarity index 100% rename from src/newcrawler/extension-converters-config-schema.yml rename to src/caoscrawler/extension-converters-config-schema.yml diff --git a/src/newcrawler/identifiable_adapters.py b/src/caoscrawler/identifiable_adapters.py similarity index 97% rename from src/newcrawler/identifiable_adapters.py rename to src/caoscrawler/identifiable_adapters.py index c1125ee1bdaba71ed4fa339fa74b379604293c98..47fd5324a4803c67d7c9f99448378e7b5f9241bd 100644 --- a/src/newcrawler/identifiable_adapters.py +++ b/src/caoscrawler/identifiable_adapters.py @@ -23,6 +23,8 @@ # ** end header # +import yaml + from datetime import datetime import caosdb as db from abc import abstractmethod, ABCMeta @@ -412,6 +414,17 @@ class CaosDBIdentifiableAdapter(IdentifiableAdapter): def __init__(self): self._registered_identifiables = dict() + def load_from_yaml_definition(self, path: str): + """Load identifiables defined in a yaml file""" + with open(path, 'r') as yaml_f: + identifiable_data = yaml.safe_load(yaml_f) + + for key, value in identifiable_data.items(): + rt = db.RecordType().add_parent(key) + for prop_name in value: + rt.add_property(name=prop_name) + self.register_identifiable(key, rt) + def register_identifiable(self, name: str, definition: db.RecordType): self._registered_identifiables[name] = definition diff --git a/src/newcrawler/identified_cache.py b/src/caoscrawler/identified_cache.py similarity index 99% rename from src/newcrawler/identified_cache.py rename to src/caoscrawler/identified_cache.py index cba00dd2bfff8a0f886878f532133bb18b1a20de..0b9d7a47bdecc4094edb1296f4c04dfa083a2436 100644 --- a/src/newcrawler/identified_cache.py +++ b/src/caoscrawler/identified_cache.py @@ -66,7 +66,6 @@ def _create_hashable_string(identifiable: db.Record): else: tmplist.append(val) value = str(tmplist) - rec_string += "{}:".format(pname) + value return rec_string diff --git a/src/newcrawler/stores.py b/src/caoscrawler/stores.py similarity index 100% rename from src/newcrawler/stores.py rename to src/caoscrawler/stores.py diff --git a/src/newcrawler/structure_elements.py b/src/caoscrawler/structure_elements.py similarity index 100% rename from src/newcrawler/structure_elements.py rename to src/caoscrawler/structure_elements.py diff --git a/src/newcrawler/utils.py b/src/caoscrawler/utils.py similarity index 100% rename from src/newcrawler/utils.py rename to src/caoscrawler/utils.py diff --git a/src/doc/Makefile b/src/doc/Makefile index 
dc1690a8f7f74815b25a51e519e4712c7c92b7ec..bea7f860173d930527c84fae43cb7d5bdf6cae97 100644 --- a/src/doc/Makefile +++ b/src/doc/Makefile @@ -29,7 +29,7 @@ SPHINXOPTS ?= -a SPHINXBUILD ?= sphinx-build SPHINXAPIDOC ?= sphinx-apidoc -PY_BASEDIR = ../newcrawler +PY_BASEDIR = ../caoscrawler SOURCEDIR = . BUILDDIR = ../../build/doc diff --git a/src/doc/conf.py b/src/doc/conf.py index 75731285a77f8a30fcb4bfc6be0483c4bba0052a..fb37cdd96c440300741aeb49e90caffe4370f5d7 100644 --- a/src/doc/conf.py +++ b/src/doc/conf.py @@ -3,7 +3,7 @@ # Configuration file for the Sphinx documentation builder. # # Based on the configuration for caosdb-pylib. -# +# # # Copyright (C) 2021 Alexander Schlemmer <alexander.schlemmer@ds.mpg.de> # # This file only contains a selection of the most common options. For a full @@ -28,7 +28,7 @@ import sphinx_rtd_theme # noqa: E402 # -- Project information ----------------------------------------------------- -project = 'caosdb-newcrawler' +project = 'caosdb-caoscrawler' copyright = '2021, MPIDS' author = 'Alexander Schlemmer' @@ -115,7 +115,7 @@ html_static_path = ['_static'] # -- Options for HTMLHelp output --------------------------------------------- # Output file base name for HTML help builder. -htmlhelp_basename = 'caosdb-newcrawlerdoc' +htmlhelp_basename = 'caosdb-caoscrawlerdoc' # -- Options for LaTeX output ------------------------------------------------ @@ -142,7 +142,7 @@ latex_elements = { # (source start file, target name, title, # author, documentclass [howto, manual, or own class]). latex_documents = [ - (master_doc, 'caosdb-newcrawler.tex', 'caosdb-newcrawler Documentation', + (master_doc, 'caosdb-caoscrawler.tex', 'caosdb-caoscrawler Documentation', 'MPIDS', 'manual'), ] @@ -152,7 +152,7 @@ latex_documents = [ # One entry per manual page. List of tuples # (source start file, name, description, authors, manual section). 
man_pages = [ - (master_doc, 'caosdb-newcrawler', 'caosdb-newcrawler documentation', + (master_doc, 'caosdb-caoscrawler', 'caosdb-caoscrawler documentation', [author], 1) ] @@ -163,8 +163,8 @@ man_pages = [ # (source start file, target name, title, author, # dir menu entry, description, category) texinfo_documents = [ - (master_doc, 'caosdb-newcrawler', 'caosdb-newcrawler documentation', - author, 'caosdb-newcrawler', 'One line description of project.', + (master_doc, 'caosdb-caoscrawler', 'caosdb-caoscrawler documentation', + author, 'caosdb-caoscrawler', 'One line description of project.', 'Miscellaneous'), ] diff --git a/tox.ini b/tox.ini index 8b5ad34fb1583790de5365f2bfa4ff7b3704574c..2cf966fb5b80e62cb7f216b0785ba567e13ee3ff 100644 --- a/tox.ini +++ b/tox.ini @@ -1,5 +1,5 @@ [tox] -envlist=py36, py37, py38, py39 +envlist=py36, py37, py38, py39, py310 skip_missing_interpreters = true [testenv] diff --git a/unittests/records.xml b/unittests/records.xml index 0ae34124a8875a723d7f0879687d8f0bdec51de0..f7455ec6b8995db8cd205f69729c32358beee8c0 100644 --- a/unittests/records.xml +++ b/unittests/records.xml @@ -78,6 +78,7 @@ <Property id="248" name="identifier" description="identifier of the experiment" datatype="TEXT" importance="FIX" flag="inheritance:FIX">TimeOfFlight</Property> <Property id="250" name="project" datatype="Project" importance="FIX" flag="inheritance:FIX">287</Property> <Property id="249" name="responsible" datatype="LIST<Person>" importance="FIX" flag="inheritance:FIX"> + <Value>289</Value> <Value>288</Value> </Property> </Record> diff --git a/unittests/test_cache.py b/unittests/test_cache.py index 7061b63c1f07a9ea2989509710b5f4043e73898d..135316b92fda0ac1e43f4e5f2c4f28fbf1272494 100644 --- a/unittests/test_cache.py +++ b/unittests/test_cache.py @@ -5,7 +5,7 @@ import caosdb as db from pytest import raises -from newcrawler.identified_cache import _create_hashable_string as create_hash_string +from caoscrawler.identified_cache import _create_hashable_string as create_hash_string def test_normal_hash_creation(): diff --git a/unittests/test_converters.py b/unittests/test_converters.py index 100b10062916fb992d2bb19241d1cf8ea543e44c..5f56486ba0f63fdd64d4e4dd80e6d6eaeed705d1 100644 --- a/unittests/test_converters.py +++ b/unittests/test_converters.py @@ -27,16 +27,16 @@ test the converters module """ -from newcrawler.converters import Converter -from newcrawler.stores import GeneralStore -from newcrawler.converters import (ConverterValidationError, - MarkdownFileConverter, JSONFileConverter, - DictConverter) -from newcrawler.structure_elements import Directory -from newcrawler.structure_elements import (File, DictTextElement, - DictListElement, DictElement, - DictBooleanElement, DictDictElement, - DictIntegerElement, DictFloatElement) +from caoscrawler.converters import Converter +from caoscrawler.stores import GeneralStore +from caoscrawler.converters import (ConverterValidationError, + MarkdownFileConverter, JSONFileConverter, + DictConverter) +from caoscrawler.structure_elements import Directory +from caoscrawler.structure_elements import (File, DictTextElement, + DictListElement, DictElement, + DictBooleanElement, DictDictElement, + DictIntegerElement, DictFloatElement) from test_tool import rfp @@ -50,25 +50,25 @@ def converter_registry(): converter_registry: dict[str, dict[str, str]] = { "Directory": { "converter": "DirectoryConverter", - "package": "newcrawler.converters"}, + "package": "caoscrawler.converters"}, "MarkdownFile": { "converter": "MarkdownFileConverter", - 
"package": "newcrawler.converters"}, + "package": "caoscrawler.converters"}, "Dict": { "converter": "DictConverter", - "package": "newcrawler.converters"}, + "package": "caoscrawler.converters"}, "DictTextElement": { "converter": "DictTextElementConverter", - "package": "newcrawler.converters"}, + "package": "caoscrawler.converters"}, "DictListElement": { "converter": "DictListElementConverter", - "package": "newcrawler.converters"}, + "package": "caoscrawler.converters"}, "TextElement": { "converter": "TextElementConverter", - "package": "newcrawler.converters"}, + "package": "caoscrawler.converters"}, "JSONFile": { "converter": "JSONFileConverter", - "package": "newcrawler.converters"}, + "package": "caoscrawler.converters"}, } for key, value in converter_registry.items(): @@ -118,9 +118,13 @@ def testDirectoryConverter(converter_registry): def test_markdown_converter(converter_registry): - test_readme = File("README.md", rfp( - "test_directories", "examples_article", "DataAnalysis", - "2020_climate-model-predict", "2020-02-08_prediction-errors", "README.md")) + test_readme = File( + "README.md", + rfp( + "test_directories", "examples_article", "DataAnalysis", + "2020_climate-model-predict", "2020-02-08_prediction-errors", "README.md" + ) + ) converter = MarkdownFileConverter({ "match": "(.*)" @@ -155,8 +159,11 @@ def test_markdown_converter(converter_registry): assert children[0].name == "responsible" assert children[0].value.__class__ == str - test_readme2 = File("README.md", rfp("test_directories", "examples_article", - "ExperimentalData", "2020_SpeedOfLight", "2020-01-01_TimeOfFlight", "README.md")) + test_readme2 = File( + "README.md", + rfp("test_directories", "examples_article", + "ExperimentalData", "2020_SpeedOfLight", "2020-01-01_TimeOfFlight", "README.md") + ) m = converter.match(test_readme2) assert m is not None @@ -177,7 +184,8 @@ def test_json_converter(converter_registry): test_json = File("testjson.json", rfp( "test_directories", "examples_json", "testjson.json")) - schema_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "test_directories", "examples_json", "testjson.schema.json") + schema_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), + "test_directories", "examples_json", "testjson.schema.json") jsonconverter = JSONFileConverter( definition={"match": "(.*)", "validate": schema_path}, name="TestJSONFileConverter", @@ -203,9 +211,10 @@ def test_json_converter(converter_registry): assert children[2].name == "archived" assert children[2].value.__class__ == bool - assert children[3].__class__ == DictDictElement - assert children[3].name == "coordinator" - assert children[3].value.__class__ == dict + assert children[3].__class__ == DictListElement + assert children[3].name == "Person" + assert children[3].value.__class__ == list + assert len(children[3].value) == 2 assert children[4].__class__ == DictTextElement assert children[4].name == "start_date" @@ -224,10 +233,12 @@ def test_json_converter(converter_registry): assert children[7].name == "url" assert children[7].value.__class__ == str - broken_json = File("brokenjson.json", rfp( - "test_directories", "examples_json", "brokenjson.json")) + broken_json = File( + "brokenjson.json", + rfp("test_directories", "examples_json", "brokenjson.json") + ) m = jsonconverter.match(broken_json) - + # Doesn't validate because of missing required 'name' property with pytest.raises(ConverterValidationError) as err: children = jsonconverter.create_children(None, broken_json) diff --git 
a/unittests/test_directories/example_substitutions/ExperimentalData/220512_data.dat b/unittests/test_directories/example_substitutions/ExperimentalData/220512_data.dat new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/unittests/test_directories/example_substitutions/substitutions.yml b/unittests/test_directories/example_substitutions/substitutions.yml new file mode 100644 index 0000000000000000000000000000000000000000..1b4e8784a69d1ad1b80fa757ad77cd137c8cc7b5 --- /dev/null +++ b/unittests/test_directories/example_substitutions/substitutions.yml @@ -0,0 +1,22 @@ + +ExperimentalData: # name of the converter + type: Directory + match: ExperimentalData + records: + Project: + name: project + subtree: + File: # name of the converter + type: SimpleFile + match: (?P<year>[0-9]{2,2})(?P<month>[0-9]{2,2})(?P<day>[0-9]{2,2})_data.dat + records: + Experiment: + date: 20$year-$month-$day + + ExperimentSeries: + Experiment: $Experiment + + Project: + Experiments: +$Experiment + dates: +20$year-$month-$day + diff --git a/unittests/test_directories/examples_json/jsontest_cfood.yml b/unittests/test_directories/examples_json/jsontest_cfood.yml index bcf79a2d5183ebb496f8e180e9c264bb3ac05e48..f1eb6a9fa186c07f551bd12a84050f544abfdabc 100644 --- a/unittests/test_directories/examples_json/jsontest_cfood.yml +++ b/unittests/test_directories/examples_json/jsontest_cfood.yml @@ -3,13 +3,56 @@ JSONTest: # name of the converter type: JSONFile match: '(.*)' validate: ./testjson.schema.json - subtree: - element: # name of the first subtree element which is a converter + records: + Project: # this is an identifiable in this case + parents: + - Project # not needed as the name is equivalent + subtree: + name_element: + type: DictTextElement + match_name: "name" + match_value: "(?P<name>.*)" + records: + Project: + name: $name + url_element: # name of the first subtree element which is a converter type: DictTextElement match_value: "(?P<url>.*)" match_name: "url" records: - Project: # this is an identifiable in this case - parents: - - Project # not needed as the name is equivalent - url: $url + Project: + url: $url + persons_element: + type: DictListElement + match_name: "Person" + subtree: + person_element: + type: Dict + records: + Person: + parents: + - Person + Project: + Person: +$Person + subtree: + firstname_element: + type: DictTextElement + match_name: "firstname" + match_value: "(?P<firstname>.*)" + records: + Person: + firstname: $firstname + lastname_element: + type: DictTextElement + match_name: "lastname" + match_value: "(?P<lastname>.*)" + records: + Person: + lastname: $lastname + email_element: + type: DictTextElement + match_name: "email" + match_value: "(?P<email>.*)" + records: + Person: + email: $email diff --git a/unittests/test_directories/examples_json/testjson.json b/unittests/test_directories/examples_json/testjson.json index cd26c9c3295d6a2a8a6110f0876fffb62f60419e..b893b608a6a2119c5c3252cd9cff4c4100f404da 100644 --- a/unittests/test_directories/examples_json/testjson.json +++ b/unittests/test_directories/examples_json/testjson.json @@ -2,11 +2,18 @@ "name": "DEMO", "projectId": 10002, "archived": false, - "coordinator": { - "firstname": "Miri", - "lastname": "Mueller", - "email": "miri.mueller@science.de" - }, + "Person": [ + { + "firstname": "Miri", + "lastname": "Mueller", + "email": "miri.mueller@science.de" + }, + { + "firstname": "Mara", + "lastname": "Mueller", + "email": "mara.mueller@science.de" + } + ], 
"start_date": "2022-03-01", "candidates": ["Mouse", "Penguine"], "rvalue": 0.4444, diff --git a/unittests/test_directories/examples_json/testjson.schema.json b/unittests/test_directories/examples_json/testjson.schema.json index a684e9b663d8cba1ba1931aae5615040b2797240..fc784a61079e4737f1a0176fe4240133f5d1b5d0 100644 --- a/unittests/test_directories/examples_json/testjson.schema.json +++ b/unittests/test_directories/examples_json/testjson.schema.json @@ -11,25 +11,28 @@ "archived": { "type": "boolean" }, - "coordinator": { - "type": "object", - "properties": { - "firstname": { - "type": "string" - }, - "lastname": { - "type": "string" - }, - "email": { - "type": "string" + "Person": { + "type": "array", + "items": { + "type": "object", + "properties": { + "firstname": { + "type": "string" + }, + "lastname": { + "type": "string" + }, + "email": { + "type": "string" + } + }, + "required": [ + "firstname", + "lastname", + "email" + ], + "additionalProperties": true } - }, - "required": [ - "firstname", - "lastname", - "email" - ], - "additionalProperties": true }, "start_date": { "type": "string", @@ -51,7 +54,7 @@ "required": [ "name", "projectId", - "coordinator" + "Person" ], "additionalProperties": false } diff --git a/unittests/test_directories/single_file_test_data/identifiables.yml b/unittests/test_directories/single_file_test_data/identifiables.yml new file mode 100644 index 0000000000000000000000000000000000000000..e32746d5a6984096cc46fa618250832b325965b0 --- /dev/null +++ b/unittests/test_directories/single_file_test_data/identifiables.yml @@ -0,0 +1,7 @@ +Person: + - full_name +Keyword: + - name +Project: + - project_id + - title diff --git a/unittests/test_entity_comparison.py b/unittests/test_entity_comparison.py index 1e22280d893ae7bba301baa61213b5e49eaaba6c..549bc4f42a59765d25446d44fbb845e49ca4d9b9 100644 --- a/unittests/test_entity_comparison.py +++ b/unittests/test_entity_comparison.py @@ -7,19 +7,19 @@ import caosdb as db import pytest from pytest import raises -from newcrawler.crawl import check_identical +from caoscrawler.crawl import check_identical def test_compare_entities(): record1 = db.Record() record2 = db.Record() - + assert check_identical(record1, record2) record1.add_property(name="type", value="int") assert not check_identical(record1, record2) assert not check_identical(record2, record1) - + record2.add_property(name="type", value="int") assert check_identical(record1, record2) record2.get_property("type").value = "int2" @@ -36,8 +36,10 @@ def test_compare_entities(): # This is confusing, but needed: record1.add_property(name="field_with_type", value=42, datatype=db.INTEGER) record2.add_property(name="field_with_type", value=42) - assert not check_identical(record1, record2) # not identical, because record1 sets the datatype - assert check_identical(record2, record1) # identical, because record2 sets the datatype + # not identical, because record1 sets the datatype + assert not check_identical(record1, record2) + # identical, because record2 sets the datatype + assert check_identical(record2, record1) record2.get_property("field_with_type").datatype = db.INTEGER assert check_identical(record1, record2) assert check_identical(record2, record1) @@ -79,8 +81,10 @@ def test_compare_entities(): for attribute, values in zip(("_checksum", "_size"), (vals[0], vals[1])): setattr(record1, attribute, values[0]) - assert not check_identical(record1, record2) # not identical, because record1 sets the datatype - assert check_identical(record2, record1) # identical, because 
record2 sets the datatype + # not identical, because record1 sets the datatype + assert not check_identical(record1, record2) + # identical, because record2 sets the datatype + assert check_identical(record2, record1) setattr(record2, attribute, values[1]) assert not check_identical(record1, record2) @@ -89,5 +93,3 @@ def test_compare_entities(): setattr(record2, attribute, values[0]) assert check_identical(record1, record2) assert check_identical(record2, record1) - - diff --git a/unittests/test_file_identifiables.py b/unittests/test_file_identifiables.py index 234fae20c53e137bf049e496dbe178a30e5de833..b0b9801993dc68fe473e788b8ca79a2244912676 100644 --- a/unittests/test_file_identifiables.py +++ b/unittests/test_file_identifiables.py @@ -7,7 +7,7 @@ import caosdb as db import pytest from pytest import raises -from newcrawler.identifiable_adapters import LocalStorageIdentifiableAdapter +from caoscrawler.identifiable_adapters import LocalStorageIdentifiableAdapter def test_file_identifiable(): @@ -41,7 +41,8 @@ def test_file_identifiable(): assert file_obj.checksum != identifiable.checksum # This is the wrong method, so it should definitely return None: - identified_file = ident.retrieve_identified_record_for_identifiable(identifiable) + identified_file = ident.retrieve_identified_record_for_identifiable( + identifiable) assert identified_file is None # This is the correct method to use: identified_file = ident.get_file(identifiable) @@ -71,5 +72,3 @@ def test_file_identifiable(): with raises(RuntimeError, match=".*unambigiously.*"): records.append(test_record_alsocorrect_path) identified_file = ident.get_file(file_obj) - - diff --git a/unittests/test_identifiable_adapters.py b/unittests/test_identifiable_adapters.py index 4a3ae786438e99ded8925d4405d9b051cd86bf66..ef7998a460c07342d30a3f769fd609c1045a9cca 100644 --- a/unittests/test_identifiable_adapters.py +++ b/unittests/test_identifiable_adapters.py @@ -27,8 +27,10 @@ test identifiable_adapters module """ +import os from datetime import datetime -from newcrawler.identifiable_adapters import IdentifiableAdapter +from caoscrawler.identifiable_adapters import ( + CaosDBIdentifiableAdapter, IdentifiableAdapter) import caosdb as db @@ -57,3 +59,27 @@ def test_create_query_for_identifiable(): query = IdentifiableAdapter.create_query_for_identifiable( db.Record(name="TestRecord").add_parent("TestType")) assert query.lower() == "find record testtype with name='testrecord'" + + +def test_load_from_yaml_file(): + ident = CaosDBIdentifiableAdapter() + ident.load_from_yaml_definition( + os.path.join(os.path.dirname(__file__), "test_directories", + "single_file_test_data", "identifiables.yml") + ) + + person_i = ident.get_registered_identifiable( + db.Record().add_parent("Person")) + assert person_i is not None + assert person_i.get_property("full_name") is not None + + keyword_i = ident.get_registered_identifiable( + db.Record().add_parent("Keyword")) + assert keyword_i is not None + assert keyword_i.get_property("name") is not None + + project_i = ident.get_registered_identifiable( + db.Record().add_parent("Project")) + assert project_i is not None + assert project_i.get_property("project_id") is not None + assert project_i.get_property("title") is not None diff --git a/unittests/test_identified_cache.py b/unittests/test_identified_cache.py index 9a1034634692e3d55935d31e2b3923d874f3f673..33add97d4309d87705144ec5331366d0bcd05541 100644 --- a/unittests/test_identified_cache.py +++ b/unittests/test_identified_cache.py @@ -27,12 +27,13 @@ test 
identified_cache module """ -from newcrawler.identified_cache import _create_hashable_string, IdentifiedCache +from caoscrawler.identified_cache import _create_hashable_string, IdentifiedCache import caosdb as db def test_create_hash(): - assert _create_hashable_string(db.Record("A").add_parent("B")) == "P<B>N<A>" + assert _create_hashable_string( + db.Record("A").add_parent("B")) == "P<B>N<A>" assert _create_hashable_string(db.Record("A") .add_parent("B").add_property('a', 5)) == "P<B>N<A>a:5" assert (_create_hashable_string( diff --git a/unittests/test_json.py b/unittests/test_json.py index d4da1fe7f20d3b2ea8c623315542fce90fb18497..97d9831de20a2b9f712294d1a0f6322789580f30 100644 --- a/unittests/test_json.py +++ b/unittests/test_json.py @@ -31,9 +31,11 @@ import os from pytest import raises -from newcrawler.converters import JSONFileConverter, DictConverter -from newcrawler.crawl import Crawler -from newcrawler.structure_elements import File, JSONFile +import caosdb as db + +from caoscrawler.converters import JSONFileConverter, DictConverter +from caoscrawler.crawl import Crawler +from caoscrawler.structure_elements import File, JSONFile from test_tool import rfp, dircheckstr @@ -47,19 +49,27 @@ def test_json(): # Load and register converter packages: converter_registry = crawler.load_converters(crawler_definition) - crawler.start_crawling( + records = crawler.start_crawling( JSONFile(os.path.basename(json_file_path), json_file_path), crawler_definition, converter_registry ) - subd = crawler.debug_tree - subc = crawler.debug_metadata - #print(json.dumps(subd, indent=3)) - print(subd) - print(subc) + + rec = [r for r in records if r.name == "DEMO"] + assert len(rec) == 1 + rec = rec[0] + assert len(rec.parents) == 1 + assert rec.parents[0].name == "Project" + assert rec.get_property("url") is not None + assert rec.get_property("url").value == "https://site.de/index.php/" + assert rec.get_property("Person") is not None + assert isinstance(rec.get_property("Person").value, list) + assert len(rec.get_property("Person").value) == 2 + def test_broken_validation(): - crawler_definition_path = rfp("broken_cfoods", "broken_validation_path.yml") + crawler_definition_path = rfp( + "broken_cfoods", "broken_validation_path.yml") crawler = Crawler() with raises(FileNotFoundError) as err: crawler_definition = crawler.load_definition(crawler_definition_path) diff --git a/unittests/test_schema.py b/unittests/test_schema.py index cac37c758aa838d78eb24435db55b099258900ac..0736698eb32146fb3cfbee6acbcf11f5436df27e 100644 --- a/unittests/test_schema.py +++ b/unittests/test_schema.py @@ -6,13 +6,14 @@ from importlib_resources import files import caosdb as db from os.path import join, dirname -from newcrawler import Crawler +from caoscrawler import Crawler import pytest from pytest import raises from jsonschema.exceptions import ValidationError + def rfp(*pathcomponents): """ Return full path. diff --git a/unittests/test_tool.py b/unittests/test_tool.py index dd9fb83d772496cc6b3729f2893997360d318f18..1e7f10069c49ce6cab71da5f469e28b69158b4b5 100755 --- a/unittests/test_tool.py +++ b/unittests/test_tool.py @@ -3,9 +3,9 @@ # Adapted from check-sfs # A. 
Schlemmer, 06/2021 -from newcrawler import Crawler -from newcrawler.structure_elements import File, DictTextElement, DictListElement -from newcrawler.identifiable_adapters import IdentifiableAdapter, LocalStorageIdentifiableAdapter +from caoscrawler import Crawler +from caoscrawler.structure_elements import File, DictTextElement, DictListElement +from caoscrawler.identifiable_adapters import IdentifiableAdapter, LocalStorageIdentifiableAdapter from functools import partial from copy import deepcopy from unittest.mock import MagicMock, Mock @@ -30,7 +30,7 @@ def dircheckstr(*pathcomponents): """ Return the debug tree identifier for a given path. """ - return "newcrawler.structure_elements.Directory: " + basename(join(*pathcomponents)) + ", " + rfp("test_directories", "examples_article", *pathcomponents) + return "caoscrawler.structure_elements.Directory: " + basename(join(*pathcomponents)) + ", " + rfp("test_directories", "examples_article", *pathcomponents) @pytest.fixture @@ -74,8 +74,10 @@ def test_record_structure_generation(crawler): subd = crawler.debug_tree[dircheckstr("DataAnalysis")] subc = crawler.debug_metadata["copied"][dircheckstr("DataAnalysis")] assert len(subd) == 2 - assert len(subd[0]) == 2 # variables store on Data Analysis node of debug tree - assert len(subd[1]) == 0 # record store on Data Analysis node of debug tree + # variables store on Data Analysis node of debug tree + assert len(subd[0]) == 2 + # record store on Data Analysis node of debug tree + assert len(subd[1]) == 0 assert len(subc) == 2 assert len(subc[0]) == 2 assert len(subc[1]) == 0 @@ -84,7 +86,8 @@ def test_record_structure_generation(crawler): assert subd[0]["DataAnalysis"] == "examples_article/DataAnalysis" assert subc[0]["DataAnalysis"] == False - subd = crawler.debug_tree[dircheckstr("DataAnalysis", "2020_climate-model-predict")] + subd = crawler.debug_tree[dircheckstr( + "DataAnalysis", "2020_climate-model-predict")] subc = crawler.debug_metadata["copied"][dircheckstr( "DataAnalysis", "2020_climate-model-predict")] @@ -92,7 +95,8 @@ def test_record_structure_generation(crawler): assert len(subd[1]["Project"].get_parents()) == 1 assert subd[1]["Project"].get_parents()[0].name == "Project" assert subd[1]["Project"].get_property("date").value == "2020" - assert subd[1]["Project"].get_property("identifier").value == "climate-model-predict" + assert subd[1]["Project"].get_property( + "identifier").value == "climate-model-predict" assert len(subd[0]) == 6 assert subd[0]["date"] == "2020" @@ -129,15 +133,19 @@ def test_record_structure_generation(crawler): assert len(subd[1]["Project"].get_parents()) == 1 assert subd[1]["Project"].get_parents()[0].name == "Project" assert subd[1]["Project"].get_property("date").value == "2020" - assert subd[1]["Project"].get_property("identifier").value == "climate-model-predict" + assert subd[1]["Project"].get_property( + "identifier").value == "climate-model-predict" assert len(subd[1]["Measurement"].get_parents()) == 1 assert subd[1]["Measurement"].get_parents()[0].name == "Measurement" assert subd[1]["Measurement"].get_property("date").value == "2020-02-08" - assert subd[1]["Measurement"].get_property("identifier").value == "prediction-errors" + assert subd[1]["Measurement"].get_property( + "identifier").value == "prediction-errors" assert subd[1]["Measurement"].get_property("project").value != "$Project" - assert subd[1]["Measurement"].get_property("project").value.__class__ == db.Record - assert subd[1]["Measurement"].get_property("project").value == 
subd[0]["Project"] + assert subd[1]["Measurement"].get_property( + "project").value.__class__ == db.Record + assert subd[1]["Measurement"].get_property( + "project").value == subd[0]["Project"] # Check the copy flags for the second level in the hierarchy: assert subc[1]["Project"] is True @@ -176,9 +184,15 @@ def test_crawler_update_list(crawler, ident): # If the following assertions fail, that is a hint, that the test file records.xml has changed # and this needs to be updated: assert len(ident.get_records()) == 18 - assert len([r for r in ident.get_records() if r.parents[0].name == "Person"]) == 5 - assert len([r for r in ident.get_records() if r.parents[0].name == "Measurement"]) == 11 - assert len([r for r in ident.get_records() if r.parents[0].name == "Project"]) == 2 + assert len( + [r for r in ident.get_records() if r.parents[0].name == "Person"] + ) == 5 + assert len( + [r for r in ident.get_records() if r.parents[0].name == "Measurement"] + ) == 11 + assert len( + [r for r in ident.get_records() if r.parents[0].name == "Project"] + ) == 2 # The crawler contains lots of duplicates, because identifiables have not been resolved yet: assert len(ident.get_records()) != len(crawler.updateList) @@ -194,8 +208,10 @@ def test_crawler_update_list(crawler, ident): id_r0 = ident.get_identifiable(r_cur) assert r_cur.parents[0].name == id_r0.parents[0].name - assert r_cur.get_property("first_name").value == id_r0.get_property("first_name").value - assert r_cur.get_property("last_name").value == id_r0.get_property("last_name").value + assert r_cur.get_property( + "first_name").value == id_r0.get_property("first_name").value + assert r_cur.get_property( + "last_name").value == id_r0.get_property("last_name").value assert len(r_cur.parents) == 1 assert len(id_r0.parents) == 1 assert len(r_cur.properties) == 2 @@ -213,9 +229,11 @@ def test_crawler_update_list(crawler, ident): id_r1 = ident.get_identifiable(r_cur) assert r_cur.parents[0].name == id_r1.parents[0].name - assert r_cur.get_property("identifier").value == id_r1.get_property("identifier").value + assert r_cur.get_property( + "identifier").value == id_r1.get_property("identifier").value assert r_cur.get_property("date").value == id_r1.get_property("date").value - assert r_cur.get_property("project").value == id_r1.get_property("project").value + assert r_cur.get_property( + "project").value == id_r1.get_property("project").value assert len(r_cur.parents) == 1 assert len(id_r1.parents) == 1 assert len(r_cur.properties) == 4 @@ -228,7 +246,8 @@ def test_crawler_update_list(crawler, ident): assert idr_r1_test != idr_r0_test assert len(idr_r1.properties) == 4 - assert r_cur.get_property("responsible").value == idr_r1.get_property("responsible").value + assert r_cur.get_property( + "responsible").value == idr_r1.get_property("responsible").value assert r_cur.description == idr_r1.description # test whether compare_entites function works in this context: @@ -355,14 +374,17 @@ def test_split_into_inserts_and_updates_trivial(crawler): def test_split_into_inserts_and_updates_single(mock_retrieve): crawler = mock_retrieve - entlist = [db.Record(name="A").add_parent("C"), db.Record(name="B").add_parent("C")] + entlist = [db.Record(name="A").add_parent( + "C"), db.Record(name="B").add_parent("C")] assert crawler.get_identified_record_from_local_cache(entlist[0]) is None assert crawler.get_identified_record_from_local_cache(entlist[1]) is None assert crawler.can_be_checked_externally(entlist[0]) assert crawler.can_be_checked_externally(entlist[1]) - 
-    assert crawler.identifiableAdapter.retrieve_identified_record_for_record(entlist[0]).id == 1111
-    assert crawler.identifiableAdapter.retrieve_identified_record_for_record(entlist[1]) is None
+    assert crawler.identifiableAdapter.retrieve_identified_record_for_record(
+        entlist[0]).id == 1111
+    assert crawler.identifiableAdapter.retrieve_identified_record_for_record(
+        entlist[1]) is None

     insert, update = crawler.split_into_inserts_and_updates(deepcopy(entlist))
     assert len(insert) == 1
@@ -416,7 +438,8 @@ def test_split_into_inserts_and_updates_with_complex(mock_retrieve):
     #                  ^
     #                  |
     #              F <- B <- G
-    a = db.Record(name="A").add_parent("C").add_property('d', 13).add_property('e', "lskdjlsfdj")
+    a = db.Record(name="A").add_parent("C").add_property(
+        'd', 13).add_property('e', "lskdjlsfdj")
     b = db.Record(name="B").add_parent("C")
     g = db.Record(name="G").add_parent("C")
     f = db.Record(name="F").add_parent("C")
@@ -457,7 +480,8 @@ def test_all_references_are_existing_already(crawler):
         base_mocked_lookup, known={"A": db.Record(name="A").add_parent("C"),
                                    "B": db.Record(name="B").add_parent("C")}))

-    assert crawler.all_references_are_existing_already(db.Record().add_property('a', 123))
+    assert crawler.all_references_are_existing_already(
+        db.Record().add_property('a', 123))
     assert crawler.all_references_are_existing_already(db.Record()
                                                        .add_property('a', db.Record(id=123)))
     assert crawler.all_references_are_existing_already(db.Record()
@@ -475,7 +499,8 @@ def test_can_be_checked_externally(crawler):
-    assert crawler.can_be_checked_externally(db.Record().add_property('a', 123))
+    assert crawler.can_be_checked_externally(
+        db.Record().add_property('a', 123))
     assert crawler.can_be_checked_externally(db.Record()
                                              .add_property('a', db.Record(id=123)))
     assert crawler.can_be_checked_externally(db.Record()
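A note on the logic exercised in test_tool.py above: split_into_inserts_and_updates partitions a list of records into those the identifiable adapter can resolve to an existing entity (updates) and those it cannot (inserts). A sketch under the same mock as test_split_into_inserts_and_updates_single, where record "A" resolves to id 1111 and "B" is unknown (the `crawler` here stands in for the mocked fixture):

    from copy import deepcopy

    import caosdb as db

    # Two fresh records sharing the parent "C", mirroring the test setup.
    entlist = [db.Record(name="A").add_parent("C"),
               db.Record(name="B").add_parent("C")]

    # "A" is identified externally (id 1111 in the mock), so it should land
    # among the updates; "B" cannot be identified and becomes an insert.
    insert, update = crawler.split_into_inserts_and_updates(deepcopy(entlist))
    assert len(insert) == 1  # presumably the record named "B"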
diff --git a/unittests/test_tool_extended.py b/unittests/test_tool_extended.py
index 2361e99373042a5f5ce73b8eb98083f7431d4836..d0b431a539a15e3e83906540c69becff437742ec 100644
--- a/unittests/test_tool_extended.py
+++ b/unittests/test_tool_extended.py
@@ -3,9 +3,9 @@
 # Adapted from check-sfs
 # A. Schlemmer, 06/2021

-from newcrawler import Crawler
-from newcrawler.structure_elements import File, DictTextElement, DictListElement
-from newcrawler.identifiable_adapters import IdentifiableAdapter, LocalStorageIdentifiableAdapter
+from caoscrawler import Crawler
+from caoscrawler.structure_elements import File, DictTextElement, DictListElement
+from caoscrawler.identifiable_adapters import IdentifiableAdapter, LocalStorageIdentifiableAdapter
 from functools import partial
 from copy import deepcopy
 from unittest.mock import MagicMock, Mock
@@ -30,7 +30,7 @@ def dircheckstr(*pathcomponents, structure_element_type="Directory"):
     """
     Return the debug tree identifier for a given path.
     """
-    return ("newcrawler.structure_elements." + structure_element_type + ": " +
+    return ("caoscrawler.structure_elements." + structure_element_type + ": " +
             basename(join(*pathcomponents)) + ", " +
             rfp("test_directories", "examples_article", *pathcomponents))
@@ -47,7 +47,7 @@ def crawler():
 # def ident(crawler):
 #     ident = LocalStorageIdentifiableAdapter()
 #     crawler.identifiableAdapter = ident
-
+
 #     ident.restore_state(rfp("records.xml"))
 #     ident.register_identifiable(
diff --git a/unittests/test_variable_substitutions.py b/unittests/test_variable_substitutions.py
new file mode 100644
index 0000000000000000000000000000000000000000..071bf4646d20e35ed05dafaf5fabf786dc182dcc
--- /dev/null
+++ b/unittests/test_variable_substitutions.py
@@ -0,0 +1,61 @@
+#!/bin/python
+# Tests for variable substitutions
+# A. Schlemmer, 05/2022
+
+from caoscrawler import Crawler
+from caoscrawler.structure_elements import File, DictTextElement, DictListElement
+from caoscrawler.identifiable_adapters import IdentifiableAdapter, LocalStorageIdentifiableAdapter
+from functools import partial
+from copy import deepcopy
+from unittest.mock import MagicMock, Mock
+from os.path import join, dirname, basename
+import yaml
+import caosdb as db
+from caosdb.apiutils import compare_entities
+
+import pytest
+from pytest import raises
+
+
+def rfp(*pathcomponents):
+    """
+    Return full path.
+    Shorthand convenience function.
+    """
+    return join(dirname(__file__), *pathcomponents)
+
+
+def dircheckstr(element_type, *pathcomponents):
+    """
+    Return the debug tree identifier for a given path.
+    """
+    return "caoscrawler.structure_elements." + element_type + ": " + basename(join(*pathcomponents)) + ", " + rfp("test_directories", "example_substitutions", *pathcomponents)
+
+
+@pytest.fixture
+def crawler():
+    crawler = Crawler(debug=True)
+    crawler.crawl_directory(rfp("test_directories", "example_substitutions", "ExperimentalData"),
+                            rfp("test_directories", "example_substitutions", "substitutions.yml"))
+    return crawler
+
+
+def test_substitutions(crawler):
+    # @review Florian Spreckelsen 2022-05-13
+    for i in range(2):
+        subd = crawler.debug_tree[dircheckstr(
+            "File", "ExperimentalData", "220512_data.dat")]
+        assert subd[i]["Experiment"].get_property("date").value == "2022-05-12"
+        assert isinstance(subd[i]["ExperimentSeries"].get_property(
+            "Experiment").value, db.Record)
+
+        subd = crawler.debug_tree[dircheckstr("Directory", "ExperimentalData")]
+        assert subd[i]["Project"].name == "project"
+        assert isinstance(subd[i]["Project"].get_property(
+            "Experiments").value, list)
+        assert isinstance(subd[i]["Project"].get_property(
+            "Experiments").value[0], db.Record)
+
+        assert isinstance(subd[i]["Project"].get_property("dates").value, list)
+        assert subd[i]["Project"].get_property(
+            "dates").value[0] == "2022-05-12"
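For orientation on the new test file: the crawler's debug tree maps a structure-element identifier string (built by dircheckstr) to a pair of stores per node, where index 0 holds the variables and index 1 the records, as the comments in test_tool.py above indicate. A sketch of the lookup, assuming the `crawler` fixture from test_variable_substitutions.py:

    subd = crawler.debug_tree[dircheckstr(
        "File", "ExperimentalData", "220512_data.dat")]

    # Both the variable store (subd[0]) and the record store (subd[1]) carry
    # the substituted date, which is why the test loops over range(2).
    for store in (subd[0], subd[1]):
        assert store["Experiment"].get_property("date").value == "2022-05-12"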