diff --git a/.docker/Dockerfile b/.docker/Dockerfile new file mode 100644 index 0000000000000000000000000000000000000000..f7353e059d8cd027f08403d6f6527ffbcaabc965 --- /dev/null +++ b/.docker/Dockerfile @@ -0,0 +1,39 @@ +FROM debian:latest +RUN apt-get update && \ + apt-get install \ + curl \ + git \ + openjdk-11-jdk-headless \ + python3-autopep8 \ + python3-pip \ + python3-pytest \ + tox \ + -y +COPY .docker/wait-for-it.sh /wait-for-it.sh +ARG PYLIB +ADD https://gitlab.indiscale.com/api/v4/projects/97/repository/commits/${PYLIB} \ + pylib_version.json +RUN git clone https://gitlab.indiscale.com/caosdb/src/caosdb-pylib.git && \ + cd caosdb-pylib && git checkout ${PYLIB} && pip3 install . +ARG ADVANCED +ADD https://gitlab.indiscale.com/api/v4/projects/104/repository/commits/${ADVANCED} \ + advanced_version.json +RUN git clone https://gitlab.indiscale.com/caosdb/src/caosdb-advanced-user-tools.git && \ + cd caosdb-advanced-user-tools && git checkout ${ADVANCED} && pip3 install .[h5-crawler] +COPY . /git + +# Delete .git because it is huge. +RUN rm -r /git/.git + +# Install pycaosdb.ini for the tests +RUN mv /git/.docker/tester_pycaosdb.ini /git/integrationtests/pycaosdb.ini + +RUN cd /git/ && pip3 install . + +WORKDIR /git/integrationtests +# wait for server, +CMD /wait-for-it.sh caosdb-server:10443 -t 500 -- \ + # ... install pycaosdb.ini the server-side scripts + cp /git/.docker/sss_pycaosdb.ini /scripting/home/.pycaosdb.ini && \ + # ... and run tests + pytest-3 . diff --git a/.docker/cert.sh b/.docker/cert.sh new file mode 100755 index 0000000000000000000000000000000000000000..e22cfba2995b5fd9d812232f562b7254233fe5b0 --- /dev/null +++ b/.docker/cert.sh @@ -0,0 +1,58 @@ +#!/bin/bash + +# ** header v3.0 +# This file is a part of the CaosDB Project. +# +# Copyright (C) 2019 Daniel Hornung, Göttingen +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <https://www.gnu.org/licenses/>. +# +# ** end header + + +# Creates a directory `cert` and certificates in this directory. +# +# The hostname for which the certificate is created can be changed by setting +# the environment variable CAOSHOSTNAME. +# +# ## Overview of variables ## +# +# - CAOSHOSTNAME :: Hostname for the key (localhost) +# - KEYPW :: Password for the key (default ist CaosDBSecret) +# - KEYSTOREPW :: Password for the key store (same as KEYPW) +function cert() { + mkdir -p cert + cd cert + KEYPW="${KEYPW:-CaosDBSecret}" + CAOSHOSTNAME="${CAOSHOSTNAME:-localhost}" + KEYSTOREPW="${KEYPW:-}" + # NOTE: KEYPW and KEYSTOREPW are the same, due to Java limitations. 
+ KEYPW="${KEYPW}" openssl genrsa -aes256 -out caosdb.key.pem \ + -passout env:KEYPW 2048 + # Certificate is for localhost + KEYPW="${KEYPW}" openssl req -new -x509 -key caosdb.key.pem \ + -out caosdb.cert.pem -passin env:KEYPW \ + -subj "/C=/ST=/L=/O=/OU=/CN=${CAOSHOSTNAME}" + KEYPW="${KEYPW}" KEYSTOREPW="$KEYSTOREPW" openssl pkcs12 -export \ + -inkey caosdb.key.pem -in caosdb.cert.pem -out all-certs.pkcs12 \ + -passin env:KEYPW -passout env:KEYPW + + keytool -importkeystore -srckeystore all-certs.pkcs12 -srcstoretype PKCS12 \ + -deststoretype pkcs12 -destkeystore caosdb.jks \ + -srcstorepass "${KEYPW}" \ + -destkeypass "${KEYPW}" -deststorepass "$KEYSTOREPW" + echo "Certificates successfuly created." +} + +cert diff --git a/.docker/docker-compose.yml b/.docker/docker-compose.yml new file mode 100644 index 0000000000000000000000000000000000000000..e5bb4c9b8ca6ad1750922cb07c92cd6c5eb77c6b --- /dev/null +++ b/.docker/docker-compose.yml @@ -0,0 +1,42 @@ +version: '3.7' +services: + sqldb: + image: mariadb:10.4 + environment: + MYSQL_ROOT_PASSWORD: caosdb1234 + networks: + - caosnet + caosdb-server: + image: "$CI_REGISTRY/caosdb/src/caosdb-deploy:$CAOSDB_TAG" + user: 999:999 + depends_on: + - sqldb + networks: + - caosnet + volumes: + - type: bind + source: ./cert + target: /opt/caosdb/cert + - type: bind + source: "../integrationtests/test_data/extroot" + target: /opt/caosdb/mnt/extroot + - type: volume + source: scripting + target: /opt/caosdb/git/caosdb-server/scripting + - type: volume + source: authtoken + target: /opt/caosdb/git/caosdb-server/authtoken + ports: + # - "from_outside:from_inside" + - "10443:10443" + - "10080:10080" + environment: + DEBUG: 1 + CAOSDB_CONFIG_AUTHTOKEN_CONFIG: "conf/core/authtoken.example.yaml" + CAOSDB_CONFIG_TRANSACTION_BENCHMARK_ENABLED: "TRUE" +volumes: + scripting: + authtoken: +networks: + caosnet: + driver: bridge diff --git a/.docker/run.sh b/.docker/run.sh new file mode 100755 index 0000000000000000000000000000000000000000..b0e1a716f28516b83043fb3fdb6594515a0bafd4 --- /dev/null +++ b/.docker/run.sh @@ -0,0 +1,5 @@ +#!/bin/sh + +docker-compose -f tester.yml run tester +rv=$? +echo $rv > result diff --git a/.docker/sss_pycaosdb.ini b/.docker/sss_pycaosdb.ini new file mode 100644 index 0000000000000000000000000000000000000000..de2867f8dc66b3e81f10f35e40c36f9cb8591604 --- /dev/null +++ b/.docker/sss_pycaosdb.ini @@ -0,0 +1,9 @@ +; this is the pycaosdb.ini for the server-side-scripting home. 
+[Connection] +url = https://caosdb-server:10443 +cacert = /opt/caosdb/cert/caosdb.cert.pem +debug = 0 +timeout = 5000 + +[Misc] +sendmail = /usr/local/bin/sendmail_to_file diff --git a/.docker/tester.yml b/.docker/tester.yml new file mode 100644 index 0000000000000000000000000000000000000000..83db879c6072bfdea7b3212c833116b96bb54d0c --- /dev/null +++ b/.docker/tester.yml @@ -0,0 +1,26 @@ +version: '3.7' +services: + tester: + image: "$CI_REGISTRY_IMAGE" + networks: + - docker_caosnet + volumes: + - type: bind + source: ./cert + target: /cert + - type: volume + source: extroot + target: /extroot + - type: volume + source: scripting + target: /scripting + - type: volume + source: authtoken + target: /authtoken +networks: + docker_caosnet: + external: true +volumes: + scripting: + extroot: + authtoken: diff --git a/.docker/tester_pycaosdb.ini b/.docker/tester_pycaosdb.ini new file mode 100644 index 0000000000000000000000000000000000000000..2159dec250b3dcb2f16043d12bdbe73675e4d75c --- /dev/null +++ b/.docker/tester_pycaosdb.ini @@ -0,0 +1,31 @@ +; pycaosdb.ini for pytest test suites. + +[IntegrationTests] +; location of the scripting bin dir which is used for the test scripts from the +; server's perspective. +test_server_side_scripting.bin_dir.server = scripting/bin-debug/ +; location of the scripting bin dir which is used for the test scripts from the +; pyinttest's perspective. +test_server_side_scripting.bin_dir.local = /scripting/bin-debug/ + +; location of the files from the pyinttest perspective +test_files.test_insert_files_in_dir.local = /extroot/test_insert_files_in_dir/ +; location of the files from the caosdb_servers perspective +test_files.test_insert_files_in_dir.server = /opt/caosdb/mnt/extroot/test_insert_files_in_dir/ + +; location of the one-time tokens from the pyinttest's perspective +test_authentication.admin_token_crud = /authtoken/admin_token_crud.txt +test_authentication.admin_token_expired = /authtoken/admin_token_expired.txt +test_authentication.admin_token_3_attempts = /authtoken/admin_token_3_attempts.txt + + +[Connection] +url = https://caosdb-server:10443/ +username = admin +cacert = /cert/caosdb.cert.pem +debug = 0 + +password_method = plain +password = caosdb + +timeout = 500 diff --git a/.docker/wait-for-it.sh b/.docker/wait-for-it.sh new file mode 100755 index 0000000000000000000000000000000000000000..d69e99f1f13257b559dce2433de0515379663efa --- /dev/null +++ b/.docker/wait-for-it.sh @@ -0,0 +1,182 @@ +#!/usr/bin/env bash +# License: +# From https://github.com/vishnubob/wait-for-it +# The MIT License (MIT) +# Use this script to test if a given TCP host/port are available + +WAITFORIT_cmdname=${0##*/} + +echoerr() { if [[ $WAITFORIT_QUIET -ne 1 ]]; then echo "$@" 1>&2; fi } + +usage() +{ + cat << USAGE >&2 +Usage: + $WAITFORIT_cmdname host:port [-s] [-t timeout] [-- command args] + -h HOST | --host=HOST Host or IP under test + -p PORT | --port=PORT TCP port under test + Alternatively, you specify the host and port as host:port + -s | --strict Only execute subcommand if the test succeeds + -q | --quiet Don't output any status messages + -t TIMEOUT | --timeout=TIMEOUT + Timeout in seconds, zero for no timeout + -- COMMAND ARGS Execute command with args after the test finishes +USAGE + exit 1 +} + +wait_for() +{ + if [[ $WAITFORIT_TIMEOUT -gt 0 ]]; then + echoerr "$WAITFORIT_cmdname: waiting $WAITFORIT_TIMEOUT seconds for $WAITFORIT_HOST:$WAITFORIT_PORT" + else + echoerr "$WAITFORIT_cmdname: waiting for $WAITFORIT_HOST:$WAITFORIT_PORT without a timeout" + fi + 
WAITFORIT_start_ts=$(date +%s) + while : + do + if [[ $WAITFORIT_ISBUSY -eq 1 ]]; then + nc -z $WAITFORIT_HOST $WAITFORIT_PORT + WAITFORIT_result=$? + else + (echo > /dev/tcp/$WAITFORIT_HOST/$WAITFORIT_PORT) >/dev/null 2>&1 + WAITFORIT_result=$? + fi + if [[ $WAITFORIT_result -eq 0 ]]; then + WAITFORIT_end_ts=$(date +%s) + echoerr "$WAITFORIT_cmdname: $WAITFORIT_HOST:$WAITFORIT_PORT is available after $((WAITFORIT_end_ts - WAITFORIT_start_ts)) seconds" + break + fi + sleep 1 + done + return $WAITFORIT_result +} + +wait_for_wrapper() +{ + # In order to support SIGINT during timeout: http://unix.stackexchange.com/a/57692 + if [[ $WAITFORIT_QUIET -eq 1 ]]; then + timeout $WAITFORIT_BUSYTIMEFLAG $WAITFORIT_TIMEOUT $0 --quiet --child --host=$WAITFORIT_HOST --port=$WAITFORIT_PORT --timeout=$WAITFORIT_TIMEOUT & + else + timeout $WAITFORIT_BUSYTIMEFLAG $WAITFORIT_TIMEOUT $0 --child --host=$WAITFORIT_HOST --port=$WAITFORIT_PORT --timeout=$WAITFORIT_TIMEOUT & + fi + WAITFORIT_PID=$! + trap "kill -INT -$WAITFORIT_PID" INT + wait $WAITFORIT_PID + WAITFORIT_RESULT=$? + if [[ $WAITFORIT_RESULT -ne 0 ]]; then + echoerr "$WAITFORIT_cmdname: timeout occurred after waiting $WAITFORIT_TIMEOUT seconds for $WAITFORIT_HOST:$WAITFORIT_PORT" + fi + return $WAITFORIT_RESULT +} + +# process arguments +while [[ $# -gt 0 ]] +do + case "$1" in + *:* ) + WAITFORIT_hostport=(${1//:/ }) + WAITFORIT_HOST=${WAITFORIT_hostport[0]} + WAITFORIT_PORT=${WAITFORIT_hostport[1]} + shift 1 + ;; + --child) + WAITFORIT_CHILD=1 + shift 1 + ;; + -q | --quiet) + WAITFORIT_QUIET=1 + shift 1 + ;; + -s | --strict) + WAITFORIT_STRICT=1 + shift 1 + ;; + -h) + WAITFORIT_HOST="$2" + if [[ $WAITFORIT_HOST == "" ]]; then break; fi + shift 2 + ;; + --host=*) + WAITFORIT_HOST="${1#*=}" + shift 1 + ;; + -p) + WAITFORIT_PORT="$2" + if [[ $WAITFORIT_PORT == "" ]]; then break; fi + shift 2 + ;; + --port=*) + WAITFORIT_PORT="${1#*=}" + shift 1 + ;; + -t) + WAITFORIT_TIMEOUT="$2" + if [[ $WAITFORIT_TIMEOUT == "" ]]; then break; fi + shift 2 + ;; + --timeout=*) + WAITFORIT_TIMEOUT="${1#*=}" + shift 1 + ;; + --) + shift + WAITFORIT_CLI=("$@") + break + ;; + --help) + usage + ;; + *) + echoerr "Unknown argument: $1" + usage + ;; + esac +done + +if [[ "$WAITFORIT_HOST" == "" || "$WAITFORIT_PORT" == "" ]]; then + echoerr "Error: you need to provide a host and port to test." + usage +fi + +WAITFORIT_TIMEOUT=${WAITFORIT_TIMEOUT:-15} +WAITFORIT_STRICT=${WAITFORIT_STRICT:-0} +WAITFORIT_CHILD=${WAITFORIT_CHILD:-0} +WAITFORIT_QUIET=${WAITFORIT_QUIET:-0} + +# check to see if timeout is from busybox? +WAITFORIT_TIMEOUT_PATH=$(type -p timeout) +WAITFORIT_TIMEOUT_PATH=$(realpath $WAITFORIT_TIMEOUT_PATH 2>/dev/null || readlink -f $WAITFORIT_TIMEOUT_PATH) +if [[ $WAITFORIT_TIMEOUT_PATH =~ "busybox" ]]; then + WAITFORIT_ISBUSY=1 + WAITFORIT_BUSYTIMEFLAG="-t" + +else + WAITFORIT_ISBUSY=0 + WAITFORIT_BUSYTIMEFLAG="" +fi + +if [[ $WAITFORIT_CHILD -gt 0 ]]; then + wait_for + WAITFORIT_RESULT=$? + exit $WAITFORIT_RESULT +else + if [[ $WAITFORIT_TIMEOUT -gt 0 ]]; then + wait_for_wrapper + WAITFORIT_RESULT=$? + else + wait_for + WAITFORIT_RESULT=$? 
+ fi +fi + +if [[ $WAITFORIT_CLI != "" ]]; then + if [[ $WAITFORIT_RESULT -ne 0 && $WAITFORIT_STRICT -eq 1 ]]; then + echoerr "$WAITFORIT_cmdname: strict mode, refusing to execute subprocess" + exit $WAITFORIT_RESULT + fi + exec "${WAITFORIT_CLI[@]}" +else + exit $WAITFORIT_RESULT +fi + diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..5599d7d263c8927025e128c37eabb185025bf96b --- /dev/null +++ b/.gitignore @@ -0,0 +1,19 @@ +src/caoscrawler.egg-info/ +.coverage +__pycache__ +.tox +TAGS +src/.coverage +build/ +*~ +.pdbrc +provenance.yml +*.pkcs12 +*.pem +*.jks +*.tar.gz +*.sql +/integrationtests/test-profile/custom/other/cert/ +src/doc/_apidoc/ +start_caosdb_docker.sh +src/doc/_apidoc diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml new file mode 100644 index 0000000000000000000000000000000000000000..30a8cd8fe4c08fd3fe0f3f98aaa56b83cb623086 --- /dev/null +++ b/.gitlab-ci.yml @@ -0,0 +1,259 @@ +# +# This file is a part of the CaosDB Project. +# +# Copyright (C) 2018 Research Group Biomedical Physics, +# Max-Planck-Institute for Dynamics and Self-Organization Göttingen +# Copyright (C) 2019 Henrik tom Wörden +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <https://www.gnu.org/licenses/>. + +variables: + CI_REGISTRY_IMAGE: $CI_REGISTRY/caosdb/src/caosdb-crawler/testenv:$CI_COMMIT_REF_NAME + CI_REGISTRY_IMAGE_BASE: $CI_REGISTRY/caosdb/src/caosdb-pyinttest/base:latest + +stages: + - info + - setup + - cert + - style + - test + - deploy + + +# During the test stage the CI pipeline (which runs in a "root" docker) starts +# two docker containers with docker-compose (one for the caosdb-server, another +# for the mysql-backend). Then a third docker is being started which contains +# the test suite and executes it. +# +# +-------------(root docker)-------------------------+ +# | | +# | +-(caosdb_mysqlbackend)-------------+ | +# | | | | +# | +-----------------------------------+ | +# | +-(caosdb-server)-------------------+ | +# | | | | +# | | /opt/caosdb | | +# | .------->| + /git/caosdb-server/scripting/ | | +# | | .----->| + /git/caosdb-server/authtoken/ | | +# | | | .--->| + /mnt/extroot | | +# | | | | .->| + /cert | | +# | | | | | | | | +# | | | | | +-----------------------------------+ | +# | | | | | | +# | | | | | filesystem: | +# | | | | *--- /cert -----------. | +# | | | | | | +# | | | | volumes: | | +# | | | *----- extroot ------. | | +# | | *------- scripting --. | | | +# | *--------- authtoken -. | | | | +# | | | | | | +# | +-(crawler tests)---+ | | | | | +# | | | | | | | | +# | | /authtoken |<---* | | | | +# | | /scripting |<----* | | | +# | | /extroot |<------* | | +# | | /cert |<--------* | +# | | | | +# | +----------------------+ | +# +---------------------------------------------------+ +# +# In the root docker, the directory /cert is mounted to .docker/cert relative +# to this repository. 
The directory is created during the cert stage of this +# pipeline and a certificate is created in there. The certificat is then +# available in mounted directories in the server and crawler containers. +# +# Additional volumes in the root docker are shared by the caosdb-server and the crawler +# containers. These volumes are intended to be used for testing server-side scripting and +# file-system features. +# + +.env: &env + - echo "Pipeline triggered by $TRIGGERED_BY_REPO@$TRIGGERED_BY_REF ($TRIGGERED_BY_HASH)" + - echo "CI_REGISTRY_IMAGE_BASE = $CI_REGISTRY_IMAGE_BASE" + - echo "CI_REGISTRY_IMAGE = $CI_REGISTRY_IMAGE" + - echo "CAOSDB_TAG = $CAOSDB_TAG" + - echo "REFTAG = $REFTAG" + - echo "F_BRANCH = $F_BRANCH" + - echo "CI_COMMIT_REF_NAME = $CI_COMMIT_REF_NAME" + - ls -lah /image-cache/ + + - F_BRANCH=${F_BRANCH:-$CI_COMMIT_REF_NAME} + - echo $F_BRANCH + - if [[ "$REFTAG" == "" ]] ; then + if [[ "$F_BRANCH" == "dev" ]] ; then + REFTAG=dev; + fi; + fi + - REFTAG=${REFTAG:-dev_F_${F_BRANCH}} + + - echo $F_BRANCH + + - if [[ "$CAOSDB_TAG" == "" ]]; then + CAOSDB_TAG=${REFTAG}; + fi + - echo $CAOSDB_TAG + +info: + tags: [cached-dind] + image: docker:20.10 + stage: info + needs: [] + script: + - *env + +unittest: + tags: [cached-dind] + image: docker:20.10 + stage: test + image: $CI_REGISTRY_IMAGE + script: + - tox + +inttest: + tags: [docker] + services: + - docker:20.10-dind + variables: + # This is a workaround for the gitlab-runner health check mechanism when + # using docker-dind service. The runner will otherwise guess the port + # wrong and the health check will timeout. + SERVICE_PORT_2376_TCP_PORT: 2375 + stage: test + image: $CI_REGISTRY_IMAGE_BASE + needs: [cert] + script: + - *env + - docker login -u gitlab-ci-token -p $CI_JOB_TOKEN $CI_REGISTRY + - echo $CAOSDB_TAG + + - cd .docker + # Store mariadb version + - MARIADBVERSION=$(grep mariadb docker-compose.yml | awk '{print $2}') + - echo "mariadb image:"$MARIADBVERSION + - time docker load < /image-cache/caosdb-crawler-testenv-${CI_COMMIT_REF_NAME}.tar || true + - time docker load < /image-cache/caosdb-${REFTAG}.tar || time docker load < /image-cache/caosdb-dev.tar || true + - time docker load < /image-cache/$MARIADBVERSION.tar || true + - docker pull $CI_REGISTRY/caosdb/src/caosdb-deploy:$CAOSDB_TAG || CAOSDB_TAG=dev + - docker pull $CI_REGISTRY_IMAGE + + # Here, the server and the mysql backend docker are being started + - CAOSDB_TAG=$CAOSDB_TAG docker-compose up -d + + # Store versions of CaosDB parts + - docker exec -u 0 -t docker-caosdb-server-1 cat /opt/caosdb/git/caosdb_pylib_commit > hash_pylib + - docker exec -u 0 -t docker-caosdb-server-1 cat /opt/caosdb/git/caosdb_webui_commit > hash_webui + - docker exec -u 0 -t docker-caosdb-server-1 cat /opt/caosdb/git/caosdb_server_commit > hash_server + - docker exec -u 0 -t docker-caosdb-server-1 cat /opt/caosdb/git/caosdb_mysqlbackend_commit > hash_mysql + - docker exec -u 0 -t docker-caosdb-server-1 cat /opt/caosdb/git/caosdb_proto_commit > hash_proto + - cat hash_server + - cat hash_proto + - cat hash_mysql + - cat hash_webui + - cat hash_pylib + # Run the actual tests. This starts a new docker container within which + # the tests run. The return value is stored in .docker/result + - /bin/sh ./run.sh + + # Save logs + - docker logs docker-caosdb-server-1 &> ../caosdb_log.txt + - docker logs docker-sqldb-1 &> ../mariadb_log.txt + - cd .. 
+ + # Stop the server + - docker-compose -f .docker/docker-compose.yml down + + # the crawler docker writes the return value of the tests into the + # file result + - rc=`cat .docker/result` + - exit $rc + dependencies: [cert] + timeout: 3h + artifacts: + paths: + - caosdb_log.txt + - mariadb_log.txt + - .docker/hash_* + expire_in: 1 week + +build-testenv: + tags: [cached-dind] + image: docker:20.10 + stage: setup + timeout: 2h + only: + - schedules + - web + - pushes + needs: [] + script: + - df -h + - command -v wget + - if [ -z "$PYLIB" ]; then + if echo "$CI_COMMIT_REF_NAME" | grep -c "^f-" ; then + echo "Check if pylib has branch $CI_COMMIT_REF_NAME" ; + if wget https://gitlab.indiscale.com/api/v4/projects/97/repository/branches/${CI_COMMIT_REF_NAME} ; then + PYLIB=$CI_COMMIT_REF_NAME ; + fi; + fi; + fi; + - PYLIB=${PYLIB:-dev} + - echo $PYLIB + + - if [ -z "$ADVANCED" ]; then + if echo "$CI_COMMIT_REF_NAME" | grep -c "^f-" ; then + echo "Check if advanced user tools have branch $CI_COMMIT_REF_NAME" ; + if wget https://gitlab.indiscale.com/api/v4/projects/104/repository/branches/${CI_COMMIT_REF_NAME} ; then + ADVANCED=$CI_COMMIT_REF_NAME ; + fi; + fi; + fi; + - ADVANCED=${ADVANCED:-dev} + - echo $ADVANCED + + - docker login -u gitlab-ci-token -p $CI_JOB_TOKEN $CI_REGISTRY + # use here general latest or specific branch latest... + - docker build + --build-arg PYLIB=${PYLIB} + --build-arg ADVANCED=${ADVANCED:dev} + --file .docker/Dockerfile + -t $CI_REGISTRY_IMAGE . + - docker push $CI_REGISTRY_IMAGE + - docker save $CI_REGISTRY_IMAGE > /image-cache/caosdb-crawler-testenv-${CI_COMMIT_REF_NAME}.tar + +cert: + tags: [docker] + stage: cert + image: $CI_REGISTRY_IMAGE + needs: + - job: build-testenv + optional: true + artifacts: + paths: + - .docker/cert/ + expire_in: 1 week + script: + - cd .docker + - CAOSHOSTNAME=caosdb-server ./cert.sh + +style: + tags: [docker] + stage: style + image: $CI_REGISTRY_IMAGE + needs: + - job: build-testenv + optional: true + script: + - autopep8 -r --diff --exit-code . + allow_failure: true diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 0000000000000000000000000000000000000000..a6e38b041e80e3d8b983f9f1562160a642d9480b --- /dev/null +++ b/.gitmodules @@ -0,0 +1,3 @@ +[submodule "test-setup/caosdb-deploy"] + path = test-setup/caosdb-deploy + url = git@gitlab.indiscale.com:caosdb/src/caosdb-deploy.git diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 0000000000000000000000000000000000000000..6fcd19eed6a3b2f5c083b1752d16fa6d1a414742 --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,36 @@ +# Changelog + +All notable changes to this project will be documented in this file. + +The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/), +and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). + +## [0.1.0] - 2022-10-11 +(Florian Spreckelsen) + +### Added + +* Everything +* Added new converters for tables: CSVTableConverter and XLSXTableConverter +* Possibility to authorize updates as in the old crawler +* Allow authorization of inserts +* Allow splitting cfoods into multiple yaml documents +* Implemented macros +* Converters can now filter the list of children +* You can now crawl data with name conflicts: `synchronize(unique_names=False)` + +### Changed + +* MAINT: Renamed module from `newcrawler` to `caoscrawler` +* MAINT: Removed global converters from `crawl.py` + +### Fixed + +* FIX: #12 +* FIX: #14 +* FIX: Variables are now also replaced when the value is given as a list. 
+* FIX: #35 Parent cannot be set from value +* [#6](https://gitlab.com/caosdb/caosdb-crawler/-/issues/6): Fixed many type + hints to be compatible to python 3.8 +* [#9](https://gitlab.com/caosdb/caosdb-crawler/-/issues/9): Sclaras of types + different than string can now be given in cfood definitions diff --git a/Makefile b/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..95fc2bf61473b94decfb43d0c5ba0d3fda535a07 --- /dev/null +++ b/Makefile @@ -0,0 +1,48 @@ +# ** header v3.0 +# This file is a part of the CaosDB Project. +# +# Copyright (C) 2020 IndiScale GmbH <info@indiscale.com> +# Copyright (C) 2020 Daniel Hornung <d.hornung@indiscale.com> +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <https://www.gnu.org/licenses/>. +# +# ** end header + +# This Makefile is a wrapper for several other scripts. + +.PHONY: help + +help: + @echo 'Type `make doc` for documentation, or `make install` for (local) installation.' + +doc: + $(MAKE) -C src/doc html + +install: + @echo "Not implemented yet, use pip for installation." + +check: style lint +.PHONY: check + +style: + pycodestyle --count src unittests +.PHONY: style + +lint: + pylint --unsafe-load-any-extension=y -d all -e E,F src/caoscrawler +.PHONY: lint + +unittest: + tox -r +.PHONY: unittest diff --git a/README.md b/README.md index 7e99a70ce6b4fce574fc7a1b02e2dfed0e1d6e7b..b97fc8775ba334c03c5c0a42238e9f396301e6b3 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# newcrawler +# caoscrawler A new crawler for CaosDB. @@ -25,6 +25,8 @@ After installation of the package run (within the project folder): pytest ``` +## Integration Tests +see `integrationtests/README.md` # Contributers @@ -36,8 +38,9 @@ The original authors of this package are: # License -Copyright (C) 2021 Research Group Biomedical Physics, Max Planck Institute for +Copyright (C) 2021-2022 Research Group Biomedical Physics, Max Planck Institute for Dynamics and Self-Organization Göttingen. +Copyright (C) 2021-2022 IndiScale GmbH All files in this repository are licensed under a [GNU Affero General Public License](LICENCE) (version 3 or later). diff --git a/RELEASE_GUIDELINES.md b/RELEASE_GUIDELINES.md new file mode 100644 index 0000000000000000000000000000000000000000..d6bc2c9ae41b8032a5567f786eb060d7b67d2cc5 --- /dev/null +++ b/RELEASE_GUIDELINES.md @@ -0,0 +1,47 @@ +# Release Guidelines for the CaosDB Python Client Library + +This document specifies release guidelines in addition to the general release +guidelines of the CaosDB Project +([RELEASE_GUIDELINES.md](https://gitlab.com/caosdb/caosdb/blob/dev/RELEASE_GUIDELINES.md)) + +## General Prerequisites + +* All tests are passing. +* FEATURES.md is up-to-date and a public API is being declared in that document. +* CHANGELOG.md is up-to-date. +* dependencies in `setup.cfg` are up-to-date. + +## Steps + +1. Create a release branch from the dev branch. 
This prevents further changes
+   to the code base and a never-ending release process. Naming: `release-<VERSION>`
+
+2. Update CHANGELOG.md
+
+3. Check all general prerequisites.
+
+4. Update the version:
+   - `version` variables in `src/doc/conf.py`
+   - Version in [setup.cfg](./setup.cfg): Check the `MAJOR`, `MINOR`, `MICRO`, `PRE` variables and set
+     `ISRELEASED` to `True`. Use the possibility to issue pre-release versions for testing.
+
+5. Merge the release branch into the main branch.
+
+6. Tag the latest commit of the main branch with `v<VERSION>`.
+
+7. Delete the release branch.
+
+8. Remove a possibly existing `./dist` directory with the old release.
+
+9. Publish the release by executing `./release.sh`, which uploads the caosdb
+   module to the Python Package Index [pypi.org](https://pypi.org).
+
+10. Merge the main branch back into the dev branch.
+
+11. After the merge of main to dev, start a new development version by
+    increasing at least the micro version in [setup.cfg](./setup.cfg) and
+    preparing CHANGELOG.md.
+
+12. Create releases on gitlab.com and gitlab.indiscale.com that contain (at
+    least) the most recent section of the CHANGELOG as the description and link
+    to the PyPI package.
diff --git a/concept.md b/concept.md
new file mode 100644
index 0000000000000000000000000000000000000000..8b00cd805011466d50ec033a483ad42d08b77374
--- /dev/null
+++ b/concept.md
@@ -0,0 +1,113 @@
+# Crawler 2.0
+The current CaosDB crawler has several limitations. The concept of
+identifiables, for example, cannot incorporate conditions such as entities that
+reference the identified entity (it only covers entities that are being
+referenced, not the other direction).
+Another aspect is that crawler setup should be easier. This should probably
+result in less code (since custom and possibly untested code is error-prone).
+Optimally, setup/configuration can be done using a visual tool or is (in part) automated.
+
+One approach to these goals would be to:
+1. generalize some aspects of the crawler (e.g. the identifiable)
+2. use a more configuration-based approach that requires as little programming
+   as possible
+
+The data structures that we encountered in the past were inherently hierarchical:
+- folder structures
+- standardized containers, like HDF5 files
+- ASCII "container" formats, like JSON files
+
+The Crawler 2.0 should be able to treat arbitrary hierarchical structures and
+convert them to interconnected Records that are consistent with a predefined
+semantic data model.
+
+The configuration must define:
+- How the structure is created.
+  Example: Does the content of a file need to be considered and added to the tree?
+- How the structure and its contained data are mapped to the semantic data model.
+  Example: The Record "Experiment" will store the data from the folder name and the
+  email address from a JSON file as CaosDB properties.
+
+
+## Structure Mapping
+In the following, it is described how the above can be done on an abstract level.
+
+The hierarchical structure is assumed to be constituted of a tree of
+StructureElements. The tree is created on the fly by so-called Converters which
+are defined in the configuration. The tree of StructureElements is a model
+of the existing data.
+Example: A tree of Python file objects (StructureElements) could represent a file tree
+  that exists on some file server.
+
+Converters treat StructureElements and thereby create the StructureElements that
+are the children of the treated StructureElement.
+Example: A StructureElement represents a folder and a Converter defines that for each file in
+the folder another StructureElement is created.
+Converters therefore create the above named tree. The definition of a Converter also contains what
+Converters shall be used to treat the generated child-StructureElements. The definition is therefore a tree itself.
+
+> Alex: The previous paragraph is difficult to understand. The reference "above named" is a little unclear.
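+
+To make this more concrete, here is a minimal, purely illustrative sketch in plain Python.
+None of the class or function names below (`StructureElement`, `Converter`, `build_tree`,
+`folder_converter`) are part of an existing API; they only mirror the concept that Converters
+create the children of a treated StructureElement and that the Converter definition is itself a tree:
+
+```python
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import Callable, List
+
+
+@dataclass
+class StructureElement:
+    """One node of the tree that models the existing data."""
+    name: str
+    values: dict = field(default_factory=dict)    # key-value pairs of this node
+    children: list = field(default_factory=list)
+
+
+@dataclass
+class Converter:
+    """Decides whether it treats an element and, if so, which children it creates."""
+    match: Callable[[StructureElement], bool]
+    create_children: Callable[[StructureElement], List[StructureElement]]
+    subconverters: list = field(default_factory=list)   # the definition is itself a tree
+
+
+def build_tree(element: StructureElement, converters: list) -> StructureElement:
+    """Treat `element` with every matching converter and recurse into the created children."""
+    for conv in converters:
+        if not conv.match(element):
+            continue
+        for child in conv.create_children(element):
+            element.children.append(build_tree(child, conv.subconverters))
+    return element
+
+
+# One illustrative converter: for each file in a folder, create a child StructureElement.
+folder_converter = Converter(
+    match=lambda el: "path" in el.values and Path(el.values["path"]).is_dir(),
+    create_children=lambda el: [
+        StructureElement(p.name, {"path": str(p), "filename": p.name})
+        for p in sorted(Path(el.values["path"]).iterdir()) if p.is_file()
+    ],
+)
+
+root = build_tree(StructureElement("root", {"path": "."}), [folder_converter])
+print([child.values["filename"] for child in root.children])
+```
+
+In such a sketch the nesting of `subconverters` mirrors the nesting of the data; this is
+exactly the property that the configuration format has to express.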
+
+> Side discussion
+> Question: Should there be global Converters
+> that are always checked when treating a StructureElement? Should Converters be
+> associated with generated child-StructureElements? Currently, all children are
+> created and checked against all Converters. It could be that one would like to
+> check file-StructureElements against one set of Converters and
+> directory-StructureElements against another.
+>
+> Alex's opinion: I would rather go for a macro/variable/template-based solution, so that the employment of a globally predefined
+> converter is explicitly mentioned instead of "silently and automatically" applied.
+
+Each StructureElement in the tree has a set of data values, i.e. a dictionary
+of key-value pairs.
+Some of those values may be set due to the kind of StructureElement. For example,
+a file could always have the file name as such a key-value pair: 'filename': <sth>.
+Converters may define additional functions that create further values. For
+example, a regular expression could be used to get a date from a file name.
+
+## Identifiables
+The concept of an identifiable should be broadened to describe how an entity can be
+identified. Suggestion: definition through a unique query.
+Example: "FIND RECORD Fish WITH FishNumber=A AND WHICH IS REFERENCED BY B"
+Note that the second part cannot be specified as a condition with the old
+identifiable concept.
+The query must return 1 or 0 entities. If no entity is returned, the respective
+object may be created; if one is returned, it is the one we were looking for.
+If more than one is returned, then there is a mistake in the definition or in
+the data set. It is the responsibility of the designer of the query for the identifiable
+to make sure that it returns either zero or one Entity.
+
+## Entity Construction
+
+In the simplest case an entity is constructed at a given node from its
+key-value pairs. However, the data for a given entity might be distributed over different levels of
+the tree.
+
+Two different approaches are possible:
+1. During the construction of an entity at a given node, key-value pairs
+   from other nodes are also used. For example, key-value pairs from parent nodes might
+   be made accessible, or key-value pairs might be accessed by providing the path
+   to them in the tree.
+2. Information is added to an entity at other nodes. The simplest case uses the
+   identifiable definition to add information, i.e. it is checked whether the
+   respective entity already exists on the server; if not, it is inserted, and
+   then the information is added.
+Additionally, it could be made possible to add information to entities that are
+constructed at other nodes without the use of the identifiable. For example,
+it could be allowed to add information to entities that were created at parent
+nodes.
+
+> Alex: I haven't really understood variant 2.
+
+## Value computation
+It is quite straightforward to set a Property of a Record to a value
+that is contained in the hierarchical structure.
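+
+As a sketch of what the following paragraph has in mind: in the simple case the value is
+taken directly from a node's key-value pairs, while a computed value would come from a tiny
+helper function whose name is then referenced in the configuration. All names below are
+purely illustrative and not part of any existing API:
+
+```python
+def email_value(node_values: dict) -> str:
+    # straightforward case: the value is already present in the node's key-value pairs
+    return node_values["email"]
+
+
+def full_name(node_values: dict) -> str:
+    # computed case: the desired value does not exist as such and has to be derived
+    return f"{node_values['firstname']} {node_values['lastname']}"
+
+
+# The configuration would then refer to such helpers by name only.
+VALUE_FUNCTIONS = {"email": email_value, "full_name": full_name}
+
+assert VALUE_FUNCTIONS["full_name"]({"firstname": "Ada", "lastname": "Lovelace"}) == "Ada Lovelace"
+```
+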
However, the example with the +regular expression illustrates that the desired value might not be present. +For example, the desired value might be `firstname+" "+lastname`. Since the +computation might not be trivial, it is likely that writing code for these +computations might be necessary. Still, these would be tiny parts that probably +can easily be unit tested. There is also no immediated security risk since the +configuration plus code replace the old scripts (i.e. only code). One could +define small functions that are vigorously unit tested and the function names +are used in the configuration. + diff --git a/integrationtests/README.md b/integrationtests/README.md new file mode 100644 index 0000000000000000000000000000000000000000..88d55902e3fdc5836baefd97c3192cc9ff01e7bd --- /dev/null +++ b/integrationtests/README.md @@ -0,0 +1,3 @@ +1. Mount test_data/extroot as extroot folder in the CaosDB server +2. use an empty server +3. run pytest from `src`: `python -m pytest ../integrationtests` diff --git a/integrationtests/basic_example/model.yml b/integrationtests/basic_example/model.yml new file mode 100644 index 0000000000000000000000000000000000000000..7e1a391186be6a01fb10d0b32e8516238012f374 --- /dev/null +++ b/integrationtests/basic_example/model.yml @@ -0,0 +1,88 @@ +Experiment: + obligatory_properties: + date: + datatype: DATETIME + description: 'date of the experiment' + identifier: + datatype: TEXT + description: 'identifier of the experiment' + # TODO empty recommended_properties is a problem + #recommended_properties: + responsible: + datatype: LIST<Person> +Project: +SoftwareVersion: + recommended_properties: + version: + datatype: TEXT + description: 'Version of the software.' + binaries: + sourceCode: + Software: +DepthTest: + obligatory_properties: + temperature: + datatype: DOUBLE + description: 'temp' + depth: + datatype: DOUBLE + description: 'temp' +Person: + obligatory_properties: + first_name: + datatype: TEXT + description: 'First name of a Person.' + last_name: + datatype: TEXT + description: 'LastName of a Person.' + recommended_properties: + email: + datatype: TEXT + description: 'Email of a Person.' +revisionOf: + datatype: REFERENCE +results: + datatype: LIST<REFERENCE> +sources: + datatype: LIST<REFERENCE> +scripts: + datatype: LIST<REFERENCE> +single_attribute: + datatype: LIST<INTEGER> +Simulation: + obligatory_properties: + date: + identifier: + responsible: +Analysis: + obligatory_properties: + date: + identifier: + responsible: + suggested_properties: + mean_value: + datatype: DOUBLE +Publication: +Thesis: + inherit_from_suggested: + - Publication +Article: + inherit_from_suggested: + - Publication +Poster: + inherit_from_suggested: + - Publication +Presentation: + inherit_from_suggested: + - Publication +Report: + inherit_from_suggested: + - Publication +hdf5File: + datatype: REFERENCE +Measurement: + recommended_properties: + date: +ReadmeFile: + datatype: REFERENCE +ProjectMarkdownReadme: diff --git a/integrationtests/basic_example/test_basic.py b/integrationtests/basic_example/test_basic.py new file mode 100755 index 0000000000000000000000000000000000000000..b24a1c658cfc9e23ca0ba2de266161864cb6b66c --- /dev/null +++ b/integrationtests/basic_example/test_basic.py @@ -0,0 +1,336 @@ +#!/usr/bin/env python3 +# encoding: utf-8 +# +# ** header v3.0 +# This file is a part of the CaosDB Project. 
+# +# Copyright (C) 2021 Indiscale GmbH <info@indiscale.com> +# 2021 Henrik tom Wörden <h.tomwoerden@indiscale.com> +# 2021 Alexander Schlemmer +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <https://www.gnu.org/licenses/>. +# +# ** end header +# + +""" +module description +""" + +from caosadvancedtools.crawler import Crawler as OldCrawler +import os +from caosdb import EmptyUniqueQueryError +import argparse +import sys +from argparse import RawTextHelpFormatter +from caoscrawler import Crawler, SecurityMode +import caosdb as db +from caoscrawler.identifiable_adapters import CaosDBIdentifiableAdapter +import pytest +from caosadvancedtools.models.parser import parse_model_from_yaml +import yaml + +# TODO is not yet merged in caosadvancedtools +#from caosadvancedtools.testutils import clear_database, set_test_key +# set_test_key("10b128cf8a1372f30aa3697466bb55e76974e0c16a599bb44ace88f19c8f61e2") + + +def rfp(*pathcomponents): + """ + Return full path. + Shorthand convenience function. + """ + return os.path.join(os.path.dirname(__file__), *pathcomponents) + + +@pytest.fixture +def clear_database(): + db.execute_query("FIND Entity").delete() + + +@pytest.fixture +def usemodel(): + model = parse_model_from_yaml(rfp("model.yml")) + model.sync_data_model(noquestion=True, verbose=False) + + +@pytest.fixture +def ident(): + ident = CaosDBIdentifiableAdapter() + + # TODO place this definition of identifiables elsewhere + ident.register_identifiable( + "Person", db.RecordType() + .add_parent(name="Person") + # .add_property(name="first_name") + .add_property(name="last_name")) + ident.register_identifiable( + "Measurement", db.RecordType() + .add_parent(name="Measurement") + # .add_property(name="identifier") + .add_property(name="date") + .add_property(name="project")) + ident.register_identifiable( + "Project", db.RecordType() + .add_parent(name="Project") + .add_property(name="date") + .add_property(name="identifier")) + return ident + + +def crawl_standard_test_directory(cr: Crawler, + subdir: str = "examples_article", + cfood: str = "scifolder_cfood.yml"): + cr.crawl_directory(rfp("..", "..", "unittests", "test_directories", subdir), + rfp("..", "..", "unittests", cfood)) + + +@pytest.fixture +def crawler(ident): + cr = Crawler(debug=True, identifiableAdapter=ident) + crawl_standard_test_directory(cr) + return cr + + +@pytest.fixture +def crawler_extended(ident): + cr = Crawler(debug=True, identifiableAdapter=ident) + crawl_standard_test_directory(cr, cfood="scifolder_extended.yml") + # correct paths for current working directory + file_list = [r for r in cr.target_data if r.role == "File"] + for f in file_list: + f.file = rfp("..", "..", "unittests", "test_directories", f.file) + return cr + + +def test_single_insertion(clear_database, usemodel, crawler, ident): + ins, ups = crawler.synchronize() + + # This test also generates the file records.xml used in some of the unittesets: + res = db.execute_query("FIND 
Record") + for i in reversed(range(len(res))): + if res[i].parents[0].name == "PyTestInfo": + del res[i] + filename = rfp("..", "..", "unittests", "records.xml") + with open(filename, "w") as f: + xml = res.to_xml() + # Remove noscript and transaction benchmark: + for tag in ("noscript", "TransactionBenchmark"): + if xml.find(tag) is not None: + xml.remove(xml.find(tag)) + f.write(db.common.utils.xml2str(xml)) + + assert len(ins) == 18 + assert len(ups) == 0 + + # Do a second run on the same data, there should be no changes: + crawler = Crawler(debug=True, identifiableAdapter=ident) + crawler.crawl_directory(rfp("../../unittests/test_directories", "examples_article"), + rfp("../../unittests/scifolder_cfood.yml")) + ins, ups = crawler.synchronize() + assert len(ins) == 0 + assert len(ups) == 0 + + +def test_multiple_insertions(clear_database, usemodel, ident, crawler): + ins, ups = crawler.synchronize() + + # Do a second run on the same data, there should be no changes: + cr = Crawler(debug=True, identifiableAdapter=ident) + crawl_standard_test_directory(cr) + ins, ups = cr.synchronize() + assert len(ins) == 0 + assert len(ups) == 0 + + +def test_insertion(clear_database, usemodel, ident, crawler): + ins, ups = crawler.synchronize() + + # Do a second run on the same data, there should a new insert: + cr = Crawler(debug=True, identifiableAdapter=ident) + crawl_standard_test_directory(cr, "example_insert") + assert len(cr.target_data) == 3 + ins, ups = cr.synchronize() + assert len(ins) == 1 + assert len(ups) == 0 + + # Do it again to check whether nothing is changed: + cr = Crawler(debug=True, identifiableAdapter=ident) + crawl_standard_test_directory(cr, "example_insert") + assert len(cr.target_data) == 3 + ins, ups = cr.synchronize() + assert len(ins) == 0 + assert len(ups) == 0 + + +def test_insert_auth(clear_database, usemodel, ident, crawler): + ins, ups = crawler.synchronize() + + # Do a second run on the same data, there should a new insert: + cr = Crawler(debug=True, identifiableAdapter=ident, securityMode=SecurityMode.RETRIEVE) + crawl_standard_test_directory(cr, "example_insert") + assert len(cr.target_data) == 3 + ins, ups = cr.synchronize() + assert len(ins) == 1 + assert not ins[0].is_valid() + nins, nups = OldCrawler.update_authorized_changes(cr.run_id) + assert nins == 1 + + # Do it again to check whether nothing is changed: + cr = Crawler(debug=True, identifiableAdapter=ident) + crawl_standard_test_directory(cr, "example_insert") + assert len(cr.target_data) == 3 + ins, ups = cr.synchronize() + assert len(ins) == 0 + assert len(ups) == 0 + + +def test_insertion_and_update(clear_database, usemodel, ident, crawler): + ins, ups = crawler.synchronize() + + cr = Crawler(debug=True, identifiableAdapter=ident) + crawl_standard_test_directory(cr, "example_insert") + ins, ups = cr.synchronize() + + cr = Crawler(debug=True, identifiableAdapter=ident) + crawl_standard_test_directory(cr, "example_overwrite_1") + # print(cr.target_data) + # cr.save_debug_data(rfp("provenance.yml")) + assert len(cr.target_data) == 3 + ins, ups = cr.synchronize() + assert len(ins) == 0 + assert len(ups) == 1 + + +def test_identifiable_update(clear_database, usemodel, ident, crawler): + ins, ups = crawler.synchronize() + + # Do a second run on the same data with a change in one + # of the identifiables: + cr = Crawler(debug=True, identifiableAdapter=ident) + crawl_standard_test_directory(cr) + + # Test the addition of a single property: + l = cr.target_data + for record in l: + if (record.parents[0].name == 
"Measurement" and + record.get_property("date").value == "2020-01-03"): + # maybe a bit weird, but add an email address to a measurement + record.add_property( + name="email", value="testperson@testaccount.test") + print("one change") + break + ins, ups = cr.synchronize() + assert len(ins) == 0 + assert len(ups) == 1 + + # Test the change within one property: + cr = Crawler(debug=True, identifiableAdapter=ident) + crawl_standard_test_directory(cr) + l = cr.target_data + for record in l: + if (record.parents[0].name == "Measurement" and + record.get_property("date").value == "2020-01-03"): + record.add_property(name="email", value="testperson@coolmail.test") + print("one change") + break + ins, ups = cr.synchronize() + assert len(ins) == 0 + assert len(ups) == 1 + + # Changing the date should result in a new insertion: + cr = Crawler(debug=True, identifiableAdapter=ident) + crawl_standard_test_directory(cr) + l = cr.target_data + for record in l: + if (record.parents[0].name == "Measurement" and + record.get_property("date").value == "2020-01-03"): + record.add_property(name="email", value="testperson@coolmail.test") + record.get_property("date").value = "2012-01-02" + print("one change") + break + ins, ups = cr.synchronize() + assert len(ins) == 1 + assert len(ups) == 0 + + +def test_file_insertion_dry(clear_database, usemodel, ident): + crawler_extended = Crawler(debug=True, identifiableAdapter=ident) + crawl_standard_test_directory( + crawler_extended, cfood="scifolder_extended.yml") + file_list = [r for r in crawler_extended.target_data if r.role == "File"] + assert len(file_list) == 11 + + for f in file_list: + assert f.path.endswith("README.md") + assert f.path[1:] == f.file + + ins, ups = crawler_extended.synchronize(commit_changes=False) + assert len(ups) == 0 + file_list_ins = [r for r in ins if r.role == "File"] + assert len(file_list_ins) == 11 + + +def test_file_insertion(clear_database, usemodel, ident, crawler_extended): + ins, ups = crawler_extended.synchronize(commit_changes=True) + file_list_ins = [r for r in ins if r.role == "File"] + assert len(file_list_ins) == 11 + + assert db.execute_query("COUNT File") > 0 + + # find record which references File does not seem to be possible + # retrieve ids of files: + files = db.execute_query("FIND File") + for f in files: + r = db.execute_query("FIND Record which references {}".format(f.id)) + assert len(r) == 1 + assert r[0].get_property("ReadmeFile").value == f.id + + +def test_file_update(clear_database, usemodel, ident, crawler_extended): + ins1, ups1 = crawler_extended.synchronize(commit_changes=True) + file_list_ins = [r for r in ins1 if r.role == "File"] + + cr = Crawler(debug=True, identifiableAdapter=ident) + crawl_standard_test_directory(cr, cfood="scifolder_extended.yml") + + file_list = [r for r in cr.target_data if r.role == "File"] + for f in file_list: + f.file = rfp("..", "..", "unittests", "test_directories", f.file) + ins2, ups2 = cr.synchronize(commit_changes=True) + assert len(ups1) == 0 + assert len(ups2) == 0 + + # Try adding a parent: + res = db.execute_query("Find File") + assert len(res) == 11 + assert len(res[0].parents) == 0 + + cr2 = Crawler(debug=True, identifiableAdapter=ident) + crawl_standard_test_directory(cr2, cfood="scifolder_extended2.yml") + + file_list = [r for r in cr2.target_data if r.role == "File"] + for f in file_list: + f.file = rfp("..", "..", "unittests", "test_directories", f.file) + ins3, ups3 = cr2.synchronize(commit_changes=True) + assert len(ups3) == 11 + + res = 
db.execute_query("Find File") + assert len(res) == 11 + assert res[0].parents[0].name == "ProjectMarkdownReadme" + + # TODO: Implement file update checks (based on checksum) + # Add test with actual file update: + # assert len(ins2) == 0 + # assert len(ups2) == len(file_list_ins) diff --git a/integrationtests/pycaosdb.ini b/integrationtests/pycaosdb.ini new file mode 100644 index 0000000000000000000000000000000000000000..a4f429736c9b46c8987d05a02724725295f32081 --- /dev/null +++ b/integrationtests/pycaosdb.ini @@ -0,0 +1,29 @@ +[Connection] +url=https://localhost:10443/ +username=admin +debug=0 +#cacert=/home//CaosDB/caosdb-deploy/profiles/default/custom/other/cert/caosdb.cert.pem +password_method=plain +password=caosdb + +ssl_insecure=True +timeout=5000 +[Container] +debug=0 + +#[Crawler] +#oldprefix=/ExperimentalData/ +#newprefix=/home/professional/CaosDB/caosdb-advanced-user-tools/integrationtests/extroot/ExperimentalData +#[IntegrationTests] +#test_server_side_scripting.bin_dir=/home/professional/CaosDB/caosdb-pyinttest/resources + +[Misc] +sendmail=sendmail_to_file +#sendmail=/usr/local/bin/sendmail_to_file +entity_loan.curator_mail_from=admin@indiscale.com +entity_loan.curator_mail_to=admin@indiscale.com +[sss_helper] +external_uri = https://localhost:10443 +[advancedtools] +crawler.from_mail=admin@indiscale.com +crawler.to_mail=admin@indiscale.com diff --git a/integrationtests/test-profile/custom/caosdb-server/.add_dir_to_git b/integrationtests/test-profile/custom/caosdb-server/.add_dir_to_git new file mode 100644 index 0000000000000000000000000000000000000000..c51a03ac8e38c55c161ae55fe6ba805a4e1b05f5 --- /dev/null +++ b/integrationtests/test-profile/custom/caosdb-server/.add_dir_to_git @@ -0,0 +1 @@ +This directory should be created when cloning or pulling this git repository. diff --git a/integrationtests/test-profile/custom/caosdb-server/scripting/home/.pycaosdb.ini b/integrationtests/test-profile/custom/caosdb-server/scripting/home/.pycaosdb.ini new file mode 100644 index 0000000000000000000000000000000000000000..f45f1dbb14a343f2bee23e48b850df0ab48ca13b --- /dev/null +++ b/integrationtests/test-profile/custom/caosdb-server/scripting/home/.pycaosdb.ini @@ -0,0 +1,8 @@ +[Connection] +url = https://localhost:10443 +cacert = /opt/caosdb/cert/caosdb.cert.pem +debug = 0 +timeout = 5000 + +[Misc] +sendmail = /usr/local/bin/sendmail_to_file diff --git a/integrationtests/test-profile/paths/extroot/README.md b/integrationtests/test-profile/paths/extroot/README.md new file mode 100644 index 0000000000000000000000000000000000000000..ee741757ce62d03d49358a7245faf44b20e6cd60 --- /dev/null +++ b/integrationtests/test-profile/paths/extroot/README.md @@ -0,0 +1,2 @@ +This directory is mounted into the LinkAhead docker container when the debug +profile is used, to allow the inclusion of external file systems. diff --git a/integrationtests/test-profile/profile.yml b/integrationtests/test-profile/profile.yml new file mode 100644 index 0000000000000000000000000000000000000000..f830a2fbe6c6a4ae35362676db310f3eadf6f4cc --- /dev/null +++ b/integrationtests/test-profile/profile.yml @@ -0,0 +1,179 @@ +default: + # Optionally, specify a base compose file. The default base compose file is + # located in ./compose/docker-compose-default.yml. A different compose file + # may be useful when other services shall be included, e.g. nginx or django. 
+ # base_compose_file: "compose/docker-compose-default.yml" + + # Directories (list of strings) (or single directory (string, deprecated)) + # with customization files. + # Directories listed later in the list take precedence over earlier ones. + # If you change this, you need to include the full list of directories + # (including possibly directories contained in the default setting). + # custom: + # Standard directory for customizations + # - "./custom" # included by default + # LinkAhead Theme for the web interface + # - "./theme-linkahead" # included by default + # - "./included_customization" # since this is later in the list this takes precedence + + # Paths to be mounted into Docker, all entries are optional. + paths: + # extroot: From where files are copied/symlinked. This is a + # list of `NAME: PATH` pairs or a single path. + extroot: + # "": "paths/extroot" + "": "../test_data/extroot/" + # + # "base": "/path/to/base/dir" + # "other": "/path/to/other" + # + # dropoffbox: (Soon to be deprecated.) Files can be written here, but note that you may need to + # become root to remove this directory later. The corresponding server property is DROP_OFF_BOX. + # dropoffbox: "/path/to/dropoffbox" + + # Docker building configuration + # References can be either hashes of commits or branch names + refs: + # SERVER: dev + # PYLIB: dev + # MYSQLBACKEND: dev + # WEBUI: dev + # ADVANCEDUSERTOOLS: dev + + # General configuration options + conf: + # Shall the SQL & caosroot dumps at custom/other/restore/ be used? + # restore: false + restore: true + # uncomment to disable tls (ssl). This might be insecure! + # no_tls: false + # Shall the local users be imported as LinkAhead users? + # local_users: false + # Shall the anonymous user have the administration role? Implies auth_optional: TRUE + # anonymous_admin: false + # Shall NIS/LDAP be used for authentication? + # nis: false + # Shall a mail server be used? + # mail: false + # You can provide the path to an non-standard sendmail executable + # sendmail: /usr/sbin/sendmail + # sendmail: /usr/local/bin/sendmail_to_file + # Shall the server run in debug mode? + # This will bind-mount the following directories from custom into the Docker + # container: + # - debug-authtoken :: Authentication tokens will be stored here. + # - debug-scripting-bin :: Used as the server-side scripting bin dir. + # debug: false + debug: true + # URL of the docker registry. Set to "" to look locally. + # registry_server: "gitlab.indiscale.com:5050" + # The account for accessing the registry server. "" means no account. + # registry_account: "" + # The secret token for accessing the registry server. "" means no token. + # registry_token: "" + # Name of the docker image + # image: "caosdb/src/caosdb-deploy" + # Tag of the docker image + # tag: "latest" + # Name of the main Docker container. Set to "" to use an auto-generated + # name, which is necessary for running multiple instances + # container_name: "linkahead" + # Directory where backups shall be stored. + # backup_dir: "backup" + # The time zone for the server + # timezone: "Coordinated Universal Time" + + # You can set labels for the docker container here + # labels: + # label_1_key: label_1_value + # label_2_key: label_2_value + + # User/Group of the server, either numeric or names. + # user_group: 999:999 + + # Network settings. 
+ network: + # The subnet for the Docker containers + # default: auto selected by docker + # You can set it with: + # subnet: 10.3.128.0/17 + # Port for accessing LinkAhead via HTTPS + # port_ssl: 10443 + # Port for accessing LinkAhead via plain HTTP (not recommended when + # accessible from untrusted networks, but ok for testing or when behind + # a proxy) + # port_plain: 8000 + # Port for GRPC end-point via HTTPS + # port_grpc_ssl: 8443 + # Port for GRPC end-point via plain HTTP + # port_grpc_plain: 8080 + # Port for debugging the LinkAhead JVM + # port_debug: 9000 + # Port for profiling the LinkAhead JVM via JMX + # port_profiler: 9090 + # listen to ip address ("" means any) + # bind_ip: "127.0.0.1" + + server: + # All the keys of conf are set as environment variables in the server + # container before the server start. This overrides the server.conf + # settings in any other files, even the settings from + # `custom/caosdb-server/conf/ext/server.conf.d/` + # Check out conf/core/server.conf in the caosdb-server repository for + # options. + # + # When the conf variables are unset, the server uses its default values + # or the values from the `server.conf.d` directory. + conf: + # uncomment to enable the anonymous user + # auth_optional: TRUE + # uncomment to use the your custom authtoken config. See + # `conf/core/authtoken.example.yaml` for examples. + # Note: The path is relative to the caosdb server's root directory. + # authtoken_config: conf/core/authtoken.yaml + + # HTTPS port of the grpc end-point + # grpc_server_port_https: 8443 + # HTTP port of the grpc end-point + # grpc_server_port_http: 8080 + + # Development configuration options + # devel: + # Copy the caosdb-server jar from this location into the Docker container. + # Note that this is implemented by copying the file to + # custom/caosdb-server/target/, any file there will be overwritten. + # jar: /var/build/caosdb-server/0123abcd/target/caosdb-server-<version>-jar-with-dependencies.jar + + # The following is for the very specific case of server-side scripts + # requiring additional Python packages that are not installed during + # the regular build process of LinkAhead. If additional packages are + # needed, list them below. Mind that only packages that can be + # installed by pip are supported. + + # scripting: + # packages: + + # Packages can be installed from PyPI or external git + # repositories. In this case, `mode: "pip"` has to be + # provided. `package` can be the package name in PyPI (possibly + # with a version specification, i.e., `my_package>=1.0`, or it + # can be the URL of a git repository of a Python + # package. Essentially, the command `pip3 install + # package_string` will be executed within LinkAhead. + + # <package1_key>: + # mode: "pip" + # package: "<package_string>" + + # Alternatively, local packages can be copied into LinkAhead and + # then be installed using pip. Here, `mode: "copy"` has to be + # provided. `path` specifies the path to the Python package on + # the host system. `package` is the name of the destination + # directory within the LinkAhead container into which the local + # package will be copied. After copying, a `pip3 install .` is + # run from within that directory. 
+ + # <package2_key>: + # mode: "copy" + # path: "/path/to/local/python/package" + # package: "<package_string>" diff --git a/integrationtests/test_data/extroot/realworld_example/data/35/.dataspace.json b/integrationtests/test_data/extroot/realworld_example/data/35/.dataspace.json new file mode 100644 index 0000000000000000000000000000000000000000..26e11e4e16081b8b5b64a83889bc1f4d160ef0e7 --- /dev/null +++ b/integrationtests/test_data/extroot/realworld_example/data/35/.dataspace.json @@ -0,0 +1,15 @@ +{ + "name": "DEMO", + "dataspace_id": 20002, + "archived": false, + "coordinator": { + "full_name": "Max Schmitt", + "given_name": "Max", + "family_name": "Schmitt", + "email": "max.schmitt@email.de" + }, + "start_date": "2022-03-01", + "end_date": "2032-02-28", + "comment": "Demonstration data space for DataCloud", + "url": "https://datacloud.de/index.php/f/7679" +} diff --git a/integrationtests/test_data/extroot/realworld_example/data/35/03_raw_data/001_dataset1/demo-dataset.csv b/integrationtests/test_data/extroot/realworld_example/data/35/03_raw_data/001_dataset1/demo-dataset.csv new file mode 100644 index 0000000000000000000000000000000000000000..7a4d684e50cf4fa0699c66d27661d0d54055ec8b --- /dev/null +++ b/integrationtests/test_data/extroot/realworld_example/data/35/03_raw_data/001_dataset1/demo-dataset.csv @@ -0,0 +1,101 @@ +index,A[kg],B[s],pH,Temp.[C] +0,2.1209183975976957,-0.5658499891692009,-0.8391639362482752,0.6210332089995103 +1,-1.2155508955759597,-1.0141121577750831,0.2503340429095144,0.7560156296323594 +2,-1.0191141299527218,-1.5870495901656396,0.6811842117478961,-0.25776671384531147 +3,-0.8235788683146266,1.1688759819188137,-0.15841036014621737,0.24351773490785233 +4,-2.028210212186099,-0.15000944869896093,0.7344551834722798,-1.0594635581726441 +5,0.8578345931586077,-1.0478958942647336,-0.5059960285526023,0.6141193812881873 +6,-0.7585068400011461,-0.45812334415522366,-0.6299981228519985,-0.072295788065162 +7,-0.34875455645064296,-0.49936600901639105,0.08492189470338947,0.24398792231786676 +8,-0.491473523786921,-1.1815449374689073,-0.23631388788457763,0.8801868915647684 +9,-1.291852196630842,0.4956544058017087,1.7176555991727498,1.8889309443940632 +10,-0.974327795079914,-0.6002779223325445,1.4950878953418667,-0.4750187681874636 +11,0.863708396863823,0.4867513929363103,-1.2500529683835453,2.1711592870838112 +12,-1.0518542498779602,-0.6800136223939168,-0.5593377295003794,-0.23451862458342732 +13,0.013421028872223972,-1.7652967848993042,0.302518679323854,1.124258888392337 +14,1.1387734213591119,-0.5602347718731282,-0.6908747870526222,0.905906598269778 +15,-1.8032949181114486,0.18858416406523845,1.0083249532267977,0.6969475009127225 +16,-0.42755813629599176,-1.0354063212247375,-0.24666198541039489,-1.2245102779938972 +17,-0.558268266895522,-1.4564784210249142,1.6162446783371565,-0.6109432350045504 +18,-0.9759505344957924,-2.780175134826593,3.039543722358096,-1.258487109407404 +19,-0.042261223623348665,0.7827311969447484,0.8902139085357877,0.33130889065513175 +20,-0.43764310886282315,-0.8338864816830261,0.8545198929035823,-0.8330242660029193 +21,0.2910454990578444,0.40786200750721635,-0.8115126892604917,0.7108997766944964 +22,0.41446462010439317,-1.0965365861313923,-0.1816041240266455,-0.18304466068648742 +23,-0.5187715545823834,-0.46490147833949275,-0.5059346590831783,0.6998562249774912 +24,2.4491154744839005,-0.3554192977203785,-0.6604902675826654,-0.9434392815439072 +25,-0.5083188860395834,0.2781724921583019,-0.4340136020292349,0.02629089617543565 
+26,-0.9854213292611846,-1.398313530263303,0.05552818415139104,-0.20282242071816114 +27,1.0808664341388348,-0.681501179909626,0.6492258431774035,-0.41774069067997716 +28,-1.1767497846165254,1.0817469159915034,-1.524089495721789,0.703812702135731 +29,0.19626402088297137,-1.731421126100085,0.33753714074823216,1.167207071332792 +30,-1.1808345594828473,-0.2820078693924212,-0.8720833031493173,0.8643708946275879 +31,0.8284163458216123,0.20722015645321426,0.29071068457985955,2.6180265991342315 +32,-0.08203655784081282,0.060308831720906446,0.9519485488783623,0.7350446746473065 +33,-0.9071581669506105,0.6088044300190749,1.0099718941738625,0.002799079788086574 +34,-0.42977850177492904,1.2900375327057412,0.32028642454115197,0.8301665482611077 +35,1.0852695299159272,-0.7040390830488096,0.7964627034904589,0.5892571532287761 +36,-1.5667114288837196,0.19932071915614016,-1.0037399027933205,0.5715977614420107 +37,1.3367378436097728,-0.4584285824179284,-0.4435084312392094,-1.3448283883056802 +38,-0.03788754387000687,-0.37288494267798383,-0.5643391975832854,0.43485956543590193 +39,1.0634390535750102,1.0881233131592658,1.2921865320956318,-0.07293734130819148 +40,1.6786504380461766,-0.03043290400609124,2.66472625811549,-0.016638240963738466 +41,-1.657581538683817,0.8240214695327108,0.32914391919723984,0.08007211199118686 +42,0.04171224685709963,-0.9854865121535178,-0.3195510216437891,-0.42540430453161987 +43,0.6506526831507736,-1.159358101323352,-1.2789107289085737,0.10499609768025624 +44,0.7402635450212406,-0.44202303578095753,-0.5748164371395315,0.5600113473434154 +45,-0.9809738202025933,0.16868168368656386,-1.5883259666916558,-2.6955712214488345 +46,-1.8495816486925372,-1.6954982682847552,1.387648046113911,0.8455399256972358 +47,1.0442607146494682,0.44438084748213075,-0.6547675875380801,-0.5557546828614935 +48,0.32905474147201974,-0.7323591467714324,0.8555098512789541,2.4647371835928196 +49,-2.5131333956577557,1.4005121469909907,-2.162216422615549,-0.3797761578463441 +50,-1.406182674849582,-0.33037485118390236,-0.30720520090625775,0.3765108318500068 +51,1.4315461764484496,0.4490657382715407,0.14688708820540236,-1.3363710028523919 +52,-1.3546100448551868,0.35309094153560083,1.1314974294901488,-0.8299500202669722 +53,-0.7668372422803534,1.3427856896905794,0.11144680945543838,0.5488627384438831 +54,2.6377507721791997,1.86851303077989,0.1358347611054535,0.0021631807468969044 +55,-0.2814604476092987,-0.8110890245372547,0.2914246407211869,1.3009776744589592 +56,-0.08220515064674763,0.06131679740379093,1.2240755323078314,1.6651435947789437 +57,-1.5833977101974248,-1.0390852809695386,0.9472604405151627,-1.1238493594739791 +58,0.060801913229076375,-1.1395369395697963,-0.6773504352336591,-0.7823844332031786 +59,0.3719151864023754,-2.6589573541115885,0.9138765623088898,1.9179285751965107 +60,0.16875501543121765,-0.21075290840365637,-0.15712808326461272,-1.095263810678978 +61,-0.6676220651512094,-2.392512574657398,-0.1970135407082481,1.1303688380560521 +62,-0.3508037371211798,0.37103055819752395,0.1319143246551687,-0.8442765717512588 +63,0.5744187610995448,0.2824163982139891,-0.23250484081352427,-0.009480528299369923 +64,-1.033847039653939,-0.6062251775571341,0.8745680740327043,0.10611431160660695 +65,0.8616095853453857,-0.7902852788672261,0.9924735544245377,-0.39017477285341734 +66,-0.25797403501959537,0.9776756368066659,-0.1774701795502288,0.8457628045096433 +67,0.1879011473947124,0.4880410431165719,0.33941695573743247,-0.3098695458944371 
+68,0.12908240475251795,-0.3929831705571321,-0.9815115481276334,-0.6596680503662373 +69,0.47271005121390686,-0.27585706457228726,0.659750762879994,-1.621655291178758 +70,1.2805576221047092,1.255690982276119,0.8893506172744224,0.36843763617254915 +71,-1.8222077377715007,-1.2618097663744718,-1.2393746501949852,0.22742537143827415 +72,-0.7670935921671362,0.6632357605887813,-1.8652052380554516,-0.3566398262186697 +73,0.368513682832951,0.22484190975093934,0.7207761550523548,-0.4607733151206031 +74,-1.6353304746550132,-1.0835890398703607,0.6240782484796151,1.497716990815385 +75,1.2631082191418077,1.9388688317848526,0.43069457351954177,-0.1240852286700612 +76,1.4229945541316606,1.685287372911636,0.282616738427184,1.6075806781661712 +77,0.15907038463344916,-1.1862747951875707,-2.162241163696355,0.9048269906929861 +78,0.8724544719304812,-0.06423147646568356,0.28403221059939265,0.7315950326908587 +79,-0.5099002924982818,0.8674753935115029,0.0015306969822590103,-0.793334121698815 +80,0.16756755106838742,-0.8374595440291756,1.871547652925694,-0.019948470822079158 +81,0.5333319586985659,-1.6076411272904392,0.4676478392958759,0.35245743045221734 +82,-0.5292514883314576,-1.2708056558247538,-1.7043012586370947,0.3391676901971921 +83,1.8042184317880245,1.2058943020996364,-2.3228385290614084,1.2008461670776127 +84,0.8671835774935015,0.9953640415286719,-1.4439272409362103,0.9410085688802767 +85,-0.118043369635042,0.41649838899300184,-1.2993225013700294,1.9232397286356342 +86,-0.32517525711392864,0.062481999278585824,-0.27679161049236684,0.06555334954413516 +87,-0.39336711632154264,0.0790516633124132,-0.600204351381406,1.321653482130525 +88,-0.9789171222367312,0.30688902979967303,0.10346158693798674,0.3160642853129814 +89,0.4332454768673768,-0.620828990252391,-1.0710192139922268,0.15027972939295933 +90,3.1092106995021096,0.354640404873306,1.8164064530643516,1.8911595405760606 +91,0.7027212216033006,-1.9367414347582559,-0.26797308254438235,1.1063820286927997 +92,0.6665636818250888,0.7953561614160027,1.8164132351496374,1.5760380002772454 +93,-1.4931006068027144,0.2680846074746922,-0.30697269318261355,-0.5300118028948997 +94,0.9258476710590248,0.15464742730214845,0.5847769923450901,-0.8405562302565793 +95,0.3015957125126854,2.9697978560379323,2.2793789547159338,0.13951152352691706 +96,0.4109127837045091,0.04501972229381512,0.5969781411176205,1.6443498245829686 +97,0.07956221270863263,0.009072464866011773,-0.6905847540574735,-0.9639714900867246 +98,2.9172401959670817,0.43571229891911717,-0.903738601954934,0.08343820441617454 +99,0.5501333973314503,-0.2511364474548299,1.4945524498890597,-1.1608586317841827 diff --git a/integrationtests/test_data/extroot/realworld_example/data/35/03_raw_data/001_dataset1/metadata.json b/integrationtests/test_data/extroot/realworld_example/data/35/03_raw_data/001_dataset1/metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..9b81cc094bf7d1c35154d8f092a96d5f5fae35c9 --- /dev/null +++ b/integrationtests/test_data/extroot/realworld_example/data/35/03_raw_data/001_dataset1/metadata.json @@ -0,0 +1,50 @@ +{ + "title": "Random numbers created on a random autumn day in a random person's office", + "abstract": "For demonstration purposes we created random numbers on a computer in an office of the CLOUD. 
This demonstration dataset is used in the DataCloud, a curated cloud storage for scientific data.", + "Event": [ + { + "longitude": 18.445078548041533, + "start_datetime": "2022-02-10T16:36:48+01:00", + "latitude": 53.10833068997861, + "elevation": 2, + "location": "Bremen, Germany" + } + ], + "license": "CC-BY", + "authors": [ + { + "firstname": "Max", + "lastname": "Schmitt", + "full_name": "Max Schmitt", + "affiliation": "CLOUD", + "ORCID": "0000-0001-6233-1866", + "email": "max.schmitt@email.de" + }, + { + "firstname": "Alexa", + "lastname": "Nozone", + "full_name": "Alexa Nozone", + "affiliation": "CLOUD", + "email": "alexa.nozone@email.de" + } + ], + "comment": "For questions about the DataCloud or this demonstration dataset, contact research-data@email.de", + "project": { + "name": "Demonstration of Extremly important Metadata in Folders", + "full_name": "Project", + "project_acronym": "DEMO", + "project_type": "national", + "institute": "CLOUD", + "start_date": "2021-10-01", + "end_date": "2031-12-31", + "url": "https://www.cloud.de/de/forschung-infrastruktur/forschungsdaten-services.html", + "coordinator": { + "firstname": "Max", + "lastname": "Schmitt", + "email": "max.schmitt@email.de" + } + }, + "method": { + "name": "Random Number Generator" + } +} diff --git a/integrationtests/test_data/extroot/realworld_example/data/35/03_raw_data/README_RawData.md b/integrationtests/test_data/extroot/realworld_example/data/35/03_raw_data/README_RawData.md new file mode 100644 index 0000000000000000000000000000000000000000..2317ff8616c43e75f52637ff581017bf4a50d468 --- /dev/null +++ b/integrationtests/test_data/extroot/realworld_example/data/35/03_raw_data/README_RawData.md @@ -0,0 +1,25 @@ +# Raw Data + +The `03_raw_data` folder is here to store all raw data of each dataset +associated with the project – the data that has not been edited by you yet but +which you plan to use in your research. It can be e.g. your unprocessed field +sampling records, or useful data from an online repository. Organize your data +in this folder in the following way: + +- Each dataset should reside inside a subfolder. It is recommended to number and name these folders clearly, e.g. `03_raw_data/001_precipitationgermany2017`. + +- **IMPORTANT**: provide the folder with information about your raw data by + filling out a metadata form for each of your datasets! For this, + + - either copy the `metadata-template.json` file and put it into your dataset + folder. Open the copy with a text editor and fill out the fields. + - or use the metadata editor in the DataCloud web client (press the "+" button + and use "New metadata.json" file) + + If you can’t find information about your data to fill in here, you should + reconsider using it - it is important to be able to trace your data sources to + ensure a FAIR scientific process! + +- For processing any of the data, make a copy of the dataset and paste it into + the `04_data_processing` folder. This way, you make sure to keep your raw data + in its original state.
\ No newline at end of file diff --git a/integrationtests/test_data/extroot/realworld_example/data/35/03_raw_data/metadata-template.json b/integrationtests/test_data/extroot/realworld_example/data/35/03_raw_data/metadata-template.json new file mode 100644 index 0000000000000000000000000000000000000000..7f457d239321b232fb2db7d46f4e1576c85911b0 --- /dev/null +++ b/integrationtests/test_data/extroot/realworld_example/data/35/03_raw_data/metadata-template.json @@ -0,0 +1,52 @@ +{ + "dataset": { + "title": "", + "abstract": "See https://github.com/CLOUD/metadata-schema for schema specification", + "license": "CC-BY", + "authors": [ + { + "firstname": "", + "lastname": "", + "affiliation": "", + "ORCID": "XXXX-XXXX-XXXX-XXXX", + "email": "name@domain.de" + }, + { + "firstname": "", + "lastname": "", + "affiliation": "", + "email": "name@domain.de", + "ORCID": "XXXX-XXXX-XXXX-XXXX" + } + ], + "project": { + "name": "", + "acronym": "", + "type": "DFG/", + "institute": "CLOUD", + "start_date": "YYYY-MM-DD", + "end_date": "YYYY-MM-DD", + "url": "", + "coordinator": { + "lastname": "", + "email": "", + "firstname": "" + } + }, + "events_in_data": false, + "events": [ + { + "longitude": 0, + "latitude": 0, + "elevation": 0, + "location": "", + "datetime": "YYYY-MM-DDTHH:mm:ss" + } + ], + "method": { + "name": "", + "url": "" + }, + "max_files": 100 + } +} diff --git a/integrationtests/test_data/extroot/realworld_example/data/35/04_data_processing/README_ProcessedData.md b/integrationtests/test_data/extroot/realworld_example/data/35/04_data_processing/README_ProcessedData.md new file mode 100644 index 0000000000000000000000000000000000000000..ce1b002b18772b85f4bba3a222574f438a6ed0e3 --- /dev/null +++ b/integrationtests/test_data/extroot/realworld_example/data/35/04_data_processing/README_ProcessedData.md @@ -0,0 +1,10 @@ +# Data Processing + +The actual work is done in this `04_data_processing` folder. Depending on your +field and the type and size of your project, you can organize this folder in the way that +fits your process best. Here, a bit of chaos can happen ;) Keep in mind to +document your processing steps in the `02_materials_and_methods` folder and to +put your final results into the `05_results` folder. At the end of your +project, it should be possible to delete everything in this folder and +reconstruct the working process using the documentation and raw data from +previous folders.
diff --git a/integrationtests/test_data/extroot/realworld_example/data/35/04_data_processing/metadata-template.json b/integrationtests/test_data/extroot/realworld_example/data/35/04_data_processing/metadata-template.json new file mode 100644 index 0000000000000000000000000000000000000000..05f9394dfbfa9a0b2b4844c7080a340585a9050f --- /dev/null +++ b/integrationtests/test_data/extroot/realworld_example/data/35/04_data_processing/metadata-template.json @@ -0,0 +1,52 @@ +{ + "dataset": { + "title": "", + "abstract": "See https://github.com/cloud/metadata-schema for schema specification", + "license": "CC-BY", + "authors": [ + { + "firstname": "", + "lastname": "", + "affiliation": "", + "ORCID": "XXXX-XXXX-XXXX-XXXX", + "email": "name@domain.de" + }, + { + "firstname": "", + "lastname": "", + "affiliation": "", + "email": "name@domain.de", + "ORCID": "XXXX-XXXX-XXXX-XXXX" + } + ], + "project": { + "name": "", + "acronym": "", + "type": "DFG/", + "institute": "CLOUD", + "start_date": "YYYY-MM-DD", + "end_date": "YYYY-MM-DD", + "url": "", + "coordinator": { + "lastname": "", + "email": "", + "firstname": "" + } + }, + "events_in_data": false, + "events": [ + { + "longitude": 0, + "latitude": 0, + "elevation": 0, + "location": "", + "datetime": "YYYY-MM-DDTHH:mm:ss" + } + ], + "method": { + "name": "", + "url": "" + }, + "max_files": 100 + } +} diff --git a/integrationtests/test_data/extroot/realworld_example/data/35/05_results/README_Results.md b/integrationtests/test_data/extroot/realworld_example/data/35/05_results/README_Results.md new file mode 100644 index 0000000000000000000000000000000000000000..ae0ab6571c52c0ec9a1cdc8aba27b31fd3be6fcc --- /dev/null +++ b/integrationtests/test_data/extroot/realworld_example/data/35/05_results/README_Results.md @@ -0,0 +1,7 @@ +# Results + +All the results that are final versions of your data analysis or processing, +should be copied into this `05_results` folder. Organize your results folder in +the way most fitting to your project. + +Provide metadata to your results files. diff --git a/integrationtests/test_data/extroot/realworld_example/data/35/README.md b/integrationtests/test_data/extroot/realworld_example/data/35/README.md new file mode 100644 index 0000000000000000000000000000000000000000..809d699c462d064ff5193add8e23677bec84b0e0 --- /dev/null +++ b/integrationtests/test_data/extroot/realworld_example/data/35/README.md @@ -0,0 +1,5 @@ +# Dataspace: DEMO + +This is a Dataspace in the CLOUD DataCloud providing safe, curated cloud storage +for all of CLOUD's research data. + diff --git a/integrationtests/test_data/extroot/realworld_example/dataset_cfoods.yml b/integrationtests/test_data/extroot/realworld_example/dataset_cfoods.yml new file mode 100644 index 0000000000000000000000000000000000000000..eaf2690ae130cb61c8a74452e3e4e1d4fd06846a --- /dev/null +++ b/integrationtests/test_data/extroot/realworld_example/dataset_cfoods.yml @@ -0,0 +1,535 @@ +# This file is a part of the CaosDB Project. +# +# Copyright (C) 2022 IndiScale GmbH <info@indiscale.com> +# Copyright (C) 2022 Florian Spreckelsen <f.spreckelsen@indiscale.com> +# +# This program is free software: you can redistribute it and/or modify it under +# the terms of the GNU Affero General Public License as published by the Free +# Software Foundation, either version 3 of the License, or (at your option) any +# later version. 
+# +# This program is distributed in the hope that it will be useful, but WITHOUT +# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +# FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more +# details. +# +# You should have received a copy of the GNU Affero General Public License along +# with this program. If not, see <https://www.gnu.org/licenses/>. +# +Data: + type: Directory + match: data + subtree: + dataspace_dir: + type: Directory + match: (?P<dataspace_dir_number>[0-9]+) + records: + Dataspace: + name: $dataspace_dir_number + subtree: + dataspace_json: + type: JSONFile + match: .dataspace.json + validate: schema/dataspace.schema.json + subtree: + dataspace_id_element: + type: DictIntegerElement + match_name: "dataspace_id" + match_value: "(?P<id>[0-9]+)" + records: + Dataspace: + dataspace_id: $id + archived_element: + type: DictBooleanElement + match_name: "archived" + match_value: "(?P<archived>.*)" + records: + Dataspace: + archived: $archived + url_element: + type: DictTextElement + match_name: "url" + match_value: "(?P<url>.*)" + records: + Dataspace: + url: $url + coordinator_element: + type: DictDictElement + match_name: "coordinator" + records: + Person: + parents: + - Person + Dataspace: + Person: $Person + subtree: &person_subtree + full_name_element: + type: DictTextElement + match_name: "full_name" + match_value: "(?P<full_name>.*)" + records: + Person: + full_name: $full_name + full_name_nonlatin_element: + type: DictTextElement + match_name: "full_name_nonlatin" + match_value: "(?P<full_name_nonlatin>.*)" + records: + Person: + full_name_nonlatin: $full_name_nonlatin + family_name_element: + type: DictTextElement + match_name: "family_name" + match_value: "(?P<family_name>.*)" + records: + Person: + family_name: $family_name + given_name_element: + type: DictTextElement + match_name: "given_name" + match_value: "(?P<given_name>.*)" + records: + Person: + given_name: $given_name + email_element: + type: DictTextElement + match_name: "email" + match_value: "(?P<email>.*)" + records: + Person: + email: $email + affiliation_element: + type: DictTextElement + match_name: "affiliation" + match_value: "(?P<affiliation>.*)" + records: + Person: + affiliation: $affiliation + ORCID_element: + type: DictTextElement + match_name: "ORCID" + match_value: "(?P<ORCID>.*)" + records: + Person: + ORCID: $ORCID + start_date_element: + type: DictTextElement + match_name: "start_date" + match_value: "(?P<start_date>.*)" + records: + Dataspace: + start_date: $start_date + end_date_element: + type: DictTextElement + match_name: "end_date" + match_value: "(?P<end_date>.*)" + records: + Dataspace: + end_date: $end_date + comment: + type: DictTextElement + match_name: "comment" + match_value: "(?P<comment>.*)" + records: + Dataspace: + comment: $comment + raw_data_dir: + type: Directory + match: 03_raw_data + subtree: &template + # TODO collect info from metadata.json and look into sub-directories + # (only one level) for metadata.json + dataset_dir: + match: (?P<dataset_dir_name>.*) + type: Directory + records: + Dataset: + Dataspace: $Dataspace + subtree: + metadata_json: &metadata_json_template + type: JSONFile + match: metadata.json + validate: schema/dataset.schema.json + subtree: + title_element: + type: DictTextElement + match_name: "title" + match_value: "(?P<title>.*)" + records: + Dataset: + title: $title + authors_element: + type: DictListElement + match_name: "authors" + subtree: + author_element: + type: Dict + records: + 
Person: + parents: + - Person + Dataset: + authors: +$Person + subtree: *person_subtree + abstract_element: + type: DictTextElement + match_name: "abstract" + match_value: "(?P<abstract>.*)" + records: + Dataset: + abstract: $abstract + comment_element: + type: DictTextElement + match_name: "comment" + match_value: "(?P<comment>.*)" + records: + Dataset: + comment: $comment + license_element: + type: DictTextElement + match_name: "license" + match_value: "(?P<license_name>.*)" + records: + license: + # TODO: As soon as such things can be validated, a + # creation of a new license has to be forbidden here + # (although this is effectively done already by + # validating against the above schema.) + name: $license_name + Dataset: + license: $license + dataset_doi_element: + type: DictTextElement + match_name: "dataset_doi" + match_value: "(?P<dataset_doi>.*)" + records: + Dataset: + dataset_doi: $dataset_doi + related_to_dois_element: + type: DictListElement + match_name: "related_to_dois" + subtree: + related_to_doi_element: + type: TextElement + match: "(?P<related_to_doi>).*" + records: + Dataset: + related_to_dois: +$related_to_doi + Keywords_element: + type: DictListElement + match_name: "Keyword" + Events_element: + type: DictListElement + match_name: "Event" + subtree: + Event_element: + type: Dict + records: + Event: + parents: + - Event + Dataset: + Event: +$Event + subtree: + label_element: + type: DictTextElement + match_name: "label" + match_value: "(?P<label>.*)" + records: + Event: + label: $label + comment_element: + type: DictTextElement + match_name: "comment" + match_value: "(?P<comment>.*)" + records: + Event: + comment: $comment + start_datetime_element: + type: DictTextElement + match_name: start_datetime + match_value: "(?P<start_datetime>.*)" + records: + Event: + start_datetime: $start_datetime + end_datetime_element: + type: DictTextElement + match_name: end_datetime + match_value: "(?P<end_datetime>.*)" + records: + Event: + end_datetime: $end_datetime + longitude_element: + type: DictFloatElement + match_name: "longitude" + match_value: "(?P<longitude>.*)" + records: + Event: + longitude: $longitude + latitude_element: + type: DictFloatElement + match_name: "latitude" + match_value: "(?P<latitude>.*)" + records: + Event: + latitude: $latitude + elevation_element: + type: DictFloatElement + match_name: "elevation" + match_value: "(?P<elevation>.*)" + records: + Event: + elevation: $elevation + location_element: + type: DictTextElement + match_name: location + match_value: "(?P<location>.*)" + records: + Event: + location: $location + igsn_element: + type: DictTextElement + match_name: igsn + match_value: "(?P<igsn>.*)" + records: + Event: + igsn: $igsn + events_in_data_element: + type: DictBooleanElement + match_name: "events_in_data" + match_value: "(?P<events_in_data>.*)" + records: + Dataset: + events_in_data: $events_in_data + geojson_element: + type: DictTextElement + match_name: "geojson" + match_value: "(?P<geojson>.*)" + records: + Dataset: + geojson: $geojson + project_element: + type: DictDictElement + match_name: "project" + records: + Project: + parents: + - Project + Dataset: + Project: $Project + subtree: + name_element: + type: DictTextElement + match_name: "name" + match_value: "(?P<name>.*)" + records: + Project: + name: $name + full_name_element: + type: DictTextElement + match_name: "full_name" + match_value: "(?P<full_name>.*)" + records: + Project: + full_name: $full_name + project_id_element: + type: DictTextElement + match_name: "project_id" + 
match_value: "(?P<project_id>.*)" + records: + Project: + project_id: $project_id + project_type_element: + type: DictTextElement + match_name: "project_type" + match_value: "(?P<project_type_name>.*)" + records: + project_type: + name: $project_type_name + Project: + project_type: $project_type + institute_element: + type: DictTextElement + match_name: "institute" + match_value: "(?P<institute>.*)" + records: + Project: + institute: $institute + start_date_element: + type: DictTextElement + match_name: "start_date" + match_value: "(?P<start_date>.*)" + records: + Project: + start_date: $start_date + end_date_element: + type: DictTextElement + match_name: "end_date" + match_value: "(?P<end_date>.*)" + records: + Project: + end_date: $end_date + url_element: + type: DictTextElement + match_name: "url" + match_value: "(?P<url>.*)" + records: + Project: + url: $url + coordinators_element: + type: DictListElement + match_name: "coordinators" + subtree: + coordinator_element: + type: Dict + records: + Person: + parents: + - Person + Project: + coordinators: +$Person + subtree: *person_subtree + campaign_element: + type: DictDictElement + match_name: "campaign" + records: + Campaign: + parents: + - Campaign + Dataset: + Campaign: $Campaign + subtree: + label_element: + type: DictTextElement + match_name: "label" + match_value: "(?P<label>.*)" + records: + Campaign: + label: $label + optional_label_element: + type: DictTextElement + match_name: "optional_label" + match_value: "(?P<optional_label>.*)" + records: + Campaign: + optional_label: $optional_label + start_date_element: + type: DictTextElement + match_name: "start_date" + match_value: "(?P<start_date>.*)" + records: + Campaign: + start_date: $start_date + end_date_element: + type: DictTextElement + match_name: "end_date" + match_value: "(?P<end_date>.*)" + records: + Campaign: + end_date: $end_date + responsible_scientists_element: + type: DictListElement + match_name: "responsible_scientists" + subtree: + responsible_scientist_element: + type: Dict + records: + Person: + parents: + - Person + Campaign: + responsible_scientists: +$Person + subtree: *person_subtree + Methods_element: + type: DictListElement + match_name: "Method" + subtree: + Method_element: + type: Dict + records: + Method: + parents: + - Method + Dataset: + Method: +$Method + subtree: + method_name_element: + type: DictTextElement + match_name: "method_name" + match_value: "(?P<method_name>.*)" + records: + Method: + name: $method_name + abbreviation_element: + type: DictTextElement + match_name: "abbreviation" + match_value: "(?P<abbreviation>.*)" + records: + Method: + abbreviation: $abbreviation + url_element: + type: DictTextElement + match_name: "url" + match_value: "(?P<url>.*)" + records: + Method: + url: $url + Taxa_element: + type: DictListElement + match_name: "Taxon" + subtree: + Taxon_element: + type: Dict + records: + Taxon: + parents: + - Taxon + Dataset: + Taxon: +$Taxon + subtree: + taxon_name_element: + type: DictTextElement + match_name: "taxon_name" + match_value: "(?P<taxon_name>.*)" + records: + Taxon: + name: $taxon_name + archived_element: + type: DictBooleanElement + match_name: "archived" + match_value: "(P<archived>.*)" + records: + Dataset: + archived: $archived + publication_date_element: + type: DictTextElement + match_name: "publication_date" + match_value: "(P<publication_date>.*)" + records: + Dataset: + publication_date: $publication_date + max_files_element: + type: DictIntegerElement + match_name: "max_files" + match_value: 
"(P<max_files>.*)" + records: + Dataset: + max_files: $max_files + auxiliary_file: &aux_file_template + type: File + match: "(?P<aux_file_name>(?!metadata.json).*)" + # TODO File, path and reference dataset in file record + child_dataset_dir: + type: Directory + match: (?P<child_dataset_dir_name>.*) + subtree: + metadata_json: *metadata_json_template + auxiliary_file: *aux_file_template + data_processing_dir: + type: Directory + match: 04_data_processing + subtree: *template + results_dir: + type: Directory + match: 05_results + subtree: *template diff --git a/integrationtests/test_data/extroot/realworld_example/identifiables.yml b/integrationtests/test_data/extroot/realworld_example/identifiables.yml new file mode 100644 index 0000000000000000000000000000000000000000..0ea0265ecfec05392c599457d81339bc91ba18d0 --- /dev/null +++ b/integrationtests/test_data/extroot/realworld_example/identifiables.yml @@ -0,0 +1,22 @@ +license: + - name +project_type: + - name +Keyword: + - name +Taxon: + - name +Person: + - email + # - full_name +Dataset: + - title + # - DOI +Event: + - longitude + - latitude + - start_datetime +Dataspace: + - dataspace_id +Project: + - name diff --git a/integrationtests/test_data/extroot/realworld_example/pycaosdb.ini b/integrationtests/test_data/extroot/realworld_example/pycaosdb.ini new file mode 120000 index 0000000000000000000000000000000000000000..bc443439d842f18ce05e002e5f6b95d37ca22747 --- /dev/null +++ b/integrationtests/test_data/extroot/realworld_example/pycaosdb.ini @@ -0,0 +1 @@ +../../../pycaosdb.ini \ No newline at end of file diff --git a/integrationtests/test_data/extroot/realworld_example/schema/README.md b/integrationtests/test_data/extroot/realworld_example/schema/README.md new file mode 100644 index 0000000000000000000000000000000000000000..e0bb95f8b844374bba72c7c6989ac57cfa5fc305 --- /dev/null +++ b/integrationtests/test_data/extroot/realworld_example/schema/README.md @@ -0,0 +1,37 @@ +# Dataset Schemas + +These schema's are derived from the [metadata +schemas](https://github.com/leibniz-zmt/zmt-metadata-schema) used at the Leibniz +Center for Tropical Marine Research (Leibniz ZMT). + +# Copyright + +BSD 3-Clause License + +Copyright (c) 2022 ZMT +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +* Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + +* Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +* Neither the name of the copyright holder nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/integrationtests/test_data/extroot/realworld_example/schema/dataset-inheritance.yml b/integrationtests/test_data/extroot/realworld_example/schema/dataset-inheritance.yml new file mode 100644 index 0000000000000000000000000000000000000000..3d12053a0007cdea1005e7673db69f46b35a063d --- /dev/null +++ b/integrationtests/test_data/extroot/realworld_example/schema/dataset-inheritance.yml @@ -0,0 +1,18 @@ +extern: +- Keyword +- Taxon +- full_name +- full_name_nonlatin +- name + +full_name: + inherit_from_obligatory: + - name + +full_name_nonlatin: + inherit_from_obligatory: + - name + +Taxon: + inherit_from_obligatory: + - Keyword diff --git a/integrationtests/test_data/extroot/realworld_example/schema/dataset.schema.json b/integrationtests/test_data/extroot/realworld_example/schema/dataset.schema.json new file mode 100644 index 0000000000000000000000000000000000000000..83d6a60d857349772c960af637671cb21c8abd5d --- /dev/null +++ b/integrationtests/test_data/extroot/realworld_example/schema/dataset.schema.json @@ -0,0 +1,365 @@ +{ + "title": "Dataset", + "description": "", + "type": "object", + "properties": { + "title": { + "type": "string", + "description": "full dataset title" + }, + "authors": { + "type": "array", + "items": { + "type": "object", + "title": "Person", + "properties": { + "full_name": { + "type": "string", + "description": "Full name (latin transcription, all UFT-8 characters allowed)" + }, + "full_name_nonlatin": { + "type": "string", + "description": "Full name (non-latin alphabet)" + }, + "family_name": { + "type": "string", + "description": "Family name (latin transcription)" + }, + "given_name": { + "type": "string", + "description": "Given/other names (latin transcription)" + }, + "affiliation": { + "type": "string" + }, + "ORCID": { + "type": "string", + "description": "ORCID identifier as 16-digit number, e.g. 0000-0001-6233-1866", + "pattern": "^\\d{4}-\\d{4}-\\d{4}-\\d{4}$" + }, + "email": { + "type": "string", + "format": "email" + } + }, + "required": [ + "full_name", + "email" + ] + }, + "minItems": 1, + "uniqueItems": true + }, + "abstract": { + "type": "string", + "minLength": 80, + "maxLength": 1000, + "description": "Abstract with at least 80 characters" + }, + "comment": { + "type": "string" + }, + "license": { + "type": "string", + "enum": [ + "CC-BY", + "CC-BY-SA", + "CC0", + "restricted access" + ] + }, + "dataset_doi": { + "type": "string", + "pattern": "(10[.][0-9]{4,}(?:[.][0-9]+)*/(?:(?![%\"#? ])\\S)+)", + "description": "Dataset DOI, e.g. 10.1594/PANGAEA.938740" + }, + "related_to_dois": { + "type": "array", + "items": { + "type": "string", + "pattern": "(10[.][0-9]{4,}(?:[.][0-9]+)*/(?:(?![%\"#? ])\\S)+)" + }, + "description": "DOIs of related publications and/or datasets, e.g. 
10.1000/182" + }, + "Keyword": { + "type": "array", + "items": { + "type": "object", + "properties": { + "name": { + "type": "string" + } + } + } + }, + "Event": { + "type": "array", + "description": "https://wiki.pangaea.de/wiki/Event", + "items": { + "type": "object", + "properties": { + "label": { + "type": "string" + }, + "comment": { + "type": "string" + }, + "start_datetime": { + "type": "string", + "format": "date-time" + }, + "end_datetime": { + "type": "string", + "format": "date-time" + }, + "longitude": { + "type": "number", + "minimum": -180, + "maximum": 180, + "description": "longitude (W/E) in decimal degree (-180 to 180)" + }, + "latitude": { + "type": "number", + "minimum": -90, + "maximum": 90, + "description": "latitude (N/S) in decimal degree (-90 to 90)" + }, + "elevation": { + "type": "number", + "minimum": -10000, + "maximum": 20000, + "description": "elevation in m" + }, + "location": { + "type": "string", + "description": "geographical location as text (e.g., North Sea; Espoo, Finland)" + }, + "igsn": { + "type": "string", + "description": "International Geo Sample Number (http://www.geosamples.org/aboutigsn)" + } + }, + "required": [ + "longitude", + "latitude", + "start_datetime" + ] + } + }, + "events_in_data": { + "type": "boolean", + "description": "Does the data contain additional information about timepoints and locations?" + }, + "geojson": { + "type": "string", + "pattern": "", + "description": "GeoJSON for complex geographic structures" + }, + "project": { + "title": "Project", + "description": "https://wiki.pangaea.de/wiki/Project", + "type": "object", + "properties": { + "name": { + "type": "string", + "description": "short name of project" + }, + "full_name": { + "type": "string", + "description": "Full name (latin transcription, all UTF-8 characters allowed)" + }, + "project_id": { + "type": "string", + "description": "Project ID" + }, + "project_type": { + "type": "string", + "enum": [ + "DFG", + "EU", + "BMBF", + "national", + "international" + ] + }, + "institute": { + "type": "string", + "description": "place of coordination or project office", + "default": "Centre for Research" + }, + "start_date": { + "type": "string", + "format": "date" + }, + "end_date": { + "type": "string", + "format": "date" + }, + "url": { + "type": "string", + "format": "uri" + }, + "coordinators": { + "type": "array", + "items": { + "type": "object", + "title": "Person", + "properties": { + "full_name": { + "type": "string", + "description": "Full name (latin transcription, all UTF-8 characters allowed)" + }, + "full_name_nonlatin": { + "type": "string", + "description": "Full name (non-latin alphabet)" + }, + "family_name": { + "type": "string", + "description": "Family name (latin transcription)" + }, + "given_name": { + "type": "string", + "description": "Given/other names (latin transcription)" + }, + "affiliation": { + "type": "string" + }, + "ORCID": { + "type": "string", + "description": "ORCID identifier as 16-digit number, e.g. 
0000-0001-6233-1866", + "pattern": "^\\d{4}-\\d{4}-\\d{4}-\\d{4}$" + }, + "email": { + "type": "string", + "format": "email" + } + }, + "required": [ + "full_name", + "email" + ] + }, + "minItems": 1, + "uniqueItems": true + } + }, + "required": ["name", "full_name"] + }, + "campaign": { + "title": "Campaign", + "description": "https://wiki.pangaea.de/wiki/Campaign, synonyms: cruise, expedition, leg, ", + "type": "object", + "properties": { + "label": { + "type": "string", + "description": "is unique and does not contain blanks; uses abbreviations instead of full names" + }, + "optional_label": { + "type": "string" + }, + "start_date": { + "type": "string", + "format": "date" + }, + "end_date": { + "type": "string", + "format": "date" + }, + "responsible_scientists": { + "type": "array", + "items": { + "type": "object", + "title": "Person", + "properties": { + "full_name": { + "type": "string", + "description": "Full name (latin transcription, all UFT-8 characters allowed)" + }, + "full_name_nonlatin": { + "type": "string", + "description": "Full name (non-latin alphabet)" + }, + "family_name": { + "type": "string", + "description": "Family name (latin transcription)" + }, + "given_name": { + "type": "string", + "description": "Given/other names (latin transcription)" + }, + "affiliation": { + "type": "string" + }, + "ORCID": { + "type": "string", + "description": "ORCID identifier as 16-digit number, e.g. 0000-0001-6233-1866", + "pattern": "^\\d{4}-\\d{4}-\\d{4}-\\d{4}$" + }, + "email": { + "type": "string", + "format": "email" + } + }, + "required": [ + "full_name", + "email" + ] + }, + "minItems": 1, + "uniqueItems": true + } + } + }, + "Method": { + "type": "array", + "items": { + "type": "object", + "description": "https://wiki.pangaea.de/wiki/Method", + "properties": { + "method_name": { + "type": "string", + "description": "full official name of tool/instrument/device/gear" + }, + "abbreviation": { + "type": "string", + "description": "may be used for import in an event list to avoid misspellings" + }, + "url": { + "type": "string", + "description": "should contain a web address, where an official description of the device can be found" + } + } + } + }, + "Taxon": { + "type": "array", + "items": { + "type": "object", + "properties": { + "name": { + "type": "string" + } + } + } + }, + "archived": { + "type": "boolean", + "description": "Has the dataset been archived?", + "default": false + }, + "publication_date": { + "type": "string", + "format": "date" + }, + "max_files": { + "type": "integer", + "description": "Maximum number of files to included by the CaosDB crawler", + "default": 100 + } + }, + "required": [ + "title", + "authors", + "abstract" + ] +} diff --git a/integrationtests/test_data/extroot/realworld_example/schema/dataspace.schema.json b/integrationtests/test_data/extroot/realworld_example/schema/dataspace.schema.json new file mode 100644 index 0000000000000000000000000000000000000000..01653bfa821e0a0acbb5a481bfd458e2ed784fb9 --- /dev/null +++ b/integrationtests/test_data/extroot/realworld_example/schema/dataspace.schema.json @@ -0,0 +1,45 @@ +{ + "title": "Dataspace", + "description": "A Dataspace is a folder in the DataCloud with a pre-defined structure", + "type": "object", + "properties": { + "dataspace_id": { + "type": "integer", + "description": "Integer ID of Dataspace (matches LDAP GID)", + "minimum": 20000 + }, + "archived": { "type": "boolean" }, + "url": { + "type": "string", + "description": "link to folder on file system (CaosDB or cloud folder)" + }, + 
"coordinator": { + "type": "object", + "title": "Person", + "properties": { + "full_name": { + "type": "string", + "description": "Full name (latin transcription, all UFT-8 characters allowed)" + }, + "full_name_nonlatin": { + "type": "string", + "description": "Full name (non-latin alphabet)" + }, + "family_name": { + "type": "string", + "description": "Family name (latin transcription)" + }, + "given_name": { + "type": "string", + "description": "Given/other names (latin transcription)" + }, + "email": { "type": "string", "format": "email" } + }, + "required": ["full_name", "email"] + }, + "start_date": { "type": "string", "format": "date" }, + "end_date": { "type": "string", "format": "date" }, + "comment": { "type": "string" } + }, + "required": ["dataspace_id", "url", "coordinator"] +} diff --git a/integrationtests/test_data/extroot/realworld_example/schema/organisation.yml b/integrationtests/test_data/extroot/realworld_example/schema/organisation.yml new file mode 100644 index 0000000000000000000000000000000000000000..7e251eeced7bf626e77364fc5555b1cb10dd3afb --- /dev/null +++ b/integrationtests/test_data/extroot/realworld_example/schema/organisation.yml @@ -0,0 +1,26 @@ +extern: +- name +- url +- Dataset + +german_name: + datatype: TEXT + inherit_from_obligatory: + - name + +Department: + recommended_properties: + url: + german_name: + + +WorkingGroup: + recommended_properties: + Department: + german_name: + url: + +Dataset: + recommended_properties: + WorkingGroup: + diff --git a/integrationtests/test_data/extroot/use_case_simple_presentation/DataAnalysis/results.md b/integrationtests/test_data/extroot/use_case_simple_presentation/DataAnalysis/results.md new file mode 100644 index 0000000000000000000000000000000000000000..b867d778942ce5595286870bd6a92e53015be0e8 --- /dev/null +++ b/integrationtests/test_data/extroot/use_case_simple_presentation/DataAnalysis/results.md @@ -0,0 +1,8 @@ +--- +identifier: test analysis +date: 2022-03-16 +source_identifier: crawlertest +source_date: 2022-03-16 + +frequency: 17 +--- diff --git a/integrationtests/test_data/extroot/use_case_simple_presentation/ExperimentalData/data.md b/integrationtests/test_data/extroot/use_case_simple_presentation/ExperimentalData/data.md new file mode 100644 index 0000000000000000000000000000000000000000..60dcd78ed1f70428b18e8762a14dc3fe7f3fa5cd --- /dev/null +++ b/integrationtests/test_data/extroot/use_case_simple_presentation/ExperimentalData/data.md @@ -0,0 +1,5 @@ +--- +date: "2022-03-16" +identifier: crawlertest +alpha: 16 +--- diff --git a/integrationtests/test_data/extroot/use_case_simple_presentation/cfood.yml b/integrationtests/test_data/extroot/use_case_simple_presentation/cfood.yml new file mode 100644 index 0000000000000000000000000000000000000000..6495e1828dc56e99459c162f7751951f880ea55c --- /dev/null +++ b/integrationtests/test_data/extroot/use_case_simple_presentation/cfood.yml @@ -0,0 +1,117 @@ +# This is only a scifolder test cfood with a limited functionality. 
+# The full scifolder cfood will be developed here: +# https://gitlab.indiscale.com/caosdb/src/crawler-cfoods/scifolder-cfood + +Definitions: + type: Definitions + #include "description.yml" + +Converters: {} + +extroot: + type: Directory + match: use_case_simple_presentation + subtree: + ExperimentalData: # name of the converter + type: Directory + match: ExperimentalData + subtree: + DataFile: + type: MarkdownFile + match: ^data\.md$ + + records: + mdfile: + parents: + - mdfile + role: File + path: $DataFile + file: $DataFile + + Experiment: + mdfile: $mdfile + + + subtree: + date: + type: DictTextElement + match_name: date + match_value: (?P<date>.+) + records: + Experiment: + date: $date + identifier: + type: DictTextElement + match_name: identifier + match_value: (?P<identifier>.+) + records: + Experiment: + identifier: $identifier + parameter_alpha: + type: DictTextElement + match_name: alpha + match_value: (?P<alpha>[0-9]+) + records: + Experiment: + alpha: $alpha + + DataAnalysis: + type: Directory + match: DataAnalysis + subtree: + DataFile: + type: MarkdownFile + match: ^results\.md$ + + records: + mdfile: + parents: + - mdfile + role: File + path: $DataFile + file: $DataFile + + Experiment: {} + + DataAnalysis: + mdfile: $mdfile + sources: +$Experiment + + subtree: + date: + type: DictTextElement + match_name: date + match_value: (?P<date>.+) + records: + DataAnalysis: + date: $date + identifier: + type: DictTextElement + match_name: identifier + match_value: (?P<identifier>.+) + records: + DataAnalysis: + identifier: $identifier + + frequency: + type: DictTextElement + match_name: frequency + match_value: (?P<frequency>[0-9]+) + records: + DataAnalysis: + frequency: $frequency + + source_date: + type: DictTextElement + match_name: source_date + match_value: (?P<source_date>.+) + records: + Experiment: + date: $source_date + source_identifier: + type: DictTextElement + match_name: source_identifier + match_value: (?P<source_identifier>.+) + records: + Experiment: + identifier: $source_identifier diff --git a/integrationtests/test_data/extroot/use_case_simple_presentation/identifiables.yml b/integrationtests/test_data/extroot/use_case_simple_presentation/identifiables.yml new file mode 100644 index 0000000000000000000000000000000000000000..94b593bfb4c425ce71a4f94504d4f0033538cacb --- /dev/null +++ b/integrationtests/test_data/extroot/use_case_simple_presentation/identifiables.yml @@ -0,0 +1,6 @@ +Experiment: +- date +- identifier +DataAnalysis: +- date +- identifier diff --git a/integrationtests/test_data/extroot/use_case_simple_presentation/model.yml b/integrationtests/test_data/extroot/use_case_simple_presentation/model.yml new file mode 100644 index 0000000000000000000000000000000000000000..bcf041c9586841ef9c61b9aef62574985c2be471 --- /dev/null +++ b/integrationtests/test_data/extroot/use_case_simple_presentation/model.yml @@ -0,0 +1,41 @@ + + + +ScientificActivity: + description: | + The base record type for all scientific activities, like experiments, + data analysis records, simulations or publications. + recommended_properties: + sources: + description: This scientific activity is based on the activity referenced here. + datatype: LIST<ScientificActivity> + date: + description: The date according to https://doi.org/10.3390/data5020043 + datatype: DATETIME + identifier: + description: An identifier according to https://doi.org/10.3390/data5020043 + datatype: TEXT + mdfile: + description: The file storing information about this record. 
+ +Experiment: + description: | + The base record type for all records containing data from experiments. + inherit_from_obligatory: + - ScientificActivity + obligatory_properties: + alpha: + description: A fictitious piece of data. + datatype: DOUBLE + unit: km + +DataAnalysis: + description: | + The base record type for all records containing results from data analysis. + inherit_from_obligatory: + - ScientificActivity + recommended_properties: + frequency: + description: A fictitious piece of data. + datatype: DOUBLE + unit: Hz diff --git a/integrationtests/test_realworld_example.py b/integrationtests/test_realworld_example.py new file mode 100644 index 0000000000000000000000000000000000000000..da3fb69ce635ae69cd33cbf01de9df8ebf019661 --- /dev/null +++ b/integrationtests/test_realworld_example.py @@ -0,0 +1,210 @@ +#!/usr/bin/env python3 +# encoding: utf-8 +# +# This file is a part of the CaosDB Project. +# +# Copyright (C) 2022 Indiscale GmbH <info@indiscale.com> +# Copyright (C) 2022 Henrik tom Wörden <h.tomwoerden@indiscale.com> +# Copyright (C) 2022 Florian Spreckelsen <f.spreckelsen@indiscale.com> +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <https://www.gnu.org/licenses/>. +# + +""" +Integration tests for the crawler using the realworld example data model and data. +""" +import json +import os + +import caosdb as db + +from caoscrawler.crawl import Crawler, crawler_main +from caoscrawler.converters import JSONFileConverter, DictConverter +from caoscrawler.identifiable_adapters import CaosDBIdentifiableAdapter +from caoscrawler.structure_elements import File, JSONFile, Directory +import pytest +from caosadvancedtools.models.parser import parse_model_from_json_schema, parse_model_from_yaml + +#from caosadvancedtools.testutils import clear_database, set_test_key +import sys + +# TODO is not yet merged in caosadvancedtools +# set_test_key("10b128cf8a1372f30aa3697466bb55e76974e0c16a599bb44ace88f19c8f61e2") + + +def rfp(*pathcomponents): + """ + Return full path. + Shorthand convenience function. + """ + return os.path.join(os.path.dirname(__file__), *pathcomponents) + + +DATADIR = rfp("test_data", "extroot", "realworld_example") + + +@pytest.fixture +def usemodel(): + # First load dataspace data model + dataspace_definitions = parse_model_from_json_schema( + os.path.join(DATADIR, "schema", "dataspace.schema.json")) + dataspace_definitions.sync_data_model(noquestion=True) + + # Then general dataset definitions + dataset_definitions = parse_model_from_json_schema( + os.path.join(DATADIR, "schema", "dataset.schema.json")) + dataset_definitions.sync_data_model(noquestion=True) + + # Finally, add inheritances as defined in yaml + dataset_inherits = parse_model_from_yaml( + os.path.join(DATADIR, "schema", "dataset-inheritance.yml")) + dataset_inherits.sync_data_model(noquestion=True) + + +@pytest.fixture +def clear_database(): + # TODO(fspreck): Remove once the corresponding advancedtools function can + # be used.
+ ents = db.execute_query("FIND ENTITY WITH ID>99") + if ents: + ents.delete() + + +def create_identifiable_adapter(): + ident = CaosDBIdentifiableAdapter() + ident.register_identifiable("license", ( + db.RecordType() + .add_parent("license") + .add_property("name"))) + ident.register_identifiable("project_type", ( + db.RecordType() + .add_parent("project_type") + .add_property("name"))) + ident.register_identifiable("Person", ( + db.RecordType() + .add_parent("Person") + .add_property("full_name"))) + + return ident + + +def test_dataset(clear_database, usemodel): + ident = create_identifiable_adapter() + crawler = Crawler(identifiableAdapter=ident) + crawler_definition = crawler.load_definition( + os.path.join(DATADIR, "dataset_cfoods.yml")) + # print(json.dumps(crawler_definition, indent=3)) + # Load and register converter packages: + converter_registry = crawler.load_converters(crawler_definition) + # print("DictIntegerElement" in converter_registry) + + records = crawler.start_crawling( + Directory("data", os.path.join(DATADIR, 'data')), + crawler_definition, + converter_registry + ) + crawler.synchronize() + + dataspace = db.execute_query("FIND RECORD Dataspace WITH name=35 AND dataspace_id=20002 AND " + "archived=FALSE AND url='https://datacloud.de/index.php/f/7679'" + " AND Person", unique=True) + assert dataspace.get_property("start_date").value == "2022-03-01" + db.execute_query("FIND RECORD Person with full_name='Max Schmitt' AND" + " given_name='Max'", unique=True) + + dataset = db.execute_query(f"FIND RECORD Dataset with Dataspace={dataspace.id} AND title=" + "'Random numbers created on a random autumn day in a random person\\'s office'" + "", unique=True) + assert db.execute_query(f"COUNT RECORD with id={dataset.id} AND WHICH REFERENCES Person WITH full_name=" + "'Alexa Nozone' AND WHICH REFERENCES Person WITH full_name='Max Schmitt'" + "") == 1 + assert db.execute_query(f"COUNT RECORD with id={dataset.id} AND WHICH REFERENCES Event WITH " + "start_datetime='2022-02-10T16:36:48+01:00'") == 1 + + +def test_event_update(clear_database, usemodel): + + identifiable_path = os.path.join(DATADIR, "identifiables.yml") + crawler_definition_path = os.path.join(DATADIR, "dataset_cfoods.yml") + + # TODO(fspreck): Use crawler_main + crawler_main( + os.path.join(DATADIR, 'data'), + crawler_definition_path, + identifiable_path, + True, + os.path.join(DATADIR, "provenance.yml"), + False, + "" + ) + + old_dataset_rec = db.execute_query( + "FIND RECORD Dataset WHICH HAS AN EVENT WITH location='Bremen, Germany'") + assert len(old_dataset_rec) == 1 + old_dataset_rec = old_dataset_rec[0] + assert old_dataset_rec.get_property("Event").datatype == db.LIST("Event") + assert len(old_dataset_rec.get_property("Event").value) == 1 + old_event_rec = db.Record( + id=old_dataset_rec.get_property("Event").value[0]).retrieve() + + # TODO(fspreck): crawl again manually, edit the event records in the update + # list, synchronize, and test whether the events have been updated. 
+ ident = CaosDBIdentifiableAdapter() + ident.load_from_yaml_definition(identifiable_path) + + second_crawler = Crawler(identifiableAdapter=ident) + crawler_definition = second_crawler.load_definition( + crawler_definition_path) + converter_registry = second_crawler.load_converters(crawler_definition) + records = second_crawler.start_crawling( + Directory("data", os.path.join(DATADIR, "data")), + crawler_definition, + converter_registry + ) + + for rec in records: + if rec.parents[0].name == "Event": + rec.get_property("longitude").value = 0.0 + rec.get_property("latitude").value = 0.0 + rec.get_property("location").value = "Origin" + elif rec.parents[0].name == "Dataset": + rec.get_property("Event").value[0].get_property( + "longitude").value = 0.0 + rec.get_property("Event").value[0].get_property( + "latitude").value = 0.0 + rec.get_property("Event").value[0].get_property( + "location").value = "Origin" + second_crawler.synchronize() + + # Dataset is still the same Record, but with an updated event + new_dataset_rec = db.Record(id=old_dataset_rec.id).retrieve() + for prop in old_dataset_rec.get_properties(): + if not prop.name == "Event": + assert new_dataset_rec.get_property( + prop.name).datatype == prop.datatype + assert new_dataset_rec.get_property( + prop.name).value == prop.value + assert new_dataset_rec.get_property("Event").datatype == db.LIST("Event") + assert new_dataset_rec.get_property("Event").value is not None + assert len(new_dataset_rec.get_property("Event").value) == 1 + assert new_dataset_rec.get_property("Event").value[0] != old_event_rec.id + + # The event has new properties + new_event_rec = db.Record( + id=new_dataset_rec.get_property("Event").value[0]).retrieve() + assert new_event_rec.get_property("longitude").value == 0.0 + assert new_event_rec.get_property("latitude").value == 0.0 + assert new_event_rec.get_property("location").value == "Origin" + assert new_event_rec.get_property( + "start_datetime").value == old_event_rec.get_property("start_datetime").value diff --git a/integrationtests/test_use_case_simple_presentation.py b/integrationtests/test_use_case_simple_presentation.py new file mode 100644 index 0000000000000000000000000000000000000000..1d611ba4b002aa4c5b31f6f6a2862985c9d4298f --- /dev/null +++ b/integrationtests/test_use_case_simple_presentation.py @@ -0,0 +1,99 @@ +#!/usr/bin/env python3 +# encoding: utf-8 +# +# ** header v3.0 +# This file is a part of the CaosDB Project. +# +# Copyright (C) 2022 Alexander Schlemmer +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <https://www.gnu.org/licenses/>. 
+# +# ** end header +# + +""" +module description +""" +import os +import pytest +from subprocess import run + +import caosdb as db +from caosadvancedtools.loadFiles import loadpath +from caosadvancedtools.models import parser as parser +from caoscrawler.crawl import crawler_main + +# TODO: wait for release of this feature in pylib +# from caosdb.utils.register_tests import clear_database, set_test_key +# set_test_key("10b128cf8a1372f30aa3697466bb55e76974e0c16a599bb44ace88f19c8f61e2") + +DATADIR = os.path.join(os.path.dirname(__file__), "test_data", + "extroot", "use_case_simple_presentation") + +# TODO: remove this + + +@pytest.fixture +def clear_database(): + # TODO(fspreck): Remove once the corresponding advancedtools function can be + # used. + ents = db.execute_query("FIND ENTITY WITH ID>99") + if ents: + ents.delete() + + +def test_complete_crawler( + clear_database +): + # Setup the data model: + model = parser.parse_model_from_yaml(os.path.join(DATADIR, "model.yml")) + model.sync_data_model(noquestion=True, verbose=False) + + # Insert the data: + for path in [ + "/opt/caosdb/mnt/extroot/use_case_simple_presentation/ExperimentalData", + "/opt/caosdb/mnt/extroot/use_case_simple_presentation/DataAnalysis"]: + loadpath( + path=path, + include=None, + exclude=None, + prefix="/", + dryrun=False, + forceAllowSymlinks=False) + + crawler_main(DATADIR, + os.path.join(DATADIR, "cfood.yml"), + os.path.join(DATADIR, "identifiables.yml"), + True, + os.path.join(DATADIR, "provenance.yml"), + False, + "/use_case_simple_presentation") + + res = db.execute_query("FIND Record Experiment") + assert len(res) == 1 + assert res[0].get_property("identifier").value == "crawlertest" + assert res[0].get_property("date").value == "2022-03-16" + + lf = db.File(id=res[0].get_property("mdfile").value).retrieve() + assert lf.path == "/ExperimentalData/data.md" + + assert res[0].get_property("alpha").value == 16.0 + assert res[0].get_property("alpha").unit == "km" + + res_da = db.execute_query("FIND Record DataAnalysis") + assert len(res_da) == 1 + assert res_da[0].get_property("sources").value[0] == res[0].id + + lf = db.File(id=res_da[0].get_property("mdfile").value).retrieve() + assert lf.path == "/DataAnalysis/results.md" diff --git a/pyproject.toml b/pyproject.toml deleted file mode 100644 index 501dea852f18a271e49245342111f1c1c8e3fdd0..0000000000000000000000000000000000000000 --- a/pyproject.toml +++ /dev/null @@ -1,7 +0,0 @@ -[build-system] -requires = [ - "setuptools>=42", - "wheel" -] -build-backend = "setuptools.build_meta" - \ No newline at end of file diff --git a/pytest.ini b/pytest.ini new file mode 100644 index 0000000000000000000000000000000000000000..f818888e98690a861228b1f3c0214b1cc94fb6e1 --- /dev/null +++ b/pytest.ini @@ -0,0 +1,2 @@ +[pytest] +testpaths=unittests diff --git a/release.sh b/release.sh new file mode 100644 index 0000000000000000000000000000000000000000..1af097f014de6cd9eb3d3e8ba5da34aea0fe1671 --- /dev/null +++ b/release.sh @@ -0,0 +1,4 @@ +#!/bin/bash +rm -rf dist/ build/ .eggs/ +python setup.py sdist bdist_wheel +python -m twine upload -s dist/* diff --git a/setup.cfg b/setup.cfg index 88826f71b1563492d6f9780295d04b1c402b5550..05e278a5644cbfa84ae37fc8bbd0aa7e4c5232e5 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,6 +1,6 @@ [metadata] -name = newcrawler -version = 0.1 +name = caoscrawler +version = 0.1.0 author = Alexander Schlemmer author_email = alexander.schlemmer@ds.mpg.de description = A new crawler for caosdb @@ -17,7 +17,24 @@ classifiers = package_dir = = src packages = 
find: -python_requires = >=3.6 +python_requires = >=3.8 +install_requires = + importlib-resources + caosdb + caosadvancedtools >= 0.6.0 + yaml-header-tools >= 0.2.1 + pyyaml + odfpy #make optional + pandas [options.packages.find] -where = src \ No newline at end of file +where = src +[options.package_data] +* = *.yml + +[flake8] +per-file-ignores = __init__.py:F401 + +[options.entry_points] +console_scripts = + caosdb-crawler = caoscrawler.crawl:main diff --git a/src/caoscrawler/__init__.py b/src/caoscrawler/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..b65b9fd9d24b9519a52ca13d07e46c9d8f791a73 --- /dev/null +++ b/src/caoscrawler/__init__.py @@ -0,0 +1 @@ +from .crawl import Crawler, SecurityMode diff --git a/src/caoscrawler/authorize.py b/src/caoscrawler/authorize.py new file mode 100644 index 0000000000000000000000000000000000000000..6f1011b227881d4b73186996076abe20d94d52e5 --- /dev/null +++ b/src/caoscrawler/authorize.py @@ -0,0 +1,38 @@ +#!/usr/bin/env python3 +# encoding: utf-8 +# +# This file is a part of the CaosDB Project. +# +# Copyright (C) 2022 Henrik tom Wörden +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <https://www.gnu.org/licenses/>. +# + +from caosadvancedtools.crawler import Crawler as OldCrawler + +import argparse + + +def parse_args(): + parser = argparse.ArgumentParser() + parser.add_argument("run_id", + help="Run ID or the crawler run that created the changes that shall be " + "authorized.") + + return parser.parse_args() + + +if __name__ == "__main__": + args = parse_args() + OldCrawler.update_authorized_changes(args.run_id) diff --git a/src/caoscrawler/cfood-schema.yml b/src/caoscrawler/cfood-schema.yml new file mode 100644 index 0000000000000000000000000000000000000000..d7b5abfd1ac6c381b50bd4ce61015f1b8602b408 --- /dev/null +++ b/src/caoscrawler/cfood-schema.yml @@ -0,0 +1,67 @@ +cfood: + type: object + additionalProperties: + $ref: + "#/$defs/converter" + $defs: + converter: + properties: + type: + enum: + - Directory + - File + - DictTextElement + - TextElement + - SimpleFile + - YamlFileCaosDBRecord + - MarkdownFile + - DictListElement + - DictDictElement + - DictFloatElement + - DictIntegerElement + - DictBooleanElement + - Definitions + - Dict + - JSONFile + - CSVTableConverter + - XLSXTableConverter + description: Type of this converter node. + match: + description: typically a regexp which is matched to a structure element name + type: string + match_name: + description: a regexp that is matched to the key of a key-value pair + type: string + match_value: + description: a regexp that is matched to the value of a key-value pair + type: string + records: + description: This field is used to define new records or to modify records which have been defined on a higher level. + type: object + properties: + parents: + description: Parents for this record are given here as a list of names. 
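+          # In a cfood this corresponds to entries such as (illustrative example,
+          # not part of the schema itself):
+          #   records:
+          #     Experiment:
+          #       parents:
+          #         - Experiment
+          #       date: $date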
+ type: array + items: + type: string + additionalProperties: + oneOf: + - type: object + properties: + value: + description: Dictionary notation for variable values. Values can be given by a variable which is indicated by an initial "$". Use "$$" for setting values actually starting with a dollar sign. + type: string + collection_mode: + description: The collection mode defines whether the resulting property will be a single property or whether the values of multiple structure elements will be collected either into a list or a multiproperty. + enum: + - single + - list + - multiproperty + additionalProperties: false + - type: string + description: The short notation for values. Values can be given by a variable which is indicated by an initial "$". Use "$$" for setting values actually starting with a dollar sign. Multiproperties can be set using an initial "*" and list properties using an initial "+". + subtree: + type: object + additionalProperties: + $ref: + "#/$defs/converter" diff --git a/src/caoscrawler/converters.py b/src/caoscrawler/converters.py new file mode 100644 index 0000000000000000000000000000000000000000..dc6883ca94a65acd58ecd321ee6ea77f86593cd3 --- /dev/null +++ b/src/caoscrawler/converters.py @@ -0,0 +1,828 @@ +#!/usr/bin/env python3 +# encoding: utf-8 +# +# ** header v3.0 +# This file is a part of the CaosDB Project. +# +# Copyright (C) 2021 Henrik tom Wörden +# 2021 Alexander Schlemmer +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <https://www.gnu.org/licenses/>. 
+# +# ** end header +# + +from jsonschema import validate, ValidationError +import os +import re +import caosdb as db +import json +import warnings +from .utils import has_parent +from .stores import GeneralStore, RecordStore +from .structure_elements import (StructureElement, Directory, File, Dict, JSONFile, + DictIntegerElement, DictBooleanElement, + DictFloatElement, DictDictElement, + TextElement, DictTextElement, DictElement, DictListElement) +from typing import Dict as Dict_t, List, Optional, Tuple, Union +from abc import ABCMeta, abstractmethod +from string import Template +import yaml_header_tools + +import pandas as pd + +import yaml + +# These are special properties which are (currently) treated differently +# by the converters: +SPECIAL_PROPERTIES = ("description", "name", "id", "path", + "file", "checksum", "size") + + +def _only_max(children_with_keys): + + return [max(children_with_keys, key=lambda x: x[1])[0]] + + +def _only_min(children_with_keys): + + return [min(children_with_keys, key=lambda x: x[1])[0]] + + +# names of functions that can be used to filter children +FILTER_FUNCTIONS = { + "only_max": _only_max, + "only_min": _only_min, +} + + +def str_to_bool(x): + if str(x).lower() == "true": + return True + elif str(x).lower() == "false": + return False + else: + raise RuntimeError("Should be 'true' or 'false'.") + + +class ConverterValidationError(Exception): + """To be raised if contents of an element to be converted are invalid.""" + + def __init__(self, msg): + self.message = msg + + +def replace_variables(propvalue, values: GeneralStore): + """ + This function replaces variables in property values (and possibly other locations, + where the crawler can replace cfood-internal variables). + + This function checks whether the value that is to be replaced is of type db.Entity. + In this case the entity is returned (note that this is of course only possible, if the + occurrence of the variable is directly at the beginning of the value and e.g. no string + concatenation is attempted. + + In any other case the variable substitution is carried out and a new string with the + replaced variables is returned. + """ + # Check if the replacement is a single variable containing a record: + match = re.match(r"^\$(\{)?(?P<varname>[0-9a-zA-Z_]+)(\})?$", propvalue) + if match is not None: + varname = match.group("varname") + if varname in values: + if values[varname] is None: + return None + if isinstance(values[varname], db.Entity): + return values[varname] + + propvalue_template = Template(propvalue) + return propvalue_template.safe_substitute(**values.get_storage()) + + +def handle_value(value: Union[dict, str, list], values: GeneralStore): + """ + determines whether the given value needs to set a property, be added to an existing value (create a list) or + add as an additional property (multiproperty). + + Variable names (starting with a "$") are replaced by the corresponding value stored in the + `values` GeneralStore. + + Parameters: + - value: if str, the value to be interpreted. E.g. "4", "hallo" or "$a" etc. + if dict, must have keys "value" and "collection_mode". The returned tuple is directly + created from the corresponding values. + if list, each element is checked for replacement and the resulting list will be used + as (list) value for the property + Returns a tuple: + - the final value of the property; variable names contained in `values` are replaced. 
+ - the collection mode (can be single, list or multiproperty) + """ + # @review Florian Spreckelsen 2022-05-13 + + if type(value) == dict: + if "value" not in value: + # TODO: how do we handle this case? Just ignore? + # or disallow? + raise NotImplementedError() + propvalue = value["value"] + # can be "single", "list" or "multiproperty" + collection_mode = value["collection_mode"] + elif type(value) == str: + propvalue = value + collection_mode = "single" + if propvalue.startswith("+"): + collection_mode = "list" + propvalue = propvalue[1:] + elif propvalue.startswith("*"): + collection_mode = "multiproperty" + propvalue = propvalue[1:] + elif type(value) == list: + # TODO: (for review) + # This is a bit dirty right now and needed for + # being able to directly set list values. Semantics is, however, a bit + # different from the two cases above. + collection_mode = "single" + propvalue = value + + # variables replacement: + propvalue = list() + for element in value: + # Do the element-wise replacement only, when its type is string: + if type(element) == str: + propvalue.append(replace_variables(element, values)) + else: + propvalue.append(element) + + return (propvalue, collection_mode) + else: + # value is another simple type + collection_mode = "single" + propvalue = value + # Return it immediately, otherwise variable substitution would be done and fail: + return (propvalue, collection_mode) + + propvalue = replace_variables(propvalue, values) + return (propvalue, collection_mode) + + +def create_records(values: GeneralStore, + records: RecordStore, + def_records: dict): + # list of keys to identify, which variables have been set by which paths: + # the items are tuples: + # 0: record name + # 1: property name + keys_modified = [] + + for name, record in def_records.items(): + role = "Record" + # This allows us to create e.g. Files + if "role" in record: + role = record["role"] + + # whether the record already exists in the store or not are actually really + # different distinct cases for treating the setting and updating of variables: + if name not in records: + if role == "Record": + c_record = db.Record() + elif role == "File": + c_record = db.File() + else: + raise RuntimeError("Role {} not supported.".format(role)) + # add the new record to the record store: + records[name] = c_record + # additionally add the new record to the general store: + values[name] = c_record + + c_record = records[name] + + for key, value in record.items(): + if key == "parents" or key == "role": + continue + + # Allow replacing variables in keys / names of properties: + key_template = Template(key) + key = key_template.safe_substitute(**values.get_storage()) + + keys_modified.append((name, key)) + propvalue, collection_mode = handle_value(value, values) + + if key.lower() in SPECIAL_PROPERTIES: + # e.g. description, name, etc. 
+ # list mode does not work for them + if key.lower() == "path" and not propvalue.startswith(os.path.sep): + propvalue = os.path.sep + propvalue + + # Convert relative to absolute paths: + propvalue = os.path.normpath(propvalue) + setattr(c_record, key.lower(), propvalue) + else: + + if c_record.get_property(key) is None: + + if collection_mode == "list": + c_record.add_property(name=key, value=[propvalue]) + elif (collection_mode == "multiproperty" or + collection_mode == "single"): + c_record.add_property(name=key, value=propvalue) + else: + if collection_mode == "list": + c_record.get_property(key).value.append(propvalue) + elif collection_mode == "multiproperty": + c_record.add_property(name=key, value=propvalue) + elif collection_mode == "single": + c_record.get_property(key).value = propvalue + + # no matter whether the record existed in the record store or not, + # parents will be added when they aren't present in the record yet: + if "parents" in record: + for parent in record["parents"]: + # Do the variables replacement: + var_replaced_parent = replace_variables(parent, values) + if not has_parent(c_record, var_replaced_parent): + c_record.add_parent(var_replaced_parent) + else: + # add the "fallback" parent only for Records, not for Files: + if role == "Record": + # if not has_parent(c_record, name): + if len(c_record.parents) == 0: + c_record.add_parent(name) + return keys_modified + + +class Converter(object, metaclass=ABCMeta): + """ + Converters treat StructureElements contained in the hierarchical sturcture. + """ + + def __init__(self, definition: dict, + name: str, + converter_registry: dict): + self.definition = definition + self.name = name + + # Used to store usage information for debugging: + self.metadata: Dict_t[str, set[str]] = { + "usage": set() + } + + self.converters = [] + + if "subtree" in definition: + for converter_name in definition['subtree']: + converter_definition = definition["subtree"][converter_name] + self.converters.append(Converter.converter_factory( + converter_definition, converter_name, converter_registry)) + + @staticmethod + def converter_factory(definition: dict, + name: str, + converter_registry: dict): + """creates a Converter instance of the appropriate class. + + The `type` key in the `definition` defines the Converter class which is being used. + """ + + if "type" not in definition: + raise RuntimeError( + "Type is mandatory for converter entries in CFood definition.") + + if definition["type"] not in converter_registry: + raise RuntimeError("Unknown Type: {}".format(definition["type"])) + + if "class" not in converter_registry[definition["type"]]: + raise RuntimeError("Converter class not loaded correctly.") + + # instatiates an object of the required class, e.g. DirectoryConverter(definition, name) + converter = converter_registry[definition["type"]]["class"](definition, name, + converter_registry) + + return converter + + def create_values(self, + values: GeneralStore, + element: StructureElement): + """ + Extract information from the structure element and store them as values in the + general store. + + values: The GeneralStore to store values in. + element: The StructureElement to extract values from. 
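+
+        Example (illustrative): if this converter's ``match`` expression is
+        ``(?P<year>[0-9]{4})`` and ``element`` is a directory named ``2022``,
+        the mapping ``{"year": "2022"}`` is added to the general store.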
+ """ + m = self.match(element) + if m is None: + # this should never happen as the condition was checked before already + raise RuntimeError("Condition does not match.") + values.update(m) + + @abstractmethod + def create_children(self, values: GeneralStore, + element: StructureElement): + pass + + def create_records(self, values: GeneralStore, + records: RecordStore, + element: StructureElement): + + if "records" not in self.definition: + return [] + + return create_records(values, + records, + self.definition["records"]) + + def filter_children(self, children_with_strings: + List[Tuple[StructureElement, str]], expr: str, + group: str, rule: str): + """Filter children according to regexp `expr` and `rule`.""" + + if rule not in FILTER_FUNCTIONS: + raise RuntimeError( + f"{rule} is not a known filter rule. Only {list(FILTER_FUNCTIONS.keys())} are implemented." + ) + + to_be_filtered = [] + unmatched_children = [] + + for (child, name) in children_with_strings: + + m = re.match(expr, name) + if m is None: + unmatched_children.append(child) + else: + to_be_filtered.append((child, m.groupdict()[group])) + + filtered_children = FILTER_FUNCTIONS[rule](to_be_filtered) + + return filtered_children+unmatched_children + + @abstractmethod + def typecheck(self, element: StructureElement): + pass + + @abstractmethod + def match(self, element: StructureElement) -> Optional[dict]: + pass + + +class DirectoryConverter(Converter): + + def __init__(self, definition: dict, name: str, + converter_registry: dict): + """ + Initialize a new directory converter. + """ + super().__init__(definition, name, converter_registry) + + def create_children(self, generalStore: GeneralStore, + element: StructureElement): + if not isinstance(element, Directory): + raise RuntimeError( + "Directory converters can only create children from directories.") + + children = self.create_children_from_directory(element) + + if "filter" in self.definition: + + tuple_list = [(c, c.name) for c in children] + + return self.filter_children(tuple_list, **self.definition["filter"]) + + return children + + def typecheck(self, element: StructureElement): + return isinstance(element, Directory) + + def match(self, element: StructureElement): + if not isinstance(element, Directory): + raise RuntimeError("Element must be a directory.") + m = re.match(self.definition["match"], element.name) + if m is None: + return None + return m.groupdict() + + @staticmethod + def create_children_from_directory(element: Directory): + """ + Creates a list of files (of type File) and directories (of type Directory) for a + given directory. No recursion. + + element: A directory (of type Directory) which will be traversed. + """ + children: List[StructureElement] = [] + + for name in sorted(os.listdir(element.path)): + path = os.path.join(element.path, name) + + if os.path.isdir(path): + children.append(Directory(name, path)) + elif os.path.isfile(path): + children.append(File(name, path)) + + return children + + +class SimpleFileConverter(Converter): + """ + Just a file, ignore the contents. 
+ """ + + def typecheck(self, element: StructureElement): + return isinstance(element, File) + + def create_children(self, generalStore: GeneralStore, + element: StructureElement): + return list() + + def match(self, element: StructureElement): + if not isinstance(element, File): + raise RuntimeError("Element must be a file.") + m = re.match(self.definition["match"], element.name) + if m is None: + return None + return m.groupdict() + + +class MarkdownFileConverter(Converter): + def __init__(self, definition: dict, name: str, + converter_registry: dict): + """ + Initialize a new directory converter. + """ + super().__init__(definition, name, converter_registry) + + def create_children(self, generalStore: GeneralStore, + element: StructureElement): + if not isinstance(element, File): + raise RuntimeError("A markdown file is needed to create children.") + + header = yaml_header_tools.get_header_from_file( + element.path, clean=False) + children: List[StructureElement] = [] + + for name, entry in header.items(): + if type(entry) == list: + children.append(DictListElement(name, entry)) + elif type(entry) == str: + children.append(DictTextElement(name, entry)) + else: + raise RuntimeError( + "Header entry {} has incompatible type.".format(name)) + return children + + def typecheck(self, element: StructureElement): + return isinstance(element, File) + + def match(self, element: StructureElement): + if not isinstance(element, File): + raise RuntimeError("Element must be a file.") + m = re.match(self.definition["match"], element.name) + if m is None: + return None + try: + yaml_header_tools.get_header_from_file(element.path) + except yaml_header_tools.NoValidHeader: + # TODO(salexan): Raise a validation error instead of just not + # matching silently. + return None + return m.groupdict() + + +class DictConverter(Converter): + # TODO use Dict as typecheck? + def create_children(self, generalStore: GeneralStore, element: StructureElement): + if not self.typecheck(element): + raise RuntimeError("A dict is needed to create children") + + return self._create_children_from_dict(element.value) + + def _create_children_from_dict(self, data): + children = [] + + for name, value in data.items(): + if type(value) == list: + children.append(DictListElement(name, value)) + elif type(value) == str: + children.append(DictTextElement(name, value)) + elif type(value) == dict: + children.append(DictDictElement(name, value)) + elif type(value) == int: + children.append(DictIntegerElement(name, value)) + elif type(value) == bool: + children.append(DictBooleanElement(name, value)) + elif type(value) == float: + children.append(DictFloatElement(name, value)) + elif type(value) == type(None): + continue + else: + children.append(DictElement(name, value)) + warnings.warn(f"The value in the dict for key:{name} has an unknown type. " + "The fallback type DictElement is used.") + + return children + + # TODO use Dict as typecheck? + def typecheck(self, element: StructureElement): + return isinstance(element, Dict) + + def match(self, element: StructureElement): + """ + Allways matches if the element has the right type. + """ + if not isinstance(element, Dict): + raise RuntimeError("Element must be a DictElement.") + return {} + + +# TODO: difference to SimpleFileConverter? Do we need both? 
+class FileConverter(Converter): + def typecheck(self, element: StructureElement): + return isinstance(element, File) + + def match(self, element: StructureElement): + if not self.typecheck(element): + raise RuntimeError("Element must be a file") + m = re.match(self.definition["match"], element.name) + if m is None: + return None + return m.groupdict() + + def create_children(self, generalStore: GeneralStore, element: StructureElement): + return [] + + +class JSONFileConverter(DictConverter): + def typecheck(self, element: StructureElement): + return isinstance(element, File) + + def match(self, element: StructureElement): + if not self.typecheck(element): + raise RuntimeError("Element must be a file") + m = re.match(self.definition["match"], element.name) + if m is None: + return None + return m.groupdict() + + def create_children(self, generalStore: GeneralStore, element: StructureElement): + if not self.typecheck(element): + raise RuntimeError("A JSON file is needed to create children") + # TODO: either add explicit time check for File structure element here, + # or add a comment to suppress mypy type warning. + with open(element.path, 'r') as json_file: + json_data = json.load(json_file) + if not isinstance(json_data, dict): + raise NotImplementedError("JSON file must contain a dict") + if "validate" in self.definition and self.definition["validate"]: + if isinstance(self.definition["validate"], dict): + schema = self.definition["validate"] + elif isinstance(self.definition["validate"], str): + + with open(self.definition["validate"], 'r') as json_file: + schema = json.load(json_file) + else: + raise ValueError("The value of 'validate' has to be a string describing the path " + "to the json schema file (relative to the cfood yml) " + "or a dict containing the schema.") + # Validate the json content + try: + validate(instance=json_data, schema=schema) + except ValidationError as err: + raise ConverterValidationError( + f"Couldn't validate {json_data}:\n{err.message}") + + return self._create_children_from_dict(json_data) + + +class _AbstractDictElementConverter(Converter): + + def create_children(self, generalStore: GeneralStore, element: StructureElement): + return [] + + def typecheck(self, element: StructureElement): + return True + + def match(self, element: StructureElement): + """ + Try to match the given structure element. + + If it does not match, return None. + + Else return a dictionary containing the variables from the matched regexp + as key value pairs. 
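+
+        For example (illustrative): with ``match_name: (?P<key>.*)`` and
+        ``match_value: (?P<number>[0-9]+)``, a DictIntegerElement named
+        ``count`` with value ``12`` yields ``{"key": "count", "number": "12"}``.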
+ """ + if not self.typecheck(element): + raise RuntimeError( + f"Element has an invalid type: {type(element)}.") + m1 = re.match(self.definition["match_name"], element.name) + if m1 is None: + return None + m2 = re.match(self.definition["match_value"], str(element.value)) + if m2 is None: + return None + values = dict() + values.update(m1.groupdict()) + values.update(m2.groupdict()) + return values + + +class DictBooleanElementConverter(_AbstractDictElementConverter): + def typecheck(self, element: StructureElement): + return isinstance(element, DictBooleanElement) + + +class DictFloatElementConverter(_AbstractDictElementConverter): + def typecheck(self, element: StructureElement): + return isinstance(element, DictFloatElement) + + +class DictTextElementConverter(_AbstractDictElementConverter): + def typecheck(self, element: StructureElement): + return isinstance(element, DictTextElement) + + +class DictIntegerElementConverter(_AbstractDictElementConverter): + def typecheck(self, element: StructureElement): + return isinstance(element, DictIntegerElement) + + +class DictListElementConverter(Converter): + def create_children(self, generalStore: GeneralStore, + element: StructureElement): + if not isinstance(element, DictListElement): + raise RuntimeError( + "This converter can only process DictListElements.") + children = [] + for index, list_element in enumerate(element.value): + # TODO(fspreck): Refactor this and merge with DictXXXElements maybe? + if isinstance(list_element, str): + children.append(TextElement(str(index), list_element)) + elif isinstance(list_element, dict): + children.append(Dict(str(index), list_element)) + else: + raise NotImplementedError( + f"Unkown type {type(list_element)} in list element {list_element}.") + return children + + def typecheck(self, element: StructureElement): + return isinstance(element, DictListElement) + + def match(self, element: StructureElement): + if not isinstance(element, DictListElement): + raise RuntimeError("Element must be a DictListElement.") + m = re.match(self.definition["match_name"], element.name) + if m is None: + return None + if "match" in self.definition: + raise NotImplementedError( + "Match is not implemented for DictListElement.") + return m.groupdict() + + +class DictDictElementConverter(DictConverter): + def create_children(self, generalStore: GeneralStore, element: StructureElement): + if not self.typecheck(element): + raise RuntimeError("A dict is needed to create children") + + return self._create_children_from_dict(element.value) + + def typecheck(self, element: StructureElement): + return isinstance(element, DictDictElement) + + def match(self, element: StructureElement): + if not self.typecheck(element): + raise RuntimeError("Element must be a DictDictElement.") + m = re.match(self.definition["match_name"], element.name) + if m is None: + return None + if "match" in self.definition: + raise NotImplementedError( + "Match is not implemented for DictDictElement.") + return m.groupdict() + + +class TextElementConverter(Converter): + def create_children(self, generalStore: GeneralStore, + element: StructureElement): + return [] + + def typecheck(self, element: StructureElement): + return isinstance(element, TextElement) + + def match(self, element: StructureElement): + if not isinstance(element, TextElement): + raise RuntimeError("Element must be a TextElement.") + m = re.match(self.definition["match"], element.value) + if m is None: + return None + return m.groupdict() + + +class TableConverter(Converter): + """ + This 
converter reads tables in different formats line by line and + allows matching the corresponding rows. + + The subtree generated by the table converter consists of DictDictElements, each being + a row. The corresponding header elements will become the dictionary keys. + + The rows can be matched using a DictDictElementConverter. + """ + @abstractmethod + def get_options(self): + """ + This method needs to be overwritten by the specific table converter to provide + information about the possible options. + """ + pass + + def _get_options(self, possible_options): + option_dict = dict() + for opt_name, opt_conversion in possible_options: + if opt_name in self.definition: + el = self.definition[opt_name] + # The option can often either be a single value or a list of values. + # In the latter case each element of the list will be converted to the defined type. + if isinstance(el, list): + option_dict[opt_name] = [ + opt_conversion(el_el) for el_el in el] + else: + option_dict[opt_name] = opt_conversion(el) + return option_dict + + def typecheck(self, element: StructureElement): + return isinstance(element, File) + + def match(self, element: StructureElement): + if not isinstance(element, File): + raise RuntimeError("Element must be a File.") + m = re.match(self.definition["match"], element.name) + if m is None: + return None + return m.groupdict() + + +class XLSXTableConverter(TableConverter): + def get_options(self): + return self._get_options([ + ("sheet_name", str), + ("header", int), + ("names", str), + ("index_col", int), + ("usecols", int), + ("true_values", str), + ("false_values", str), + ("na_values", str), + ("skiprows", int), + ("nrows", int), + ("keep_default_na", str_to_bool), ] + ) + + def create_children(self, generalStore: GeneralStore, + element: StructureElement): + if not isinstance(element, File): + raise RuntimeError("Element must be a File.") + table = pd.read_excel(element.path, **self.get_options()) + child_elements = list() + for index, row in table.iterrows(): + child_elements.append( + DictDictElement(str(index), row.to_dict())) + return child_elements + + +class CSVTableConverter(TableConverter): + def get_options(self): + return self._get_options([ + ("sep", str), + ("delimiter", str), + ("header", int), + ("names", str), + ("index_col", int), + ("usecols", int), + ("true_values", str), + ("false_values", str), + ("na_values", str), + ("skiprows", int), + ("nrows", int), + ("keep_default_na", str_to_bool), ]) + + def create_children(self, generalStore: GeneralStore, + element: StructureElement): + if not isinstance(element, File): + raise RuntimeError("Element must be a File.") + table = pd.read_csv(element.path, **self.get_options()) + child_elements = list() + for index, row in table.iterrows(): + child_elements.append( + DictDictElement(str(index), row.to_dict())) + return child_elements diff --git a/src/caoscrawler/crawl.py b/src/caoscrawler/crawl.py new file mode 100644 index 0000000000000000000000000000000000000000..b4c9c9ffeedfb0eaf82860b07afe695215ece04d --- /dev/null +++ b/src/caoscrawler/crawl.py @@ -0,0 +1,1253 @@ +#!/usr/bin/env python3 +# encoding: utf-8 +# +# ** header v3.0 +# This file is a part of the CaosDB Project. +# +# Copyright (C) 2021 Henrik tom Wörden +# 2021 Alexander Schlemmer +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. 
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <https://www.gnu.org/licenses/>.
+#
+# ** end header
+#
+
+"""
+Crawl a file structure using a yaml cfood definition and synchronize
+the acquired data with CaosDB.
+"""
+
+import importlib
+from caosadvancedtools.cache import UpdateCache, Cache
+import uuid
+import sys
+import os
+import yaml
+from enum import Enum
+import logging
+from importlib_resources import files
+import argparse
+from argparse import RawTextHelpFormatter
+import caosdb as db
+from caosadvancedtools.crawler import Crawler as OldCrawler
+from caosdb.common.datatype import is_reference
+from .stores import GeneralStore, RecordStore
+from .identified_cache import IdentifiedCache
+from .structure_elements import StructureElement, Directory
+from .converters import Converter, DirectoryConverter
+from .identifiable_adapters import (IdentifiableAdapter,
+                                    LocalStorageIdentifiableAdapter,
+                                    CaosDBIdentifiableAdapter)
+from collections import defaultdict
+from typing import Any, Dict, List, Optional, Type, Union
+from caosdb.apiutils import compare_entities, merge_entities
+from copy import deepcopy
+from jsonschema import validate
+
+from .macros import defmacro_constructor, macro_constructor
+
+logger = logging.getLogger(__name__)
+
+SPECIAL_PROPERTIES_STRICT = ("description", "name", "id", "path")
+SPECIAL_PROPERTIES_NOT_STRICT = ("file", "checksum", "size")
+
+# Register the macro functions from the submodule:
+yaml.SafeLoader.add_constructor("!defmacro", defmacro_constructor)
+yaml.SafeLoader.add_constructor("!macro", macro_constructor)
+
+
+def check_identical(record1: db.Entity, record2: db.Entity, ignore_id=False):
+    """
+    This function uses compare_entities to check whether two entities are identical
+    in a quite complex fashion:
+    - If one of the entities has additional parents or additional properties -> not identical
+    - If the value of one of the properties differs -> not identical
+    - If datatype, importance or unit are reported different for a property by compare_entities,
+      the records only count as not identical if these attributes are set explicitly by record1.
+      The difference is ignored otherwise.
+    - If description, name, id or path appear in the list of differences -> not identical.
+    - If file, checksum or size appear, the records only count as different if these attributes
+      are explicitly set by record1.
+
+    record1 serves as the reference, so datatype, importance and unit checks are carried
+    out using the attributes from record1. In that respect, the function is not symmetrical
+    in its arguments.
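+
+    For example (illustrative): two records that differ in the value of a shared
+    property are not identical, whereas a unit that is unset on record1 but set
+    on record2 does not count as a difference.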
+ """ + comp = compare_entities(record1, record2) + + if ignore_id: + if "id" in comp[0]: + del comp[0]["id"] + if "id" in comp[1]: + del comp[1]["id"] + + for j in range(2): + for label in ("parents", ): + if len(comp[j][label]) > 0: + return False + for special_property in SPECIAL_PROPERTIES_STRICT: + if special_property in comp[0] or special_property in comp[1]: + return False + + for special_property in SPECIAL_PROPERTIES_NOT_STRICT: + if special_property in comp[0]: + attr_val = comp[0][special_property] + other_attr_val = (comp[1][special_property] + if special_property in comp[1] else None) + if attr_val is not None and attr_val != other_attr_val: + return False + + for key in comp[0]["properties"]: + if len(comp[0]["properties"][key]) == 0: + # This is a new property + return False + for attribute in ("datatype", "importance", "unit"): + # only make an update for those attributes if there is a value difference and + # the value in the target_data is not None + if attribute in comp[0]["properties"][key]: + attr_val = comp[0]["properties"][key][attribute] + other_attr_val = (comp[1]["properties"][key][attribute] + if attribute in comp[1]["properties"][key] else None) + if attr_val is not None and attr_val != other_attr_val: + return False + + if "value" in comp[0]["properties"][key]: + return False + + # Check for removed properties: + for key in comp[1]["properties"]: + if len(comp[1]["properties"][key]) == 0: + # This is a removed property + return False + + return True + + +def _resolve_datatype(prop: db.Property, remote_entity: db.Entity): + """ sets the datatype on the given property (side effect) """ + + if remote_entity.role == "Property": + datatype = remote_entity.datatype + elif remote_entity.role == "RecordType": + datatype = remote_entity.name + else: + raise RuntimeError("Cannot set datatype.") + + # Treat lists separately + if isinstance(prop.value, list) and not datatype.startswith("LIST"): + datatype = db.LIST(datatype) + + prop.datatype = datatype + return prop + + +class SecurityMode(Enum): + RETRIEVE = 0 + INSERT = 1 + UPDATE = 2 + + +class Crawler(object): + """ + Crawler class that encapsulates crawling functions. + Furthermore it keeps track of the storage for records (record store) and the + storage for values (general store). + """ + + def __init__(self, + generalStore: Optional[GeneralStore] = None, + debug: bool = False, + identifiableAdapter: IdentifiableAdapter = None, + securityMode: int = SecurityMode.UPDATE + ): + """ + Create a new crawler and initialize an empty RecordStore and GeneralStore. + + Parameters + ---------- + recordStore : GeneralStore + An initial GeneralStore which might store e.g. environment variables. + debug : bool + Create a debugging information tree when set to True. + The debugging information tree is a variable stored in + self.debug_tree. It is a dictionary mapping directory entries + to a tuple of general stores and record stores which are valid for + the directory scope. + Furthermore, it is stored in a second tree named self.debug_copied whether the + objects in debug_tree had been copied from a higher level in the hierarchy + of the structureelements. + identifiableAdapter : IdentifiableAdapter + TODO describe + securityMode : int + Whether only retrieves are allowed or also inserts or even updates. 
+ Please use SecurityMode Enum + """ + + # TODO: check if this feature is really needed + + self.identified_cache = IdentifiedCache() + self.recordStore = RecordStore() + self.securityMode = securityMode + + self.generalStore = generalStore + if generalStore is None: + self.generalStore = GeneralStore() + + self.identifiableAdapter = identifiableAdapter + if identifiableAdapter is None: + self.identifiableAdapter = LocalStorageIdentifiableAdapter() + # If a directory is crawled this may hold the path to that directory + self.crawled_directory = None + self.debug = debug + if self.debug: + # order in the tuple: + # 0: generalStore + # 1: recordStore + self.debug_tree: Dict[str, tuple] = dict() + self.debug_metadata: Dict[str, dict] = dict() + self.debug_metadata["copied"] = dict() + self.debug_metadata["provenance"] = defaultdict(lambda: dict()) + self.debug_metadata["usage"] = defaultdict(lambda: set()) + + def load_definition(self, crawler_definition_path: str): + """ + Load a cfood from a crawler definition defined by + crawler definition path and validate it using cfood-schema.yml. + """ + + # Load the cfood from a yaml file: + with open(crawler_definition_path, "r") as f: + crawler_definitions = list(yaml.safe_load_all(f)) + + crawler_definition = self._load_definition_from_yaml_dict( + crawler_definitions) + + return self._resolve_validator_paths(crawler_definition, crawler_definition_path) + + def _load_definition_from_yaml_dict(self, crawler_definitions: List[Dict]): + """Load crawler definitions from a list of (yaml) dicts `crawler_definitions` which + contains either one or two documents. + + Doesn't resolve the validator paths in the cfood definition, so for + internal and testing use only. + + """ + if len(crawler_definitions) == 1: + # Simple case, just one document: + crawler_definition = crawler_definitions[0] + elif len(crawler_definitions) == 2: + crawler_definition = crawler_definitions[1] + else: + raise RuntimeError( + "Crawler definition must not contain more than two documents.") + + # TODO: at this point this function can already load the cfood schema extensions + # from the crawler definition and add them to the yaml schema that will be + # tested in the next lines of code: + + # Load the cfood schema: + with open(files('caoscrawler').joinpath('cfood-schema.yml'), "r") as f: + schema = yaml.safe_load(f) + + # Add custom converters to converter enum in schema: + if "Converters" in crawler_definition: + for key in crawler_definition["Converters"]: + schema["cfood"]["$defs"]["converter"]["properties"]["type"]["enum"].append( + key) + if len(crawler_definitions) == 2: + if "Converters" in crawler_definitions[0]["metadata"]: + for key in crawler_definitions[0]["metadata"]["Converters"]: + schema["cfood"]["$defs"]["converter"]["properties"]["type"]["enum"].append( + key) + + # Validate the cfood schema: + validate(instance=crawler_definition, schema=schema["cfood"]) + + return crawler_definition + + def _resolve_validator_paths(self, definition: dict, definition_path: str): + """Resolve path to validation files with respect to the file in which + the crawler was defined. 
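+
+        For example (illustrative): a relative entry such as
+        ``validate: schemas/dataset.schema.json`` in ``/cfoods/my_cfood.yml`` is
+        rewritten to ``/cfoods/schemas/dataset.schema.json``.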
+ + """ + + for key, value in definition.items(): + + if key == "validate" and isinstance(value, str): + # Validator is given by a path + if not value.startswith('/'): + # Not an absolute path + definition[key] = os.path.join( + os.path.dirname(definition_path), value) + if not os.path.isfile(definition[key]): + raise FileNotFoundError( + f"Couldn't find validation file {definition[key]}") + elif isinstance(value, dict): + # Recursively resolve all validators + definition[key] = self._resolve_validator_paths( + value, definition_path) + + return definition + + def load_converters(self, definition: dict): + """ + Currently the converter registry is a dictionary containing for each converter: + - key is the short code, abbreviation for the converter class name + - module is the name of the module to be imported which must be installed + - class is the converter class to load and associate with this converter entry + + all other info for the converter needs to be included in the converter plugin + directory: + schema.yml file + README.md documentation + """ + + # Defaults for the converter registry: + converter_registry: Dict[str, Dict[str, str]] = { + "Directory": { + "converter": "DirectoryConverter", + "package": "caoscrawler.converters"}, + "SimpleFile": { + "converter": "SimpleFileConverter", + "package": "caoscrawler.converters"}, + "MarkdownFile": { + "converter": "MarkdownFileConverter", + "package": "caoscrawler.converters"}, + "File": { + "converter": "FileConverter", + "package": "caoscrawler.converters"}, + "JSONFile": { + "converter": "JSONFileConverter", + "package": "caoscrawler.converters"}, + "CSVTableConverter": { + "converter": "CSVTableConverter", + "package": "caoscrawler.converters"}, + "XLSXTableConverter": { + "converter": "XLSXTableConverter", + "package": "caoscrawler.converters"}, + "Dict": { + "converter": "DictConverter", + "package": "caoscrawler.converters"}, + "DictBooleanElement": { + "converter": "DictBooleanElementConverter", + "package": "caoscrawler.converters"}, + "DictFloatElement": { + "converter": "DictFloatElementConverter", + "package": "caoscrawler.converters"}, + "DictTextElement": { + "converter": "DictTextElementConverter", + "package": "caoscrawler.converters"}, + "DictIntegerElement": { + "converter": "DictIntegerElementConverter", + "package": "caoscrawler.converters"}, + "DictListElement": { + "converter": "DictListElementConverter", + "package": "caoscrawler.converters"}, + "DictDictElement": { + "converter": "DictDictElementConverter", + "package": "caoscrawler.converters"}, + "TextElement": { + "converter": "TextElementConverter", + "package": "caoscrawler.converters"} + } + + # More converters from definition file: + if "Converters" in definition: + for key, entry in definition["Converters"].items(): + converter_registry[key] = { + "converter": entry["converter"], + "package": entry["package"] + } + + # Load modules and associate classes: + for key, value in converter_registry.items(): + module = importlib.import_module(value["package"]) + value["class"] = getattr(module, value["converter"]) + return converter_registry + + def crawl_directory(self, dirname: str, crawler_definition_path: str): + """ Crawl a single directory. + + Convenience function that starts the crawler (calls start_crawling) + with a single directory as the StructureElement. 
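+
+        A minimal usage sketch (assuming a running CaosDB instance and a suitable
+        cfood file; the paths are made up):
+
+            crawler = Crawler()
+            crawler.crawl_directory("/data/projects", "/cfoods/cfood.yml")
+            inserts, updates = crawler.synchronize(commit_changes=False)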
+ """ + + crawler_definition = self.load_definition(crawler_definition_path) + # Load and register converter packages: + converter_registry = self.load_converters(crawler_definition) + + if not dirname: + raise ValueError( + "You have to provide a non-empty path for crawling.") + dir_structure_name = os.path.basename(dirname) + self.crawled_directory = dirname + if not dir_structure_name and dirname.endswith('/'): + if dirname == '/': + # Crawling the entire file system + dir_structure_name = "root" + else: + # dirname had a trailing '/' + dir_structure_name = os.path.basename(dirname[:-1]) + + self.start_crawling(Directory(dir_structure_name, + dirname), + crawler_definition, + converter_registry) + + @staticmethod + def initialize_converters(crawler_definition: dict, converter_registry: dict): + """ + takes the cfood as dict (`crawler_definition`) and creates the converter objects that + are defined on the highest level. Child Converters will in turn be created during the + initialization of the Converters. + """ + converters = [] + + for key, value in crawler_definition.items(): + # Definitions and Converters are reserved keywords + # on the top level of the yaml file. + # TODO: there should also be a top level keyword for the actual + # CFood to avoid confusion between top level keywords + # and the CFood. + if key == "Definitions": + continue + elif key == "Converters": + continue + converters.append(Converter.converter_factory( + value, key, converter_registry)) + + return converters + + def start_crawling(self, items: Union[List[StructureElement], StructureElement], + crawler_definition: dict, + converter_registry: dict): + """ + Start point of the crawler recursion. + + Parameters + ---------- + items: list + A list of structure elements (or a single StructureElement) that is used for + generating the initial items for the crawler. This could e.g. be a Directory. + crawler_definition : dict + A dictionary representing the crawler definition, possibly from a yaml + file. + + Returns + ------- + target_data : list + the final list with the target state of Records. + """ + + # This function builds the tree of converters out of the crawler definition. + + if self.generalStore is None: + raise RuntimeError("Should not happen.") + + if not isinstance(items, list): + items = [items] + + self.run_id = uuid.uuid1() + local_converters = Crawler.initialize_converters( + crawler_definition, converter_registry) + # This recursive crawling procedure generates the update list: + self.target_data: List[db.Record] = [] + self._crawl(items, local_converters, self.generalStore, + self.recordStore, [], []) + + if self.debug: + self.debug_converters = local_converters + + return self.target_data + + def synchronize(self, commit_changes: bool = True, unique_names=True): + """ + Carry out the actual synchronization. + """ + + # After the crawling, the actual synchronization with the database, based on the + # update list is carried out: + + return self._synchronize(self.target_data, commit_changes, unique_names=unique_names) + + def can_be_checked_externally(self, record: db.Record): + """ + Returns False if there is at least one property in record which: + a) is a reference property AND + b) where the value is set to a db.Entity (instead of an ID) AND + c) where the ID of the value is not set (to an integer) + + Returns True otherwise. 
+ """ + for p in record.properties: + if isinstance(p.value, list): + for el in p.value: + if isinstance(el, db.Entity) and el.id is None: + return False + # TODO: please check! + # I removed the condition "is_reference", because the datatype field + # that is checked within this function is not always present for references + # parsed from the file structure. We have to rely on the condition, that + # if a property value is of type entity, it can be assumed to be a reference. + # elif (is_reference(p) and isinstance(p.value, db.Entity) + # and p.value.id is None): + elif isinstance(p.value, db.Entity) and p.value.id is None: + return False + return True + + def create_flat_list(self, ent_list: List[db.Entity], flat: List[db.Entity]): + """ + Recursively adds all properties contained in entities from ent_list to + the output list flat. Each element will only be added once to the list. + + TODO: This function will be moved to pylib as it is also needed by the + high level API. + """ + for ent in ent_list: + for p in ent.properties: + # For lists append each element that is of type Entity to flat: + if isinstance(p.value, list): + for el in p.value: + if isinstance(el, db.Entity): + if el not in flat: + flat.append(el) + # TODO: move inside if block? + self.create_flat_list([el], flat) + elif isinstance(p.value, db.Entity): + if p.value not in flat: + flat.append(p.value) + # TODO: move inside if block? + self.create_flat_list([p.value], flat) + + def all_references_are_existing_already(self, record: db.Record): + """ + returns true if all references either have IDs or were checked remotely and not found (i.e. + they exist in the local cache) + """ + for p in record.properties: + # if (is_reference(p) + # Entity instead of ID and not cached locally + if (isinstance(p.value, list)): + for el in p.value: + if (isinstance(el, db.Entity) and el.id is None + and self.get_identified_record_from_local_cache(el) is None): + return False + if (isinstance(p.value, db.Entity) and p.value.id is None + and self.get_identified_record_from_local_cache(p.value) is None): + # might be checked when reference is resolved + return False + return True + + def replace_references_with_cached(self, record: db.Record): + """ + Replace all references with the versions stored in the cache. + + If the cache version is not identical, raise an error. 
+ """ + for p in record.properties: + if (isinstance(p.value, list)): + lst = [] + for el in p.value: + if (isinstance(el, db.Entity) and el.id is None): + cached = self.get_identified_record_from_local_cache( + el) + if cached is None: + raise RuntimeError("Not in cache.") + if not check_identical(cached, el, True): + if isinstance(p.value, db.File): + if p.value.path != cached.path: + raise RuntimeError("Not identical.") + else: + raise RuntimeError("Not identical.") + lst.append(cached) + else: + lst.append(el) + p.value = lst + if (isinstance(p.value, db.Entity) and p.value.id is None): + cached = self.get_identified_record_from_local_cache(p.value) + if cached is None: + raise RuntimeError("Not in cache.") + if not check_identical(cached, p.value, True): + if isinstance(p.value, db.File): + if p.value.path != cached.path: + raise RuntimeError("Not identical.") + else: + raise RuntimeError("Not identical.") + p.value = cached + + def get_identified_record_from_local_cache(self, record: db.Record): + """ + returns the identifiable if an identifiable with the same values already exists locally + (Each identifiable that is not found on the remote server, is 'cached' locally to prevent + that the same identifiable exists twice) + """ + if self.identifiableAdapter is None: + raise RuntimeError("Should not happen.") + identifiable = self.identifiableAdapter.get_identifiable(record) + if identifiable is None: + # TODO: check whether the same idea as below works here + identifiable = record + # return None + + if identifiable in self.identified_cache: + return self.identified_cache[identifiable] + else: + return None + + def add_identified_record_to_local_cache(self, record: db.Record): + """ + adds the given identifiable to the local cache + + No identifiable with the same values must exist locally. + (Each identifiable that is not found on the remote server, is 'cached' locally to prevent + that the same identifiable exists twice) + + Return False if there is no identifiable for this record and True otherwise. + """ + if self.identifiableAdapter is None: + raise RuntimeError("Should not happen.") + identifiable = self.identifiableAdapter.get_identifiable(record) + if identifiable is None: + # TODO: this error report is bad + # we need appropriate handling for records without an identifiable + # or at least a simple fallback definition if tehre is no identifiable. + + # print(record) + # raise RuntimeError("No identifiable for record.") + + # TODO: check whether that holds: + # if there is no identifiable, for the cache that is the same + # as if the complete entity is the identifiable: + identifiable = record + self.identified_cache.add(identifiable=identifiable, record=record) + + def copy_attributes(self, fro: db.Entity, to: db.Entity): + """ + Copy all attributes from one entity to another entity. + """ + + merge_entities(to, fro) + + def split_into_inserts_and_updates(self, ent_list: List[db.Entity]): + if self.identifiableAdapter is None: + raise RuntimeError("Should not happen.") + to_be_inserted: List[db.Entity] = [] + to_be_updated: List[db.Entity] = [] + flat = list(ent_list) + # assure all entities are direct members TODO Can this be removed at some point?Check only? 
+ self.create_flat_list(ent_list, flat) + + # TODO: can the following be removed at some point + for ent in flat: + if ent.role == "Record" and len(ent.parents) == 0: + raise RuntimeError("Records must have a parent.") + + resolved_references = True + # flat contains Entities which could not yet be checked against the remote server + while resolved_references and len(flat) > 0: + resolved_references = False + + for i in reversed(range(len(flat))): + record = flat[i] + + # TODO remove if the exception is never raised + if (record.id is not None or record in to_be_inserted): + raise RuntimeError("This should not be reached since treated elements" + "are removed from the list") + # Check the local cache first for duplicate + elif self.get_identified_record_from_local_cache(record) is not None: + + # This record is a duplicate that can be removed. Make sure we do not lose + # information + # Update an (local) identified record that will be inserted + newrecord = self.get_identified_record_from_local_cache( + record) + self.copy_attributes(fro=record, to=newrecord) + # Bend references to the other object + # TODO refactor this + for el in flat + to_be_inserted + to_be_updated: + for p in el.properties: + if isinstance(p.value, list): + for index, val in enumerate(p.value): + if val is record: + p.value[index] = newrecord + else: + if p.value is record: + p.value = newrecord + + del flat[i] + + # all references need to be IDs that exist on the remote server + elif self.can_be_checked_externally(record): + + # Check remotely + # TODO: remove deepcopy? + identified_record = self.identifiableAdapter.retrieve_identified_record_for_record( + deepcopy(record)) + if identified_record is None: + # identifiable does not exist remotely + to_be_inserted.append(record) + self.add_identified_record_to_local_cache(record) + del flat[i] + else: + # side effect + record.id = identified_record.id + # On update every property needs to have an ID. + # This will be achieved by the function execute_updates_in_list below. + # For files this is not enough, we also need to copy over + # checksum and size: + if isinstance(record, db.File): + record._size = identified_record._size + record._checksum = identified_record._checksum + + to_be_updated.append(record) + # TODO think this through + self.add_identified_record_to_local_cache(record) + del flat[i] + resolved_references = True + + # e.g. references an identifiable that does not exist remotely + elif self.all_references_are_existing_already(record): + + # TODO: (for review) + # This was the old version, but also for this case the + # check for identifiables has to be done. + # to_be_inserted.append(record) + # self.add_identified_record_to_local_cache(record) + # del flat[i] + + # TODO: (for review) + # If the following replacement is not done, the cache will + # be invalid as soon as references are resolved. + # replace references by versions from cache: + self.replace_references_with_cached(record) + + identified_record = self.identifiableAdapter.retrieve_identified_record_for_record( + deepcopy(record)) + if identified_record is None: + # identifiable does not exist remotely + to_be_inserted.append(record) + self.add_identified_record_to_local_cache(record) + del flat[i] + else: + # side effect + record.id = identified_record.id + # On update every property needs to have an ID. + # This will be achieved by the function execute_updates_in_list below. 
+ + to_be_updated.append(record) + # TODO think this through + self.add_identified_record_to_local_cache(record) + del flat[i] + + resolved_references = True + + if len(flat) > 0: + raise RuntimeError( + "Could not resolve all Entity references. Circular Dependency?") + + return to_be_inserted, to_be_updated + + def replace_entities_with_ids(self, rec: db.Record): + for el in rec.properties: + if isinstance(el.value, db.Entity): + if el.value.id is not None: + el.value = el.value.id + elif isinstance(el.value, list): + for index, val in enumerate(el.value): + if isinstance(val, db.Entity): + if val.id is not None: + el.value[index] = val.id + + @staticmethod + def remove_unnecessary_updates(target_data: List[db.Record], + identified_records: List[db.Record]): + """ + checks whether all relevant attributes (especially Property values) are equal + + Returns (in future) + ------- + update list without unecessary updates + + """ + if len(target_data) != len(identified_records): + raise RuntimeError("The lists of updates and of identified records need to be of the " + "same length!") + # TODO this can now easily be changed to a function without side effect + for i in reversed(range(len(target_data))): + identical = check_identical(target_data[i], identified_records[i]) + + if identical: + del target_data[i] + continue + else: + pass + + @staticmethod + def execute_parent_updates_in_list(to_be_updated, securityMode, run_id, unique_names): + """ + Execute the updates of changed parents. + + This method is used before the standard inserts and needed + because some changes in parents (e.g. of Files) might fail + if they are not updated first. + """ + Crawler.set_ids_and_datatype_of_parents_and_properties(to_be_updated) + parent_updates = db.Container() + + for record in to_be_updated: + old_entity = Crawler._get_entity_by_id(record.id) + + # Check whether the parents have been changed and add them if missing + # in the old entity: + changes_made = False + for parent in record.parents: + found = False + for old_parent in old_entity.parents: + if old_parent.id == parent.id: + found = True + break + if not found: + old_entity.add_parent(id=parent.id) + changes_made = True + if changes_made: + parent_updates.append(old_entity) + logger.debug("RecordTypes need to be added to the following entities:") + logger.debug(parent_updates) + if len(parent_updates) > 0: + if securityMode.value > SecurityMode.INSERT.value: + parent_updates.update(unique=False) + elif run_id is not None: + update_cache = UpdateCache() + update_cache.insert(parent_updates, run_id) + logger.info("Some entities need to be updated because they are missing a parent " + "RecordType. The update was NOT executed due to the chosen security " + "mode. 
This might lead to a failure of inserts that follow.")
+                logger.info(parent_updates)
+
+    @staticmethod
+    def _get_entity_by_name(name):
+        return db.Entity(name=name).retrieve()
+
+    @staticmethod
+    def _get_entity_by_id(id):
+        return db.Entity(id=id).retrieve()
+
+    @staticmethod
+    def execute_inserts_in_list(to_be_inserted, securityMode, run_id: int = None,
+                                unique_names=True):
+        for record in to_be_inserted:
+            for prop in record.properties:
+                entity = Crawler._get_entity_by_name(prop.name)
+                _resolve_datatype(prop, entity)
+        logger.debug("INSERT")
+        logger.debug(to_be_inserted)
+        if len(to_be_inserted) > 0:
+            if securityMode.value > SecurityMode.RETRIEVE.value:
+                db.Container().extend(to_be_inserted).insert(unique=unique_names)
+            elif run_id is not None:
+                update_cache = UpdateCache()
+                update_cache.insert(to_be_inserted, run_id, insert=True)
+
+    @staticmethod
+    def set_ids_and_datatype_of_parents_and_properties(rec_list):
+        for record in rec_list:
+            for parent in record.parents:
+                if parent.id is None:
+                    parent.id = Crawler._get_entity_by_name(parent.name).id
+            for prop in record.properties:
+                if prop.id is None:
+                    entity = Crawler._get_entity_by_name(prop.name)
+                    prop.id = entity.id
+                    _resolve_datatype(prop, entity)
+
+    @staticmethod
+    def execute_updates_in_list(to_be_updated, securityMode, run_id: int = None,
+                                unique_names=True):
+        Crawler.set_ids_and_datatype_of_parents_and_properties(to_be_updated)
+        logger.debug("UPDATE")
+        logger.debug(to_be_updated)
+        if len(to_be_updated) > 0:
+            if securityMode.value > SecurityMode.INSERT.value:
+                db.Container().extend(to_be_updated).update(unique=unique_names)
+            elif run_id is not None:
+                update_cache = UpdateCache()
+                update_cache.insert(to_be_updated, run_id)
+
+    def _synchronize(self, target_data: List[db.Record], commit_changes: bool = True,
+                     unique_names=True):
+        """
+        This function applies several stages:
+        1) Retrieve identifiables for all records in target_data.
+        2) Compare target_data with existing records.
+        3) Insert and update records based on the set of identified differences.
+
+        This function makes use of an IdentifiableAdapter which is used to
+        register and retrieve identifiables.
+
+        If commit_changes is True, the changes are synchronized to the CaosDB server.
+        For debugging it can be useful to set this to False.
+
+        Return the final to_be_inserted and to_be_updated as a tuple.
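+
+        A rough usage sketch (illustrative only; assumes a crawler that has already
+        scanned a directory and has an identifiableAdapter set, and that is usually
+        invoked via the public ``synchronize`` method)::
+
+            crawler.crawl_directory("/data", "cfood.yml")
+            to_be_inserted, to_be_updated = crawler._synchronize(
+                crawler.target_data, commit_changes=False)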
+ """ + + if self.identifiableAdapter is None: + raise RuntimeError("Should not happen.") + + to_be_inserted, to_be_updated = self.split_into_inserts_and_updates( + target_data) + + # TODO: refactoring of typo + for el in to_be_updated: + # all entity objects are replaced by their IDs except for the not yet inserted ones + self.replace_entities_with_ids(el) + + identified_records = [ + self.identifiableAdapter.retrieve_identified_record_for_record( + record) + for record in to_be_updated] + # remove unnecessary updates from list by comparing the target records to the existing ones + self.remove_unnecessary_updates(to_be_updated, identified_records) + + if commit_changes: + self.execute_parent_updates_in_list(to_be_updated, securityMode=self.securityMode, + run_id=self.run_id, unique_names=unique_names) + self.execute_inserts_in_list( + to_be_inserted, self.securityMode, self.run_id, unique_names=unique_names) + self.execute_updates_in_list( + to_be_updated, self.securityMode, self.run_id, unique_names=unique_names) + + update_cache = UpdateCache() + pending_inserts = update_cache.get_inserts(self.run_id) + if pending_inserts: + Crawler.inform_about_pending_changes( + pending_inserts, self.run_id, self.crawled_directory) + + pending_updates = update_cache.get_updates(self.run_id) + if pending_updates: + Crawler.inform_about_pending_changes( + pending_updates, self.run_id, self.crawled_directory) + + return (to_be_inserted, to_be_updated) + + @staticmethod + def inform_about_pending_changes(pending_changes, run_id, path, inserts=False): + # Sending an Email with a link to a form to authorize updates is + # only done in SSS mode + + if "SHARED_DIR" in os.environ: + filename = OldCrawler.save_form( + [el[3] for el in pending_changes], path, run_id) + OldCrawler.send_mail([el[3] for el in pending_changes], filename) + + for i, el in enumerate(pending_changes): + + logger.debug( + """ +UNAUTHORIZED UPDATE ({} of {}): +____________________\n""".format(i + 1, len(pending_changes)) + str(el[3])) + logger.info("There were unauthorized changes (see above). 
An " + "email was sent to the curator.\n" + "You can authorize the " + + ("inserts" if inserts else "updates") + + " by invoking the crawler" + " with the run id: {rid}\n".format(rid=run_id)) + + @staticmethod + def debug_build_usage_tree(converter: Converter): + res: Dict[str, Dict[str, Any]] = { + converter.name: { + "usage": ", ".join(converter.metadata["usage"]), + "subtree": {} + } + } + + for subconv in converter.converters: + d = Crawler.debug_build_usage_tree(subconv) + k = list(d.keys()) + if len(k) != 1: + raise RuntimeError( + "Unkonwn error during building of usage tree.") + res[converter.name]["subtree"][k[0]] = d[k[0]] + return res + + def save_debug_data(self, filename: str): + paths: Dict[str, Union[dict, list]] = dict() + + def flatten_debug_info(key): + mod_info = self.debug_metadata[key] + paths[key] = dict() + for record_name in mod_info: + if key == "provenance": + paths[key][record_name] = dict() + for prop_name in mod_info[record_name]: + paths[key][record_name][prop_name] = { + "structure_elements_path": "/".join( + mod_info[record_name][prop_name][0]), + "converters_path": "/".join( + mod_info[record_name][prop_name][1])} + elif key == "usage": + paths[key][record_name] = ", ".join(mod_info[record_name]) + for key in ("provenance", "usage"): + flatten_debug_info(key) + + paths["converters_usage"] = [self.debug_build_usage_tree( + cv) for cv in self.debug_converters] + + with open(filename, "w") as f: + f.write(yaml.dump(paths, sort_keys=False)) + + def _crawl(self, items: List[StructureElement], + local_converters: List[Converter], + generalStore: GeneralStore, + recordStore: RecordStore, + structure_elements_path: List[str], converters_path: List[str]): + """ + Crawl a list of StructureElements and apply any matching converters. + + items: structure_elements (e.g. files and folders on one level on the hierarchy) + local_converters: locally defined converters for + treating structure elements. A locally defined converter could be + one that is only valid for a specific subtree of the originally + cralwed StructureElement structure. + generalStore and recordStore: This recursion of the crawl function should only operate on copies of the + global stores of the Crawler object. 
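+
+        structure_elements_path and converters_path: lists of the element and
+        converter names from the root of the tree down to the current position.
+        They are used to build the path that is stored for a matched structure
+        element and to record provenance information when debug mode is enabled.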
+ """ + for element in items: + for converter in local_converters: + + # type is something like "matches files", replace isinstance with "type_matches" + # match function tests regexp for example + if (converter.typecheck(element) and + converter.match(element) is not None): + generalStore_copy = generalStore.create_scoped_copy() + recordStore_copy = recordStore.create_scoped_copy() + + # Create an entry for this matched structure element: + generalStore_copy[converter.name] = ( + os.path.join(*(structure_elements_path + [element.get_name()]))) + + # extracts values from structure element and stores them in the + # variable store + converter.create_values(generalStore_copy, element) + + keys_modified = converter.create_records( + generalStore_copy, recordStore_copy, element) + + children = converter.create_children( + generalStore_copy, element) + if self.debug: + # add provenance information for each varaible + self.debug_tree[str(element)] = ( + generalStore_copy.get_storage(), recordStore_copy.get_storage()) + self.debug_metadata["copied"][str(element)] = ( + generalStore_copy.get_dict_copied(), + recordStore_copy.get_dict_copied()) + self.debug_metadata["usage"][str(element)].add( + "/".join(converters_path + [converter.name])) + mod_info = self.debug_metadata["provenance"] + for record_name, prop_name in keys_modified: + # TODO: check + internal_id = recordStore_copy.get_internal_id( + record_name) + record_identifier = record_name + \ + "_" + str(internal_id) + converter.metadata["usage"].add(record_identifier) + mod_info[record_identifier][prop_name] = ( + structure_elements_path + [element.get_name()], + converters_path + [converter.name]) + + self._crawl(children, converter.converters, + generalStore_copy, recordStore_copy, + structure_elements_path + [element.get_name()], + converters_path + [converter.name]) + # if the crawler is running out of scope, copy all records in + # the recordStore, that were created in this scope + # to the general update container. + scoped_records = recordStore.get_records_current_scope() + for record in scoped_records: + self.target_data.append(record) + + # TODO: the scoped variables should be cleaned up as soon if the variables + # are no longer in the current scope. This can be implemented as follows, + # but this breaks the test "test_record_structure_generation", because + # some debug info is also deleted. This implementation can be used as soon + # as the remaining problems with the debug_tree are fixed. 
+ # Delete the variables that are no longer needed: + # scoped_names = recordStore.get_names_current_scope() + # for name in scoped_names: + # del recordStore[name] + # del generalStore[name] + + return self.target_data + + +def crawler_main(crawled_directory_path: str, + cfood_file_name: str, + identifiables_definition_file: str = None, + debug: bool = False, + provenance_file: str = None, + dry_run: bool = False, + prefix: str = "", + securityMode: int = SecurityMode.UPDATE, + unique_names=True, + ): + """ + + Parameters + ---------- + crawled_directory_path : str + path to be crawled + cfood_file_name : str + filename of the cfood to be used + identifiables_definition_file : str + filename of an identifiable definition yaml file + debug : bool + whether or not to run in debug mode + provenance_file : str + provenance information will be stored in a file with given filename + dry_run : bool + do not commit any chnages to the server + prefix : str + remove the given prefix from file paths + securityMode : int + securityMode of Crawler + unique_names : bool + whether or not to update or insert entities inspite of name conflicts + + Returns + ------- + return_value : int + 0 if successful + """ + crawler = Crawler(debug=debug, securityMode=securityMode) + crawler.crawl_directory(crawled_directory_path, cfood_file_name) + if provenance_file is not None: + crawler.save_debug_data(provenance_file) + + if identifiables_definition_file is not None: + + ident = CaosDBIdentifiableAdapter() + ident.load_from_yaml_definition(identifiables_definition_file) + crawler.identifiableAdapter = ident + + if dry_run: + ins, upd = crawler.synchronize(commit_changes=False) + inserts = [str(i) for i in ins] + updates = [str(i) for i in upd] + with open("dry.yml", "w") as f: + f.write(yaml.dump({ + "insert": inserts, + "update": updates})) + else: + rtsfinder = dict() + for elem in crawler.target_data: + if isinstance(elem, db.File): + # correct the file path: + # elem.file = os.path.join(args.path, elem.file) + if prefix is None: + raise RuntimeError( + "No prefix set. Prefix must be set if files are used.") + if elem.path.startswith(prefix): + elem.path = elem.path[len(prefix):] + elem.file = None + # TODO: as long as the new file backend is not finished + # we are using the loadFiles function to insert symlinks. + # Therefore, I am setting the files to None here. + # Otherwise, the symlinks in the database would be replaced + # by uploads of the files which we currently do not want to happen. + + # Check whether all needed RecordTypes exist: + if len(elem.parents) > 0: + for parent in elem.parents: + if parent.name in rtsfinder: + continue + + rt = db.RecordType(name=parent.name) + try: + rt.retrieve() + rtsfinder[parent.name] = True + except db.TransactionError: + rtsfinder[parent.name] = False + notfound = [k for k, v in rtsfinder.items() if not v] + if len(notfound) > 0: + raise RuntimeError("Missing RecordTypes: {}". + format(", ".join(notfound))) + + crawler.synchronize(commit_changes=True, unique_names=unique_names) + return 0 + + +def parse_args(): + parser = argparse.ArgumentParser(description=__doc__, + formatter_class=RawTextHelpFormatter) + parser.add_argument("cfood_file_name", + help="Path name of the cfood yaml file to be used.") + parser.add_argument("--provenance", required=False, + help="Path name of the provenance yaml file. 
" + "This file will only be generated if this option is set.") + parser.add_argument("--debug", required=False, action="store_true", + help="Path name of the cfood yaml file to be used.") + parser.add_argument("crawled_directory_path", + help="The subtree of files below the given path will " + "be considered. Use '/' for everything.") + parser.add_argument("-s", "--security-mode", choices=["retrieve", "insert", "update"], + default="retrieve", + help="Determines whether entities may only be read from the server, or " + "whether inserts or even updates may be done.") + parser.add_argument("-n", "--dry-run", action="store_true", + help="Create two files dry.yml to show" + "what would actually be committed without doing the synchronization.") + + # TODO: load identifiables is a dirty implementation currently + parser.add_argument("-i", "--load-identifiables", + help="Load identifiables from the given yaml file.") + parser.add_argument("-u", "--unique-names", + help="Insert or updates entities even if name conflicts exist.") + parser.add_argument("-p", "--prefix", + help="Remove the given prefix from the paths " + "of all file objects.") + + return parser.parse_args() + + +def main(): + args = parse_args() + + conlogger = logging.getLogger("connection") + conlogger.setLevel(level=logging.ERROR) + + # logging config for local execution + logger.addHandler(logging.StreamHandler(sys.stdout)) + if args.debug: + logger.setLevel(logging.DEBUG) + else: + logger.setLevel(logging.INFO) + + sys.exit(crawler_main( + crawled_directory_path=args.crawled_directory_path, + cfood_file_name=args.cfood_file_name, + identifiables_definition_file=args.load_identifiables, + debug=args.debug, + provenance_file=args.provenance, + dry_run=args.dry_run, + prefix=args.prefix, + securityMode={"retrieve": SecurityMode.RETRIEVE, + "insert": SecurityMode.INSERT, + "update": SecurityMode.UPDATE}[args.security_mode], + unique_names=args.unique_names, + )) + + +if __name__ == "__main__": + main() diff --git a/src/caoscrawler/extension-converters-config-schema.yml b/src/caoscrawler/extension-converters-config-schema.yml new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/src/caoscrawler/identifiable_adapters.py b/src/caoscrawler/identifiable_adapters.py new file mode 100644 index 0000000000000000000000000000000000000000..d4c2b1d04316946dc28fec15489e0dc390cb9dd3 --- /dev/null +++ b/src/caoscrawler/identifiable_adapters.py @@ -0,0 +1,480 @@ +#!/usr/bin/env python3 +# encoding: utf-8 +# +# ** header v3.0 +# This file is a part of the CaosDB Project. +# +# Copyright (C) 2021 Henrik tom Wörden +# 2021 Alexander Schlemmer +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <https://www.gnu.org/licenses/>. 
+# +# ** end header +# + +import yaml + +from datetime import datetime +import caosdb as db +import logging +from abc import abstractmethod, ABCMeta +from .utils import has_parent +logger = logging.getLogger(__name__) + + +def convert_value(value): + """ Returns a string representation of the value that is suitable + to be used in the query + looking for the identified record. + + Parameters + ---------- + value : The property of which the value shall be returned. + + Returns + ------- + out : the string reprensentation of the value + + """ + + if isinstance(value, db.Entity): + return str(value.id) + elif isinstance(value, datetime): + return value.isoformat() + elif type(value) == str: + # replace single quotes, otherwise they may break the queries + return value.replace("\'", "\\'") + else: + return f"{value}" + + +class IdentifiableAdapter(metaclass=ABCMeta): + """ + Base class for identifiable adapters. + + Some terms: + - Registered identifiable is the definition of an identifiable which is: + - A record type as the parent + - A list of properties + - A list of referenced by statements + + - Identifiable is the concrete identifiable, e.g. the Record based on + the registered identifiable with all the values filled in. + + - Identified record is the result of retrieving a record based on the + identifiable from the database. + + General question to clarify: + Do we want to support multiple identifiables per RecordType? + Current implementation supports only one identifiable per RecordType. + + The list of referenced by statements is currently not implemented. + + The IdentifiableAdapter can be used to retrieve the three above mentioned objects (registred + identifiabel, identifiable and identified record) for a Record. + """ + + @staticmethod + def create_query_for_identifiable(ident: db.Record): + """ + This function is taken from the old crawler: + caosdb-advanced-user-tools/src/caosadvancedtools/crawler.py + + uses the properties of ident to create a query that can determine + whether the required record already exists. + """ + + if len(ident.parents) != 1: + raise RuntimeError( + "Multiple parents for identifiables not supported.") + + query_string = "FIND Record " + ident.get_parents()[0].name + query_string += " WITH " + + if ident.name is None and len(ident.get_properties()) == 0: + raise ValueError( + "The identifiable must have features to identify it.") + + if ident.name is not None: + query_string += "name='{}'".format(ident.name) + if len(ident.get_properties()) > 0: + query_string += " AND " + + query_string += IdentifiableAdapter.create_property_query(ident) + return query_string + + @staticmethod + def create_property_query(entity: db.Entity): + query_string = "" + for p in entity.get_properties(): + if p.value is None: + query_string += "'" + p.name + "' IS NULL AND " + elif isinstance(p.value, list): + for v in p.value: + query_string += ("'" + p.name + "'='" + + convert_value(v) + "' AND ") + + # TODO: (for review) + # This code would allow for more complex identifiables with + # subproperties being checked directly. + # we currently do not need them and they could introduce + # problems in the local caching mechanism. + # However, it could be discussed to implement a similar mechanism. 
+ # elif isinstance(p.value, db.Entity): + # query_string += ("'" + p.name + "' WITH (" + + # IdentifiableAdapter.create_property_query(p.value) + + # ") AND ") + else: + query_string += ("'" + p.name + "'='" + + convert_value(p.value) + "' AND ") + # remove the last AND + return query_string[:-4] + + @abstractmethod + def get_registered_identifiable(self, record: db.Record): + """ + Check whether an identifiable is registered for this record and return its definition. + If there is no identifiable registered, return None. + """ + pass + + @abstractmethod + def resolve_reference(self, record: db.Record): + pass + + @abstractmethod + def get_file(self, identifiable: db.File): + """ + Retrieve the file object for a (File) identifiable. + """ + pass + + def get_identifiable_for_file(self, record: db.File): + """ + Retrieve an identifiable for a file. + + Currently an identifiable for a file ist just a File object + with a specific path. In the future, this could be extended + to allow for names, parents and custom properties. + """ + identifiable = db.File() + identifiable.path = record.path + return identifiable + + def get_identifiable(self, record: db.Record): + """ + retrieve the registred identifiable and fill the property values to create an + identifiable + """ + + if record.role == "File": + return self.get_identifiable_for_file(record) + + registered_identifiable = self.get_registered_identifiable(record) + + if registered_identifiable is None: + return None + + identifiable = db.Record(name=record.name) + if len(registered_identifiable.parents) != 1: + raise RuntimeError("Multiple parents for identifiables" + "not supported.") + identifiable.add_parent(registered_identifiable.parents[0]) + property_name_list_A = [] + property_name_list_B = [] + + # fill the values: + for prop in registered_identifiable.properties: + if prop.name == "name": + # The name can be an identifiable, but it isn't a property + continue + # problem: what happens with multi properties? + # case A: in the registered identifiable + # case B: in the identifiable + + record_prop = record.get_property(prop.name) + if record_prop is None: + # TODO: how to handle missing values in identifiables + # raise an exception? + raise NotImplementedError( + f"RECORD\n{record}\nPROPERTY\n{prop.name}" + ) + newval = record_prop.value + if isinstance(record_prop.value, db.Entity): + newval = self.resolve_reference(record_prop.value) + elif isinstance(record_prop.value, list): + newval = list() + for element in record_prop.value: + if isinstance(element, db.Entity): + newval.append(self.resolve_reference(element)) + else: + newval.append(element) + record_prop_new = db.Property(name=record_prop.name, + id=record_prop.id, + description=record_prop.description, + datatype=record_prop.datatype, + value=newval, + unit=record_prop.unit) + identifiable.add_property(record_prop_new) + property_name_list_A.append(prop.name) + + # check for multi properties in the record: + for prop in property_name_list_A: + property_name_list_B.append(prop) + if (len(set(property_name_list_B)) != len(property_name_list_B) or len( + set(property_name_list_A)) != len(property_name_list_A)): + raise RuntimeError( + "Multi properties used in identifiables can cause unpredictable results.") + + return identifiable + + @abstractmethod + def retrieve_identified_record_for_identifiable(self, identifiable: db.Record): + """ + Retrieve identifiable record for a given identifiable. 
+ + This function will return None if there is either no identifiable registered + or no corresponding identified record in the database for a given record. + + Warning: this function is not expected to work correctly for file identifiables. + """ + pass + + # TODO: remove side effect + # TODO: use ID if record has one? + def retrieve_identified_record_for_record(self, record: db.Record): + """ + This function combines all functionality of the IdentifierAdapter by + returning the identifiable after having checked for an appropriate + registered identifiable. + + In case there was no appropriate registered identifiable or no identifiable could + be found return value is None. + """ + identifiable = self.get_identifiable(record) + + if identifiable is None: + return None + + if identifiable.role == "File": + return self.get_file(identifiable) + + return self.retrieve_identified_record_for_identifiable(identifiable) + + +class LocalStorageIdentifiableAdapter(IdentifiableAdapter): + """ + Identifiable adapter which can be used for unit tests. + """ + + def __init__(self): + self._registered_identifiables = dict() + self._records = [] + + def register_identifiable(self, name: str, definition: db.RecordType): + self._registered_identifiables[name] = definition + + def get_records(self): + return self._records + + def get_file(self, identifiable: db.File): + """ + Just look in records for a file with the same path. + """ + candidates = [] + for record in self._records: + if record.role == "File" and record.path == identifiable.path: + candidates.append(record) + if len(candidates) > 1: + raise RuntimeError("Identifiable was not defined unambigiously.") + if len(candidates) == 0: + return None + return candidates[0] + + def store_state(self, filename): + with open(filename, "w") as f: + f.write(db.common.utils.xml2str( + db.Container().extend(self._records).to_xml())) + + def restore_state(self, filename): + with open(filename, "r") as f: + self._records = db.Container().from_xml(f.read()) + + # TODO: move to super class? + def is_identifiable_for_record(self, registered_identifiable: db.RecordType, record: db.Record): + """ + Check whether this registered_identifiable is an identifiable for the record. + + That means: + - The properties of the registered_identifiable are a subset of the properties of record. + - One of the parents of record is the parent of registered_identifiable. + + Return True in that case and False otherwise. + """ + if len(registered_identifiable.parents) != 1: + raise RuntimeError( + "Multiple parents for identifiables not supported.") + + if not has_parent(record, registered_identifiable.parents[0].name): + return False + + for prop in registered_identifiable.properties: + if record.get_property(prop.name) is None: + return False + return True + + def get_registered_identifiable(self, record: db.Record): + identifiable_candidates = [] + for _, definition in self._registered_identifiables.items(): + if self.is_identifiable_for_record(definition, record): + identifiable_candidates.append(definition) + if len(identifiable_candidates) > 1: + raise RuntimeError( + "Multiple candidates for an identifiable found.") + if len(identifiable_candidates) == 0: + return None + return identifiable_candidates[0] + + def check_record(self, record: db.Record, identifiable: db.Record): + """ + Check for a record from the local storage (named "record") if it is + the identified record for an identifiable which was created by + a run of the crawler. 
+ + Naming of the parameters could be confusing: + record is the record from the local database to check against. + identifiable is the record that was created during the crawler run. + """ + if len(identifiable.parents) != 1: + raise RuntimeError( + "Multiple parents for identifiables not supported.") + if not has_parent(record, identifiable.parents[0].name): + return False + for prop in identifiable.properties: + prop_record = record.get_property(prop.name) + if prop_record is None: + return False + + # if prop is an entity, it needs to be resolved first. + # there are two different cases: + # a) prop_record.value has a registered identifiable: + # in this case, fetch the identifiable and set the value accordingly + if isinstance(prop.value, db.Entity): # lists are not checked here + registered = self.get_registered_identifiable(prop.value) + + if registered is None: + raise NotImplementedError("Non-identifiable references cannot" + " be used as properties in identifiables.") + + raise RuntimeError("The identifiable which is used as property" + " here has to be inserted first.") + + if prop.value != prop_record.value: + return False + return True + + def retrieve_identified_record_for_identifiable(self, identifiable: db.Record): + candidates = [] + for record in self._records: + if self.check_record(record, identifiable): + candidates.append(record) + if len(candidates) > 1: + raise RuntimeError( + f"Identifiable was not defined unambigiously. Possible candidates are {candidates}") + if len(candidates) == 0: + return None + return candidates[0] + + def resolve_reference(self, value: db.Record): + if self.get_registered_identifiable(value) is None: + raise NotImplementedError("Non-identifiable references cannot" + " be used as properties in identifiables.") + # TODO: just resolve the entity + + value_identifiable = self.retrieve_identified_record_for_record(value) + if value_identifiable is None: + raise RuntimeError("The identifiable which is used as property" + " here has to be inserted first.") + + if value_identifiable.id is None: + raise RuntimeError("The entity has not been assigned an ID.") + + return value_identifiable.id + + +class CaosDBIdentifiableAdapter(IdentifiableAdapter): + """ + Identifiable adapter which can be used for production. + """ + + # TODO: don't store registered identifiables locally + + def __init__(self): + self._registered_identifiables = dict() + + def load_from_yaml_definition(self, path: str): + """Load identifiables defined in a yaml file""" + with open(path, 'r') as yaml_f: + identifiable_data = yaml.safe_load(yaml_f) + + for key, value in identifiable_data.items(): + rt = db.RecordType().add_parent(key) + for prop_name in value: + rt.add_property(name=prop_name) + self.register_identifiable(key, rt) + + def register_identifiable(self, name: str, definition: db.RecordType): + self._registered_identifiables[name] = definition + + def get_file(self, identifiable: db.File): + if identifiable.path is None: + raise RuntimeError("Path must not be None for File retrieval.") + candidates = db.execute_query("FIND File which is stored at {}".format( + identifiable.path)) + if len(candidates) > 1: + raise RuntimeError("Identifiable was not defined unambigiously.") + if len(candidates) == 0: + return None + return candidates[0] + + def get_registered_identifiable(self, record: db.Record): + """ + returns the registred identifiable for the given Record + + It is assumed, that there is exactly one identifiable for each RecordType. 
Only the first + parent of the given Record is considered; others are ignored + """ + rt_name = record.parents[0].name + for name, definition in self._registered_identifiables.items(): + if definition.parents[0].name.lower() == rt_name.lower(): + return definition + + def resolve_reference(self, record: db.Record): + """ + Current implementation just sets the id for this record + as a value. It needs to be verified that references all contain an ID. + """ + if record.id is None: + return record + return record.id + + def retrieve_identified_record_for_identifiable(self, identifiable: db.Record): + query_string = self.create_query_for_identifiable(identifiable) + candidates = db.execute_query(query_string) + if len(candidates) > 1: + raise RuntimeError( + f"Identifiable was not defined unambigiously.\n{query_string}\nReturned the following {candidates}.") + if len(candidates) == 0: + return None + return candidates[0] diff --git a/src/caoscrawler/identified_cache.py b/src/caoscrawler/identified_cache.py new file mode 100644 index 0000000000000000000000000000000000000000..0b9d7a47bdecc4094edb1296f4c04dfa083a2436 --- /dev/null +++ b/src/caoscrawler/identified_cache.py @@ -0,0 +1,89 @@ +#!/usr/bin/env python3 +# encoding: utf-8 +# +# ** header v3.0 +# This file is a part of the CaosDB Project. +# +# Copyright (C) 2021 Indiscale GmbH <info@indiscale.com> +# Copyright (C) 2021 Henrik tom Wörden <h.tomwoerden@indiscale.com> +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <https://www.gnu.org/licenses/>. +# +# ** end header +# + +""" +stores identified records and is able to detect duplicates +""" + +import caosdb as db + +from hashlib import sha256 + + +def _create_hashable_string(identifiable: db.Record): + """ + creates a string from the attributes of an identifiable that can be hashed + """ + if identifiable.role == "File": + # Special treatment for files: + return "P<>N<>{}:{}".format("path", identifiable.path) + if len(identifiable.parents) != 1: + # TODO: extend this + # maybe something like this: + # parent_names = ",".join( + # sorted([p.name for p in identifiable.parents]) + raise RuntimeError("Cache entry can only be generated for entities with 1 parent.") + rec_string = "P<{}>N<{}>".format(identifiable.parents[0].name, identifiable.name) + for pname in sorted([p.name for p in identifiable.properties]): + value = str(identifiable.get_property(pname).value) + + # TODO: (for review) + # This expansion of the hash function was introduced recently + # to allow the special case of Files as values of properties. + # We need to review the completeness of all the cases here, as the cache + # is crucial for correct identification of insertion and updates. 
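+        # Illustrative example (assumed data): for a Record with parent
+        # "Experiment", no name, and properties date="2022-02-01" and
+        # sample=<Entity with id 123>, the resulting string would look like
+        # "P<Experiment>N<None>date:2022-02-01sample:123".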
+ if isinstance(identifiable.get_property(pname).value, db.File): + value = str(identifiable.get_property(pname).value.path) + elif isinstance(identifiable.get_property(pname).value, db.Entity): + value = str(identifiable.get_property(pname).value.id) + elif isinstance(identifiable.get_property(pname).value, list): + tmplist = [] + for val in identifiable.get_property(pname).value: + if isinstance(val, db.Entity): + tmplist.append(val.id) + else: + tmplist.append(val) + value = str(tmplist) + + rec_string += "{}:".format(pname) + value + return rec_string + + +def _create_hash(identifiable: db.Record) -> str: + return sha256(_create_hashable_string(identifiable).encode('utf-8')).hexdigest() + + +class IdentifiedCache(object): + def __init__(self): + self._cache = {} + + def __contains__(self, identifiable: db.Record): + return _create_hash(identifiable) in self._cache + + def __getitem__(self, identifiable: db.Record): + return self._cache[_create_hash(identifiable)] + + def add(self, record: db.Record, identifiable: db.Record): + self._cache[_create_hash(identifiable)] = record diff --git a/src/caoscrawler/macros/__init__.py b/src/caoscrawler/macros/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..0acfb1763039a3bb800bbf0e26d6940b49d045cf --- /dev/null +++ b/src/caoscrawler/macros/__init__.py @@ -0,0 +1 @@ +from .macro_yaml_object import defmacro_constructor, macro_constructor diff --git a/src/caoscrawler/macros/macro_yaml_object.py b/src/caoscrawler/macros/macro_yaml_object.py new file mode 100644 index 0000000000000000000000000000000000000000..2849986e6deb5cb2cba9e45516e6ce8e1a93dfa0 --- /dev/null +++ b/src/caoscrawler/macros/macro_yaml_object.py @@ -0,0 +1,155 @@ +#!/usr/bin/env python3 +# encoding: utf-8 +# +# ** header v3.0 +# This file is a part of the CaosDB Project. +# +# Copyright (C) 2022 Alexander Schlemmer +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <https://www.gnu.org/licenses/>. +# +# ** end header +# + +# Function to expand a macro in yaml +# A. Schlemmer, 05/2022 + +from dataclasses import dataclass +from typing import Any, Dict +from copy import deepcopy +from string import Template + + +@dataclass +class MacroDefinition: + """ + Stores a macro definition. + name: Name of the macro + params: variables and default values to be substituted in keys or values + definition: A dictionary that will be substituted including parameters + """ + name: str + params: Dict[str, Any] + definition: Any + + +# This dictionary stores the macro definitions +macro_store: Dict[str, MacroDefinition] = dict() + + +def substitute(propvalue, values: dict): + """ + Substitution of variables in strings using the variable substitution + library from python's standard library. + """ + propvalue_template = Template(propvalue) + return propvalue_template.safe_substitute(**values) + + +def substitute_dict(sourced: Dict[str, Any], values: Dict[str, Any]): + """ + Create a copy of sourced. 
+ Afterwards recursively do variable substitution on all keys and values. + """ + d = deepcopy(sourced) + # Changes in keys: + replace: Dict[str, str] = dict() + for k in d: + replacement = substitute(k, values) + if replacement != k: + replace[k] = replacement + for k, v in replace.items(): + d[v] = d[k] + del d[k] + # Changes in values: + for k, v in d.items(): + if isinstance(v, str): + d[k] = substitute(v, values) + elif isinstance(v, list): + subst_list = list() + for i in d[k]: + if isinstance(i, str): + subst_list.append(substitute(i, values)) + elif isinstance(i, dict): + subst_list.append(substitute_dict(i, values)) + else: + subst_list.append(i) + d[k] = subst_list + elif isinstance(v, dict): + d[k] = substitute_dict(v, values) + else: + pass + return d + + +def defmacro_constructor(loader, node): + """ + Function for registering macros in yaml files. + + It can be registered in pyaml using: + yaml.SafeLoader.add_constructor("!defmacro", defmacro_constructor) + """ + + value = loader.construct_mapping(node, deep=True) + params = {} + if "params" in value: + params = value["params"] + macro = MacroDefinition( + value["name"], params, + value["definition"]) + macro_store[macro.name] = macro + return {} + + +def macro_constructor(loader, node): + """ + Function for substituting macros in yaml files. + + It can be registered in pyaml using: + yaml.SafeLoader.add_constructor("!macro", macro_constructor) + """ + res = dict() + value = loader.construct_mapping(node, deep=True) + for name, params_setter in value.items(): + if name in macro_store: + # If params_setter is a list, run this for every element: + if params_setter is not None and isinstance(params_setter, list): + for el in params_setter: + macro = macro_store[name] + params = deepcopy(macro.params) + if el is not None: + if isinstance(el, dict): + params.update(el) + else: + raise RuntimeError("params type not supported") + else: + raise RuntimeError("params type must not be None") + definition = substitute_dict(macro.definition, params) + res.update(definition) + else: + # This is just a single macro: + macro = macro_store[name] + params = deepcopy(macro.params) + if params_setter is not None: + if isinstance(params_setter, dict): + params.update(params_setter) + else: + raise RuntimeError("params type not supported") + definition = substitute_dict(macro.definition, params) + res.update(definition) + else: + # If there is no macro with that name, just keep that node: + res[name] = params_setter + + return res diff --git a/src/caoscrawler/stores.py b/src/caoscrawler/stores.py new file mode 100644 index 0000000000000000000000000000000000000000..7ae451994b43a12559dea4ab7f85574c85b2a074 --- /dev/null +++ b/src/caoscrawler/stores.py @@ -0,0 +1,114 @@ +#!/usr/bin/env python3 +# encoding: utf-8 +# +# ** header v3.0 +# This file is a part of the CaosDB Project. +# +# Copyright (C) 2021 Henrik tom Wörden +# 2021 Alexander Schlemmer +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. 
+# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <https://www.gnu.org/licenses/>. +# +# ** end header +# + +from collections import defaultdict + + +class Store(object): + """ + Base class for record store and general store which act as storages for + records and values used during crawling. + """ + + def __init__(self): + self._storage = dict() + # This dict stores whether the corresponding dict item in _storage + # (same key) has been copied from another Store, or was created newly in this store. + self._copied = dict() + # This attribute stores an internal id for being able to distinguish multiple + # ocurrences of the same thing in the store: + self._ids = defaultdict(lambda: 0) + + def __getitem__(self, key: str): + return self._storage[key] + + def __contains__(self, key: str): + return key in self._storage + + def __delitem__(self, key: str): + del self._storage[key] + del self._copied[key] + + def update(self, other: dict): + self._storage.update(other) + for key in other: + self._copied[key] = False + self._ids[key] += 1 + + def __setitem__(self, key: str, value): + self._storage[key] = value + self._copied[key] = False + self._ids[key] += 1 + + def get_storage(self): + return self._storage + + def create_scoped_copy(self): + s_copy = self.__class__() + s_copy._storage = dict(self._storage) + s_copy._copied = {key: True for key in self._copied} + s_copy._ids = self._ids + return s_copy + + def get_dict_copied(self): + """ + Only for debugging. + """ + return self._copied + + def get_internal_id(self, key): + """ + Only for debugging. + """ + return self._ids[key] + + +class GeneralStore(Store): + pass + + +class RecordStore(Store): + + def get_names_current_scope(self): + """ + Return the names of all records that were created in the current scope. + """ + lst = [] + + for key in self._storage: + if not self._copied[key]: + lst.append(key) + return lst + + def get_records_current_scope(self): + """ + Return all records that were created in the current scope. + """ + lst = [] + + for key in self._storage: + if not self._copied[key]: + lst.append(self[key]) + return lst diff --git a/src/caoscrawler/structure_elements.py b/src/caoscrawler/structure_elements.py new file mode 100644 index 0000000000000000000000000000000000000000..01996b4ff3e14a9739857e6e03ceca161300b37e --- /dev/null +++ b/src/caoscrawler/structure_elements.py @@ -0,0 +1,115 @@ +#!/usr/bin/env python3 +# encoding: utf-8 +# +# ** header v3.0 +# This file is a part of the CaosDB Project. +# +# Copyright (C) 2021 Henrik tom Wörden +# 2021 Alexander Schlemmer +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <https://www.gnu.org/licenses/>. 
+# +# ** end header +# + +from typing import Dict + + +class StructureElement(object): + """ base class for elements in the hierarchical data structure """ + + def __init__(self, name): + # Used to store usage information for debugging: + self.metadata: Dict[str, set[str]] = { + "usage": set() + } + + self.name = name + + def __str__(self): + return self.get_name() + + def get_name(self): + return self.name + + +class FileSystemStructureElement(StructureElement): + def __init__(self, name: str, path: str): + super().__init__(name) + self.path = path + + def __str__(self): + class_name_short = str(self.__class__).replace( + "<class \'", "")[:-2] + return "{}: {}, {}".format(class_name_short, self.name, self.path) + + +class Directory(FileSystemStructureElement): + pass + + +class File(FileSystemStructureElement): + pass + + +class JSONFile(File): + pass + + +class DictElement(StructureElement): + def __init__(self, name: str, value): + super().__init__(name) + self.value = value + + +class Dict(StructureElement): + def __init__(self, name: str, value: dict): + super().__init__(name) + self.value = value + + +class DictTextElement(DictElement): + def __init__(self, name: str, value: str): + super().__init__(name, value) + + +class DictIntegerElement(DictElement): + def __init__(self, name: str, value: int): + super().__init__(name, value) + + +class DictBooleanElement(DictElement): + def __init__(self, name: str, value: bool): + super().__init__(name, value) + + +class DictDictElement(Dict, DictElement): + def __init__(self, name: str, value: dict): + DictElement.__init__(self, name, value) + + +class DictListElement(DictElement): + def __init__(self, name: str, value: dict): + super().__init__(name, value) + + +class DictFloatElement(DictElement): + def __init__(self, name: str, value: float): + super().__init__(name, value) + + +class TextElement(StructureElement): + def __init__(self, name: str, value: str): + super().__init__(name) + self.value = value diff --git a/src/caoscrawler/utils.py b/src/caoscrawler/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..61b363099d0892b74e91f257bccb6cc832c3d59f --- /dev/null +++ b/src/caoscrawler/utils.py @@ -0,0 +1,41 @@ +#!/usr/bin/env python3 +# encoding: utf-8 +# +# ** header v3.0 +# This file is a part of the CaosDB Project. +# +# Copyright (C) 2021 Henrik tom Wörden +# 2021 Alexander Schlemmer +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <https://www.gnu.org/licenses/>. +# +# ** end header +# + +# Some utility functions, e.g. for extending pylib. + +import caosdb as db + + +def has_parent(entity: db.Entity, name: str): + """ + A simple check, whether a parent with the given name exists. + + There is a similar, however more complex function in package caosdb. 
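+
+    Example (illustrative)::
+
+        rec = db.Record().add_parent(name="Experiment")
+        has_parent(rec, "Experiment")  # -> True
+        has_parent(rec, "Dataset")     # -> False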
+ """ + + for parent in entity.parents: + if parent.name == name: + return True + return False diff --git a/src/doc/Makefile b/src/doc/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..bea7f860173d930527c84fae43cb7d5bdf6cae97 --- /dev/null +++ b/src/doc/Makefile @@ -0,0 +1,49 @@ +# ** header v3.0 +# This file is a part of the CaosDB Project. +# +# Copyright (C) 2020 IndiScale GmbH <info@indiscale.com> +# Copyright (C) 2020 Daniel Hornung <d.hornung@indiscale.com> +# Copyright (C) 2021 Alexander Schlemmer <alexander.schlemmer@ds.mpg.de> +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <https://www.gnu.org/licenses/>. +# +# ** end header + +# This Makefile is a wrapper for sphinx scripts. +# +# It is based upon the autocreated makefile for Sphinx documentation. + +# You can set these variables from the command line, and also +# from the environment for the first two. +SPHINXOPTS ?= -a +SPHINXBUILD ?= sphinx-build +SPHINXAPIDOC ?= sphinx-apidoc +PY_BASEDIR = ../caoscrawler +SOURCEDIR = . +BUILDDIR = ../../build/doc + + +.PHONY: doc-help Makefile + +# Put it first so that "make" without argument is like "make help". +doc-help: + @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +# Catch-all target: route all unknown targets to Sphinx using the new +# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). +%: Makefile apidoc + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +apidoc: + @$(SPHINXAPIDOC) -o _apidoc --separate $(PY_BASEDIR) diff --git a/src/doc/README_SETUP.md b/src/doc/README_SETUP.md new file mode 100644 index 0000000000000000000000000000000000000000..b6995c9a2d950ecd1e832d5b49dac9ed88a7e455 --- /dev/null +++ b/src/doc/README_SETUP.md @@ -0,0 +1,82 @@ +# Getting started with the CaosDB Crawler # + +## Installation ## + +### Requirements ### + + +### How to install ### + +#### Linux #### + +Make sure that Python (at least version 3.8) and pip is installed, using your system tools and +documentation. + +Then open a terminal and continue in the [Generic installation](#generic-installation) section. + +#### Windows #### + +If a Python distribution is not yet installed, we recommend Anaconda Python, which you can download +for free from [https://www.anaconda.com](https://www.anaconda.com). The "Anaconda Individual Edition" provides most of all +packages you will ever need out of the box. If you prefer, you may also install the leaner +"Miniconda" installer, which allows you to install packages as you need them. + +After installation, open an Anaconda prompt from the Windows menu and continue in the [Generic +installation](#generic-installation) section. + +#### MacOS #### + +If there is no Python 3 installed yet, there are two main ways to +obtain it: Either get the binary package from +[python.org](https://www.python.org/downloads/) or, for advanced +users, install via [Homebrew](https://brew.sh/). 
After installation +from python.org, it is recommended to also update the TLS certificates +for Python (this requires administrator rights for your user): + +```sh +# Replace this with your Python version number: +cd /Applications/Python\ 3.9/ + +# This needs administrator rights: +sudo ./Install\ Certificates.command +``` + +After these steps, you may continue with the [Generic +installation](#generic-installation). + +#### Generic installation #### + +--- + +Obtain the sources from GitLab and install from there (`git` must be installed for +this option): + +```sh +git clone https://gitlab.com/caosdb/caosdb-crawler +cd caosdb-crawler +pip3 install --user . +``` + +**Note**: In the near future, this package will also be made available on PyPi. + +## Configuration ## + + + +## Try it out ## + + + +## Run Unit Tests + +## Documentation ## + +Build documentation in `src/doc` with `make html`. + +### Requirements ### + +- `sphinx` +- `sphinx-autoapi` +- `recommonmark` + +### Troubleshooting ### diff --git a/src/doc/cfood.rst b/src/doc/cfood.rst new file mode 100644 index 0000000000000000000000000000000000000000..677cadc55709c6c25d16ff547b311102ee78699a --- /dev/null +++ b/src/doc/cfood.rst @@ -0,0 +1,149 @@ +CFood-Definition +================ + +The crawler specification is called CFood-definition. It is stored inside a yaml file, or - more precisely - inside of one single or two yaml documents inside a yaml file. + +The specification consists of three separate parts: +#. Metadata and macro definitions +#. Custom converter registrations +#. The converter tree specification + +In the simplest case, there is just one yaml file with just a single document including at least +the converter tree specification (see :ref:`example 1<example_1>`). Additionally the custom converter part may be also included in +this single document (for historical reasons, see :ref:`example 2<example_2>`), but it is recommended to include them in the separate +document together with the metadata and :doc:`macro<macros>` definitions (see :ref:`below<example_4>`). + +If metadata and macro definitions are provided, there **must** be a second document preceeding the +converter tree specification, including these definitions. + +Examples +++++++++ + +A single document with a converter tree specification: + +.. _example_1: +.. code-block:: yaml + + extroot: + type: Directory + match: ^extroot$ + subtree: + DataAnalysis: + type: Directory + match: DataAnalysis + # (...) + + +A single document with a converter tree specification, but also including a custom converters section: + +.. _example_2: +.. code-block:: yaml + + Converters: + CustomConverter_1: + package: mypackage.converters + converter: CustomConverter1 + CustomConverter_2: + package: mypackage.converters + converter: CustomConverter2 + + extroot: + type: Directory + match: ^extroot$ + subtree: + DataAnalysis: + type: Directory + match: DataAnalysis + # (...) + + + +A yaml multi-document, defining metadata and some macros in the first document and declaring +two custom converters in the second document (**not recommended**, see the recommended version :ref:`below<example_4>`). Please note, that two separate yaml documents can be defined using the ``---`` syntax: + + +.. _example_3: +.. code-block:: yaml + + --- + metadata: + name: Datascience CFood + description: CFood for data from the local data science work group + macros: + - !defmacro + name: SimulationDatasetFile + params: + match: null + recordtype: null + nodename: null + definition: + # (...) 
+  ---
+  Converters:
+    CustomConverter_1:
+      package: mypackage.converters
+      converter: CustomConverter1
+    CustomConverter_2:
+      package: mypackage.converters
+      converter: CustomConverter2
+
+  extroot:
+    type: Directory
+    match: ^extroot$
+    subtree:
+      DataAnalysis:
+        type: Directory
+        match: DataAnalysis
+        # (...)
+
+
+The **recommended way** of defining metadata, custom converters, macros and the main cfood
+specification is shown in the following code example:
+
+.. _example_4:
+.. code-block:: yaml
+
+  ---
+  metadata:
+    name: Datascience CFood
+    description: CFood for data from the local data science work group
+    macros:
+    - !defmacro
+      name: SimulationDatasetFile
+      params:
+        match: null
+        recordtype: null
+        nodename: null
+      definition:
+        # (...)
+  Converters:
+    CustomConverter_1:
+      package: mypackage.converters
+      converter: CustomConverter1
+    CustomConverter_2:
+      package: mypackage.converters
+      converter: CustomConverter2
+  ---
+  extroot:
+    type: Directory
+    match: ^extroot$
+    subtree:
+      DataAnalysis:
+        type: Directory
+        match: DataAnalysis
+        # (...)
+
+
+List Mode
+---------
+
+Specifying values of properties can make use of two special characters, in order to automatically
+create lists or multi properties instead of single values:
+
+.. code-block:: yaml
+
+  Experiment1:
+      Measurement: +Measurement  <- Element in List (list is cleared before run)
+                   *Measurement  <- Multi Property (properties are removed before run)
+                    Measurement  <- Overwrite
diff --git a/src/doc/concepts.rst b/src/doc/concepts.rst
new file mode 100644
index 0000000000000000000000000000000000000000..c0f21cbaa322caddabed8e045f7b6fc4253d2959
--- /dev/null
+++ b/src/doc/concepts.rst
@@ -0,0 +1,119 @@
+Concepts
+))))))))
+
+Structure Elements
+++++++++++++++++++
+
+The hierarchical structure of the data is assumed to be constituted of a tree of
+StructureElements. The tree is created on the fly by so-called Converters which
+are defined in a yaml file. The tree of StructureElements is a model
+of the existing data. (For example, a tree of Python file objects
+(StructureElements) could represent a file tree that exists on some file server.)
+
+Relevant sources in:
+src/structure_elements.py
+
+Converters
+++++++++++
+
+Converters treat StructureElements and thereby create the StructureElements that
+are the children of the treated StructureElement. Converters therefore create
+the above named tree. The definition of a Converter also contains which
+Converters shall be used to treat the generated child-StructureElements. The
+definition is therefore a tree itself.
+
+See :doc:`converters<converters>` for details.
+
+Relevant sources in:
+src/converters.py
+
+Identifiables
++++++++++++++
+
+Relevant sources in:
+src/identifiable_adapters.py
+
+The Crawler
++++++++++++
+
+The crawler can be considered the main program doing the synchronization in basically two steps:
+
+#. Based on a yaml specification, scan the file system (or other sources) and create a set of CaosDB Entities that are supposed to be inserted or updated in a CaosDB instance.
+#. Compare the current state of the CaosDB instance with the set of CaosDB Entities created in step 1, taking into account the :ref:`registered identifiables<Identifiables>`. Insert or update entities accordingly.
+
+Relevant sources in:
+src/crawl.py
+
+Special Cases
+=============
+
+Variable Precedence
++++++++++++++++++++
+
+Let's assume the following situation:
+
+.. 
code-block:: yaml + + description: + type: DictTextElement + match_value: (?P<description>.*) + match_name: description + + +Making use of the $description variable could refer to two different variables created here: +1. The structure element path. +2. The value of the matched expression. + +The matched expression does take precedence over the structure element path and shadows it. + +Make sure, that if you want to be able to use the structure element path, to give unique names +to the variables like: + +.. code-block:: yaml + + description_text_block: + type: DictTextElement + match_value: (?P<description>.*) + match_name: description + + +Scopes +======== + +Example: + +.. code-block:: yaml + + DicomFile: + type: SimpleDicomFile + match: (?P<filename>.*)\.dicom + records: + DicomRecord: + name: $filename + subtree: # header of dicom file + PatientID: + type: DicomHeaderElement + match_name: PatientName + match_value: (?P<patient>.*) + records: + Patient: + name: $patient + dicom_name: $filename # $filename is in same scope! + ExperimentFile: + type: MarkdownFile + match: ^readme.md$ + records: + Experiment: + dicom_name: $filename # does NOT work, because $filename is out of scope! + + +# can variables be used within regexp? + + +File Objects +============ diff --git a/src/doc/conf.py b/src/doc/conf.py new file mode 100644 index 0000000000000000000000000000000000000000..30ce670eb8685e9701eeeb59bf22451a21fb16b9 --- /dev/null +++ b/src/doc/conf.py @@ -0,0 +1,218 @@ +# -*- coding: utf-8 -*- +# +# Configuration file for the Sphinx documentation builder. +# +# Based on the configuration for caosdb-pylib. +# +# # Copyright (C) 2021 Alexander Schlemmer <alexander.schlemmer@ds.mpg.de> +# +# This file only contains a selection of the most common options. For a full +# list see the documentation: +# https://www.sphinx-doc.org/en/master/usage/configuration.html + +# -- Path setup -------------------------------------------------------------- + +# If extensions (or modules to document with autodoc) are in another directory, add these +# directories to sys.path here. This is particularly necessary if this package is installed at a +# different version, for example via `pip install`. +# +# If the directory is relative to the documentation root, use os.path.abspath to make it absolute, +# like shown here. +# +import os +import sys +sys.path.insert(0, os.path.abspath('..')) + +import sphinx_rtd_theme # noqa: E402 + + +# -- Project information ----------------------------------------------------- + +project = 'caosdb-caoscrawler' +copyright = '2021, MPIDS' +author = 'Alexander Schlemmer' + +# The short X.Y version +version = '0.1' +# The full version, including alpha/beta/rc tags +# release = '0.5.2-rc2' +release = '0.1' + + +# -- General configuration --------------------------------------------------- + +# If your documentation needs a minimal Sphinx version, state it here. +# +# needs_sphinx = '1.0' + +# Add any Sphinx extension module names here, as strings. They can be +# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom +# ones. +extensions = [ + 'sphinx.ext.autodoc', + 'sphinx.ext.autosectionlabel', + 'sphinx.ext.intersphinx', + 'sphinx.ext.napoleon', # For Google style docstrings + "recommonmark", # For markdown files. + "sphinx_rtd_theme", +] + +# Add any paths that contain templates here, relative to this directory. +templates_path = ['_templates'] + +# The suffix(es) of source filenames. 
+# You can specify multiple suffix as a list of string: +source_suffix = ['.rst', '.md'] + +# The master toctree document. +master_doc = 'index' + +# The language for content autogenerated by Sphinx. Refer to documentation +# for a list of supported languages. +# +# This is also used if you do content translation via gettext catalogs. +# Usually you set "language" from the command line for these cases. +language = "en" + +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. +# This pattern also affects html_static_path and html_extra_path. +exclude_patterns = [] + +# The name of the Pygments (syntax highlighting) style to use. +pygments_style = None + + +# -- Options for HTML output ------------------------------------------------- + +# The theme to use for HTML and HTML Help pages. See the documentation for +# a list of builtin themes. +# + +html_theme = "sphinx_rtd_theme" + +# Theme options are theme-specific and customize the look and feel of a theme +# further. For a list of options available for each theme, see the +# documentation. +# +# html_theme_options = {} + +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. They are copied after the builtin static files, +# so a file named "default.css" will overwrite the builtin "default.css". +html_static_path = [] # ['_static'] + +# Custom sidebar templates, must be a dictionary that maps document names +# to template names. +# +# The default sidebars (for documents that don't match any pattern) are +# defined by theme itself. Builtin themes are using these templates by +# default: ``['localtoc.html', 'relations.html', 'sourcelink.html', +# 'searchbox.html']``. +# +# html_sidebars = {} + + +# -- Options for HTMLHelp output --------------------------------------------- + +# Output file base name for HTML help builder. +htmlhelp_basename = 'caosdb-caoscrawlerdoc' + + +# -- Options for LaTeX output ------------------------------------------------ + +latex_elements = { + # The paper size ('letterpaper' or 'a4paper'). + # + # 'papersize': 'letterpaper', + + # The font size ('10pt', '11pt' or '12pt'). + # + # 'pointsize': '10pt', + + # Additional stuff for the LaTeX preamble. + # + # 'preamble': '', + + # Latex figure (float) alignment + # + # 'figure_align': 'htbp', +} + +# Grouping the document tree into LaTeX files. List of tuples +# (source start file, target name, title, +# author, documentclass [howto, manual, or own class]). +latex_documents = [ + (master_doc, 'caosdb-caoscrawler.tex', 'caosdb-caoscrawler Documentation', + 'MPIDS', 'manual'), +] + + +# -- Options for manual page output ------------------------------------------ + +# One entry per manual page. List of tuples +# (source start file, name, description, authors, manual section). +man_pages = [ + (master_doc, 'caosdb-caoscrawler', 'caosdb-caoscrawler documentation', + [author], 1) +] + + +# -- Options for Texinfo output ---------------------------------------------- + +# Grouping the document tree into Texinfo files. List of tuples +# (source start file, target name, title, author, +# dir menu entry, description, category) +texinfo_documents = [ + (master_doc, 'caosdb-caoscrawler', 'caosdb-caoscrawler documentation', + author, 'caosdb-caoscrawler', 'One line description of project.', + 'Miscellaneous'), +] + + +# -- Options for Epub output ------------------------------------------------- + +# Bibliographic Dublin Core info. 
+epub_title = project + +# The unique identifier of the text. This can be a ISBN number +# or the project homepage. +# +# epub_identifier = '' + +# A unique identification for the text. +# +# epub_uid = '' + +# A list of files that should not be packed into the epub file. +epub_exclude_files = ['search.html'] + + +# -- Extension configuration ------------------------------------------------- + +# True to prefix each section label with the name of the document it is in, followed by a colon. For +# example, index:Introduction for a section called Introduction that appears in document +# index.rst. Useful for avoiding ambiguity when the same section heading appears in different +# documents. +# +# Note: This stops "normal" links from working, so it should be kept at False. +# autosectionlabel_prefix_document = True + +# -- Options for intersphinx ------------------------------------------------- + +# https://www.sphinx-doc.org/en/master/usage/extensions/intersphinx.html#confval-intersphinx_mapping +intersphinx_mapping = { + "python": ("https://docs.python.org/", None), + "caosdb-mysqlbackend": ("https://docs.indiscale.com/caosdb-mysqlbackend/", + None), + "caosdb-server": ("https://docs.indiscale.com/caosdb-server/", None), + "caosdb-pylib": ("https://docs.indiscale.com/caosdb-pylib/", None), + "caosdb-advanced-user-tools": ("https://docs.indiscale.com/caosdb-advanced-user-tools/", None), +} + + +# TODO Which options do we want? +autodoc_default_options = { + 'members': None, + 'undoc-members': None, +} diff --git a/src/doc/converters.rst b/src/doc/converters.rst new file mode 100644 index 0000000000000000000000000000000000000000..7ec93535ec41dc211e2fa7ee194b2ecbe1a659fb --- /dev/null +++ b/src/doc/converters.rst @@ -0,0 +1,309 @@ +Converters +)))))))))) + +Converters treat StructureElements and thereby create the StructureElement that +are the children of the treated StructureElement. Converters therefore create +the tree of structure elements. The definition of a Converter also contains what +Converters shall be used to treat the generated child-StructureElements. The +definition is therefore a tree itself. + +Each StructureElement in the tree has a set of data values, i.e a dictionary of +key value pairs. +Some of those values are set due to the kind of StructureElement. For example, +a file could have the file name as such a key value pair: 'filename': <sth>. +Converters may define additional functions that create further values. For +example, a regular expresion could be used to get a date from a file name. + + + + +A converter is defined via a yml file or part of it. The definition states +what kind of StructureElement it treats (typically one). +Also, it defines how children of the current StructureElement are +created and what Converters shall be used to treat those. + +The yaml definition looks like the following: + +TODO: outdated, see cfood-schema.yml + +.. code-block:: yaml + + <NodeName>: + type: <ConverterName> + match: ".*" + records: + Experiment1: + parents: + - Experiment + - Blablabla + date: $DATUM + (...) + Experiment2: + parents: + - Experiment + subtree: + (...) + +The **<NodeName>** is a description of what it represents (e.g. +'experiment-folder') and is used as identifier. + +**<type>** selects the converter that is going to be matched against the current structure +element. 
If the structure element matches (this is a combination of a typecheck and a detailed +match, see :py:class:`~caoscrawler.converters.Converter` for details) the converter is used +to generate records (see :py:meth:`~caoscrawler.converters.Converter.create_records`) and to possibly process a subtree, as defined by the function :func:`caoscrawler.converters.create_children`. + +**records** is a dict of definitions that define the semantic structure +(see details below). + +Subtree contains a list of Converter defnitions that look like the one +described here. + + +Standard Converters ++++++++++++++++++++ + +Directory Converter +=================== + +Simple File Converter +===================== + +Markdown File Converter +======================= + +Dict Converter +============== + +Typical Subtree converters +-------------------------- + +DictBooleanElementConverter +DictFloatElementConverter +DictTextElementConverter +DictIntegerElementConverter +DictListElementConverter +DictDictElementConverter + +YAMLFileConverter +================= + +A specialized Dict Converter for yaml files: Yaml files are opened and the contents are +converted into dictionaries that can be further converted using the typical subtree converters +of dict converter. + +**WARNING**: Currently unfinished implementation. + +JSONFileConverter +================= + + + +TextElementConverter +==================== + +TableConverter +============== + +A generic converter (abstract) for files containing tables. +Currently, there are two specialized implementations for xlsx-files and csv-files. + +All table converters generate a subtree that can be converted with DictDictElementConverters: +For each row in the table a DictDictElement (structure element) is generated. The key of the +element is the row number. The value of the element is a dict containing the mapping of +column names to values of the respective cell. + +Example: + +.. code-block:: yaml + + subtree: + TABLE: + type: CSVTableConverter + match: ^test_table.csv$ + records: + (...) # Records edited for the whole table file + subtree: + ROW: + type: DictDictElement + match_name: .* + match_value: .* + records: + (...) # Records edited for each row + subtree: + COLUMN: + type: DictFloatElement + match_name: measurement # Name of the column in the table file + match_value: (?P<column_value).*) + records: + (...) # Records edited for each cell + + +XLSXTableConverter +================== + +CSVTableConverter +================= + +Custom Converters ++++++++++++++++++ + +It was previously mentioned that it is possible to create custom converters. +These custom converters can be used to integrate arbitrary data extraction and ETL capabilities +into the caosdb-crawler and make these extensions available to any yaml specification. + +The basic syntax for adding a custom converter to a yaml cfood definition file is: + +.. code-block:: yaml + + Converters: + <NameOfTheConverterInYamlFile>: + package: <python>.<module>.<name> + converter: <PythonClassName> + +The Converters-section can be either put into the first or second document of the cfood yaml file. +It can be also part of a single-document yaml cfood file. Please refer to :doc:`the cfood documentation<cfood>` for more details. + +Details: + +- **<NameOfTheConverterInYamlFile>**: This is the name of the converter as it is going to be used in the present yaml file. +- **<python>.<module>.<name>**: The name of the module where the converter class resides. 
+- **<PythonClassName>**: Within this specified module there must be a class inheriting from base class :py:class:`caoscrawler.converters.Converter`. + +The following methods are abstract and need to be overwritten by your custom converter to make it work: + +- :py:meth:`~caoscrawler.converters.Converter.create_children` +- :py:meth:`~caoscrawler.converters.Converter.match` +- :py:meth:`~caoscrawler.converters.Converter.typecheck` + + +Example +======= + +In the following, we will explain the process of adding a custom converter to a yaml file using +a SourceResolver that is able to attach a source element to another entity. + +**Note**: This example might become a standard crawler soon, as part of the scifolder specification. See https://doi.org/10.3390/data5020043 for details. In this documentation example we will, therefore, add it to a package called "scifolder". + +First we will create our package and module structure, which might be: + +.. code-block:: + + scifolder_package/ + README.md + setup.cfg + setup.py + Makefile + tox.ini + src/ + scifolder/ + __init__.py + converters/ + __init__.py + sources.py # <- the actual file containing + # the converter class + doc/ + unittests/ + +Now we need to create a class called "SourceResolver" in the file "sources.py". In this - more advanced - example, we will not inherit our converter directly from :py:class:`~caoscrawler.converters.Converter`, but use :py:class:`~caoscrawler.converters.TextElementConverter`. The latter already implements :py:meth:`~caoscrawler.converters.Converter.match` and :py:meth:`~caoscrawler.converters.Converter.typecheck`, so only an implementation for :py:meth:`~caoscrawler.converters.Converter.create_children` has to be provided by us. +Furthermore we will customize the method :py:meth:`~caoscrawler.converters.Converter.create_records` that allows us to specify a more complex record generation procedure than provided in the standard implementation. One specific limitation of the standard implementation is, that only a fixed +number of records can be generated by the yaml definition. So for any applications - like here - that require an arbitrary number of records to be created, a customized implementation of :py:meth:`~caoscrawler.converters.Converter.create_records` is recommended. +In this context it is recommended to make use of the function :func:`caoscrawler.converters.create_records` that implements creation of record objects from python dictionaries of the same structure +that would be given using a yaml definition. + +.. code-block:: python + + import re + from caoscrawler.stores import GeneralStore, RecordStore + from caoscrawler.converters import TextElementConverter, create_records + from caoscrawler.structure_elements import StructureElement, TextElement + + + class SourceResolver(TextElementConverter): + """ + This resolver uses a source list element (e.g. from the markdown readme file) + to link sources correctly. + """ + + def __init__(self, definition: dict, name: str, + converter_registry: dict): + """ + Initialize a new directory converter. 
+ """ + super().__init__(definition, name, converter_registry) + + def create_children(self, generalStore: GeneralStore, + element: StructureElement): + + # The source resolver does not create children: + + return [] + + def create_records(self, values: GeneralStore, + records: RecordStore, + element: StructureElement, + file_path_prefix): + if not isinstance(element, TextElement): + raise RuntimeError() + + # This function must return a list containing tuples, each one for a modified + # property: (name_of_entity, name_of_property) + keys_modified = [] + + # This is the name of the entity where the source is going to be attached: + attach_to_scientific_activity = self.definition["scientific_activity"] + rec = records[attach_to_scientific_activity] + + # The "source" is a path to a source project, so it should have the form: + # /<Category>/<project>/<scientific_activity>/ + # obtain these information from the structure element: + val = element.value + regexp = (r'/(?P<category>(SimulationData)|(ExperimentalData)|(DataAnalysis))' + '/(?P<project_date>.*?)_(?P<project_identifier>.*)' + '/(?P<date>[0-9]{4,4}-[0-9]{2,2}-[0-9]{2,2})(_(?P<identifier>.*))?/') + + res = re.match(regexp, val) + if res is None: + raise RuntimeError("Source cannot be parsed correctly.") + + # Mapping of categories on the file system to corresponding record types in CaosDB: + cat_map = { + "SimulationData": "Simulation", + "ExperimentalData": "Experiment", + "DataAnalysis": "DataAnalysis"} + linkrt = cat_map[res.group("category")] + + keys_modified.extend(create_records(values, records, { + "Project": { + "date": res.group("project_date"), + "identifier": res.group("project_identifier"), + }, + linkrt: { + "date": res.group("date"), + "identifier": res.group("identifier"), + "project": "$Project" + }, + attach_to_scientific_activity: { + "sources": "+$" + linkrt + }}, file_path_prefix)) + + # Process the records section of the yaml definition: + keys_modified.extend( + super().create_records(values, records, element, file_path_prefix)) + + # The create_records function must return the modified keys to make it compatible + # to the crawler functions: + return keys_modified + + +If the recommended (python) package structure is used, the package containing the converter +definition can just be installed using `pip install .` or `pip install -e .` from the +`scifolder_package` directory. + +The following yaml block will register the converter in a yaml file: + +.. code-block:: yaml + + Converters: + SourceResolver: + package: scifolder.converters.sources + converter: SourceResolver diff --git a/src/doc/index.rst b/src/doc/index.rst new file mode 100644 index 0000000000000000000000000000000000000000..724bcc543dd1cf0b9af451c487b1b3aab7fa95ca --- /dev/null +++ b/src/doc/index.rst @@ -0,0 +1,44 @@ +Crawler 2.0 Documentation +========================= + + +.. toctree:: + :maxdepth: 2 + :caption: Contents: + :hidden: + + Getting started<README_SETUP> + Concepts<concepts> + Converters<converters> + CFoods (Crawler Definitions)<cfood> + Macros<macros> + Tutorials<tutorials/index> + API documentation<_apidoc/modules> + + + +This is the documentation for the crawler (previously known as crawler 2.0) for CaosDB, ``caosdb-crawler``. + +The crawler is the main date integration tool for CaosDB. +Its task is to automatically synchronize data found on file systems or in other +sources of data with the semantic data model of CaosDB. 
+ +More specifically, data that is contained in a hierarchical structure is converted to a data +structure that is consistent with a predefined semantic data model. + +The hierarchical sturcture can be for example a file tree. However it can be +also something different like the contents of a json file or a file tree with +json files. + +This documentation helps you to :doc:`get started<README_SETUP>`, explains the most important +:doc:`concepts<concepts>` and offers a range of :doc:`tutorials<tutorials/index>`. + + +Indices and tables +================== + +* :ref:`genindex` +* :ref:`modindex` +* :ref:`search` + + diff --git a/src/doc/macros.rst b/src/doc/macros.rst new file mode 100644 index 0000000000000000000000000000000000000000..569b8474c98ff8f5f5a4f2eeface10ffc1b7a849 --- /dev/null +++ b/src/doc/macros.rst @@ -0,0 +1,90 @@ +Macros +------ + +Macros highly facilitate the writing of complex :doc:`CFoods<cfood>`. Consider the following prevalent example: + +.. _example_files: +.. code-block:: yaml + + ExperimentalData: + type: Directory + match: ExperimentalData + subtree: + README: + type: SimpleFile + match: ^README.md$ + records: + ReadmeFile: + parents: + - MarkdownFile + role: File + path: $README + file: $README + +This example just inserts a file called ``README.md`` contained in Folder ``ExpreimentalData/`` into CaosDB, assigns the parent (RecordType) ``MarkdownFile`` and allows for later referencing this entity within the cfood. As file objects are created in the cfood specification using the ``records`` section with the special role ``File``, defining and using many files can become very cumbersome and make the cfood file difficult to read. + +The same version using cfood macros could be defined as follows: + +.. _example_files_2: +.. code-block:: yaml + + --- + metadata: + macros: + - !defmacro + name: MarkdownFile + params: + name: null + filename: null + definition: + ${name}_filename: + type: SimpleFile + match: $filename + records: + $name: + parents: + - MarkdownFile + role: File + path: ${name}_filename + file: ${name}_filename + --- + ExperimentalData: + type: Directory + match: ExperimentalData + subtree: !macro + MarkdownFile: + - name: README + filename: ^README.md$ + + +The "MarkdownFile" key and its value will be replaced by everything that is +given below "definition" in the Macro. + + + +Complex Example +=============== + +.. _example_1: +.. 
code-block:: yaml + + macros: + - !defmacro + name: SimulationDatasetFile + params: + match: null + recordtype: null + nodename: null + definition: + $nodename: + match: $match + type: SimpleFile + records: + File: + parents: + - $recordtype + role: File + path: $$$nodename + file: $$$nodename + Simulation: + $recordtype: +$File diff --git a/src/doc/tutorials/index.rst b/src/doc/tutorials/index.rst new file mode 100644 index 0000000000000000000000000000000000000000..1652515968c3b0025a2916604632d57c042f119b --- /dev/null +++ b/src/doc/tutorials/index.rst @@ -0,0 +1,2 @@ +Tutorials ++++++++++ diff --git a/src/newcrawler/__init__.py b/src/newcrawler/__init__.py deleted file mode 100644 index c6a81199838c281938c3b2e0e820212570a43bd7..0000000000000000000000000000000000000000 --- a/src/newcrawler/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from .crawl import * diff --git a/src/newcrawler/crawl.py b/src/newcrawler/crawl.py deleted file mode 100755 index 862600d220010cb6142f0511f1680189fb949a9c..0000000000000000000000000000000000000000 --- a/src/newcrawler/crawl.py +++ /dev/null @@ -1,316 +0,0 @@ -#!/usr/bin/env python -# The prototype for a new crawler for CaosDB -# A. Schlemmer, 06/2021 - -import argparse -import os -import sys -import yaml -import re -import json -import yaml_header_tools -from abc import abstractmethod - - - - - -def match_complete(node: dict): - """Determine whether the match is complete. - - This function checks whether all nodes and subnodes have a value. - - Parameters - ---------- - node : The node to check. - - Returns - ------- - True if the match is complete and False otherwise. - """ - if "value" not in node: - return False - if "children" in node: - return all([match_complete(element) for element in node["children"]]) - return True - -class InformationBackend(object): - @abstractmethod - def check_type(self, current_node, current_element): - return - - @abstractmethod - def list_elements_function(self): - return - - @abstractmethod - def sub_matcher(self, current_node, current_element, subelement): - pass - -class DirectoryInformationBackend(InformationBackend): - def __init__(self, current_dir): - self.current_dir = current_dir - - def add_defaults(self, node: dict): - """ - Return the key from node as subnode setting some important defaults for - the cfood specification. - - Currently this is: - - Creating an "re" (regular expression) from the key, if no re is set. - - Add type "dir" if no type is present. - - Add default case "sensitive" to the node. - - Parameters - ---------- - node : The dictionary containing the subnode as key. - key : The key of the dictionary. - - Returns - ------- - The subnode including the defaults. - """ - - if "re" not in node: - node["re"] = re.escape(node["nodeName"]) - - if "type" not in node: - node["type"] = "dir" - - if "case" not in node: - node["case"] = "sensitive" - - def match_file_object(self, current_node: dict, - current_element: str): - """ - Try to match a filename with the supplied current_node. - - This function only uses the current path name specified by filename. - It does not check whether the file system object behind that path is valid - and matching the type of the current_node. - - Parameters - ---------- - current_node : A dictionary containing the matcher. - filename : A filename to match. - - Returns - ------- - None if the matcher does not match and otherwise a dict with the values of the matcher. 
- """ - - flags = 0 - if current_node["case"] == "insensitive": - flags += re.IGNORECASE - - regexp = current_node["re"] - pattern = re.compile(regexp) - matcher = re.match(pattern, current_element) - - if matcher is None: - return None - - # Value of current_node: - # - Add the numeric groups - # - Add the dictionary groups as well - - valdict = {0: matcher.group()} - for i in range(len(matcher.groups())): - valdict[i+1] = matcher.group(i+1) - for k, v in matcher.groupdict().items(): - valdict[k] = v - - return valdict - - def check_type(self, current_node, current_element): - path = os.path.join(self.current_dir, current_element) - - if current_node["type"] == "dir" and not os.path.isdir(path): - return False - elif current_node["type"] == "file" and os.path.isdir(path): - return False - - return True - - def list_elements_function(self): - return os.listdir(self.current_dir) - - def sub_matcher(self, current_node, current_element, subelement): - path = os.path.join(self.current_dir, current_element) - if current_node["type"] == "dir": - match_current_dir_node(subelement, - DirectoryInformationBackend(path)) - elif current_node["type"] == "file": - if current_node["representer"] == "markdown": - print("MARKDOWN") - match_current_dir_node(subelement, - MarkdownInformationBackend(path)) - else: - raise RuntimeError("Not implemented") - -class MarkdownInformationBackend(InformationBackend): - def __init__(self, filename=None, header=None): - """ - - Parameters - ---------- - filename : str - The filename of the markdown file. If None, header will be used directly. - header : dict - The header dictionary object. - """ - if filename is None and header is None: - raise ValueError("filename and header cannot both be None.") - - if filename is not None: - self.header = yaml_header_tools.get_header_from_file(filename, clean=False) - else: - self.header = header - - def add_defaults(self, node: dict): - if "re" not in node: - node["re"] = ".*" - - if "type" not in node: - node["type"] = "LIST" - - if "case" not in node: - node["case"] = "sensitive" - - def match_file_object(self, current_node: dict, - current_element: str): - """ - Try to match a filename with the supplied current_node. - - This function only uses the current path name specified by filename. - It does not check whether the file system object behind that path is valid - and matching the type of the current_node. - - Parameters - ---------- - current_node : A dictionary containing the matcher. - filename : A filename to match. - - Returns - ------- - None if the matcher does not match and otherwise a dict with the values of the matcher. 
- """ - - if current_node["nodeName"] != current_element: - return None - - flags = 0 - if current_node["case"] == "insensitive": - flags += re.IGNORECASE - - regexp = current_node["re"] - pattern = re.compile(regexp) - matcher = re.match(pattern, self.header[current_element]) - - if matcher is None: - return None - - # Value of current_node: - # - Add the numeric groups - # - Add the dictionary groups as well - - valdict = {0: matcher.group()} - for i in range(len(matcher.groups())): - valdict[i+1] = matcher.group(i+1) - for k, v in matcher.groupdict().items(): - valdict[k] = v - - return valdict - - def list_elements_function(self): - print(list(self.header.keys())) - return self.header - - def check_type(self, current_node, current_element): - if current_node["type"] == "LIST" and not type(self.header[current_element]) == list: - return False - if current_node["type"] == "TEXT" and not type(self.header[current_element]) == str: - return False - return True - - def sub_matcher(self, current_node, current_element, subelement): - print(current_node) - if current_node["type"] == "LIST": - print("sub ok") - match_current_dir_node(subelement, - MarkdownInformationBackend(header=self.header[current_element])) - else: - pass - - -def match_current_dir_node(current_node, information_backend): - """Do the recursive matching in the file tree. - - """ - information_backend.add_defaults(current_node) - - for element in information_backend.list_elements_function(): - if not information_backend.check_type(current_node, element): - continue - - match = information_backend.match_file_object(current_node, element) - if match is not None: - if "value" not in current_node: - current_node["value"] = [] - current_node["value"].append(match) - - if "children" in current_node: - match["children"] = [] - for subelement_name in current_node["children"]: - subelement = current_node["children"][subelement_name].copy() - subelement["nodeName"] = subelement_name - match["children"].append(subelement) - - information_backend.sub_matcher(current_node, element, subelement) - - -def crawl_cfood(dirname: str, - cfood: str): - """ - Crawl a single cfood. - """ - - # Load the cfood from a yaml file: - with open(cfood, "r") as f: - cf = yaml.load(f, Loader=yaml.SafeLoader) - - # Current way of determining the root node: - root_node = cf["root"] - # Assume root to have a single element (for now): - if len(root_node) != 1: - raise ValueError("Only a single cfood root is allowed.") - - root_node_name = list(root_node.keys())[0] - root_node[root_node_name]["nodeName"] = root_node_name - match_current_dir_node(root_node[root_node_name], - DirectoryInformationBackend(dirname)) - - return root_node - - - - -def crawl(dirname: str, - cfoods: list[str]): - """ - Craw a given file hierarchy. - - dirname : the root path of the file tree to be crawled - cfoods : a list of filenames of cfood files - """ - - # simplified for testing: - for cfood in cfoods: - crawl_cfood(dirname, cfood) - -def main(): - crawl(sys.args[1], [sys.args[2]]) - - -if __name__ == "__main__": - main() diff --git a/synchronize.md b/synchronize.md new file mode 100644 index 0000000000000000000000000000000000000000..7d240095a770a33d55370d9e48a1ab89b60c02dd --- /dev/null +++ b/synchronize.md @@ -0,0 +1,34 @@ +# Synchronization + +## Goals +Ideally, with current XML API we only issue two transactions in order to be as atomic as possible: One insert and one update. +( This should also allow to remove all inserted objects if the update fails...) 
+ +## Difficulties + +### Recursive References + +A Record might reference another Record from the list. That referenced Record then has to be identified using the appropriate identifiable before the referencing Record can be inserted or updated. Thus, starting with the leaves of this structure, it needs to be checked for each Record (using its identifiable) whether it already exists. If it does not exist, it can be added to a to_be_inserted list. + +If an identifiable contains a reference to, or shall be referenced by, an object in the to_be_inserted list, then it also does not exist and can be added to the to_be_inserted list. + +### Duplicates +It must not happen that an identifiable is checked twice and added twice (or more times) to the to_be_inserted list (e.g. two Experiments with the same date). Inserting duplicates could be prevented by inserting one Record and then checking the identifiable of the next one before the next insertion. However, this violates the above goal of a single insert transaction. Thus, it is necessary to check whether an identifiable is already in the list without server interaction. + +This should be possible by using a dict with hashes as keys. The hashes can be computed from the identifiables as follows: RT+name+prop1=val+prop2=val+referencedby=A+B+C. Here, references to other objects in the to_be_inserted list (or dict) can be replaced by their hashes. Creating the hash carefully (sorted properties etc.) should make it unique, such that identification is possible without a server check. + + + +## Implementation Sketch +Recursively run through the created objects and check whether the identifiable references, or is referenced by, Records that have not been checked yet. Once the identifiable is checked against the server, the following applies: +a) The identifiable exists on the server: set the id of the Record object and add it to the to_be_updated list (if an update is required). +b) The identifiable does not exist: check the to_be_inserted dict (key is a hash computed as described above, value is the Record object). + 1. The hash exists: reuse the value corresponding to the key. + 2. The hash does not exist: add hash and value (Record object) to the dict. + +Maybe keep another dict that tracks which Record objects are in the to_be_updated dict (id(rec) as key?). + +After treating the leaf Records, Records that could not be checked before can be checked: either referenced Records now have an ID, or they are in the to_be_inserted dict, in which case it is clear that the identifiable at hand does not exist on the server. + +This way, the whole structure can be resolved, except if there are circular dependencies: those can be added fully to the to_be_inserted dict. (???)
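
For illustration, the hash described under "Duplicates" could be built roughly as in the sketch below. This is only a sketch: the crawler's actual implementation is `_create_hashable_string` in `caoscrawler.identified_cache` (exercised by the unit tests further down), while the name `identifiable_hash` and the `known_hashes` argument are made up here, and the `referencedby` part of the scheme is omitted.

```python
import caosdb as db


def identifiable_hash(record: db.Record, known_hashes: dict) -> str:
    """Build a hashable string roughly of the form RT+name+prop1=val+prop2=val.

    `known_hashes` maps the Python id() of not-yet-inserted Records to their
    previously computed hash, so references can be replaced by hashes instead
    of (non-existing) server ids.
    """
    if len(record.parents) != 1:
        raise RuntimeError("The identifiable must have exactly 1 parent (the RecordType).")
    parts = [record.parents[0].name, str(record.name)]
    # Sort the properties so that the result does not depend on their order:
    for prop in sorted(record.properties, key=lambda p: p.name or ""):
        value = prop.value
        if id(value) in known_hashes:
            # Reference to another object from the to_be_inserted dict:
            value = known_hashes[id(value)]
        parts.append(f"{prop.name}={value}")
    return "+".join(parts)
```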
+ diff --git a/tests/scifolder_cfood.yml b/tests/scifolder_cfood.yml deleted file mode 100644 index 048e194b53ff792f18ac5299f0a4476b9c97bf15..0000000000000000000000000000000000000000 --- a/tests/scifolder_cfood.yml +++ /dev/null @@ -1,35 +0,0 @@ - -root: - DataAnalysis: - - children: - project_dir: - re: (?P<date>.*?)_(?P<identifier>.*) - handlers: - - type: identifiable - name: idf_project - - children: - single: - re: (?P<date>.*?)_(?P<identifier>.*) - - children: - - README: - type: file - representer: markdown - case: insensitive - re: README\.md - - children: - description: - type: TEXT - responsible: - type: LIST - children: - person: - type: TEXT - re: (?P<first_name>.+) (?P<last_name>.+) - handlers: - type: identifiable - name: idf_person diff --git a/tests/test_functions.py b/tests/test_functions.py deleted file mode 100644 index a33f0d2182474e960fa42c2343db533c3d21d41f..0000000000000000000000000000000000000000 --- a/tests/test_functions.py +++ /dev/null @@ -1,34 +0,0 @@ -#!/bin/python -# Tests for main functions of crawler -# A. Schlemmer, 07/2021 - -from newcrawler import match_complete - -def test_match_complete(): - node = {"name": "bla"} - assert match_complete(node) == False - - node = {"name": "bla", - "children": [{ - "name": "test", - "value": 234}, { - "name": "test", - "value": 234}]} - assert match_complete(node) == False - - node = {"name": "bla", - "value": "ok", - "children": [{ - "name": "test", - "value": 234}, { - "name": "test", - "value": 234}]} - assert match_complete(node) == True - - node = {"name": "bla", - "value": "ok", - "children": [{ - "name": "test"}, { - "name": "test", - "value": 234}]} - assert match_complete(node) == False diff --git a/tests/test_tool.py b/tests/test_tool.py deleted file mode 100755 index a875305ac65dd043e30d83c215e3b6a5281bfb77..0000000000000000000000000000000000000000 --- a/tests/test_tool.py +++ /dev/null @@ -1,14 +0,0 @@ -#!/bin/python -# Tests for the tool using pytest -# Adapted from check-sfs -# A. Schlemmer, 06/2021 - -from newcrawler import crawl_cfood -from os.path import join, dirname -import yaml - -def test_crawler(): - m = crawl_cfood(join(dirname(__file__), "test_directories/examples_article"), - join(dirname(__file__), "scifolder_cfood.yml")) - print(yaml.dump(m)) - assert False diff --git a/tox.ini b/tox.ini new file mode 100644 index 0000000000000000000000000000000000000000..101904b7de43fba6f04cf65641f555d79b0b080a --- /dev/null +++ b/tox.ini @@ -0,0 +1,15 @@ +[tox] +envlist=py38, py39, py310 +skip_missing_interpreters = true + +[testenv] +deps = . 
+ pytest + pytest-cov + # TODO: Make this f-branch sensitive + git+https://gitlab.indiscale.com/caosdb/src/caosdb-pylib.git@dev + git+https://gitlab.indiscale.com/caosdb/src/caosdb-advanced-user-tools.git@dev +commands= caosdb-crawler --help + py.test --cov=caosdb -vv {posargs} +[flake8] +max-line-length=100 diff --git a/unittests/broken_cfoods/broken1.yml b/unittests/broken_cfoods/broken1.yml new file mode 100644 index 0000000000000000000000000000000000000000..9fd4c52934c56512ada8ea564ccd540e07e25661 --- /dev/null +++ b/unittests/broken_cfoods/broken1.yml @@ -0,0 +1,79 @@ +Definitions: + type: Definitions + #include "description.yml" + +# Converter-Provenance +# DataAnalysis/project_dir/measurement/match/identifier +# Structure-Element-Provenance +# DataAnalysis/2020_SpeedOflight/2020-11-10_kram + +DataAnalysis: # name of the converter + type: Directory_djskfj + match: DataAnalysis + subtree: &template + project_dir: # name of the first subtree element which is a converter + type: Directory + match: (?P<date>.*?)_(?P<identifier>.*) + records: + Project: # this is an identifiable in this case + parents: + - Project # not needed as the name is equivalent + date: $date + identifier: $identifier + + subtree: + measurement: # new name for folders on the 3rd level + type: Directory + match: (?P<date>[0-9]{4,4}-[0-9]{2,2}-[0-9]{2,2})(_(?P<identifier>.*))? + records: + Measurement: + date: $date + identifier: $identifier + project: $Project + subtree: + README: + type: MarkdownFile # this is a subclass of converter File + # function signature: GeneralStore, StructureElement + # preprocessors: custom.caosdb.convert_values + match: ^README\.md$ + # how to make match case insensitive? + subtree: + description: + type: DictTextElement + match_value: (?P<description>.*) + match_name: description + records: + Measurement: + description: $description + responsible_single: + type: DictTextElement + match_name: responsible + match_value: &person_regexp ((?P<first_name>.+) )?(?P<last_name>.+) + records: &responsible_records + Person: + first_name: $first_name + last_name: $last_name + Measurement: # this uses the reference to the above defined record + responsible: +$Person # each record also implicitely creates a variable + # with the same name. The "+" indicates, that + # this will become a list entry in list property + # "responsible" belonging to Measurement. 
+ + responsible_list: + type: DictListElement + match_name: responsible + subtree: + Person: + type: TextElement + match: *person_regexp + records: *responsible_records + +ExperimentalData: # name of the converter + type: Directory + match: ExperimentalData + subtree: *template + +SimulationData: # name of the converter + type: Directory + match: SimulationData + subtree: *template diff --git a/unittests/broken_cfoods/broken_validation_path.yml b/unittests/broken_cfoods/broken_validation_path.yml new file mode 100644 index 0000000000000000000000000000000000000000..a59978764ba1f400c491ecd94cfebedfe92fc4eb --- /dev/null +++ b/unittests/broken_cfoods/broken_validation_path.yml @@ -0,0 +1,4 @@ +BrokenValidationPathTest: + type: JSONFile + match: "(.*)" + validate: ./this-file-does-not-exist.schema.json diff --git a/unittests/cfoods_scalar.yml b/unittests/cfoods_scalar.yml new file mode 100644 index 0000000000000000000000000000000000000000..d0a728c35c27e331114cc5c18ebcfd1aa0905e31 --- /dev/null +++ b/unittests/cfoods_scalar.yml @@ -0,0 +1,14 @@ +# This is a test cfood for: +# https://gitlab.com/caosdb/caosdb-crawler/-/issues/9 + +Data: # name of the converter + type: Directory + match: (.*) + subtree: + DataAnalysis: # name of the converter + type: Directory + match: DataAnalysis + records: + RecordThatGetsParentsLater: + someId: 23 # <- this scalar causes problems + diff --git a/unittests/records.xml b/unittests/records.xml new file mode 100644 index 0000000000000000000000000000000000000000..f7455ec6b8995db8cd205f69729c32358beee8c0 --- /dev/null +++ b/unittests/records.xml @@ -0,0 +1,157 @@ +<Entities> + <Record id="281"> + <Version id="291faf0ae67b0437d5ab8dd0c6c60cf43c8cc027" head="true"/> + <Parent id="250" name="Project"/> + <Property id="247" name="date" description="date of the experiment" datatype="DATETIME" importance="FIX" flag="inheritance:FIX">2020</Property> + <Property id="248" name="identifier" description="identifier of the experiment" datatype="TEXT" importance="FIX" flag="inheritance:FIX">climate-model-predict</Property> + </Record> + <Record id="282"> + <Version id="59f41d5ebba6f6d7c881452386c3bd76e03a6871" head="true"/> + <Parent id="259" name="Person"/> + <Property id="261" name="first_name" description="First name of a Person." datatype="TEXT" importance="FIX" flag="inheritance:FIX"/> + <Property id="262" name="last_name" description="LastName of a Person." datatype="TEXT" importance="FIX" flag="inheritance:FIX">AuthorE</Property> + </Record> + <Record id="283"> + <Version id="58c553e40002e184c32ea062993701237fc21934" head="true"/> + <Parent id="259" name="Person"/> + <Property id="261" name="first_name" description="First name of a Person." datatype="TEXT" importance="FIX" flag="inheritance:FIX"/> + <Property id="262" name="last_name" description="LastName of a Person." 
datatype="TEXT" importance="FIX" flag="inheritance:FIX">AuthorD</Property> + </Record> + <Record id="284" description="Average temperatures of the years 2000-2009 as obtained from wheatherdata.example"> + <Version id="f9dbd861ccffff0c9a08df41a82ca60a374a92bb" head="true"/> + <Parent id="278" name="Measurement"/> + <Property id="247" name="date" description="date of the experiment" datatype="DATETIME" importance="FIX" flag="inheritance:FIX">2000-01-01</Property> + <Property id="248" name="identifier" description="identifier of the experiment" datatype="TEXT" importance="FIX" flag="inheritance:FIX"/> + <Property id="250" name="project" datatype="Project" importance="FIX" flag="inheritance:FIX">281</Property> + <Property id="249" name="responsible" datatype="LIST<Person>" importance="FIX" flag="inheritance:FIX"> + <Value>283</Value> + </Property> + </Record> + <Record id="285" description="Average temperatures of the years 1990-1999 as obtained from wheatherdata.example"> + <Version id="561a29c3b200f47a0c8cd1d43b3430f9ae4bbbb4" head="true"/> + <Parent id="278" name="Measurement"/> + <Property id="247" name="date" description="date of the experiment" datatype="DATETIME" importance="FIX" flag="inheritance:FIX">1990-01-01</Property> + <Property id="248" name="identifier" description="identifier of the experiment" datatype="TEXT" importance="FIX" flag="inheritance:FIX"/> + <Property id="250" name="project" datatype="Project" importance="FIX" flag="inheritance:FIX">281</Property> + <Property id="249" name="responsible" datatype="LIST<Person>" importance="FIX" flag="inheritance:FIX"> + <Value>283</Value> + </Property> + </Record> + <Record id="286" description="Average temperatures of the years 1980-1989 as obtained from wheatherdata.example"> + <Version id="8ec5f56b96a0e60130f909ab6b4a035f1579e856" head="true"/> + <Parent id="278" name="Measurement"/> + <Property id="247" name="date" description="date of the experiment" datatype="DATETIME" importance="FIX" flag="inheritance:FIX">1980-01-01</Property> + <Property id="248" name="identifier" description="identifier of the experiment" datatype="TEXT" importance="FIX" flag="inheritance:FIX"/> + <Property id="250" name="project" datatype="Project" importance="FIX" flag="inheritance:FIX">281</Property> + <Property id="249" name="responsible" datatype="LIST<Person>" importance="FIX" flag="inheritance:FIX"> + <Value>283</Value> + </Property> + </Record> + <Record id="287"> + <Version id="b967d4ba9a333fd37b723d2b4c6f7e18ee0d41e3" head="true"/> + <Parent id="250" name="Project"/> + <Property id="247" name="date" description="date of the experiment" datatype="DATETIME" importance="FIX" flag="inheritance:FIX">2020</Property> + <Property id="248" name="identifier" description="identifier of the experiment" datatype="TEXT" importance="FIX" flag="inheritance:FIX">SpeedOfLight</Property> + </Record> + <Record id="288"> + <Version id="18a8c4200597bf745391829c6cb9c04c747264fb" head="true"/> + <Parent id="259" name="Person"/> + <Property id="261" name="first_name" description="First name of a Person." datatype="TEXT" importance="FIX" flag="inheritance:FIX"/> + <Property id="262" name="last_name" description="LastName of a Person." datatype="TEXT" importance="FIX" flag="inheritance:FIX">AuthorB</Property> + </Record> + <Record id="289"> + <Version id="799b41948bde740f37e202a5bab70e3d8829b3f6" head="true"/> + <Parent id="259" name="Person"/> + <Property id="261" name="first_name" description="First name of a Person." 
datatype="TEXT" importance="FIX" flag="inheritance:FIX"/> + <Property id="262" name="last_name" description="LastName of a Person." datatype="TEXT" importance="FIX" flag="inheritance:FIX">AuthorA</Property> + </Record> + <Record id="290"> + <Version id="905f204d9bdc58890b59367338be038383f4dcf9" head="true"/> + <Parent id="259" name="Person"/> + <Property id="261" name="first_name" description="First name of a Person." datatype="TEXT" importance="FIX" flag="inheritance:FIX"/> + <Property id="262" name="last_name" description="LastName of a Person." datatype="TEXT" importance="FIX" flag="inheritance:FIX">AuthorC</Property> + </Record> + <Record id="291" description="Time-of-flight measurements to determine the speed of light"> + <Version id="2d2f795a165fe1401ed0270f5b0bee9e6781e2c9" head="true"/> + <Parent id="278" name="Measurement"/> + <Property id="247" name="date" description="date of the experiment" datatype="DATETIME" importance="FIX" flag="inheritance:FIX">2020-01-01</Property> + <Property id="248" name="identifier" description="identifier of the experiment" datatype="TEXT" importance="FIX" flag="inheritance:FIX">TimeOfFlight</Property> + <Property id="250" name="project" datatype="Project" importance="FIX" flag="inheritance:FIX">287</Property> + <Property id="249" name="responsible" datatype="LIST<Person>" importance="FIX" flag="inheritance:FIX"> + <Value>289</Value> + <Value>288</Value> + </Property> + </Record> + <Record id="292" description="comparison between predicted and measured temperatures for 2010 to 2019"> + <Version id="454be377ae35e44d89b7d28fc44d518b7e9321a3" head="true"/> + <Parent id="278" name="Measurement"/> + <Property id="247" name="date" description="date of the experiment" datatype="DATETIME" importance="FIX" flag="inheritance:FIX">2020-02-08</Property> + <Property id="248" name="identifier" description="identifier of the experiment" datatype="TEXT" importance="FIX" flag="inheritance:FIX">prediction-errors</Property> + <Property id="250" name="project" datatype="Project" importance="FIX" flag="inheritance:FIX">281</Property> + <Property id="249" name="responsible" datatype="LIST<Person>" importance="FIX" flag="inheritance:FIX"> + <Value>283</Value> + </Property> + </Record> + <Record id="293" description="Average over all data of each type of experiment separately and comined."> + <Version id="12f3cd8eb6ba7a264ecc2d296c6e8d3a9f7ffc95" head="true"/> + <Parent id="278" name="Measurement"/> + <Property id="247" name="date" description="date of the experiment" datatype="DATETIME" importance="FIX" flag="inheritance:FIX">2020-01-05</Property> + <Property id="248" name="identifier" description="identifier of the experiment" datatype="TEXT" importance="FIX" flag="inheritance:FIX">average-all-exp-corr</Property> + <Property id="250" name="project" datatype="Project" importance="FIX" flag="inheritance:FIX">287</Property> + <Property id="249" name="responsible" datatype="LIST<Person>" importance="FIX" flag="inheritance:FIX"> + <Value>289</Value> + </Property> + </Record> + <Record id="294" description="Average over all data of each type of experiment separately and comined."> + <Version id="4b513be5a2dbad332a3442eabe45ac7b1eae3b22" head="true"/> + <Parent id="278" name="Measurement"/> + <Property id="247" name="date" description="date of the experiment" datatype="DATETIME" importance="FIX" flag="inheritance:FIX">2020-01-04</Property> + <Property id="248" name="identifier" description="identifier of the experiment" datatype="TEXT" importance="FIX" 
flag="inheritance:FIX">average-all-exp</Property> + <Property id="250" name="project" datatype="Project" importance="FIX" flag="inheritance:FIX">287</Property> + <Property id="249" name="responsible" datatype="LIST<Person>" importance="FIX" flag="inheritance:FIX"> + <Value>289</Value> + </Property> + </Record> + <Record id="295" description="Code for fitting the predictive model to the training data and for predicting the average annual temperature for all measurement stations for the years 2010 to 2019"> + <Version id="e08fb3f41d0d2ab505f68795d4ee85c8235ef794" head="true"/> + <Parent id="278" name="Measurement"/> + <Property id="247" name="date" description="date of the experiment" datatype="DATETIME" importance="FIX" flag="inheritance:FIX">2020-02-01</Property> + <Property id="248" name="identifier" description="identifier of the experiment" datatype="TEXT" importance="FIX" flag="inheritance:FIX"/> + <Property id="250" name="project" datatype="Project" importance="FIX" flag="inheritance:FIX">281</Property> + <Property id="249" name="responsible" datatype="LIST<Person>" importance="FIX" flag="inheritance:FIX"> + <Value>282</Value> + </Property> + </Record> + <Record id="296" description="Average temperatures of the years 2010-2019 as obtained from wheatherdata.example"> + <Version id="81b7dae68df569f9fbf65e75448446093f816ab1" head="true"/> + <Parent id="278" name="Measurement"/> + <Property id="247" name="date" description="date of the experiment" datatype="DATETIME" importance="FIX" flag="inheritance:FIX">2010-01-01</Property> + <Property id="248" name="identifier" description="identifier of the experiment" datatype="TEXT" importance="FIX" flag="inheritance:FIX"/> + <Property id="250" name="project" datatype="Project" importance="FIX" flag="inheritance:FIX">281</Property> + <Property id="249" name="responsible" datatype="LIST<Person>" importance="FIX" flag="inheritance:FIX"> + <Value>283</Value> + </Property> + </Record> + <Record id="297" description="Radio interferometry measurements to determine the speed of light"> + <Version id="f3553ee9660b43b6a7598614de8eb17f40cf9782" head="true"/> + <Parent id="278" name="Measurement"/> + <Property id="247" name="date" description="date of the experiment" datatype="DATETIME" importance="FIX" flag="inheritance:FIX">2020-01-03</Property> + <Property id="248" name="identifier" description="identifier of the experiment" datatype="TEXT" importance="FIX" flag="inheritance:FIX"/> + <Property id="250" name="project" datatype="Project" importance="FIX" flag="inheritance:FIX">287</Property> + <Property id="249" name="responsible" datatype="LIST<Person>" importance="FIX" flag="inheritance:FIX"> + <Value>289</Value> + <Value>288</Value> + </Property> + </Record> + <Record id="298" description="Cavity resonance measurements for determining the speed of light"> + <Version id="06ddcf6f8a8c30761912c3752139acc3f6c610eb" head="true"/> + <Parent id="278" name="Measurement"/> + <Property id="247" name="date" description="date of the experiment" datatype="DATETIME" importance="FIX" flag="inheritance:FIX">2020-01-02</Property> + <Property id="248" name="identifier" description="identifier of the experiment" datatype="TEXT" importance="FIX" flag="inheritance:FIX">Cavity</Property> + <Property id="250" name="project" datatype="Project" importance="FIX" flag="inheritance:FIX">287</Property> + <Property id="249" name="responsible" datatype="LIST<Person>" importance="FIX" flag="inheritance:FIX"> + <Value>289</Value> + <Value>290</Value> + </Property> + </Record> 
+</Entities> diff --git a/unittests/scifolder_cfood.yml b/unittests/scifolder_cfood.yml new file mode 100644 index 0000000000000000000000000000000000000000..90f193444bfda7296c46260236274da2378635cc --- /dev/null +++ b/unittests/scifolder_cfood.yml @@ -0,0 +1,82 @@ +# This is only a scifolder test cfood with a limited functionality. +# The full scifolder cfood will be developed here: +# https://gitlab.indiscale.com/caosdb/src/crawler-cfoods/scifolder-cfood + +Definitions: + type: Definitions + #include "description.yml" + +Data: # name of the converter + type: Directory + match: (.*) + subtree: + DataAnalysis: # name of the converter + type: Directory + match: DataAnalysis + subtree: &template + project_dir: # name of the first subtree element which is a converter + type: Directory + match: ((?P<date>[0-9]{4,4})_)?(?P<identifier>.*) + records: + Project: # this is an identifiable in this case + parents: + - Project # not needed as the name is equivalent + date: $date + identifier: $identifier + + subtree: + measurement: # new name for folders on the 3rd level + type: Directory + match: (?P<date>[0-9]{4,4}-[0-9]{2,2}-[0-9]{2,2})(_(?P<identifier>.*))? + records: + Measurement: + date: $date + identifier: $identifier + project: $Project + subtree: + README: + type: MarkdownFile # this is a subclass of converter File + # function signature: GeneralStore, StructureElement + # preprocessors: custom.caosdb.convert_values + match: ^README\.md$ + # how to make match case insensitive? + subtree: + description: + type: DictTextElement + match_value: (?P<description>.*) + match_name: description + records: + Measurement: + description: $description + responsible_single: + type: DictTextElement + match_name: responsible + match_value: &person_regexp ((?P<first_name>.+) )?(?P<last_name>.+) + records: &responsible_records + Person: + first_name: $first_name + last_name: $last_name + Measurement: # this uses the reference to the above defined record + responsible: +$Person # each record also implicitely creates a variable + # with the same name. The "+" indicates, that + # this will become a list entry in list property + # "responsible" belonging to Measurement. + + responsible_list: + type: DictListElement + match_name: responsible + subtree: + Person: + type: TextElement + match: *person_regexp + records: *responsible_records + + ExperimentalData: # name of the converter + type: Directory + match: ExperimentalData + subtree: *template + + SimulationData: # name of the converter + type: Directory + match: SimulationData + subtree: *template diff --git a/unittests/scifolder_extended.yml b/unittests/scifolder_extended.yml new file mode 100644 index 0000000000000000000000000000000000000000..9bab612b9b37e8e295ee8fd02575de506a98d8fc --- /dev/null +++ b/unittests/scifolder_extended.yml @@ -0,0 +1,103 @@ +# This is only a scifolder test cfood with a limited functionality. 
+# The full scifolder cfood will be developed here: +# https://gitlab.indiscale.com/caosdb/src/crawler-cfoods/scifolder-cfood + +Definitions: + type: Definitions + #include "description.yml" + +Data: # name of the converter + type: Directory + match: (.*) + subtree: + DataAnalysis: # name of the converter + type: Directory + match: DataAnalysis + subtree: &template + project_dir: # name of the first subtree element which is a converter + type: Directory + match: ((?P<year>[0-9]{4,4})_)?(?P<identifier>.*) + records: + Project: # this is an identifiable in this case + parents: + - Project # not needed as the name is equivalent + date: $year + identifier: $identifier + + subtree: + measurement: # new name for folders on the 3rd level + type: Directory + match: (?P<date>[0-9]{4,4}-[0-9]{2,2}-[0-9]{2,2})(_(?P<identifier>.*))? + records: + Measurement: + date: $date + identifier: $identifier + project: $Project + subtree: + README: + type: MarkdownFile # this is a subclass of converter File + # function signature: GeneralStore, StructureElement + # preprocessors: custom.caosdb.convert_values + match: ^README\.md$ + # how to make match case insensitive? + records: # this block is very verbose and intended to make sure that this + # file is inserted correctly (and can be supplemented with properties + # and / or parents), TODO: maybe there should be a shorthand + ReadmeFile: + parents: [] + role: File + path: $README + file: $README # this is automatically the relative path + # starting from the top level structure element + # of this element + Measurement: + ReadmeFile: $ReadmeFile + + subtree: + description: + type: DictTextElement + match_value: (?P<description>.*) + match_name: description + records: + Measurement: + description: $description + responsible_single: + type: DictTextElement + match_name: responsible + match_value: &person_regexp ((?P<first_name>.+) )?(?P<last_name>.+) + records: &responsible_records + Person: + first_name: $first_name + last_name: $last_name + Measurement: # this uses the reference to the above defined record + responsible: +$Person # each record also implicitely creates a variable + # with the same name. The "+" indicates, that + # this will become a list entry in list property + # "responsible" belonging to Measurement. + + responsible_list: + type: DictListElement + match_name: responsible + subtree: + Person: + type: TextElement + match: *person_regexp + records: *responsible_records + + # sources_list: + # type: DictListElement + # match_name: sources + # subtree: + # Source: + # type: TextElement + # match: &path ... ??? + + ExperimentalData: # name of the converter + type: Directory + match: ExperimentalData + subtree: *template + + SimulationData: # name of the converter + type: Directory + match: SimulationData + subtree: *template diff --git a/unittests/scifolder_extended2.yml b/unittests/scifolder_extended2.yml new file mode 100644 index 0000000000000000000000000000000000000000..969325e91da488011819c338708a33dcfc32c93e --- /dev/null +++ b/unittests/scifolder_extended2.yml @@ -0,0 +1,104 @@ +# This is only a scifolder test cfood with a limited functionality. 
+# The full scifolder cfood will be developed here: +# https://gitlab.indiscale.com/caosdb/src/crawler-cfoods/scifolder-cfood + +Definitions: + type: Definitions + #include "description.yml" + +Data: # name of the converter + type: Directory + match: (.*) + subtree: + DataAnalysis: # name of the converter + type: Directory + match: DataAnalysis + subtree: &template + project_dir: # name of the first subtree element which is a converter + type: Directory + match: ((?P<year>[0-9]{4,4})_)?(?P<identifier>.*) + records: + Project: # this is an identifiable in this case + parents: + - Project # not needed as the name is equivalent + date: $year + identifier: $identifier + + subtree: + measurement: # new name for folders on the 3rd level + type: Directory + match: (?P<date>[0-9]{4,4}-[0-9]{2,2}-[0-9]{2,2})(_(?P<identifier>.*))? + records: + Measurement: + date: $date + identifier: $identifier + project: $Project + subtree: + README: + type: MarkdownFile # this is a subclass of converter File + # function signature: GeneralStore, StructureElement + # preprocessors: custom.caosdb.convert_values + match: ^README\.md$ + # how to make match case insensitive? + records: # this block is very verbose and intended to make sure that this + # file is inserted correctly (and can be supplemented with properties + # and / or parents), TODO: maybe there should be a shorthand + ReadmeFile: + parents: + - ProjectMarkdownReadme + role: File + path: $README + file: $README # this is automatically the relative path + # starting from the top level structure element + # of this element + Measurement: + ReadmeFile: $ReadmeFile + + subtree: + description: + type: DictTextElement + match_value: (?P<description>.*) + match_name: description + records: + Measurement: + description: $description + responsible_single: + type: DictTextElement + match_name: responsible + match_value: &person_regexp ((?P<first_name>.+) )?(?P<last_name>.+) + records: &responsible_records + Person: + first_name: $first_name + last_name: $last_name + Measurement: # this uses the reference to the above defined record + responsible: +$Person # each record also implicitely creates a variable + # with the same name. The "+" indicates, that + # this will become a list entry in list property + # "responsible" belonging to Measurement. + + responsible_list: + type: DictListElement + match_name: responsible + subtree: + Person: + type: TextElement + match: *person_regexp + records: *responsible_records + + # sources_list: + # type: DictListElement + # match_name: sources + # subtree: + # Source: + # type: TextElement + # match: &path ... ??? 
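Note on the responsible_single / responsible_list blocks above, which the inline comments explain at length: a rough pycaosdb sketch of the records they describe, with made-up names standing in for the regexp groups; the actual record construction happens inside the crawler:

import caosdb as db

# Hypothetical values captured by the person regexp, e.g. from a line
# "responsible: Jane Doe" in a README.
first_name, last_name = "Jane", "Doe"

person = db.Record().add_parent("Person")
person.add_property(name="first_name", value=first_name)
person.add_property(name="last_name", value=last_name)

# "+$Person" appends the Person record to a list property "responsible"
# of the Measurement defined further up in the subtree.
measurement = db.Record().add_parent("Measurement")
measurement.add_property(name="responsible", value=[person])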
+ + ExperimentalData: # name of the converter + type: Directory + match: ExperimentalData + subtree: *template + + SimulationData: # name of the converter + type: Directory + match: SimulationData + subtree: *template diff --git a/unittests/simulated_server_data.py b/unittests/simulated_server_data.py new file mode 100644 index 0000000000000000000000000000000000000000..dd0c6b4e8693d64c9d96cafc5db2f447613daa1b --- /dev/null +++ b/unittests/simulated_server_data.py @@ -0,0 +1,24 @@ + +import caosdb as db +data_model = {"person": (db.RecordType(id=259, name="Person") + .add_property(name="first_name") + .add_property(name="last_name")), + "measurement": (db.RecordType(id=278, name="Measurement") + .add_property(name="identifier") + .add_property(name="date") + .add_property(name="project")), + "project": (db.RecordType(id=250, name="Project") + .add_property(name="date") + .add_property(name="identifier")), + "first_name": db.Property(name="first_name", datatype=db.TEXT, id=261), + "responsible": db.Property(name="responsible", datatype="Person", id=249), + "last_name": db.Property(name="last_name", datatype=db.TEXT, id=262), + "identifier": db.Property(name="identifier", datatype=db.TEXT, id=248), + "date": db.Property(name="date", datatype=db.DATETIME, id=247), + } +existing_data = { +} + +full_data = {} +full_data.update(data_model) +full_data.update(existing_data) diff --git a/unittests/test_cache.py b/unittests/test_cache.py new file mode 100644 index 0000000000000000000000000000000000000000..135316b92fda0ac1e43f4e5f2c4f28fbf1272494 --- /dev/null +++ b/unittests/test_cache.py @@ -0,0 +1,56 @@ +#!/bin/python +# Tests for entity comparison +# A. Schlemmer, 06/2021 + +import caosdb as db +from pytest import raises + +from caoscrawler.identified_cache import _create_hashable_string as create_hash_string + + +def test_normal_hash_creation(): + # Test the initial functionality: + # hash comprises only one parent, name and properties: + + r1 = db.Record() + r1.add_property(name="test") + r1.add_parent("bla") + hash1 = create_hash_string(r1) + + r2 = db.Record() + r2.add_property(name="test2") + r2.add_parent("bla") + hash2 = create_hash_string(r2) + + assert hash1 != hash2 + + r3 = db.Record() + r3.add_property(name="test") + r3.add_parent("bla bla") + hash3 = create_hash_string(r3) + assert hash1 != hash3 + assert hash2 != hash3 + + # no name and no properties and no parents: + r4 = db.Record() + with raises(RuntimeError, match=".*1 parent.*"): + create_hash_string(r4) + + # should work + r4.add_parent("bla") + assert len(create_hash_string(r4)) > 0 + r4.add_property(name="test") + assert len(create_hash_string(r4)) > 0 + + r4.add_parent("bla bla") + with raises(RuntimeError, match=".*1 parent.*"): + create_hash_string(r4) + + +def test_file_hash_creation(): + f1 = db.File(path="/bla/bla/test1.txt") + hash1 = create_hash_string(f1) + f2 = db.File(path="/bla/bla/test2.txt") + hash2 = create_hash_string(f2) + + assert hash1 != hash2 diff --git a/unittests/test_converters.py b/unittests/test_converters.py new file mode 100644 index 0000000000000000000000000000000000000000..30c5972c4f006aaf9923dfc058c3b861d8b5123b --- /dev/null +++ b/unittests/test_converters.py @@ -0,0 +1,387 @@ +#!/usr/bin/env python3 +# encoding: utf-8 +# +# This file is a part of the CaosDB Project. 
+# +# Copyright (C) 2021,2022 Indiscale GmbH <info@indiscale.com> +# Copyright (C) 2021,2022 Henrik tom Wörden <h.tomwoerden@indiscale.com> +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <https://www.gnu.org/licenses/>. +# + +""" +test the converters module +""" +import importlib +import os +import pytest +import yaml + +from caoscrawler.converters import (Converter, ConverterValidationError, + DictConverter, DirectoryConverter, + handle_value, MarkdownFileConverter, + JSONFileConverter) +from caoscrawler.crawl import Crawler +from caoscrawler.stores import GeneralStore +from caoscrawler.structure_elements import (File, DictTextElement, + DictListElement, DictElement, + DictBooleanElement, DictDictElement, + DictIntegerElement, + DictFloatElement, Directory) + +from test_tool import rfp + + +@pytest.fixture +def converter_registry(): + converter_registry: dict[str, dict[str, str]] = { + "Directory": { + "converter": "DirectoryConverter", + "package": "caoscrawler.converters"}, + "MarkdownFile": { + "converter": "MarkdownFileConverter", + "package": "caoscrawler.converters"}, + "Dict": { + "converter": "DictConverter", + "package": "caoscrawler.converters"}, + "DictTextElement": { + "converter": "DictTextElementConverter", + "package": "caoscrawler.converters"}, + "DictListElement": { + "converter": "DictListElementConverter", + "package": "caoscrawler.converters"}, + "TextElement": { + "converter": "TextElementConverter", + "package": "caoscrawler.converters"}, + "JSONFile": { + "converter": "JSONFileConverter", + "package": "caoscrawler.converters"}, + } + + for key, value in converter_registry.items(): + module = importlib.import_module(value["package"]) + value["class"] = getattr(module, value["converter"]) + return converter_registry + + +def testConverterTrivial(converter_registry): + + types = [ + "Directory", + "MarkdownFile", + "DictTextElement", + "DictListElement", + "TextElement" + ] + + for ct in types: + Converter.converter_factory( + definition={ + "type": ct}, + name="Test", + converter_registry=converter_registry) + + +def testDirectoryConverter(converter_registry): + """ test using the "test_directories" folder""" + dc = Converter.converter_factory( + definition={ + "type": "Directory" + }, + name="Test", converter_registry=converter_registry) + elements = dc.create_children(GeneralStore(), + Directory("test_directories", rfp("test_directories"))) + + # Check whether the right structure elements were created + # this has been updated, there are more directories now + # assert len(elements) == 1 + element_names = [] + for element in elements: + assert isinstance(element, Directory) + element_names.append(element.name) + assert "examples_article" in element_names + assert "example_overwrite_1" in element_names + assert "example_insert" in element_names + + +def test_markdown_converter(converter_registry): + test_readme = File( + "README.md", + rfp( + "test_directories", "examples_article", 
"DataAnalysis", + "2020_climate-model-predict", "2020-02-08_prediction-errors", "README.md" + ) + ) + + converter = MarkdownFileConverter({ + "match": "(.*)" + }, "TestMarkdownFileConverter", + converter_registry) + + m = converter.match(File("test_tool.py", rfp( + "test_tool.py"))) + assert m is None + + m = converter.match(test_readme) + assert m is not None + assert m.__class__ == dict + assert len(m) == 0 + + converter = MarkdownFileConverter({ + "match": "README.md" + }, "TestMarkdownFileConverter", + converter_registry) + + m = converter.match(test_readme) + assert m is not None + assert len(m) == 0 + + children = converter.create_children(None, test_readme) + assert len(children) == 5 + assert children[1].__class__ == DictTextElement + assert children[1].name == "description" + assert children[1].value.__class__ == str + + assert children[0].__class__ == DictTextElement + assert children[0].name == "responsible" + assert children[0].value.__class__ == str + + test_readme2 = File( + "README.md", + rfp("test_directories", "examples_article", + "ExperimentalData", "2020_SpeedOfLight", "2020-01-01_TimeOfFlight", "README.md") + ) + + m = converter.match(test_readme2) + assert m is not None + assert len(m) == 0 + + children = converter.create_children(None, test_readme2) + assert len(children) == 2 + assert children[1].__class__ == DictTextElement + assert children[1].name == "description" + assert children[1].value.__class__ == str + + assert children[0].__class__ == DictListElement + assert children[0].name == "responsible" + assert children[0].value.__class__ == list + + +def test_json_converter(converter_registry): + test_json = File("testjson.json", rfp( + "test_directories", "examples_json", "testjson.json")) + + schema_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), + "test_directories", "examples_json", "testjson.schema.json") + jsonconverter = JSONFileConverter( + definition={"match": "(.*)", "validate": schema_path}, + name="TestJSONFileConverter", + converter_registry=converter_registry) + + m = jsonconverter.match(test_json) + assert m is not None + assert len(m) == 0 + + children = jsonconverter.create_children(None, test_json) + assert len(children) == 8 + assert children[0].__class__ == DictTextElement + assert children[0].name == "name" + assert children[0].value.__class__ == str + assert children[0].value == "DEMO" + + assert children[1].__class__ == DictIntegerElement + assert children[1].name == "projectId" + assert children[1].value.__class__ == int + assert children[1].value == 10002 + + assert children[2].__class__ == DictBooleanElement + assert children[2].name == "archived" + assert children[2].value.__class__ == bool + + assert children[3].__class__ == DictListElement + assert children[3].name == "Person" + assert children[3].value.__class__ == list + assert len(children[3].value) == 2 + + assert children[4].__class__ == DictTextElement + assert children[4].name == "start_date" + assert children[4].value.__class__ == str + + assert children[5].__class__ == DictListElement + assert children[5].name == "candidates" + assert children[5].value.__class__ == list + assert children[5].value == ["Mouse", "Penguine"] + + assert children[6].__class__ == DictFloatElement + assert children[6].name == "rvalue" + assert children[6].value.__class__ == float + + assert children[7].__class__ == DictTextElement + assert children[7].name == "url" + assert children[7].value.__class__ == str + + broken_json = File( + "brokenjson.json", + rfp("test_directories", 
"examples_json", "brokenjson.json") + ) + m = jsonconverter.match(broken_json) + + # Doesn't validate because of missing required 'name' property + with pytest.raises(ConverterValidationError) as err: + children = jsonconverter.create_children(None, broken_json) + + assert err.value.message.startswith("Couldn't validate") + + +def test_variable_replacement(): + values = GeneralStore() + values["a"] = 4 + values["b"] = "68" + + assert handle_value("b", values) == ("b", "single") + assert handle_value("+b", values) == ("b", "list") + assert handle_value("*b", values) == ("b", "multiproperty") + assert handle_value("$b", values) == ("68", "single") + assert handle_value("+$b", values) == ("68", "list") + assert handle_value("*$b", values) == ("68", "multiproperty") + + assert handle_value({"value": "b", + "collection_mode": "single"}, values) == ("b", "single") + assert handle_value({"value": "b", + "collection_mode": "list"}, values) == ("b", "list") + assert handle_value({"value": "b", + "collection_mode": "multiproperty"}, values) == ("b", "multiproperty") + assert handle_value({"value": "$b", + "collection_mode": "single"}, values) == ("68", "single") + assert handle_value({"value": "$b", + "collection_mode": "list"}, values) == ("68", "list") + assert handle_value({"value": "$b", + "collection_mode": "multiproperty"}, values) == ("68", "multiproperty") + + assert handle_value(["a", "b"], values) == (["a", "b"], "single") + assert handle_value(["$a", "$b"], values) == (["4", "68"], "single") + + +def test_filter_children_of_directory(converter_registry): + """Verify that children (i.e., files) in a directory are filtered or sorted + correctly. + + """ + test_dir = Directory("examples_filter_children", rfp( + "test_directories", "examples_filter_children")) + + dc = DirectoryConverter( + definition={ + "match": "(.*)", + "filter": { + "expr": "test_(?P<date>[0-9]{4,4}-[0-9]{2,2}-[0-9]{2,2}).json", + "group": "date", + "rule": "only_max" + } + }, + name="TestOnlyMaxDirectoryConverter", + converter_registry=converter_registry + ) + + m = dc.match(test_dir) + assert m is not None + + # This should only contain the youngest json and the csv that doesn't match + # the above filter expression. + children = dc.create_children(None, test_dir) + assert len(children) == 2 + assert children[0].__class__ == File + assert children[0].name == "test_2022-02-02.json" + assert children[1].__class__ == File + assert children[1].name == "some_other_file.csv" + + dc = DirectoryConverter( + definition={ + "match": "(.*)", + "filter": { + "expr": "test_(?P<date>[0-9]{4,4}-[0-9]{2,2}-[0-9]{2,2}).json", + "group": "date", + "rule": "only_min" + } + }, + name="TestOnlyMinDirectoryConverter", + converter_registry=converter_registry + ) + + m = dc.match(test_dir) + assert m is not None + + # This should only contain the youngest json and the csv that doesn't match + # the above filter expression. 
+ children = dc.create_children(None, test_dir) + assert len(children) == 2 + assert children[0].__class__ == File + assert children[0].name == "test_2022-01-01.json" + assert children[1].__class__ == File + assert children[1].name == "some_other_file.csv" + + dc = DirectoryConverter( + definition={ + "match": "(.*)", + "filter": { + "expr": "test_(?P<date>[0-9]{4,4}-[0-9]{2,2}-[0-9]{2,2}).json", + "group": "date", + "rule": "does_not_exist" + } + }, + name="TestBrokenDirectoryConverter", + converter_registry=converter_registry + ) + + m = dc.match(test_dir) + assert m is not None + + with pytest.raises(RuntimeError): + children = dc.create_children(None, test_dir) + + +def test_validate_custom_converters(): + one_doc_yaml = """ +Converters: + MyNewType: + converter: MyNewTypeConverter + package: some_package.my_converters +MyElement: + type: MyNewType + match: something + """ + crawler1 = Crawler() + one_doc_definitions = crawler1._load_definition_from_yaml_dict( + [yaml.load(one_doc_yaml, Loader=yaml.SafeLoader)]) + assert "MyElement" in one_doc_definitions + assert one_doc_definitions["MyElement"]["type"] == "MyNewType" + + # this has to be equivalent + two_doc_yaml = """ +--- +metadata: + Converters: + MyNewType: + converter: MyNewTypeConverter + package: some_package.my_converters +--- +MyElement: + type: MyNewType + match: something + """ + crawler2 = Crawler() + two_doc_definitions = crawler2._load_definition_from_yaml_dict( + list(yaml.safe_load_all(two_doc_yaml))) + assert "MyElement" in two_doc_definitions + assert two_doc_definitions["MyElement"]["type"] == one_doc_definitions["MyElement"]["type"] diff --git a/unittests/test_directories/example_insert/SimulationData/2020_climate-model-predict/2022-01-14/README.md b/unittests/test_directories/example_insert/SimulationData/2020_climate-model-predict/2022-01-14/README.md new file mode 100644 index 0000000000000000000000000000000000000000..cb9437486636054377ac3020445345268c12fe1d --- /dev/null +++ b/unittests/test_directories/example_insert/SimulationData/2020_climate-model-predict/2022-01-14/README.md @@ -0,0 +1,26 @@ +--- +responsible: AuthorE +description: > + Code for fitting the predictive model to the + training data and for predicting the average + annual temperature for all measurement stations + for the years 2010 to 2019. + This is a second run of the same simulation from 2020-02-01 to check replication. +sources: +- ../../../ExperimentalData/2020_climate-model-predict/1980-01-01/temperatures-*.csv +- ../../../ExperimentalData/2020_climate-model-predict/1990-01-01/temperatures-*.csv +- ../../../ExperimentalData/2020_climate-model-predict/2000-01-01/temperatures-*.csv +- ../2020-02-01/ +results: +- file: params.json + description: Model parameters for the best fit to the training set +- file: predictions-201*.csv + description: Annual temperature predictions with geographical locations +scripts: +- file: model.py + description: python module with the model equations +- file: fit_parameters.py + description: Fit model parameters to training data using a basinhopping optimizer +- file: predict.py + description: Use optimized parameters to simulate average temperatures from 2010 to 2019 +... 
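Note: the example directories added in this commit are what the scifolder cfoods are crawled against. A minimal sketch of that flow, following the API used by the tests in this commit (Crawler, load_definition, load_converters, start_crawling); the paths are placeholders relative to the repository root:

from caoscrawler.crawl import Crawler
from caoscrawler.structure_elements import Directory

crawler = Crawler(debug=True)
# Load the cfood definition and the converter packages it declares.
crawler_definition = crawler.load_definition("unittests/scifolder_extended.yml")
converter_registry = crawler.load_converters(crawler_definition)

# Crawl one of the example trees; start_crawling returns the records
# described by the records: blocks of the cfood.
records = crawler.start_crawling(
    Directory("examples_article", "unittests/test_directories/examples_article"),
    crawler_definition,
    converter_registry)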
diff --git a/unittests/test_directories/example_overwrite_1/SimulationData/2020_climate-model-predict/2022-01-14/README.md b/unittests/test_directories/example_overwrite_1/SimulationData/2020_climate-model-predict/2022-01-14/README.md new file mode 100644 index 0000000000000000000000000000000000000000..e39fef19e2aa8e131e4c57697f583520b8e42be7 --- /dev/null +++ b/unittests/test_directories/example_overwrite_1/SimulationData/2020_climate-model-predict/2022-01-14/README.md @@ -0,0 +1,26 @@ +--- +responsible: AuthorE +description: > + Code for fitting the predictive model to the + training data and for predicting the average + annual temperature for all measurement stations + for the years 2010 to 2019. + This is a second run of the same simulation from 2020-02-01 to check replication. This is a test for an update due to a changed description. +sources: +- ../../../ExperimentalData/2020_climate-model-predict/1980-01-01/temperatures-*.csv +- ../../../ExperimentalData/2020_climate-model-predict/1990-01-01/temperatures-*.csv +- ../../../ExperimentalData/2020_climate-model-predict/2000-01-01/temperatures-*.csv +- ../2020-02-01/ +results: +- file: params.json + description: Model parameters for the best fit to the training set. +- file: predictions-201*.csv + description: Annual temperature predictions with geographical locations +scripts: +- file: model.py + description: python module with the model equations +- file: fit_parameters.py + description: Fit model parameters to training data using a basinhopping optimizer +- file: predict.py + description: Use optimized parameters to simulate average temperatures from 2010 to 2019 +... diff --git a/unittests/test_directories/example_substitutions/ExperimentalData/220512_data.dat b/unittests/test_directories/example_substitutions/ExperimentalData/220512_data.dat new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/unittests/test_directories/example_substitutions/substitutions.yml b/unittests/test_directories/example_substitutions/substitutions.yml new file mode 100644 index 0000000000000000000000000000000000000000..1b4e8784a69d1ad1b80fa757ad77cd137c8cc7b5 --- /dev/null +++ b/unittests/test_directories/example_substitutions/substitutions.yml @@ -0,0 +1,22 @@ + +ExperimentalData: # name of the converter + type: Directory + match: ExperimentalData + records: + Project: + name: project + subtree: + File: # name of the converter + type: SimpleFile + match: (?P<year>[0-9]{2,2})(?P<month>[0-9]{2,2})(?P<day>[0-9]{2,2})_data.dat + records: + Experiment: + date: 20$year-$month-$day + + ExperimentSeries: + Experiment: $Experiment + + Project: + Experiments: +$Experiment + dates: +20$year-$month-$day + diff --git a/unittests/test_directories/example_substitutions/substitutions_parents.yml b/unittests/test_directories/example_substitutions/substitutions_parents.yml new file mode 100644 index 0000000000000000000000000000000000000000..107e766ccd833fab618cecfc04f13bc29abc80a6 --- /dev/null +++ b/unittests/test_directories/example_substitutions/substitutions_parents.yml @@ -0,0 +1,25 @@ + +ExperimentalData: # name of the converter + type: Directory + match: ExperimentalData + records: + Project: + name: project + subtree: + File: # name of the converter + type: SimpleFile + match: (?P<year>[0-9]{2,2})(?P<month>[0-9]{2,2})(?P<day>[0-9]{2,2})_data.dat + records: + Experiment: + parents: + - Experiment + - Month_$month # This adds a special parent as record type + date: 20$year-$month-$day + + 
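Note on the substitution cfoods above: variables can be embedded inside longer strings, so the two-digit groups from the file name are assembled into a full date, and $month can even become part of a parent name. A small illustration using the sample file 220512_data.dat added in this commit; this only replays the regular expression and string substitution, not the crawler itself:

import re

m = re.fullmatch(
    r"(?P<year>[0-9]{2,2})(?P<month>[0-9]{2,2})(?P<day>[0-9]{2,2})_data.dat",
    "220512_data.dat")
date = f"20{m.group('year')}-{m.group('month')}-{m.group('day')}"
assert date == "2022-05-12"                       # value of "20$year-$month-$day"
assert f"Month_{m.group('month')}" == "Month_05"  # parent added by substitutions_parents.yml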
ExperimentSeries: + Experiment: $Experiment + + Project: + Experiments: +$Experiment + dates: +20$year-$month-$day + diff --git a/tests/test_directories/examples_article/DataAnalysis/2020_SpeedOfLight/2020-01-04_average-all-exp/README.md b/unittests/test_directories/examples_article/DataAnalysis/2020_SpeedOfLight/2020-01-04_average-all-exp/README.md similarity index 100% rename from tests/test_directories/examples_article/DataAnalysis/2020_SpeedOfLight/2020-01-04_average-all-exp/README.md rename to unittests/test_directories/examples_article/DataAnalysis/2020_SpeedOfLight/2020-01-04_average-all-exp/README.md diff --git a/tests/test_directories/examples_article/DataAnalysis/2020_SpeedOfLight/2020-01-05_average-all-exp-corr/README.md b/unittests/test_directories/examples_article/DataAnalysis/2020_SpeedOfLight/2020-01-05_average-all-exp-corr/README.md similarity index 100% rename from tests/test_directories/examples_article/DataAnalysis/2020_SpeedOfLight/2020-01-05_average-all-exp-corr/README.md rename to unittests/test_directories/examples_article/DataAnalysis/2020_SpeedOfLight/2020-01-05_average-all-exp-corr/README.md diff --git a/tests/test_directories/examples_article/DataAnalysis/2020_climate-model-predict/2020-02-08_prediction-errors/README.md b/unittests/test_directories/examples_article/DataAnalysis/2020_climate-model-predict/2020-02-08_prediction-errors/README.md similarity index 100% rename from tests/test_directories/examples_article/DataAnalysis/2020_climate-model-predict/2020-02-08_prediction-errors/README.md rename to unittests/test_directories/examples_article/DataAnalysis/2020_climate-model-predict/2020-02-08_prediction-errors/README.md diff --git a/tests/test_directories/examples_article/ExperimentalData/2020_SpeedOfLight/2020-01-01_TimeOfFlight/README.md b/unittests/test_directories/examples_article/ExperimentalData/2020_SpeedOfLight/2020-01-01_TimeOfFlight/README.md similarity index 100% rename from tests/test_directories/examples_article/ExperimentalData/2020_SpeedOfLight/2020-01-01_TimeOfFlight/README.md rename to unittests/test_directories/examples_article/ExperimentalData/2020_SpeedOfLight/2020-01-01_TimeOfFlight/README.md diff --git a/tests/test_directories/examples_article/ExperimentalData/2020_SpeedOfLight/2020-01-02_Cavity/README.md b/unittests/test_directories/examples_article/ExperimentalData/2020_SpeedOfLight/2020-01-02_Cavity/README.md similarity index 100% rename from tests/test_directories/examples_article/ExperimentalData/2020_SpeedOfLight/2020-01-02_Cavity/README.md rename to unittests/test_directories/examples_article/ExperimentalData/2020_SpeedOfLight/2020-01-02_Cavity/README.md diff --git a/tests/test_directories/examples_article/ExperimentalData/2020_SpeedOfLight/2020-01-03/README.md b/unittests/test_directories/examples_article/ExperimentalData/2020_SpeedOfLight/2020-01-03/README.md similarity index 100% rename from tests/test_directories/examples_article/ExperimentalData/2020_SpeedOfLight/2020-01-03/README.md rename to unittests/test_directories/examples_article/ExperimentalData/2020_SpeedOfLight/2020-01-03/README.md diff --git a/tests/test_directories/examples_article/ExperimentalData/2020_climate-model-predict/1980-01-01/README.md b/unittests/test_directories/examples_article/ExperimentalData/2020_climate-model-predict/1980-01-01/README.md similarity index 100% rename from tests/test_directories/examples_article/ExperimentalData/2020_climate-model-predict/1980-01-01/README.md rename to 
unittests/test_directories/examples_article/ExperimentalData/2020_climate-model-predict/1980-01-01/README.md diff --git a/tests/test_directories/examples_article/ExperimentalData/2020_climate-model-predict/1990-01-01/README.md b/unittests/test_directories/examples_article/ExperimentalData/2020_climate-model-predict/1990-01-01/README.md similarity index 100% rename from tests/test_directories/examples_article/ExperimentalData/2020_climate-model-predict/1990-01-01/README.md rename to unittests/test_directories/examples_article/ExperimentalData/2020_climate-model-predict/1990-01-01/README.md diff --git a/tests/test_directories/examples_article/ExperimentalData/2020_climate-model-predict/2000-01-01/README.md b/unittests/test_directories/examples_article/ExperimentalData/2020_climate-model-predict/2000-01-01/README.md similarity index 100% rename from tests/test_directories/examples_article/ExperimentalData/2020_climate-model-predict/2000-01-01/README.md rename to unittests/test_directories/examples_article/ExperimentalData/2020_climate-model-predict/2000-01-01/README.md diff --git a/tests/test_directories/examples_article/ExperimentalData/2020_climate-model-predict/2010-01-01/README.md b/unittests/test_directories/examples_article/ExperimentalData/2020_climate-model-predict/2010-01-01/README.md similarity index 100% rename from tests/test_directories/examples_article/ExperimentalData/2020_climate-model-predict/2010-01-01/README.md rename to unittests/test_directories/examples_article/ExperimentalData/2020_climate-model-predict/2010-01-01/README.md diff --git a/tests/test_directories/examples_article/Publications/Articles/2020_AuthorA-JourRel/README.md b/unittests/test_directories/examples_article/Publications/Articles/2020_AuthorA-JourRel/README.md similarity index 100% rename from tests/test_directories/examples_article/Publications/Articles/2020_AuthorA-JourRel/README.md rename to unittests/test_directories/examples_article/Publications/Articles/2020_AuthorA-JourRel/README.md diff --git a/tests/test_directories/examples_article/Publications/Presentations/2020-03-01_AuthorD-climate-model-conf/README.md b/unittests/test_directories/examples_article/Publications/Presentations/2020-03-01_AuthorD-climate-model-conf/README.md similarity index 100% rename from tests/test_directories/examples_article/Publications/Presentations/2020-03-01_AuthorD-climate-model-conf/README.md rename to unittests/test_directories/examples_article/Publications/Presentations/2020-03-01_AuthorD-climate-model-conf/README.md diff --git a/tests/test_directories/examples_article/Publications/Reports/2020-01-10_avg-speed-of-light/README.md b/unittests/test_directories/examples_article/Publications/Reports/2020-01-10_avg-speed-of-light/README.md similarity index 100% rename from tests/test_directories/examples_article/Publications/Reports/2020-01-10_avg-speed-of-light/README.md rename to unittests/test_directories/examples_article/Publications/Reports/2020-01-10_avg-speed-of-light/README.md diff --git a/tests/test_directories/examples_article/SimulationData/2020_climate-model-predict/2020-02-01/README.md b/unittests/test_directories/examples_article/SimulationData/2020_climate-model-predict/2020-02-01/README.md similarity index 100% rename from tests/test_directories/examples_article/SimulationData/2020_climate-model-predict/2020-02-01/README.md rename to unittests/test_directories/examples_article/SimulationData/2020_climate-model-predict/2020-02-01/README.md diff --git 
a/unittests/test_directories/examples_filter_children/some_other_file.csv b/unittests/test_directories/examples_filter_children/some_other_file.csv new file mode 100644 index 0000000000000000000000000000000000000000..bc715fe81656397eae98aa4b04f9af2e3fdd9e43 --- /dev/null +++ b/unittests/test_directories/examples_filter_children/some_other_file.csv @@ -0,0 +1,2 @@ +some,other,data +1,2,3 diff --git a/unittests/test_directories/examples_filter_children/test_2022-01-01.json b/unittests/test_directories/examples_filter_children/test_2022-01-01.json new file mode 100644 index 0000000000000000000000000000000000000000..8de42f29d2eed374a0aba356c7fce2daa3e08e49 --- /dev/null +++ b/unittests/test_directories/examples_filter_children/test_2022-01-01.json @@ -0,0 +1,3 @@ +{ + "key": "value", +} diff --git a/unittests/test_directories/examples_filter_children/test_2022-01-02.json b/unittests/test_directories/examples_filter_children/test_2022-01-02.json new file mode 100644 index 0000000000000000000000000000000000000000..8de42f29d2eed374a0aba356c7fce2daa3e08e49 --- /dev/null +++ b/unittests/test_directories/examples_filter_children/test_2022-01-02.json @@ -0,0 +1,3 @@ +{ + "key": "value", +} diff --git a/unittests/test_directories/examples_filter_children/test_2022-02-02.json b/unittests/test_directories/examples_filter_children/test_2022-02-02.json new file mode 100644 index 0000000000000000000000000000000000000000..8de42f29d2eed374a0aba356c7fce2daa3e08e49 --- /dev/null +++ b/unittests/test_directories/examples_filter_children/test_2022-02-02.json @@ -0,0 +1,3 @@ +{ + "key": "value", +} diff --git a/unittests/test_directories/examples_json/brokenjson.json b/unittests/test_directories/examples_json/brokenjson.json new file mode 100644 index 0000000000000000000000000000000000000000..9c012bf062264014278fc2df7be6cf33b65c7469 --- /dev/null +++ b/unittests/test_directories/examples_json/brokenjson.json @@ -0,0 +1,13 @@ +{ + "projectId": 10002, + "archived": false, + "coordinator": { + "firstname": "Miri", + "lastname": "Mueller", + "email": "miri.mueller@science.de" + }, + "start_date": "2022-03-01", + "candidates": ["Mouse", "Penguine"], + "rvalue": 0.4444, + "url": "https://site.de/index.php/" +} diff --git a/unittests/test_directories/examples_json/jsontest_cfood.yml b/unittests/test_directories/examples_json/jsontest_cfood.yml new file mode 100644 index 0000000000000000000000000000000000000000..f1eb6a9fa186c07f551bd12a84050f544abfdabc --- /dev/null +++ b/unittests/test_directories/examples_json/jsontest_cfood.yml @@ -0,0 +1,58 @@ + +JSONTest: # name of the converter + type: JSONFile + match: '(.*)' + validate: ./testjson.schema.json + records: + Project: # this is an identifiable in this case + parents: + - Project # not needed as the name is equivalent + subtree: + name_element: + type: DictTextElement + match_name: "name" + match_value: "(?P<name>.*)" + records: + Project: + name: $name + url_element: # name of the first subtree element which is a converter + type: DictTextElement + match_value: "(?P<url>.*)" + match_name: "url" + records: + Project: + url: $url + persons_element: + type: DictListElement + match_name: "Person" + subtree: + person_element: + type: Dict + records: + Person: + parents: + - Person + Project: + Person: +$Person + subtree: + firstname_element: + type: DictTextElement + match_name: "firstname" + match_value: "(?P<firstname>.*)" + records: + Person: + firstname: $firstname + lastname_element: + type: DictTextElement + match_name: "lastname" + match_value: "(?P<lastname>.*)" + 
records: + Person: + lastname: $lastname + email_element: + type: DictTextElement + match_name: "email" + match_value: "(?P<email>.*)" + records: + Person: + email: $email diff --git a/unittests/test_directories/examples_json/testjson.json b/unittests/test_directories/examples_json/testjson.json new file mode 100644 index 0000000000000000000000000000000000000000..d37ea2defc21d767e4e13ad3b39d6682b3c452ef --- /dev/null +++ b/unittests/test_directories/examples_json/testjson.json @@ -0,0 +1,22 @@ +{ + "name": "DEMO", + "projectId": 10002, + "archived": false, + "Person": [ + { + "firstname": "Miri", + "lastname": "Mueller", + "other": null, + "email": "miri.mueller@science.de" + }, + { + "firstname": "Mara", + "lastname": "Mueller", + "email": "mara.mueller@science.de" + } + ], + "start_date": "2022-03-01", + "candidates": ["Mouse", "Penguine"], + "rvalue": 0.4444, + "url": "https://site.de/index.php/" +} diff --git a/unittests/test_directories/examples_json/testjson.schema.json b/unittests/test_directories/examples_json/testjson.schema.json new file mode 100644 index 0000000000000000000000000000000000000000..fc784a61079e4737f1a0176fe4240133f5d1b5d0 --- /dev/null +++ b/unittests/test_directories/examples_json/testjson.schema.json @@ -0,0 +1,60 @@ +{ + "title": "Dataset", + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "projectId": { + "type": "integer" + }, + "archived": { + "type": "boolean" + }, + "Person": { + "type": "array", + "items": { + "type": "object", + "properties": { + "firstname": { + "type": "string" + }, + "lastname": { + "type": "string" + }, + "email": { + "type": "string" + } + }, + "required": [ + "firstname", + "lastname", + "email" + ], + "additionalProperties": true + } + }, + "start_date": { + "type": "string", + "format": "date" + }, + "candidates": { + "type": "array", + "items": { + "type": "string" + } + }, + "rvalue": { + "type": "number" + }, + "url": { + "type": "string" + } + }, + "required": [ + "name", + "projectId", + "Person" + ], + "additionalProperties": false +} diff --git a/unittests/test_directories/examples_tables/ExperimentalData/test1.csv b/unittests/test_directories/examples_tables/ExperimentalData/test1.csv new file mode 100644 index 0000000000000000000000000000000000000000..c2eb297b523c06729937a07221c695105df0b09c --- /dev/null +++ b/unittests/test_directories/examples_tables/ExperimentalData/test1.csv @@ -0,0 +1,8 @@ +Col_1,Col_2,Col_3,text +Index,description,, +,m,s, +0,12,1,jdsfkljadskf +1,14,3,jdkfljad +2,3,4,jadkfjdsk +3,4.5,6, +4,8,7,jadskfj diff --git a/unittests/test_directories/examples_tables/ExperimentalData/test1.xlsx b/unittests/test_directories/examples_tables/ExperimentalData/test1.xlsx new file mode 100644 index 0000000000000000000000000000000000000000..2bf68c8a854ae7f618e47e1db58490fc76c055b2 Binary files /dev/null and b/unittests/test_directories/examples_tables/ExperimentalData/test1.xlsx differ diff --git a/unittests/test_directories/examples_tables/crawler_for_tables.yml b/unittests/test_directories/examples_tables/crawler_for_tables.yml new file mode 100644 index 0000000000000000000000000000000000000000..7aaea3e55eb4b8cb2329c24c8b7861f0d9e76d69 --- /dev/null +++ b/unittests/test_directories/examples_tables/crawler_for_tables.yml @@ -0,0 +1,59 @@ + +ExperimentalData: + type: Directory + match: ExperimentalData + records: + Project: + name: project + subtree: + XLSXTable: + type: XLSXTableConverter + match: test1\.xlsx + skiprows: [1, 2] + header: 0 + records: + Experiment: {} + + subtree: + Row: + 
type: DictDictElement + match_name: .* + records: + Measurement: {} + Experiment: + Measurements: +$Measurement + subtree: + Col_1: + type: DictIntegerElement + match_name: Col_1 + match_value: (?P<Value>[0-9]+) + records: + Measurement: + Col_1: $Value + CSVTable: + type: CSVTableConverter + match: test1\.csv + skiprows: [1, 2] + header: 0 + records: + Experiment: {} + + subtree: + Row: + type: DictDictElement + match_name: .* + records: + Measurement: {} + Experiment: + Measurements: +$Measurement + subtree: + Col_1: + type: DictIntegerElement + match_name: Col_1 + match_value: (?P<Value>[0-9]+) + records: + Measurement: + Col_1: $Value + + + diff --git a/unittests/test_directories/single_file_test_data/identifiables.yml b/unittests/test_directories/single_file_test_data/identifiables.yml new file mode 100644 index 0000000000000000000000000000000000000000..e32746d5a6984096cc46fa618250832b325965b0 --- /dev/null +++ b/unittests/test_directories/single_file_test_data/identifiables.yml @@ -0,0 +1,7 @@ +Person: + - full_name +Keyword: + - name +Project: + - project_id + - title diff --git a/unittests/test_entity_comparison.py b/unittests/test_entity_comparison.py new file mode 100644 index 0000000000000000000000000000000000000000..549bc4f42a59765d25446d44fbb845e49ca4d9b9 --- /dev/null +++ b/unittests/test_entity_comparison.py @@ -0,0 +1,95 @@ +#!/bin/python +# Tests for entity comparison +# A. Schlemmer, 06/2021 + +import caosdb as db + +import pytest +from pytest import raises + +from caoscrawler.crawl import check_identical + + +def test_compare_entities(): + record1 = db.Record() + record2 = db.Record() + + assert check_identical(record1, record2) + + record1.add_property(name="type", value="int") + assert not check_identical(record1, record2) + assert not check_identical(record2, record1) + + record2.add_property(name="type", value="int") + assert check_identical(record1, record2) + record2.get_property("type").value = "int2" + assert not check_identical(record1, record2) + record2.get_property("type").value = 4 + assert not check_identical(record1, record2) + + record2.get_property("type").value = "int" + assert check_identical(record1, record2) + record2.add_parent(db.RecordType(name="Parent")) + assert not check_identical(record1, record2) + record1.add_parent(db.RecordType(name="Parent")) + + # This is confusing, but needed: + record1.add_property(name="field_with_type", value=42, datatype=db.INTEGER) + record2.add_property(name="field_with_type", value=42) + # not identical, because record1 sets the datatype + assert not check_identical(record1, record2) + # identical, because record2 sets the datatype + assert check_identical(record2, record1) + record2.get_property("field_with_type").datatype = db.INTEGER + assert check_identical(record1, record2) + assert check_identical(record2, record1) + + record2.get_property("field_with_type").datatype = db.DOUBLE + assert not check_identical(record1, record2) + assert not check_identical(record2, record1) + + # TODO: report this as a hacky workaround (for setting datatype from double to integer): + record2.get_property("field_with_type").datatype = db.TEXT + record2.get_property("field_with_type").datatype = db.INTEGER + assert check_identical(record1, record2) + assert check_identical(record2, record1) + + record1 = db.File() + record2 = db.File() + + vals = (("bla bla", "bla bla bla"), + (1, 2)) + + for attribute, values in zip(("description", "name", "path", "id"), + (vals[0], vals[0], vals[0], vals[1])): + setattr(record1, attribute, 
values[0]) + assert not check_identical(record1, record2) + assert not check_identical(record2, record1) + setattr(record2, attribute, values[1]) + assert not check_identical(record1, record2) + assert not check_identical(record2, record1) + + setattr(record2, attribute, values[0]) + assert check_identical(record1, record2) + assert check_identical(record2, record1) + + # currently "file" is not checked by compare_entities + + vals = (("abcd", "bcde"), + (1, 2)) + # This is confusing, but needed: + for attribute, values in zip(("_checksum", "_size"), + (vals[0], vals[1])): + setattr(record1, attribute, values[0]) + # not identical, because record1 sets the datatype + assert not check_identical(record1, record2) + # identical, because record2 sets the datatype + assert check_identical(record2, record1) + + setattr(record2, attribute, values[1]) + assert not check_identical(record1, record2) + assert not check_identical(record2, record1) + + setattr(record2, attribute, values[0]) + assert check_identical(record1, record2) + assert check_identical(record2, record1) diff --git a/unittests/test_file_identifiables.py b/unittests/test_file_identifiables.py new file mode 100644 index 0000000000000000000000000000000000000000..b0b9801993dc68fe473e788b8ca79a2244912676 --- /dev/null +++ b/unittests/test_file_identifiables.py @@ -0,0 +1,74 @@ +#!/bin/python +# Tests for file identifiables +# A. Schlemmer, 06/2021 + +import caosdb as db + +import pytest +from pytest import raises + +from caoscrawler.identifiable_adapters import LocalStorageIdentifiableAdapter + + +def test_file_identifiable(): + ident = LocalStorageIdentifiableAdapter() + file_obj = db.File() + + identifiable = ident.get_identifiable(file_obj) + identifiable2 = ident.get_identifiable_for_file(file_obj) + + # these are two different objects: + assert identifiable != identifiable2 + assert file_obj != identifiable + # ... but the path is equal: + assert identifiable.path == identifiable2.path + # ... 
and very boring: + assert identifiable.path is None + # Test functionality of retrieving the files: + identified_file = ident.get_file(identifiable) + identified_file2 = ident.get_file(file_obj) + # The both should be None currently as there are no files in the local store yet: + assert identified_file is None + assert identified_file2 is None + + # Let's make it more interesting: + file_obj.path = "/test/bla/bla.txt" + file_obj._checksum = "abcd" + identifiable = ident.get_identifiable(file_obj) + assert file_obj != identifiable + assert file_obj.path == identifiable.path + # Checksum is not part of the identifiable: + assert file_obj.checksum != identifiable.checksum + + # This is the wrong method, so it should definitely return None: + identified_file = ident.retrieve_identified_record_for_identifiable( + identifiable) + assert identified_file is None + # This is the correct method to use: + identified_file = ident.get_file(identifiable) + # or directly using: + identified_file2 = ident.get_file(file_obj) + # The both should be None currently as there are no files in the local store yet: + assert identified_file is None + assert identified_file2 is None + + # Try again with actual files in the store: + records = ident.get_records() + test_record_wrong_path = db.File( + path="/bla/bla/test.txt") + test_record_correct_path = db.File( + path="/test/bla/bla.txt") + test_record_alsocorrect_path = db.File( + path="/test/bla/bla.txt") + records.append(test_record_wrong_path) + identified_file = ident.get_file(file_obj) + assert identified_file is None + + records.append(test_record_correct_path) + identified_file = ident.get_file(file_obj) + assert identified_file is not None + assert identified_file.path == file_obj.path + + with raises(RuntimeError, match=".*unambigiously.*"): + records.append(test_record_alsocorrect_path) + identified_file = ident.get_file(file_obj) diff --git a/unittests/test_identifiable_adapters.py b/unittests/test_identifiable_adapters.py new file mode 100644 index 0000000000000000000000000000000000000000..ef7998a460c07342d30a3f769fd609c1045a9cca --- /dev/null +++ b/unittests/test_identifiable_adapters.py @@ -0,0 +1,85 @@ +#!/usr/bin/env python3 +# encoding: utf-8 +# +# ** header v3.0 +# This file is a part of the CaosDB Project. +# +# Copyright (C) 2021 Indiscale GmbH <info@indiscale.com> +# Copyright (C) 2021 Henrik tom Wörden <h.tomwoerden@indiscale.com> +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <https://www.gnu.org/licenses/>. 
+# +# ** end header +# + +""" +test identifiable_adapters module +""" + +import os +from datetime import datetime +from caoscrawler.identifiable_adapters import ( + CaosDBIdentifiableAdapter, IdentifiableAdapter) +import caosdb as db + + +def test_create_query_for_identifiable(): + query = IdentifiableAdapter.create_query_for_identifiable( + db.Record().add_parent("Person") + .add_property("first_name", value="A") + .add_property("last_name", value="B")) + assert query.lower() == "find record person with 'first_name'='a' and 'last_name'='b' " + + query = IdentifiableAdapter.create_query_for_identifiable( + db.Record(name="A").add_parent("B") + .add_property("c", value="c") + .add_property("d", value=5) + .add_property("e", value=5.5) + .add_property("f", value=datetime(2020, 10, 10)) + .add_property("g", value=True) + .add_property("h", value=db.Record(id=1111)) + .add_property("i", value=db.File(id=1112)) + .add_property("j", value=[2222, db.Record(id=3333)])) + assert (query.lower() == "find record b with name='a' and 'c'='c' and 'd'='5' and 'e'='5.5'" + " and 'f'='2020-10-10t00:00:00' and 'g'='true' and 'h'='1111' and 'i'='1112' and " + "'j'='2222' and 'j'='3333' ") + + # The name can be the only identifiable + query = IdentifiableAdapter.create_query_for_identifiable( + db.Record(name="TestRecord").add_parent("TestType")) + assert query.lower() == "find record testtype with name='testrecord'" + + +def test_load_from_yaml_file(): + ident = CaosDBIdentifiableAdapter() + ident.load_from_yaml_definition( + os.path.join(os.path.dirname(__file__), "test_directories", + "single_file_test_data", "identifiables.yml") + ) + + person_i = ident.get_registered_identifiable( + db.Record().add_parent("Person")) + assert person_i is not None + assert person_i.get_property("full_name") is not None + + keyword_i = ident.get_registered_identifiable( + db.Record().add_parent("Keyword")) + assert keyword_i is not None + assert keyword_i.get_property("name") is not None + + project_i = ident.get_registered_identifiable( + db.Record().add_parent("Project")) + assert project_i is not None + assert project_i.get_property("project_id") is not None + assert project_i.get_property("title") is not None diff --git a/unittests/test_identified_cache.py b/unittests/test_identified_cache.py new file mode 100644 index 0000000000000000000000000000000000000000..33add97d4309d87705144ec5331366d0bcd05541 --- /dev/null +++ b/unittests/test_identified_cache.py @@ -0,0 +1,66 @@ +#!/usr/bin/env python3 +# encoding: utf-8 +# +# ** header v3.0 +# This file is a part of the CaosDB Project. +# +# Copyright (C) 2021 Indiscale GmbH <info@indiscale.com> +# Copyright (C) 2021 Henrik tom Wörden <h.tomwoerden@indiscale.com> +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <https://www.gnu.org/licenses/>. 
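Note: identifiables.yml above registers which properties identify a record of a given type, and the adapter turns a concrete record into a server query. A sketch combining the two steps that the tests above show separately; the full_name value is a made-up example:

import caosdb as db
from caoscrawler.identifiable_adapters import (CaosDBIdentifiableAdapter,
                                               IdentifiableAdapter)

ident = CaosDBIdentifiableAdapter()
ident.load_from_yaml_definition(
    "unittests/test_directories/single_file_test_data/identifiables.yml")

# The registered identifiable for Person carries the identifying property
# full_name, as declared in identifiables.yml.
registered = ident.get_registered_identifiable(db.Record().add_parent("Person"))
assert registered.get_property("full_name") is not None

# For a concrete record, the corresponding query is roughly
# FIND Record Person WITH 'full_name'='Jane Doe'
query = IdentifiableAdapter.create_query_for_identifiable(
    db.Record().add_parent("Person").add_property("full_name", value="Jane Doe"))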
+# +# ** end header +# + +""" +test identified_cache module +""" + +from caoscrawler.identified_cache import _create_hashable_string, IdentifiedCache +import caosdb as db + + +def test_create_hash(): + assert _create_hashable_string( + db.Record("A").add_parent("B")) == "P<B>N<A>" + assert _create_hashable_string(db.Record("A") + .add_parent("B").add_property('a', 5)) == "P<B>N<A>a:5" + assert (_create_hashable_string( + db.Record("A").add_parent("B") + .add_property('a', 4).add_property('b', 5)) == _create_hashable_string( + db.Record("A").add_parent("B") + .add_property('b', 5).add_property('a', 4))) + assert (_create_hashable_string(db.Record("A") + .add_parent("B") + .add_property('a', db.Record(id=12))) == "P<B>N<A>a:12") + assert (_create_hashable_string(db.Record("A") + .add_parent("B") + .add_property('a', [db.Record(id=12)])) == "P<B>N<A>a:[12]") + assert (_create_hashable_string(db.Record("A") + .add_parent("B").add_property('a', [12])) == "P<B>N<A>a:[12]") + assert (_create_hashable_string( + db.Record("A") + .add_parent("B") + .add_property('a', [db.Record(id=12), 11])) == "P<B>N<A>a:[12, 11]") + + +def test_IdentifiedCache(): + ident = db.Record("A").add_parent("B") + record = db.Record("A").add_parent("B").add_property('b', 5) + cache = IdentifiedCache() + assert ident not in cache + cache.add(record=record, identifiable=ident) + assert ident in cache + assert record not in cache + assert cache[ident] is record diff --git a/unittests/test_issues.py b/unittests/test_issues.py new file mode 100644 index 0000000000000000000000000000000000000000..6e77b0c7f26f4b2970203cfc4b8cc786fe24121b --- /dev/null +++ b/unittests/test_issues.py @@ -0,0 +1,70 @@ +#!/usr/bin/env python3 +# encoding: utf-8 +# +# This file is a part of the CaosDB Project. +# +# Copyright (C) 2022 IndiScale GmbH <info@indiscale.com> +# Copyright (C) 2022 Florian Spreckelsen <f.spreckelsen@indiscale.com> +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <https://www.gnu.org/licenses/>. +# + +from pytest import mark + +from caoscrawler.crawl import Crawler +from caoscrawler.structure_elements import Dict +from test_tool import rfp + + +@mark.xfail( + reason="Wait until value conversion in dicts is fixed, see " + "https://gitlab.com/caosdb/caosdb-crawler/-/issues/10." 
+) +def test_issue_10(): + """Test integer-to-float conversion in dictionaries""" + crawler_definition = { + "DictTest": { + "type": "Dict", + "match": "(.*)", + "records": { + "TestRec": {} + }, + "subtree": { + "float_element": { + "type": "DictFloatElement", + "match_name": "float_value", + "match_value": "(?P<float_value>.*)", + "records": { + "TestRec": { + "float_prop": "$float_value" + } + } + } + } + } + } + + crawler = Crawler(debug=True) + converter_registry = crawler.load_converters(crawler_definition) + + test_dict = { + "float_value": 4 + } + + records = crawler.start_crawling( + Dict("TestDict", test_dict), crawler_definition, converter_registry) + assert len(records) == 1 + assert records[0].parents[0].name == "TestRec" + assert records[0].get_property("float_prop") is not None + assert float(records[0].get_property("float_prop").value) == 4.0 diff --git a/unittests/test_json.py b/unittests/test_json.py new file mode 100644 index 0000000000000000000000000000000000000000..97d9831de20a2b9f712294d1a0f6322789580f30 --- /dev/null +++ b/unittests/test_json.py @@ -0,0 +1,77 @@ +#!/usr/bin/env python3 +# encoding: utf-8 +# +# ** header v3.0 +# This file is a part of the CaosDB Project. +# +# Copyright (C) 2021 Indiscale GmbH <info@indiscale.com> +# Copyright (C) 2021 Henrik tom Wörden <h.tomwoerden@indiscale.com> +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <https://www.gnu.org/licenses/>. 
+# +# ** end header +# + +""" +module description +""" +import json +import os + +from pytest import raises + +import caosdb as db + +from caoscrawler.converters import JSONFileConverter, DictConverter +from caoscrawler.crawl import Crawler +from caoscrawler.structure_elements import File, JSONFile +from test_tool import rfp, dircheckstr + + +def test_json(): + crawler_definition_path = rfp("test_directories", "examples_json", + "jsontest_cfood.yml") + json_file_path = rfp("test_directories", "examples_json", "testjson.json") + + crawler = Crawler(debug=True) + crawler_definition = crawler.load_definition(crawler_definition_path) + # Load and register converter packages: + converter_registry = crawler.load_converters(crawler_definition) + + records = crawler.start_crawling( + JSONFile(os.path.basename(json_file_path), json_file_path), + crawler_definition, + converter_registry + ) + + rec = [r for r in records if r.name == "DEMO"] + assert len(rec) == 1 + rec = rec[0] + assert len(rec.parents) == 1 + assert rec.parents[0].name == "Project" + assert rec.get_property("url") is not None + assert rec.get_property("url").value == "https://site.de/index.php/" + assert rec.get_property("Person") is not None + assert isinstance(rec.get_property("Person").value, list) + assert len(rec.get_property("Person").value) == 2 + + +def test_broken_validation(): + crawler_definition_path = rfp( + "broken_cfoods", "broken_validation_path.yml") + crawler = Crawler() + with raises(FileNotFoundError) as err: + crawler_definition = crawler.load_definition(crawler_definition_path) + + assert str(err.value).startswith("Couldn't find validation file") diff --git a/unittests/test_macros.py b/unittests/test_macros.py new file mode 100644 index 0000000000000000000000000000000000000000..7ac34cc7c48df3cb2855d7022119e4775d90c9a6 --- /dev/null +++ b/unittests/test_macros.py @@ -0,0 +1,330 @@ +#!/usr/bin/env python3 +# encoding: utf-8 +# +# ** header v3.0 +# This file is a part of the CaosDB Project. +# +# Copyright (C) 2022 Alexander Schlemmer +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <https://www.gnu.org/licenses/>. +# +# ** end header +# + +from caoscrawler.macros import defmacro_constructor, macro_constructor +from caoscrawler.macros.macro_yaml_object import macro_store +from caoscrawler.crawl import Crawler + +from tempfile import NamedTemporaryFile + +import yaml +import pytest + + +@pytest.fixture +def register_macros(): + yaml.SafeLoader.add_constructor("!defmacro", defmacro_constructor) + yaml.SafeLoader.add_constructor("!macro", macro_constructor) + + +@pytest.fixture +def macro_store_reset(): + macro_store.clear() + + +def _temp_file_load(txt: str): + """ + Create a temporary file with txt and load the crawler + definition using load_definition from Crawler. 
+ """ + definition = None + with NamedTemporaryFile() as f: + f.write(txt.encode()) + f.flush() + c = Crawler() + definition = c.load_definition(f.name) + return definition + + +def test_macros(register_macros, macro_store_reset): + dat = yaml.load(""" +defs: +- !defmacro + name: test + params: + a: 2 + b: bla + c: $variable + definition: + expanded_$b: + blubb: ok$a + $b: $c + +testnode: + obl: !macro + test: + a: 4 + b: yea +""", Loader=yaml.SafeLoader) + assert dat["testnode"]["obl"]["expanded_yea"]["blubb"] == "ok4" + assert dat["testnode"]["obl"]["expanded_yea"]["yea"] == "$variable" + assert "expanded_bla" not in dat["testnode"]["obl"] + assert "bla" not in dat["testnode"]["obl"]["expanded_yea"] + + +def test_macro_list_replacment(register_macros, macro_store_reset): + dat = yaml.load(""" +defs: +- !defmacro + name: test + params: + a: 2 + b: bla + c: $variable + definition: + expanded_$b: + blubb: + - ok$a + - $b: $c + +testnode: + obl: !macro + test: + a: 4 + b: yea +""", Loader=yaml.SafeLoader) + assert isinstance(dat["testnode"]["obl"]["expanded_yea"]["blubb"], list) + assert len(dat["testnode"]["obl"]["expanded_yea"]["blubb"]) == 2 + assert dat["testnode"]["obl"]["expanded_yea"]["blubb"][0] == "ok4" + assert dat["testnode"]["obl"]["expanded_yea"]["blubb"][1]["yea"] == "$variable" + + +def test_multi_macros(register_macros, macro_store_reset): + dat = yaml.load(""" +defs: +- !defmacro + name: test_one + params: {} + definition: + replaced1: ok +- !defmacro + name: test_two + params: {} + definition: + replaced2: ok + replaced3: ok + +testnode: + obl: !macro + test_one: + test_two: +""", Loader=yaml.SafeLoader) + assert dat["testnode"]["obl"]["replaced1"] == "ok" + assert dat["testnode"]["obl"]["replaced2"] == "ok" + assert dat["testnode"]["obl"]["replaced3"] == "ok" + + +def test_multi_macros_toplevel(register_macros, macro_store_reset): + """ + See: https://gitlab.indiscale.com/caosdb/src/caosdb-crawler/-/issues/23 + """ + dat_loader = list(yaml.safe_load_all(""" +--- +metadata: + macros: + - !defmacro + name: test_one + params: {} + definition: + replaced1: ok + - !defmacro + name: test_two + params: {} + definition: + replaced2: ok + replaced3: ok +--- +testnode: !macro + test_one: + test_two: +""")) + assert len(dat_loader) == 2 + dat = dat_loader[1] + assert dat["testnode"]["replaced1"] == "ok" + assert dat["testnode"]["replaced2"] == "ok" + assert dat["testnode"]["replaced3"] == "ok" + + +def test_load_definition(register_macros, macro_store_reset): + txt = """ +extroot: + type: Directory + match: extroot + subtree: + SimulationData: + type: Directory + match: SimulationData + """ + # Check whether simple cfoods can be loaded: + cfood = _temp_file_load(txt) + assert cfood["extroot"]["subtree"]["SimulationData"]["match"] == "SimulationData" + + cfood = _temp_file_load(""" +--- +metadata: + macros: + - !defmacro + name: test_one + params: {} + definition: + replaced1: ok + - !defmacro + name: test_two + params: + match_name: null + definition: + type: Directory + match: $match_name +--- +extroot: + type: Directory + match: extroot + subtree: + SimulationData: + type: Directory + match: SimulationData +extroot2: !macro # test top level macro + test_one: +extroot3: + subtree: + SimulationData: !macro + test_two: + match_name: SimulationData + """) + assert cfood["extroot"]["subtree"]["SimulationData"]["match"] == "SimulationData" + assert cfood["extroot2"]["replaced1"] == "ok" + assert cfood["extroot3"]["subtree"]["SimulationData"]["match"] == "SimulationData" + + 
+@pytest.mark.xfail +def test_replace_arbitrary_objects(register_macros, macro_store_reset): + """ + See: https://gitlab.indiscale.com/caosdb/src/caosdb-crawler/-/issues/24 + """ + dat = yaml.load(""" +defs: +- !defmacro + name: test + params: + b: 25 + testvar_list: + - a + - $b + testvar_dict: + t1: a + t2: $b + definition: + replaced1: + $b: ok + c: $testvar_dict + d: $testvar_list + +testnode: + obl: !macro + test: +""", Loader=yaml.SafeLoader) + print(yaml.dump(dat)) + assert dat["testnode"]["obl"]["replaced1"]["c"]["t1"] == "a" + assert dat["testnode"]["obl"]["replaced1"]["c"]["t2"] == "25" + assert dat["testnode"]["obl"]["replaced1"]["d"][0] == "a" + assert dat["testnode"]["obl"]["replaced1"]["d"][1] == "25" + + +def test_circular_macro_definition(register_macros, macro_store_reset): + """Test the (ab-)use of macros to create an infinite loop.""" + cfood = _temp_file_load(""" +--- +metadata: + macros: + - !defmacro + name: test_one + params: {} + definition: !macro + test_two: + - !defmacro + name: test_two + params: {} + definition: !macro + test_one: + - !defmacro + name: test_three + params: {} + definition: !macro + test_two: + - !defmacro + name: test_four + params: {} + definition: !macro + test_four: +--- +extroot: !macro + test_one: +extroot2: !macro + test_three: +extroot3: !macro + test_four: + """) + # macros in macros can be used, but there are no circles; they stop at the first one. + assert "test_one" not in cfood["extroot"] + assert cfood["extroot"]["test_two"] is None + assert "test_three" not in cfood["extroot2"] + assert "test_one" not in cfood["extroot2"] + assert cfood["extroot2"]["test_two"] is None + # No recursion + assert cfood["extroot3"]["test_four"] is None + + +# @pytest.mark.xfail(reason="Fix multiple usage of the same macro.") +def test_use_macro_twice(): + """Test that the same macro can be used twice with different parameters in + the same CFood element if the name depends on the parameters. + + """ + + cfood = _temp_file_load(""" +--- +metadata: + macros: + - !defmacro + name: test_twice + params: + macro_name: default_name + a: 4 + definition: + $macro_name: + something: + a: $a +--- +extroot: !macro + test_twice: + - macro_name: once + - macro_name: twice + a: 5 + - {} + """) + for name in ["once", "twice", "default_name"]: + assert name in cfood["extroot"] + assert cfood["extroot"]["once"]["something"]["a"] == "4" + assert cfood["extroot"]["twice"]["something"]["a"] == "5" + assert cfood["extroot"]["default_name"]["something"]["a"] == "4" diff --git a/unittests/test_scalars_cfood.py b/unittests/test_scalars_cfood.py new file mode 100644 index 0000000000000000000000000000000000000000..1bf8f0b7d67f00f2018b5b68424d6b9cc17602eb --- /dev/null +++ b/unittests/test_scalars_cfood.py @@ -0,0 +1,57 @@ +#!/bin/python +# Tests for: +# https://gitlab.com/caosdb/caosdb-crawler/-/issues/9 +# A. 
Schlemmer, 06/2021 + +import pytest + +# The main function that is affected by this issue: +from caoscrawler.converters import handle_value +from caoscrawler.crawl import Crawler +# We need the store for the above function +from caoscrawler.stores import GeneralStore + +from test_tool import dircheckstr, rfp + + +@pytest.fixture +def crawler(): + crawler = Crawler(debug=True) + crawler.crawl_directory(rfp("test_directories", "examples_article"), + rfp("cfoods_scalar.yml")) + return crawler + + +def test_handle_value(): + # Note that we will need this store only, if we also want to test variables substitution: + store = GeneralStore() + + # This one should work: + assert handle_value("bla", store) == ("bla", "single") + + # These failed: + assert handle_value(4, store) == (4, "single") + assert handle_value(4.2, store) == (4.2, "single") + assert handle_value(True, store) == (True, "single") + + # List test: + assert handle_value([4, 3, 2], store) == ([4, 3, 2], "single") + + +def test_record_structure_generation(crawler): + subd = crawler.debug_tree[dircheckstr("DataAnalysis")] + assert len(subd) == 2 + # variables store on Data Analysis node of debug tree + assert len(subd[0]) == 3 + assert "Data" in subd[0] + assert "DataAnalysis" in subd[0] + assert "RecordThatGetsParentsLater" in subd[0] + + prop = subd[0]["RecordThatGetsParentsLater"].get_property("someId") + assert type(prop.value) == int + assert prop.value == 23 + + # record store on Data Analysis node of debug tree + assert len(subd[1]) == 1 + prop2 = subd[1]["RecordThatGetsParentsLater"].get_property("someId") + assert prop == prop2 diff --git a/unittests/test_schema.py b/unittests/test_schema.py new file mode 100644 index 0000000000000000000000000000000000000000..0736698eb32146fb3cfbee6acbcf11f5436df27e --- /dev/null +++ b/unittests/test_schema.py @@ -0,0 +1,31 @@ +#!/bin/python +# Tests for schema validation +# A. Schlemmer, 06/2021 + +from importlib_resources import files +import caosdb as db + +from os.path import join, dirname +from caoscrawler import Crawler + +import pytest +from pytest import raises + +from jsonschema.exceptions import ValidationError + + +def rfp(*pathcomponents): + """ + Return full path. + Shorthand convenience function. + """ + return join(dirname(__file__), *pathcomponents) + + +def test_schema_validation(): + cr = Crawler() + cr.load_definition(rfp("scifolder_cfood.yml")) + cr.load_definition(rfp("scifolder_extended.yml")) + + with raises(ValidationError, match=".*enum.*"): + cr.load_definition(rfp("broken_cfoods", "broken1.yml")) diff --git a/unittests/test_table_converter.py b/unittests/test_table_converter.py new file mode 100644 index 0000000000000000000000000000000000000000..85255d3efd34dc666d5d2e97423f33177dea6732 --- /dev/null +++ b/unittests/test_table_converter.py @@ -0,0 +1,166 @@ +#!/usr/bin/env python3 +# encoding: utf-8 +# +# ** header v3.0 +# This file is a part of the CaosDB Project. +# +# Copyright (C) 2022 Alexander Schlemmer <alexander.schlemmer@ds.mpg.de> +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. 
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <https://www.gnu.org/licenses/>.
+#
+# ** end header
+#
+
+"""
+Tests for the table converters (CSV, TSV, XLSX and optionally ODS).
+"""
+
+from caoscrawler.converters import Converter
+from caoscrawler.stores import GeneralStore
+from caoscrawler.converters import (ConverterValidationError,
+                                    DictConverter, XLSXTableConverter, CSVTableConverter)
+from caoscrawler.structure_elements import Directory
+from caoscrawler.structure_elements import (File, DictTextElement,
+                                            DictListElement, DictElement,
+                                            DictBooleanElement, DictDictElement,
+                                            DictIntegerElement, DictFloatElement)
+
+from os.path import join, dirname, basename
+
+from caoscrawler.identifiable_adapters import IdentifiableAdapter, LocalStorageIdentifiableAdapter
+
+import pytest
+import os
+import importlib.util
+
+import math
+
+from caoscrawler import Crawler
+
+import caosdb as db
+
+
+@pytest.fixture
+def converter_registry():
+    converter_registry: dict[str, dict[str, str]] = {
+        "Directory": {
+            "converter": "DirectoryConverter",
+            "package": "caoscrawler.converters"},
+        "CSVTableConverter": {
+            "converter": "CSVTableConverter",
+            "package": "caoscrawler.converters"},
+        "XLSXTableConverter": {
+            "converter": "XLSXTableConverter",
+            "package": "caoscrawler.converters"},
+
+        "DictDictElement": {
+            "converter": "DictDictElementConverter",
+            "package": "caoscrawler.converters"},
+        "DictTextElement": {
+            "converter": "DictTextElementConverter",
+            "package": "caoscrawler.converters"},
+        "DictIntegerElement": {
+            "converter": "DictIntegerElementConverter",
+            "package": "caoscrawler.converters"},
+        "DictFloatElement": {
+            "converter": "DictFloatElementConverter",
+            "package": "caoscrawler.converters"},
+    }
+    # Hand the registry to the tests requesting this fixture:
+    return converter_registry
+
+
+def rfp(*pathcomponents):
+    """
+    Return full path.
+    Shorthand convenience function.
+    """
+    return join(dirname(__file__), *pathcomponents)
+
+
+def dircheckstr(*pathcomponents):
+    """
+    Return the debug tree identifier for a given path.
+    """
+    return "caoscrawler.structure_elements.File: " + basename(join(*pathcomponents)) + ", " + rfp("test_directories", "examples_tables", "ExperimentalData", *pathcomponents)
+
+
+@pytest.fixture
+def crawler():
+    crawler = Crawler(debug=True)
+    crawler.crawl_directory(rfp("test_directories", "examples_tables", "ExperimentalData"),
+                            rfp("test_directories", "examples_tables", "crawler_for_tables.yml"))
+    return crawler
+
+
+def test_convert_table(converter_registry):
+    extensions = ["xlsx", "csv", "tsv"]
+    if importlib.util.find_spec("odf") is not None:
+        extensions.append("ods")
+    for file_ext in extensions:
+        def_opt = {"skiprows": ["1", "2"], "header": 0}
+        if file_ext == "tsv":
+            def_opt["sep"] = "\t"
+        if file_ext in ["csv", "tsv"]:
+            converter = CSVTableConverter(
+                def_opt,
+                "Tab",
+                converter_registry)
+        else:
+            converter = XLSXTableConverter(
+                def_opt,
+                "Tab",
+                converter_registry)
+        store = GeneralStore()
+        file_element = File("table." + file_ext,
+                            rfp("test_tables", "test1."
+ file_ext)) + res = converter.create_children(store, + file_element) + assert len(res) == 5 + for i in range(5): + assert res[i].name == str(i) + assert type(res[i].name) == str + assert type(res[i].value) == dict + assert len(res[i].value) == 4 + assert type(res[i].value["Col_1"]) == int + assert res[i].value["Col_1"] == i + assert type(res[i].value["Col_2"]) == float + assert type(res[i].value["Col_3"]) == int + if i != 3: + assert type(res[i].value["text"]) == str + else: + assert type(res[i].value["text"]) == float # the nan value + assert math.isnan(res[i].value["text"]) + + # Using an index col: + converter = XLSXTableConverter( + {"skiprows": ["1", "2"], "header": 0, "index_col": "3"}, + "XLSXTable", + converter_registry) + store = GeneralStore() + file_element = File("table.xlsx", + rfp("test_tables", "test1.xlsx")) + res = converter.create_children(store, + file_element) + assert res[0].name == "jdsfkljadskf" + + +def test_crawl_csv_table(crawler): + for file_ext in ["xlsx", "csv"]: + subd = crawler.debug_tree[dircheckstr("test1." + file_ext)] + record_experiment = subd[1]["Experiment"] + assert isinstance(record_experiment, db.Record) + assert isinstance(record_experiment.get_property("Measurements").value, list) + assert len(record_experiment.get_property("Measurements").value) == 5 + prop_measure = record_experiment.get_property("Measurements").value[2] + assert isinstance(prop_measure, db.Record) + assert prop_measure.get_property("Col_1").value == "2" diff --git a/unittests/test_tables/test1.csv b/unittests/test_tables/test1.csv new file mode 100644 index 0000000000000000000000000000000000000000..c2eb297b523c06729937a07221c695105df0b09c --- /dev/null +++ b/unittests/test_tables/test1.csv @@ -0,0 +1,8 @@ +Col_1,Col_2,Col_3,text +Index,description,, +,m,s, +0,12,1,jdsfkljadskf +1,14,3,jdkfljad +2,3,4,jadkfjdsk +3,4.5,6, +4,8,7,jadskfj diff --git a/unittests/test_tables/test1.ods b/unittests/test_tables/test1.ods new file mode 100644 index 0000000000000000000000000000000000000000..6d5138b496511b02d0e6104868b6ba1e6816bfb6 Binary files /dev/null and b/unittests/test_tables/test1.ods differ diff --git a/unittests/test_tables/test1.tsv b/unittests/test_tables/test1.tsv new file mode 100644 index 0000000000000000000000000000000000000000..69286fcecd82c955f900bcdf7e6b5adfe26ab8c8 --- /dev/null +++ b/unittests/test_tables/test1.tsv @@ -0,0 +1,8 @@ +Col_1 Col_2 Col_3 text +Index description + m s +0 12 1 jdsfkljadskf +1 14 3 jdkfljad +2 3 4 jadkfjdsk +3 4.5 6 +4 8 7 jadskfj diff --git a/unittests/test_tables/test1.xlsx b/unittests/test_tables/test1.xlsx new file mode 100644 index 0000000000000000000000000000000000000000..2bf68c8a854ae7f618e47e1db58490fc76c055b2 Binary files /dev/null and b/unittests/test_tables/test1.xlsx differ diff --git a/unittests/test_tool.py b/unittests/test_tool.py new file mode 100755 index 0000000000000000000000000000000000000000..a190efdeaaa9b3ede8d6fc1b9d1fb2d6e0d9c210 --- /dev/null +++ b/unittests/test_tool.py @@ -0,0 +1,693 @@ +#!/bin/python +# Tests for the tool using pytest +# Adapted from check-sfs +# A. 
Schlemmer, 06/2021
+
+from caoscrawler.crawl import Crawler, SecurityMode
+from caoscrawler.structure_elements import File, DictTextElement, DictListElement
+from caoscrawler.identifiable_adapters import IdentifiableAdapter, LocalStorageIdentifiableAdapter
+from simulated_server_data import full_data
+from functools import partial
+from copy import deepcopy
+from unittest.mock import patch
+import caosdb.common.models as dbmodels
+from unittest.mock import MagicMock, Mock
+from os.path import join, dirname, basename
+import yaml
+import caosdb as db
+from caosdb.apiutils import compare_entities
+
+import pytest
+from pytest import raises
+
+
+def rfp(*pathcomponents):
+    """
+    Return full path.
+    Shorthand convenience function.
+    """
+    return join(dirname(__file__), *pathcomponents)
+
+
+ident = LocalStorageIdentifiableAdapter()
+ident.restore_state(rfp("records.xml"))
+full_data.update({el.name: el for el in ident._records if el.name is not None})
+full_data.update({el.id: el for el in ident._records if el.name is None})
+
+
+def dircheckstr(*pathcomponents):
+    """
+    Return the debug tree identifier for a given path.
+    """
+    return ("caoscrawler.structure_elements.Directory: " + basename(
+        join(*pathcomponents)) + ", " + rfp(
+            "test_directories", "examples_article", *pathcomponents))
+
+
+@pytest.fixture
+def crawler():
+    crawler = Crawler(debug=True)
+    crawler.crawl_directory(rfp("test_directories", "examples_article"),
+                            rfp("scifolder_cfood.yml"))
+    return crawler
+
+
+@pytest.fixture
+def ident(crawler):
+    ident = LocalStorageIdentifiableAdapter()
+    crawler.identifiableAdapter = ident
+
+    # The records.xml file is constructed as follows:
+    # Do a full run of the crawler, resolve all identifiables and insert all resulting entities.
+    # See: test-setup/datamodel/generate_test_data.py for details.
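+    # The LocalStorageIdentifiableAdapter stands in for the remote CaosDB
+    # server in these unit tests: restore_state() loads the "known" records
+    # from the XML file so identifiables can be resolved without a server.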
+ ident.restore_state(rfp("records.xml")) + + ident.register_identifiable( + "Person", db.RecordType() + .add_parent(name="Person") + .add_property(name="first_name") + .add_property(name="last_name")) + ident.register_identifiable( + "Measurement", db.RecordType() + .add_parent(name="Measurement") + .add_property(name="identifier") + .add_property(name="date") + .add_property(name="project")) + ident.register_identifiable( + "Project", db.RecordType() + .add_parent(name="Project") + .add_property(name="date") + .add_property(name="identifier")) + return ident + + +def test_record_structure_generation(crawler): + subd = crawler.debug_tree[dircheckstr("DataAnalysis")] + subc = crawler.debug_metadata["copied"][dircheckstr("DataAnalysis")] + assert len(subd) == 2 + # variables store on Data Analysis node of debug tree + assert len(subd[0]) == 2 + # record store on Data Analysis node of debug tree + assert len(subd[1]) == 0 + assert len(subc) == 2 + assert len(subc[0]) == 2 + assert len(subc[1]) == 0 + + # The data analysis node creates one variable for the node itself: + assert subd[0]["DataAnalysis"] == "examples_article/DataAnalysis" + assert subc[0]["DataAnalysis"] is False + + subd = crawler.debug_tree[dircheckstr( + "DataAnalysis", "2020_climate-model-predict")] + subc = crawler.debug_metadata["copied"][dircheckstr( + "DataAnalysis", "2020_climate-model-predict")] + + assert len(subd[1]) == 1 + assert len(subd[1]["Project"].get_parents()) == 1 + assert subd[1]["Project"].get_parents()[0].name == "Project" + assert subd[1]["Project"].get_property("date").value == "2020" + assert subd[1]["Project"].get_property( + "identifier").value == "climate-model-predict" + + assert len(subd[0]) == 6 + assert subd[0]["date"] == "2020" + assert subd[0]["identifier"] == "climate-model-predict" + assert subd[0]["Project"].__class__ == db.Record + + assert subd[0]["DataAnalysis"] == "examples_article/DataAnalysis" + assert subc[0]["DataAnalysis"] is True + assert subd[0]["project_dir"] == "examples_article/DataAnalysis/2020_climate-model-predict" + assert subc[0]["project_dir"] is False + + # Check the copy flags for the first level in the hierarchy: + assert len(subc[0]) == 6 + assert len(subc[1]) == 1 + assert subc[1]["Project"] is False + assert subc[0]["Project"] is False + assert subc[0]["date"] is False + assert subc[0]["identifier"] is False + + subd = crawler.debug_tree[dircheckstr("DataAnalysis", + "2020_climate-model-predict", + "2020-02-08_prediction-errors")] + subc = crawler.debug_metadata["copied"][dircheckstr("DataAnalysis", + "2020_climate-model-predict", + "2020-02-08_prediction-errors")] + assert len(subd[0]) == 8 + assert subd[0]["date"] == "2020-02-08" + assert subd[0]["identifier"] == "prediction-errors" + assert subd[0]["Project"].__class__ == db.Record + assert subd[0]["Measurement"].__class__ == db.Record + + assert len(subd[1]) == 2 + + assert len(subd[1]["Project"].get_parents()) == 1 + assert subd[1]["Project"].get_parents()[0].name == "Project" + assert subd[1]["Project"].get_property("date").value == "2020" + assert subd[1]["Project"].get_property( + "identifier").value == "climate-model-predict" + + assert len(subd[1]["Measurement"].get_parents()) == 1 + assert subd[1]["Measurement"].get_parents()[0].name == "Measurement" + assert subd[1]["Measurement"].get_property("date").value == "2020-02-08" + assert subd[1]["Measurement"].get_property( + "identifier").value == "prediction-errors" + assert subd[1]["Measurement"].get_property("project").value != "$Project" + assert 
subd[1]["Measurement"].get_property( + "project").value.__class__ == db.Record + assert subd[1]["Measurement"].get_property( + "project").value == subd[0]["Project"] + + # Check the copy flags for the second level in the hierarchy: + assert subc[1]["Project"] is True + assert subc[0]["Project"] is True + assert subc[1]["Measurement"] is False + assert subc[0]["Measurement"] is False + assert subc[0]["date"] is False + assert subc[0]["identifier"] is False + + +# def prepare_test_record_file(): +# ident = LocalStorageIdentifiableAdapter() +# crawler = Crawler(debug=True, identifiableAdapter=ident) +# crawler.crawl_directory(rfp("test_directories", "examples_article"), +# rfp("scifolder_cfood.yml")) + +# # clean record list: +# recordlist = ident.get_records() +# for i in range(len(recordlist)-1, 1, -1): +# if recordlist[i].parents[0].name == "Person": +# del recordlist[i] + +# ident.store_state(rfp("records.xml")) + + +def test_ambigious_records(crawler, ident): + ident.get_records().clear() + ident.get_records().extend(crawler.target_data) + r = ident.get_records() + id_r0 = ident.get_identifiable(r[0]) + with raises(RuntimeError, match=".*unambigiously.*"): + ident.retrieve_identified_record_for_identifiable(id_r0) + + +def test_crawler_update_list(crawler, ident): + # If the following assertions fail, that is a hint, that the test file records.xml has changed + # and this needs to be updated: + assert len(ident.get_records()) == 18 + assert len( + [r for r in ident.get_records() if r.parents[0].name == "Person"] + ) == 5 + assert len( + [r for r in ident.get_records() if r.parents[0].name == "Measurement"] + ) == 11 + assert len( + [r for r in ident.get_records() if r.parents[0].name == "Project"] + ) == 2 + + # The crawler contains lots of duplicates, because identifiables have not been resolved yet: + assert len(ident.get_records()) != len(crawler.target_data) + + # Check consistency: + # Check whether identifiables retrieved from current identifiable store return + # the same results. 
+ + # take the first person in the list of records: + for r in ident.get_records(): + if r.parents[0].name == "Person": + r_cur = r + break + + id_r0 = ident.get_identifiable(r_cur) + assert r_cur.parents[0].name == id_r0.parents[0].name + assert r_cur.get_property( + "first_name").value == id_r0.get_property("first_name").value + assert r_cur.get_property( + "last_name").value == id_r0.get_property("last_name").value + assert len(r_cur.parents) == 1 + assert len(id_r0.parents) == 1 + assert len(r_cur.properties) == 2 + assert len(id_r0.properties) == 2 + + idr_r0_test = ident.retrieve_identified_record_for_identifiable(id_r0) + idr_r0 = ident.retrieve_identified_record_for_record(r_cur) + assert idr_r0 == idr_r0_test + + # take the first measurement in the list of records: + for r in ident.get_records(): + if r.parents[0].name == "Measurement": + r_cur = r + break + + id_r1 = ident.get_identifiable(r_cur) + assert r_cur.parents[0].name == id_r1.parents[0].name + assert r_cur.get_property( + "identifier").value == id_r1.get_property("identifier").value + assert r_cur.get_property("date").value == id_r1.get_property("date").value + assert r_cur.get_property( + "project").value == id_r1.get_property("project").value + assert len(r_cur.parents) == 1 + assert len(id_r1.parents) == 1 + assert len(r_cur.properties) == 4 + assert len(id_r1.properties) == 3 + + idr_r1_test = ident.retrieve_identified_record_for_identifiable(id_r1) + idr_r1 = ident.retrieve_identified_record_for_record(r_cur) + assert idr_r1 == idr_r1_test + assert idr_r1 != idr_r0 + assert idr_r1_test != idr_r0_test + + assert len(idr_r1.properties) == 4 + assert r_cur.get_property( + "responsible").value == idr_r1.get_property("responsible").value + assert r_cur.description == idr_r1.description + + # test whether compare_entites function works in this context: + comp = compare_entities(r_cur, id_r1) + assert len(comp[0]["parents"]) == 0 + assert len(comp[1]["parents"]) == 0 + assert len(comp[0]["properties"]) == 1 + assert len(comp[1]["properties"]) == 0 + assert "responsible" in comp[0]["properties"] + assert "description" in comp[0] + + comp = compare_entities(r_cur, idr_r1) + assert len(comp[0]["parents"]) == 0 + assert len(comp[1]["parents"]) == 0 + assert len(comp[0]["properties"]) == 0 + assert len(comp[1]["properties"]) == 0 + + +def test_synchronization(crawler, ident): + insl, updl = crawler.synchronize(commit_changes=False) + assert len(insl) == 0 + assert len(updl) == 0 + + +def test_identifiable_adapter(): + query = IdentifiableAdapter.create_query_for_identifiable( + db.Record().add_parent("Person") + .add_property("first_name", value="A") + .add_property("last_name", value="B")) + assert query.lower() == "find record person with 'first_name'='a' and 'last_name'='b' " + + +def test_remove_unnecessary_updates(): + # test trvial case + upl = [db.Record().add_parent("A")] + irs = [db.Record().add_parent("A")] + Crawler.remove_unnecessary_updates(upl, irs) + assert len(upl) == 0 + + # test property difference case + # TODO this should work right? 
+ # upl = [db.Record().add_parent("A").add_property("a", 3)] + # irs = [db.Record().add_parent("A")] # ID should be s + # Crawler.remove_unnecessary_updates(upl, irs) + # assert len(upl) == 1 + + # test value difference case + upl = [db.Record().add_parent("A").add_property("a", 5)] + irs = [db.Record().add_parent("A").add_property("a")] + Crawler.remove_unnecessary_updates(upl, irs) + assert len(upl) == 1 + upl = [db.Record().add_parent("A").add_property("a", 5)] + irs = [db.Record().add_parent("A").add_property("a", 5)] + Crawler.remove_unnecessary_updates(upl, irs) + assert len(upl) == 0 + + # test unit difference case + upl = [db.Record().add_parent("A").add_property("a", unit='cm')] + irs = [db.Record().add_parent("A").add_property("a")] + Crawler.remove_unnecessary_updates(upl, irs) + assert len(upl) == 1 + + # test None difference case + upl = [db.Record().add_parent("A").add_property("a")] + irs = [db.Record().add_parent("A").add_property("a", 5)] + Crawler.remove_unnecessary_updates(upl, irs) + assert len(upl) == 1 + + +# Current status: +# TODO: currently, this test fails, because non identifiable records cannot +# be inserted into the cache. Solution might be, just not to add them +# into the local cache. Probably in split_into_inserts_and_updates. +@pytest.mark.xfail +def test_identifiable_adapter_no_identifiable(crawler, ident): + del ident._registered_identifiables["Person"] + insl, updl = crawler.synchronize() + assert len(updl) == 0 + + pers = [r for r in crawler.target_data if r.parents[0].name == "Person"] + # All persons are inserted, because they are not identifiable: + assert len(insl) == len(pers) + + +def test_provenance_debug_data(crawler): + crawler.save_debug_data(rfp("provenance.yml")) + + with open(rfp("provenance.yml"), "r") as f: + provenance = yaml.load(f, Loader=yaml.SafeLoader) + + pr = provenance["provenance"] + + def check_key_count(prefix): + return sum([1 for key in pr.keys() if key.startswith(prefix)]) + assert check_key_count("Measurement") == 11 + assert check_key_count("Project") == 5 + assert check_key_count("Person") == 14 + + +def basic_retrieve_by_name_mock_up(rec, known): + """ returns a stored Record if rec.name is an existing key, None otherwise """ + if rec.name in known: + return known[rec.name] + else: + return None + + +@pytest.fixture +def crawler_mocked_identifiable_retrieve(crawler): + # mock retrieval of registered identifiabls: return Record with just a parent + crawler.identifiableAdapter.get_registered_identifiable = Mock( + side_effect=lambda x: db.Record().add_parent(x.parents[0].name)) + + # Simulate remote server content by using the names to identify records + # There is only a single known Record with name A + crawler.identifiableAdapter.retrieve_identified_record_for_record = Mock(side_effect=partial( + basic_retrieve_by_name_mock_up, known={"A": db.Record(id=1111, name="A")})) + return crawler + + +def test_split_into_inserts_and_updates_trivial(crawler): + # Try trivial argument + crawler.split_into_inserts_and_updates([]) + + +def test_split_into_inserts_and_updates_single(crawler_mocked_identifiable_retrieve): + crawler = crawler_mocked_identifiable_retrieve + entlist = [db.Record(name="A").add_parent( + "C"), db.Record(name="B").add_parent("C")] + + assert crawler.get_identified_record_from_local_cache(entlist[0]) is None + assert crawler.get_identified_record_from_local_cache(entlist[1]) is None + assert crawler.can_be_checked_externally(entlist[0]) + assert crawler.can_be_checked_externally(entlist[1]) + assert 
crawler.identifiableAdapter.retrieve_identified_record_for_record( + entlist[0]).id == 1111 + assert crawler.identifiableAdapter.retrieve_identified_record_for_record( + entlist[1]) is None + + insert, update = crawler.split_into_inserts_and_updates(deepcopy(entlist)) + assert len(insert) == 1 + assert insert[0].name == "B" + assert len(update) == 1 + assert update[0].name == "A" + # if this ever fails, the mock up may be removed + crawler.identifiableAdapter.get_registered_identifiable.assert_called() + crawler.identifiableAdapter.retrieve_identified_record_for_record.assert_called() + + +def test_split_into_inserts_and_updates_with_duplicate(crawler_mocked_identifiable_retrieve): + crawler = crawler_mocked_identifiable_retrieve + a = db.Record(name="A").add_parent("C") + b = db.Record(name="B").add_parent("C") + b.add_property("A", a) + # This is identical to a and should be removed + c = db.Record(name="A").add_parent("C") + entlist = [a, b, c] + insert, update = crawler.split_into_inserts_and_updates(deepcopy(entlist)) + assert len(insert) == 1 + assert insert[0].name == "B" + assert len(update) == 1 + assert update[0].name == "A" + # if this ever fails, the mock up may be removed + crawler.identifiableAdapter.get_registered_identifiable.assert_called() + crawler.identifiableAdapter.retrieve_identified_record_for_record.assert_called() + + +def test_split_into_inserts_and_updates_with_ref(crawler_mocked_identifiable_retrieve): + crawler = crawler_mocked_identifiable_retrieve + # try it with a reference + a = db.Record(name="A").add_parent("C") + b = db.Record(name="B").add_parent("C") + b.add_property("A", a) + entlist = [a, b] + insert, update = crawler.split_into_inserts_and_updates(entlist) + assert len(insert) == 1 + assert insert[0].name == "B" + assert len(update) == 1 + assert update[0].name == "A" + # if this ever fails, the mock up may be removed + crawler.identifiableAdapter.retrieve_identified_record_for_record.assert_called() + crawler.identifiableAdapter.get_registered_identifiable.assert_called() + + +def test_split_into_inserts_and_updates_with_circ(crawler): + # try circular + a = db.Record(name="A").add_parent("C") + b = db.Record(name="B").add_parent("C") + b.add_property("A", a) + a.add_property("B", b) + entlist = [a, b] + # TODO this does not seem to be complete! 
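+    # A possible completion (sketch only, not verified against the crawler API):
+    #
+    #     insert, update = crawler.split_into_inserts_and_updates(deepcopy(entlist))
+    #     # The call should terminate despite the circular reference between
+    #     # a and b and account for both records.
+    #     assert len(insert) + len(update) == 2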
+ + +def test_split_into_inserts_and_updates_with_complex(crawler_mocked_identifiable_retrieve): + crawler = crawler_mocked_identifiable_retrieve + # A + # ^ + # | + # F <- B <- G + a = db.Record(name="A").add_parent("C").add_property( + 'd', 13).add_property('e', "lskdjlsfdj") + b = db.Record(name="B").add_parent("C") + g = db.Record(name="G").add_parent("C") + f = db.Record(name="F").add_parent("C") + g.add_property("A", a) + b.add_property("A", f) + b.add_property("A", a) + entlist = [a, b, g] + insert, update = crawler.split_into_inserts_and_updates(entlist) + assert len(insert) == 3 + assert "B" in [el.name for el in insert] + assert len(update) == 1 + assert update[0].name == "A" + # if this ever fails, the mock up may be removed + crawler.identifiableAdapter.get_registered_identifiable.assert_called() + crawler.identifiableAdapter.retrieve_identified_record_for_record.assert_called() + + # TODO write test where the unresoled entity is not part of the identifiable + + +def test_split_into_inserts_and_updates_with_copy_attr(crawler_mocked_identifiable_retrieve): + crawler = crawler_mocked_identifiable_retrieve + # assume identifiable is only the name + a = db.Record(name="A").add_parent("C") + a.add_property("foo", 1) + b = db.Record(name="A").add_parent("C") + b.add_property("bar", 2) + entlist = [a, b] + insert, update = crawler.split_into_inserts_and_updates(entlist) + + assert update[0].get_property("bar").value == 2 + assert update[0].get_property("foo").value == 1 + # if this ever fails, the mock up may be removed + crawler.identifiableAdapter.get_registered_identifiable.assert_called() + crawler.identifiableAdapter.retrieve_identified_record_for_record.assert_called() + + +def test_all_references_are_existing_already(crawler): + # Simulate remote server content by using the names to identify records + # There are only two known Records with name A and B + crawler.identifiableAdapter.get_registered_identifiable = Mock(side_effect=partial( + basic_retrieve_by_name_mock_up, known={"A": db.Record(name="A").add_parent("C"), + "B": db.Record(name="B").add_parent("C")})) + + assert crawler.all_references_are_existing_already( + db.Record().add_property('a', 123)) + assert crawler.all_references_are_existing_already(db.Record() + .add_property('a', db.Record(id=123))) + assert crawler.all_references_are_existing_already(db.Record() + .add_property('a', 123) + .add_property('b', db.Record(id=123))) + assert not crawler.all_references_are_existing_already(db.Record() + .add_property('a', 123) + .add_property('b', db.Record(name="A") + .add_parent("C"))) + a = db.Record(name="A").add_parent("C") + crawler.add_identified_record_to_local_cache(a) + assert crawler.all_references_are_existing_already(db.Record() + .add_property('a', 123) + .add_property('b', a)) + # if this ever fails, the mock up may be removed + crawler.identifiableAdapter.get_registered_identifiable.assert_called() + + +def test_can_be_checked_externally(crawler): + assert crawler.can_be_checked_externally( + db.Record().add_property('a', 123)) + assert crawler.can_be_checked_externally(db.Record() + .add_property('a', db.Record(id=123))) + assert crawler.can_be_checked_externally(db.Record() + .add_property('a', 123) + .add_property('b', db.Record(id=123))) + + assert not crawler.can_be_checked_externally(db.Record() + .add_property('a', 123) + .add_property('b', db.Record())) + + +def test_replace_entities_with_ids(crawler): + a = (db.Record().add_parent("B").add_property("A", 12345) + .add_property("B", 
db.Record(id=12345)) + .add_property("C", [db.Record(id=12345), 233324])) + + crawler.replace_entities_with_ids(a) + assert a.get_property("A").value == 12345 + assert a.get_property("B").value == 12345 + assert a.get_property("C").value == [12345, 233324] + + +def mock_get_entity_by_id(id): + candidates = [el for el in list(full_data.values()) if el.id == id] + if len(candidates) > 0: + return candidates[0] + else: + raise ValueError() + + +def mock_get_entity_by_name(name): + candidates = [el for el in full_data.values() + if (el.name is not None and el.name.lower() == name.lower())] + if len(candidates) > 0: + return candidates[0] + else: + raise ValueError() + + +def prepare_crawler_with_sec_mode(mode, ident): + crawler = Crawler(debug=True, securityMode=mode) + crawler.crawl_directory(rfp("test_directories", "examples_article"), + rfp("scifolder_cfood.yml")) + crawler.identifiableAdapter = ident + + return crawler + + +def reset_mocks(mocks): + for mock in mocks: + mock.reset_mock() + + +def change_identifiable_prop(ident): + # the checks in here are only to make sure we change the record as we intend to + meas = ident._records[-2] + assert meas.parents[0].name == "Measurement" + resps = meas.properties[0] + assert resps.name == "date" + # change one element; This changes the date which is part of the identifiable + resps.value = "2022-01-04" + + +def change_non_identifiable_prop(ident): + # the checks in here are only to make sure we change the record as we intend to + meas = ident._records[-1] + assert meas.parents[0].name == "Measurement" + resps = meas.properties[-1] + assert resps.name == "responsible" + assert len(resps.value) == 2 + # change one element; This removes a responsible which is not part of the identifiable + del resps.value[-1] + + +@patch("caoscrawler.crawl.Crawler._get_entity_by_id", + new=Mock(side_effect=mock_get_entity_by_id)) +@patch("caoscrawler.crawl.Crawler._get_entity_by_name", + new=Mock(side_effect=mock_get_entity_by_name)) +@patch("caoscrawler.crawl.db.Container.insert") +@patch("caoscrawler.crawl.db.Container.update") +@patch("caoscrawler.crawl.UpdateCache.insert") +def test_security_mode(updateCacheMock, upmock, insmock, ident): + records_backup = deepcopy(ident._records) + + # trivial case: nothing to do + crawler = prepare_crawler_with_sec_mode(SecurityMode.RETRIEVE, ident) + crawler.synchronize(commit_changes=True) + assert crawler.run_id is not None + insmock.assert_not_called() + upmock.assert_not_called() + updateCacheMock.assert_not_called() + + # RETRIEVE: insert only + crawler = prepare_crawler_with_sec_mode(SecurityMode.RETRIEVE, ident) + # remove one element + del ident._records[-1] + # insert forbidden + crawler.synchronize(commit_changes=True) + assert crawler.run_id is not None + insmock.assert_not_called() + upmock.assert_not_called() + assert updateCacheMock.call_count == 1 + # reset counts + reset_mocks([updateCacheMock, insmock, upmock]) + # restore original ident + ident._records = deepcopy(records_backup) + + # RETRIEVE: update only + crawler = prepare_crawler_with_sec_mode(SecurityMode.RETRIEVE, ident) + # change one element + change_non_identifiable_prop(ident) + crawler.synchronize(commit_changes=True) + assert crawler.run_id is not None + insmock.assert_not_called() + upmock.assert_not_called() + assert updateCacheMock.call_count == 1 + # reset counts + reset_mocks([updateCacheMock, insmock, upmock]) + # restore original ident + ident._records = deepcopy(records_backup) + + # INSERT: insert only + crawler = 
prepare_crawler_with_sec_mode(SecurityMode.INSERT, ident)
+    # remove one element
+    del ident._records[-1]
+    crawler.synchronize(commit_changes=True)
+    assert crawler.run_id is not None
+    insmock.assert_called_once()
+    upmock.assert_not_called()
+    updateCacheMock.assert_not_called()
+    # reset counts
+    reset_mocks([updateCacheMock, insmock, upmock])
+    # restore original ident
+    ident._records = deepcopy(records_backup)
+
+    # INSERT: update only
+    crawler = prepare_crawler_with_sec_mode(SecurityMode.INSERT, ident)
+    # change one element
+    change_non_identifiable_prop(ident)
+    crawler.synchronize(commit_changes=True)
+    assert crawler.run_id is not None
+    insmock.assert_not_called()
+    upmock.assert_not_called()
+    updateCacheMock.assert_called_once()
+    # reset counts
+    reset_mocks([updateCacheMock, insmock, upmock])
+    # restore original ident
+    ident._records = deepcopy(records_backup)
+
+    # INSERT: insert and update
+    crawler = prepare_crawler_with_sec_mode(SecurityMode.INSERT, ident)
+    # change two elements
+    change_non_identifiable_prop(ident)
+    change_identifiable_prop(ident)
+    crawler.synchronize(commit_changes=True)
+    assert crawler.run_id is not None
+    insmock.assert_called_once()
+    upmock.assert_not_called()
+    updateCacheMock.assert_called_once()
+    # reset counts
+    reset_mocks([updateCacheMock, insmock, upmock])
+    # restore original ident
+    ident._records = deepcopy(records_backup)
diff --git a/unittests/test_tool_extended.py b/unittests/test_tool_extended.py
new file mode 100644
index 0000000000000000000000000000000000000000..d0b431a539a15e3e83906540c69becff437742ec
--- /dev/null
+++ b/unittests/test_tool_extended.py
@@ -0,0 +1,78 @@
+#!/bin/python
+# Tests for the tool using pytest
+# Adapted from check-sfs
+# A. Schlemmer, 06/2021
+
+from caoscrawler import Crawler
+from caoscrawler.structure_elements import File, DictTextElement, DictListElement
+from caoscrawler.identifiable_adapters import IdentifiableAdapter, LocalStorageIdentifiableAdapter
+from functools import partial
+from copy import deepcopy
+from unittest.mock import MagicMock, Mock
+from os.path import join, dirname, basename
+import yaml
+import caosdb as db
+from caosdb.apiutils import compare_entities
+
+import pytest
+from pytest import raises
+
+
+def rfp(*pathcomponents):
+    """
+    Return full path.
+    Shorthand convenience function.
+    """
+    return join(dirname(__file__), *pathcomponents)
+
+
+def dircheckstr(*pathcomponents, structure_element_type="Directory"):
+    """
+    Return the debug tree identifier for a given path.
+    """
+    return ("caoscrawler.structure_elements."
+ structure_element_type + ": " + + basename(join(*pathcomponents)) + ", " + + rfp("test_directories", "examples_article", *pathcomponents)) + + +@pytest.fixture +def crawler(): + crawler = Crawler(debug=True) + crawler.crawl_directory(rfp("test_directories", "examples_article"), + rfp("scifolder_extended.yml")) + return crawler + + +# @pytest.fixture +# def ident(crawler): +# ident = LocalStorageIdentifiableAdapter() +# crawler.identifiableAdapter = ident + +# ident.restore_state(rfp("records.xml")) + +# ident.register_identifiable( +# "Person", db.RecordType() +# .add_parent(name="Person") +# .add_property(name="first_name") +# .add_property(name="last_name")) +# ident.register_identifiable( +# "Measurement", db.RecordType() +# .add_parent(name="Measurement") +# .add_property(name="identifier") +# .add_property(name="date") +# .add_property(name="project")) +# ident.register_identifiable( +# "Project", db.RecordType() +# .add_parent(name="Project") +# .add_property(name="date") +# .add_property(name="identifier")) +# return ident + + +def test_file_structure_generation(crawler): + sd = crawler.debug_tree[dircheckstr("SimulationData", + "2020_climate-model-predict", "2020-02-01", + "README.md", structure_element_type="File")] + assert sd[1]["ReadmeFile"].role == "File" + assert len(sd[1]["ReadmeFile"].path) > 0 + assert len(sd[1]["ReadmeFile"].file) > 0 diff --git a/unittests/test_validation.py b/unittests/test_validation.py new file mode 100644 index 0000000000000000000000000000000000000000..686c66f72f55b66344322e0c6f3b9d1a2b76b3f9 --- /dev/null +++ b/unittests/test_validation.py @@ -0,0 +1,34 @@ +#!/usr/bin/env python3 +# encoding: utf-8 +# +# ** header v3.0 +# This file is a part of the CaosDB Project. +# +# Copyright (C) 2022 Alexander Schlemmer +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <https://www.gnu.org/licenses/>. +# +# ** end header +# + +""" +Test the validation of cfood definition files. +""" + +from caoscrawler.crawl import Crawler + +from tempfile import NamedTemporaryFile + +import yaml +import pytest diff --git a/unittests/test_variable_substitutions.py b/unittests/test_variable_substitutions.py new file mode 100644 index 0000000000000000000000000000000000000000..203197b7f8af51605a413ac354a0426d61c9c0cb --- /dev/null +++ b/unittests/test_variable_substitutions.py @@ -0,0 +1,85 @@ +#!/bin/python +# Tests for variable substitutions +# A. Schlemmer, 05/2022 + +from caoscrawler import Crawler +from caoscrawler.structure_elements import File, DictTextElement, DictListElement +from caoscrawler.identifiable_adapters import IdentifiableAdapter, LocalStorageIdentifiableAdapter +from functools import partial +from copy import deepcopy +from unittest.mock import MagicMock, Mock +from os.path import join, dirname, basename +import yaml +import caosdb as db +from caosdb.apiutils import compare_entities + +import pytest +from pytest import raises + + +def rfp(*pathcomponents): + """ + Return full path. 
+ Shorthand convenience function. + """ + return join(dirname(__file__), *pathcomponents) + + +def dircheckstr(element_type, *pathcomponents): + """ + Return the debug tree identifier for a given path. + """ + return "caoscrawler.structure_elements." + element_type + ": " + basename(join(*pathcomponents)) + ", " + rfp("test_directories", "example_substitutions", *pathcomponents) + + +@pytest.fixture +def crawler(): + crawler = Crawler(debug=True) + crawler.crawl_directory(rfp("test_directories", "example_substitutions", "ExperimentalData"), + rfp("test_directories", "example_substitutions", "substitutions.yml")) + return crawler + + +@pytest.fixture +def crawler_2(): + crawler = Crawler(debug=True) + crawler.crawl_directory(rfp("test_directories", "example_substitutions", "ExperimentalData"), + rfp("test_directories", "example_substitutions", + "substitutions_parents.yml")) + return crawler + + +def test_substitutions(crawler): + # @review Florian Spreckelsen 2022-05-13 + for i in range(2): + subd = crawler.debug_tree[dircheckstr( + "File", "ExperimentalData", "220512_data.dat")] + assert subd[i]["Experiment"].get_property("date").value == "2022-05-12" + assert isinstance(subd[i]["ExperimentSeries"].get_property( + "Experiment").value, db.Record) + + subd = crawler.debug_tree[dircheckstr("Directory", "ExperimentalData")] + assert subd[i]["Project"].name == "project" + assert isinstance(subd[i]["Project"].get_property( + "Experiments").value, list) + assert isinstance(subd[i]["Project"].get_property( + "Experiments").value[0], db.Record) + + assert isinstance(subd[i]["Project"].get_property("dates").value, list) + assert subd[i]["Project"].get_property( + "dates").value[0] == "2022-05-12" + + +def test_substitutions_parents(crawler_2): + # This is a test for: + # https://gitlab.indiscale.com/caosdb/src/caosdb-crawler/-/issues/35 + # ... testing whether variable substitutions can be used in parent declarations. + subd = crawler_2.debug_tree[dircheckstr( + "File", "ExperimentalData", "220512_data.dat")] + # subd[0] <- generalStore + # subd[1] <- recordStore + + parents = subd[1]["Experiment"].get_parents() + assert len(parents) == 2 + assert parents[0].name == "Experiment" + assert parents[1].name == "Month_05"