Commit f1ae11ed authored by Alexander Schlemmer

Merge branch 'dev' into f-macros

parents 73d4d222 d692474c
Merge requests: !53 Release 0.1, !25 F macros
Pipeline #28523 failed

Showing changes with 673 additions and 186 deletions
@@ -10,14 +10,16 @@ RUN apt-get update && \
     tox \
     -y
 COPY .docker/wait-for-it.sh /wait-for-it.sh
+ARG PYLIB
 ADD https://gitlab.indiscale.com/api/v4/projects/97/repository/commits/${PYLIB} \
     pylib_version.json
 RUN git clone https://gitlab.indiscale.com/caosdb/src/caosdb-pylib.git && \
     cd caosdb-pylib && git checkout ${PYLIB} && pip3 install .
+ARG ADVANCED
 ADD https://gitlab.indiscale.com/api/v4/projects/104/repository/commits/${ADVANCED} \
     advanced_version.json
 RUN git clone https://gitlab.indiscale.com/caosdb/src/caosdb-advanced-user-tools.git && \
-    cd caosdb-advanced-user-tools && git checkout ${ADVANCED} && pip3 install .
+    cd caosdb-advanced-user-tools && git checkout ${ADVANCED} && pip3 install .[h5-crawler]
 COPY . /git
 # Delete .git because it is huge.
...
@@ -13,3 +13,4 @@ provenance.yml
 *.jks
 *.tar.gz
 *.sql
+/integrationtests/test-profile/custom/other/cert/
@@ -211,6 +211,17 @@ build-testenv:
     - PYLIB=${PYLIB:-dev}
     - echo $PYLIB
+    - if [ -z "$ADVANCED" ]; then
+        if echo "$CI_COMMIT_REF_NAME" | grep -c "^f-" ; then
+          echo "Check if advanced user tools have branch $CI_COMMIT_REF_NAME" ;
+          if wget https://gitlab.indiscale.com/api/v4/projects/104/repository/branches/${CI_COMMIT_REF_NAME} ; then
+            ADVANCED=$CI_COMMIT_REF_NAME ;
+          fi;
+        fi;
+      fi;
+    - ADVANCED=${ADVANCED:-dev}
+    - echo $ADVANCED
     - docker login -u gitlab-ci-token -p $CI_JOB_TOKEN $CI_REGISTRY
     # use here general latest or specific branch latest...
     - docker build
...
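The added CI logic only pins `ADVANCED` to the current feature branch when a branch of the same name exists in the advanced-user-tools project. A rough Python sketch of the same check, against the same GitLab API endpoint as the `wget` call above (the branch name is an illustrative stand-in for `$CI_COMMIT_REF_NAME`):

```python
import urllib.error
import urllib.request

ref = "f-macros"   # illustrative stand-in for $CI_COMMIT_REF_NAME
advanced = "dev"   # default, mirroring ADVANCED=${ADVANCED:-dev}

if ref.startswith("f-"):
    url = ("https://gitlab.indiscale.com/api/v4/projects/104"
           f"/repository/branches/{ref}")
    try:
        urllib.request.urlopen(url)  # succeeds only if the branch exists
        advanced = ref
    except urllib.error.HTTPError:
        pass  # no matching branch, keep the dev fallback

print(advanced)
```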
@@ -10,6 +10,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 ### Added
 
 * Everything
+* Added new converters for tables: CSVTableConverter and XLSXTableConverter
+* Possibility to authorize updates as in the old crawler
+* Allow authorization of inserts
 
 ### Changed
@@ -21,4 +24,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
 ### Fixed
 
+* Fixed #12
+* FIX: Variables are now also replaced when the value is given as a list.
+
 ### Security
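The "Fixed" entry about list values corresponds to the new `replace_variables` helper in the `converters.py` hunk further below: substitution is now applied to every list element instead of the list being returned untouched. A minimal, self-contained sketch of the intended behavior (the store contents are invented for illustration):

```python
from string import Template

store = {"a": "4", "b": "hallo"}

def substitute(value, store):
    # The fix: when the value is a list, substitute in every element
    # instead of returning the list as-is.
    if isinstance(value, list):
        return [Template(v).safe_substitute(**store) for v in value]
    return Template(value).safe_substitute(**store)

assert substitute("$a", store) == "4"
assert substitute(["$a", "$b", "plain"], store) == ["4", "hallo", "plain"]
```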
@@ -25,6 +25,8 @@ After installation of the package run (within the project folder):
 pytest
 ```
 
+## Integration Tests
+see `integrationtests/README.md`
 
 # Contributers
...
 1. Mount test_data/extroot as extroot folder in the CaosDB server
 2. use an empty server
+3. run pytest from `src`: `python -m pytest ../integrationtests`
@@ -28,12 +28,13 @@
 module description
 """
+from caosadvancedtools.crawler import Crawler as OldCrawler
 import os
 from caosdb import EmptyUniqueQueryError
 import argparse
 import sys
 from argparse import RawTextHelpFormatter
-from caoscrawler import Crawler
+from caoscrawler import Crawler, SecurityMode
 import caosdb as db
 from caoscrawler.identifiable_adapters import CaosDBIdentifiableAdapter
 import pytest
@@ -41,8 +42,8 @@ from caosadvancedtools.models.parser import parse_model_from_yaml
 import yaml
 
 # TODO is not yet merged in caosadvancedtools
-from caosadvancedtools.testutils import clear_database, set_test_key
-set_test_key("10b128cf8a1372f30aa3697466bb55e76974e0c16a599bb44ace88f19c8f61e2")
+#from caosadvancedtools.testutils import clear_database, set_test_key
+# set_test_key("10b128cf8a1372f30aa3697466bb55e76974e0c16a599bb44ace88f19c8f61e2")
 
 
 def rfp(*pathcomponents):
@@ -53,6 +54,11 @@ def rfp(*pathcomponents):
     return os.path.join(os.path.dirname(__file__), *pathcomponents)
 
 
+@pytest.fixture
+def clear_database():
+    db.execute_query("FIND Entity").delete()
+
+
 @pytest.fixture
 def usemodel():
     model = parse_model_from_yaml(rfp("model.yml"))
@@ -86,8 +92,8 @@ def ident():
 def crawl_standard_test_directory(cr: Crawler,
                                   subdir: str = "examples_article",
                                   cfood: str = "scifolder_cfood.yml"):
-    cr.crawl_directory(rfp("..", "unittests", "test_directories", subdir),
-                       rfp("..", "unittests", cfood))
+    cr.crawl_directory(rfp("..", "..", "unittests", "test_directories", subdir),
+                       rfp("..", "..", "unittests", cfood))
 
 
 @pytest.fixture
@@ -102,15 +108,13 @@ def crawler_extended(ident):
     cr = Crawler(debug=True, identifiableAdapter=ident)
     crawl_standard_test_directory(cr, cfood="scifolder_extended.yml")
     # correct paths for current working directory
-    updateList = cr.updateList
-    fileList = [r for r in updateList if r.role == "File"]
-    for f in fileList:
-        f.file = rfp("..", "unittests", "test_directories",
-                     "examples_article", f.file)
+    file_list = [r for r in cr.target_data if r.role == "File"]
+    for f in file_list:
+        f.file = rfp("..", "..", "unittests", "test_directories", f.file)
     return cr
 
 
-def test_single_insertion(clear_database, usemodel, crawler):
+def test_single_insertion(clear_database, usemodel, crawler, ident):
     ins, ups = crawler.synchronize()
 
     # This test also generates the file records.xml used in some of the unittesets:
@@ -118,7 +122,7 @@ def test_single_insertion(clear_database, usemodel, crawler):
     for i in reversed(range(len(res))):
         if res[i].parents[0].name == "PyTestInfo":
             del res[i]
-    filename = rfp("..", "unittests", "records.xml")
+    filename = rfp("..", "..", "unittests", "records.xml")
     with open(filename, "w") as f:
         xml = res.to_xml()
         # Remove noscript and transaction benchmark:
@@ -131,10 +135,9 @@ def test_single_insertion(clear_database, usemodel, crawler):
     assert len(ups) == 0
 
     # Do a second run on the same data, there should be no changes:
-    crawler = Crawler(debug=True, identifiableAdapter=ident_adapt)
-    crawler.copy_attributes = Mock()
-    crawler.crawl_directory(rfp("../unittests/test_directories", "examples_article"),
-                            rfp("../unittests/scifolder_cfood.yml"))
+    crawler = Crawler(debug=True, identifiableAdapter=ident)
+    crawler.crawl_directory(rfp("../../unittests/test_directories", "examples_article"),
+                            rfp("../../unittests/scifolder_cfood.yml"))
     ins, ups = crawler.synchronize()
     assert len(ins) == 0
     assert len(ups) == 0
@@ -157,7 +160,7 @@ def test_insertion(clear_database, usemodel, ident, crawler):
     # Do a second run on the same data, there should a new insert:
     cr = Crawler(debug=True, identifiableAdapter=ident)
     crawl_standard_test_directory(cr, "example_insert")
-    assert len(cr.updateList) == 3
+    assert len(cr.target_data) == 3
     ins, ups = cr.synchronize()
     assert len(ins) == 1
     assert len(ups) == 0
@@ -165,7 +168,29 @@ def test_insertion(clear_database, usemodel, ident, crawler):
     # Do it again to check whether nothing is changed:
     cr = Crawler(debug=True, identifiableAdapter=ident)
     crawl_standard_test_directory(cr, "example_insert")
-    assert len(cr.updateList) == 3
+    assert len(cr.target_data) == 3
+    ins, ups = cr.synchronize()
+    assert len(ins) == 0
+    assert len(ups) == 0
+
+
+def test_insert_auth(clear_database, usemodel, ident, crawler):
+    ins, ups = crawler.synchronize()
+
+    # Do a second run on the same data, there should a new insert:
+    cr = Crawler(debug=True, identifiableAdapter=ident, securityMode=SecurityMode.RETRIEVE)
+    crawl_standard_test_directory(cr, "example_insert")
+    assert len(cr.target_data) == 3
+    ins, ups = cr.synchronize()
+    assert len(ins) == 1
+    assert not ins[0].is_valid()
+    nins, nups = OldCrawler.update_authorized_changes(cr.run_id)
+    assert nins == 1
+
+    # Do it again to check whether nothing is changed:
+    cr = Crawler(debug=True, identifiableAdapter=ident)
+    crawl_standard_test_directory(cr, "example_insert")
+    assert len(cr.target_data) == 3
     ins, ups = cr.synchronize()
     assert len(ins) == 0
     assert len(ups) == 0
@@ -180,9 +205,9 @@ def test_insertion_and_update(clear_database, usemodel, ident, crawler):
     cr = Crawler(debug=True, identifiableAdapter=ident)
     crawl_standard_test_directory(cr, "example_overwrite_1")
-    # print(cr.updateList)
+    # print(cr.target_data)
     # cr.save_debug_data(rfp("provenance.yml"))
-    assert len(cr.updateList) == 3
+    assert len(cr.target_data) == 3
     ins, ups = cr.synchronize()
     assert len(ins) == 0
     assert len(ups) == 1
@@ -197,7 +222,7 @@ def test_identifiable_update(clear_database, usemodel, ident, crawler):
     crawl_standard_test_directory(cr)
 
     # Test the addition of a single property:
-    l = cr.updateList
+    l = cr.target_data
     for record in l:
         if (record.parents[0].name == "Measurement" and
                 record.get_property("date").value == "2020-01-03"):
@@ -213,7 +238,7 @@ def test_identifiable_update(clear_database, usemodel, ident, crawler):
     # Test the change within one property:
     cr = Crawler(debug=True, identifiableAdapter=ident)
     crawl_standard_test_directory(cr)
-    l = cr.updateList
+    l = cr.target_data
     for record in l:
         if (record.parents[0].name == "Measurement" and
                 record.get_property("date").value == "2020-01-03"):
@@ -227,7 +252,7 @@ def test_identifiable_update(clear_database, usemodel, ident, crawler):
     # Changing the date should result in a new insertion:
     cr = Crawler(debug=True, identifiableAdapter=ident)
     crawl_standard_test_directory(cr)
-    l = cr.updateList
+    l = cr.target_data
     for record in l:
         if (record.parents[0].name == "Measurement" and
                 record.get_property("date").value == "2020-01-03"):
@@ -244,24 +269,23 @@ def test_file_insertion_dry(clear_database, usemodel, ident):
     crawler_extended = Crawler(debug=True, identifiableAdapter=ident)
     crawl_standard_test_directory(
         crawler_extended, cfood="scifolder_extended.yml")
-    updateList = crawler_extended.updateList
-    fileList = [r for r in updateList if r.role == "File"]
-    assert len(fileList) == 11
+    file_list = [r for r in crawler_extended.target_data if r.role == "File"]
+    assert len(file_list) == 11
 
-    for f in fileList:
+    for f in file_list:
         assert f.path.endswith("README.md")
-        assert f.path == f.file
+        assert f.path[1:] == f.file
 
     ins, ups = crawler_extended.synchronize(commit_changes=False)
     assert len(ups) == 0
-    fileList_ins = [r for r in ins if r.role == "File"]
-    assert len(fileList_ins) == 11
+    file_list_ins = [r for r in ins if r.role == "File"]
+    assert len(file_list_ins) == 11
 
 
 def test_file_insertion(clear_database, usemodel, ident, crawler_extended):
     ins, ups = crawler_extended.synchronize(commit_changes=True)
-    fileList_ins = [r for r in ins if r.role == "File"]
-    assert len(fileList_ins) == 11
+    file_list_ins = [r for r in ins if r.role == "File"]
+    assert len(file_list_ins) == 11
 
     assert db.execute_query("COUNT File") > 0
@@ -276,16 +300,14 @@ def test_file_insertion(clear_database, usemodel, ident, crawler_extended):
 def test_file_update(clear_database, usemodel, ident, crawler_extended):
     ins1, ups1 = crawler_extended.synchronize(commit_changes=True)
-    fileList_ins = [r for r in ins1 if r.role == "File"]
+    file_list_ins = [r for r in ins1 if r.role == "File"]
 
     cr = Crawler(debug=True, identifiableAdapter=ident)
     crawl_standard_test_directory(cr, cfood="scifolder_extended.yml")
 
-    updateList = cr.updateList
-    fileList = [r for r in updateList if r.role == "File"]
-    for f in fileList:
-        f.file = rfp("..", "unittests", "test_directories",
-                     "examples_article", f.file)
+    file_list = [r for r in cr.target_data if r.role == "File"]
+    for f in file_list:
+        f.file = rfp("..", "..", "unittests", "test_directories", f.file)
     ins2, ups2 = cr.synchronize(commit_changes=True)
     assert len(ups1) == 0
     assert len(ups2) == 0
@@ -298,11 +320,9 @@ def test_file_update(clear_database, usemodel, ident, crawler_extended):
     cr2 = Crawler(debug=True, identifiableAdapter=ident)
     crawl_standard_test_directory(cr2, cfood="scifolder_extended2.yml")
 
-    updateList = cr2.updateList
-    fileList = [r for r in updateList if r.role == "File"]
-    for f in fileList:
-        f.file = rfp("..", "unittests", "test_directories",
-                     "examples_article", f.file)
+    file_list = [r for r in cr2.target_data if r.role == "File"]
+    for f in file_list:
+        f.file = rfp("..", "..", "unittests", "test_directories", f.file)
     ins3, ups3 = cr2.synchronize(commit_changes=True)
     assert len(ups3) == 11
@@ -313,4 +333,4 @@ def test_file_update(clear_database, usemodel, ident, crawler_extended):
     # TODO: Implement file update checks (based on checksum)
     # Add test with actual file update:
     # assert len(ins2) == 0
-    # assert len(ups2) == len(fileList_ins)
+    # assert len(ups2) == len(file_list_ins)
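The new `test_insert_auth` above exercises the authorization feature listed in the changelog: with `SecurityMode.RETRIEVE` the crawler holds inserts back as pending changes, which are then authorized by run ID. A condensed sketch of that workflow, assuming a configured CaosDB connection (the directory, cfood path, and identifiable registration below are placeholders):

```python
import caosdb as db
from caoscrawler import Crawler, SecurityMode
from caoscrawler.identifiable_adapters import CaosDBIdentifiableAdapter
from caosadvancedtools.crawler import Crawler as OldCrawler

ident = CaosDBIdentifiableAdapter()
ident.register_identifiable(
    "Measurement", db.RecordType().add_parent("Measurement").add_property("date"))

# In RETRIEVE mode, synchronize() does not commit inserts/updates itself:
cr = Crawler(identifiableAdapter=ident, securityMode=SecurityMode.RETRIEVE)
cr.crawl_directory("extroot/ExperimentalData", "scifolder_cfood.yml")
ins, ups = cr.synchronize()

# The pending changes of this crawler run are authorized in a second step:
nins, nups = OldCrawler.update_authorized_changes(cr.run_id)
```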
[Connection]
url=https://localhost:10443/
username=admin
debug=0
#cacert=/home//CaosDB/caosdb-deploy/profiles/default/custom/other/cert/caosdb.cert.pem
password_method=plain
password=caosdb
ssl_insecure=True
timeout=5000
[Container]
debug=0
#[Crawler]
#oldprefix=/ExperimentalData/
#newprefix=/home/professional/CaosDB/caosdb-advanced-user-tools/integrationtests/extroot/ExperimentalData
#[IntegrationTests]
#test_server_side_scripting.bin_dir=/home/professional/CaosDB/caosdb-pyinttest/resources
[Misc]
sendmail=sendmail_to_file
#sendmail=/usr/local/bin/sendmail_to_file
entity_loan.curator_mail_from=admin@indiscale.com
entity_loan.curator_mail_to=admin@indiscale.com
[sss_helper]
external_uri = https://localhost:10443
[advancedtools]
crawler.from_mail=admin@indiscale.com
crawler.to_mail=admin@indiscale.com
@@ -318,6 +318,13 @@ Data:
       Dataset:
         Project: $Project
     subtree:
+      name_element:
+        type: DictTextElement
+        match_name: "name"
+        match_value: "(?P<name>.*)"
+        records:
+          Project:
+            name: $name
       full_name_element:
         type: DictTextElement
         match_name: "full_name"
...
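The inserted `name_element` node matches a dictionary entry called `name`, captures its value through the named regex group, and assigns it to the `Project` record as `$name`. The capture itself works like this (the input string is invented):

```python
import re

# match_value: "(?P<name>.*)" exposes the matched text under the key "name"
m = re.match("(?P<name>.*)", "Fireworks Simulation 2022")
assert m.groupdict() == {"name": "Fireworks Simulation 2022"}
```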
license:
  - name
project_type:
  - name
Keyword:
  - name
Taxon:
  - name
Person:
  - email
  # - full_name
Dataset:
  - title
  # - DOI
Event:
  - longitude
  - latitude
  - start_datetime
Dataspace:
  - dataspace_id
Project:
  - name
../../../pycaosdb.ini
\ No newline at end of file
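The YAML above defines, per record type, which properties make a record identifiable (a `Person` by `email`, an `Event` by coordinates and start time, and so on). As in `test_event_update` further below, such a definition can be loaded directly into an identifiable adapter; a minimal sketch with an assumed file path:

```python
from caoscrawler.identifiable_adapters import CaosDBIdentifiableAdapter

ident = CaosDBIdentifiableAdapter()
ident.load_from_yaml_definition("identifiables.yml")  # path is an assumption
```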
@@ -29,7 +29,7 @@ import os
 import caosdb as db
 
-from caoscrawler.crawl import Crawler
+from caoscrawler.crawl import Crawler, main as crawler_main
 from caoscrawler.converters import JSONFileConverter, DictConverter
 from caoscrawler.identifiable_adapters import CaosDBIdentifiableAdapter
 from caoscrawler.structure_elements import File, JSONFile, Directory
@@ -81,20 +81,27 @@ def clear_database():
     ents.delete()
 
 
-def test_dataset(
-        clear_database,
-        usemodel):
-    # json_file_path = rfp("test_directories", "single_file_test_data", "testjson.json")
-
+def create_identifiable_adapter():
     ident = CaosDBIdentifiableAdapter()
-    ident.register_identifiable(
-        "license", db.RecordType().add_parent("license").add_property("name"))
-    ident.register_identifiable("project_type", db.RecordType(
-    ).add_parent("project_type").add_property("name"))
-    ident.register_identifiable("Person", db.RecordType(
-    ).add_parent("Person").add_property("full_name"))
-
-    crawler = Crawler(debug=True, identifiableAdapter=ident)
+    ident.register_identifiable("license", (
+        db.RecordType()
+        .add_parent("license")
+        .add_property("name")))
+    ident.register_identifiable("project_type", (
+        db.RecordType()
+        .add_parent("project_type")
+        .add_property("name")))
+    ident.register_identifiable("Person", (
+        db.RecordType()
+        .add_parent("Person")
+        .add_property("full_name")))
+    return ident
+
+
+def test_dataset(clear_database, usemodel):
+    ident = create_identifiable_adapter()
+    crawler = Crawler(identifiableAdapter=ident)
     crawler_definition = crawler.load_definition(
         os.path.join(DATADIR, "dataset_cfoods.yml"))
     # print(json.dumps(crawler_definition, indent=3))
@@ -107,13 +114,7 @@ def test_dataset(
         crawler_definition,
         converter_registry
     )
-    subd = crawler.debug_tree
-    subc = crawler.debug_metadata
-    # print(json.dumps(subc, indent=3))
-    # print(subd)
-    # print(subc)
-    # print(records)
-    ins, ups = crawler.synchronize()
+    crawler.synchronize()
 
     dataspace = db.execute_query("FIND RECORD Dataspace WITH name=35 AND dataspace_id=20002 AND "
                                  "archived=FALSE AND url='https://datacloud.de/index.php/f/7679'"
@@ -130,3 +131,80 @@ def test_dataset(
         "") == 1
     assert db.execute_query(f"COUNT RECORD with id={dataset.id} AND WHICH REFERENCES Event WITH "
                             "start_datetime='2022-02-10T16:36:48+01:00'") == 1
+
+
+def test_event_update(clear_database, usemodel):
+    identifiable_path = os.path.join(DATADIR, "identifiables.yml")
+    crawler_definition_path = os.path.join(DATADIR, "dataset_cfoods.yml")
+
+    # TODO(fspreck): Use crawler_main
+    crawler_main(
+        os.path.join(DATADIR, 'data'),
+        crawler_definition_path,
+        identifiable_path,
+        True,
+        os.path.join(DATADIR, "provenance.yml"),
+        False,
+        ""
+    )
+
+    old_dataset_rec = db.execute_query(
+        "FIND RECORD Dataset WHICH HAS AN EVENT WITH location='Bremen, Germany'")
+    assert len(old_dataset_rec) == 1
+    old_dataset_rec = old_dataset_rec[0]
+    assert old_dataset_rec.get_property("Event").datatype == db.LIST("Event")
+    assert len(old_dataset_rec.get_property("Event").value) == 1
+    old_event_rec = db.Record(
+        id=old_dataset_rec.get_property("Event").value[0]).retrieve()
+
+    # TODO(fspreck): crawl again manually, edit the event records in the update
+    # list, synchronize, and test whether the events have been updated.
+    ident = CaosDBIdentifiableAdapter()
+    ident.load_from_yaml_definition(identifiable_path)
+
+    second_crawler = Crawler(identifiableAdapter=ident)
+    crawler_definition = second_crawler.load_definition(
+        crawler_definition_path)
+    converter_registry = second_crawler.load_converters(crawler_definition)
+    records = second_crawler.start_crawling(
+        Directory("data", os.path.join(DATADIR, "data")),
+        crawler_definition,
+        converter_registry
+    )
+
+    for rec in records:
+        if rec.parents[0].name == "Event":
+            rec.get_property("longitude").value = 0.0
+            rec.get_property("latitude").value = 0.0
+            rec.get_property("location").value = "Origin"
+        elif rec.parents[0].name == "Dataset":
+            rec.get_property("Event").value[0].get_property(
+                "longitude").value = 0.0
+            rec.get_property("Event").value[0].get_property(
+                "latitude").value = 0.0
+            rec.get_property("Event").value[0].get_property(
+                "location").value = "Origin"
+    second_crawler.synchronize()
+
+    # Dataset is still the same Record, but with an updated event
+    new_dataset_rec = db.Record(id=old_dataset_rec.id).retrieve()
+    for prop in old_dataset_rec.get_properties():
+        if not prop.name == "Event":
+            assert new_dataset_rec.get_property(
+                prop.name).datatype == prop.datatype
+            assert new_dataset_rec.get_property(
+                prop.name).value == prop.value
+    assert new_dataset_rec.get_property("Event").datatype == db.LIST("Event")
+    assert new_dataset_rec.get_property("Event").value is not None
+    assert len(new_dataset_rec.get_property("Event").value) == 1
+    assert new_dataset_rec.get_property("Event").value[0] != old_event_rec.id
+
+    # The event has new properties
+    new_event_rec = db.Record(
+        id=new_dataset_rec.get_property("Event").value[0]).retrieve()
+    assert new_event_rec.get_property("longitude").value == 0.0
+    assert new_event_rec.get_property("latitude").value == 0.0
+    assert new_event_rec.get_property("location").value == "Origin"
+    assert new_event_rec.get_property(
+        "start_datetime").value == old_event_rec.get_property("start_datetime").value
@@ -32,7 +32,7 @@ from subprocess import run
 import caosdb as db
 from caosadvancedtools.loadFiles import loadpath
 from caosadvancedtools.models import parser as parser
-from caoscrawler.crawl import crawler_main
+from caoscrawler.crawl import main as crawler_main
 
 # TODO(fspreck) Re-eneable once this is part of dev in advancedusertools.
@@ -77,7 +77,6 @@ def test_complete_crawler(
         True,
         os.path.join(DATADIR, "provenance.yml"),
         False,
-        True,
         "/use_case_simple_presentation")
 
     res = db.execute_query("FIND Record Experiment")
...
@@ -24,6 +24,8 @@ install_requires =
     caosadvancedtools
     yaml-header-tools
     pyyaml
+    odfpy #make optional
+    pandas
 
 [options.packages.find]
 where = src
...
-from .crawl import Crawler
+from .crawl import Crawler, SecurityMode
#!/usr/bin/env python3
# encoding: utf-8
#
# This file is a part of the CaosDB Project.
#
# Copyright (C) 2022 Henrik tom Wörden
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
#
from caosadvancedtools.crawler import Crawler as OldCrawler
import argparse


def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument("run_id",
                        help="Run ID of the crawler run that created the changes that shall be "
                             "authorized.")
    return parser.parse_args()


if __name__ == "__main__":
    args = parse_args()
    OldCrawler.update_authorized_changes(args.run_id)
@@ -23,6 +23,8 @@ cfood:
         - Definitions
         - Dict
         - JSONFile
+        - CSVTableConverter
+        - XLSXTableConverter
       description: Type of this converter node.
   match:
     description: typically a regexp which is matched to a structure element name
...
@@ -40,6 +40,8 @@ from abc import abstractmethod
 from string import Template
 import yaml_header_tools
 
+import pandas as pd
+
 import yaml
 
 # These are special properties which are (currently) treated differently
@@ -48,6 +50,15 @@ SPECIAL_PROPERTIES = ("description", "name", "id", "path",
                       "file", "checksum", "size")
 
 
+def str_to_bool(x):
+    if str(x).lower() == "true":
+        return True
+    elif str(x).lower() == "false":
+        return False
+    else:
+        raise RuntimeError("Should be 'true' or 'false'.")
+
+
 class ConverterValidationError(Exception):
 
     """To be raised if contents of an element to be converted are invalid."""
@@ -55,13 +66,37 @@ class ConverterValidationError(Exception):
         self.message = msg
 
 
-def handle_value(value: Union[dict, str], values: GeneralStore):
-    """
-    Function to generically handle values for properties defined in the
-    yaml structure.
+def replace_variables(propvalue, values: GeneralStore):
+    # Check if the replacement is a single variable containing a record:
+    match = re.match(r"^\$(\{)?(?P<varname>[0-9a-zA-Z_]+)(\})?$", propvalue)
+    if match is not None:
+        varname = match.group("varname")
+        if varname in values:
+            if values[varname] is None:
+                return None
+            if isinstance(values[varname], db.Entity):
+                return values[varname]
+
+    propvalue_template = Template(propvalue)
+    return propvalue_template.safe_substitute(**values.get_storage())
+
+
+def handle_value(value: Union[dict, str, list], values: GeneralStore):
+    """
+    determines whether the given value needs to set a property, be added to an existing value
+    (create a list) or add as an additional property (multiproperty).
+
+    Variable names (starting with a "$") are replaced by the corresponding value stored in the
+    `values` GeneralStore.
+
+    Parameters:
+    - value: if str, the value to be interpreted. E.g. "4", "hallo" or "$a" etc.
+             if dict, must have keys "value" and "collection_mode". The returned tuple is directly
+             created from the corresponding values.
+             if list, each element is checked for replacement and the resulting list will be used
+             as (list) value for the property
 
     Returns a tuple:
-    - the final value of the property
+    - the final value of the property; variable names contained in `values` are replaced.
     - the collection mode (can be single, list or multiproperty)
     """
     # @review Florian Spreckelsen 2022-05-13
@@ -90,22 +125,19 @@ def handle_value(value: Union[dict, str], values: GeneralStore):
         # different from the two cases above.
         collection_mode = "single"
         propvalue = value
-        return (propvalue, collection_mode)
 
-    # Check if the replacement is a single variable containing a record:
-    match = re.match(r"^\$(\{)?(?P<varname>[0-9a-zA-Z_]+)(\})?$", propvalue)
-    if match is not None:
-        varname = match.group("varname")
-        if varname in values:
-            if values[varname] is None:
-                propvalue = None
-                return (propvalue, collection_mode)
-            if isinstance(values[varname], db.Entity):
-                propvalue = values[varname]
-                return (propvalue, collection_mode)
+        # variables replacement:
+        propvalue = [replace_variables(i, values) for i in propvalue]
 
-    propvalue_template = Template(propvalue)
-    propvalue = propvalue_template.safe_substitute(**values.get_storage())
+        return (propvalue, collection_mode)
+    else:
+        # value is another simple type
+        # collection_mode = "single"
+        # propvalue = value["value"]
+        # return (propvalue, collection_mode)
+        raise RuntimeError()
 
+    propvalue = replace_variables(propvalue, values)
     return (propvalue, collection_mode)
@@ -113,7 +145,7 @@ def create_records(values: GeneralStore,
                    records: RecordStore,
                    def_records: dict):
     # list of keys to identify, which variables have been set by which paths:
-    # these are tuples:
+    # the items are tuples:
     # 0: record name
     # 1: property name
     keys_modified = []
@@ -143,6 +175,11 @@ def create_records(values: GeneralStore,
         for key, value in record.items():
             if key == "parents" or key == "role":
                 continue
+
+            # Allow replacing variables in keys / names of properties:
+            key_template = Template(key)
+            key = key_template.safe_substitute(**values.get_storage())
+
             keys_modified.append((name, key))
             propvalue, collection_mode = handle_value(value, values)
@@ -151,6 +188,9 @@ def create_records(values: GeneralStore,
                 # list mode does not work for them
                 if key == "path" and not propvalue.startswith(os.path.sep):
                     propvalue = os.path.sep + propvalue
+
+                    # Convert relative to absolute paths:
+                    propvalue = os.path.normpath(propvalue)
                 setattr(c_record, key, propvalue)
             else:
@@ -607,3 +647,102 @@ class TextElementConverter(Converter):
         if m is None:
             return None
         return m.groupdict()
+
+
+class TableConverter(Converter):
+    """
+    This converter reads tables in different formats line by line and
+    allows matching the corresponding rows.
+
+    The subtree generated by the table converter consists of DictDictElements, each being
+    a row. The corresponding header elements will become the dictionary keys.
+
+    The rows can be matched using a DictDictElementConverter.
+    """
+    @abstractmethod
+    def get_options(self):
+        """
+        This method needs to be overwritten by the specific table converter to provide
+        information about the possible options.
+        """
+        pass
+
+    def _get_options(self, possible_options):
+        option_dict = dict()
+        for opt_name, opt_conversion in possible_options:
+            if opt_name in self.definition:
+                el = self.definition[opt_name]
+                # The option can often either be a single value or a list of values.
+                # In the latter case each element of the list will be converted to the defined type.
+                if isinstance(el, list):
+                    option_dict[opt_name] = [opt_conversion(el_el) for el_el in el]
+                else:
+                    option_dict[opt_name] = opt_conversion(el)
+        return option_dict
+
+    def typecheck(self, element: StructureElement):
+        return isinstance(element, File)
+
+    def match(self, element: StructureElement):
+        if not isinstance(element, File):
+            raise RuntimeError("Element must be a File.")
+        m = re.match(self.definition["match"], element.name)
+        if m is None:
+            return None
+        return m.groupdict()
+
+
+class XLSXTableConverter(TableConverter):
+    def get_options(self):
+        return self._get_options([
+            ("sheet_name", str),
+            ("header", int),
+            ("names", str),
+            ("index_col", int),
+            ("usecols", int),
+            ("true_values", str),
+            ("false_values", str),
+            ("na_values", str),
+            ("skiprows", int),
+            ("nrows", int),
+            ("keep_default_na", str_to_bool), ]
+        )
+
+    def create_children(self, generalStore: GeneralStore,
+                        element: StructureElement):
+        if not isinstance(element, File):
+            raise RuntimeError("Element must be a File.")
+        table = pd.read_excel(element.path, **self.get_options())
+        child_elements = list()
+        for index, row in table.iterrows():
+            child_elements.append(
+                DictDictElement(str(index), row.to_dict()))
+        return child_elements
+
+
+class CSVTableConverter(TableConverter):
+    def get_options(self):
+        return self._get_options([
+            ("sep", str),
+            ("delimiter", str),
+            ("header", int),
+            ("names", str),
+            ("index_col", int),
+            ("usecols", int),
+            ("true_values", str),
+            ("false_values", str),
+            ("na_values", str),
+            ("skiprows", int),
+            ("nrows", int),
+            ("keep_default_na", str_to_bool), ])
+
+    def create_children(self, generalStore: GeneralStore,
+                        element: StructureElement):
+        if not isinstance(element, File):
+            raise RuntimeError("Element must be a File.")
+        table = pd.read_csv(element.path, **self.get_options())
+        child_elements = list()
+        for index, row in table.iterrows():
+            child_elements.append(
+                DictDictElement(str(index), row.to_dict()))
+        return child_elements
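The `_get_options` helper above converts a converter definition's YAML options into typed keyword arguments for `pandas.read_csv`/`read_excel`. A standalone sketch of that mapping (the definition dict is invented; `str_to_bool` is copied from the diff):

```python
definition = {"sep": ";", "header": 0, "skiprows": 1, "keep_default_na": "false"}

def str_to_bool(x):
    if str(x).lower() == "true":
        return True
    elif str(x).lower() == "false":
        return False
    raise RuntimeError("Should be 'true' or 'false'.")

possible_options = [("sep", str), ("header", int), ("skiprows", int),
                    ("keep_default_na", str_to_bool)]

# Only options present in the definition are passed on, each converted
# to the type expected by pandas:
options = {name: conv(definition[name])
           for name, conv in possible_options if name in definition}
assert options == {"sep": ";", "header": 0, "skiprows": 1,
                   "keep_default_na": False}
# pandas.read_csv("table.csv", **options) would then read the table;
# the converter wraps each row into a DictDictElement for matching.
```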
This diff is collapsed.