diff --git a/.docker/Dockerfile b/.docker/Dockerfile index b300a1a97aa22b3eafc91ef89c01bbd7111edd62..f7353e059d8cd027f08403d6f6527ffbcaabc965 100644 --- a/.docker/Dockerfile +++ b/.docker/Dockerfile @@ -10,14 +10,16 @@ RUN apt-get update && \ tox \ -y COPY .docker/wait-for-it.sh /wait-for-it.sh +ARG PYLIB ADD https://gitlab.indiscale.com/api/v4/projects/97/repository/commits/${PYLIB} \ pylib_version.json RUN git clone https://gitlab.indiscale.com/caosdb/src/caosdb-pylib.git && \ cd caosdb-pylib && git checkout ${PYLIB} && pip3 install . +ARG ADVANCED ADD https://gitlab.indiscale.com/api/v4/projects/104/repository/commits/${ADVANCED} \ advanced_version.json RUN git clone https://gitlab.indiscale.com/caosdb/src/caosdb-advanced-user-tools.git && \ - cd caosdb-advanced-user-tools && git checkout ${ADVANCED} && pip3 install . + cd caosdb-advanced-user-tools && git checkout ${ADVANCED} && pip3 install .[h5-crawler] COPY . /git # Delete .git because it is huge. diff --git a/.gitignore b/.gitignore index 11c17317428964b82b47d55399a4dde1a9e698a9..9af5ee22fdd68c1c25e98614ab516bf4d384d577 100644 --- a/.gitignore +++ b/.gitignore @@ -13,3 +13,4 @@ provenance.yml *.jks *.tar.gz *.sql +/integrationtests/test-profile/custom/other/cert/ diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index a30140e684b465d40b964f1bfb9b97959b29834d..b5bc53fed1b069f3a6f665a188aa8bdcd7252570 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -211,6 +211,17 @@ build-testenv: - PYLIB=${PYLIB:-dev} - echo $PYLIB + - if [ -z "$ADVANCED" ]; then + if echo "$CI_COMMIT_REF_NAME" | grep -c "^f-" ; then + echo "Check if advanced user tools have branch $CI_COMMIT_REF_NAME" ; + if wget https://gitlab.indiscale.com/api/v4/projects/104/repository/branches/${CI_COMMIT_REF_NAME} ; then + ADVANCED=$CI_COMMIT_REF_NAME ; + fi; + fi; + fi; + - ADVANCED=${ADVANCED:-dev} + - echo $ADVANCED + - docker login -u gitlab-ci-token -p $CI_JOB_TOKEN $CI_REGISTRY # use here general latest or specific branch latest... 
- docker build diff --git a/CHANGELOG.md b/CHANGELOG.md index 8315011da5a89f53ba9eb5b2533115ef245e790e..001bf62a262ee970f7dc0de93b09cf8dea14507e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,6 +12,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 * Everything * Added new converters for tables: CSVTableConverter and XLSXTableConverter * Possibility to authorize updates as in the old crawler +* Allow authorization of inserts ### Changed diff --git a/integrationtests/basic_example/test.py b/integrationtests/basic_example/test_basic.py similarity index 83% rename from integrationtests/basic_example/test.py rename to integrationtests/basic_example/test_basic.py index cecd6533669fd9fb75124faf758efeae8b8d9778..b24a1c658cfc9e23ca0ba2de266161864cb6b66c 100755 --- a/integrationtests/basic_example/test.py +++ b/integrationtests/basic_example/test_basic.py @@ -28,12 +28,13 @@ module description """ +from caosadvancedtools.crawler import Crawler as OldCrawler import os from caosdb import EmptyUniqueQueryError import argparse import sys from argparse import RawTextHelpFormatter -from caoscrawler import Crawler +from caoscrawler import Crawler, SecurityMode import caosdb as db from caoscrawler.identifiable_adapters import CaosDBIdentifiableAdapter import pytest @@ -41,8 +42,8 @@ from caosadvancedtools.models.parser import parse_model_from_yaml import yaml # TODO is not yet merged in caosadvancedtools -from caosadvancedtools.testutils import clear_database, set_test_key -set_test_key("10b128cf8a1372f30aa3697466bb55e76974e0c16a599bb44ace88f19c8f61e2") +#from caosadvancedtools.testutils import clear_database, set_test_key +# set_test_key("10b128cf8a1372f30aa3697466bb55e76974e0c16a599bb44ace88f19c8f61e2") def rfp(*pathcomponents): @@ -53,6 +54,11 @@ def rfp(*pathcomponents): return os.path.join(os.path.dirname(__file__), *pathcomponents) +@pytest.fixture +def clear_database(): + db.execute_query("FIND Entity").delete() + + @pytest.fixture def 
usemodel(): model = parse_model_from_yaml(rfp("model.yml")) @@ -86,8 +92,8 @@ def ident(): def crawl_standard_test_directory(cr: Crawler, subdir: str = "examples_article", cfood: str = "scifolder_cfood.yml"): - cr.crawl_directory(rfp("..", "unittests", "test_directories", subdir), - rfp("..", "unittests", cfood)) + cr.crawl_directory(rfp("..", "..", "unittests", "test_directories", subdir), + rfp("..", "..", "unittests", cfood)) @pytest.fixture @@ -104,12 +110,11 @@ def crawler_extended(ident): # correct paths for current working directory file_list = [r for r in cr.target_data if r.role == "File"] for f in file_list: - f.file = rfp("..", "unittests", "test_directories", - "examples_article", f.file) + f.file = rfp("..", "..", "unittests", "test_directories", f.file) return cr -def test_single_insertion(clear_database, usemodel, crawler): +def test_single_insertion(clear_database, usemodel, crawler, ident): ins, ups = crawler.synchronize() # This test also generates the file records.xml used in some of the unittesets: @@ -117,7 +122,7 @@ def test_single_insertion(clear_database, usemodel, crawler): for i in reversed(range(len(res))): if res[i].parents[0].name == "PyTestInfo": del res[i] - filename = rfp("..", "unittests", "records.xml") + filename = rfp("..", "..", "unittests", "records.xml") with open(filename, "w") as f: xml = res.to_xml() # Remove noscript and transaction benchmark: @@ -130,10 +135,9 @@ def test_single_insertion(clear_database, usemodel, crawler): assert len(ups) == 0 # Do a second run on the same data, there should be no changes: - crawler = Crawler(debug=True, identifiableAdapter=ident_adapt) - crawler.copy_attributes = Mock() - crawler.crawl_directory(rfp("../unittests/test_directories", "examples_article"), - rfp("../unittests/scifolder_cfood.yml")) + crawler = Crawler(debug=True, identifiableAdapter=ident) + crawler.crawl_directory(rfp("../../unittests/test_directories", "examples_article"), + rfp("../../unittests/scifolder_cfood.yml")) 
ins, ups = crawler.synchronize() assert len(ins) == 0 assert len(ups) == 0 @@ -170,6 +174,28 @@ def test_insertion(clear_database, usemodel, ident, crawler): assert len(ups) == 0 +def test_insert_auth(clear_database, usemodel, ident, crawler): + ins, ups = crawler.synchronize() + + # Do a second run on the same data, there should be a new insert: + cr = Crawler(debug=True, identifiableAdapter=ident, securityMode=SecurityMode.RETRIEVE) + crawl_standard_test_directory(cr, "example_insert") + assert len(cr.target_data) == 3 + ins, ups = cr.synchronize() + assert len(ins) == 1 + assert not ins[0].is_valid() + nins, nups = OldCrawler.update_authorized_changes(cr.run_id) + assert nins == 1 + + # Do it again to check whether nothing is changed: + cr = Crawler(debug=True, identifiableAdapter=ident) + crawl_standard_test_directory(cr, "example_insert") + assert len(cr.target_data) == 3 + ins, ups = cr.synchronize() + assert len(ins) == 0 + assert len(ups) == 0 + + def test_insertion_and_update(clear_database, usemodel, ident, crawler): ins, ups = crawler.synchronize() @@ -248,7 +274,7 @@ def test_file_insertion_dry(clear_database, usemodel, ident): for f in file_list: assert f.path.endswith("README.md") - assert f.path == f.file + assert f.path[1:] == f.file ins, ups = crawler_extended.synchronize(commit_changes=False) assert len(ups) == 0 @@ -281,8 +307,7 @@ def test_file_update(clear_database, usemodel, ident, crawler_extended): file_list = [r for r in cr.target_data if r.role == "File"] for f in file_list: - f.file = rfp("..", "unittests", "test_directories", - "examples_article", f.file) + f.file = rfp("..", "..", "unittests", "test_directories", f.file) ins2, ups2 = cr.synchronize(commit_changes=True) assert len(ups1) == 0 assert len(ups2) == 0 @@ -297,8 +322,7 @@ def test_file_update(clear_database, usemodel, ident, crawler_extended): file_list = [r for r in cr2.target_data if r.role == "File"] for f in file_list: - f.file = rfp("..", "unittests", "test_directories", - 
"examples_article", f.file) + f.file = rfp("..", "..", "unittests", "test_directories", f.file) ins3, ups3 = cr2.synchronize(commit_changes=True) assert len(ups3) == 11 diff --git a/src/caoscrawler/__init__.py b/src/caoscrawler/__init__.py index 28ef97d421023ad41be65d9d0e6abac76fbef6fe..b65b9fd9d24b9519a52ca13d07e46c9d8f791a73 100644 --- a/src/caoscrawler/__init__.py +++ b/src/caoscrawler/__init__.py @@ -1 +1 @@ -from .crawl import Crawler +from .crawl import Crawler, SecurityMode diff --git a/src/caoscrawler/crawl.py b/src/caoscrawler/crawl.py index e5b0e6ba69898deb5320a382823313c1a4bf83c6..8886c5f87f1556517acafc7bfa673e8a0d29c6e2 100644 --- a/src/caoscrawler/crawl.py +++ b/src/caoscrawler/crawl.py @@ -777,12 +777,8 @@ class Crawler(object): if securityMode.value > SecurityMode.RETRIEVE.value: db.Container().extend(to_be_inserted).insert() elif run_id is not None: - - raise RuntimeError("You must not insert Entities since the Crawler was startet " - "with RETRIEVE only mode.") - # Caching forbidden inserts is currently not implemented - # cache = Cache() - # cache.insert(to_be_inserted, run_id) + update_cache = UpdateCache() + update_cache.insert(to_be_inserted, run_id, insert=True) @staticmethod def set_ids_and_datatype_of_parents_and_properties(rec_list): @@ -845,16 +841,20 @@ class Crawler(object): self.execute_updates_in_list(to_be_updated, self.securityMode, self.run_id) update_cache = UpdateCache() - pending_changes = update_cache.get_updates(self.run_id) + pending_inserts = update_cache.get_inserts(self.run_id) + if pending_inserts: + Crawler.inform_about_pending_changes( + pending_inserts, self.run_id, self.crawled_directory, inserts=True) - if pending_changes: + pending_updates = update_cache.get_updates(self.run_id) + if pending_updates: Crawler.inform_about_pending_changes( - pending_changes, self.run_id, self.crawled_directory) + pending_updates, self.run_id, self.crawled_directory) return (to_be_inserted, to_be_updated) @staticmethod - def 
inform_about_pending_changes(pending_changes, run_id, path): + def inform_about_pending_changes(pending_changes, run_id, path, inserts=False): # Sending an Email with a link to a form to authorize updates is # only done in SSS mode @@ -870,7 +870,8 @@ UNAUTHORIZED UPDATE ({} of {}): ____________________\n""".format(i + 1, len(pending_changes)) + str(el[3])) logger.info("There were unauthorized changes (see above). An " "email was sent to the curator.\n" - "You can authorize the updates by invoking the crawler" + "You can authorize the " + ("inserts" if inserts else "updates") + + " by invoking the crawler" " with the run id: {rid}\n".format(rid=run_id)) @staticmethod diff --git a/tox.ini b/tox.ini index 2cf966fb5b80e62cb7f216b0785ba567e13ee3ff..5ab67e67cfef0b3cf0cf82d2d28de0fe11aca6a1 100644 --- a/tox.ini +++ b/tox.ini @@ -1,11 +1,14 @@ [tox] -envlist=py36, py37, py38, py39, py310 +envlist=py38, py39, py310 skip_missing_interpreters = true [testenv] deps = . pytest pytest-cov + # TODO: Make this f-branch sensitive + git+https://gitlab.indiscale.com/caosdb/src/caosdb-pylib.git@dev + git+https://gitlab.indiscale.com/caosdb/src/caosdb-advanced-user-tools.git@dev commands=py.test --cov=caosdb -vv {posargs} [flake8] max-line-length=100 diff --git a/unittests/scifolder_cfood.yml b/unittests/scifolder_cfood.yml index 1fd7c98d57b35fa651e36bee2c529a46e3a96cde..90f193444bfda7296c46260236274da2378635cc 100644 --- a/unittests/scifolder_cfood.yml +++ b/unittests/scifolder_cfood.yml @@ -16,7 +16,7 @@ Data: # name of the converter subtree: &template project_dir: # name of the first subtree element which is a converter type: Directory - match: (?P<date>.*?)_(?P<identifier>.*) + match: ((?P<date>[0-9]{4,4})_)?(?P<identifier>.*) records: Project: # this is an identifiable in this case parents: diff --git a/unittests/scifolder_extended.yml b/unittests/scifolder_extended.yml index 2a1416b778e96ba57fc216d9763572568703ab75..9bab612b9b37e8e295ee8fd02575de506a98d8fc 100644 --- 
a/unittests/scifolder_extended.yml +++ b/unittests/scifolder_extended.yml @@ -16,12 +16,12 @@ Data: # name of the converter subtree: &template project_dir: # name of the first subtree element which is a converter type: Directory - match: (?P<date>.*?)_(?P<identifier>.*) + match: ((?P<year>[0-9]{4,4})_)?(?P<identifier>.*) records: Project: # this is an identifiable in this case parents: - Project # not needed as the name is equivalent - date: $date + date: $year identifier: $identifier subtree: diff --git a/unittests/scifolder_extended2.yml b/unittests/scifolder_extended2.yml index f1dfc2d4635b6956930343685c7b17ca4f2f1679..969325e91da488011819c338708a33dcfc32c93e 100644 --- a/unittests/scifolder_extended2.yml +++ b/unittests/scifolder_extended2.yml @@ -6,95 +6,99 @@ Definitions: type: Definitions #include "description.yml" -DataAnalysis: # name of the converter +Data: # name of the converter type: Directory - match: DataAnalysis - subtree: &template - project_dir: # name of the first subtree element which is a converter + match: (.*) + subtree: + DataAnalysis: # name of the converter type: Directory - match: (?P<date>.*?)_(?P<identifier>.*) - records: - Project: # this is an identifiable in this case - parents: - - Project # not needed as the name is equivalent - date: $date - identifier: $identifier - - subtree: - measurement: # new name for folders on the 3rd level + match: DataAnalysis + subtree: &template + project_dir: # name of the first subtree element which is a converter type: Directory - match: (?P<date>[0-9]{4,4}-[0-9]{2,2}-[0-9]{2,2})(_(?P<identifier>.*))? 
+ match: ((?P<year>[0-9]{4,4})_)?(?P<identifier>.*) records: - Measurement: - date: $date + Project: # this is an identifiable in this case + parents: + - Project # not needed as the name is equivalent + date: $year identifier: $identifier - project: $Project + subtree: - README: - type: MarkdownFile # this is a subclass of converter File - # function signature: GeneralStore, StructureElement - # preprocessors: custom.caosdb.convert_values - match: ^README\.md$ - # how to make match case insensitive? - records: # this block is very verbose and intended to make sure that this - # file is inserted correctly (and can be supplemented with properties - # and / or parents), TODO: maybe there should be a shorthand - ReadmeFile: - parents: - - ProjectMarkdownReadme - role: File - path: $README - file: $README # this is automatically the relative path - # starting from the top level structure element - # of this element + measurement: # new name for folders on the 3rd level + type: Directory + match: (?P<date>[0-9]{4,4}-[0-9]{2,2}-[0-9]{2,2})(_(?P<identifier>.*))? + records: Measurement: - ReadmeFile: $ReadmeFile - + date: $date + identifier: $identifier + project: $Project subtree: - description: - type: DictTextElement - match_value: (?P<description>.*) - match_name: description - records: + README: + type: MarkdownFile # this is a subclass of converter File + # function signature: GeneralStore, StructureElement + # preprocessors: custom.caosdb.convert_values + match: ^README\.md$ + # how to make match case insensitive? 
+ records: # this block is very verbose and intended to make sure that this + # file is inserted correctly (and can be supplemented with properties + # and / or parents), TODO: maybe there should be a shorthand + ReadmeFile: + parents: + - ProjectMarkdownReadme + role: File + path: $README + file: $README # this is automatically the relative path + # starting from the top level structure element + # of this element Measurement: - description: $description - responsible_single: - type: DictTextElement - match_name: responsible - match_value: &person_regexp ((?P<first_name>.+) )?(?P<last_name>.+) - records: &responsible_records - Person: - first_name: $first_name - last_name: $last_name - Measurement: # this uses the reference to the above defined record - responsible: +$Person # each record also implicitely creates a variable - # with the same name. The "+" indicates, that - # this will become a list entry in list property - # "responsible" belonging to Measurement. - - responsible_list: - type: DictListElement - match_name: responsible + ReadmeFile: $ReadmeFile + subtree: - Person: - type: TextElement - match: *person_regexp - records: *responsible_records + description: + type: DictTextElement + match_value: (?P<description>.*) + match_name: description + records: + Measurement: + description: $description + responsible_single: + type: DictTextElement + match_name: responsible + match_value: &person_regexp ((?P<first_name>.+) )?(?P<last_name>.+) + records: &responsible_records + Person: + first_name: $first_name + last_name: $last_name + Measurement: # this uses the reference to the above defined record + responsible: +$Person # each record also implicitly creates a variable + # with the same name. The "+" indicates that + # this will become a list entry in list property + # "responsible" belonging to Measurement. - # sources_list: - # type: DictListElement - # match_name: sources - # subtree: - # Source: - # type: TextElement - # match: &path ... ??? 
+ responsible_list: + type: DictListElement + match_name: responsible + subtree: + Person: + type: TextElement + match: *person_regexp + records: *responsible_records -ExperimentalData: # name of the converter - type: Directory - match: ExperimentalData - subtree: *template + # sources_list: + # type: DictListElement + # match_name: sources + # subtree: + # Source: + # type: TextElement + # match: &path ... ??? -SimulationData: # name of the converter - type: Directory - match: SimulationData - subtree: *template + ExperimentalData: # name of the converter + type: Directory + match: ExperimentalData + subtree: *template + + SimulationData: # name of the converter + type: Directory + match: SimulationData + subtree: *template diff --git a/unittests/test_tool.py b/unittests/test_tool.py index b86bc7b82113e2b357c6cf6fe16594a7e162ce8b..59573756fe61ef697976e480dd1550cb0ead0998 100755 --- a/unittests/test_tool.py +++ b/unittests/test_tool.py @@ -605,13 +605,11 @@ def test_security_mode(updateCacheMock, upmock, insmock, ident): # remove one element del ident._records[-1] # insert forbidden - with pytest.raises(RuntimeError) as excinfo: - crawler.synchronize(commit_changes=True) + crawler.synchronize(commit_changes=True) assert crawler.run_id is not None insmock.assert_not_called() upmock.assert_not_called() - # as long as caching of inserts is not implemented this is not called - updateCacheMock.assert_not_called() + assert updateCacheMock.call_count == 1 # reset counts reset_mocks([updateCacheMock, insmock, upmock]) # restore original ident