diff --git a/.docker/Dockerfile b/.docker/Dockerfile index b300a1a97aa22b3eafc91ef89c01bbd7111edd62..f7353e059d8cd027f08403d6f6527ffbcaabc965 100644 --- a/.docker/Dockerfile +++ b/.docker/Dockerfile @@ -10,14 +10,16 @@ RUN apt-get update && \ tox \ -y COPY .docker/wait-for-it.sh /wait-for-it.sh +ARG PYLIB ADD https://gitlab.indiscale.com/api/v4/projects/97/repository/commits/${PYLIB} \ pylib_version.json RUN git clone https://gitlab.indiscale.com/caosdb/src/caosdb-pylib.git && \ cd caosdb-pylib && git checkout ${PYLIB} && pip3 install . +ARG ADVANCED ADD https://gitlab.indiscale.com/api/v4/projects/104/repository/commits/${ADVANCED} \ advanced_version.json RUN git clone https://gitlab.indiscale.com/caosdb/src/caosdb-advanced-user-tools.git && \ - cd caosdb-advanced-user-tools && git checkout ${ADVANCED} && pip3 install . + cd caosdb-advanced-user-tools && git checkout ${ADVANCED} && pip3 install .[h5-crawler] COPY . /git # Delete .git because it is huge. diff --git a/.gitignore b/.gitignore index 11c17317428964b82b47d55399a4dde1a9e698a9..9af5ee22fdd68c1c25e98614ab516bf4d384d577 100644 --- a/.gitignore +++ b/.gitignore @@ -13,3 +13,4 @@ provenance.yml *.jks *.tar.gz *.sql +/integrationtests/test-profile/custom/other/cert/ diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index a30140e684b465d40b964f1bfb9b97959b29834d..b5bc53fed1b069f3a6f665a188aa8bdcd7252570 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -211,6 +211,17 @@ build-testenv: - PYLIB=${PYLIB:-dev} - echo $PYLIB + - if [ -z "$ADVANCED" ]; then + if echo "$CI_COMMIT_REF_NAME" | grep -c "^f-" ; then + echo "Check if advanced user tools have branch $CI_COMMIT_REF_NAME" ; + if wget https://gitlab.indiscale.com/api/v4/projects/104/repository/branches/${CI_COMMIT_REF_NAME} ; then + ADVANCED=$CI_COMMIT_REF_NAME ; + fi; + fi; + fi; + - ADVANCED=${ADVANCED:-dev} + - echo $ADVANCED + - docker login -u gitlab-ci-token -p $CI_JOB_TOKEN $CI_REGISTRY # use here general latest or specific branch latest... - docker build diff --git a/CHANGELOG.md b/CHANGELOG.md index d0a2883005d6651f0ba3ef22b9fa5fe0d03349aa..001bf62a262ee970f7dc0de93b09cf8dea14507e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,6 +10,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Added * Everything +* Added new converters for tables: CSVTableConverter and XLSXTableConverter +* Possibility to authorize updates as in the old crawler +* Allow authorization of inserts ### Changed @@ -21,4 +24,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Fixed +* Fixed #12 +* FIX: Variables are now also replaced when the value is given as a list. + ### Security diff --git a/README.md b/README.md index 59b88aaa36ed97d8c2cc9e4474820e3dad4a478b..8576e5c969556005fdeb346ef2cdfadf1b7fc266 100644 --- a/README.md +++ b/README.md @@ -25,6 +25,8 @@ After installation of the package run (within the project folder): pytest ``` +## Integration Tests +see `integrationtests/README.md` # Contributers diff --git a/integrationtests/README.md b/integrationtests/README.md index 96789ed9f02036a0c7cc25ca1a60d9f0042a5557..88d55902e3fdc5836baefd97c3192cc9ff01e7bd 100644 --- a/integrationtests/README.md +++ b/integrationtests/README.md @@ -1,2 +1,3 @@ 1. Mount test_data/extroot as extroot folder in the CaosDB server 2. use an empty server +3. 
run pytest from `src`: `python -m pytest ../integrationtests` diff --git a/integrationtests/basic_example/test.py b/integrationtests/basic_example/test_basic.py similarity index 76% rename from integrationtests/basic_example/test.py rename to integrationtests/basic_example/test_basic.py index 6e35f7f2e4532acb5a2c3c80d06d9faeabd0fe0a..b24a1c658cfc9e23ca0ba2de266161864cb6b66c 100755 --- a/integrationtests/basic_example/test.py +++ b/integrationtests/basic_example/test_basic.py @@ -28,12 +28,13 @@ module description """ +from caosadvancedtools.crawler import Crawler as OldCrawler import os from caosdb import EmptyUniqueQueryError import argparse import sys from argparse import RawTextHelpFormatter -from caoscrawler import Crawler +from caoscrawler import Crawler, SecurityMode import caosdb as db from caoscrawler.identifiable_adapters import CaosDBIdentifiableAdapter import pytest @@ -41,8 +42,8 @@ from caosadvancedtools.models.parser import parse_model_from_yaml import yaml # TODO is not yet merged in caosadvancedtools -from caosadvancedtools.testutils import clear_database, set_test_key -set_test_key("10b128cf8a1372f30aa3697466bb55e76974e0c16a599bb44ace88f19c8f61e2") +#from caosadvancedtools.testutils import clear_database, set_test_key +# set_test_key("10b128cf8a1372f30aa3697466bb55e76974e0c16a599bb44ace88f19c8f61e2") def rfp(*pathcomponents): @@ -53,6 +54,11 @@ def rfp(*pathcomponents): return os.path.join(os.path.dirname(__file__), *pathcomponents) +@pytest.fixture +def clear_database(): + db.execute_query("FIND Entity").delete() + + @pytest.fixture def usemodel(): model = parse_model_from_yaml(rfp("model.yml")) @@ -86,8 +92,8 @@ def ident(): def crawl_standard_test_directory(cr: Crawler, subdir: str = "examples_article", cfood: str = "scifolder_cfood.yml"): - cr.crawl_directory(rfp("..", "unittests", "test_directories", subdir), - rfp("..", "unittests", cfood)) + cr.crawl_directory(rfp("..", "..", "unittests", "test_directories", subdir), + rfp("..", "..", "unittests", cfood)) @pytest.fixture @@ -102,15 +108,13 @@ def crawler_extended(ident): cr = Crawler(debug=True, identifiableAdapter=ident) crawl_standard_test_directory(cr, cfood="scifolder_extended.yml") # correct paths for current working directory - updateList = cr.updateList - fileList = [r for r in updateList if r.role == "File"] - for f in fileList: - f.file = rfp("..", "unittests", "test_directories", - "examples_article", f.file) + file_list = [r for r in cr.target_data if r.role == "File"] + for f in file_list: + f.file = rfp("..", "..", "unittests", "test_directories", f.file) return cr -def test_single_insertion(clear_database, usemodel, crawler): +def test_single_insertion(clear_database, usemodel, crawler, ident): ins, ups = crawler.synchronize() # This test also generates the file records.xml used in some of the unittesets: @@ -118,7 +122,7 @@ def test_single_insertion(clear_database, usemodel, crawler): for i in reversed(range(len(res))): if res[i].parents[0].name == "PyTestInfo": del res[i] - filename = rfp("..", "unittests", "records.xml") + filename = rfp("..", "..", "unittests", "records.xml") with open(filename, "w") as f: xml = res.to_xml() # Remove noscript and transaction benchmark: @@ -131,10 +135,9 @@ def test_single_insertion(clear_database, usemodel, crawler): assert len(ups) == 0 # Do a second run on the same data, there should be no changes: - crawler = Crawler(debug=True, identifiableAdapter=ident_adapt) - crawler.copy_attributes = Mock() - crawler.crawl_directory(rfp("../unittests/test_directories", 
"examples_article"), - rfp("../unittests/scifolder_cfood.yml")) + crawler = Crawler(debug=True, identifiableAdapter=ident) + crawler.crawl_directory(rfp("../../unittests/test_directories", "examples_article"), + rfp("../../unittests/scifolder_cfood.yml")) ins, ups = crawler.synchronize() assert len(ins) == 0 assert len(ups) == 0 @@ -157,7 +160,7 @@ def test_insertion(clear_database, usemodel, ident, crawler): # Do a second run on the same data, there should a new insert: cr = Crawler(debug=True, identifiableAdapter=ident) crawl_standard_test_directory(cr, "example_insert") - assert len(cr.updateList) == 3 + assert len(cr.target_data) == 3 ins, ups = cr.synchronize() assert len(ins) == 1 assert len(ups) == 0 @@ -165,7 +168,29 @@ def test_insertion(clear_database, usemodel, ident, crawler): # Do it again to check whether nothing is changed: cr = Crawler(debug=True, identifiableAdapter=ident) crawl_standard_test_directory(cr, "example_insert") - assert len(cr.updateList) == 3 + assert len(cr.target_data) == 3 + ins, ups = cr.synchronize() + assert len(ins) == 0 + assert len(ups) == 0 + + +def test_insert_auth(clear_database, usemodel, ident, crawler): + ins, ups = crawler.synchronize() + + # Do a second run on the same data, there should a new insert: + cr = Crawler(debug=True, identifiableAdapter=ident, securityMode=SecurityMode.RETRIEVE) + crawl_standard_test_directory(cr, "example_insert") + assert len(cr.target_data) == 3 + ins, ups = cr.synchronize() + assert len(ins) == 1 + assert not ins[0].is_valid() + nins, nups = OldCrawler.update_authorized_changes(cr.run_id) + assert nins == 1 + + # Do it again to check whether nothing is changed: + cr = Crawler(debug=True, identifiableAdapter=ident) + crawl_standard_test_directory(cr, "example_insert") + assert len(cr.target_data) == 3 ins, ups = cr.synchronize() assert len(ins) == 0 assert len(ups) == 0 @@ -180,9 +205,9 @@ def test_insertion_and_update(clear_database, usemodel, ident, crawler): cr = Crawler(debug=True, identifiableAdapter=ident) crawl_standard_test_directory(cr, "example_overwrite_1") - # print(cr.updateList) + # print(cr.target_data) # cr.save_debug_data(rfp("provenance.yml")) - assert len(cr.updateList) == 3 + assert len(cr.target_data) == 3 ins, ups = cr.synchronize() assert len(ins) == 0 assert len(ups) == 1 @@ -197,7 +222,7 @@ def test_identifiable_update(clear_database, usemodel, ident, crawler): crawl_standard_test_directory(cr) # Test the addition of a single property: - l = cr.updateList + l = cr.target_data for record in l: if (record.parents[0].name == "Measurement" and record.get_property("date").value == "2020-01-03"): @@ -213,7 +238,7 @@ def test_identifiable_update(clear_database, usemodel, ident, crawler): # Test the change within one property: cr = Crawler(debug=True, identifiableAdapter=ident) crawl_standard_test_directory(cr) - l = cr.updateList + l = cr.target_data for record in l: if (record.parents[0].name == "Measurement" and record.get_property("date").value == "2020-01-03"): @@ -227,7 +252,7 @@ def test_identifiable_update(clear_database, usemodel, ident, crawler): # Changing the date should result in a new insertion: cr = Crawler(debug=True, identifiableAdapter=ident) crawl_standard_test_directory(cr) - l = cr.updateList + l = cr.target_data for record in l: if (record.parents[0].name == "Measurement" and record.get_property("date").value == "2020-01-03"): @@ -244,24 +269,23 @@ def test_file_insertion_dry(clear_database, usemodel, ident): crawler_extended = Crawler(debug=True, identifiableAdapter=ident) 
crawl_standard_test_directory( crawler_extended, cfood="scifolder_extended.yml") - updateList = crawler_extended.updateList - fileList = [r for r in updateList if r.role == "File"] - assert len(fileList) == 11 + file_list = [r for r in crawler_extended.target_data if r.role == "File"] + assert len(file_list) == 11 - for f in fileList: + for f in file_list: assert f.path.endswith("README.md") - assert f.path == f.file + assert f.path[1:] == f.file ins, ups = crawler_extended.synchronize(commit_changes=False) assert len(ups) == 0 - fileList_ins = [r for r in ins if r.role == "File"] - assert len(fileList_ins) == 11 + file_list_ins = [r for r in ins if r.role == "File"] + assert len(file_list_ins) == 11 def test_file_insertion(clear_database, usemodel, ident, crawler_extended): ins, ups = crawler_extended.synchronize(commit_changes=True) - fileList_ins = [r for r in ins if r.role == "File"] - assert len(fileList_ins) == 11 + file_list_ins = [r for r in ins if r.role == "File"] + assert len(file_list_ins) == 11 assert db.execute_query("COUNT File") > 0 @@ -276,16 +300,14 @@ def test_file_insertion(clear_database, usemodel, ident, crawler_extended): def test_file_update(clear_database, usemodel, ident, crawler_extended): ins1, ups1 = crawler_extended.synchronize(commit_changes=True) - fileList_ins = [r for r in ins1 if r.role == "File"] + file_list_ins = [r for r in ins1 if r.role == "File"] cr = Crawler(debug=True, identifiableAdapter=ident) crawl_standard_test_directory(cr, cfood="scifolder_extended.yml") - updateList = cr.updateList - fileList = [r for r in updateList if r.role == "File"] - for f in fileList: - f.file = rfp("..", "unittests", "test_directories", - "examples_article", f.file) + file_list = [r for r in cr.target_data if r.role == "File"] + for f in file_list: + f.file = rfp("..", "..", "unittests", "test_directories", f.file) ins2, ups2 = cr.synchronize(commit_changes=True) assert len(ups1) == 0 assert len(ups2) == 0 @@ -298,11 +320,9 @@ def test_file_update(clear_database, usemodel, ident, crawler_extended): cr2 = Crawler(debug=True, identifiableAdapter=ident) crawl_standard_test_directory(cr2, cfood="scifolder_extended2.yml") - updateList = cr2.updateList - fileList = [r for r in updateList if r.role == "File"] - for f in fileList: - f.file = rfp("..", "unittests", "test_directories", - "examples_article", f.file) + file_list = [r for r in cr2.target_data if r.role == "File"] + for f in file_list: + f.file = rfp("..", "..", "unittests", "test_directories", f.file) ins3, ups3 = cr2.synchronize(commit_changes=True) assert len(ups3) == 11 @@ -313,4 +333,4 @@ def test_file_update(clear_database, usemodel, ident, crawler_extended): # TODO: Implement file update checks (based on checksum) # Add test with actual file update: # assert len(ins2) == 0 - # assert len(ups2) == len(fileList_ins) + # assert len(ups2) == len(file_list_ins) diff --git a/integrationtests/pycaosdb.ini b/integrationtests/pycaosdb.ini new file mode 100644 index 0000000000000000000000000000000000000000..a4f429736c9b46c8987d05a02724725295f32081 --- /dev/null +++ b/integrationtests/pycaosdb.ini @@ -0,0 +1,29 @@ +[Connection] +url=https://localhost:10443/ +username=admin +debug=0 +#cacert=/home//CaosDB/caosdb-deploy/profiles/default/custom/other/cert/caosdb.cert.pem +password_method=plain +password=caosdb + +ssl_insecure=True +timeout=5000 +[Container] +debug=0 + +#[Crawler] +#oldprefix=/ExperimentalData/ +#newprefix=/home/professional/CaosDB/caosdb-advanced-user-tools/integrationtests/extroot/ExperimentalData 
+#[IntegrationTests] +#test_server_side_scripting.bin_dir=/home/professional/CaosDB/caosdb-pyinttest/resources + +[Misc] +sendmail=sendmail_to_file +#sendmail=/usr/local/bin/sendmail_to_file +entity_loan.curator_mail_from=admin@indiscale.com +entity_loan.curator_mail_to=admin@indiscale.com +[sss_helper] +external_uri = https://localhost:10443 +[advancedtools] +crawler.from_mail=admin@indiscale.com +crawler.to_mail=admin@indiscale.com diff --git a/integrationtests/test_data/extroot/realworld_example/dataset_cfoods.yml b/integrationtests/test_data/extroot/realworld_example/dataset_cfoods.yml index 1589cba2b44afc3e2645b0ee72f91bf83b327032..eaf2690ae130cb61c8a74452e3e4e1d4fd06846a 100644 --- a/integrationtests/test_data/extroot/realworld_example/dataset_cfoods.yml +++ b/integrationtests/test_data/extroot/realworld_example/dataset_cfoods.yml @@ -318,6 +318,13 @@ Data: Dataset: Project: $Project subtree: + name_element: + type: DictTextElement + match_name: "name" + match_value: "(?P<name>.*)" + records: + Project: + name: $name full_name_element: type: DictTextElement match_name: "full_name" diff --git a/integrationtests/test_data/extroot/realworld_example/identifiables.yml b/integrationtests/test_data/extroot/realworld_example/identifiables.yml new file mode 100644 index 0000000000000000000000000000000000000000..0ea0265ecfec05392c599457d81339bc91ba18d0 --- /dev/null +++ b/integrationtests/test_data/extroot/realworld_example/identifiables.yml @@ -0,0 +1,22 @@ +license: + - name +project_type: + - name +Keyword: + - name +Taxon: + - name +Person: + - email + # - full_name +Dataset: + - title + # - DOI +Event: + - longitude + - latitude + - start_datetime +Dataspace: + - dataspace_id +Project: + - name diff --git a/integrationtests/test_data/extroot/realworld_example/pycaosdb.ini b/integrationtests/test_data/extroot/realworld_example/pycaosdb.ini new file mode 120000 index 0000000000000000000000000000000000000000..bc443439d842f18ce05e002e5f6b95d37ca22747 --- /dev/null +++ b/integrationtests/test_data/extroot/realworld_example/pycaosdb.ini @@ -0,0 +1 @@ +../../../pycaosdb.ini \ No newline at end of file diff --git a/integrationtests/test_data/extroot/realworld_example/schema/zmt-organisation.yml b/integrationtests/test_data/extroot/realworld_example/schema/organisation.yml similarity index 100% rename from integrationtests/test_data/extroot/realworld_example/schema/zmt-organisation.yml rename to integrationtests/test_data/extroot/realworld_example/schema/organisation.yml diff --git a/integrationtests/test_realworld_example.py b/integrationtests/test_realworld_example.py index 28a9469d311b56aa12c35661b8ef66929fae8a8a..5ec2f3219625937e3d18f31eaaa2eb71566c75d7 100644 --- a/integrationtests/test_realworld_example.py +++ b/integrationtests/test_realworld_example.py @@ -29,7 +29,7 @@ import os import caosdb as db -from caoscrawler.crawl import Crawler +from caoscrawler.crawl import Crawler, main as crawler_main from caoscrawler.converters import JSONFileConverter, DictConverter from caoscrawler.identifiable_adapters import CaosDBIdentifiableAdapter from caoscrawler.structure_elements import File, JSONFile, Directory @@ -81,20 +81,27 @@ def clear_database(): ents.delete() -def test_dataset( - clear_database, - usemodel): - # json_file_path = rfp("test_directories", "single_file_test_data", "testjson.json") - +def create_identifiable_adapter(): ident = CaosDBIdentifiableAdapter() - ident.register_identifiable( - "license", db.RecordType().add_parent("license").add_property("name")) - 
ident.register_identifiable("project_type", db.RecordType( - ).add_parent("project_type").add_property("name")) - ident.register_identifiable("Person", db.RecordType( - ).add_parent("Person").add_property("full_name")) - - crawler = Crawler(debug=True, identifiableAdapter=ident) + ident.register_identifiable("license", ( + db.RecordType() + .add_parent("license") + .add_property("name"))) + ident.register_identifiable("project_type", ( + db.RecordType() + .add_parent("project_type") + .add_property("name"))) + ident.register_identifiable("Person", ( + db.RecordType() + .add_parent("Person") + .add_property("full_name"))) + + return ident + + +def test_dataset(clear_database, usemodel): + ident = create_identifiable_adapter() + crawler = Crawler(identifiableAdapter=ident) crawler_definition = crawler.load_definition( os.path.join(DATADIR, "dataset_cfoods.yml")) # print(json.dumps(crawler_definition, indent=3)) @@ -107,13 +114,7 @@ def test_dataset( crawler_definition, converter_registry ) - subd = crawler.debug_tree - subc = crawler.debug_metadata - # print(json.dumps(subc, indent=3)) - # print(subd) - # print(subc) - # print(records) - ins, ups = crawler.synchronize() + crawler.synchronize() dataspace = db.execute_query("FIND RECORD Dataspace WITH name=35 AND dataspace_id=20002 AND " "archived=FALSE AND url='https://datacloud.de/index.php/f/7679'" @@ -130,3 +131,80 @@ def test_dataset( "") == 1 assert db.execute_query(f"COUNT RECORD with id={dataset.id} AND WHICH REFERENCES Event WITH " "start_datetime='2022-02-10T16:36:48+01:00'") == 1 + + +def test_event_update(clear_database, usemodel): + + identifiable_path = os.path.join(DATADIR, "identifiables.yml") + crawler_definition_path = os.path.join(DATADIR, "dataset_cfoods.yml") + + # TODO(fspreck): Use crawler_main + crawler_main( + os.path.join(DATADIR, 'data'), + crawler_definition_path, + identifiable_path, + True, + os.path.join(DATADIR, "provenance.yml"), + False, + "" + ) + + old_dataset_rec = db.execute_query( + "FIND RECORD Dataset WHICH HAS AN EVENT WITH location='Bremen, Germany'") + assert len(old_dataset_rec) == 1 + old_dataset_rec = old_dataset_rec[0] + assert old_dataset_rec.get_property("Event").datatype == db.LIST("Event") + assert len(old_dataset_rec.get_property("Event").value) == 1 + old_event_rec = db.Record( + id=old_dataset_rec.get_property("Event").value[0]).retrieve() + + # TODO(fspreck): crawl again manually, edit the event records in the update + # list, synchronize, and test whether the events have been updated. 
+ ident = CaosDBIdentifiableAdapter() + ident.load_from_yaml_definition(identifiable_path) + + second_crawler = Crawler(identifiableAdapter=ident) + crawler_definition = second_crawler.load_definition( + crawler_definition_path) + converter_registry = second_crawler.load_converters(crawler_definition) + records = second_crawler.start_crawling( + Directory("data", os.path.join(DATADIR, "data")), + crawler_definition, + converter_registry + ) + + for rec in records: + if rec.parents[0].name == "Event": + rec.get_property("longitude").value = 0.0 + rec.get_property("latitude").value = 0.0 + rec.get_property("location").value = "Origin" + elif rec.parents[0].name == "Dataset": + rec.get_property("Event").value[0].get_property( + "longitude").value = 0.0 + rec.get_property("Event").value[0].get_property( + "latitude").value = 0.0 + rec.get_property("Event").value[0].get_property( + "location").value = "Origin" + second_crawler.synchronize() + + # Dataset is still the same Record, but with an updated event + new_dataset_rec = db.Record(id=old_dataset_rec.id).retrieve() + for prop in old_dataset_rec.get_properties(): + if not prop.name == "Event": + assert new_dataset_rec.get_property( + prop.name).datatype == prop.datatype + assert new_dataset_rec.get_property( + prop.name).value == prop.value + assert new_dataset_rec.get_property("Event").datatype == db.LIST("Event") + assert new_dataset_rec.get_property("Event").value is not None + assert len(new_dataset_rec.get_property("Event").value) == 1 + assert new_dataset_rec.get_property("Event").value[0] != old_event_rec.id + + # The event has new properties + new_event_rec = db.Record( + id=new_dataset_rec.get_property("Event").value[0]).retrieve() + assert new_event_rec.get_property("longitude").value == 0.0 + assert new_event_rec.get_property("latitude").value == 0.0 + assert new_event_rec.get_property("location").value == "Origin" + assert new_event_rec.get_property( + "start_datetime").value == old_event_rec.get_property("start_datetime").value diff --git a/integrationtests/test_use_case_simple_presentation.py b/integrationtests/test_use_case_simple_presentation.py index ba8009fcaab2696fb20970a2a6daaa8848d6d0a6..f1c838d1aadf4cb8b51043a8a24b93eddf275c75 100644 --- a/integrationtests/test_use_case_simple_presentation.py +++ b/integrationtests/test_use_case_simple_presentation.py @@ -32,7 +32,7 @@ from subprocess import run import caosdb as db from caosadvancedtools.loadFiles import loadpath from caosadvancedtools.models import parser as parser -from caoscrawler.crawl import crawler_main +from caoscrawler.crawl import main as crawler_main # TODO(fspreck) Re-eneable once this is part of dev in advancedusertools. 
@@ -77,7 +77,6 @@ def test_complete_crawler( True, os.path.join(DATADIR, "provenance.yml"), False, - True, "/use_case_simple_presentation") res = db.execute_query("FIND Record Experiment") diff --git a/setup.cfg b/setup.cfg index 2f8d46b30ee04d68adc6aef69e1a04115bbc44d8..9c652aa9ad32757075bd37f0bd5efeadcaa34582 100644 --- a/setup.cfg +++ b/setup.cfg @@ -24,6 +24,8 @@ install_requires = caosadvancedtools yaml-header-tools pyyaml + odfpy #make optional + pandas [options.packages.find] where = src diff --git a/src/caoscrawler/__init__.py b/src/caoscrawler/__init__.py index 28ef97d421023ad41be65d9d0e6abac76fbef6fe..b65b9fd9d24b9519a52ca13d07e46c9d8f791a73 100644 --- a/src/caoscrawler/__init__.py +++ b/src/caoscrawler/__init__.py @@ -1 +1 @@ -from .crawl import Crawler +from .crawl import Crawler, SecurityMode diff --git a/src/caoscrawler/authorize.py b/src/caoscrawler/authorize.py new file mode 100644 index 0000000000000000000000000000000000000000..6f1011b227881d4b73186996076abe20d94d52e5 --- /dev/null +++ b/src/caoscrawler/authorize.py @@ -0,0 +1,38 @@ +#!/usr/bin/env python3 +# encoding: utf-8 +# +# This file is a part of the CaosDB Project. +# +# Copyright (C) 2022 Henrik tom Wörden +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <https://www.gnu.org/licenses/>. +# + +from caosadvancedtools.crawler import Crawler as OldCrawler + +import argparse + + +def parse_args(): + parser = argparse.ArgumentParser() + parser.add_argument("run_id", + help="Run ID or the crawler run that created the changes that shall be " + "authorized.") + + return parser.parse_args() + + +if __name__ == "__main__": + args = parse_args() + OldCrawler.update_authorized_changes(args.run_id) diff --git a/src/caoscrawler/cfood-schema.yml b/src/caoscrawler/cfood-schema.yml index 6505cde7601d89dea84fa80d1ab7c36b2eca6895..d7b5abfd1ac6c381b50bd4ce61015f1b8602b408 100644 --- a/src/caoscrawler/cfood-schema.yml +++ b/src/caoscrawler/cfood-schema.yml @@ -23,6 +23,8 @@ cfood: - Definitions - Dict - JSONFile + - CSVTableConverter + - XLSXTableConverter description: Type of this converter node. 
match: description: typically a regexp which is matched to a structure element name diff --git a/src/caoscrawler/converters.py b/src/caoscrawler/converters.py index b8b9bd2ce7bff206d1233953f05c795a45a5b4ca..e3f72b10ce2694853d6bc0644c736f0d621ed881 100644 --- a/src/caoscrawler/converters.py +++ b/src/caoscrawler/converters.py @@ -40,6 +40,8 @@ from abc import abstractmethod from string import Template import yaml_header_tools +import pandas as pd + import yaml # These are special properties which are (currently) treated differently @@ -48,6 +50,15 @@ SPECIAL_PROPERTIES = ("description", "name", "id", "path", "file", "checksum", "size") +def str_to_bool(x): + if str(x).lower() == "true": + return True + elif str(x).lower() == "false": + return False + else: + raise RuntimeError("Should be 'true' or 'false'.") + + class ConverterValidationError(Exception): """To be raised if contents of an element to be converted are invalid.""" @@ -55,13 +66,37 @@ class ConverterValidationError(Exception): self.message = msg -def handle_value(value: Union[dict, str], values: GeneralStore): - """ - Function to generically handle values for properties defined in the - yaml structure. +def replace_variables(propvalue, values: GeneralStore): + # Check if the replacement is a single variable containing a record: + match = re.match(r"^\$(\{)?(?P<varname>[0-9a-zA-Z_]+)(\})?$", propvalue) + if match is not None: + varname = match.group("varname") + if varname in values: + if values[varname] is None: + return None + if isinstance(values[varname], db.Entity): + return values[varname] + propvalue_template = Template(propvalue) + return propvalue_template.safe_substitute(**values.get_storage()) + + +def handle_value(value: Union[dict, str, list], values: GeneralStore): + """ + Determines whether the given value sets a single property, is appended to an existing value (creating a list) or + is added as an additional property (multiproperty). + + Variable names (starting with a "$") are replaced by the corresponding value stored in the + `values` GeneralStore. + + Parameters: + - value: if str, the value to be interpreted. E.g. "4", "hallo" or "$a" etc. + if dict, must have keys "value" and "collection_mode". The returned tuple is directly + created from the corresponding values. + if list, each element is checked for replacement and the resulting list will be used + as the (list) value for the property Returns a tuple: - - the final value of the property + - the final value of the property; variable names contained in `values` are replaced. - the collection mode (can be single, list or multiproperty) """ # @review Florian Spreckelsen 2022-05-13 @@ -90,22 +125,19 @@ def handle_value(value: Union[dict, str], values: GeneralStore): # different from the two cases above.
collection_mode = "single" propvalue = value - return (propvalue, collection_mode) - # Check if the replacement is a single variable containing a record: - match = re.match(r"^\$(\{)?(?P<varname>[0-9a-zA-Z_]+)(\})?$", propvalue) - if match is not None: - varname = match.group("varname") - if varname in values: - if values[varname] is None: - propvalue = None - return (propvalue, collection_mode) - if isinstance(values[varname], db.Entity): - propvalue = values[varname] - return (propvalue, collection_mode) + # variables replacement: + propvalue = [replace_variables(i, values) for i in propvalue] - propvalue_template = Template(propvalue) - propvalue = propvalue_template.safe_substitute(**values.get_storage()) + return (propvalue, collection_mode) + else: + # value is another simple type + # collection_mode = "single" + # propvalue = value["value"] + # return (propvalue, collection_mode) + raise RuntimeError() + + propvalue = replace_variables(propvalue, values) return (propvalue, collection_mode) @@ -113,7 +145,7 @@ def create_records(values: GeneralStore, records: RecordStore, def_records: dict): # list of keys to identify, which variables have been set by which paths: - # these are tuples: + # the items are tuples: # 0: record name # 1: property name keys_modified = [] @@ -143,6 +175,11 @@ def create_records(values: GeneralStore, for key, value in record.items(): if key == "parents" or key == "role": continue + + # Allow replacing variables in keys / names of properties: + key_template = Template(key) + key = key_template.safe_substitute(**values.get_storage()) + keys_modified.append((name, key)) propvalue, collection_mode = handle_value(value, values) @@ -151,6 +188,9 @@ def create_records(values: GeneralStore, # list mode does not work for them if key == "path" and not propvalue.startswith(os.path.sep): propvalue = os.path.sep + propvalue + + # Convert relative to absolute paths: + propvalue = os.path.normpath(propvalue) setattr(c_record, key, propvalue) else: @@ -607,3 +647,102 @@ class TextElementConverter(Converter): if m is None: return None return m.groupdict() + + +class TableConverter(Converter): + """ + This converter reads tables in different formats line by line and + allows matching the corresponding rows. + + The subtree generated by the table converter consists of DictDictElements, each being + a row. The corresponding header elements will become the dictionary keys. + + The rows can be matched using a DictDictElementConverter. + """ + @abstractmethod + def get_options(self): + """ + This method needs to be overwritten by the specific table converter to provide + information about the possible options. + """ + pass + + def _get_options(self, possible_options): + option_dict = dict() + for opt_name, opt_conversion in possible_options: + if opt_name in self.definition: + el = self.definition[opt_name] + # The option can often either be a single value or a list of values. + # In the latter case each element of the list will be converted to the defined type. 
+ if isinstance(el, list): + option_dict[opt_name] = [opt_conversion(el_el) for el_el in el] + else: + option_dict[opt_name] = opt_conversion(el) + return option_dict + + def typecheck(self, element: StructureElement): + return isinstance(element, File) + + def match(self, element: StructureElement): + if not isinstance(element, File): + raise RuntimeError("Element must be a File.") + m = re.match(self.definition["match"], element.name) + if m is None: + return None + return m.groupdict() + + +class XLSXTableConverter(TableConverter): + def get_options(self): + return self._get_options([ + ("sheet_name", str), + ("header", int), + ("names", str), + ("index_col", int), + ("usecols", int), + ("true_values", str), + ("false_values", str), + ("na_values", str), + ("skiprows", int), + ("nrows", int), + ("keep_default_na", str_to_bool), ] + ) + + def create_children(self, generalStore: GeneralStore, + element: StructureElement): + if not isinstance(element, File): + raise RuntimeError("Element must be a File.") + table = pd.read_excel(element.path, **self.get_options()) + child_elements = list() + for index, row in table.iterrows(): + child_elements.append( + DictDictElement(str(index), row.to_dict())) + return child_elements + + +class CSVTableConverter(TableConverter): + def get_options(self): + return self._get_options([ + ("sep", str), + ("delimiter", str), + ("header", int), + ("names", str), + ("index_col", int), + ("usecols", int), + ("true_values", str), + ("false_values", str), + ("na_values", str), + ("skiprows", int), + ("nrows", int), + ("keep_default_na", str_to_bool), ]) + + def create_children(self, generalStore: GeneralStore, + element: StructureElement): + if not isinstance(element, File): + raise RuntimeError("Element must be a File.") + table = pd.read_csv(element.path, **self.get_options()) + child_elements = list() + for index, row in table.iterrows(): + child_elements.append( + DictDictElement(str(index), row.to_dict())) + return child_elements diff --git a/src/caoscrawler/crawl.py b/src/caoscrawler/crawl.py index 3a78392e56b55b152f10760b7bec9f9b205263af..d9e23a69f0f6ddeb7e124c11c5fa2015d08bbafc 100644 --- a/src/caoscrawler/crawl.py +++ b/src/caoscrawler/crawl.py @@ -28,13 +28,19 @@ Crawl a file structure using a yaml cfood definition and synchronize the acuired data with CaosDB. 
""" +import importlib +from caosadvancedtools.cache import UpdateCache, Cache +import uuid import sys import os import yaml +from enum import Enum +import logging from importlib_resources import files import argparse from argparse import RawTextHelpFormatter import caosdb as db +from caosadvancedtools.crawler import Crawler as OldCrawler from caosdb.common.datatype import is_reference from .stores import GeneralStore, RecordStore from .identified_cache import IdentifiedCache @@ -51,7 +57,7 @@ from jsonschema import validate from .macros import defmacro_constructor, macro_constructor -import importlib +logger = logging.getLogger(__name__) SPECIAL_PROPERTIES_STRICT = ("description", "name", "id", "path") SPECIAL_PROPERTIES_NOT_STRICT = ("file", "checksum", "size") @@ -107,7 +113,7 @@ def check_identical(record1: db.Entity, record2: db.Entity, ignore_id=False): return False for attribute in ("datatype", "importance", "unit"): # only make an update for those attributes if there is a value difference and - # the value in the updateList is not None + # the value in the target_data is not None if attribute in comp[0]["properties"][key]: attr_val = comp[0]["properties"][key][attribute] other_attr_val = (comp[1]["properties"][key][attribute] @@ -128,6 +134,7 @@ def check_identical(record1: db.Entity, record2: db.Entity, ignore_id=False): def _resolve_datatype(prop: db.Property, remote_entity: db.Entity): + """ sets the datatype on the given property (side effect) """ if remote_entity.role == "Property": datatype = remote_entity.datatype @@ -144,6 +151,12 @@ def _resolve_datatype(prop: db.Property, remote_entity: db.Entity): return prop +class SecurityMode(Enum): + RETRIEVE = 0 + INSERT = 1 + UPDATE = 2 + + class Crawler(object): """ Crawler class that encapsulates crawling functions. @@ -151,23 +164,35 @@ class Crawler(object): storage for values (general store). """ - def __init__(self, converters: list[Converter] = [], + def __init__(self, + converters: list[Converter] = [], generalStore: Optional[GeneralStore] = None, debug: bool = False, - identifiableAdapter: IdentifiableAdapter = None): + identifiableAdapter: IdentifiableAdapter = None, + securityMode: int = SecurityMode.UPDATE + ): """ Create a new crawler and initialize an empty RecordStore and GeneralStore. - converters: The set of converters used for this crawler. - recordStore: An initial GeneralStore which might store e.g. environment variables. - - debug: Create a debugging information tree when set to True. - The debugging information tree is a variable stored in - self.debug_tree. It is a dictionary mapping directory entries - to a tuple of general stores and record stores which are valid for the directory scope. - Furthermore, it is stored in a second tree named self.debug_copied whether the - objects in debug_tree had been copied from a higher level in the hierarchy - of the structureelements. + Parameters + ---------- + converters : list[Converter] + The set of converters used for this crawler. + recordStore : GeneralStore + An initial GeneralStore which might store e.g. environment variables. + debug : bool + Create a debugging information tree when set to True. + The debugging information tree is a variable stored in + self.debug_tree. It is a dictionary mapping directory entries + to a tuple of general stores and record stores which are valid for the directory scope. 
+ Furthermore, it is stored in a second tree named self.debug_copied whether the + objects in debug_tree had been copied from a higher level in the hierarchy + of the structureelements. + identifiableAdapter : IdentifiableAdapter + TODO describe + securityMode : int + Whether only retrieves are allowed or also inserts or even updates. + Please use SecurityMode Enum """ # TODO: check if this feature is really needed @@ -175,6 +200,7 @@ class Crawler(object): self.identified_cache = IdentifiedCache() self.recordStore = RecordStore() + self.securityMode = securityMode self.generalStore = generalStore if generalStore is None: @@ -183,7 +209,8 @@ class Crawler(object): self.identifiableAdapter = identifiableAdapter if identifiableAdapter is None: self.identifiableAdapter = LocalStorageIdentifiableAdapter() - + # If a directory is crawled this may hold the path to that directory + self.crawled_directory = None self.debug = debug if self.debug: # order in the tuple: @@ -278,6 +305,12 @@ class Crawler(object): "JSONFile": { "converter": "JSONFileConverter", "package": "caoscrawler.converters"}, + "CSVTableConverter": { + "converter": "CSVTableConverter", + "package": "caoscrawler.converters"}, + "XLSXTableConverter": { + "converter": "XLSXTableConverter", + "package": "caoscrawler.converters"}, "Dict": { "converter": "DictConverter", "package": "caoscrawler.converters"}, @@ -333,6 +366,7 @@ class Crawler(object): raise ValueError( "You have to provide a non-empty path for crawling.") dir_structure_name = os.path.basename(dirname) + self.crawled_directory = dirname if not dir_structure_name and dirname.endswith('/'): if dirname == '/': # Crawling the entire file system @@ -372,12 +406,19 @@ class Crawler(object): """ Start point of the crawler recursion. - items: A list of structure elements (or a single StructureElemen) that is used for - generating the initial items for the crawler. This could e.g. be a Directory. - crawler_definition: A dictionary representing the crawler definition, possibly from a yaml - file. + Parameters + ---------- + items: list + A list of structure elements (or a single StructureElement) that is used for + generating the initial items for the crawler. This could e.g. be a Directory. + crawler_definition : dict + A dictionary representing the crawler definition, possibly from a yaml + file. - Return the final update list. + Returns + ------- + target_data : list + the final list with the target state of Records. """ # This function builds the tree of converters out of the crawler definition. 
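For orientation, the converter registry and the start_crawling API documented above can also be driven directly from Python. The following is a minimal sketch only, modelled on the calls in integrationtests/test_realworld_example.py; the cfood file name and data directory are placeholder values, not part of this patch.

from caoscrawler.crawl import Crawler, SecurityMode
from caoscrawler.structure_elements import Directory

# Placeholder paths; replace with a real cfood definition and data directory.
crawler = Crawler(securityMode=SecurityMode.UPDATE)
crawler_definition = crawler.load_definition("my_cfood.yml")
converter_registry = crawler.load_converters(crawler_definition)
# Crawl the directory tree and collect the target records ...
crawler.start_crawling(Directory("data", "/path/to/data"),
                       crawler_definition, converter_registry)
# ... then synchronize them with the server.
ins, ups = crawler.synchronize()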
@@ -388,10 +429,11 @@ class Crawler(object): if not isinstance(items, list): items = [items] + self.run_id = uuid.uuid1() local_converters = Crawler.create_local_converters(crawler_definition, converter_registry) # This recursive crawling procedure generates the update list: - self.updateList: list[db.Record] = [] + self.target_data: list[db.Record] = [] self._crawl(items, self.global_converters, local_converters, self.generalStore, self.recordStore, [], []) @@ -399,7 +441,7 @@ class Crawler(object): if self.debug: self.debug_converters = self.global_converters + local_converters - return self.updateList + return self.target_data def synchronize(self, commit_changes: bool = True): """ @@ -409,7 +451,7 @@ class Crawler(object): # After the crawling, the actual synchronization with the database, based on the # update list is carried out: - return self._synchronize(self.updateList, commit_changes) + return self._synchronize(self.target_data, commit_changes) def can_be_checked_externally(self, record: db.Record): """ @@ -689,18 +731,19 @@ class Crawler(object): return to_be_inserted, to_be_updated - # TODO: replace _by_ with _with_ - def replace_entities_by_ids(self, rec: db.Record): + def replace_entities_with_ids(self, rec: db.Record): for el in rec.properties: if isinstance(el.value, db.Entity): - el.value = el.value.id + if el.value.id is not None: + el.value = el.value.id elif isinstance(el.value, list): for index, val in enumerate(el.value): if isinstance(val, db.Entity): - el.value[index] = val.id + if val.id is not None: + el.value[index] = val.id @staticmethod - def remove_unnecessary_updates(updateList: list[db.Record], + def remove_unnecessary_updates(target_data: list[db.Record], identified_records: list[db.Record]): """ checks whether all relevant attributes (especially Property values) are equal @@ -710,52 +753,67 @@ class Crawler(object): update list without unecessary updates """ - if len(updateList) != len(identified_records): + if len(target_data) != len(identified_records): raise RuntimeError("The lists of updates and of identified records need to be of the " "same length!") # TODO this can now easily be changed to a function without side effect - for i in reversed(range(len(updateList))): - identical = check_identical(updateList[i], identified_records[i]) + for i in reversed(range(len(target_data))): + identical = check_identical(target_data[i], identified_records[i]) if identical: - del updateList[i] + del target_data[i] continue else: pass @staticmethod - def execute_inserts_in_list(to_be_inserted): + def _get_entity_by_name(name): + return db.Entity(name=name).retrieve() + + @staticmethod + def execute_inserts_in_list(to_be_inserted, securityMode, run_id: int = None): for record in to_be_inserted: for prop in record.properties: - entity = db.Entity(name=prop.name).retrieve() - prop = _resolve_datatype(prop, entity) - print("INSERT") - print(to_be_inserted) + entity = Crawler._get_entity_by_name(prop.name) + _resolve_datatype(prop, entity) + logger.debug("INSERT") + logger.debug(to_be_inserted) if len(to_be_inserted) > 0: - db.Container().extend(to_be_inserted).insert() + if securityMode.value > SecurityMode.RETRIEVE.value: + db.Container().extend(to_be_inserted).insert() + elif run_id is not None: + update_cache = UpdateCache() + update_cache.insert(to_be_inserted, run_id, insert=True) @staticmethod - def execute_updates_in_list(to_be_updated): - # retrieve ids of properties when missing: - for record in to_be_updated: + def 
set_ids_and_datatype_of_parents_and_properties(rec_list): + for record in rec_list: for parent in record.parents: if parent.id is None: - parent.id = db.Entity(name=parent.name).retrieve().id + parent.id = Crawler._get_entity_by_name(parent.name).id for prop in record.properties: if prop.id is None: - entity = db.Entity(name=prop.name).retrieve() + entity = Crawler._get_entity_by_name(prop.name) prop.id = entity.id - prop = _resolve_datatype(prop, entity) - print("UPDATE") - print(to_be_updated) + _resolve_datatype(prop, entity) + + @staticmethod + def execute_updates_in_list(to_be_updated, securityMode, run_id: int = None): + Crawler.set_ids_and_datatype_of_parents_and_properties(to_be_updated) + logger.debug("UPDATE") + logger.debug(to_be_updated) if len(to_be_updated) > 0: - db.Container().extend(to_be_updated).update() + if securityMode.value > SecurityMode.INSERT.value: + db.Container().extend(to_be_updated).update() + elif run_id is not None: + update_cache = UpdateCache() + update_cache.insert(to_be_updated, run_id) - def _synchronize(self, updateList: list[db.Record], commit_changes: bool = True): + def _synchronize(self, target_data: list[db.Record], commit_changes: bool = True): """ This function applies several stages: - 1) Retrieve identifiables for all records in updateList. - 2) Compare updateList with existing records. + 1) Retrieve identifiables for all records in target_data. + 2) Compare target_data with existing records. 3) Insert and update records based on the set of identified differences. This function makes use of an IdentifiableAdapter which is used to retrieve @@ -764,30 +822,63 @@ class Crawler(object): if commit_changes is True, the changes are synchronized to the CaosDB server. For debugging in can be useful to set this to False. - Return the final insertList and updateList as tuple. + Return the final to_be_inserted and to_be_updated as tuple. 
""" if self.identifiableAdapter is None: raise RuntimeError("Should not happen.") - to_be_inserted, to_be_updated = self.split_into_inserts_and_updates( - updateList) + to_be_inserted, to_be_updated = self.split_into_inserts_and_updates(target_data) - # remove unnecessary updates from list # TODO: refactoring of typo for el in to_be_updated: - self.replace_entities_by_ids(el) + # all entity objects are replaced by their IDs except for the not yet inserted ones + self.replace_entities_with_ids(el) - identified_records = [self.identifiableAdapter.retrieve_identified_record_for_record(record) for record - in to_be_updated] + identified_records = [ + self.identifiableAdapter.retrieve_identified_record_for_record(record) + for record in to_be_updated] + # remove unnecessary updates from list by comparing the target records to the existing ones self.remove_unnecessary_updates(to_be_updated, identified_records) if commit_changes: - self.execute_inserts_in_list(to_be_inserted) - self.execute_updates_in_list(to_be_updated) + self.execute_inserts_in_list(to_be_inserted, self.securityMode, self.run_id) + self.execute_updates_in_list(to_be_updated, self.securityMode, self.run_id) + + update_cache = UpdateCache() + pending_inserts = update_cache.get_inserts(self.run_id) + if pending_inserts: + Crawler.inform_about_pending_changes( + pending_inserts, self.run_id, self.crawled_directory) + + pending_updates = update_cache.get_updates(self.run_id) + if pending_updates: + Crawler.inform_about_pending_changes( + pending_updates, self.run_id, self.crawled_directory) return (to_be_inserted, to_be_updated) + @staticmethod + def inform_about_pending_changes(pending_changes, run_id, path, inserts=False): + # Sending an Email with a link to a form to authorize updates is + # only done in SSS mode + + if "SHARED_DIR" in os.environ: + filename = OldCrawler.save_form([el[3] for el in pending_changes], path, run_id) + OldCrawler.send_mail([el[3] for el in pending_changes], filename) + + for i, el in enumerate(pending_changes): + + logger.debug( + """ +UNAUTHORIZED UPDATE ({} of {}): +____________________\n""".format(i + 1, len(pending_changes)) + str(el[3])) + logger.info("There were unauthorized changes (see above). An " + "email was sent to the curator.\n" + "You can authorize the " + ("inserts" if inserts else "updates") + + " by invoking the crawler" + " with the run id: {rid}\n".format(rid=run_id)) + @staticmethod def debug_build_usage_tree(converter: Converter): res: dict[str, dict[str, Any]] = { @@ -899,7 +990,7 @@ class Crawler(object): # to the general update container. scoped_records = recordStore.get_records_current_scope() for record in scoped_records: - self.updateList.append(record) + self.target_data.append(record) # TODO: the scoped variables should be cleaned up as soon if the variables # are no longer in the current scope. 
This can be implemented as follows, @@ -912,29 +1003,55 @@ # del recordStore[name] # del generalStore[name] - return self.updateList + return self.target_data -def crawler_main(args_path, - args_cfood, - args_load_identifiables, - args_debug, - args_provenance, - args_dry_sync, - args_sync, - args_prefix): - crawler = Crawler(debug=args_debug) - crawler.crawl_directory(args_path, args_cfood) - if args_provenance is not None: - crawler.save_debug_data(args_provenance) +def crawler_main(crawled_directory_path: str, + cfood_file_name: str, + identifiables_definition_file: str = None, + debug: bool = False, + provenance_file: str = None, + dry_run: bool = False, + prefix: str = "", + securityMode: int = SecurityMode.UPDATE): + """ - if args_load_identifiables is not None: + Parameters + ---------- + crawled_directory_path : str + path to be crawled + cfood_file_name : str + filename of the cfood to be used + identifiables_definition_file : str + filename of an identifiable definition yaml file + debug : bool + whether or not to run in debug mode + provenance_file : str + provenance information will be stored in a file with given filename + dry_run : bool + do not commit any changes to the server + prefix : str + remove the given prefix from file paths + securityMode : int + securityMode of Crawler + + Returns + ------- + return_value : int + 0 if successful + """ + crawler = Crawler(debug=debug, securityMode=securityMode) + crawler.crawl_directory(crawled_directory_path, cfood_file_name) + if provenance_file is not None: + crawler.save_debug_data(provenance_file) + + if identifiables_definition_file is not None: ident = CaosDBIdentifiableAdapter() - ident.load_from_yaml_definition(args_load_identifiables) + ident.load_from_yaml_definition(identifiables_definition_file) crawler.identifiableAdapter = ident - if args_dry_sync: + if dry_run: ins, upd = crawler.synchronize(commit_changes=False) inserts = [str(i) for i in ins] updates = [str(i) for i in upd] @@ -942,14 +1059,14 @@ def crawler_main(args_path, f.write(yaml.dump({ "insert": inserts, "update": updates})) - elif args_sync: + else: rtsfinder = dict() - for elem in crawler.updateList: + for elem in crawler.target_data: if isinstance(elem, db.File): # correct the file path: # elem.file = os.path.join(args.path, elem.file) - if elem.path.startswith(args_prefix): - elem.path = elem.path[len(args_prefix):] + if elem.path.startswith(prefix): + elem.path = elem.path[len(prefix):] elem.file = None # TODO: as long as the new file backend is not finished # we are using the loadFiles function to insert symlinks. @@ -981,18 +1098,20 @@ def crawler_main(args_path, def parse_args(): parser = argparse.ArgumentParser(description=__doc__, formatter_class=RawTextHelpFormatter) - parser.add_argument("cfood", + parser.add_argument("cfood_file_name", help="Path name of the cfood yaml file to be used.") parser.add_argument("--provenance", required=False, help="Path name of the provenance yaml file. " "This file will only be generated if this option is set.") parser.add_argument("--debug", required=False, action="store_true", help="Path name of the cfood yaml file to be used.") - parser.add_argument("path", + parser.add_argument("crawled_directory_path", help="The subtree of files below the given path will " "be considered.
Use '/' for everything.") - - parser.add_argument("-n", "--dry-sync", action="store_true", + parser.add_argument("-s", "--security-mode", choices=["retrieve", "insert", "update"], + help="Determines whether entities may only be read from the server, or " + "whether inserts or even updates may be done.") + parser.add_argument("-n", "--dry-run", action="store_true", help="Create two files dry.yml to show" "what would actually be committed without doing the synchronization.") @@ -1000,9 +1119,6 @@ def parse_args(): parser.add_argument("-i", "--load-identifiables", help="Load identifiables from " "the given yaml file.") - parser.add_argument("-s", "--sync", action="store_true", - help="Do the synchronization. This is probably the expected " - "standard behavior of the crawler.") parser.add_argument("-p", "--prefix", help="Remove the given prefix from the paths " @@ -1010,20 +1126,31 @@ def parse_args(): return parser.parse_args() - def main(): args = parse_args() - return crawler_main( - args.path, - args.cfood, + + conlogger = logging.getLogger("connection") + conlogger.setLevel(level=logging.ERROR) + + # logging config for local execution + logger.addHandler(logging.StreamHandler(sys.stdout)) + if args.debug: + logger.setLevel(logging.DEBUG) + else: + logger.setLevel(logging.INFO) + + sys.exit(crawler_main( + args.crawled_directory_path, + args.cfood_file_name, args.load_identifiables, args.debug, args.provenance, - args.dry_sync, - args.sync, - args.prefix - ) - + args.dry_run, + args.prefix, + {"retrieve": SecurityMode.RETRIEVE, + "insert": SecurityMode.INSERT, + "update": SecurityMode.UPDATE}[args.security_mode] + )) if __name__ == "__main__": - sys.exit(main()) + main() diff --git a/src/caoscrawler/identifiable_adapters.py b/src/caoscrawler/identifiable_adapters.py index 47fd5324a4803c67d7c9f99448378e7b5f9241bd..0b00cbeaefe42bcf600db735d27c67571ca6a79b 100644 --- a/src/caoscrawler/identifiable_adapters.py +++ b/src/caoscrawler/identifiable_adapters.py @@ -27,8 +27,10 @@ import yaml from datetime import datetime import caosdb as db +import logging from abc import abstractmethod, ABCMeta from .utils import has_parent +logger = logging.getLogger(__name__) def convert_value(value): @@ -202,7 +204,9 @@ class IdentifiableAdapter(metaclass=ABCMeta): if record_prop is None: # TODO: how to handle missing values in identifiables # raise an exception? - raise NotImplementedError() + raise NotImplementedError( + f"RECORD\n{record}\nPROPERTY\n{prop.name}" + ) newval = record_prop.value if isinstance(record_prop.value, db.Entity): newval = self.resolve_reference(record_prop.value) @@ -245,6 +249,7 @@ class IdentifiableAdapter(metaclass=ABCMeta): pass # TODO: remove side effect + # TODO: use ID if record has one? def retrieve_identified_record_for_record(self, record: db.Record): """ This function combines all functionality of the IdentifierAdapter by diff --git a/tox.ini b/tox.ini index 2cf966fb5b80e62cb7f216b0785ba567e13ee3ff..5ab67e67cfef0b3cf0cf82d2d28de0fe11aca6a1 100644 --- a/tox.ini +++ b/tox.ini @@ -1,11 +1,14 @@ [tox] -envlist=py36, py37, py38, py39, py310 +envlist=py38, py39, py310 skip_missing_interpreters = true [testenv] deps = . 
pytest pytest-cov + # TODO: Make this f-branch sensitive + git+https://gitlab.indiscale.com/caosdb/src/caosdb-pylib.git@dev + git+https://gitlab.indiscale.com/caosdb/src/caosdb-advanced-user-tools.git@dev commands=py.test --cov=caosdb -vv {posargs} [flake8] max-line-length=100 diff --git a/unittests/scifolder_cfood.yml b/unittests/scifolder_cfood.yml index 1fd7c98d57b35fa651e36bee2c529a46e3a96cde..90f193444bfda7296c46260236274da2378635cc 100644 --- a/unittests/scifolder_cfood.yml +++ b/unittests/scifolder_cfood.yml @@ -16,7 +16,7 @@ Data: # name of the converter subtree: &template project_dir: # name of the first subtree element which is a converter type: Directory - match: (?P<date>.*?)_(?P<identifier>.*) + match: ((?P<date>[0-9]{4,4})_)?(?P<identifier>.*) records: Project: # this is an identifiable in this case parents: diff --git a/unittests/scifolder_extended.yml b/unittests/scifolder_extended.yml index 2a1416b778e96ba57fc216d9763572568703ab75..9bab612b9b37e8e295ee8fd02575de506a98d8fc 100644 --- a/unittests/scifolder_extended.yml +++ b/unittests/scifolder_extended.yml @@ -16,12 +16,12 @@ Data: # name of the converter subtree: &template project_dir: # name of the first subtree element which is a converter type: Directory - match: (?P<date>.*?)_(?P<identifier>.*) + match: ((?P<year>[0-9]{4,4})_)?(?P<identifier>.*) records: Project: # this is an identifiable in this case parents: - Project # not needed as the name is equivalent - date: $date + date: $year identifier: $identifier subtree: diff --git a/unittests/scifolder_extended2.yml b/unittests/scifolder_extended2.yml index f1dfc2d4635b6956930343685c7b17ca4f2f1679..969325e91da488011819c338708a33dcfc32c93e 100644 --- a/unittests/scifolder_extended2.yml +++ b/unittests/scifolder_extended2.yml @@ -6,95 +6,99 @@ Definitions: type: Definitions #include "description.yml" -DataAnalysis: # name of the converter +Data: # name of the converter type: Directory - match: DataAnalysis - subtree: &template - project_dir: # name of the first subtree element which is a converter + match: (.*) + subtree: + DataAnalysis: # name of the converter type: Directory - match: (?P<date>.*?)_(?P<identifier>.*) - records: - Project: # this is an identifiable in this case - parents: - - Project # not needed as the name is equivalent - date: $date - identifier: $identifier - - subtree: - measurement: # new name for folders on the 3rd level + match: DataAnalysis + subtree: &template + project_dir: # name of the first subtree element which is a converter type: Directory - match: (?P<date>[0-9]{4,4}-[0-9]{2,2}-[0-9]{2,2})(_(?P<identifier>.*))? + match: ((?P<year>[0-9]{4,4})_)?(?P<identifier>.*) records: - Measurement: - date: $date + Project: # this is an identifiable in this case + parents: + - Project # not needed as the name is equivalent + date: $year identifier: $identifier - project: $Project + subtree: - README: - type: MarkdownFile # this is a subclass of converter File - # function signature: GeneralStore, StructureElement - # preprocessors: custom.caosdb.convert_values - match: ^README\.md$ - # how to make match case insensitive? 
- records: # this block is very verbose and intended to make sure that this - # file is inserted correctly (and can be supplemented with properties - # and / or parents), TODO: maybe there should be a shorthand - ReadmeFile: - parents: - - ProjectMarkdownReadme - role: File - path: $README - file: $README # this is automatically the relative path - # starting from the top level structure element - # of this element + measurement: # new name for folders on the 3rd level + type: Directory + match: (?P<date>[0-9]{4,4}-[0-9]{2,2}-[0-9]{2,2})(_(?P<identifier>.*))? + records: Measurement: - ReadmeFile: $ReadmeFile - + date: $date + identifier: $identifier + project: $Project subtree: - description: - type: DictTextElement - match_value: (?P<description>.*) - match_name: description - records: + README: + type: MarkdownFile # this is a subclass of converter File + # function signature: GeneralStore, StructureElement + # preprocessors: custom.caosdb.convert_values + match: ^README\.md$ + # how to make match case insensitive? + records: # this block is very verbose and intended to make sure that this + # file is inserted correctly (and can be supplemented with properties + # and / or parents), TODO: maybe there should be a shorthand + ReadmeFile: + parents: + - ProjectMarkdownReadme + role: File + path: $README + file: $README # this is automatically the relative path + # starting from the top level structure element + # of this element Measurement: - description: $description - responsible_single: - type: DictTextElement - match_name: responsible - match_value: &person_regexp ((?P<first_name>.+) )?(?P<last_name>.+) - records: &responsible_records - Person: - first_name: $first_name - last_name: $last_name - Measurement: # this uses the reference to the above defined record - responsible: +$Person # each record also implicitely creates a variable - # with the same name. The "+" indicates, that - # this will become a list entry in list property - # "responsible" belonging to Measurement. - - responsible_list: - type: DictListElement - match_name: responsible + ReadmeFile: $ReadmeFile + subtree: - Person: - type: TextElement - match: *person_regexp - records: *responsible_records + description: + type: DictTextElement + match_value: (?P<description>.*) + match_name: description + records: + Measurement: + description: $description + responsible_single: + type: DictTextElement + match_name: responsible + match_value: &person_regexp ((?P<first_name>.+) )?(?P<last_name>.+) + records: &responsible_records + Person: + first_name: $first_name + last_name: $last_name + Measurement: # this uses the reference to the above defined record + responsible: +$Person # each record also implicitely creates a variable + # with the same name. The "+" indicates, that + # this will become a list entry in list property + # "responsible" belonging to Measurement. - # sources_list: - # type: DictListElement - # match_name: sources - # subtree: - # Source: - # type: TextElement - # match: &path ... ??? + responsible_list: + type: DictListElement + match_name: responsible + subtree: + Person: + type: TextElement + match: *person_regexp + records: *responsible_records -ExperimentalData: # name of the converter - type: Directory - match: ExperimentalData - subtree: *template + # sources_list: + # type: DictListElement + # match_name: sources + # subtree: + # Source: + # type: TextElement + # match: &path ... ??? 
-SimulationData: # name of the converter - type: Directory - match: SimulationData - subtree: *template + ExperimentalData: # name of the converter + type: Directory + match: ExperimentalData + subtree: *template + + SimulationData: # name of the converter + type: Directory + match: SimulationData + subtree: *template diff --git a/unittests/simulated_server_data.py b/unittests/simulated_server_data.py new file mode 100644 index 0000000000000000000000000000000000000000..6a523dbb06397b380510f72502a76cc6bda5f06c --- /dev/null +++ b/unittests/simulated_server_data.py @@ -0,0 +1,24 @@ + +import caosdb as db +data_model = {"person": (db.RecordType(id=10001, name="Person") + .add_property(name="first_name") + .add_property(name="last_name")), + "measurement": (db.RecordType(id=10002, name="Measurement") + .add_property(name="identifier") + .add_property(name="date") + .add_property(name="project")), + "project": (db.RecordType(id=10003, name="Project") + .add_property(name="date") + .add_property(name="identifier")), + "first_name": db.Property(name="first_name", datatype=db.TEXT, id=10004), + "responsible": db.Property(name="responsible", datatype="Person", id=10005), + "last_name": db.Property(name="last_name", datatype=db.TEXT, id=10006), + "identifier": db.Property(name="identifier", datatype=db.TEXT, id=10007), + "date": db.Property(name="date", datatype=db.DATETIME, id=10008), + } +existing_data = { +} + +full_data = {} +full_data.update(data_model) +full_data.update(existing_data) diff --git a/unittests/test_converters.py b/unittests/test_converters.py index 5f56486ba0f63fdd64d4e4dd80e6d6eaeed705d1..7a6987b8b3fae9d747f2440de202df5d10a34cc0 100644 --- a/unittests/test_converters.py +++ b/unittests/test_converters.py @@ -1,11 +1,10 @@ #!/usr/bin/env python3 # encoding: utf-8 # -# ** header v3.0 # This file is a part of the CaosDB Project. # -# Copyright (C) 2021 Indiscale GmbH <info@indiscale.com> -# Copyright (C) 2021 Henrik tom Wörden <h.tomwoerden@indiscale.com> +# Copyright (C) 2021,2022 Indiscale GmbH <info@indiscale.com> +# Copyright (C) 2021,2022 Henrik tom Wörden <h.tomwoerden@indiscale.com> # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU Affero General Public License as @@ -20,8 +19,6 @@ # You should have received a copy of the GNU Affero General Public License # along with this program. If not, see <https://www.gnu.org/licenses/>. 
# -# ** end header -# """ test the converters module @@ -38,6 +35,8 @@ from caoscrawler.structure_elements import (File, DictTextElement, DictBooleanElement, DictDictElement, DictIntegerElement, DictFloatElement) +from caoscrawler.converters import handle_value + from test_tool import rfp import pytest @@ -244,3 +243,32 @@ def test_json_converter(converter_registry): children = jsonconverter.create_children(None, broken_json) assert err.value.message.startswith("Couldn't validate") + + +def test_variable_replacement(): + values = GeneralStore() + values["a"] = 4 + values["b"] = "68" + + assert handle_value("b", values) == ("b", "single") + assert handle_value("+b", values) == ("b", "list") + assert handle_value("*b", values) == ("b", "multiproperty") + assert handle_value("$b", values) == ("68", "single") + assert handle_value("+$b", values) == ("68", "list") + assert handle_value("*$b", values) == ("68", "multiproperty") + + assert handle_value({"value": "b", + "collection_mode": "single"}, values) == ("b", "single") + assert handle_value({"value": "b", + "collection_mode": "list"}, values) == ("b", "list") + assert handle_value({"value": "b", + "collection_mode": "multiproperty"}, values) == ("b", "multiproperty") + assert handle_value({"value": "$b", + "collection_mode": "single"}, values) == ("68", "single") + assert handle_value({"value": "$b", + "collection_mode": "list"}, values) == ("68", "list") + assert handle_value({"value": "$b", + "collection_mode": "multiproperty"}, values) == ("68", "multiproperty") + + assert handle_value(["a", "b"], values) == (["a", "b"], "single") + assert handle_value(["$a", "$b"], values) == (["4", "68"], "single") diff --git a/unittests/test_directories/examples_tables/ExperimentalData/test1.csv b/unittests/test_directories/examples_tables/ExperimentalData/test1.csv new file mode 100644 index 0000000000000000000000000000000000000000..c2eb297b523c06729937a07221c695105df0b09c --- /dev/null +++ b/unittests/test_directories/examples_tables/ExperimentalData/test1.csv @@ -0,0 +1,8 @@ +Col_1,Col_2,Col_3,text +Index,description,, +,m,s, +0,12,1,jdsfkljadskf +1,14,3,jdkfljad +2,3,4,jadkfjdsk +3,4.5,6, +4,8,7,jadskfj diff --git a/unittests/test_directories/examples_tables/ExperimentalData/test1.xlsx b/unittests/test_directories/examples_tables/ExperimentalData/test1.xlsx new file mode 100644 index 0000000000000000000000000000000000000000..2bf68c8a854ae7f618e47e1db58490fc76c055b2 Binary files /dev/null and b/unittests/test_directories/examples_tables/ExperimentalData/test1.xlsx differ diff --git a/unittests/test_directories/examples_tables/crawler_for_tables.yml b/unittests/test_directories/examples_tables/crawler_for_tables.yml new file mode 100644 index 0000000000000000000000000000000000000000..7aaea3e55eb4b8cb2329c24c8b7861f0d9e76d69 --- /dev/null +++ b/unittests/test_directories/examples_tables/crawler_for_tables.yml @@ -0,0 +1,59 @@ + +ExperimentalData: + type: Directory + match: ExperimentalData + records: + Project: + name: project + subtree: + XLSXTable: + type: XLSXTableConverter + match: test1\.xlsx + skiprows: [1, 2] + header: 0 + records: + Experiment: {} + + subtree: + Row: + type: DictDictElement + match_name: .* + records: + Measurement: {} + Experiment: + Measurements: +$Measurement + subtree: + Col_1: + type: DictIntegerElement + match_name: Col_1 + match_value: (?P<Value>[0-9]+) + records: + Measurement: + Col_1: $Value + CSVTable: + type: CSVTableConverter + match: test1\.csv + skiprows: [1, 2] + header: 0 + records: + Experiment: {} + + 
subtree: + Row: + type: DictDictElement + match_name: .* + records: + Measurement: {} + Experiment: + Measurements: +$Measurement + subtree: + Col_1: + type: DictIntegerElement + match_name: Col_1 + match_value: (?P<Value>[0-9]+) + records: + Measurement: + Col_1: $Value + + + diff --git a/unittests/test_table_converter.py b/unittests/test_table_converter.py new file mode 100644 index 0000000000000000000000000000000000000000..85255d3efd34dc666d5d2e97423f33177dea6732 --- /dev/null +++ b/unittests/test_table_converter.py @@ -0,0 +1,166 @@ +#!/usr/bin/env python3 +# encoding: utf-8 +# +# ** header v3.0 +# This file is a part of the CaosDB Project. +# +# Copyright (C) 2022 Alexander Schlemmer <alexander.schlemmer@ds.mpg.de> +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <https://www.gnu.org/licenses/>. +# +# ** end header +# + +""" +test the converters module +""" + +from caoscrawler.converters import Converter +from caoscrawler.stores import GeneralStore +from caoscrawler.converters import (ConverterValidationError, + DictConverter, XLSXTableConverter, CSVTableConverter) +from caoscrawler.structure_elements import Directory +from caoscrawler.structure_elements import (File, DictTextElement, + DictListElement, DictElement, + DictBooleanElement, DictDictElement, + DictIntegerElement, DictFloatElement) + +from os.path import join, dirname, basename + +from caoscrawler.identifiable_adapters import IdentifiableAdapter, LocalStorageIdentifiableAdapter + +import pytest +import os +import importlib + +import math + +from caoscrawler import Crawler + +import caosdb as db + + +@pytest.fixture +def converter_registry(): + converter_registry: dict[str, dict[str, str]] = { + "Directory": { + "converter": "DirectoryConverter", + "package": "caoscrawler.converters"}, + "CSVTableConverter": { + "converter": "CSVTableConverter", + "package": "caoscrawler.converters"}, + "XLSXTableConverter": { + "converter": "XLSXTableConverter", + "package": "caoscrawler.converters"}, + + "DictDictElement": { + "converter": "DictDictElementConverter", + "package": "caoscrawler.converters"}, + "DictTextElement": { + "converter": "DictTextElementConverter", + "package": "caoscrawler.converters"}, + "DictIntegerElement": { + "converter": "DictIntegerElementConverter", + "package": "caoscrawler.converters"}, + "DictFloatElement": { + "converter": "DictFloatElementConverter", + "package": "caoscrawler.converters"}, + } + + +def rfp(*pathcomponents): + """ + Return full path. + Shorthand convenience function. + """ + return join(dirname(__file__), *pathcomponents) + + +def dircheckstr(*pathcomponents): + """ + Return the debug tree identifier for a given path. 
+ """ + return "caoscrawler.structure_elements.File: " + basename(join(*pathcomponents)) + ", " + rfp("test_directories", "examples_tables", "ExperimentalData", *pathcomponents) + + +@pytest.fixture +def crawler(): + crawler = Crawler(debug=True) + crawler.crawl_directory(rfp("test_directories", "examples_tables", "ExperimentalData"), + rfp("test_directories", "examples_tables", "crawler_for_tables.yml")) + return crawler + + +def test_convert_table(converter_registry): + extentions = ["xlsx", "csv", "tsv"] + if importlib.util.find_spec("odf") is not None: + extentions.append("ods") + for file_ext in extentions: + def_opt = {"skiprows": ["1", "2"], "header": 0} + if file_ext == "tsv": + def_opt["sep"] = "\t" + if file_ext in ["csv", "tsv"]: + converter = CSVTableConverter( + def_opt, + "Tab", + converter_registry) + else: + converter = XLSXTableConverter( + def_opt, + "Tab", + converter_registry) + store = GeneralStore() + file_element = File("table." + file_ext, + rfp("test_tables", "test1." + file_ext)) + res = converter.create_children(store, + file_element) + assert len(res) == 5 + for i in range(5): + assert res[i].name == str(i) + assert type(res[i].name) == str + assert type(res[i].value) == dict + assert len(res[i].value) == 4 + assert type(res[i].value["Col_1"]) == int + assert res[i].value["Col_1"] == i + assert type(res[i].value["Col_2"]) == float + assert type(res[i].value["Col_3"]) == int + if i != 3: + assert type(res[i].value["text"]) == str + else: + assert type(res[i].value["text"]) == float # the nan value + assert math.isnan(res[i].value["text"]) + + # Using an index col: + converter = XLSXTableConverter( + {"skiprows": ["1", "2"], "header": 0, "index_col": "3"}, + "XLSXTable", + converter_registry) + store = GeneralStore() + file_element = File("table.xlsx", + rfp("test_tables", "test1.xlsx")) + res = converter.create_children(store, + file_element) + assert res[0].name == "jdsfkljadskf" + + +def test_crawl_csv_table(crawler): + for file_ext in ["xlsx", "csv"]: + subd = crawler.debug_tree[dircheckstr("test1." 
+ file_ext)] + record_experiment = subd[1]["Experiment"] + assert isinstance(record_experiment, db.Record) + assert isinstance(record_experiment.get_property("Measurements").value, list) + assert len(record_experiment.get_property("Measurements").value) == 5 + prop_measure = record_experiment.get_property("Measurements").value[2] + assert isinstance(prop_measure, db.Record) + assert prop_measure.get_property("Col_1").value == "2" diff --git a/unittests/test_tables/test1.csv b/unittests/test_tables/test1.csv new file mode 100644 index 0000000000000000000000000000000000000000..c2eb297b523c06729937a07221c695105df0b09c --- /dev/null +++ b/unittests/test_tables/test1.csv @@ -0,0 +1,8 @@ +Col_1,Col_2,Col_3,text +Index,description,, +,m,s, +0,12,1,jdsfkljadskf +1,14,3,jdkfljad +2,3,4,jadkfjdsk +3,4.5,6, +4,8,7,jadskfj diff --git a/unittests/test_tables/test1.ods b/unittests/test_tables/test1.ods new file mode 100644 index 0000000000000000000000000000000000000000..6d5138b496511b02d0e6104868b6ba1e6816bfb6 Binary files /dev/null and b/unittests/test_tables/test1.ods differ diff --git a/unittests/test_tables/test1.tsv b/unittests/test_tables/test1.tsv new file mode 100644 index 0000000000000000000000000000000000000000..69286fcecd82c955f900bcdf7e6b5adfe26ab8c8 --- /dev/null +++ b/unittests/test_tables/test1.tsv @@ -0,0 +1,8 @@ +Col_1 Col_2 Col_3 text +Index description + m s +0 12 1 jdsfkljadskf +1 14 3 jdkfljad +2 3 4 jadkfjdsk +3 4.5 6 +4 8 7 jadskfj diff --git a/unittests/test_tables/test1.xlsx b/unittests/test_tables/test1.xlsx new file mode 100644 index 0000000000000000000000000000000000000000..2bf68c8a854ae7f618e47e1db58490fc76c055b2 Binary files /dev/null and b/unittests/test_tables/test1.xlsx differ diff --git a/unittests/test_tool.py b/unittests/test_tool.py index 1e7f10069c49ce6cab71da5f469e28b69158b4b5..59573756fe61ef697976e480dd1550cb0ead0998 100755 --- a/unittests/test_tool.py +++ b/unittests/test_tool.py @@ -3,11 +3,14 @@ # Adapted from check-sfs # A. Schlemmer, 06/2021 -from caoscrawler import Crawler +from caoscrawler.crawl import Crawler, SecurityMode from caoscrawler.structure_elements import File, DictTextElement, DictListElement from caoscrawler.identifiable_adapters import IdentifiableAdapter, LocalStorageIdentifiableAdapter +from simulated_server_data import full_data from functools import partial from copy import deepcopy +from unittest.mock import patch +import caosdb.common.models as dbmodels from unittest.mock import MagicMock, Mock from os.path import join, dirname, basename import yaml @@ -173,7 +176,7 @@ def test_record_structure_generation(crawler): def test_ambigious_records(crawler, ident): ident.get_records().clear() - ident.get_records().extend(crawler.updateList) + ident.get_records().extend(crawler.target_data) r = ident.get_records() id_r0 = ident.get_identifiable(r[0]) with raises(RuntimeError, match=".*unambigiously.*"): @@ -195,7 +198,7 @@ def test_crawler_update_list(crawler, ident): ) == 2 # The crawler contains lots of duplicates, because identifiables have not been resolved yet: - assert len(ident.get_records()) != len(crawler.updateList) + assert len(ident.get_records()) != len(crawler.target_data) # Check consistency: # Check whether identifiables retrieved from current identifiable store return the same results. @@ -289,10 +292,10 @@ def test_remove_unnecessary_updates(): # test property difference case # TODO this should work right? 
- #upl = [db.Record().add_parent("A").add_property("a", 3)] + # upl = [db.Record().add_parent("A").add_property("a", 3)] # irs = [db.Record().add_parent("A")] # ID should be s - #Crawler.remove_unnecessary_updates(upl, irs) - #assert len(upl) == 1 + # Crawler.remove_unnecessary_updates(upl, irs) + # assert len(upl) == 1 # test value difference case upl = [db.Record().add_parent("A").add_property("a", 5)] @@ -327,7 +330,7 @@ def test_identifiable_adapter_no_identifiable(crawler, ident): insl, updl = crawler.synchronize() assert len(updl) == 0 - pers = [r for r in crawler.updateList if r.parents[0].name == "Person"] + pers = [r for r in crawler.target_data if r.parents[0].name == "Person"] # All persons are inserted, because they are not identifiable: assert len(insl) == len(pers) @@ -347,22 +350,24 @@ def test_provenance_debug_data(crawler): assert check_key_count("Person") == 14 +def basic_retrieve_by_name_mock_up(rec, known): + """ returns a stored Record if rec.name is an existing key, None otherwise """ + if rec.name in known: + return known[rec.name] + else: + return None + + @pytest.fixture -def mock_retrieve(crawler): - # simulate remote server content by using the names to identify records - def base_mocked_lookup(rec, known): - if rec.name in known: - return known[rec.name] - else: - return None - - # a record that is found remotely and should be added to the update list and one that is not - # found and should be added to the insert one - remote_known = {"A": db.Record(id=1111, name="A")} +def crawler_mocked_identifiable_retrieve(crawler): + # mock retrieval of registered identifiabls: return Record with just a parent + crawler.identifiableAdapter.get_registered_identifiable = Mock( + side_effect=lambda x: db.Record().add_parent(x.parents[0].name)) + + # Simulate remote server content by using the names to identify records + # There is only a single known Record with name A crawler.identifiableAdapter.retrieve_identified_record_for_record = Mock(side_effect=partial( - base_mocked_lookup, known=remote_known)) - crawler.identifiableAdapter.get_registered_identifiable = ( - lambda x: db.Record().add_parent(x.parents[0].name)) + basic_retrieve_by_name_mock_up, known={"A": db.Record(id=1111, name="A")})) return crawler @@ -371,9 +376,8 @@ def test_split_into_inserts_and_updates_trivial(crawler): crawler.split_into_inserts_and_updates([]) -def test_split_into_inserts_and_updates_single(mock_retrieve): - crawler = mock_retrieve - +def test_split_into_inserts_and_updates_single(crawler_mocked_identifiable_retrieve): + crawler = crawler_mocked_identifiable_retrieve entlist = [db.Record(name="A").add_parent( "C"), db.Record(name="B").add_parent("C")] @@ -391,10 +395,13 @@ def test_split_into_inserts_and_updates_single(mock_retrieve): assert insert[0].name == "B" assert len(update) == 1 assert update[0].name == "A" + # if this ever fails, the mock up may be removed + crawler.identifiableAdapter.get_registered_identifiable.assert_called() + crawler.identifiableAdapter.retrieve_identified_record_for_record.assert_called() -def test_split_into_inserts_and_updates_with_duplicate(mock_retrieve): - crawler = mock_retrieve +def test_split_into_inserts_and_updates_with_duplicate(crawler_mocked_identifiable_retrieve): + crawler = crawler_mocked_identifiable_retrieve a = db.Record(name="A").add_parent("C") b = db.Record(name="B").add_parent("C") b.add_property("A", a) @@ -406,10 +413,13 @@ def test_split_into_inserts_and_updates_with_duplicate(mock_retrieve): assert insert[0].name == "B" assert 
len(update) == 1 assert update[0].name == "A" + # if this ever fails, the mock up may be removed + crawler.identifiableAdapter.get_registered_identifiable.assert_called() + crawler.identifiableAdapter.retrieve_identified_record_for_record.assert_called() -def test_split_into_inserts_and_updates_with_ref(mock_retrieve): - crawler = mock_retrieve +def test_split_into_inserts_and_updates_with_ref(crawler_mocked_identifiable_retrieve): + crawler = crawler_mocked_identifiable_retrieve # try it with a reference a = db.Record(name="A").add_parent("C") b = db.Record(name="B").add_parent("C") @@ -420,20 +430,23 @@ def test_split_into_inserts_and_updates_with_ref(mock_retrieve): assert insert[0].name == "B" assert len(update) == 1 assert update[0].name == "A" + # if this ever fails, the mock up may be removed + crawler.identifiableAdapter.retrieve_identified_record_for_record.assert_called() + crawler.identifiableAdapter.get_registered_identifiable.assert_called() -def test_split_into_inserts_and_updates_with_circ(mock_retrieve): +def test_split_into_inserts_and_updates_with_circ(crawler): # try circular - crawler = mock_retrieve a = db.Record(name="A").add_parent("C") b = db.Record(name="B").add_parent("C") b.add_property("A", a) a.add_property("B", b) entlist = [a, b] + # TODO this does not seem to be complete! -def test_split_into_inserts_and_updates_with_complex(mock_retrieve): - crawler = mock_retrieve +def test_split_into_inserts_and_updates_with_complex(crawler_mocked_identifiable_retrieve): + crawler = crawler_mocked_identifiable_retrieve # A # ^ # | @@ -452,12 +465,15 @@ def test_split_into_inserts_and_updates_with_complex(mock_retrieve): assert "B" in [el.name for el in insert] assert len(update) == 1 assert update[0].name == "A" + # if this ever fails, the mock up may be removed + crawler.identifiableAdapter.get_registered_identifiable.assert_called() + crawler.identifiableAdapter.retrieve_identified_record_for_record.assert_called() # TODO write test where the unresoled entity is not part of the identifiable -def test_split_into_inserts_and_updates_with_copy_attr(mock_retrieve): - crawler = mock_retrieve +def test_split_into_inserts_and_updates_with_copy_attr(crawler_mocked_identifiable_retrieve): + crawler = crawler_mocked_identifiable_retrieve # assume identifiable is only the name a = db.Record(name="A").add_parent("C") a.add_property("foo", 1) @@ -468,17 +484,17 @@ def test_split_into_inserts_and_updates_with_copy_attr(mock_retrieve): assert update[0].get_property("bar").value == 2 assert update[0].get_property("foo").value == 1 + # if this ever fails, the mock up may be removed + crawler.identifiableAdapter.get_registered_identifiable.assert_called() + crawler.identifiableAdapter.retrieve_identified_record_for_record.assert_called() def test_all_references_are_existing_already(crawler): - def base_mocked_lookup(rec, known): - if rec.name in known: - return known[rec.name] - else: - return None + # Simulate remote server content by using the names to identify records + # There are only two known Records with name A and B crawler.identifiableAdapter.get_registered_identifiable = Mock(side_effect=partial( - base_mocked_lookup, known={"A": db.Record(name="A").add_parent("C"), - "B": db.Record(name="B").add_parent("C")})) + basic_retrieve_by_name_mock_up, known={"A": db.Record(name="A").add_parent("C"), + "B": db.Record(name="B").add_parent("C")})) assert crawler.all_references_are_existing_already( db.Record().add_property('a', 123)) @@ -496,6 +512,8 @@ def 
test_all_references_are_existing_already(crawler): assert crawler.all_references_are_existing_already(db.Record() .add_property('a', 123) .add_property('b', a)) + # if this ever fails, the mock up may be removed + crawler.identifiableAdapter.get_registered_identifiable.assert_called() def test_can_be_checked_externally(crawler): @@ -512,12 +530,144 @@ def test_can_be_checked_externally(crawler): .add_property('b', db.Record())) -def test_replace_entities_by_ids(crawler): +def test_replace_entities_with_ids(crawler): a = (db.Record().add_parent("B").add_property("A", 12345) .add_property("B", db.Record(id=12345)) .add_property("C", [db.Record(id=12345), 233324])) - crawler.replace_entities_by_ids(a) + crawler.replace_entities_with_ids(a) assert a.get_property("A").value == 12345 assert a.get_property("B").value == 12345 assert a.get_property("C").value == [12345, 233324] + + +def mock_get_entity_by_name(name): + candidates = [el for el in full_data.values() if el.name.lower() == name.lower()] + if len(candidates) > 0: + return candidates[0] + else: + return None + + +def prepare_crawler_with_sec_mode(mode, ident): + crawler = Crawler(debug=True, securityMode=mode) + crawler.crawl_directory(rfp("test_directories", "examples_article"), + rfp("scifolder_cfood.yml")) + crawler.identifiableAdapter = ident + + return crawler + + +def reset_mocks(mocks): + for mock in mocks: + mock.reset_mock() + + +def change_identifiable_prop(ident): + # the checks in here are only to make sure we change the record as we intend to + meas = ident._records[-2] + assert meas.parents[0].name == "Measurement" + resps = meas.properties[0] + assert resps.name == "date" + # change one element; This changes the date which is part of the identifiable + resps.value = "2022-01-04" + + +def change_non_identifiable_prop(ident): + # the checks in here are only to make sure we change the record as we intend to + meas = ident._records[-1] + assert meas.parents[0].name == "Measurement" + resps = meas.properties[-1] + assert resps.name == "responsible" + assert len(resps.value) == 2 + # change one element; This removes a responsible which is not part of the identifiable + del resps.value[-1] + + +@patch("caoscrawler.crawl.Crawler._get_entity_by_name", + new=Mock(side_effect=mock_get_entity_by_name)) +@patch("caoscrawler.crawl.db.Container.insert") +@patch("caoscrawler.crawl.db.Container.update") +@patch("caoscrawler.crawl.UpdateCache.insert") +def test_security_mode(updateCacheMock, upmock, insmock, ident): + records_backup = deepcopy(ident._records) + + # trivial case: nothing to do + crawler = prepare_crawler_with_sec_mode(SecurityMode.RETRIEVE, ident) + crawler.synchronize(commit_changes=True) + assert crawler.run_id is not None + insmock.assert_not_called() + upmock.assert_not_called() + updateCacheMock.assert_not_called() + + # RETRIEVE: insert only + crawler = prepare_crawler_with_sec_mode(SecurityMode.RETRIEVE, ident) + # remove one element + del ident._records[-1] + # insert forbidden + crawler.synchronize(commit_changes=True) + assert crawler.run_id is not None + insmock.assert_not_called() + upmock.assert_not_called() + assert updateCacheMock.call_count == 1 + # reset counts + reset_mocks([updateCacheMock, insmock, upmock]) + # restore original ident + ident._records = deepcopy(records_backup) + + # RETRIEVE: update only + crawler = prepare_crawler_with_sec_mode(SecurityMode.RETRIEVE, ident) + # change one element + change_non_identifiable_prop(ident) + crawler.synchronize(commit_changes=True) + assert crawler.run_id is 
not None
+    insmock.assert_not_called()
+    upmock.assert_not_called()
+    assert updateCacheMock.call_count == 1
+    # reset counts
+    reset_mocks([updateCacheMock, insmock, upmock])
+    # restore original ident
+    ident._records = deepcopy(records_backup)
+
+    # INSERT: insert only
+    crawler = prepare_crawler_with_sec_mode(SecurityMode.INSERT, ident)
+    # remove one element
+    del ident._records[-1]
+    crawler.synchronize(commit_changes=True)
+    assert crawler.run_id is not None
+    insmock.assert_called_once()
+    upmock.assert_not_called()
+    updateCacheMock.assert_not_called()
+    # reset counts
+    reset_mocks([updateCacheMock, insmock, upmock])
+    # restore original ident
+    ident._records = deepcopy(records_backup)
+
+    # INSERT: update only
+    crawler = prepare_crawler_with_sec_mode(SecurityMode.INSERT, ident)
+    # change one element
+    change_non_identifiable_prop(ident)
+    crawler.synchronize(commit_changes=True)
+    assert crawler.run_id is not None
+    insmock.assert_not_called()
+    upmock.assert_not_called()
+    updateCacheMock.assert_called_once()
+    # reset counts
+    reset_mocks([updateCacheMock, insmock, upmock])
+    # restore original ident
+    ident._records = deepcopy(records_backup)
+
+    # INSERT: insert and update
+    crawler = prepare_crawler_with_sec_mode(SecurityMode.INSERT, ident)
+    # change two elements
+    change_non_identifiable_prop(ident)
+    change_identifiable_prop(ident)
+    crawler.synchronize(commit_changes=True)
+    assert crawler.run_id is not None
+    insmock.assert_called_once()
+    upmock.assert_not_called()
+    updateCacheMock.assert_called_once()
+    # reset counts
+    reset_mocks([updateCacheMock, insmock, upmock])
+    # restore original ident
+    ident._records = deepcopy(records_backup)
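
For orientation: the rewritten `main()` earlier in this patch maps the new `--security-mode` choice onto the `SecurityMode` enum and forwards the CLI arguments positionally to `crawler_main`. A minimal programmatic sketch of that same call follows; only the argument order and the `SecurityMode` import are taken from the patch, while the `caoscrawler.crawl` import path and all example argument values are assumptions.

```python
import sys

from caoscrawler import SecurityMode
from caoscrawler.crawl import crawler_main  # assumed location of crawler_main

# Same positional order as in main(): directory, cfood, identifiables,
# debug, provenance, dry_run, prefix, security mode.
status = crawler_main(
    "extroot/ExperimentalData",   # crawled_directory_path (example value)
    "scifolder_cfood.yml",        # cfood_file_name (example value)
    None,                         # load_identifiables: no identifiables file
    False,                        # debug
    "provenance.yml",             # provenance output file (example value)
    False,                        # dry_run: actually talk to the server
    None,                         # prefix: nothing stripped from paths
    SecurityMode.RETRIEVE,        # read-only; inserts/updates go to the update cache
)
sys.exit(status)
```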