diff --git a/integrationtests/test_data/extroot/realworld_example/dataset_cfoods.yml b/integrationtests/test_data/extroot/realworld_example/dataset_cfoods.yml index 1589cba2b44afc3e2645b0ee72f91bf83b327032..eaf2690ae130cb61c8a74452e3e4e1d4fd06846a 100644 --- a/integrationtests/test_data/extroot/realworld_example/dataset_cfoods.yml +++ b/integrationtests/test_data/extroot/realworld_example/dataset_cfoods.yml @@ -318,6 +318,13 @@ Data: Dataset: Project: $Project subtree: + name_element: + type: DictTextElement + match_name: "name" + match_value: "(?P<name>.*)" + records: + Project: + name: $name full_name_element: type: DictTextElement match_name: "full_name" diff --git a/integrationtests/test_data/extroot/realworld_example/identifiables.yml b/integrationtests/test_data/extroot/realworld_example/identifiables.yml new file mode 100644 index 0000000000000000000000000000000000000000..0ea0265ecfec05392c599457d81339bc91ba18d0 --- /dev/null +++ b/integrationtests/test_data/extroot/realworld_example/identifiables.yml @@ -0,0 +1,22 @@ +license: + - name +project_type: + - name +Keyword: + - name +Taxon: + - name +Person: + - email + # - full_name +Dataset: + - title + # - DOI +Event: + - longitude + - latitude + - start_datetime +Dataspace: + - dataspace_id +Project: + - name diff --git a/integrationtests/test_realworld_example.py b/integrationtests/test_realworld_example.py index 28a9469d311b56aa12c35661b8ef66929fae8a8a..b2afe974ebc7f48fc3bb127cecb5d6def8ae87c7 100644 --- a/integrationtests/test_realworld_example.py +++ b/integrationtests/test_realworld_example.py @@ -29,7 +29,7 @@ import os import caosdb as db -from caoscrawler.crawl import Crawler +from caoscrawler.crawl import Crawler, crawler_main from caoscrawler.converters import JSONFileConverter, DictConverter from caoscrawler.identifiable_adapters import CaosDBIdentifiableAdapter from caoscrawler.structure_elements import File, JSONFile, Directory @@ -84,7 +84,6 @@ def clear_database(): def test_dataset( 
@pytest.mark.xfail(
    reason="Reference properties are not updated correctly. "
    "See https://gitlab.indiscale.com/caosdb/src/caosdb-crawler/-/issues/12."
)
def test_event_update(clear_database, usemodel):
    """Check that an edited Event is picked up by an otherwise unchanged Dataset.

    The example data is crawled once through ``crawler_main``.  It is then
    crawled a second time with a manually driven ``Crawler`` whose Event
    records are modified before synchronizing.  Afterwards the Dataset must
    keep all of its non-Event properties while referencing a different Event
    record that carries the modified values.
    """
    data_dir = os.path.join(DATADIR, 'data')
    cfood_path = os.path.join(DATADIR, "dataset_cfoods.yml")
    identifiables_path = os.path.join(DATADIR, "identifiables.yml")

    # First pass: populate the server with the unmodified example data.
    # NOTE(review): the trailing positional arguments presumably are the
    # debug flag, provenance file, dry-run flag, sync flag and path prefix --
    # confirm against the signature of crawler_main.
    crawler_main(
        data_dir,
        cfood_path,
        identifiables_path,
        True,
        os.path.join(DATADIR, "provenance.yml"),
        False,
        True,
        ""
    )

    # Remember the Dataset and its single Event as they are after the first pass.
    matching_datasets = db.execute_query(
        "FIND RECORD Dataset WHICH HAS AN EVENT WITH location='Bremen, Germany'")
    assert len(matching_datasets) == 1
    dataset_before = matching_datasets[0]
    event_prop_before = dataset_before.get_property("Event")
    assert event_prop_before.datatype == db.LIST("Event")
    assert len(event_prop_before.value) == 1
    event_before = db.Record(id=event_prop_before.value[0]).retrieve()

    # Second pass: crawl manually so the records can be edited before the
    # synchronization step.
    adapter = CaosDBIdentifiableAdapter()
    adapter.load_from_yaml_definition(identifiables_path)

    second_crawler = Crawler(identifiableAdapter=adapter)
    definition = second_crawler.load_definition(cfood_path)
    registry = second_crawler.load_converters(definition)
    crawled_records = second_crawler.start_crawling(
        Directory("data", data_dir),
        definition,
        registry
    )

    # Apply the same edits to every Event, whether it was yielded on its own
    # or is only reachable through the Dataset that references it.
    event_edits = (
        ("longitude", 0.0),
        ("latitude", 0.0),
        ("location", "Origin"),
    )
    for rec in crawled_records:
        parent_name = rec.parents[0].name
        if parent_name == "Event":
            event = rec
        elif parent_name == "Dataset":
            event = rec.get_property("Event").value[0]
        else:
            continue
        for prop_name, new_value in event_edits:
            event.get_property(prop_name).value = new_value
    second_crawler.synchronize()

    # The Dataset keeps its id and every property other than the Event list.
    dataset_after = db.Record(id=dataset_before.id).retrieve()
    for prop in dataset_before.get_properties():
        if prop.name == "Event":
            continue
        counterpart = dataset_after.get_property(prop.name)
        assert counterpart.datatype == prop.datatype
        assert counterpart.value == prop.value

    # The Event list still has exactly one entry, but it is a different record.
    event_prop_after = dataset_after.get_property("Event")
    assert event_prop_after.datatype == db.LIST("Event")
    assert event_prop_after.value is not None
    assert len(event_prop_after.value) == 1
    assert event_prop_after.value[0] != event_before.id

    # The referenced Event carries the edited values; its start time is unchanged.
    event_after = db.Record(id=event_prop_after.value[0]).retrieve()
    assert event_after.get_property("longitude").value == 0.0
    assert event_after.get_property("latitude").value == 0.0
    assert event_after.get_property("location").value == "Origin"
    assert event_after.get_property(
        "start_datetime").value == event_before.get_property("start_datetime").value