diff --git a/src/newcrawler/converters.py b/src/newcrawler/converters.py
index ebc3ab19ceb0f8c18cba5cb1bc3f86d5e31bfb84..b8b9bd2ce7bff206d1233953f05c795a45a5b4ca 100644
--- a/src/newcrawler/converters.py
+++ b/src/newcrawler/converters.py
@@ -37,6 +37,7 @@ from .structure_elements import (StructureElement, Directory, File, Dict, JSONFile,
                                  TextElement, DictTextElement, DictElement, DictListElement)
 from typing import Optional, Union
 from abc import abstractmethod
+from string import Template
 
 import yaml_header_tools
 import yaml
@@ -63,6 +64,7 @@ def handle_value(value: Union[dict, str], values: GeneralStore):
     - the final value of the property
     - the collection mode (can be single, list or multiproperty)
     """
+    # @review Florian Spreckelsen 2022-05-13
 
     if type(value) == dict:
         if "value" not in value:
@@ -90,12 +92,20 @@ def handle_value(value: Union[dict, str], values: GeneralStore):
             propvalue = value
         return (propvalue, collection_mode)
 
-    if propvalue.startswith("$"):
-        propvalue = values[propvalue[1:]]
-    # Allow the insertion of $ signs at the beginning
-    if type(propvalue) == str and propvalue.startswith("$$"):
-        propvalue = propvalue[1:]
-
+    # Check if the replacement is a single variable containing a record:
+    match = re.match(r"^\$(\{)?(?P<varname>[0-9a-zA-Z_]+)(?(1)\})$", propvalue)
+    if match is not None:
+        varname = match.group("varname")
+        if varname in values:
+            if values[varname] is None:
+                propvalue = None
+                return (propvalue, collection_mode)
+            if isinstance(values[varname], db.Entity):
+                propvalue = values[varname]
+                return (propvalue, collection_mode)
+
+    propvalue_template = Template(propvalue)
+    propvalue = propvalue_template.safe_substitute(**values.get_storage())
 
     return (propvalue, collection_mode)
 
diff --git a/src/newcrawler/crawl.py b/src/newcrawler/crawl.py
index be4a997c39a5977b878813dffec31eec3f1ad0bc..de350a2aa96546d29825646caffc4b7a4940171e 100644
--- a/src/newcrawler/crawl.py
+++ b/src/newcrawler/crawl.py
@@ -769,6 +769,7 @@ class Crawler(object):
                               updateList)
 
         # remove unnecessary updates from list
+        # TODO: refactor to fix typo
         for el in to_be_updated:
             self.replace_entities_by_ids(el)
 
diff --git a/unittests/test_directories/example_substitutions/ExperimentalData/220512_data.dat b/unittests/test_directories/example_substitutions/ExperimentalData/220512_data.dat
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/unittests/test_directories/example_substitutions/substitutions.yml b/unittests/test_directories/example_substitutions/substitutions.yml
new file mode 100644
index 0000000000000000000000000000000000000000..1b4e8784a69d1ad1b80fa757ad77cd137c8cc7b5
--- /dev/null
+++ b/unittests/test_directories/example_substitutions/substitutions.yml
@@ -0,0 +1,22 @@
+
+ExperimentalData: # name of the converter
+  type: Directory
+  match: ExperimentalData
+  records:
+    Project:
+      name: project
+  subtree:
+    File: # name of the converter
+      type: SimpleFile
+      match: (?P<year>[0-9]{2})(?P<month>[0-9]{2})(?P<day>[0-9]{2})_data.dat
+      records:
+        Experiment:
+          date: 20$year-$month-$day
+
+        ExperimentSeries:
+          Experiment: $Experiment
+
+        Project:
+          Experiments: +$Experiment
+          dates: +20$year-$month-$day
+
diff --git a/unittests/test_tool.py b/unittests/test_tool.py
index dd9fb83d772496cc6b3729f2893997360d318f18..9fe127e15f088992e9eab80913d2e5574a4d1fdf 100755
--- a/unittests/test_tool.py
+++ b/unittests/test_tool.py
@@ -74,8 +74,10 @@ def test_record_structure_generation(crawler):
     subd = crawler.debug_tree[dircheckstr("DataAnalysis")]
     subc = crawler.debug_metadata["copied"][dircheckstr("DataAnalysis")]
     assert len(subd) == 2
-    assert len(subd[0]) == 2  # variables store on Data Analysis node of debug tree
-    assert len(subd[1]) == 0  # record store on Data Analysis node of debug tree
+    # variables store on Data Analysis node of debug tree
+    assert len(subd[0]) == 2
+    # record store on Data Analysis node of debug tree
+    assert len(subd[1]) == 0
     assert len(subc) == 2
     assert len(subc[0]) == 2
     assert len(subc[1]) == 0
@@ -84,7 +86,8 @@
     assert subd[0]["DataAnalysis"] == "examples_article/DataAnalysis"
     assert subc[0]["DataAnalysis"] == False
 
-    subd = crawler.debug_tree[dircheckstr("DataAnalysis", "2020_climate-model-predict")]
+    subd = crawler.debug_tree[dircheckstr(
+        "DataAnalysis", "2020_climate-model-predict")]
     subc = crawler.debug_metadata["copied"][dircheckstr(
         "DataAnalysis", "2020_climate-model-predict")]
 
@@ -92,7 +95,8 @@
     assert len(subd[1]["Project"].get_parents()) == 1
     assert subd[1]["Project"].get_parents()[0].name == "Project"
     assert subd[1]["Project"].get_property("date").value == "2020"
-    assert subd[1]["Project"].get_property("identifier").value == "climate-model-predict"
+    assert subd[1]["Project"].get_property(
+        "identifier").value == "climate-model-predict"
 
     assert len(subd[0]) == 6
     assert subd[0]["date"] == "2020"
@@ -129,15 +133,19 @@
     assert len(subd[1]["Project"].get_parents()) == 1
     assert subd[1]["Project"].get_parents()[0].name == "Project"
     assert subd[1]["Project"].get_property("date").value == "2020"
-    assert subd[1]["Project"].get_property("identifier").value == "climate-model-predict"
+    assert subd[1]["Project"].get_property(
+        "identifier").value == "climate-model-predict"
 
     assert len(subd[1]["Measurement"].get_parents()) == 1
     assert subd[1]["Measurement"].get_parents()[0].name == "Measurement"
     assert subd[1]["Measurement"].get_property("date").value == "2020-02-08"
-    assert subd[1]["Measurement"].get_property("identifier").value == "prediction-errors"
+    assert subd[1]["Measurement"].get_property(
+        "identifier").value == "prediction-errors"
     assert subd[1]["Measurement"].get_property("project").value != "$Project"
-    assert subd[1]["Measurement"].get_property("project").value.__class__ == db.Record
-    assert subd[1]["Measurement"].get_property("project").value == subd[0]["Project"]
+    assert subd[1]["Measurement"].get_property(
+        "project").value.__class__ == db.Record
+    assert subd[1]["Measurement"].get_property(
+        "project").value == subd[0]["Project"]
 
     # Check the copy flags for the second level in the hierarchy:
     assert subc[1]["Project"] is True
@@ -176,9 +184,15 @@ def test_crawler_update_list(crawler, ident):
     # If the following assertions fail, that is a hint, that the test file records.xml has changed
     # and this needs to be updated:
     assert len(ident.get_records()) == 18
-    assert len([r for r in ident.get_records() if r.parents[0].name == "Person"]) == 5
-    assert len([r for r in ident.get_records() if r.parents[0].name == "Measurement"]) == 11
-    assert len([r for r in ident.get_records() if r.parents[0].name == "Project"]) == 2
+    assert len(
+        [r for r in ident.get_records() if r.parents[0].name == "Person"]
+    ) == 5
+    assert len(
+        [r for r in ident.get_records() if r.parents[0].name == "Measurement"]
+    ) == 11
+    assert len(
+        [r for r in ident.get_records() if r.parents[0].name == "Project"]
+    ) == 2
 
     # The crawler contains lots of duplicates, because identifiables have not been resolved yet:
     assert len(ident.get_records()) != len(crawler.updateList)
@@ -194,8 +208,10 @@
     id_r0 = ident.get_identifiable(r_cur)
 
     assert r_cur.parents[0].name == id_r0.parents[0].name
-    assert r_cur.get_property("first_name").value == id_r0.get_property("first_name").value
-    assert r_cur.get_property("last_name").value == id_r0.get_property("last_name").value
+    assert r_cur.get_property(
+        "first_name").value == id_r0.get_property("first_name").value
+    assert r_cur.get_property(
+        "last_name").value == id_r0.get_property("last_name").value
     assert len(r_cur.parents) == 1
     assert len(id_r0.parents) == 1
     assert len(r_cur.properties) == 2
@@ -213,9 +229,11 @@
     id_r1 = ident.get_identifiable(r_cur)
 
     assert r_cur.parents[0].name == id_r1.parents[0].name
-    assert r_cur.get_property("identifier").value == id_r1.get_property("identifier").value
+    assert r_cur.get_property(
+        "identifier").value == id_r1.get_property("identifier").value
     assert r_cur.get_property("date").value == id_r1.get_property("date").value
-    assert r_cur.get_property("project").value == id_r1.get_property("project").value
+    assert r_cur.get_property(
+        "project").value == id_r1.get_property("project").value
     assert len(r_cur.parents) == 1
     assert len(id_r1.parents) == 1
     assert len(r_cur.properties) == 4
@@ -228,7 +246,8 @@
     assert idr_r1_test != idr_r0_test
 
     assert len(idr_r1.properties) == 4
-    assert r_cur.get_property("responsible").value == idr_r1.get_property("responsible").value
+    assert r_cur.get_property(
+        "responsible").value == idr_r1.get_property("responsible").value
     assert r_cur.description == idr_r1.description
 
     # test whether compare_entites function works in this context:
@@ -355,14 +374,17 @@ def test_split_into_inserts_and_updates_trivial(crawler):
 
 def test_split_into_inserts_and_updates_single(mock_retrieve):
     crawler = mock_retrieve
-    entlist = [db.Record(name="A").add_parent("C"), db.Record(name="B").add_parent("C")]
+    entlist = [db.Record(name="A").add_parent(
+        "C"), db.Record(name="B").add_parent("C")]
 
     assert crawler.get_identified_record_from_local_cache(entlist[0]) is None
     assert crawler.get_identified_record_from_local_cache(entlist[1]) is None
     assert crawler.can_be_checked_externally(entlist[0])
     assert crawler.can_be_checked_externally(entlist[1])
-    assert crawler.identifiableAdapter.retrieve_identified_record_for_record(entlist[0]).id == 1111
-    assert crawler.identifiableAdapter.retrieve_identified_record_for_record(entlist[1]) is None
+    assert crawler.identifiableAdapter.retrieve_identified_record_for_record(
+        entlist[0]).id == 1111
+    assert crawler.identifiableAdapter.retrieve_identified_record_for_record(
+        entlist[1]) is None
 
     insert, update = crawler.split_into_inserts_and_updates(deepcopy(entlist))
     assert len(insert) == 1
@@ -416,7 +438,8 @@ def test_split_into_inserts_and_updates_with_complex(mock_retrieve):
     #       ^
     #       |
     # F <- B <- G
-    a = db.Record(name="A").add_parent("C").add_property('d', 13).add_property('e', "lskdjlsfdj")
+    a = db.Record(name="A").add_parent("C").add_property(
+        'd', 13).add_property('e', "lskdjlsfdj")
     b = db.Record(name="B").add_parent("C")
     g = db.Record(name="G").add_parent("C")
     f = db.Record(name="F").add_parent("C")
@@ -457,7 +480,8 @@ def test_all_references_are_existing_already(crawler):
             base_mocked_lookup, known={"A": db.Record(name="A").add_parent("C"),
                                        "B": db.Record(name="B").add_parent("C")}))
 
-    assert crawler.all_references_are_existing_already(db.Record().add_property('a', 123))
+    assert crawler.all_references_are_existing_already(
+        db.Record().add_property('a', 123))
     assert crawler.all_references_are_existing_already(db.Record()
                                                        .add_property('a', db.Record(id=123)))
     assert crawler.all_references_are_existing_already(db.Record()
@@ -475,7 +499,8 @@
 
 
 def test_can_be_checked_externally(crawler):
-    assert crawler.can_be_checked_externally(db.Record().add_property('a', 123))
+    assert crawler.can_be_checked_externally(
+        db.Record().add_property('a', 123))
     assert crawler.can_be_checked_externally(db.Record()
                                              .add_property('a', db.Record(id=123)))
     assert crawler.can_be_checked_externally(db.Record()
diff --git a/unittests/test_variable_substitutions.py b/unittests/test_variable_substitutions.py
new file mode 100644
index 0000000000000000000000000000000000000000..b8c10d85e05305bba2fb4810762cc1b7ce5fe0c4
--- /dev/null
+++ b/unittests/test_variable_substitutions.py
@@ -0,0 +1,61 @@
+#!/usr/bin/env python3
+# Tests for variable substitutions
+# A. Schlemmer, 05/2022
+
+from newcrawler import Crawler
+from newcrawler.structure_elements import File, DictTextElement, DictListElement
+from newcrawler.identifiable_adapters import IdentifiableAdapter, LocalStorageIdentifiableAdapter
+from functools import partial
+from copy import deepcopy
+from unittest.mock import MagicMock, Mock
+from os.path import join, dirname, basename
+import yaml
+import caosdb as db
+from caosdb.apiutils import compare_entities
+
+import pytest
+from pytest import raises
+
+
+def rfp(*pathcomponents):
+    """
+    Return full path.
+    Shorthand convenience function.
+    """
+    return join(dirname(__file__), *pathcomponents)
+
+
+def dircheckstr(element_type, *pathcomponents):
+    """
+    Return the debug tree identifier for a given path.
+    """
+    return "newcrawler.structure_elements." + element_type + ": " + basename(join(*pathcomponents)) + ", " + rfp("test_directories", "example_substitutions", *pathcomponents)
+
+
+@pytest.fixture
+def crawler():
+    crawler = Crawler(debug=True)
+    crawler.crawl_directory(rfp("test_directories", "example_substitutions", "ExperimentalData"),
+                            rfp("test_directories", "example_substitutions", "substitutions.yml"))
+    return crawler
+
+
+def test_substitutions(crawler):
+    # @review Florian Spreckelsen 2022-05-13
+    for i in range(2):
+        subd = crawler.debug_tree[dircheckstr(
+            "File", "ExperimentalData", "220512_data.dat")]
+        assert subd[i]["Experiment"].get_property("date").value == "2022-05-12"
+        assert isinstance(subd[i]["ExperimentSeries"].get_property(
+            "Experiment").value, db.Record)
+
+        subd = crawler.debug_tree[dircheckstr("Directory", "ExperimentalData")]
+        assert subd[i]["Project"].name == "project"
+        assert isinstance(subd[i]["Project"].get_property(
+            "Experiments").value, list)
+        assert isinstance(subd[i]["Project"].get_property(
+            "Experiments").value[0], db.Record)
+
+        assert isinstance(subd[i]["Project"].get_property("dates").value, list)
+        assert subd[i]["Project"].get_property(
+            "dates").value[0] == "2022-05-12"