diff --git a/CHANGELOG.md b/CHANGELOG.md index 9036ad3c8fff8eb873e75221cdd0c0bdb2a92498..d28cb8be7569b470531961f863ad8f08fa40aec2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -23,6 +23,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Removed ### ### Fixed ### +- usage of ID when looking for identified records - Query generation when there are only backrefs or backrefs and a name ### Security ### diff --git a/integrationtests/basic_example/test_basic.py b/integrationtests/basic_example/test_basic.py index b33974d9c2c5600bf2a91cbf14d7c8799ffc2644..15aee62bbb9bf253607dc0bb04c44f3baae2548d 100755 --- a/integrationtests/basic_example/test_basic.py +++ b/integrationtests/basic_example/test_basic.py @@ -45,6 +45,7 @@ import yaml from caosdb.utils.register_tests import clear_database, set_test_key set_test_key("10b128cf8a1372f30aa3697466bb55e76974e0c16a599bb44ace88f19c8f61e2") +#TODO move test related stuff here and remove it from unittests def rfp(*pathcomponents): """ diff --git a/src/caoscrawler/crawl.py b/src/caoscrawler/crawl.py index 7542a6592c0abfeb03056ea9ef5f230ecac7564a..7ea1cc20537060d1f94b9e9c9b233141acc0f565 100644 --- a/src/caoscrawler/crawl.py +++ b/src/caoscrawler/crawl.py @@ -833,7 +833,6 @@ class Crawler(object): to_be_inserted, to_be_updated = self.split_into_inserts_and_updates(crawled_data) referencing_entities = self.create_reference_mapping(to_be_updated + to_be_inserted) - # TODO: refactoring of typo for el in to_be_updated: # all entity objects are replaced by their IDs except for the not yet inserted ones self.replace_entities_with_ids(el) diff --git a/src/caoscrawler/identifiable_adapters.py b/src/caoscrawler/identifiable_adapters.py index 241685b5cfe9d87acad16e0c6a871d9ea6ad79e3..9122cc4a882b94e3a6bba27921f59bc4bbc9d9a0 100644 --- a/src/caoscrawler/identifiable_adapters.py +++ b/src/caoscrawler/identifiable_adapters.py @@ -27,6 +27,7 @@ from __future__ import annotations import yaml from datetime 
import datetime +from caosdb.cached import cached_get_entity_by from typing import Any from .identifiable import Identifiable import caosdb as db @@ -264,8 +265,6 @@ identifiabel, identifiable and identified record) for a Record. """ pass - # TODO: remove side effect - # TODO: use ID if record has one? def retrieve_identified_record_for_record(self, record: db.Record, referencing_entities=None): """ This function combines all functionality of the IdentifierAdapter by @@ -275,10 +274,12 @@ identifiabel, identifiable and identified record) for a Record. In case there was no appropriate registered identifiable or no identifiable could be found return value is None. """ - identifiable = self.get_identifiable(record, referencing_entities=referencing_entities) + if record.path is not None: + return cached_get_entity_by(path=record.path) + if record.id is not None: + return cached_get_entity_by(eid=record.id) - if identifiable.path is not None: - return self.get_file(identifiable) + identifiable = self.get_identifiable(record, referencing_entities=referencing_entities) return self.retrieve_identified_record_for_identifiable(identifiable) @@ -450,6 +451,10 @@ class CaosDBIdentifiableAdapter(IdentifiableAdapter): self._registered_identifiables[name] = definition def get_file(self, identifiable: Identifiable): + # TODO is this needed for Identifiable? + # or can we get rid of this function? 
+ if isinstance(identifiable, db.Entity): + return cached_get_entity_by(path=identifiable) if identifiable.path is None: raise RuntimeError("Path must not be None for File retrieval.") candidates = db.execute_query("FIND File which is stored at '{}'".format( diff --git a/unittests/example_cfood.yml b/unittests/example_cfood.yml new file mode 100644 index 0000000000000000000000000000000000000000..713bd4be0f3c816e1e8c8b7a057b30a4b400f13c --- /dev/null +++ b/unittests/example_cfood.yml @@ -0,0 +1,47 @@ +--- +metadata: + crawler-version: 0.3.1 +--- +Definitions: + type: Definitions + +data: + type: Dict + match_name: '.*' + subtree: + Expiments: + type: ListElement + match_name: 'Experiments' + subtree: + Experiment: + type: DictElement + match: '.*' + records: + Ent: + parents: ["Experiment"] + subtree: &date_res + date: + type: Date + match_name: 'date' + match_value: '(?P<date>.*)' + records: + Ent: + date: $date + result: + type: TextElement + match_name: 'result' + match_value: '(?P<res>.*)' + records: + Ent: + result: $res + Analyses: + type: ListElement + match_name: 'Analyses' + subtree: + Analysis: + type: DictElement + match: '.*' + records: + Ent: + parents: ["Analysis"] + subtree: *date_res diff --git a/unittests/example_datastructure.yml b/unittests/example_datastructure.yml new file mode 100644 index 0000000000000000000000000000000000000000..1ec9d4575c7216fe5b8954db22cd9f2d03a7e749 --- /dev/null +++ b/unittests/example_datastructure.yml @@ -0,0 +1,10 @@ +Experiments: + - date: 2022-02-01 + result: FAIL + - date: 2022-02-02 + result: SUCCESS +Analyses: + - date: 2022-03-01 + result: homogeneous + - date: 2022-03-02 + result: heterogeneous diff --git a/unittests/example_identifiables.yml b/unittests/example_identifiables.yml new file mode 100644 index 0000000000000000000000000000000000000000..7e7da0a74cc202178bcae8a70be52d85f660d4e6 --- /dev/null +++ b/unittests/example_identifiables.yml @@ -0,0 +1,4 @@ +Experiment: + - date +Analysis: + - date diff --git 
a/unittests/simulated_server_data.py b/unittests/simulated_server_data.py deleted file mode 100644 index dd0c6b4e8693d64c9d96cafc5db2f447613daa1b..0000000000000000000000000000000000000000 --- a/unittests/simulated_server_data.py +++ /dev/null @@ -1,24 +0,0 @@ - -import caosdb as db -data_model = {"person": (db.RecordType(id=259, name="Person") - .add_property(name="first_name") - .add_property(name="last_name")), - "measurement": (db.RecordType(id=278, name="Measurement") - .add_property(name="identifier") - .add_property(name="date") - .add_property(name="project")), - "project": (db.RecordType(id=250, name="Project") - .add_property(name="date") - .add_property(name="identifier")), - "first_name": db.Property(name="first_name", datatype=db.TEXT, id=261), - "responsible": db.Property(name="responsible", datatype="Person", id=249), - "last_name": db.Property(name="last_name", datatype=db.TEXT, id=262), - "identifier": db.Property(name="identifier", datatype=db.TEXT, id=248), - "date": db.Property(name="date", datatype=db.DATETIME, id=247), - } -existing_data = { -} - -full_data = {} -full_data.update(data_model) -full_data.update(existing_data) diff --git a/unittests/test_converters.py b/unittests/test_converters.py index 154724be6d126aefb430c7d0600b86a5ec721812..ab5710feaaf14babc3fed65f10598250e53ffd9b 100644 --- a/unittests/test_converters.py +++ b/unittests/test_converters.py @@ -23,31 +23,35 @@ """ test the converters module """ +import datetime +import importlib import json -import yaml import logging -import sys -import importlib import os +import sys from itertools import product -import datetime +from pathlib import Path + import pytest import yaml - -from caoscrawler.converters import (Converter, ConverterValidationError, DictElementConverter, - DirectoryConverter, DictIntegerElementConverter, - handle_value, MarkdownFileConverter, DateElementConverter, - FloatElementConverter, IntegerElementConverter, - JSONFileConverter, YAMLFileConverter) -from 
caoscrawler.converters import _AbstractScalarValueElementConverter +from caoscrawler.converters import (Converter, ConverterValidationError, + DateElementConverter, DictElementConverter, + DictIntegerElementConverter, + DirectoryConverter, FloatElementConverter, + IntegerElementConverter, JSONFileConverter, + MarkdownFileConverter, YAMLFileConverter, + _AbstractScalarValueElementConverter, + handle_value) from caoscrawler.crawl import Crawler +from caoscrawler.scanner import (_load_definition_from_yaml_dict, + create_converter_registry, load_definition) from caoscrawler.stores import GeneralStore -from caoscrawler.structure_elements import (File, TextElement, ListElement, DictElement, - BooleanElement, IntegerElement, - FloatElement, Directory) -from caoscrawler.scanner import load_definition, _load_definition_from_yaml_dict, create_converter_registry +from caoscrawler.structure_elements import (BooleanElement, DictElement, + Directory, File, FloatElement, + IntegerElement, ListElement, + TextElement) -from test_tool import rfp +UNITTESTDIR = Path(__file__).parent @pytest.fixture @@ -108,7 +112,7 @@ def testDirectoryConverter(converter_registry): }, name="Test", converter_registry=converter_registry) elements = dc.create_children(GeneralStore(), - Directory("test_directories", rfp("test_directories"))) + Directory("test_directories", UNITTESTDIR / "test_directories")) # Check whether the right structure elements were created # this has been updated, there are more directories now @@ -125,17 +129,16 @@ def testDirectoryConverter(converter_registry): def test_markdown_converter(converter_registry): test_readme = File( "README.md", - rfp( - "test_directories", "examples_article", "DataAnalysis", - "2020_climate-model-predict", "2020-02-08_prediction-errors", "README.md" - ) + UNITTESTDIR / + "test_directories" / "examples_article" / "DataAnalysis" / + "2020_climate-model-predict" / "2020-02-08_prediction-errors" / "README.md" ) converter = 
MarkdownFileConverter({"match": "(.*)"}, "TestMarkdownFileConverter", converter_registry) with pytest.raises(ConverterValidationError) as err: - converter.create_children(None, File("test_tool.py", rfp("test_tool.py"))) + converter.create_children(None, File("test_tool.py", UNITTESTDIR / "test_crawler.py")) m = converter.match(test_readme) assert m is not None @@ -163,8 +166,8 @@ def test_markdown_converter(converter_registry): test_readme2 = File( "README.md", - rfp("test_directories", "examples_article", - "ExperimentalData", "2020_SpeedOfLight", "2020-01-01_TimeOfFlight", "README.md") + UNITTESTDIR/"test_directories" / "examples_article" / + "ExperimentalData" / "2020_SpeedOfLight" / "2020-01-01_TimeOfFlight" / "README.md" ) m = converter.match(test_readme2) @@ -183,8 +186,8 @@ def test_markdown_converter(converter_registry): def test_json_converter(converter_registry): - test_json = File("testjson.json", rfp( - "test_directories", "examples_json", "testjson.json")) + test_json = File("testjson.json", UNITTESTDIR / + "test_directories" / "examples_json" / "testjson.json") schema_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "test_directories", "examples_json", "testjson.schema.json") @@ -241,7 +244,7 @@ def test_json_converter(converter_registry): invalid_json = File( "invalidjson.json", - rfp("test_directories", "examples_json", "invalidjson.json") + UNITTESTDIR/"test_directories" / "examples_json" / "invalidjson.json" ) # Doesn't validate because of missing required 'name' property with pytest.raises(ConverterValidationError) as err: @@ -250,15 +253,15 @@ def test_json_converter(converter_registry): broken_json = File( "brokenjson.json", - rfp("test_directories", "examples_json", "brokenjson.json") + UNITTESTDIR/"test_directories" / "examples_json" / "brokenjson.json" ) with pytest.raises(json.decoder.JSONDecodeError) as err: jsonconverter.create_children(None, broken_json) def test_yaml_converter(converter_registry): - test_yaml = 
File("testyaml.yml", rfp( - "test_directories", "test_yamls", "testyaml.yml")) + test_yaml = File("testyaml.yml", UNITTESTDIR / + "test_directories" / "test_yamls" / "testyaml.yml") schema_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "test_directories", "test_yamls", "testyaml.schema.json") @@ -315,7 +318,7 @@ def test_yaml_converter(converter_registry): invalid_yaml = File( "invalidyaml.yml", - rfp("test_directories", "test_yamls", "invalidyaml.yml") + UNITTESTDIR/"test_directories" / "test_yamls" / "invalidyaml.yml" ) # Doesn't validate because of missing required 'name' property @@ -325,7 +328,7 @@ def test_yaml_converter(converter_registry): broken_yaml = File( "brokenyaml.yml", - rfp("test_directories", "test_yamls", "brokenyaml.yml") + UNITTESTDIR/"test_directories" / "test_yamls" / "brokenyaml.yml" ) with pytest.raises(yaml.parser.ParserError) as err: yamlconverter.create_children(None, broken_yaml) @@ -361,12 +364,9 @@ def test_variable_replacement(): def test_filter_children_of_directory(converter_registry, capsys): - """Verify that children (i.e., files) in a directory are filtered or sorted - correctly. - - """ - test_dir = Directory("examples_filter_children", rfp( - "test_directories", "examples_filter_children")) + """Verify that children (i.e., files) in a directory are filtered or sorted correctly. 
""" + test_dir = Directory("examples_filter_children", UNITTESTDIR / + "test_directories" / "examples_filter_children") dc = DirectoryConverter( definition={ diff --git a/unittests/test_crawler.py b/unittests/test_crawler.py index f3ad73c5d75acea5fd3e92954e3899983ea73a2a..1722ec0fe3291d6e96042870d65af4e249671e82 100644 --- a/unittests/test_crawler.py +++ b/unittests/test_crawler.py @@ -24,18 +24,85 @@ test the Crawler class """ import json +import logging import os +import warnings +from copy import deepcopy +from functools import partial +from os.path import basename, dirname, join +from pathlib import Path +from unittest.mock import MagicMock, Mock, patch +import caosdb as db +import caosdb.common.models as dbmodels +import pytest +import yaml +from caoscrawler.crawl import (Crawler, SecurityMode, _treat_deprecated_prefix, + crawler_main, split_restricted_path) +from caoscrawler.debug_tree import DebugTree +from caoscrawler.identifiable import Identifiable +from caoscrawler.identifiable_adapters import (CaosDBIdentifiableAdapter, + IdentifiableAdapter, + LocalStorageIdentifiableAdapter) +from caoscrawler.scanner import (create_converter_registry, scan_directory, + scan_structure_elements) +from caoscrawler.stores import GeneralStore, RecordStore +from caoscrawler.structure_elements import (DictElement, DictListElement, + DictTextElement, File) +from caosdb.apiutils import compare_entities +from caosdb.cached import cache_clear +from caosdb.exceptions import EmptyUniqueQueryError from pytest import raises -import caosdb as db +UNITTESTDIR = Path(__file__).parent -from caoscrawler.stores import GeneralStore -from caoscrawler.crawl import Crawler -import warnings +EXAMPLE_SERVER_STATE = [ + db.Property(id=1, name='result', datatype=db.TEXT), + db.Property(id=2, name='date', datatype=db.DATETIME), + db.RecordType(id=3, name="Experiment"), + db.RecordType(id=4, name="Analysis"), + db.Record(id=5) + .add_parent(name="Experiment", id=3) + .add_property(name="date", 
value="2022-02-01") + .add_property(name="result", value="FAIL"), + db.Record(id=6) + .add_parent(name="Experiment", id=3) + .add_property(name="date", value="2022-02-02") + .add_property(name="result", value="SUCCESS"), + db.Record(id=7) + .add_parent(name="Analysis", id=4) + .add_property(name="date", value="2022-03-01") + .add_property(name="result", value="homogeneous"), + db.Record(id=8) + .add_parent(name="Analysis", id=4) + .add_property(name="date", value="2022-03-02") + .add_property(name="result", value="heterogeneous"), +] +NEW_ELEMENT = (db.Record() + .add_parent(name="Analysis", id=4) + .add_property(name="date", value="2022-03-05") # new date + .add_property(name="result", value="homogeneous")) + + +def mock_get_entity_by(eid=None, name=None): + if eid is not None: + candidates = [el for el in EXAMPLE_SERVER_STATE if el.id == eid] + if len(candidates) > 0: + return candidates[0] + else: + raise EmptyUniqueQueryError("") + if name is not None: + candidates = [el for el in EXAMPLE_SERVER_STATE + if (el.name is not None and el.name.lower() == name.lower())] + if len(candidates) > 0: + return candidates[0] + else: + raise EmptyUniqueQueryError("") -from test_tool import rfp -import pytest + +@pytest.fixture(autouse=True) +def clear_cache(): + cache_clear() @pytest.mark.filterwarnings("ignore::DeprecationWarning") @@ -61,7 +128,7 @@ def test_deprecated_functions(): warnings.filterwarnings("ignore") warnings.filterwarnings("always", category=DeprecationWarning) cr = Crawler() - cr.crawl_directory(".", rfp("scifolder_cfood.yml")) + cr.crawl_directory(UNITTESTDIR, UNITTESTDIR/"scifolder_cfood.yml") print(w) print(w[0].message) assert issubclass(w[-1].category, DeprecationWarning) @@ -74,3 +141,691 @@ def test_deprecated_functions(): cr.crawled_data assert issubclass(w[-1].category, DeprecationWarning) assert "The use of self.crawled_data is depricated" in str(w[-1].message) + + +def test_remove_unnecessary_updates(): + # test trvial case + upl = 
[db.Record().add_parent("A")] + irs = [db.Record().add_parent("A")] + updates = Crawler.remove_unnecessary_updates(upl, irs) + assert len(updates) == 0 + + # test property difference case + # TODO this should work right? + # upl = [db.Record().add_parent("A").add_property("a", 3)] + # irs = [db.Record().add_parent("A")] # ID should be s + # Crawler.remove_unnecessary_updates(upl, irs) + # assert len(upl) == 1 + + # test value difference case + upl = [db.Record().add_parent("A").add_property("a", 5)] + irs = [db.Record().add_parent("A").add_property("a")] + updates = Crawler.remove_unnecessary_updates(upl, irs) + assert len(updates) == 1 + upl = [db.Record().add_parent("A").add_property("a", 5)] + irs = [db.Record().add_parent("A").add_property("a", 5)] + updates = Crawler.remove_unnecessary_updates(upl, irs) + assert len(updates) == 0 + + # test unit difference case + upl = [db.Record().add_parent("A").add_property("a", unit='cm')] + irs = [db.Record().add_parent("A").add_property("a")] + updates = Crawler.remove_unnecessary_updates(upl, irs) + assert len(updates) == 1 + + # test None difference case + upl = [db.Record().add_parent("A").add_property("a")] + irs = [db.Record().add_parent("A").add_property("a", 5)] + updates = Crawler.remove_unnecessary_updates(upl, irs) + assert len(updates) == 1 + + +def test_split_into_inserts_and_updates_trivial(): + crawler = Crawler() + crawler.split_into_inserts_and_updates([]) + + +def basic_retrieve_by_name_mock_up(rec, referencing_entities=None, known=None): + """ returns a stored Record if rec.name is an existing key, None otherwise """ + if rec.name in known: + return known[rec.name] + else: + return None + + +@pytest.fixture +def crawler_mocked_identifiable_retrieve(): + crawler = Crawler() + # TODO use minimal setup + # mock retrieval of registered identifiabls: return Record with just a parent + crawler.identifiableAdapter.get_registered_identifiable = Mock( + side_effect=lambda x: 
db.Record().add_parent(x.parents[0].name)) + + # Simulate remote server content by using the names to identify records + # There is only a single known Record with name A + crawler.identifiableAdapter.retrieve_identified_record_for_record = Mock(side_effect=partial( + basic_retrieve_by_name_mock_up, known={"A": db.Record(id=1111, name="A")})) + crawler.identifiableAdapter.retrieve_identified_record_for_identifiable = Mock( + side_effect=partial( + basic_retrieve_by_name_mock_up, known={"A": db.Record(id=1111, name="A")})) + return crawler + + +def test_split_into_inserts_and_updates_single(crawler_mocked_identifiable_retrieve): + crawler = crawler_mocked_identifiable_retrieve + identlist = [Identifiable(name="A", record_type="C"), Identifiable(name="B", record_type="C")] + entlist = [db.Record(name="A").add_parent( + "C"), db.Record(name="B").add_parent("C")] + + assert crawler.get_from_any_cache(identlist[0]) is None + assert crawler.get_from_any_cache(identlist[1]) is None + assert not crawler._has_reference_value_without_id(identlist[0]) + assert not crawler._has_reference_value_without_id(identlist[1]) + assert crawler.identifiableAdapter.retrieve_identified_record_for_record( + identlist[0]).id == 1111 + assert crawler.identifiableAdapter.retrieve_identified_record_for_record( + identlist[1]) is None + + insert, update = crawler.split_into_inserts_and_updates(deepcopy(entlist)) + assert len(insert) == 1 + assert insert[0].name == "B" + assert len(update) == 1 + assert update[0].name == "A" + # if this ever fails, the mock up may be removed + crawler.identifiableAdapter.get_registered_identifiable.assert_called() + crawler.identifiableAdapter.retrieve_identified_record_for_identifiable.assert_called() + + +def test_split_into_inserts_and_updates_with_duplicate(crawler_mocked_identifiable_retrieve): + crawler = crawler_mocked_identifiable_retrieve + a = db.Record(name="A").add_parent("C") + b = db.Record(name="B").add_parent("C") + b.add_property("A", a) + # 
This is identical to a and should be removed + c = db.Record(name="A").add_parent("C") + entlist = [a, b, c] + insert, update = crawler.split_into_inserts_and_updates(deepcopy(entlist)) + assert len(insert) == 1 + assert insert[0].name == "B" + assert len(update) == 1 + assert update[0].name == "A" + # if this ever fails, the mock up may be removed + crawler.identifiableAdapter.get_registered_identifiable.assert_called() + crawler.identifiableAdapter.retrieve_identified_record_for_identifiable.assert_called() + + +def test_split_into_inserts_and_updates_with_ref(crawler_mocked_identifiable_retrieve): + crawler = crawler_mocked_identifiable_retrieve + # try it with a reference + a = db.Record(name="A").add_parent("C") + b = db.Record(name="B").add_parent("C") + b.add_property("A", a) + entlist = [a, b] + insert, update = crawler.split_into_inserts_and_updates(entlist) + assert len(insert) == 1 + assert insert[0].name == "B" + assert len(update) == 1 + assert update[0].name == "A" + # if this ever fails, the mock up may be removed + crawler.identifiableAdapter.get_registered_identifiable.assert_called() + crawler.identifiableAdapter.retrieve_identified_record_for_identifiable.assert_called() + + +def test_split_into_inserts_and_updates_with_circ(): + # try circular + a = db.Record(name="A").add_parent("C") + b = db.Record(name="B").add_parent("C") + b.add_property("A", a) + a.add_property("B", b) + entlist = [a, b] + # TODO this does not seem to be complete! 
+ + +def test_split_into_inserts_and_updates_with_complex(crawler_mocked_identifiable_retrieve): + crawler = crawler_mocked_identifiable_retrieve + # A + # ^ + # | + # F <- B <- G + a = db.Record(name="A").add_parent("C").add_property( + 'd', 13).add_property('e', "lskdjlsfdj") + b = db.Record(name="B").add_parent("C") + g = db.Record(name="G").add_parent("C") + f = db.Record(name="F").add_parent("C") + g.add_property("A", a) + b.add_property("A", f) + b.add_property("A", a) + entlist = [a, b, g] + insert, update = crawler.split_into_inserts_and_updates(entlist) + assert len(insert) == 3 + assert "B" in [el.name for el in insert] + assert len(update) == 1 + assert update[0].name == "A" + # if this ever fails, the mock up may be removed + crawler.identifiableAdapter.get_registered_identifiable.assert_called() + crawler.identifiableAdapter.retrieve_identified_record_for_identifiable.assert_called() + + # TODO write test where the unresoled entity is not part of the identifiable + + +def test_split_into_inserts_and_updates_with_copy_attr(crawler_mocked_identifiable_retrieve): + crawler = crawler_mocked_identifiable_retrieve + # assume identifiable is only the name + a = db.Record(name="A").add_parent("C") + a.add_property("foo", 1) + b = db.Record(name="A").add_parent("C") + b.add_property("bar", 2) + entlist = [a, b] + insert, update = crawler.split_into_inserts_and_updates(entlist) + + assert update[0].get_property("bar").value == 2 + assert update[0].get_property("foo").value == 1 + # if this ever fails, the mock up may be removed + crawler.identifiableAdapter.get_registered_identifiable.assert_called() + crawler.identifiableAdapter.retrieve_identified_record_for_identifiable.assert_called() + + +def test_has_missing_object_in_references(): + crawler = Crawler() + # Simulate remote server content by using the names to identify records + # There are only two known Records with name A and B + crawler.identifiableAdapter.get_registered_identifiable = 
Mock(side_effect=partial( + basic_retrieve_by_name_mock_up, known={"C": db.Record(name="C").add_parent("RTC") + .add_property("d"), + "D": db.Record(name="D").add_parent("RTD") + .add_property("d").add_property("e"), + })) + + # one reference with id -> check + assert not crawler._has_missing_object_in_references( + Identifiable(name="C", record_type="RTC", properties={'d': 123}), []) + # one ref with Entity with id -> check + assert not crawler._has_missing_object_in_references( + Identifiable(name="C", record_type="RTC", properties={'d': db.Record(id=123) + .add_parent("C")}), []) + # one ref with id one with Entity with id (mixed) -> check + assert not crawler._has_missing_object_in_references( + Identifiable(name="C", record_type="RTD", + properties={'d': 123, 'b': db.Record(id=123).add_parent("RTC")}), []) + # entity to be referenced in the following + a = db.Record(name="C").add_parent("C").add_property("d", 12311) + # one ref with id one with Entity without id (but not identifying) -> fail + assert not crawler._has_missing_object_in_references( + Identifiable(name="C", record_type="RTC", properties={'d': 123, 'e': a}), []) + + # one ref with id one with Entity without id (mixed) -> fail + assert not crawler._has_missing_object_in_references( + Identifiable(name="D", record_type="RTD", properties={'d': 123, 'e': a}), []) + + crawler.add_to_remote_missing_cache(a, Identifiable(name="C", record_type="RTC", + properties={'d': 12311})) + # one ref with id one with Entity without id but in cache -> check + assert crawler._has_missing_object_in_references( + Identifiable(name="D", record_type="RTD", properties={'d': 123, 'e': a}), []) + + # if this ever fails, the mock up may be removed + crawler.identifiableAdapter.get_registered_identifiable.assert_called() + + +@pytest.mark.xfail() +def test_references_entities_without_ids(): + crawler = Crawler() + assert not crawler._has_reference_value_without_id(db.Record().add_parent("Person") + .add_property('last_name', 
123) + .add_property('first_name', 123)) + # id and rec with id + assert not crawler._has_reference_value_without_id(db.Record().add_parent("Person") + .add_property('first_name', 123) + .add_property('last_name', + db.Record(id=123))) + # id and rec with id and one unneeded prop + assert crawler._has_reference_value_without_id(db.Record().add_parent("Person") + .add_property('first_name', 123) + .add_property('stuff', db.Record()) + .add_property('last_name', db.Record(id=123))) + + # one identifying prop is missing + assert crawler._has_reference_value_without_id(db.Record().add_parent("Person") + .add_property('first_name', 123) + .add_property('last_name', db.Record())) + + +def test_replace_entities_with_ids(): + crawler = Crawler() + a = (db.Record().add_parent("B").add_property("A", 12345) + .add_property("B", db.Record(id=12345)) + .add_property("C", [db.Record(id=12345), 233324])) + + crawler.replace_entities_with_ids(a) + assert a.get_property("A").value == 12345 + assert a.get_property("B").value == 12345 + assert a.get_property("C").value == [12345, 233324] + + +def reset_mocks(mocks): + for mock in mocks: + mock.reset_mock() + + +def mock_retrieve_record(identifiable: Identifiable): + """ assumes that the identifiable is always only the date""" + + for record in EXAMPLE_SERVER_STATE: + if (record.role == "Record" + and record.get_property("date").value == identifiable.properties['date']): + return record + return None + + +@patch("caoscrawler.crawl.cached_get_entity_by", + new=Mock(side_effect=mock_get_entity_by)) +@patch("caoscrawler.identifiable_adapters.cached_get_entity_by", + new=Mock(side_effect=mock_get_entity_by)) +@patch("caoscrawler.identifiable_adapters.CaosDBIdentifiableAdapter." 
+ "retrieve_identified_record_for_identifiable", + new=Mock(side_effect=mock_retrieve_record)) +@patch("caoscrawler.crawl.db.Container.insert") +@patch("caoscrawler.crawl.db.Container.update") +def test_synchronization_no_commit(upmock, insmock): + crawled_data = [r.copy() for r in EXAMPLE_SERVER_STATE if r.role == "Record"] + # change one; add one + crawled_data[-1].get_property('result').value = "wst" + crawled_data.append(NEW_ELEMENT.copy()) + + ident = CaosDBIdentifiableAdapter() + ident.load_from_yaml_definition(UNITTESTDIR/"example_identifiables.yml") + crawler = Crawler(securityMode=SecurityMode.UPDATE, identifiableAdapter=ident) + ins, ups = crawler.synchronize(commit_changes=False, crawled_data=crawled_data) + insmock.assert_not_called() + upmock.assert_not_called() + assert len(ins) == 1 + assert len(ups) == 1 + + +@patch("caoscrawler.crawl.cached_get_entity_by", + new=Mock(side_effect=mock_get_entity_by)) +@patch("caoscrawler.identifiable_adapters.cached_get_entity_by", + new=Mock(side_effect=mock_get_entity_by)) +@patch("caoscrawler.identifiable_adapters.CaosDBIdentifiableAdapter." 
+ "retrieve_identified_record_for_identifiable", + new=Mock(side_effect=mock_retrieve_record)) +@patch("caoscrawler.crawl.db.Container.insert") +@patch("caoscrawler.crawl.db.Container.update") +@patch("caoscrawler.crawl.UpdateCache.insert") +def test_security_mode(updateCacheMock, upmock, insmock): + # trivial case: nothing to do + crawled_data = [r.copy() for r in EXAMPLE_SERVER_STATE if r.role == "Record"] + print(crawled_data) + crawler = Crawler(securityMode=SecurityMode.RETRIEVE) + crawler.synchronize(commit_changes=True, crawled_data=crawled_data) + assert crawler.run_id is not None + insmock.assert_not_called() + upmock.assert_not_called() + updateCacheMock.assert_not_called() + + # RETRIEVE: insert only + ident = CaosDBIdentifiableAdapter() + ident.load_from_yaml_definition(UNITTESTDIR/"example_identifiables.yml") + crawler = Crawler(securityMode=SecurityMode.RETRIEVE, identifiableAdapter=ident) + + # add a new entity + crawled_data.append(NEW_ELEMENT.copy()) + + # insert forbidden + crawler.synchronize(commit_changes=True, crawled_data=crawled_data) + assert crawler.run_id is not None + insmock.assert_not_called() + upmock.assert_not_called() + assert updateCacheMock.call_count == 1 + # reset counts + reset_mocks([updateCacheMock, insmock, upmock]) + # remove new record again + crawled_data.pop() + + # RETRIEVE: update only + crawler = Crawler(securityMode=SecurityMode.RETRIEVE) + # change one element + crawled_data[-1].get_property('result').value = "wst" + crawler.synchronize(commit_changes=True, crawled_data=crawled_data) + assert crawler.run_id is not None + insmock.assert_not_called() + upmock.assert_not_called() + # import IPython + # IPython.embed() + # print(updateCacheMock.call_args_list) + assert updateCacheMock.call_count == 1 + # reset counts + reset_mocks([updateCacheMock, insmock, upmock]) + # reset value + crawled_data[-1] = EXAMPLE_SERVER_STATE[-1].copy() + + # INSERT: insert only + # add one element + 
crawled_data.append(NEW_ELEMENT.copy()) + ident = CaosDBIdentifiableAdapter() + ident.load_from_yaml_definition(UNITTESTDIR/"example_identifiables.yml") + crawler = Crawler(securityMode=SecurityMode.INSERT, identifiableAdapter=ident) + crawler.synchronize(commit_changes=True, crawled_data=crawled_data) + assert crawler.run_id is not None + insmock.assert_called_once() + upmock.assert_not_called() + updateCacheMock.assert_not_called() + # reset counts + reset_mocks([updateCacheMock, insmock, upmock]) + # remove new record again + crawled_data.pop() + + # INSERT: update only + crawler = Crawler(securityMode=SecurityMode.INSERT, identifiableAdapter=ident) + # change one element + crawled_data[-1].get_property('result').value = "wst" + crawler.synchronize(commit_changes=True, crawled_data=crawled_data) + assert crawler.run_id is not None + insmock.assert_not_called() + upmock.assert_not_called() + updateCacheMock.assert_called_once() + # reset counts + reset_mocks([updateCacheMock, insmock, upmock]) + # reset value + crawled_data[-1] = EXAMPLE_SERVER_STATE[-1].copy() + + # INSERT: insert and update + ident = CaosDBIdentifiableAdapter() + ident.load_from_yaml_definition(UNITTESTDIR/"example_identifiables.yml") + crawler = Crawler(securityMode=SecurityMode.INSERT, identifiableAdapter=ident) + # change one; add one + crawled_data[-1].get_property('result').value = "wst" + crawled_data.append(NEW_ELEMENT.copy()) + crawler.synchronize(commit_changes=True, crawled_data=crawled_data) + assert crawler.run_id is not None + insmock.asser_called_once() + upmock.assert_not_called() + updateCacheMock.assert_called_once() + # reset counts + reset_mocks([updateCacheMock, insmock, upmock]) + # restore original ident + crawled_data.pop() + crawled_data[-1] = EXAMPLE_SERVER_STATE[-1].copy() + + +def test_create_reference_mapping(): + a = db.Record().add_parent("A") + b = db.Record().add_parent("B").add_property('a', a) + ref = Crawler.create_reference_mapping([a, b]) + assert id(a) in 
ref + assert id(b) not in ref + assert "B" in ref[id(a)] + assert ref[id(a)]["B"] == [b] + + +def test_create_flat_list(): + a = db.Record() + b = db.Record() + a.add_property(name="a", value=a) + a.add_property(name="b", value=b) + flat = Crawler.create_flat_list([a]) + assert len(flat) == 2 + assert a in flat + assert b in flat + c = db.Record() + c.add_property(name="a", value=a) + # This would caus recursion if it is not dealt with properly. + a.add_property(name="c", value=c) + flat = Crawler.create_flat_list([c]) + assert len(flat) == 3 + assert a in flat + assert b in flat + assert c in flat + + +@ pytest.fixture +def crawler_mocked_for_backref_test(): + crawler = Crawler() + # mock retrieval of registered identifiabls: return Record with just a parent + + def get_reg_ident(x): + if x.parents[0].name == "C": + return db.Record().add_parent(x.parents[0].name).add_property( + "is_referenced_by", value=["BR"]) + elif x.parents[0].name == "D": + return db.Record().add_parent(x.parents[0].name).add_property( + "is_referenced_by", value=["BR", "BR2"]) + else: + return db.Record().add_parent(x.parents[0].name) + crawler.identifiableAdapter.get_registered_identifiable = Mock(side_effect=get_reg_ident) + + # Simulate remote server content by using the names to identify records + # There is only a single known Record with name A + crawler.identifiableAdapter.retrieve_identified_record_for_record = Mock(side_effect=partial( + basic_retrieve_by_name_mock_up, known={"A": + db.Record(id=1111, name="A").add_parent("BR")})) + crawler.identifiableAdapter.retrieve_identified_record_for_identifiable = Mock( + side_effect=partial( + basic_retrieve_by_name_mock_up, known={"A": + db.Record(id=1111, name="A").add_parent("BR")})) + return crawler + + +def test_validation_error_print(caplog): + caplog.set_level(logging.DEBUG, logger="caoscrawler.converters") + # there should be no server interaction since we only test the behavior if a validation error + # occurs during the data 
collection stage + DATADIR = os.path.join(os.path.dirname(__file__), "test_data", "failing_validation") + for fi in ["cfood.yml", "cfood2.yml"]: + ret = crawler_main(DATADIR, + os.path.join(DATADIR, fi), + os.path.join(DATADIR, "identifiables.yml"), + True, + None, + False) + assert "Couldn't validate" in caplog.text + caplog.clear() + + +def test_split_into_inserts_and_updates_backref(crawler_mocked_for_backref_test): + crawler = crawler_mocked_for_backref_test + identlist = [Identifiable(name="A", record_type="BR"), + Identifiable(name="B", record_type="C", backrefs=[db.Entity()])] + referenced = db.Record(name="B").add_parent("C") + entlist = [referenced, db.Record(name="A").add_parent("BR").add_property("ref", referenced), ] + + # Test without referencing object + # currently a NotImplementedError is raised if necessary properties are missing. + with raises(NotImplementedError): + crawler.split_into_inserts_and_updates([db.Record(name="B").add_parent("C")]) + + # identifiables were not yet checked + assert crawler.get_from_any_cache(identlist[0]) is None + assert crawler.get_from_any_cache(identlist[1]) is None + # one with reference, one without + assert not crawler._has_reference_value_without_id(identlist[0]) + assert crawler._has_reference_value_without_id(identlist[1]) + # one can be found remotely, one not + assert crawler.identifiableAdapter.retrieve_identified_record_for_record( + identlist[0]).id == 1111 + assert crawler.identifiableAdapter.retrieve_identified_record_for_record( + identlist[1]) is None + + # check the split... 
+ insert, update = crawler.split_into_inserts_and_updates(deepcopy(entlist)) + # A was found remotely and is therefore in the update list + assert len(update) == 1 + assert update[0].name == "A" + # B does not exist on the (simulated) remote server + assert len(insert) == 1 + assert insert[0].name == "B" + + +def test_split_into_inserts_and_updates_mult_backref(crawler_mocked_for_backref_test): + # test whether multiple references of the same record type are correctly used + crawler = crawler_mocked_for_backref_test + referenced = db.Record(name="B").add_parent("C") + entlist = [referenced, + db.Record(name="A").add_parent("BR").add_property("ref", referenced), + db.Record(name="C").add_parent("BR").add_property("ref", referenced), + ] + + # test whether both entities are listed in the backref attribute of the identifiable + referencing_entities = crawler.create_reference_mapping(entlist) + identifiable = crawler.identifiableAdapter.get_identifiable(referenced, referencing_entities) + assert len(identifiable.backrefs) == 2 + + # check the split... + insert, update = crawler.split_into_inserts_and_updates(deepcopy(entlist)) + assert len(update) == 1 + assert len(insert) == 2 + + +def test_split_into_inserts_and_updates_diff_backref(crawler_mocked_for_backref_test): + # test whether multiple references of the different record types are correctly used + crawler = crawler_mocked_for_backref_test + referenced = db.Record(name="B").add_parent("D") + entlist = [referenced, + db.Record(name="A").add_parent("BR").add_property("ref", referenced), + db.Record(name="A").add_parent("BR2").add_property("ref", referenced), + ] + + # test whether both entities are listed in the backref attribute of the identifiable + referencing_entities = crawler.create_reference_mapping(entlist) + identifiable = crawler.identifiableAdapter.get_identifiable(referenced, referencing_entities) + assert len(identifiable.backrefs) == 2 + + # check the split... 
+ insert, update = crawler.split_into_inserts_and_updates(deepcopy(entlist)) + assert len(update) == 2 + assert len(insert) == 1 + + +def mock_create_values(values, element): + pass + + +@patch("caoscrawler.converters.IntegerElementConverter.create_values") +def test_restricted_path(create_mock): + """ + The restricted_path argument allows to ignroe part of the crawled data structure. Here, we make + sure, that is that argument is provided, ideed only the given path of the tree is traversed. + + The check is done using the mock of the create_values function of the IntegerElementConverter. + This function is only called if elements are being treated. + """ + crawler_definition = { + "DictTest": { + "type": "DictElement", + "match": "(.*)", + "subtree": { + "nextdict": { + "type": "DictElement", + "match": "(.*)", + "subtree": { + "int_element": { + "type": "IntegerElement", + "match_name": ".*", + "match_value": "(?P<int_value>.*)", + "records": { + "Dataset": { + "Subject": "$int_value" + } + } + } + } + } + } + } + } + + crawler = Crawler() + converter_registry = create_converter_registry(crawler_definition) + + # This structure is crawled + test_dict = { + "v1": { + "a": 1, + "b": 2, + }, + "v2": { + "c": 3, + "d": 4, + } + } + # first test without a restricted_path + restricted_path = None + records = scan_structure_elements( + DictElement("TestDict", test_dict), crawler_definition, converter_registry, + restricted_path + ) + assert create_mock.call_count == 4 + create_mock.reset_mock() + + # test with a restricted_path but one that has no effect (single root element) + # this also tests that the remainder of the tree is fully traversed + restricted_path = ["TestDict"] + records = scan_structure_elements( + DictElement("TestDict", test_dict), crawler_definition, converter_registry, + restricted_path + ) + assert create_mock.call_count == 4 + create_mock.reset_mock() + + # test with a restricted_path that restricts the tree (single root element) + restricted_path 
= ["TestDict", "v2"] + records = scan_structure_elements( + DictElement("TestDict", test_dict), crawler_definition, converter_registry, + restricted_path + ) + assert create_mock.call_count == 2 + create_mock.reset_mock() + + # test with a restricted_path that contains a bad element + restricted_path = ["TestDict", "v3"] + with raises(RuntimeError): + records = scan_structure_elements( + DictElement("TestDict", test_dict), crawler_definition, converter_registry, + restricted_path + ) + + +def test_split_restricted_path(): + assert ["el"] == split_restricted_path("/el") + assert ["el"] == split_restricted_path("/el/") + assert ["el", "el"] == split_restricted_path("/el/el") + + +# Filter the warning because we want to have it here and this way it does not hinder running +# tests with -Werror. +@pytest.mark.filterwarnings("ignore:The prefix:DeprecationWarning") +def test_deprecated_prefix_option(): + """Test that calling the crawler's main function with the deprecated + `prefix` option raises the correct errors and warnings. 
+ + """ + + with pytest.deprecated_call(): + crawler_main("./", UNITTESTDIR/"scifolder_cfood.yml", prefix="to/be/removed") + + # Check that crawler main terminates with an error + assert 1 == crawler_main("./", UNITTESTDIR/"scifolder_cfood.yml", prefix="to/be/removed", + remove_prefix="to/be/removed") + + with raises(ValueError) as ve: + + _treat_deprecated_prefix(prefix="to/be/removed", remove_prefix="to/be/removed") + + assert "(deprecated) `prefix` and the `remove_prefix`" in str(ve.value) + + +def test_create_entity_summary(): + assert "" == Crawler.create_entity_summary([]).strip() + + entities = [ + db.Record(id=1).add_parent("A"), + db.Record(id=4, name='a').add_parent("B"), + db.Record(id=5).add_parent("A"), + db.Record(id=6, name='b').add_parent("B"), + ] + text = Crawler.create_entity_summary(entities).strip() + assert 'a' in text + assert 'b' in text + assert 'A:' in text + assert 'B:' in text + assert "<a href='/Entity/4'>a</a>, <a href='/Entity/6'>b</a>" in text diff --git a/unittests/test_file_identifiables.py b/unittests/test_file_identifiables.py index aff174d0228d2750efd1cca129547c821c974127..e46bb80711cea9684dc4610393b13f0592659427 100644 --- a/unittests/test_file_identifiables.py +++ b/unittests/test_file_identifiables.py @@ -6,11 +6,32 @@ import caosdb as db import pytest from pytest import raises +from unittest.mock import patch, Mock from caoscrawler.identifiable_adapters import LocalStorageIdentifiableAdapter +from caosdb.cached import cache_clear from caoscrawler.identifiable import Identifiable +from caosdb.exceptions import EmptyUniqueQueryError + +@pytest.fixture(autouse=True) +def clear_cache(): + cache_clear() + + +def mock_get_entity_by(eid=None, name=None, path=None): + if eid is not None: + candidates = [el for el in EXAMPLE_SERVER_STATE if el.id == eid] + if len(candidates) > 0: + return candidates[0] + else: + raise EmptyUniqueQueryError("") + raise EmptyUniqueQueryError("") + + 
+@patch("caoscrawler.identifiable_adapters.cached_get_entity_by", + new=Mock(side_effect=mock_get_entity_by)) def test_file_identifiable(): ident = LocalStorageIdentifiableAdapter() @@ -31,7 +52,8 @@ def test_file_identifiable(): file_obj != identifiable # since the path does not exist in the data in ident, the follwoing functions return None - assert ident.retrieve_identified_record_for_record(file_obj) is None + with raises(EmptyUniqueQueryError): + ident.retrieve_identified_record_for_record(file_obj) assert ident.get_file(identifiable) is None # Try again with actual files in the store: diff --git a/unittests/test_identifiable_adapters.py b/unittests/test_identifiable_adapters.py index 894d476d628e9a05fbc6a4f7089404c886e01cbf..57f1459de9b6cf4f3fec73b9f3c0af1ae2b87659 100644 --- a/unittests/test_identifiable_adapters.py +++ b/unittests/test_identifiable_adapters.py @@ -29,10 +29,16 @@ test identifiable_adapters module import os from datetime import datetime -from caoscrawler.identifiable_adapters import ( - CaosDBIdentifiableAdapter, convert_value, IdentifiableAdapter) -from caoscrawler.identifiable import Identifiable +from pathlib import Path + import caosdb as db +import pytest +from caoscrawler.identifiable import Identifiable +from caoscrawler.identifiable_adapters import (CaosDBIdentifiableAdapter, + IdentifiableAdapter, + convert_value) + +UNITTESTDIR = Path(__file__).parent def test_create_query_for_identifiable(): @@ -90,8 +96,7 @@ def test_create_query_for_identifiable(): def test_load_from_yaml_file(): ident = CaosDBIdentifiableAdapter() ident.load_from_yaml_definition( - os.path.join(os.path.dirname(__file__), "test_directories", - "single_file_test_data", "identifiables.yml") + UNITTESTDIR / "test_directories" / "single_file_test_data" / "identifiables.yml" ) person_i = ident.get_registered_identifiable( @@ -118,3 +123,59 @@ def test_convert_value(): return " a " assert convert_value(A()) == " a " + + +def test_get_identifiable(): + # TODO modify 
this such that it becomes a test that actually tests (sufficiently) the + # get_identifiable function
r_cur.description == idr_r1.description diff --git a/unittests/test_issues.py b/unittests/test_issues.py index 46157af9225c11b79e76dd3ef856d60519a6eb9d..cbbe9cabcfd17daaf07165757351f00dc051eeab 100644 --- a/unittests/test_issues.py +++ b/unittests/test_issues.py @@ -28,7 +28,6 @@ from caoscrawler.crawl import Crawler from caoscrawler.identifiable import Identifiable from caoscrawler.identifiable_adapters import CaosDBIdentifiableAdapter from caoscrawler.structure_elements import DictElement -from test_tool import rfp from caoscrawler.scanner import create_converter_registry, scan_structure_elements diff --git a/unittests/test_json.py b/unittests/test_json.py index 3c120be174ff819baeeaa49ddf142cf40dba751e..fdb332df60d73dce3356a563e09ae0d02cf845b7 100644 --- a/unittests/test_json.py +++ b/unittests/test_json.py @@ -34,16 +34,18 @@ from pytest import raises import caosdb as db from caoscrawler.converters import JSONFileConverter +from pathlib import Path from caoscrawler.crawl import Crawler from caoscrawler.structure_elements import File, JSONFile from caoscrawler.scanner import load_definition, create_converter_registry, scan_structure_elements -from test_tool import rfp, dircheckstr + +UNITTESTDIR = Path(__file__).parent def test_json(): - crawler_definition_path = rfp("test_directories", "examples_json", - "jsontest_cfood.yml") - json_file_path = rfp("test_directories", "examples_json", "testjson.json") + crawler_definition_path = (UNITTESTDIR / "test_directories" / "examples_json" + / "jsontest_cfood.yml") + json_file_path = UNITTESTDIR / "test_directories" / "examples_json" / "testjson.json" crawler_definition = load_definition(crawler_definition_path) # Load and register converter packages: @@ -68,8 +70,7 @@ def test_json(): def test_broken_validation(): - crawler_definition_path = rfp( - "broken_cfoods", "broken_validation_path.yml") + crawler_definition_path = UNITTESTDIR / "broken_cfoods" / "broken_validation_path.yml" with raises(FileNotFoundError) as err: 
crawler_definition = load_definition(crawler_definition_path) diff --git a/unittests/test_scalars_cfood.py b/unittests/test_scalars_cfood.py index 89d94fc74ebda6aedfbee422294e99eab2216d73..57a8083c2aace830115c410e0425a8af4da17a7b 100644 --- a/unittests/test_scalars_cfood.py +++ b/unittests/test_scalars_cfood.py @@ -2,19 +2,21 @@ # Tests for: # https://gitlab.com/caosdb/caosdb-crawler/-/issues/9 # A. Schlemmer, 06/2021 +import os +from pathlib import Path import pytest - # The main function that is affected by this issue: from caoscrawler.converters import handle_value from caoscrawler.crawl import Crawler +from caoscrawler.debug_tree import DebugTree +from caoscrawler.scanner import scan_directory # We need the store for the above function from caoscrawler.stores import GeneralStore -from caoscrawler.scanner import scan_directory -from caoscrawler.debug_tree import DebugTree +from utils import dircheckstr -from test_tool import dircheckstr, rfp +UNITTESTDIR = Path(__file__).parent def test_handle_value(): @@ -35,9 +37,11 @@ def test_handle_value(): def test_record_structure_generation(): dbt = DebugTree() - scan_directory(rfp("test_directories", "examples_article"), rfp("cfoods_scalar.yml"), + scan_directory(UNITTESTDIR/"test_directories" / "examples_article", + UNITTESTDIR/"cfoods_scalar.yml", debug_tree=dbt) - subd = dbt.debug_tree[dircheckstr("DataAnalysis")] + subd = dbt.debug_tree[dircheckstr( + UNITTESTDIR/"test_directories" / "examples_article", "DataAnalysis")] assert len(subd) == 2 # variables store on Data Analysis node of debug tree if "Data" in subd[0]: diff --git a/unittests/test_scanner.py b/unittests/test_scanner.py new file mode 100644 index 0000000000000000000000000000000000000000..3d0a1e042c4483af608140df521cd9c087c71c70 --- /dev/null +++ b/unittests/test_scanner.py @@ -0,0 +1,169 @@ + +import json +import logging +import os +import warnings +from copy import deepcopy +from functools import partial +from os.path import basename, dirname, join 
+from pathlib import Path +from tempfile import NamedTemporaryFile +from unittest.mock import MagicMock, Mock, patch + +import caosdb as db +import caosdb.common.models as dbmodels +import pytest +import yaml +from caoscrawler.crawl import (Crawler, SecurityMode, _treat_deprecated_prefix, + crawler_main, split_restricted_path) +from caoscrawler.debug_tree import DebugTree +from caoscrawler.identifiable import Identifiable +from caoscrawler.identifiable_adapters import (CaosDBIdentifiableAdapter, + IdentifiableAdapter, + LocalStorageIdentifiableAdapter) +from caoscrawler.scanner import (create_converter_registry, load_definition, + scan_directory, scan_structure_elements) +from caoscrawler.stores import GeneralStore, RecordStore +from caoscrawler.structure_elements import (DictElement, DictListElement, + DictTextElement, File) +from caosdb.apiutils import compare_entities +from caosdb.cached import cache_clear +from caosdb.exceptions import EmptyUniqueQueryError +from pytest import raises + +from utils import dircheckstr as dircheck_base + +UNITTESTDIR = Path(__file__).parent + +dircheckstr = partial(dircheck_base, UNITTESTDIR/"test_directories" / "examples_article") + + +def test_scan_structure_elements(): + tmpfi = NamedTemporaryFile(delete=False) + with open(UNITTESTDIR/"example_datastructure.yml", "r") as f: + data = yaml.load(f, Loader=yaml.SafeLoader) + + crawler_definition = load_definition(UNITTESTDIR/"example_cfood.yml") + converter_registry = create_converter_registry(crawler_definition) + recs = scan_structure_elements(DictElement(name="", value=data), crawler_definition, + converter_registry) + assert len(recs) == 4 + + +def test_provenance_debug_data(): + # TODO rewrite the test to use a smaller example setup + tmpfi = NamedTemporaryFile(delete=False) + debug_tree = DebugTree() + with open(UNITTESTDIR/"example_datastructure.yml", "r") as f: + data = yaml.load(f, Loader=yaml.SafeLoader) + + crawler_definition = 
load_definition(UNITTESTDIR/"example_cfood.yml") + converter_registry = create_converter_registry(crawler_definition) + stuff = scan_structure_elements(DictElement(name="", value=data), crawler_definition, + converter_registry, debug_tree=debug_tree) + crawler = Crawler() + crawler.save_debug_data(tmpfi.name, debug_tree) + with open(tmpfi.name, "r") as f: + provenance = yaml.load(f, Loader=yaml.SafeLoader) + + pr = provenance["provenance"] + + def check_key_count(prefix): + return sum([1 for key in pr.keys() if key.startswith(prefix)]) + assert check_key_count("Ent") == 4 + + +def test_record_structure_generation(): + # TODO create a test from this that tests scan_structure + # the cfood should be minimal but cover typical scenarios (e.g. children) + # add also a minimal test for scan_directory; it can be very basic since the only difference + # to scan_structure is the kind of starting structure_element (check this statement) + # The test should not check debug tree output but actual created records + + # TODO test creation of debug information in a separate test + + dbt = DebugTree() + scan_directory(UNITTESTDIR/"test_directories" / "examples_article", + UNITTESTDIR/"scifolder_cfood.yml", + debug_tree=dbt) + subd = dbt.debug_tree[dircheckstr("DataAnalysis")] + subc = dbt.debug_metadata["copied"][dircheckstr("DataAnalysis")] + assert len(subd) == 2 + # variables store on Data Analysis node of debug tree + assert len(subd[0]) == 4 + # record store on Data Analysis node of debug tree + assert len(subd[1]) == 0 + assert len(subc) == 2 + assert len(subc[0]) == 4 + assert len(subc[1]) == 0 + + # The data analysis node creates one variable for the node itself: + assert subd[0]["DataAnalysis"] == "examples_article/DataAnalysis" + assert subc[0]["DataAnalysis"] is False + + subd = dbt.debug_tree[dircheckstr("DataAnalysis", "2020_climate-model-predict")] + subc = dbt.debug_metadata["copied"][dircheckstr("DataAnalysis", "2020_climate-model-predict")] + + assert len(subd[1]) 
== 1 + assert len(subd[1]["Project"].get_parents()) == 1 + assert subd[1]["Project"].get_parents()[0].name == "Project" + assert subd[1]["Project"].get_property("date").value == "2020" + assert subd[1]["Project"].get_property( + "identifier").value == "climate-model-predict" + + assert len(subd[0]) == 9 + assert subd[0]["date"] == "2020" + assert subd[0]["identifier"] == "climate-model-predict" + assert subd[0]["Project"].__class__ == db.Record + + assert subd[0]["DataAnalysis"] == "examples_article/DataAnalysis" + assert subc[0]["DataAnalysis"] is True + assert subd[0]["project_dir"] == "examples_article/DataAnalysis/2020_climate-model-predict" + assert subc[0]["project_dir"] is False + + # Check the copy flags for the first level in the hierarchy: + assert len(subc[0]) == 9 + assert len(subc[1]) == 1 + assert subc[1]["Project"] is False + assert subc[0]["Project"] is False + assert subc[0]["date"] is False + assert subc[0]["identifier"] is False + + subd = dbt.debug_tree[dircheckstr("DataAnalysis", + "2020_climate-model-predict", + "2020-02-08_prediction-errors")] + subc = dbt.debug_metadata["copied"][dircheckstr("DataAnalysis", + "2020_climate-model-predict", + "2020-02-08_prediction-errors")] + assert len(subd[0]) == 12 + assert subd[0]["date"] == "2020-02-08" + assert subd[0]["identifier"] == "prediction-errors" + assert subd[0]["Project"].__class__ == db.Record + assert subd[0]["Measurement"].__class__ == db.Record + + assert len(subd[1]) == 2 + + assert len(subd[1]["Project"].get_parents()) == 1 + assert subd[1]["Project"].get_parents()[0].name == "Project" + assert subd[1]["Project"].get_property("date").value == "2020" + assert subd[1]["Project"].get_property( + "identifier").value == "climate-model-predict" + + assert len(subd[1]["Measurement"].get_parents()) == 1 + assert subd[1]["Measurement"].get_parents()[0].name == "Measurement" + assert subd[1]["Measurement"].get_property("date").value == "2020-02-08" + assert subd[1]["Measurement"].get_property( + 
"identifier").value == "prediction-errors" + assert subd[1]["Measurement"].get_property("project").value != "$Project" + assert subd[1]["Measurement"].get_property( + "project").value.__class__ == db.Record + assert subd[1]["Measurement"].get_property( + "project").value == subd[0]["Project"] + + # Check the copy flags for the second level in the hierarchy: + assert subc[1]["Project"] is True + assert subc[0]["Project"] is True + assert subc[1]["Measurement"] is False + assert subc[0]["Measurement"] is False + assert subc[0]["date"] is False + assert subc[0]["identifier"] is False diff --git a/unittests/test_table_converter.py b/unittests/test_table_converter.py index d739695fc4c6a019f28f3c3697e3f134e0f1755e..12fcabf968e5319e0f2d0b569cb56afa7ac23fda 100644 --- a/unittests/test_table_converter.py +++ b/unittests/test_table_converter.py @@ -26,29 +26,31 @@ test the converters module """ -from caoscrawler.converters import Converter -from caoscrawler.stores import GeneralStore -from caoscrawler.scanner import scan_directory -from caoscrawler.debug_tree import DebugTree -from caoscrawler.converters import (ConverterValidationError, - DictConverter, XLSXTableConverter, CSVTableConverter) -from caoscrawler.structure_elements import Directory -from caoscrawler.structure_elements import (File, TextElement, ListElement, DictElement, - BooleanElement, IntegerElement, FloatElement) - -from os.path import join, dirname, basename - -from caoscrawler.identifiable_adapters import IdentifiableAdapter, LocalStorageIdentifiableAdapter - -import pytest -import os import importlib - import math +import os +from os.path import basename, dirname, join +from pathlib import Path +import caosdb as db +import pytest from caoscrawler import Crawler +from caoscrawler.converters import (Converter, ConverterValidationError, + CSVTableConverter, DictConverter, + XLSXTableConverter) +from caoscrawler.debug_tree import DebugTree +from caoscrawler.identifiable_adapters import (IdentifiableAdapter, 
+ LocalStorageIdentifiableAdapter) +from caoscrawler.scanner import scan_directory +from caoscrawler.stores import GeneralStore +from caoscrawler.structure_elements import (BooleanElement, DictElement, + Directory, File, FloatElement, + IntegerElement, ListElement, + TextElement) -import caosdb as db +from utils import dircheckstr + +UNITTESTDIR = Path(__file__).parent @pytest.fixture @@ -86,13 +88,6 @@ def rfp(*pathcomponents): return join(dirname(__file__), *pathcomponents) -def dircheckstr(*pathcomponents): - """ - Return the debug tree identifier for a given path. - """ - return "caoscrawler.structure_elements.File: " + basename(join(*pathcomponents)) + ", " + rfp("test_directories", "examples_tables", "ExperimentalData", *pathcomponents) - - def test_convert_table(converter_registry): extentions = ["xlsx", "csv", "tsv"] if importlib.util.find_spec("odf") is not None: @@ -151,7 +146,13 @@ def test_crawl_csv_table(): rfp("test_directories", "examples_tables", "crawler_for_tables.yml"), debug_tree=dbt) for file_ext in ["xlsx", "csv"]: - subd = dbt.debug_tree[dircheckstr("test1." + file_ext)] + print(dbt.debug_tree) + print(dircheckstr( + UNITTESTDIR / "test_directories" / "examples_tables" / "ExperimentalData", + "test1." + file_ext)) + subd = dbt.debug_tree[dircheckstr( + UNITTESTDIR / "test_directories" / "examples_tables" / "ExperimentalData", + "test1." + file_ext)] record_experiment = subd[1]["Experiment"] assert isinstance(record_experiment, db.Record) assert isinstance(record_experiment.get_property("Measurements").value, list) diff --git a/unittests/test_tool.py b/unittests/test_tool.py deleted file mode 100755 index ec3e0bb9e69a45416d23f3c7aba15ec759cabf77..0000000000000000000000000000000000000000 --- a/unittests/test_tool.py +++ /dev/null @@ -1,1024 +0,0 @@ -#!/usr/bin/env python3 -# encoding: utf-8 -# -# This file is a part of the CaosDB Project. 
-# -# Copyright (C) 2021 Alexander Schlemmer -# Copyright (C) 2023 Henrik tom Wörden <h.tomwoerden@indiscale.com> -# Copyright (C) 2023 IndiScale GmbH <info@indiscale.com> -# -# This program is free software: you can redistribute it and/or modify -# it under the terms of the GNU Affero General Public License as -# published by the Free Software Foundation, either version 3 of the -# License, or (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU Affero General Public License for more details. -# -# You should have received a copy of the GNU Affero General Public License -# along with this program. If not, see <https://www.gnu.org/licenses/>. -# - -""" -Tests for the tool using pytest -Adapted from check-sfs -""" -import logging - -from caoscrawler.stores import GeneralStore, RecordStore -import os -from caoscrawler.crawl import (_treat_deprecated_prefix, Crawler, crawler_main, - SecurityMode, split_restricted_path) -from caoscrawler.identifiable import Identifiable -from caoscrawler.structure_elements import File, DictTextElement, DictListElement, DictElement -from caoscrawler.scanner import scan_directory -from caoscrawler.debug_tree import DebugTree -from caoscrawler.identifiable_adapters import IdentifiableAdapter, LocalStorageIdentifiableAdapter -from simulated_server_data import full_data -from functools import partial -from copy import deepcopy -from unittest.mock import patch -import caosdb.common.models as dbmodels -from unittest.mock import MagicMock, Mock -from os.path import join, dirname, basename -import yaml -import caosdb as db -from caosdb.apiutils import compare_entities -from caosdb.cached import cache_clear -import pytest -from pytest import raises - -from caoscrawler.scanner import create_converter_registry, scan_structure_elements - - -def 
rfp(*pathcomponents): - """ - Return full path. - Shorthand convenience function. - """ - return join(dirname(__file__), *pathcomponents) - - -ident = LocalStorageIdentifiableAdapter() -ident.restore_state(rfp("records.xml")) -full_data.update({el.name: el for el in ident._records if el.name is not None}) -full_data.update({el.id: el for el in ident._records if el.name is None}) - - -def dircheckstr(*pathcomponents): - """ - Return the debug tree identifier for a given path. - """ - return ("caoscrawler.structure_elements.Directory: " + basename( - join(*pathcomponents)) + ", " + rfp( - "test_directories", "examples_article", *pathcomponents)) - - -@pytest.fixture(autouse=True) -def clear_cache(): - cache_clear() - - -@pytest.fixture -def crawler(): - crawler = Crawler() - debug_tree = DebugTree() - crawled_data = scan_directory( - rfp("test_directories", "examples_article"), - rfp("scifolder_cfood.yml"), debug_tree=debug_tree) - return crawler, crawled_data, debug_tree - - -@pytest.fixture -def ident(crawler): - ident = LocalStorageIdentifiableAdapter() - crawler[0].identifiableAdapter = ident - - # The records.xml file is constructed as follows: - # To a full run of the crawler, resolve all identifiables and insert all resulting entities. - # See: test-setup/datamodel/generate_test_data.py for details. 
- ident.restore_state(rfp("records.xml")) - - ident.register_identifiable( - "Person", db.RecordType() - .add_parent(name="Person") - .add_property(name="first_name") - .add_property(name="last_name")) - ident.register_identifiable( - "Measurement", db.RecordType() - .add_parent(name="Measurement") - .add_property(name="identifier") - .add_property(name="date") - .add_property(name="project")) - ident.register_identifiable( - "Project", db.RecordType() - .add_parent(name="Project") - .add_property(name="date") - .add_property(name="identifier")) - return ident - - -def test_record_structure_generation(): - # TODO How does this test relate to the test function in test_scalars_cfood with the same name? - # There seems to be code duplication - - dbt = DebugTree() - scan_directory(rfp("test_directories", "examples_article"), - rfp("scifolder_cfood.yml"), - debug_tree=dbt) - subd = dbt.debug_tree[dircheckstr("DataAnalysis")] - subc = dbt.debug_metadata["copied"][dircheckstr("DataAnalysis")] - assert len(subd) == 2 - # variables store on Data Analysis node of debug tree - assert len(subd[0]) == 4 - # record store on Data Analysis node of debug tree - assert len(subd[1]) == 0 - assert len(subc) == 2 - assert len(subc[0]) == 4 - assert len(subc[1]) == 0 - - # The data analysis node creates one variable for the node itself: - assert subd[0]["DataAnalysis"] == "examples_article/DataAnalysis" - assert subc[0]["DataAnalysis"] is False - - subd = dbt.debug_tree[dircheckstr( - "DataAnalysis", "2020_climate-model-predict")] - subc = dbt.debug_metadata["copied"][dircheckstr( - "DataAnalysis", "2020_climate-model-predict")] - - assert len(subd[1]) == 1 - assert len(subd[1]["Project"].get_parents()) == 1 - assert subd[1]["Project"].get_parents()[0].name == "Project" - assert subd[1]["Project"].get_property("date").value == "2020" - assert subd[1]["Project"].get_property( - "identifier").value == "climate-model-predict" - - assert len(subd[0]) == 9 - assert subd[0]["date"] == "2020" 
- assert subd[0]["identifier"] == "climate-model-predict" - assert subd[0]["Project"].__class__ == db.Record - - assert subd[0]["DataAnalysis"] == "examples_article/DataAnalysis" - assert subc[0]["DataAnalysis"] is True - assert subd[0]["project_dir"] == "examples_article/DataAnalysis/2020_climate-model-predict" - assert subc[0]["project_dir"] is False - - # Check the copy flags for the first level in the hierarchy: - assert len(subc[0]) == 9 - assert len(subc[1]) == 1 - assert subc[1]["Project"] is False - assert subc[0]["Project"] is False - assert subc[0]["date"] is False - assert subc[0]["identifier"] is False - - subd = dbt.debug_tree[dircheckstr("DataAnalysis", - "2020_climate-model-predict", - "2020-02-08_prediction-errors")] - subc = dbt.debug_metadata["copied"][dircheckstr("DataAnalysis", - "2020_climate-model-predict", - "2020-02-08_prediction-errors")] - assert len(subd[0]) == 12 - assert subd[0]["date"] == "2020-02-08" - assert subd[0]["identifier"] == "prediction-errors" - assert subd[0]["Project"].__class__ == db.Record - assert subd[0]["Measurement"].__class__ == db.Record - - assert len(subd[1]) == 2 - - assert len(subd[1]["Project"].get_parents()) == 1 - assert subd[1]["Project"].get_parents()[0].name == "Project" - assert subd[1]["Project"].get_property("date").value == "2020" - assert subd[1]["Project"].get_property( - "identifier").value == "climate-model-predict" - - assert len(subd[1]["Measurement"].get_parents()) == 1 - assert subd[1]["Measurement"].get_parents()[0].name == "Measurement" - assert subd[1]["Measurement"].get_property("date").value == "2020-02-08" - assert subd[1]["Measurement"].get_property( - "identifier").value == "prediction-errors" - assert subd[1]["Measurement"].get_property("project").value != "$Project" - assert subd[1]["Measurement"].get_property( - "project").value.__class__ == db.Record - assert subd[1]["Measurement"].get_property( - "project").value == subd[0]["Project"] - - # Check the copy flags for the second 
level in the hierarchy: - assert subc[1]["Project"] is True - assert subc[0]["Project"] is True - assert subc[1]["Measurement"] is False - assert subc[0]["Measurement"] is False - assert subc[0]["date"] is False - assert subc[0]["identifier"] is False - - -# def prepare_test_record_file(): -# ident = LocalStorageIdentifiableAdapter() -# crawler = Crawler(debug=True, identifiableAdapter=ident) -# crawler.crawl_directory(rfp("test_directories", "examples_article"), -# rfp("scifolder_cfood.yml")) - -# # clean record list: -# recordlist = ident.get_records() -# for i in range(len(recordlist)-1, 1, -1): -# if recordlist[i].parents[0].name == "Person": -# del recordlist[i] - -# ident.store_state(rfp("records.xml")) - - -def test_crawler_update_list(crawler, ident): - crawled_data = crawler[1] - # If the following assertions fail, that is a hint, that the test file records.xml has changed - # and this needs to be updated: - assert len(ident.get_records()) == 18 - assert len( - [r for r in ident.get_records() if r.parents[0].name == "Person"] - ) == 5 - assert len( - [r for r in ident.get_records() if r.parents[0].name == "Measurement"] - ) == 11 - assert len( - [r for r in ident.get_records() if r.parents[0].name == "Project"] - ) == 2 - - # The crawler contains lots of duplicates, because identifiables have not been resolved yet: - assert len(ident.get_records()) != len(crawled_data) - - # Check consistency: - # Check whether identifiables retrieved from current identifiable store return - # the same results. 
- - # take the first person in the list of records: - for r in ident.get_records(): - if r.parents[0].name == "Person": - r_cur = r - break - - id_r0 = ident.get_identifiable(r_cur) - assert r_cur.parents[0].name == id_r0.record_type - assert r_cur.get_property( - "first_name").value == id_r0.properties["first_name"] - assert r_cur.get_property( - "last_name").value == id_r0.properties["last_name"] - assert len(r_cur.parents) == 1 - assert len(r_cur.properties) == 2 - assert len(id_r0.properties) == 2 - - idr_r0_test = ident.retrieve_identified_record_for_identifiable(id_r0) - idr_r0 = ident.retrieve_identified_record_for_record(r_cur) - assert idr_r0 == idr_r0_test - - # take the first measurement in the list of records: - for r in ident.get_records(): - if r.parents[0].name == "Measurement": - r_cur = r - break - - id_r1 = ident.get_identifiable(r_cur) - assert r_cur.parents[0].name == id_r1.record_type - assert r_cur.get_property( - "identifier").value == id_r1.properties["identifier"] - assert r_cur.get_property("date").value == id_r1.properties["date"] - assert r_cur.get_property( - "project").value == id_r1.properties["project"] - assert len(r_cur.parents) == 1 - assert len(r_cur.properties) == 4 - assert len(id_r1.properties) == 3 - - idr_r1_test = ident.retrieve_identified_record_for_identifiable(id_r1) - idr_r1 = ident.retrieve_identified_record_for_record(r_cur) - assert idr_r1 == idr_r1_test - assert idr_r1 != idr_r0 - assert idr_r1_test != idr_r0_test - - assert len(idr_r1.properties) == 4 - assert r_cur.get_property( - "responsible").value == idr_r1.get_property("responsible").value - assert r_cur.description == idr_r1.description - - -def test_synchronization(crawler, ident): - insl, updl = crawler[0].synchronize(commit_changes=False, crawled_data=crawler[1]) - assert len(insl) == 0 - assert len(updl) == 0 - - -def test_remove_unnecessary_updates(): - # test trvial case - upl = [db.Record().add_parent("A")] - irs = [db.Record().add_parent("A")] - 
updates = Crawler.remove_unnecessary_updates(upl, irs) - assert len(updates) == 0 - - # test property difference case - # TODO this should work right? - # upl = [db.Record().add_parent("A").add_property("a", 3)] - # irs = [db.Record().add_parent("A")] # ID should be s - # Crawler.remove_unnecessary_updates(upl, irs) - # assert len(upl) == 1 - - # test value difference case - upl = [db.Record().add_parent("A").add_property("a", 5)] - irs = [db.Record().add_parent("A").add_property("a")] - updates = Crawler.remove_unnecessary_updates(upl, irs) - assert len(updates) == 1 - upl = [db.Record().add_parent("A").add_property("a", 5)] - irs = [db.Record().add_parent("A").add_property("a", 5)] - updates = Crawler.remove_unnecessary_updates(upl, irs) - assert len(updates) == 0 - - # test unit difference case - upl = [db.Record().add_parent("A").add_property("a", unit='cm')] - irs = [db.Record().add_parent("A").add_property("a")] - updates = Crawler.remove_unnecessary_updates(upl, irs) - assert len(updates) == 1 - - # test None difference case - upl = [db.Record().add_parent("A").add_property("a")] - irs = [db.Record().add_parent("A").add_property("a", 5)] - updates = Crawler.remove_unnecessary_updates(upl, irs) - assert len(updates) == 1 - - -# Current status: -# TODO: currently, this test fails, because non identifiable records cannot -# be inserted into the cache. Solution might be, just not to add them -# into the local cache. Probably in split_into_inserts_and_updates. 
-@pytest.mark.xfail -def test_identifiable_adapter_no_identifiable(crawler, ident): - del ident._registered_identifiables["Person"] - insl, updl = crawler[0].synchronize() - assert len(updl) == 0 - - pers = [r for r in crawler[0].crawled_data if r.parents[0].name == "Person"] - # All persons are inserted, because they are not identifiable: - assert len(insl) == len(pers) - - -def test_provenance_debug_data(crawler): - crawler[0].save_debug_data(rfp("provenance.yml"), debug_tree=crawler[2]) - - with open(rfp("provenance.yml"), "r") as f: - provenance = yaml.load(f, Loader=yaml.SafeLoader) - - pr = provenance["provenance"] - - def check_key_count(prefix): - return sum([1 for key in pr.keys() if key.startswith(prefix)]) - assert check_key_count("Measurement") == 11 - assert check_key_count("Project") == 5 - assert check_key_count("Person") == 14 - - -def test_split_into_inserts_and_updates_trivial(crawler): - crawler[0].split_into_inserts_and_updates([]) - - -def basic_retrieve_by_name_mock_up(rec, referencing_entities=None, known=None): - """ returns a stored Record if rec.name is an existing key, None otherwise """ - if rec.name in known: - return known[rec.name] - else: - return None - - -@pytest.fixture -def crawler_mocked_identifiable_retrieve(crawler): - # mock retrieval of registered identifiabls: return Record with just a parent - crawler[0].identifiableAdapter.get_registered_identifiable = Mock( - side_effect=lambda x: db.Record().add_parent(x.parents[0].name)) - - # Simulate remote server content by using the names to identify records - # There is only a single known Record with name A - crawler[0].identifiableAdapter.retrieve_identified_record_for_record = Mock(side_effect=partial( - basic_retrieve_by_name_mock_up, known={"A": db.Record(id=1111, name="A")})) - crawler[0].identifiableAdapter.retrieve_identified_record_for_identifiable = Mock( - side_effect=partial( - basic_retrieve_by_name_mock_up, known={"A": db.Record(id=1111, name="A")})) - return crawler 
- - -def test_split_into_inserts_and_updates_single(crawler_mocked_identifiable_retrieve): - crawler = crawler_mocked_identifiable_retrieve[0] - identlist = [Identifiable(name="A", record_type="C"), Identifiable(name="B", record_type="C")] - entlist = [db.Record(name="A").add_parent( - "C"), db.Record(name="B").add_parent("C")] - - assert crawler.get_from_any_cache(identlist[0]) is None - assert crawler.get_from_any_cache(identlist[1]) is None - assert not crawler._has_reference_value_without_id(identlist[0]) - assert not crawler._has_reference_value_without_id(identlist[1]) - assert crawler.identifiableAdapter.retrieve_identified_record_for_record( - identlist[0]).id == 1111 - assert crawler.identifiableAdapter.retrieve_identified_record_for_record( - identlist[1]) is None - - insert, update = crawler.split_into_inserts_and_updates(deepcopy(entlist)) - assert len(insert) == 1 - assert insert[0].name == "B" - assert len(update) == 1 - assert update[0].name == "A" - # if this ever fails, the mock up may be removed - crawler.identifiableAdapter.get_registered_identifiable.assert_called() - crawler.identifiableAdapter.retrieve_identified_record_for_identifiable.assert_called() - - -def test_split_into_inserts_and_updates_with_duplicate(crawler_mocked_identifiable_retrieve): - crawler = crawler_mocked_identifiable_retrieve[0] - a = db.Record(name="A").add_parent("C") - b = db.Record(name="B").add_parent("C") - b.add_property("A", a) - # This is identical to a and should be removed - c = db.Record(name="A").add_parent("C") - entlist = [a, b, c] - insert, update = crawler.split_into_inserts_and_updates(deepcopy(entlist)) - assert len(insert) == 1 - assert insert[0].name == "B" - assert len(update) == 1 - assert update[0].name == "A" - # if this ever fails, the mock up may be removed - crawler.identifiableAdapter.get_registered_identifiable.assert_called() - crawler.identifiableAdapter.retrieve_identified_record_for_identifiable.assert_called() - - -def 
test_split_into_inserts_and_updates_with_ref(crawler_mocked_identifiable_retrieve): - crawler = crawler_mocked_identifiable_retrieve[0] - # try it with a reference - a = db.Record(name="A").add_parent("C") - b = db.Record(name="B").add_parent("C") - b.add_property("A", a) - entlist = [a, b] - insert, update = crawler.split_into_inserts_and_updates(entlist) - assert len(insert) == 1 - assert insert[0].name == "B" - assert len(update) == 1 - assert update[0].name == "A" - # if this ever fails, the mock up may be removed - crawler.identifiableAdapter.get_registered_identifiable.assert_called() - crawler.identifiableAdapter.retrieve_identified_record_for_identifiable.assert_called() - - -def test_split_into_inserts_and_updates_with_circ(crawler): - # try circular - a = db.Record(name="A").add_parent("C") - b = db.Record(name="B").add_parent("C") - b.add_property("A", a) - a.add_property("B", b) - entlist = [a, b] - # TODO this does not seem to be complete! - - -def test_split_into_inserts_and_updates_with_complex(crawler_mocked_identifiable_retrieve): - crawler = crawler_mocked_identifiable_retrieve[0] - # A - # ^ - # | - # F <- B <- G - a = db.Record(name="A").add_parent("C").add_property( - 'd', 13).add_property('e', "lskdjlsfdj") - b = db.Record(name="B").add_parent("C") - g = db.Record(name="G").add_parent("C") - f = db.Record(name="F").add_parent("C") - g.add_property("A", a) - b.add_property("A", f) - b.add_property("A", a) - entlist = [a, b, g] - insert, update = crawler.split_into_inserts_and_updates(entlist) - assert len(insert) == 3 - assert "B" in [el.name for el in insert] - assert len(update) == 1 - assert update[0].name == "A" - # if this ever fails, the mock up may be removed - crawler.identifiableAdapter.get_registered_identifiable.assert_called() - crawler.identifiableAdapter.retrieve_identified_record_for_identifiable.assert_called() - - # TODO write test where the unresoled entity is not part of the identifiable - - -def 
test_split_into_inserts_and_updates_with_copy_attr(crawler_mocked_identifiable_retrieve): - crawler = crawler_mocked_identifiable_retrieve[0] - # assume identifiable is only the name - a = db.Record(name="A").add_parent("C") - a.add_property("foo", 1) - b = db.Record(name="A").add_parent("C") - b.add_property("bar", 2) - entlist = [a, b] - insert, update = crawler.split_into_inserts_and_updates(entlist) - - assert update[0].get_property("bar").value == 2 - assert update[0].get_property("foo").value == 1 - # if this ever fails, the mock up may be removed - crawler.identifiableAdapter.get_registered_identifiable.assert_called() - crawler.identifiableAdapter.retrieve_identified_record_for_identifiable.assert_called() - - -def test_has_missing_object_in_references(crawler): - # Simulate remote server content by using the names to identify records - # There are only two known Records with name A and B - crawler[0].identifiableAdapter.get_registered_identifiable = Mock(side_effect=partial( - basic_retrieve_by_name_mock_up, known={"C": db.Record(name="C").add_parent("RTC") - .add_property("d"), - "D": db.Record(name="D").add_parent("RTD") - .add_property("d").add_property("e"), - })) - - # one reference with id -> check - assert not crawler[0]._has_missing_object_in_references( - Identifiable(name="C", record_type="RTC", properties={'d': 123}), []) - # one ref with Entity with id -> check - assert not crawler[0]._has_missing_object_in_references( - Identifiable(name="C", record_type="RTC", properties={'d': db.Record(id=123) - .add_parent("C")}), []) - # one ref with id one with Entity with id (mixed) -> check - assert not crawler[0]._has_missing_object_in_references( - Identifiable(name="C", record_type="RTD", - properties={'d': 123, 'b': db.Record(id=123).add_parent("RTC")}), []) - # entity to be referenced in the following - a = db.Record(name="C").add_parent("C").add_property("d", 12311) - # one ref with id one with Entity without id (but not identifying) -> fail - 
assert not crawler[0]._has_missing_object_in_references( - Identifiable(name="C", record_type="RTC", properties={'d': 123, 'e': a}), []) - - # one ref with id one with Entity without id (mixed) -> fail - assert not crawler[0]._has_missing_object_in_references( - Identifiable(name="D", record_type="RTD", properties={'d': 123, 'e': a}), []) - - crawler[0].add_to_remote_missing_cache(a, Identifiable(name="C", record_type="RTC", - properties={'d': 12311})) - # one ref with id one with Entity without id but in cache -> check - assert crawler[0]._has_missing_object_in_references( - Identifiable(name="D", record_type="RTD", properties={'d': 123, 'e': a}), []) - - # if this ever fails, the mock up may be removed - crawler[0].identifiableAdapter.get_registered_identifiable.assert_called() - - -@pytest.mark.xfail() -def test_references_entities_without_ids(crawler, ident): - assert not crawler[0]._has_reference_value_without_id(db.Record().add_parent("Person") - .add_property('last_name', 123) - .add_property('first_name', 123)) - # id and rec with id - assert not crawler[0]._has_reference_value_without_id(db.Record().add_parent("Person") - .add_property('first_name', 123) - .add_property('last_name', - db.Record(id=123))) - # id and rec with id and one unneeded prop - assert crawler[0]._has_reference_value_without_id(db.Record().add_parent("Person") - .add_property('first_name', 123) - .add_property('stuff', db.Record()) - .add_property('last_name', db.Record(id=123))) - - # one identifying prop is missing - assert crawler[0]._has_reference_value_without_id(db.Record().add_parent("Person") - .add_property('first_name', 123) - .add_property('last_name', db.Record())) - - -def test_replace_entities_with_ids(crawler): - a = (db.Record().add_parent("B").add_property("A", 12345) - .add_property("B", db.Record(id=12345)) - .add_property("C", [db.Record(id=12345), 233324])) - - crawler[0].replace_entities_with_ids(a) - assert a.get_property("A").value == 12345 - assert 
a.get_property("B").value == 12345 - assert a.get_property("C").value == [12345, 233324] - - -def mock_get_entity_by(eid=None, name=None): - if eid is not None: - candidates = [el for el in list(full_data.values()) if el.id == eid] - if len(candidates) > 0: - return candidates[0] - else: - raise ValueError() - if name is not None: - candidates = [el for el in full_data.values() - if (el.name is not None and el.name.lower() == name.lower())] - if len(candidates) > 0: - return candidates[0] - else: - raise ValueError() - - -def prepare_crawler_with_sec_mode(mode, ident): - crawler = Crawler(securityMode=mode) - debug_tree = DebugTree() - crawled_data = scan_directory( - rfp("test_directories", "examples_article"), - rfp("scifolder_cfood.yml"), debug_tree=debug_tree) - crawler.identifiableAdapter = ident - - return crawler, crawled_data, debug_tree - - -def reset_mocks(mocks): - for mock in mocks: - mock.reset_mock() - - -def change_identifiable_prop(ident): - """ - This function is supposed to change a non identifiing property. - """ - for ent in ident._records: - if len(ent.parents) == 0 or ent.parents[0].name != "Measurement": - continue - for prop in ent.properties: - if prop.name != "date": - continue - # change one element; This removes a responsible which is not part of the identifiable - prop.value = "2022-01-04" - return - # If it does not work, this test is not implemented properly - raise RuntimeError("Did not find the property that should be changed.") - - -def change_non_identifiable_prop(ident): - """ - This function is supposed to change a non identifiing property. 
- """ - for ent in ident._records: - if len(ent.parents) == 0 or ent.parents[0].name != "Measurement": - continue - - for prop in ent.properties: - if prop.name != "responsible" or len(prop.value) < 2: - continue - # change one element; This removes a responsible which is not part of the identifiable - del prop.value[-1] - return - raise RuntimeError("Did not find the property that should be changed.") - - -@patch("caoscrawler.crawl.cached_get_entity_by", - new=Mock(side_effect=mock_get_entity_by)) -@patch("caoscrawler.crawl.db.Container.insert") -@patch("caoscrawler.crawl.db.Container.update") -@patch("caoscrawler.crawl.UpdateCache.insert") -def test_security_mode(updateCacheMock, upmock, insmock, ident): - records_backup = deepcopy(ident._records) - - # trivial case: nothing to do - crawler, crawled_data, debug_tree = prepare_crawler_with_sec_mode(SecurityMode.RETRIEVE, ident) - crawler.synchronize(commit_changes=True, crawled_data=crawled_data) - assert crawler.run_id is not None - insmock.assert_not_called() - upmock.assert_not_called() - updateCacheMock.assert_not_called() - - # RETRIEVE: insert only - crawler, crawled_data, debug_tree = prepare_crawler_with_sec_mode(SecurityMode.RETRIEVE, ident) - # remove one element - del ident._records[-1] - # insert forbidden - crawler.synchronize(commit_changes=True, crawled_data=crawled_data) - assert crawler.run_id is not None - insmock.assert_not_called() - upmock.assert_not_called() - assert updateCacheMock.call_count == 1 - # reset counts - reset_mocks([updateCacheMock, insmock, upmock]) - # restore original ident - ident._records = deepcopy(records_backup) - - # RETRIEVE: update only - crawler, crawled_data, debug_tree = prepare_crawler_with_sec_mode(SecurityMode.RETRIEVE, ident) - # change one element - change_non_identifiable_prop(ident) - crawler.synchronize(commit_changes=True, crawled_data=crawled_data) - assert crawler.run_id is not None - insmock.assert_not_called() - upmock.assert_not_called() - assert 
updateCacheMock.call_count == 1 - # reset counts - reset_mocks([updateCacheMock, insmock, upmock]) - # restore original ident - ident._records = deepcopy(records_backup) - - # INSERT: insert only - crawler, crawled_data, debug_tree = prepare_crawler_with_sec_mode(SecurityMode.INSERT, ident) - # remove one element - del ident._records[-1] - crawler.synchronize(commit_changes=True, crawled_data=crawled_data) - assert crawler.run_id is not None - insmock.assert_called_once() - upmock.assert_not_called() - updateCacheMock.assert_not_called() - # reset counts - reset_mocks([updateCacheMock, insmock, upmock]) - # restore original ident - ident._records = deepcopy(records_backup) - - # INSERT: update only - crawler, crawled_data, debug_tree = prepare_crawler_with_sec_mode(SecurityMode.INSERT, ident) - # change one element - change_non_identifiable_prop(ident) - crawler.synchronize(commit_changes=True, crawled_data=crawled_data) - assert crawler.run_id is not None - insmock.assert_not_called() - upmock.assert_not_called() - updateCacheMock.assert_called_once() - # reset counts - reset_mocks([updateCacheMock, insmock, upmock]) - # restore original ident - ident._records = deepcopy(records_backup) - - # INSERT: insert and update - crawler, crawled_data, debug_tree = prepare_crawler_with_sec_mode(SecurityMode.INSERT, ident) - # change two elements - change_non_identifiable_prop(ident) - change_identifiable_prop(ident) - crawler.synchronize(commit_changes=True, crawled_data=crawled_data) - assert crawler.run_id is not None - insmock.asser_called_once() - upmock.assert_not_called() - updateCacheMock.assert_called_once() - # reset counts - reset_mocks([updateCacheMock, insmock, upmock]) - # restore original ident - ident._records = deepcopy(records_backup) - - -def test_create_reference_mapping(): - a = db.Record().add_parent("A") - b = db.Record().add_parent("B").add_property('a', a) - ref = Crawler.create_reference_mapping([a, b]) - assert id(a) in ref - assert id(b) not in 
ref - assert "B" in ref[id(a)] - assert ref[id(a)]["B"] == [b] - - -def test_create_flat_list(): - a = db.Record() - b = db.Record() - a.add_property(name="a", value=a) - a.add_property(name="b", value=b) - flat = Crawler.create_flat_list([a]) - assert len(flat) == 2 - assert a in flat - assert b in flat - c = db.Record() - c.add_property(name="a", value=a) - # This would caus recursion if it is not dealt with properly. - a.add_property(name="c", value=c) - flat = Crawler.create_flat_list([c]) - assert len(flat) == 3 - assert a in flat - assert b in flat - assert c in flat - - -@pytest.fixture -def crawler_mocked_for_backref_test(crawler): - # mock retrieval of registered identifiabls: return Record with just a parent - def get_reg_ident(x): - if x.parents[0].name == "C": - return db.Record().add_parent(x.parents[0].name).add_property( - "is_referenced_by", value=["BR"]) - elif x.parents[0].name == "D": - return db.Record().add_parent(x.parents[0].name).add_property( - "is_referenced_by", value=["BR", "BR2"]) - else: - return db.Record().add_parent(x.parents[0].name) - crawler[0].identifiableAdapter.get_registered_identifiable = Mock(side_effect=get_reg_ident) - - # Simulate remote server content by using the names to identify records - # There is only a single known Record with name A - crawler[0].identifiableAdapter.retrieve_identified_record_for_record = Mock(side_effect=partial( - basic_retrieve_by_name_mock_up, known={"A": - db.Record(id=1111, name="A").add_parent("BR")})) - crawler[0].identifiableAdapter.retrieve_identified_record_for_identifiable = Mock( - side_effect=partial( - basic_retrieve_by_name_mock_up, known={"A": - db.Record(id=1111, name="A").add_parent("BR")})) - return crawler - - -def test_validation_error_print(caplog): - caplog.set_level(logging.DEBUG, logger="caoscrawler.converters") - # there should be no server interaction since we only test the behavior if a validation error - # occurs during the data collection stage - DATADIR = 
os.path.join(os.path.dirname(__file__), "test_data", "failing_validation") - for fi in ["cfood.yml", "cfood2.yml"]: - ret = crawler_main(DATADIR, - os.path.join(DATADIR, fi), - os.path.join(DATADIR, "identifiables.yml"), - True, - None, - False) - assert "Couldn't validate" in caplog.text - caplog.clear() - - -def test_split_into_inserts_and_updates_backref(crawler_mocked_for_backref_test): - crawler = crawler_mocked_for_backref_test[0] - identlist = [Identifiable(name="A", record_type="BR"), - Identifiable(name="B", record_type="C", backrefs=[db.Entity()])] - referenced = db.Record(name="B").add_parent("C") - entlist = [referenced, db.Record(name="A").add_parent("BR").add_property("ref", referenced), ] - - # Test without referencing object - # currently a NotImplementedError is raised if necessary properties are missing. - with raises(NotImplementedError): - crawler.split_into_inserts_and_updates([db.Record(name="B").add_parent("C")]) - - # identifiables were not yet checked - assert crawler.get_from_any_cache(identlist[0]) is None - assert crawler.get_from_any_cache(identlist[1]) is None - # one with reference, one without - assert not crawler._has_reference_value_without_id(identlist[0]) - assert crawler._has_reference_value_without_id(identlist[1]) - # one can be found remotely, one not - assert crawler.identifiableAdapter.retrieve_identified_record_for_record( - identlist[0]).id == 1111 - assert crawler.identifiableAdapter.retrieve_identified_record_for_record( - identlist[1]) is None - - # check the split... 
- insert, update = crawler.split_into_inserts_and_updates(deepcopy(entlist)) - # A was found remotely and is therefore in the update list - assert len(update) == 1 - assert update[0].name == "A" - # B does not exist on the (simulated) remote server - assert len(insert) == 1 - assert insert[0].name == "B" - - -def test_split_into_inserts_and_updates_mult_backref(crawler_mocked_for_backref_test): - # test whether multiple references of the same record type are correctly used - crawler = crawler_mocked_for_backref_test[0] - referenced = db.Record(name="B").add_parent("C") - entlist = [referenced, - db.Record(name="A").add_parent("BR").add_property("ref", referenced), - db.Record(name="C").add_parent("BR").add_property("ref", referenced), - ] - - # test whether both entities are listed in the backref attribute of the identifiable - referencing_entities = crawler.create_reference_mapping(entlist) - identifiable = crawler.identifiableAdapter.get_identifiable(referenced, referencing_entities) - assert len(identifiable.backrefs) == 2 - - # check the split... - insert, update = crawler.split_into_inserts_and_updates(deepcopy(entlist)) - assert len(update) == 1 - assert len(insert) == 2 - - -def test_split_into_inserts_and_updates_diff_backref(crawler_mocked_for_backref_test): - # test whether multiple references of the different record types are correctly used - crawler = crawler_mocked_for_backref_test[0] - referenced = db.Record(name="B").add_parent("D") - entlist = [referenced, - db.Record(name="A").add_parent("BR").add_property("ref", referenced), - db.Record(name="A").add_parent("BR2").add_property("ref", referenced), - ] - - # test whether both entities are listed in the backref attribute of the identifiable - referencing_entities = crawler.create_reference_mapping(entlist) - identifiable = crawler.identifiableAdapter.get_identifiable(referenced, referencing_entities) - assert len(identifiable.backrefs) == 2 - - # check the split... 
- insert, update = crawler.split_into_inserts_and_updates(deepcopy(entlist)) - assert len(update) == 2 - assert len(insert) == 1 - - -def mock_create_values(values, element): - pass - - -@patch("caoscrawler.converters.IntegerElementConverter.create_values") -def test_restricted_path(create_mock): - """ - The restricted_path argument allows to ignroe part of the crawled data structure. Here, we make - sure, that is that argument is provided, ideed only the given path of the tree is traversed. - - The check is done using the mock of the create_values function of the IntegerElementConverter. - This function is only called if elements are being treated. - """ - crawler_definition = { - "DictTest": { - "type": "DictElement", - "match": "(.*)", - "subtree": { - "nextdict": { - "type": "DictElement", - "match": "(.*)", - "subtree": { - "int_element": { - "type": "IntegerElement", - "match_name": ".*", - "match_value": "(?P<int_value>.*)", - "records": { - "Dataset": { - "Subject": "$int_value" - } - } - } - } - } - } - } - } - - crawler = Crawler() - converter_registry = create_converter_registry(crawler_definition) - - # This structure is crawled - test_dict = { - "v1": { - "a": 1, - "b": 2, - }, - "v2": { - "c": 3, - "d": 4, - } - } - # first test without a restricted_path - restricted_path = None - records = scan_structure_elements( - DictElement("TestDict", test_dict), crawler_definition, converter_registry, - restricted_path - ) - assert create_mock.call_count == 4 - create_mock.reset_mock() - - # test with a restricted_path but one that has no effect (single root element) - # this also tests that the remainder of the tree is fully traversed - restricted_path = ["TestDict"] - records = scan_structure_elements( - DictElement("TestDict", test_dict), crawler_definition, converter_registry, - restricted_path - ) - assert create_mock.call_count == 4 - create_mock.reset_mock() - - # test with a restricted_path that restricts the tree (single root element) - restricted_path 
= ["TestDict", "v2"] - records = scan_structure_elements( - DictElement("TestDict", test_dict), crawler_definition, converter_registry, - restricted_path - ) - assert create_mock.call_count == 2 - create_mock.reset_mock() - - # test with a restricted_path that contains a bad element - restricted_path = ["TestDict", "v3"] - with raises(RuntimeError): - records = scan_structure_elements( - DictElement("TestDict", test_dict), crawler_definition, converter_registry, - restricted_path - ) - - -def test_split_restricted_path(): - assert ["el"] == split_restricted_path("/el") - assert ["el"] == split_restricted_path("/el/") - assert ["el", "el"] == split_restricted_path("/el/el") - - -# Filter the warning because we want to have it here and this way it does not hinder running -# tests with -Werror. -@pytest.mark.filterwarnings("ignore:The prefix:DeprecationWarning") -def test_deprecated_prefix_option(): - """Test that calling the crawler's main function with the deprecated - `prefix` option raises the correct errors and warnings. 
- - """ - - with pytest.deprecated_call(): - crawler_main("./", rfp("scifolder_cfood.yml"), prefix="to/be/removed") - - # Check that crawler main terminates with an error - assert 1 == crawler_main("./", rfp("scifolder_cfood.yml"), prefix="to/be/removed", - remove_prefix="to/be/removed") - - with raises(ValueError) as ve: - - _treat_deprecated_prefix(prefix="to/be/removed", remove_prefix="to/be/removed") - - assert "(deprecated) `prefix` and the `remove_prefix`" in str(ve.value) - - -def test_create_entity_summary(): - assert "" == Crawler.create_entity_summary([]).strip() - - entities = [ - db.Record(id=1).add_parent("A"), - db.Record(id=4, name='a').add_parent("B"), - db.Record(id=5).add_parent("A"), - db.Record(id=6, name='b').add_parent("B"), - ] - text = Crawler.create_entity_summary(entities).strip() - assert 'a' in text - assert 'b' in text - assert 'A:' in text - assert 'B:' in text - assert "<a href='/Entity/4'>a</a>, <a href='/Entity/6'>b</a>" in text diff --git a/unittests/test_tool_extended.py b/unittests/test_tool_extended.py deleted file mode 100644 index 7dd4282e4c6d206c8c360424d865b9f736b5e582..0000000000000000000000000000000000000000 --- a/unittests/test_tool_extended.py +++ /dev/null @@ -1,84 +0,0 @@ -#!/bin/python -# Tests for the tool using pytest -# Adapted from check-sfs -# A. Schlemmer, 06/2021 - -from caoscrawler import Crawler -from caoscrawler.structure_elements import File, DictTextElement, DictListElement -from caoscrawler.identifiable_adapters import IdentifiableAdapter, LocalStorageIdentifiableAdapter -from caoscrawler.scanner import scan_directory -from functools import partial -from caoscrawler.debug_tree import DebugTree -from copy import deepcopy -from unittest.mock import MagicMock, Mock -from os.path import join, dirname, basename -import yaml -import caosdb as db -from caosdb.apiutils import compare_entities - -import pytest -from pytest import raises - - -def rfp(*pathcomponents): - """ - Return full path. 
- Shorthand convenience function. - """ - return join(dirname(__file__), *pathcomponents) - - -def dircheckstr(*pathcomponents, structure_element_type="Directory"): - """ - Return the debug tree identifier for a given path. - """ - return ("caoscrawler.structure_elements." + structure_element_type + ": " + - basename(join(*pathcomponents)) + ", " + - rfp("test_directories", "examples_article", *pathcomponents)) - - -@pytest.fixture -def crawler(): - crawler = Crawler(debug=True) - crawler.crawl_directory(rfp("test_directories", "examples_article"), - rfp("scifolder_extended.yml")) - return crawler - - -# @pytest.fixture -# def ident(crawler): -# ident = LocalStorageIdentifiableAdapter() -# crawler.identifiableAdapter = ident - -# ident.restore_state(rfp("records.xml")) - -# ident.register_identifiable( -# "Person", db.RecordType() -# .add_parent(name="Person") -# .add_property(name="first_name") -# .add_property(name="last_name")) -# ident.register_identifiable( -# "Measurement", db.RecordType() -# .add_parent(name="Measurement") -# .add_property(name="identifier") -# .add_property(name="date") -# .add_property(name="project")) -# ident.register_identifiable( -# "Project", db.RecordType() -# .add_parent(name="Project") -# .add_property(name="date") -# .add_property(name="identifier")) -# return ident - - -def test_file_structure_generation(): - dbt = DebugTree() - scan_directory(rfp("test_directories", "examples_article"), - rfp("scifolder_extended.yml"), - debug_tree=dbt) - sd = dbt.debug_tree[dircheckstr("SimulationData", - "2020_climate-model-predict", "2020-02-01", - "README.md", structure_element_type="File")] - assert sd[1]["ReadmeFile"].role == "File" - assert len(sd[1]["ReadmeFile"].path) > 0 - assert len(sd[1]["ReadmeFile"].file) > 0 diff --git a/unittests/test_variable_substitutions.py b/unittests/test_variable_substitutions.py index f13e759982e8102bbf37e65311ff4073ba52e5a2..9d1981c773e481e78eb4f82e785e27ee8f8d00d6 100644 --- 
a/unittests/test_variable_substitutions.py +++ b/unittests/test_variable_substitutions.py @@ -1,23 +1,49 @@ -#!/bin/python -# Tests for variable substitutions -# A. Schlemmer, 05/2022 +#!/usr/bin/env python3 +# encoding: utf-8 +# +# This file is a part of the CaosDB Project. +# +# Copyright (C) 2022 Alexander Schlemmer +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <https://www.gnu.org/licenses/>. +# -from caoscrawler.debug_tree import DebugTree -from caoscrawler import Crawler -from caoscrawler.scanner import scan_directory -from caoscrawler.structure_elements import File, DictTextElement, DictListElement -from caoscrawler.identifiable_adapters import IdentifiableAdapter, LocalStorageIdentifiableAdapter -from functools import partial from copy import deepcopy +from functools import partial +from os.path import basename, dirname, join +from pathlib import Path from unittest.mock import MagicMock, Mock -from os.path import join, dirname, basename -import yaml -import caosdb as db -from caosdb.apiutils import compare_entities +import caosdb as db import pytest +import yaml +from caoscrawler import Crawler +from caoscrawler.debug_tree import DebugTree +from caoscrawler.identifiable_adapters import (IdentifiableAdapter, + LocalStorageIdentifiableAdapter) +from caoscrawler.scanner import scan_directory +from caoscrawler.structure_elements import (DictListElement, DictTextElement, + File) 
+from caosdb.apiutils import compare_entities from pytest import raises +from utils import dircheckstr as dircheckstr_base + +UNITTESTDIR = Path(__file__).parent +dircheckstr = partial(dircheckstr_base, UNITTESTDIR/"test_directories" / + "example_substitutions") + def rfp(*pathcomponents): """ @@ -27,13 +53,6 @@ def rfp(*pathcomponents): return join(dirname(__file__), *pathcomponents) -def dircheckstr(element_type, *pathcomponents): - """ - Return the debug tree identifier for a given path. - """ - return "caoscrawler.structure_elements." + element_type + ": " + basename(join(*pathcomponents)) + ", " + rfp("test_directories", "example_substitutions", *pathcomponents) - - def test_substitutions(): dbt = DebugTree() @@ -42,13 +61,12 @@ def test_substitutions(): debug_tree=dbt) # @review Florian Spreckelsen 2022-05-13 for i in range(2): - subd = dbt.debug_tree[dircheckstr( - "File", "ExperimentalData", "220512_data.dat")] + subd = dbt.debug_tree[dircheckstr("ExperimentalData", "220512_data.dat")] assert subd[i]["Experiment"].get_property("date").value == "2022-05-12" assert isinstance(subd[i]["ExperimentSeries"].get_property( "Experiment").value, db.Record) - subd = dbt.debug_tree[dircheckstr("Directory", "ExperimentalData")] + subd = dbt.debug_tree[dircheckstr("ExperimentalData")] assert subd[i]["Project"].name == "project" assert isinstance(subd[i]["Project"].get_property( "Experiments").value, list) @@ -69,8 +87,7 @@ def test_substitutions_parents(): # This is a test for: # https://gitlab.indiscale.com/caosdb/src/caosdb-crawler/-/issues/35 # ... testing whether variable substitutions can be used in parent declarations. 
- subd = dbt.debug_tree[dircheckstr( - "File", "ExperimentalData", "220512_data.dat")] + subd = dbt.debug_tree[dircheckstr("ExperimentalData", "220512_data.dat")] # subd[0] <- generalStore # subd[1] <- recordStore @@ -89,8 +106,7 @@ def test_empty_parents(): # This is a test for: # https://gitlab.com/caosdb/caosdb-crawler/-/issues/8 - subd = dbt.debug_tree[dircheckstr( - "File", "ExperimentalData", "220512_data.dat")] + subd = dbt.debug_tree[dircheckstr("ExperimentalData", "220512_data.dat")] parents = subd[1]["RecordWithoutParents"].get_parents() assert len(parents) == 0 diff --git a/unittests/utils.py b/unittests/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..a9649dea686c33dc33d0d7636d08aa51beb35412 --- /dev/null +++ b/unittests/utils.py @@ -0,0 +1,40 @@ +#!/usr/bin/env python3 +# encoding: utf-8 +# +# This file is a part of the CaosDB Project. +# +# Copyright (C) 2023 Indiscale GmbH <info@indiscale.com> +# Copyright (C) 2023 Henrik tom Wörden <h.tomwoerden@indiscale.com> +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <https://www.gnu.org/licenses/>. +# +import os +from pathlib import Path + +""" +utilities for tests +""" +UNITTESTDIR = Path(__file__).parent + + +def dircheckstr(prefix, *pathcomponents): + """ + Return the debug tree identifier for a given path. 
+ """ + if os.path.isdir(os.path.join(prefix, *pathcomponents)): + ftype = "Directory" + else: + ftype = "File" + return (f"caoscrawler.structure_elements.{ftype}: " + os.path.basename( + os.path.join(*pathcomponents)) + ", " + os.path.join(prefix, *pathcomponents))