diff --git a/integrationtests/test_issues.py b/integrationtests/test_issues.py index 08e254daf4052670fcec18760626c460604efe15..5a141f944ec695f2186060c4046a3e32b0579874 100644 --- a/integrationtests/test_issues.py +++ b/integrationtests/test_issues.py @@ -30,6 +30,11 @@ from caosdb.utils.register_tests import clear_database, set_test_key set_test_key("10b128cf8a1372f30aa3697466bb55e76974e0c16a599bb44ace88f19c8f61e2") +@pytest.fixture(autouse=True) +def clear_cache(): + cache_clear() + + def test_issue_23(clear_database): """Test that an update leaves existing properties, that were not found by the crawler, unchanged. diff --git a/integrationtests/test_realworld_example.py b/integrationtests/test_realworld_example.py index 45873ddeb8b4f4a23fbcbc9225cbeea60b213cc4..e7b4ab379016d71dcd0171da5bbae948806226a0 100644 --- a/integrationtests/test_realworld_example.py +++ b/integrationtests/test_realworld_example.py @@ -56,6 +56,11 @@ def rfp(*pathcomponents): DATADIR = rfp("test_data", "extroot", "realworld_example") +@pytest.fixture(autouse=True) +def clear_cache(): + cache_clear() + + @pytest.fixture def addfiles(): loadpath(path='/opt/caosdb/mnt/extroot/', diff --git a/integrationtests/test_use_case_simple_presentation.py b/integrationtests/test_use_case_simple_presentation.py index 5f66e6fae7a77a315437f4a030110ded8d0ce867..8721b51551301085e44b5febdd1b67ead5364d7b 100644 --- a/integrationtests/test_use_case_simple_presentation.py +++ b/integrationtests/test_use_case_simple_presentation.py @@ -39,6 +39,11 @@ DATADIR = os.path.join(os.path.dirname(__file__), "test_data", "extroot", "use_case_simple_presentation") +@pytest.fixture(autouse=True) +def clear_cache(): + cache_clear() + + def test_complete_crawler(clear_database, caplog): # Setup the data model: model = parser.parse_model_from_yaml(os.path.join(DATADIR, "model.yml")) diff --git a/src/caoscrawler/crawl.py b/src/caoscrawler/crawl.py index 0af64fd2b2d9e835d6d8273921aa93981b510338..41063f166df82901b9d91125c2d0dad82e3adcc7 
100644 --- a/src/caoscrawler/crawl.py +++ b/src/caoscrawler/crawl.py @@ -38,6 +38,7 @@ import logging import os import sys import uuid +from caosdb.cached import cached_get_entity_by, cache_clear import warnings import yaml @@ -573,7 +574,7 @@ class Crawler(object): del flat[i] # 2. Can it be identified via a path? elif record.path is not None: - existing = self._get_entity_by_path(record.path) + existing = cached_get_entity_by(path=record.path) if existing is None: to_be_inserted.append(record) self.add_to_remote_missing_cache(record, identifiable) @@ -728,7 +729,7 @@ class Crawler(object): parent_updates = db.Container() for entity in to_be_updated: - old_entity = Crawler._get_entity_by_id(entity.id) + old_entity = cached_get_entity_by(id=entity.id) # Check whether the parents have been changed and add them if missing # in the old entity: @@ -757,28 +758,13 @@ class Crawler(object): "mode. This might lead to a failure of inserts that follow.") logger.info(parent_updates) - @staticmethod - def _get_entity_by_name(name): - return db.Entity(name=name).retrieve() - - @staticmethod - def _get_entity_by_path(path): - try: - return db.execute_query(f"FIND FILE WHICH IS STORED AT '{path}'", unique=True) - except db.exceptions.EmptyUniqueQueryError: - return None - - @staticmethod - def _get_entity_by_id(id): - return db.Entity(id=id).retrieve() - @staticmethod def execute_inserts_in_list(to_be_inserted, securityMode, run_id: Optional[uuid.UUID] = None, unique_names=True): for record in to_be_inserted: for prop in record.properties: - entity = Crawler._get_entity_by_name(prop.name) + entity = cached_get_entity_by(name=prop.name) _resolve_datatype(prop, entity) logger.debug("INSERT") logger.debug(to_be_inserted) @@ -794,10 +780,10 @@ class Crawler(object): for record in rec_list: for parent in record.parents: if parent.id is None: - parent.id = Crawler._get_entity_by_name(parent.name).id + parent.id = cached_get_entity_by(name=parent.name).id for prop in record.properties: 
if prop.id is None: - entity = Crawler._get_entity_by_name(prop.name) + entity = cached_get_entity_by(name=prop.name) prop.id = entity.id _resolve_datatype(prop, entity) @@ -860,6 +846,7 @@ class Crawler(object): logger.info(f"Going to insert {len(to_be_inserted)} Entities and update " f"{len(to_be_updated)} Entities.") if commit_changes: + cache_clear() self.execute_parent_updates_in_list(to_be_updated, securityMode=self.securityMode, run_id=self.run_id, unique_names=unique_names) logger.info(f"Added parent RecordTypes where necessary.") diff --git a/src/doc/concepts.rst b/src/doc/concepts.rst index 0881d9302b621d6b47575e171dd9e8c144e29cd4..0b15f9b5d9aebefc2137b234ac4a9440b84906f5 100644 --- a/src/doc/concepts.rst +++ b/src/doc/concepts.rst @@ -173,3 +173,14 @@ Example: File Objects ============ + +TODO + +Caching ++++++++ + +The Crawler uses the cached library function ``cached_get_entity_by``. The +cache is cleared automatically when the Crawler does updates, but if you were +to run the same Python process indefinitely, the Crawler would not see changes +due to the cache. Thus, please make sure to clear the cache if you create +long-running Python processes. 
diff --git a/unittests/test_tool.py b/unittests/test_tool.py index b88720f4da89dfa735e782a4d2e41ccc3b0f4d3c..c8b11523211c66c1dfe6d7bf2d6938bdf9ff662a 100755 --- a/unittests/test_tool.py +++ b/unittests/test_tool.py @@ -46,7 +46,7 @@ from os.path import join, dirname, basename import yaml import caosdb as db from caosdb.apiutils import compare_entities - +from caosdb.cached import cache_clear import pytest from pytest import raises @@ -76,6 +76,11 @@ def dircheckstr(*pathcomponents): "test_directories", "examples_article", *pathcomponents)) +@pytest.fixture(autouse=True) +def clear_cache(): + cache_clear() + + @pytest.fixture def crawler(): crawler = Crawler() @@ -583,21 +588,20 @@ def test_replace_entities_with_ids(crawler): assert a.get_property("C").value == [12345, 233324] -def mock_get_entity_by_id(id): - candidates = [el for el in list(full_data.values()) if el.id == id] - if len(candidates) > 0: - return candidates[0] - else: - raise ValueError() - - -def mock_get_entity_by_name(name): - candidates = [el for el in full_data.values() - if (el.name is not None and el.name.lower() == name.lower())] - if len(candidates) > 0: - return candidates[0] - else: - raise ValueError() +def mock_get_entity_by(id=None, name=None): + if id is not None: + candidates = [el for el in list(full_data.values()) if el.id == id] + if len(candidates) > 0: + return candidates[0] + else: + raise ValueError() + if name is not None: + candidates = [el for el in full_data.values() + if (el.name is not None and el.name.lower() == name.lower())] + if len(candidates) > 0: + return candidates[0] + else: + raise ValueError() def prepare_crawler_with_sec_mode(mode, ident): @@ -650,10 +654,8 @@ def change_non_identifiable_prop(ident): raise RuntimeError("Did not find the property that should be changed.") -@patch("caoscrawler.crawl.Crawler._get_entity_by_id", - new=Mock(side_effect=mock_get_entity_by_id)) -@patch("caoscrawler.crawl.Crawler._get_entity_by_name", - 
new=Mock(side_effect=mock_get_entity_by_name)) +@patch("caoscrawler.crawl.cached_get_entity_by", + new=Mock(side_effect=mock_get_entity_by)) @patch("caoscrawler.crawl.db.Container.insert") @patch("caoscrawler.crawl.db.Container.update") @patch("caoscrawler.crawl.UpdateCache.insert")