diff --git a/integrationtests/test_issues.py b/integrationtests/test_issues.py
index 08e254daf4052670fcec18760626c460604efe15..86ce9307a74606bea03aa83b273de259041abf58 100644
--- a/integrationtests/test_issues.py
+++ b/integrationtests/test_issues.py
@@ -19,6 +19,7 @@ from pytest import fixture, mark
 
 import caosdb as db
+from caosdb.cached import cache_clear
 
 from caoscrawler.crawl import Crawler
 from caoscrawler.identifiable_adapters import CaosDBIdentifiableAdapter
 
@@ -30,6 +31,11 @@ from caosdb.utils.register_tests import clear_database, set_test_key
 set_test_key("10b128cf8a1372f30aa3697466bb55e76974e0c16a599bb44ace88f19c8f61e2")
 
 
+@fixture(autouse=True)
+def clear_cache():
+    cache_clear()
+
+
 def test_issue_23(clear_database):
     """Test that an update leaves existing properties, that were not found by the crawler,
     unchanged.
diff --git a/integrationtests/test_realworld_example.py b/integrationtests/test_realworld_example.py
index 45873ddeb8b4f4a23fbcbc9225cbeea60b213cc4..82644947a3cdc85a38be3403615b51fe1f4ded50 100644
--- a/integrationtests/test_realworld_example.py
+++ b/integrationtests/test_realworld_example.py
@@ -30,6 +30,7 @@ import json
 import os
 
 import caosdb as db
+from caosdb.cached import cache_clear
 
 from caoscrawler.crawl import Crawler, crawler_main
 from caoscrawler.identifiable_adapters import CaosDBIdentifiableAdapter
@@ -56,6 +57,11 @@ def rfp(*pathcomponents):
 DATADIR = rfp("test_data", "extroot", "realworld_example")
 
 
+@pytest.fixture(autouse=True)
+def clear_cache():
+    cache_clear()
+
+
 @pytest.fixture
 def addfiles():
     loadpath(path='/opt/caosdb/mnt/extroot/',
diff --git a/integrationtests/test_use_case_simple_presentation.py b/integrationtests/test_use_case_simple_presentation.py
index 5f66e6fae7a77a315437f4a030110ded8d0ce867..cf38e951b78534806c0ea76ef58051436aa22704 100644
--- a/integrationtests/test_use_case_simple_presentation.py
+++ b/integrationtests/test_use_case_simple_presentation.py
@@ -29,6 +29,7 @@ from subprocess import run
 
 import caosdb as db
 from caosadvancedtools.loadFiles import loadpath
+from caosdb.cached import cache_clear
 from caosadvancedtools.models import parser as parser
 from caoscrawler.crawl import crawler_main
 from caosdb.utils.register_tests import clear_database, set_test_key
@@ -39,6 +40,11 @@ DATADIR = os.path.join(os.path.dirname(__file__), "test_data",
                        "extroot", "use_case_simple_presentation")
 
 
+@pytest.fixture(autouse=True)
+def clear_cache():
+    cache_clear()
+
+
 def test_complete_crawler(clear_database, caplog):
     # Setup the data model:
     model = parser.parse_model_from_yaml(os.path.join(DATADIR, "model.yml"))
diff --git a/setup.cfg b/setup.cfg
index cb5267e17d506ef32645b4eb7c547c38b5059a8a..8e642ee95db791973b1e8da9c868bc6f15671111 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -20,7 +20,7 @@ packages = find:
 python_requires = >=3.7
 install_requires =
     importlib-resources
-    caosdb >= 0.11.0
+    caosdb > 0.11.2
     caosadvancedtools >= 0.7.0
     yaml-header-tools >= 0.2.1
     pyyaml
diff --git a/src/caoscrawler/crawl.py b/src/caoscrawler/crawl.py
index 7b9119caa1cd4dd4623a9141de4a70abb4da5946..7542a6592c0abfeb03056ea9ef5f230ecac7564a 100644
--- a/src/caoscrawler/crawl.py
+++ b/src/caoscrawler/crawl.py
@@ -37,7 +37,9 @@ import importlib
 import logging
 import os
 import sys
+from caosdb.exceptions import EmptyUniqueQueryError
 import uuid
+from caosdb.cached import cached_get_entity_by, cache_clear
 import warnings
 
 import yaml
@@ -573,7 +575,10 @@ class Crawler(object):
                 del flat[i]
             # 2. Can it be identified via a path?
             elif record.path is not None:
-                existing = self._get_entity_by_path(record.path)
+                try:
+                    existing = cached_get_entity_by(path=record.path)
+                except EmptyUniqueQueryError:
+                    existing = None
                 if existing is None:
                     to_be_inserted.append(record)
                     self.add_to_remote_missing_cache(record, identifiable)
@@ -728,7 +733,7 @@ class Crawler(object):
         parent_updates = db.Container()
 
         for entity in to_be_updated:
-            old_entity = Crawler._get_entity_by_id(entity.id)
+            old_entity = cached_get_entity_by(eid=entity.id)
 
             # Check whether the parents have been changed and add them if missing
             # in the old entity:
@@ -757,28 +762,13 @@ class Crawler(object):
                 "mode. This might lead to a failure of inserts that follow.")
             logger.info(parent_updates)
 
-    @staticmethod
-    def _get_entity_by_name(name):
-        return db.Entity(name=name).retrieve()
-
-    @staticmethod
-    def _get_entity_by_path(path):
-        try:
-            return db.execute_query(f"FIND FILE WHICH IS STORED AT '{path}'", unique=True)
-        except db.exceptions.EmptyUniqueQueryError:
-            return None
-
-    @staticmethod
-    def _get_entity_by_id(id):
-        return db.Entity(id=id).retrieve()
-
     @staticmethod
     def execute_inserts_in_list(to_be_inserted, securityMode,
                                 run_id: Optional[uuid.UUID] = None,
                                 unique_names=True):
         for record in to_be_inserted:
             for prop in record.properties:
-                entity = Crawler._get_entity_by_name(prop.name)
+                entity = cached_get_entity_by(name=prop.name)
                 _resolve_datatype(prop, entity)
         logger.debug("INSERT")
         logger.debug(to_be_inserted)
@@ -794,10 +784,10 @@ class Crawler(object):
         for record in rec_list:
             for parent in record.parents:
                 if parent.id is None:
-                    parent.id = Crawler._get_entity_by_name(parent.name).id
+                    parent.id = cached_get_entity_by(name=parent.name).id
             for prop in record.properties:
                 if prop.id is None:
-                    entity = Crawler._get_entity_by_name(prop.name)
+                    entity = cached_get_entity_by(name=prop.name)
                     prop.id = entity.id
                     _resolve_datatype(prop, entity)
 
@@ -861,6 +851,7 @@ class Crawler(object):
         logger.info(f"Going to insert {len(to_be_inserted)} Entities and update "
                     f"{len(to_be_updated)} Entities.")
         if commit_changes:
+            cache_clear()
             self.execute_parent_updates_in_list(to_be_updated, securityMode=self.securityMode,
                                                 run_id=self.run_id, unique_names=unique_names)
             logger.info(f"Added parent RecordTypes where necessary.")
diff --git a/src/caoscrawler/identifiable.py b/src/caoscrawler/identifiable.py
index eda113d8fc0c5fc64a620ef7540dec4004401aef..75af5be8f06a6ab95a4b7f2b92eda8cf3e321a1b 100644
--- a/src/caoscrawler/identifiable.py
+++ b/src/caoscrawler/identifiable.py
@@ -65,7 +65,7 @@ class Identifiable():
         self.path = path
         self.record_type = record_type
         self.name = name
-        if name is "":
+        if name == "":
             self.name = None
         self.properties: dict = {}
         if properties is not None:
diff --git a/src/doc/concepts.rst b/src/doc/concepts.rst
index 0881d9302b621d6b47575e171dd9e8c144e29cd4..0b15f9b5d9aebefc2137b234ac4a9440b84906f5 100644
--- a/src/doc/concepts.rst
+++ b/src/doc/concepts.rst
@@ -173,3 +173,27 @@ Example:
 
 File Objects
 ============
+
+TODO
+
+Caching
++++++++
+
+The Crawler uses the cached library function ``cached_get_entity_by`` from
+``caosdb.cached``. The cache is cleared automatically whenever the Crawler
+commits updates, but a Python process that runs indefinitely would otherwise
+keep seeing stale entities. Thus, please make sure to clear the cache
+explicitly in long-running Python processes.
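+
+A minimal sketch of clearing the cache in such a process (``crawl_once`` is
+assumed to be a callable that triggers a single crawler run):
+
+.. code-block:: python
+
+   from caosdb.cached import cache_clear
+
+   def crawl_periodically(crawl_once):
+       """Call ``crawl_once`` repeatedly, clearing the cache before each run."""
+       while True:
+           cache_clear()  # drop cached entities so remote changes become visible
+           crawl_once()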
diff --git a/unittests/test_tool.py b/unittests/test_tool.py
index b88720f4da89dfa735e782a4d2e41ccc3b0f4d3c..ec3e0bb9e69a45416d23f3c7aba15ec759cabf77 100755
--- a/unittests/test_tool.py
+++ b/unittests/test_tool.py
@@ -46,7 +46,7 @@ from os.path import join, dirname, basename
 import yaml
 import caosdb as db
 from caosdb.apiutils import compare_entities
-
+from caosdb.cached import cache_clear
 import pytest
 from pytest import raises
 
@@ -76,6 +81,11 @@ def dircheckstr(*pathcomponents):
                         "test_directories", "examples_article", *pathcomponents))
 
 
+@pytest.fixture(autouse=True)
+def clear_cache():
+    cache_clear()
+
+
 @pytest.fixture
 def crawler():
     crawler = Crawler()
@@ -583,21 +588,20 @@ def test_replace_entities_with_ids(crawler):
     assert a.get_property("C").value == [12345, 233324]
 
 
-def mock_get_entity_by_id(id):
-    candidates = [el for el in list(full_data.values()) if el.id == id]
-    if len(candidates) > 0:
-        return candidates[0]
-    else:
-        raise ValueError()
-
-
-def mock_get_entity_by_name(name):
-    candidates = [el for el in full_data.values()
-                  if (el.name is not None and el.name.lower() == name.lower())]
-    if len(candidates) > 0:
-        return candidates[0]
-    else:
-        raise ValueError()
+def mock_get_entity_by(eid=None, name=None):
+    if eid is not None:
+        candidates = [el for el in list(full_data.values()) if el.id == eid]
+        if len(candidates) > 0:
+            return candidates[0]
+        else:
+            raise ValueError()
+    if name is not None:
+        candidates = [el for el in full_data.values()
+                      if (el.name is not None and el.name.lower() == name.lower())]
+        if len(candidates) > 0:
+            return candidates[0]
+        else:
+            raise ValueError()
 
 
 def prepare_crawler_with_sec_mode(mode, ident):
@@ -650,10 +654,8 @@ def change_non_identifiable_prop(ident):
     raise RuntimeError("Did not find the property that should be changed.")
 
 
-@patch("caoscrawler.crawl.Crawler._get_entity_by_id",
-       new=Mock(side_effect=mock_get_entity_by_id))
-@patch("caoscrawler.crawl.Crawler._get_entity_by_name",
-       new=Mock(side_effect=mock_get_entity_by_name))
+@patch("caoscrawler.crawl.cached_get_entity_by",
+       new=Mock(side_effect=mock_get_entity_by))
 @patch("caoscrawler.crawl.db.Container.insert")
 @patch("caoscrawler.crawl.db.Container.update")
 @patch("caoscrawler.crawl.UpdateCache.insert")