diff --git a/CHANGELOG.md b/CHANGELOG.md index 205fcb077cb0ca8f8facc6c1f636539f9bbb2384..b14e227fc2e7d7b8c61fd704d0935b7ff78acbd3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Added ### - Added location argument to `src/caosdb/utils/checkFileSystemConsistency.py` +- Entity getters: `get_entity_by_<name/id/path>` +- Cached versions of entity getters and of execute_query (cached_query) ### Changed ### diff --git a/src/caosdb/__init__.py b/src/caosdb/__init__.py index 7e06885fe495c1e8c4ccc99b7d0c0f8ff8c34b5b..acf323e860a93753b57f2e104531383b412f3fa0 100644 --- a/src/caosdb/__init__.py +++ b/src/caosdb/__init__.py @@ -46,6 +46,7 @@ from caosdb.common.models import (ACL, ALL, FIX, NONE, OBLIGATORY, RECOMMENDED, Query, QueryTemplate, Record, RecordType, delete, execute_query, get_global_acl, get_known_permissions, raise_errors) +from caosdb.utils.get_entity import get_entity_by_name, get_entity_by_path, get_entity_by_id from caosdb.configuration import _read_config_files, configure, get_config from caosdb.connection.connection import configure_connection, get_connection from caosdb.exceptions import * diff --git a/src/caosdb/cached.py b/src/caosdb/cached.py new file mode 100644 index 0000000000000000000000000000000000000000..fd9e32557ab7f5001520895b8a87758e9681fdab --- /dev/null +++ b/src/caosdb/cached.py @@ -0,0 +1,159 @@ +# -*- coding: utf-8 -*- +# +# This file is a part of the CaosDB Project. +# +# Copyright (C) 2023 Henrik tom Wörden <h.tomwoerden@indiscale.com> +# Copyright (C) 2023 IndiScale GmbH <info@indiscale.com> +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. 
class AccessType(Enum):
    """Kinds of information by which this module looks up entities."""
    QUERY = 1
    PATH = 2
    EID = 3
    NAME = 4


def cached_get_entity_by(eid: Union[str, int, None] = None, name: Union[str, None] = None,
                         path: Union[str, None] = None,
                         query: Union[str, None] = None) -> Entity:
    """Return the single entity that is uniquely identified by one of the arguments.

    Exactly one of the arguments must be supplied. The lookup goes through the shared cache,
    so repeated calls with the same argument do not contact the server again.

    Parameters
    ----------
    eid : str or int, optional
        ID of the entity.
    name : str, optional
        Name of the entity.
    path : str, optional
        Path of a file entity.
    query : str, optional
        Query string that is executed as a unique query.

    Raises
    ------
    ValueError
        If none or more than one of the arguments is given.
    """
    supplied = [
        (kind, value)
        for kind, value in (
            (AccessType.EID, eid),
            (AccessType.NAME, name),
            (AccessType.PATH, path),
            (AccessType.QUERY, query),
        )
        if value is not None
    ]
    if len(supplied) != 1:
        raise ValueError("You must supply exactly one argument.")

    kind, value = supplied[0]
    # ``unique`` is passed by keyword on purpose: lru_cache distinguishes positional from keyword
    # arguments, and fill_cache stores its entries with the keyword form.
    return _cached_access(kind, value, unique=True)


def cached_query(query_string: str) -> Container:
    """Cached version of ``execute_query`` for non-unique queries.

    The result for a given query string is stored in the shared cache and returned as is on
    subsequent calls.
    """
    return _cached_access(AccessType.QUERY, query_string, unique=False)


@lru_cache(maxsize=DEFAULT_SIZE)
def _cached_access(kind: AccessType, value: Union[str, int], unique=True):
    """Look up ``value`` according to ``kind``; this is the function that is actually cached.

    Because ``kind`` and ``unique`` are part of the cache key, the cache has kind of separate
    sections for cached_query and for cached_get_entity_by with the different AccessTypes.
    However, there is only one cache and thus only one cache size.

    ``unique`` only affects AccessType.QUERY; the entity getters always run unique queries.
    An unknown ``kind`` yields None.
    """
    # The dumb dict caches are only consulted to allow filling the cache manually (fill_cache).
    # execute_query and the getters are resolved at call time (not stored in a dispatch table),
    # so that they can be patched in tests.
    if kind == AccessType.QUERY:
        if value in _DUMB_BY_QUERY_CACHE:
            return _DUMB_BY_QUERY_CACHE[value]
        if unique:
            # Without this, a unique lookup by query would return a Container, not an Entity.
            return execute_query(value, unique=True)
        # The non-unique call is deliberately left as a plain ``execute_query(value)``.
        return execute_query(value)
    elif kind == AccessType.NAME:
        if value in _DUMB_BY_NAME_CACHE:
            return _DUMB_BY_NAME_CACHE[value]
        return get_entity.get_entity_by_name(value)
    elif kind == AccessType.EID:
        if value in _DUMB_BY_EID_CACHE:
            return _DUMB_BY_EID_CACHE[value]
        return get_entity.get_entity_by_id(value)
    elif kind == AccessType.PATH:
        if value in _DUMB_BY_PATH_CACHE:
            return _DUMB_BY_PATH_CACHE[value]
        return get_entity.get_entity_by_path(value)
    return None


def cache_clear() -> None:
    """Empty the cache that is used by cached_query and cached_get_entity_by."""
    _cached_access.cache_clear()


def cache_info():
    """Return the statistics of the cache used by cached_query and cached_get_entity_by.

    The result is the ``lru_cache`` info tuple with the fields hits, misses, maxsize and
    currsize.
    """
    return _cached_access.cache_info()


def cache_initialize(maxsize=DEFAULT_SIZE) -> None:
    """Create a new cache with the given size for cached_query and cached_get_entity_by.

    The old cache is removed with all its content.
    """
    global _cached_access
    # Re-wrap the undecorated function; the previous lru_cache wrapper (and its data) is dropped.
    _cached_access = functools.lru_cache(maxsize=maxsize)(_cached_access.__wrapped__)
def get_entity_by_name(name: str) -> Entity:
    """Return the result of a unique query that uses the name to find the correct entity.

    Submits the query "FIND ENTITY WITH name='{name}'".

    Parameters
    ----------
    name : str
        Name of the entity to retrieve. It is inserted into the query string verbatim.

    Returns
    -------
    Entity
        The result of ``execute_query(..., unique=True)`` for that query.
    """
    # The query string was empty before, so the name was never sent to the server.
    return execute_query(f"FIND ENTITY WITH name='{name}'", unique=True)
+ """ + return execute_query(f"FIND ENTITY WITH id='{eid}'", unique=True) + + +def get_entity_by_path(path: str) -> Entity: + """returns the result of a unique query that uses the path to find the correct entity + + Submits the query "FIND FILE WHICH IS STORED AT '{path}'". + """ + return execute_query(f"FIND FILE WHICH IS STORED AT '{path}'", unique=True) diff --git a/src/doc/tutorials/Data-Insertion.rst b/src/doc/tutorials/Data-Insertion.rst index 9060345c836a43c57f8039d40020082a7f342b02..c1b401591efdac8c343e20130efb8be1798dad9c 100644 --- a/src/doc/tutorials/Data-Insertion.rst +++ b/src/doc/tutorials/Data-Insertion.rst @@ -223,6 +223,28 @@ list-valued attribute in Python, as the following example illustrates. print(retrieved.get_property("TestList").value) +.. note:: + Properties of Entities that shall be updated need to have IDs. Let's look at an + example: + +.. code:: python + + experiment = db.Record(id=1111).retrieve() + experiment.add_property(name='date', value="2020-01-01") + experiment.update() # Fails! The date Property needs to have an ID. + +The easiest way to get around this is to use the corresponding entity getter: + +.. code:: python + + experiment = db.Record(id=1111).retrieve() + experiment.add_property(db.get_entity_by_name('date'), value="2020-01-01") + experiment.update() # Works! + +There are also the functions ``get_entity_by_path`` and ``get_entity_by_id``. You can easily use +cached versions of those functions (see :doc:`Entity Getters<Entity-Getters>`). + + File Update ----------- diff --git a/src/doc/tutorials/Entity-Getters.rst b/src/doc/tutorials/Entity-Getters.rst new file mode 100644 index 0000000000000000000000000000000000000000..383d94322200815575b16eebf1fd7766d64b200f --- /dev/null +++ b/src/doc/tutorials/Entity-Getters.rst @@ -0,0 +1,15 @@ + +Entity Getters +============== + +There is a very frequent situation when using PyCaosDB: You need to get a specific Entity from the +remote server.
For example, you need the Property Entity in order to make an update. Sure, you can +do a ``db.Entity().retrieve()`` or submit a query, but there is an even faster way which also helps +prevent errors: + +- get_entity_by_name +- get_entity_by_id +- get_entity_by_path + +You can call them with a single argument (name/id/path). Since those are used quite +frequently, you might want to look at the :doc:`caching options<caching>`. diff --git a/src/doc/tutorials/caching.rst b/src/doc/tutorials/caching.rst new file mode 100644 index 0000000000000000000000000000000000000000..76f0d45d00825280ea8d79e048db5e789d38a93c --- /dev/null +++ b/src/doc/tutorials/caching.rst @@ -0,0 +1,57 @@ + +Caching +======= + +.. note:: + +  Caching is great, because it can speed up things considerably. But it can also create nasty +  pitfalls if the cache is not cleared when needed and you work with outdated data. Thus, please use +  the cache with care and make sure to clear it when needed. + +Python provides great tools for caching. For example, it is very easy to create a +cached version of the ``get_entity_by_name`` function using Python's ``lru_cache``: + +.. code:: python + + @lru_cache(maxsize=1000) + def cached_get_by_name(name): + return db.get_entity_by_name(name) + + exp = cached_get_by_name('Experiment') + # reset the cache with + cached_get_by_name.cache_clear() + +For convenience we also provide the ``caosdb.cached`` module that defines the functions +``cached_query`` and ``cached_get_entity_by``. They share a single cache with a single size. Let's have a +look: + +..
code:: python + + from caosdb.cached import cached_query, cached_get_entity_by, cache_clear, cache_info, cache_initialize + rt1 = cached_get_entity_by(name='RT1') + qresult = cached_query('FIND Experiment WITH parameter=1') + # you can inspect the cache + print(cache_info()) + # this will not cause a server request since it is cached + rt1 = cached_get_entity_by(name='RT1') + # you can clear the cache with + cache_clear() + # If you want to have a cache with a different size, you can initialize it (again). Old cached + # data is lost. + cache_initialize(maxsize=10) + + +In case you want to add Entities manually to the cache (e.g. because you happen to have the one +million entities that you will be using already at hand from another query), you can do so with + +.. code:: python + + from caosdb.cached import fill_cache, AccessType + # Here, items needs to be a dict with Entity IDs as keys and the Entities as values + fill_cache(items, AccessType.EID, unique=True) + # if you now use IDs that were in items, they are taken from the cache + e1 = cached_get_entity_by(eid=10001) + +When filling the cache for the cached entity getter (``cached_get_entity_by``) you need to set ``unique=True``. + + diff --git a/src/doc/tutorials/index.rst b/src/doc/tutorials/index.rst index 0b08d0b4fe153d803a780bd144787819b827db78..ce37993d7ec5e0888da8a2b4c58904bcbdc43bb4 100644 --- a/src/doc/tutorials/index.rst +++ b/src/doc/tutorials/index.rst @@ -14,6 +14,8 @@ advanced usage of the Python client. basic_analysis Data-Insertion errors + Entity-Getters + caching data-model-interface complex_data_models serverside diff --git a/unittests/test_cached.py b/unittests/test_cached.py new file mode 100644 index 0000000000000000000000000000000000000000..19c7769cc7729bc82a7665f038a154e81aba237d --- /dev/null +++ b/unittests/test_cached.py @@ -0,0 +1,208 @@ +# -*- coding: utf-8 -*- +# +# This file is a part of the CaosDB Project.
def mocked_name_query(name):
    """Stand-in for the name getter: serve the dummy entity with the given name."""
    matches = [entity for entity in DUMMY_SERVER_CONTENT if entity.name == name]
    # hand out a copy, because Entities would normally be created from the XML response
    return deepcopy(matches[0])


def mocked_id_query(eid):
    """Stand-in for the id getter: serve the dummy entity with the given id."""
    matches = [entity for entity in DUMMY_SERVER_CONTENT if entity.id == eid]
    # hand out a copy, because Entities would normally be created from the XML response
    return deepcopy(matches[0])


def mocked_path_query(path):
    """Stand-in for the path getter: serve the dummy entity with the given path."""
    matches = [entity for entity in DUMMY_SERVER_CONTENT if entity.path == path]
    # hand out a copy, because Entities would normally be created from the XML response
    return deepcopy(matches[0])
@patch("caosdb.utils.get_entity.get_entity_by_name")
def test_get_by_name(mocked_get_by_name):
    """Check caching, clearing and manual filling of the cache for look-ups by name."""
    mocked_get_by_name.side_effect = mocked_name_query
    # first call; not in cache -> the mocked getter is called
    a = cached_get_entity_by(name='a')
    assert a.id == 101
    assert mocked_get_by_name.call_count == 1
    # second call; in cache -> the mocked getter is NOT called (count is still 1)
    b = cached_get_entity_by(name='a')
    assert mocked_get_by_name.call_count == 1
    # the cache returned the same object
    assert a is b
    # check the info
    assert cache_info().hits == 1
    assert cache_info().currsize == 1
    # after clearing the cache, the mock is used again
    cache_clear()
    cached_get_entity_by(name='a')
    assert mocked_get_by_name.call_count == 2
    # we fill the cache manually and make sure the element is used
    fill_cache({'lol': db.Entity(id=10001, name='lol')}, AccessType.NAME, unique=True)
    # there are now two elements in the cache: a and lol
    assert cache_info().currsize == 2
    # we can retrieve the inserted element
    lol = cached_get_entity_by(name='lol')
    assert lol.id == 10001
    # this did not touch the mocked function
    assert mocked_get_by_name.call_count == 2
    # make sure normal retrieval still works (count +1)
    c = cached_get_entity_by(name='c')
    assert mocked_get_by_name.call_count == 3
    assert c.id == 103
check the info + assert cache_info().hits == 1 + assert cache_info().currsize == 1 + # after clearing the test, the mock is used again + cache_clear() + cached_get_entity_by(eid=102) + assert mocked_get_by_id.call_count == 2 + # we fill the cache manually and make sure the element is used + fill_cache({10001: db.Entity(id=10001, name='lol')}, AccessType.EID, unique=True) + # there are now two elements in the cache: a and lol + assert cache_info().currsize == 2 + # we can retrieve the inserted element + lol = cached_get_entity_by(eid=10001) + assert lol.name == 'lol' + # this did not touch the mocked function + assert mocked_get_by_id.call_count == 2 + # make sure normal retrieval still works (count +1) + c = cached_get_entity_by(eid=103) + assert mocked_get_by_id.call_count == 3 + assert c.name == 'c' + + +@patch("caosdb.cached.get_entity.get_entity_by_path") +def test_get_by_path(mocked_get_by_path): + mocked_get_by_path.side_effect = mocked_path_query + # first call; not in cache -> mocked_execute is touched + b = cached_get_entity_by(path='p') + assert b.id == 104 + assert mocked_get_by_path.call_count == 1 + # second call; in cache -> mocked_execute is NOT touched (count is still 1) + a = cached_get_entity_by(path='p') + assert mocked_get_by_path.call_count == 1 + # the cache returned the same object + assert a is b + # check the info + assert cache_info().hits == 1 + assert cache_info().currsize == 1 + # after clearing the test, the mock is used again + cache_clear() + cached_get_entity_by(path='p') + assert mocked_get_by_path.call_count == 2 + # we fill the cache manually and make sure the element is used + fill_cache({'lol': db.File(id=10001, path='lol')}, AccessType.PATH, unique=True) + # there are now two elements in the cache: a and lol + assert cache_info().currsize == 2 + # we can retrieve the inserted element + lol = cached_get_entity_by(path='lol') + assert lol.id == 10001 + # this did not touch the mocked function + assert 
@patch("caosdb.cached.execute_query")
def test_cached_query(mocked_query):
    """Check cache initialization, caching, clearing and manual filling for cached_query."""
    mocked_query.side_effect = mocked_gen_query
    # test cache initialization
    cache_initialize(maxsize=10)
    try:
        assert cache_info().maxsize == 10
        # first call; not in cache -> the mocked execute_query is called
        res = cached_query('stuff')
        assert len(res) == len(DUMMY_SERVER_CONTENT)
        assert mocked_query.call_count == 1
        # second call; in cache -> the mocked execute_query is NOT called (count is still 1)
        a = cached_query('stuff')
        assert mocked_query.call_count == 1
        # the cache returned the same object
        assert a is res
        # check the info
        assert cache_info().hits == 1
        assert cache_info().currsize == 1
        # after clearing the cache, the mock is used again
        cache_clear()
        cached_query('stuff')
        assert mocked_query.call_count == 2
        # we fill the cache manually and make sure the element is used
        fill_cache({'lol': db.Container().extend([db.Entity(id=10001, name='lol')])},
                   AccessType.QUERY, unique=False)
        # there are now two elements in the cache: the results for 'stuff' and 'lol'
        assert cache_info().currsize == 2
        # we can retrieve the inserted element
        lol = cached_query('lol')
        assert lol[0].id == 10001
        # this did not touch the mocked function
        assert mocked_query.call_count == 2
        # make sure normal retrieval still works (count +1)
        c = cached_query('a')
        assert mocked_query.call_count == 3
        assert c[0].id == 101
    finally:
        # cache_initialize replaces the module-wide cache; restore the default size so that the
        # tiny cache created above does not leak into tests that run afterwards
        cache_initialize()