Skip to content
Snippets Groups Projects
Commit d3c90911 authored by Henrik tom Wörden's avatar Henrik tom Wörden
Browse files

ENH: add entity getters and cached functions

- Entity getters: `get_entity_by_<name/id/path>`
- Cached versions of entity getters and of execute_query (cached_query)
parent 67a46029
No related branches found
No related tags found
2 merge requests!107ENH: add entity getters and cached functions,!100ENH: add entity getters and cached functions
Pipeline #35395 passed
......@@ -9,6 +9,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
### Added ###
- Added location argument to `src/caosdb/utils/checkFileSystemConsistency.py`
- Entity getters: `get_entity_by_<name/id/path>`
- Cached versions of entity getters and of execute_query (cached_query)
### Changed ###
......
......@@ -46,6 +46,7 @@ from caosdb.common.models import (ACL, ALL, FIX, NONE, OBLIGATORY, RECOMMENDED,
Query, QueryTemplate, Record, RecordType,
delete, execute_query, get_global_acl,
get_known_permissions, raise_errors)
from caosdb.utils.get_entity import get_entity_by_name, get_entity_by_path, get_entity_by_id
from caosdb.configuration import _read_config_files, configure, get_config
from caosdb.connection.connection import configure_connection, get_connection
from caosdb.exceptions import *
......
# -*- coding: utf-8 -*-
#
# This file is a part of the CaosDB Project.
#
# Copyright (C) 2023 Henrik tom Wörden <h.tomwoerden@indiscale.com>
# Copyright (C) 2023 IndiScale GmbH <info@indiscale.com>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
#
"""
This module provides some cached versions of functions that retrieve Entities from a remote server.
"""
import functools
from typing import Union
from functools import lru_cache
from .utils import get_entity
from .common.models import execute_query, Entity, Container
from enum import Enum
# Default ``maxsize`` of the LRU cache (number of cached calls, not bytes).
# According to the original author this is roughly 1GB for typical entity sizes.
DEFAULT_SIZE = 33333
# These plain dicts are NOT the real cache. They are solely for filling the real cache manually
# (e.g. to reuse older query results): ``fill_cache`` puts entries in here temporarily so that
# ``_cached_access`` can pick them up with a dict lookup, and empties them again afterwards.
_DUMB_BY_NAME_CACHE = {}
_DUMB_BY_PATH_CACHE = {}
_DUMB_BY_EID_CACHE = {}
_DUMB_BY_QUERY_CACHE = {}
class AccessType(Enum):
    """The kinds of information by which this module looks up (and caches) entities.

    QUERY: a query string, PATH: a file path, EID: an entity ID, NAME: an entity name.
    """

    QUERY = 1
    PATH = 2
    EID = 3
    NAME = 4
def cached_get_entity_by(eid: Union[str, int] = None, name: str = None, path: str = None, query:
                         str = None) -> Entity:
    """Return the single entity that is uniquely identified by the one supplied argument.

    Exactly one of ``eid``, ``name``, ``path`` and ``query`` must be given (i.e. be not
    ``None``). The lookup goes through the cache shared with ``cached_query``.

    Raises
    ------
    ValueError
        If none or more than one of the arguments is supplied.
    """
    # The order matters only for readability; exactly one entry survives the filter below.
    candidates = (
        (AccessType.EID, eid),
        (AccessType.NAME, name),
        (AccessType.PATH, path),
        (AccessType.QUERY, query),
    )
    supplied = [(kind, value) for kind, value in candidates if value is not None]
    if len(supplied) != 1:
        raise ValueError("You must supply exactly one argument.")

    kind, value = supplied[0]
    return _cached_access(kind, value, unique=True)
def cached_query(query_string) -> Container:
    """Cached version of ``db.execute_query``.

    The result for ``query_string`` is taken from (or stored in) the cache that is shared with
    ``cached_get_entity_by``. The access is made with ``unique=False``.
    """
    result = _cached_access(AccessType.QUERY, query_string, unique=False)
    return result
@lru_cache(maxsize=DEFAULT_SIZE)
def _cached_access(kind: AccessType, value: Union[str, int], unique=True):
    """Retrieve what is identified by ``value`` according to ``kind``.

    This is the function that is actually cached. Because ``kind`` and ``unique`` are part of
    the call arguments, the cache has kind of separate sections for ``cached_query`` and
    ``cached_get_entity_by`` with the different AccessTypes. However, there is only one cache
    size.

    Parameters
    ----------
    kind : AccessType
        How ``value`` is to be interpreted (query string, name, ID or path).
    value : str or int
        The query string, name, ID or path.
    unique : bool
        Only relevant for ``AccessType.QUERY``: whether the query is executed as a unique query.
        The entity getters used for the other kinds always run unique queries.

    Returns
    -------
    The manually supplied item if ``value`` is found in the corresponding dumb dict cache,
    otherwise the result of the query / entity getter. ``None`` for an unknown ``kind``.
    """
    # The dumb dict caches are only to allow filling the cache manually (see ``fill_cache``).
    if kind == AccessType.QUERY:
        if value in _DUMB_BY_QUERY_CACHE:
            return _DUMB_BY_QUERY_CACHE[value]
        if unique:
            # Forward the uniqueness request; previously ``unique`` was ignored here, so a
            # unique query access returned the whole result container instead of one entity.
            return execute_query(value, unique=True)
        # The non-unique call is kept without the keyword so that it stays exactly what
        # ``cached_query`` has always issued.
        return execute_query(value)
    elif kind == AccessType.NAME:
        if value in _DUMB_BY_NAME_CACHE:
            return _DUMB_BY_NAME_CACHE[value]
        return get_entity.get_entity_by_name(value)
    elif kind == AccessType.EID:
        if value in _DUMB_BY_EID_CACHE:
            return _DUMB_BY_EID_CACHE[value]
        return get_entity.get_entity_by_id(value)
    elif kind == AccessType.PATH:
        if value in _DUMB_BY_PATH_CACHE:
            return _DUMB_BY_PATH_CACHE[value]
        return get_entity.get_entity_by_path(value)
def cache_clear() -> None:
    """Remove all entries from the cache shared by cached_query and cached_get_entity_by."""
    _cached_access.cache_clear()
def cache_info():
    """Return the statistics of the cache that is used by cached_query and cached_get_entity_by.

    The returned object is the one provided by ``functools.lru_cache`` (``hits``, ``misses``,
    ``maxsize`` and ``currsize``). The cache itself is not modified.
    """
    return _cached_access.cache_info()
def cache_initialize(maxsize=DEFAULT_SIZE) -> None:
    """Replace the cache of cached_query and cached_get_entity_by by a new, empty one.

    The new cache holds at most ``maxsize`` entries. The old cache is dropped together with
    all its content.
    """
    global _cached_access
    # ``__wrapped__`` is the undecorated function; wrap it again with the requested size.
    undecorated = _cached_access.__wrapped__
    _cached_access = functools.lru_cache(maxsize=maxsize)(undecorated)
def fill_cache(items: dict, kind: AccessType = AccessType.EID, unique=True) -> None:
    """Add items to the cache manually, without submitting queries.

    ``items`` maps the lookup key (ID, name, path or query string, depending on ``kind``) to
    the object that the cache shall return for that key. ``unique`` must match the value that
    is used by the later cached access.
    """
    # Select the dumb dict cache that belongs to the given kind (None for unknown kinds).
    staging = {
        AccessType.EID: _DUMB_BY_EID_CACHE,
        AccessType.NAME: _DUMB_BY_NAME_CACHE,
        AccessType.PATH: _DUMB_BY_PATH_CACHE,
        AccessType.QUERY: _DUMB_BY_QUERY_CACHE,
    }.get(kind)

    # 1. make the given items available to the cached function
    if staging is not None:
        staging.update(items)

    # 2. call the cached function with each key (this only results in a dict look up)
    for key in items:
        _cached_access(kind, key, unique=unique)

    # 3. empty the dumb dict cache again
    if staging is not None:
        staging.clear()
# -*- coding: utf-8 -*-
#
# This file is a part of the CaosDB Project.
#
# Copyright (C) 2023 Henrik tom Wörden <h.tomwoerden@indiscale.com>
# Copyright (C) 2023 IndiScale GmbH <info@indiscale.com>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
#
""" convenience functions to retrieve a specific entity """
from ..common.models import execute_query, Entity
from typing import Union
def get_entity_by_name(name: str) -> Entity:
    """Return the result of a unique query that uses the name to find the correct entity.

    Submits the query "FIND ENTITY WITH name='{name}'".
    """
    # The query string was empty before, so the name was never sent to the server.
    return execute_query(f"FIND ENTITY WITH name='{name}'", unique=True)
def get_entity_by_id(eid: Union[str, int]) -> Entity:
    """Look up one entity by its ID with a unique query.

    Submits the query "FIND ENTITY WITH id='{eid}'".
    """
    query = f"FIND ENTITY WITH id='{eid}'"
    return execute_query(query, unique=True)
def get_entity_by_path(path: str) -> Entity:
    """Look up one file entity by its path with a unique query.

    Submits the query "FIND FILE WHICH IS STORED AT '{path}'".
    """
    query = f"FIND FILE WHICH IS STORED AT '{path}'"
    return execute_query(query, unique=True)
......@@ -223,6 +223,28 @@ list-valued attribute in Python, as the following example illustrates.
print(retrieved.get_property("TestList").value)
.. note::
Properties of Entities, that shall be updated, need to have IDs. Let's look at an
example:
.. code:: python
experiment = db.Record(id=1111).retrieve()
experiment.add_property(name='date', value="2020-01-01")
experiment.update() # Fails! The date Property needs to have an ID.
The easiest way to get around this is to use the corresponding entity getter:
.. code:: python
experiment = db.Record(id=1111).retrieve()
experiment.add_property(db.get_entity_by_name('date'), value="2020-01-01")
experiment.update() # Works!
There are also the functions ``get_entity_by_path`` and ``get_entity_by_id``. You can easily use
cached versions of those functions (see :doc:`Entity Getters<Entity-Getters>`).
File Update
-----------
......
Entity Getters
==============
There is a very frequent situation when using PyCaosDB: You need to get a specific Entity from the
remote server. For example, you need the Property Entity in order to make an update. Sure, you can
do a ``db.Entity().retrieve()`` or submit a query, but there is an even faster way which also helps
to prevent errors:
- get_entity_by_name
- get_entity_by_id
- get_entity_by_path
You can call them with a single argument (name/id/path). Since those functions are often used
very frequently, you might want to look at the :doc:`caching options<caching>`.
Caching
=======
.. note::
Caching is great, because it can speed up things considerably. But it can also create nasty
pitfalls if the cache is not cleared when needed and you work with outdated data. Thus, please use
the cache with care and make sure to clear it when needed.
Python provides great tools to create caching. For example, it is very easy to create a
cached version of the ``get_entity_by_name`` function using Python's ``lru_cache``:
.. code:: python
@lru_cache(maxsize=1000)
def cached_get_by_name(name):
return db.get_entity_by_name(name)
exp = cached_get_by_name('Experiment')
# reset the cache with
cached_get_by_name.cache_clear()
For convenience we also provide the ``caosdb.cached`` module that defines the functions
``cached_query`` and ``cached_get_entity_by``. They share a single cache with a single size. Let's
have a look:
.. code:: python
from caosdb.cached import (cached_query, cached_get_entity_by, cache_clear, cache_info,
cache_initialize)
rt1 = cached_get_entity_by(name='RT1')
qresult = cached_query('FIND Experiment WITH parameter=1')
# you can inspect the cache
print(cache_info())
# this will not cause a server request since it is cached
rt1 = cached_get_entity_by(name='RT1')
# you can clear the cache with
cache_clear()
# If you want to have a cache with a different size, you can initialize it (again). Old cached
# data is lost.
cache_initialize(maxsize=10)
In case you want to add Entities manually to the cache (e.g. because you happen to have the one
million entities that you will be using already at hand from another query), you can do so with
.. code:: python
from caosdb.cached import fill_cache, AccessType
# Here, items needs to be a dict with Entity IDs as keys and the Entities as values
fill_cache(items, AccessType.EID, unique=True)
# if you now use IDs that were in items, they are taken from the cache
e1 = cached_get_entity_by(eid=10001)
For the cached entity getter function (``cached_get_entity_by``) you need to set ``unique=True``.
......@@ -14,6 +14,8 @@ advanced usage of the Python client.
basic_analysis
Data-Insertion
errors
Entity-Getters
caching
data-model-interface
complex_data_models
serverside
......
# -*- coding: utf-8 -*-
#
# This file is a part of the CaosDB Project.
#
# Copyright (C) 2023 Henrik tom Wörden <h.tomwoerden@indiscale.com>
# Copyright (C) 2023 IndiScale GmbH <info@indiscale.com>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
#
""" test the caosdb.cached module (cached entity getters and cached queries) """
from caosdb.cached import (cached_get_entity_by, cache_clear, cache_info, fill_cache,
AccessType, cache_initialize, cached_query)
from unittest.mock import patch
import caosdb as db
from copy import deepcopy
import pytest
# Entities that the mocked query functions below treat as the content of the server:
# three Records (identified by name/id) and two Files (identified by path/id).
DUMMY_SERVER_CONTENT = [
    db.Record(name='a', id=101),
    db.Record(name='b', id=102),
    db.Record(name='c', id=103),
    db.File(path='p', id=104),
    db.File(path='pp', id=105),
]
@pytest.fixture(autouse=True)
def cache_clean_up():
    """Run every test with an empty cache and leave an empty cache behind."""
    cache_clear()  # before the test
    yield
    cache_clear()  # after the test
def mocked_name_query(name):
    """Stand-in for get_entity_by_name: return a copy of the first dummy entity with that name."""
    matches = [el for el in DUMMY_SERVER_CONTENT if el.name == name]
    # copy the object, because Entities would normally be created from XML response
    return deepcopy(matches[0])
def mocked_id_query(eid):
    """Stand-in for get_entity_by_id: return a copy of the first dummy entity with that ID."""
    matches = [el for el in DUMMY_SERVER_CONTENT if el.id == eid]
    # copy the object, because Entities would normally be created from XML response
    return deepcopy(matches[0])
def mocked_path_query(path):
    """Stand-in for get_entity_by_path: return a copy of the first dummy entity with that path."""
    matches = [el for el in DUMMY_SERVER_CONTENT if el.path == path]
    # copy the object, because Entities would normally be created from XML response
    return deepcopy(matches[0])
def mocked_gen_query(q):
    """Stand-in for execute_query: the query 'a' yields only the first dummy entity, any other
    query yields the complete dummy server content."""
    content = DUMMY_SERVER_CONTENT[:1] if q == 'a' else DUMMY_SERVER_CONTENT
    return db.Container().extend(content)
@patch("caosdb.utils.get_entity.get_entity_by_name")
def test_get_by_name(mocked_get_by_name):
    """Lookups by name are served by the getter once, then by the cache; manual filling works."""
    mocked_get_by_name.side_effect = mocked_name_query

    # cache miss: the mocked getter is called
    first = cached_get_entity_by(name='a')
    assert first.id == 101
    assert mocked_get_by_name.call_count == 1

    # cache hit: the mocked getter is NOT called again (count is still 1)
    second = cached_get_entity_by(name='a')
    assert mocked_get_by_name.call_count == 1
    # ... and the cache returned the very same object
    assert first is second

    # the statistics reflect one hit and one stored entry
    info = cache_info()
    assert info.hits == 1
    assert info.currsize == 1

    # after clearing the cache, the mock is used again
    cache_clear()
    cached_get_entity_by(name='a')
    assert mocked_get_by_name.call_count == 2

    # fill the cache manually and make sure the element is used
    manual = {'lol': db.Entity(id=10001, name='lol')}
    fill_cache(manual, AccessType.NAME, unique=True)
    # there are now two elements in the cache: a and lol
    assert cache_info().currsize == 2
    # the inserted element can be retrieved ...
    lol = cached_get_entity_by(name='lol')
    assert lol.id == 10001
    # ... without touching the mocked function
    assert mocked_get_by_name.call_count == 2

    # normal retrieval still works (count +1)
    other = cached_get_entity_by(name='c')
    assert mocked_get_by_name.call_count == 3
    assert other.id == 103
@patch("caosdb.utils.get_entity.get_entity_by_id")
def test_get_by_id(mocked_get_by_id):
    """Lookups by ID are served by the getter once, then by the cache; manual filling works."""
    mocked_get_by_id.side_effect = mocked_id_query

    # cache miss: the mocked getter is called
    first = cached_get_entity_by(eid=102)
    assert first.id == 102
    assert first.name == 'b'
    assert mocked_get_by_id.call_count == 1

    # cache hit: the mocked getter is NOT called again (count is still 1)
    second = cached_get_entity_by(eid=102)
    assert mocked_get_by_id.call_count == 1
    # ... and the cache returned the very same object
    assert second is first

    # the statistics reflect one hit and one stored entry
    info = cache_info()
    assert info.hits == 1
    assert info.currsize == 1

    # after clearing the cache, the mock is used again
    cache_clear()
    cached_get_entity_by(eid=102)
    assert mocked_get_by_id.call_count == 2

    # fill the cache manually and make sure the element is used
    manual = {10001: db.Entity(id=10001, name='lol')}
    fill_cache(manual, AccessType.EID, unique=True)
    # there are now two elements in the cache: b and lol
    assert cache_info().currsize == 2
    # the inserted element can be retrieved ...
    lol = cached_get_entity_by(eid=10001)
    assert lol.name == 'lol'
    # ... without touching the mocked function
    assert mocked_get_by_id.call_count == 2

    # normal retrieval still works (count +1)
    other = cached_get_entity_by(eid=103)
    assert mocked_get_by_id.call_count == 3
    assert other.name == 'c'
@patch("caosdb.cached.get_entity.get_entity_by_path")
def test_get_by_path(mocked_get_by_path):
    """Lookups by path are served by the getter once, then by the cache; manual filling works."""
    mocked_get_by_path.side_effect = mocked_path_query

    # cache miss: the mocked getter is called
    first = cached_get_entity_by(path='p')
    assert first.id == 104
    assert mocked_get_by_path.call_count == 1

    # cache hit: the mocked getter is NOT called again (count is still 1)
    second = cached_get_entity_by(path='p')
    assert mocked_get_by_path.call_count == 1
    # ... and the cache returned the very same object
    assert second is first

    # the statistics reflect one hit and one stored entry
    info = cache_info()
    assert info.hits == 1
    assert info.currsize == 1

    # after clearing the cache, the mock is used again
    cache_clear()
    cached_get_entity_by(path='p')
    assert mocked_get_by_path.call_count == 2

    # fill the cache manually and make sure the element is used
    manual = {'lol': db.File(id=10001, path='lol')}
    fill_cache(manual, AccessType.PATH, unique=True)
    # there are now two elements in the cache: p and lol
    assert cache_info().currsize == 2
    # the inserted element can be retrieved ...
    lol = cached_get_entity_by(path='lol')
    assert lol.id == 10001
    # ... without touching the mocked function
    assert mocked_get_by_path.call_count == 2

    # normal retrieval still works (count +1)
    other = cached_get_entity_by(path='pp')
    assert mocked_get_by_path.call_count == 3
    assert other.id == 105
@patch("caosdb.cached.execute_query")
def test_cached_query(mocked_query):
    """Queries are executed once, then served by the cache; re-initialization and manual
    filling of the cache work."""
    mocked_query.side_effect = mocked_gen_query

    # a freshly initialized cache has the requested size
    cache_initialize(maxsize=10)
    assert cache_info().maxsize == 10

    # cache miss: the mocked query function is called
    first = cached_query('stuff')
    assert len(first) == len(DUMMY_SERVER_CONTENT)
    assert mocked_query.call_count == 1

    # cache hit: the mocked query function is NOT called again (count is still 1)
    second = cached_query('stuff')
    assert mocked_query.call_count == 1
    # ... and the cache returned the very same object
    assert second is first

    # the statistics reflect one hit and one stored entry
    info = cache_info()
    assert info.hits == 1
    assert info.currsize == 1

    # after clearing the cache, the mock is used again
    cache_clear()
    cached_query('stuff')
    assert mocked_query.call_count == 2

    # fill the cache manually and make sure the element is used
    manual = {'lol': db.Container().extend([db.Entity(id=10001, name='lol')])}
    fill_cache(manual, AccessType.QUERY, unique=False)
    # there are now two elements in the cache: the results for 'stuff' and 'lol'
    assert cache_info().currsize == 2
    # the inserted element can be retrieved ...
    lol = cached_query('lol')
    assert lol[0].id == 10001
    # ... without touching the mocked function
    assert mocked_query.call_count == 2

    # normal retrieval still works (count +1)
    other = cached_query('a')
    assert mocked_query.call_count == 3
    assert other[0].id == 101
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment