Skip to content
Snippets Groups Projects
Commit 91118aa2 authored by Henrik tom Wörden's avatar Henrik tom Wörden
Browse files

Merge branch 'f-cache-version' into 'dev'

F cache version

See merge request !46
parents 3c8e6ded c1f03b19
Branches
Tags
2 merge requests!59FIX: if multiple updates for one entity exist, the retrieve would result in an...,!46F cache version
Pipeline #28184 passed
...@@ -129,6 +129,7 @@ unittest: ...@@ -129,6 +129,7 @@ unittest:
image: $CI_REGISTRY_IMAGE image: $CI_REGISTRY_IMAGE
needs: [build-testenv] needs: [build-testenv]
script: script:
- python3 -c "import caosdb; print('CaosDB Version:', caosdb.__version__)"
- tox - tox
# Build the sphinx documentation and make it ready for deployment by Gitlab Pages # Build the sphinx documentation and make it ready for deployment by Gitlab Pages
......
...@@ -16,6 +16,7 @@ fi ...@@ -16,6 +16,7 @@ fi
OUT=/tmp/crawler.output OUT=/tmp/crawler.output
ls ls
cat pycaosdb.ini cat pycaosdb.ini
python3 -c "import caosdb; print('CaosDB Version:', caosdb.__version__)"
rm -rf cache.db rm -rf cache.db
set -e set -e
echo "Clearing database" echo "Clearing database"
......
...@@ -27,7 +27,10 @@ ...@@ -27,7 +27,10 @@
# server side? # server side?
import os import os
import sqlite3 import sqlite3
from copy import deepcopy
from abc import ABC, abstractmethod
from hashlib import sha256 from hashlib import sha256
import warnings
import caosdb as db import caosdb as db
from lxml import etree from lxml import etree
...@@ -64,59 +67,182 @@ def get_pretty_xml(cont): ...@@ -64,59 +67,182 @@ def get_pretty_xml(cont):
return etree.tounicode(xml, pretty_print=True) return etree.tounicode(xml, pretty_print=True)
class Cache(object): class AbstractCache(ABC):
@abstractmethod
def get_cache_schema_version(self):
    """Return the version number of the SQLite cache schema.

    Must be overloaded by subclasses. The schema version is saved in
    column ``schema`` of table ``version``. Increase the returned
    value whenever the layout of the cache tables changes.
    """
    pass
@abstractmethod
def create_cache(self):
    """Create the cache tables in their most recent layout.

    Must be overloaded by subclasses.
    """
    pass
@abstractmethod
def get_default_file_name(self):
    """Return the default file name for this cache.

    Must be overloaded by subclasses; used when no ``db_file`` is
    supplied to the constructor.
    """
    pass
def __init__(self, db_file=None, force_creation=False):
    """
    db_file: The path of the database file; when None the subclass
        default from get_default_file_name() is used.
    force_creation: When True, the cache file is created regardless
        of whether a file at the same path already exists.
    """
    self.db_file = self.get_default_file_name() if db_file is None else db_file

    # Create a fresh cache when none exists (or when forced);
    # otherwise verify that the existing one has a usable schema.
    if force_creation or not os.path.exists(self.db_file):
        self.create_cache()
    else:
        self.check_cache()
def check_cache(self):
    """
    Check that the cache in self.db_file conforms to the latest
    database schema.

    A cache file without a version table (pre-versioning layout) is
    treated as schema version 1. A RuntimeError is raised when the
    stored schema version does not match the expected one.
    """
    try:
        found = self.get_cache_version()
    except sqlite3.OperationalError:
        # No version table present -> oldest known schema.
        found = 1

    expected = self.get_cache_schema_version()

    if found > expected:
        raise RuntimeError(
            "Cache is corrupt or was created with a future version of this program.")

    if found < expected:
        raise RuntimeError("Cache version too old.")
def get_cache_version(self):
    """
    Return the version of the cache stored in self.db_file.

    The version is stored as the only entry in column schema of table
    version.

    Raises
    ------
    sqlite3.OperationalError
        If the version table does not exist.
    RuntimeError
        If the version table does not contain exactly one row.
    """
    # Connect *outside* the try block: if sqlite3.connect itself
    # fails, `conn` would be unbound and the finally clause would
    # raise a NameError that masks the original error.
    conn = sqlite3.connect(self.db_file)
    try:
        c = conn.cursor()
        c.execute("SELECT schema FROM version")
        version_row = c.fetchall()

        if len(version_row) != 1:
            raise RuntimeError("Cache version table broken.")

        return version_row[0][0]
    finally:
        conn.close()
def run_sql_commands(self, commands, fetchall=False):
    """
    Run a list of SQL commands on self.db_file.

    commands: list of sql commands (tuples) to execute; each tuple is
        unpacked into cursor.execute, i.e. ``(sql,)`` or
        ``(sql, parameters)``.
    fetchall: When True, run fetchall after the last command and
        return the results. Otherwise nothing is returned.
    """
    conn = sqlite3.connect(self.db_file)
    try:
        c = conn.cursor()

        for sql in commands:
            c.execute(*sql)

        results = c.fetchall() if fetchall else None
        conn.commit()
    finally:
        # Always close the connection; the previous version leaked it
        # whenever one of the commands raised an exception.
        conn.close()

    if fetchall:
        return results
# TODO: A better name would be IdentifiablesCache
class IdentifiableCache(AbstractCache):
"""
stores identifiables (as a hash of xml) and their respective ID.
This allows to retrieve the Record corresponding to an indentifiable
without querying.
"""
def get_cache_schema_version(self):
    # Schema version 2: adds the caosdb_version column and the
    # version bookkeeping table (version 1 had neither).
    return 2
def get_default_file_name(self):
    # Default cache file, relative to the current working directory.
    return "cache.db"
def __init__(self, db_file=None, force_creation=False):
    # No behavior beyond the AbstractCache initialisation is needed.
    super().__init__(db_file, force_creation)
def create_cache(self):
    """
    Create a new SQLITE cache file in self.db_file.

    Two tables will be created:
    - identifiables is the actual cache.
    - version is a table with version information about the cache.
    """
    # NOTE: the previous version opened an sqlite3 connection and
    # cursor here that were never used and never closed;
    # run_sql_commands manages its own connection.
    self.run_sql_commands([
        ('''CREATE TABLE identifiables (digest TEXT PRIMARY KEY, caosdb_id INTEGER, caosdb_version TEXT)''',),
        ('''CREATE TABLE version (schema INTEGER)''',),
        ("INSERT INTO version VALUES (?)", (self.get_cache_schema_version(),))])
@staticmethod
def hash_entity(ent):
    """
    Return the SHA256 hex digest of the entity's "pretty" XML
    representation.

    The entity is deep-copied before serialization so hashing has no
    side effects on the caller's object.
    """
    xml_repr = get_pretty_xml(deepcopy(ent))
    return sha256(xml_repr.encode("utf-8")).hexdigest()
def insert(self, ent_hash, ent_id, ent_version):
    """
    Insert a new cache entry.

    ent_hash: Hash of the entity. Should be generated with Cache.hash_entity
    ent_id: ID of the entity
    ent_version: Version string of the entity
    """
    command = ('''INSERT INTO identifiables VALUES (?, ?, ?)''',
               (ent_hash, ent_id, ent_version))
    self.run_sql_commands([command])
def check_existing(self, ent_hash):
    """
    Look a hash up in the cache.

    ent_hash: The hash to search for.

    Return the (ID, version ID) of the hashed entity, or None if no
    entity with that hash is in the cache.
    """
    rows = self.run_sql_commands([('''Select * FROM identifiables WHERE digest=?''',
                                   (ent_hash,))], True)

    if not rows:
        return None

    # Drop the digest column; keep (caosdb_id, caosdb_version).
    return rows[0][1:]
def update_ids_from_cache(self, entities): def update_ids_from_cache(self, entities):
""" sets ids of those entities that are in cache """ sets ids of those entities that are in cache
...@@ -131,7 +257,7 @@ class Cache(object): ...@@ -131,7 +257,7 @@ class Cache(object):
eid = self.check_existing(ehash) eid = self.check_existing(ehash)
if eid is not None: if eid is not None:
ent.id = eid ent.id = eid[0]
return hashes return hashes
...@@ -141,12 +267,63 @@ class Cache(object): ...@@ -141,12 +267,63 @@ class Cache(object):
The hashes must correspond to the entities in the list The hashes must correspond to the entities in the list
""" """
# Check whether all entities have IDs and versions:
for ent in entities:
if ent.id is None:
raise RuntimeError("Entity has no ID.")
if ent.version is None or ent.version.id is None:
raise RuntimeError("Entity has no version ID.")
for ehash, ent in zip(hashes, entities): for ehash, ent in zip(hashes, entities):
if self.check_existing(ehash) is None: if self.check_existing(ehash) is None:
self.insert(ehash, ent.id) self.insert(ehash, ent.id, ent.version.id)
def validate_cache(self, entities=None):
    """
    Run through all entities stored in the cache and check whether the
    version still matches the most recent version. Non-matching
    entities are removed from the cache.

    entities: When set to a db.Container or a list of Entities
        the IDs from the cache will not be retrieved from the CaosDB
        database, but the versions from the cache will be checked
        against the versions contained in that collection. Only
        entries in the cache that have a corresponding version in the
        collection will be checked, all others will be ignored.
        Useful for testing.

    Return a list of invalidated entity IDs, or an empty list if no
    elements have been invalidated.
    """
    res = self.run_sql_commands([(
        "SELECT caosdb_id, caosdb_version FROM identifiables", ())], True)

    if entities is None:
        # TODO this might become a problem. If many entities are cached,
        # then all of them are retrieved here...
        ids = {c_id for c_id, _ in res}
        entities = db.Container()
        entities.extend([db.Entity(id=c_id) for c_id in ids])
        entities.retrieve()

    cached_versions = dict(res)
    # Skip entities that are not in the cache at all; the previous
    # version raised a KeyError for them, contradicting the docstring.
    invalidate_list = [ent.id for ent in entities
                       if ent.id in cached_versions
                       and ent.version.id != cached_versions[ent.id]]

    # An empty "IN ()" clause is a syntax error in SQLite, so only run
    # the DELETE when something was actually invalidated. Use bound
    # placeholders instead of formatting IDs into the SQL string.
    if invalidate_list:
        placeholders = ", ".join("?" * len(invalidate_list))
        self.run_sql_commands([(
            "DELETE FROM identifiables WHERE caosdb_id IN ({})".format(placeholders),
            tuple(invalidate_list))])

    return invalidate_list
class UpdateCache(AbstractCache):
""" """
stores unauthorized updates stores unauthorized updates
...@@ -154,12 +331,18 @@ class UpdateCache(Cache): ...@@ -154,12 +331,18 @@ class UpdateCache(Cache):
be stored in this cache such that it can be authorized and done later. be stored in this cache such that it can be authorized and done later.
""" """
def get_cache_schema_version(self):
    """Schema version of the update cache."""
    return 1
def get_default_file_name(self):
    """Default path of the update cache file."""
    return "/tmp/crawler_update_cache.db"
def __init__(self, db_file=None, force_creation=False):
    """
    db_file: path of the cache file; when None, a file named
        crawler_update_cache.db in the system temp directory is used.
    force_creation: passed through to AbstractCache.__init__.
    """
    if db_file is None:
        db_file = os.path.join(tempfile.gettempdir(),
                               "crawler_update_cache.db")
    super().__init__(db_file=db_file, force_creation=force_creation)
@staticmethod @staticmethod
def get_previous_version(cont): def get_previous_version(cont):
...@@ -227,3 +410,9 @@ class UpdateCache(Cache): ...@@ -227,3 +410,9 @@ class UpdateCache(Cache):
conn.close() conn.close()
return res return res
class Cache(IdentifiableCache):
    """Deprecated alias for IdentifiableCache.

    Kept for backwards compatibility only; use IdentifiableCache
    instead.
    """

    def __init__(self, *args, **kwargs):
        # Fixed typo in the user-visible message ("depricated").
        warnings.warn(DeprecationWarning(
            "This class is deprecated. Please use IdentifiableCache."))
        super().__init__(*args, **kwargs)
...@@ -50,7 +50,7 @@ from sqlite3 import IntegrityError ...@@ -50,7 +50,7 @@ from sqlite3 import IntegrityError
import caosdb as db import caosdb as db
from caosdb.exceptions import BadQueryError from caosdb.exceptions import BadQueryError
from .cache import Cache, UpdateCache, get_pretty_xml from .cache import IdentifiableCache, UpdateCache, get_pretty_xml
from .cfood import RowCFood, add_files, get_ids_for_entities_with_names from .cfood import RowCFood, add_files, get_ids_for_entities_with_names
from .datainconsistency import DataInconsistencyError from .datainconsistency import DataInconsistencyError
from .datamodel_problems import DataModelProblems from .datamodel_problems import DataModelProblems
...@@ -190,7 +190,8 @@ class Crawler(object): ...@@ -190,7 +190,8 @@ class Crawler(object):
self.filterKnown.reset(cat) self.filterKnown.reset(cat)
if self.use_cache: if self.use_cache:
self.cache = Cache(db_file=cache_file) self.cache = IdentifiableCache(db_file=cache_file)
self.cache.validate_cache()
def iteritems(self): def iteritems(self):
""" generates items to be crawled with an index""" """ generates items to be crawled with an index"""
......
...@@ -24,31 +24,35 @@ import os ...@@ -24,31 +24,35 @@ import os
import unittest import unittest
from copy import deepcopy from copy import deepcopy
from tempfile import NamedTemporaryFile from tempfile import NamedTemporaryFile
import sqlite3
import caosdb as db import caosdb as db
from caosadvancedtools.cache import Cache, cleanXML from caosadvancedtools.cache import IdentifiableCache, cleanXML
from lxml import etree from lxml import etree
import pytest
class CacheTest(unittest.TestCase): class CacheTest(unittest.TestCase):
def setUp(self):
    # A fresh cache backed by a throw-away file for every test.
    tmp_name = NamedTemporaryFile(delete=False).name
    self.cache = IdentifiableCache(db_file=tmp_name, force_creation=True)
def test_hash(self):
    # Hashes are strings and differ for structurally different records.
    rec = db.Record()
    assert isinstance(IdentifiableCache.hash_entity(rec), str)
    other = db.Record().add_parent("lol")
    assert (IdentifiableCache.hash_entity(rec)
            != IdentifiableCache.hash_entity(other))
def test_insert(self):
    # Only inserted hashes are found; lookups return (id, version).
    plain = db.Record()
    experiment = db.Record()
    experiment.add_parent(name="Experiment")
    plain_hash = IdentifiableCache.hash_entity(plain)
    experiment_hash = IdentifiableCache.hash_entity(experiment)

    self.cache.insert(experiment_hash, 1235, "ajkfljadsklf")

    found = self.cache.check_existing(experiment_hash)
    assert found[0] == 1235
    assert found[1] == "ajkfljadsklf"
    assert self.cache.check_existing(plain_hash) is None
def test_hirarchy(self): def test_hirarchy(self):
...@@ -64,17 +68,29 @@ class CacheTest(unittest.TestCase): ...@@ -64,17 +68,29 @@ class CacheTest(unittest.TestCase):
ent3 = db.Record() ent3 = db.Record()
ent3.add_parent(name="Analysis") ent3.add_parent(name="Analysis")
test_id = 2353243 test_id = 2353243
self.cache.insert(Cache.hash_entity(ent2), test_id) self.cache.insert(IdentifiableCache.hash_entity(ent2), test_id, "ajdsklfjadslf")
entities = [ent, ent2, ent3] entities = [ent, ent2, ent3]
hashes = self.cache.update_ids_from_cache(entities) hashes = self.cache.update_ids_from_cache(entities)
self.assertEqual(ent.id, None)
self.assertEqual(ent2.id, test_id) self.assertEqual(ent2.id, test_id)
self.assertEqual(ent3.id, None)
with pytest.raises(RuntimeError, match=r".*no ID.*"):
self.cache.insert_list(hashes, entities)
# test # test
ent.id = 1001 ent.id = 1001
ent3.id = 1003 ent3.id = 1003
with pytest.raises(RuntimeError, match=r".*no version ID.*"):
self.cache.insert_list(hashes, entities)
ent.version = db.common.versioning.Version("jkadsjfldf")
ent2.version = db.common.versioning.Version("jkadsjfldf")
ent3.version = db.common.versioning.Version("jkadsjfldf")
self.cache.insert_list(hashes, entities) self.cache.insert_list(hashes, entities)
self.assertEqual(self.cache.check_existing(hashes[0]), 1001) self.assertEqual(self.cache.check_existing(hashes[0])[0], 1001)
self.assertEqual(self.cache.check_existing(hashes[2]), 1003) self.assertEqual(self.cache.check_existing(hashes[2])[0], 1003)
def test_clean(self): def test_clean(self):
xml = etree.XML( xml = etree.XML(
...@@ -91,3 +107,138 @@ class CacheTest(unittest.TestCase): ...@@ -91,3 +107,138 @@ class CacheTest(unittest.TestCase):
""") """)
cleanXML(xml) cleanXML(xml)
assert len(xml.findall('TransactionBenchmark')) == 0 assert len(xml.findall('TransactionBenchmark')) == 0
def create_sqlite_file(commands):
    """
    Create a temporary SQLite database file and run *commands* on it.

    commands: list of sql commands (tuples) to execute after creation;
        each tuple is unpacked into cursor.execute.

    Return the name of the temporary file (the caller is responsible
    for deleting it).
    """
    file_name = NamedTemporaryFile(delete=False).name
    connection = sqlite3.connect(file_name)
    cursor = connection.cursor()

    for command in commands:
        cursor.execute(*command)

    connection.commit()
    connection.close()

    return file_name
class CacheTest2(unittest.TestCase):
    """
    Test the schema version.
    """

    def setUp(self):
        # Correct version:
        self.cache = IdentifiableCache(db_file=NamedTemporaryFile(delete=False).name,
                                       force_creation=True)

        # Build a set of deliberately defective cache files; each one
        # must make IdentifiableCache refuse to open it in test_schema.
        self.db_file_defect = []
        self.db_file_defect.extend([
            # Version without version table (old version):
            create_sqlite_file(
                [('''CREATE TABLE identifiables (digest TEXT PRIMARY KEY, caosdb_id INTEGER)''',)]),
            # Version with version table with wrong version:
            create_sqlite_file(
                [('''CREATE TABLE identifiables (digest TEXT PRIMARY KEY, caosdb_id INTEGER, caosdb_version TEXT)''',),
                 ('''CREATE TABLE version (schema INTEGER)''',),
                 ("INSERT INTO version VALUES (?)", (1,))]),
            # Version with version table with wrong version:
            create_sqlite_file(
                [('''CREATE TABLE identifiables (digest TEXT PRIMARY KEY, caosdb_id INTEGER, caosdb_version TEXT)''',),
                 ('''CREATE TABLE version (schema INTEGER)''',),
                 ("INSERT INTO version VALUES (?)", (3,))]),
            # Version with version table with missing version:
            create_sqlite_file(
                [('''CREATE TABLE identifiables (digest TEXT PRIMARY KEY, caosdb_id INTEGER, caosdb_version TEXT)''',),
                 ('''CREATE TABLE version (schema INTEGER)''',)]),
            # Version with version table with too many versions:
            create_sqlite_file(
                [('''CREATE TABLE identifiables (digest TEXT PRIMARY KEY, caosdb_id INTEGER, caosdb_version TEXT)''',),
                 ('''CREATE TABLE version (schema INTEGER)''',),
                 ("INSERT INTO version VALUES (?)", (1,)),
                 ("INSERT INTO version VALUES (?)", (3,))])])

    def test_schema(self):
        # Test whether new cache is created correctly:
        assert os.path.exists(self.cache.db_file)
        # Test whether it can be opened
        test_cache_2 = IdentifiableCache(db_file=self.cache.db_file)
        assert test_cache_2.get_cache_version() == 2

        # Each defective file must be rejected with the matching error:
        with pytest.raises(RuntimeError, match="Cache version too old.") as e_info:
            test_cache_2 = IdentifiableCache(db_file=self.db_file_defect[0])

        with pytest.raises(RuntimeError, match="Cache version too old.") as e_info:
            test_cache_2 = IdentifiableCache(db_file=self.db_file_defect[1])

        with pytest.raises(RuntimeError, match=r".*future version.*") as e_info:
            test_cache_2 = IdentifiableCache(db_file=self.db_file_defect[2])

        with pytest.raises(RuntimeError, match=r".*table broken.*") as e_info:
            test_cache_2 = IdentifiableCache(db_file=self.db_file_defect[3])

        with pytest.raises(RuntimeError, match=r".*table broken.*") as e_info:
            test_cache_2 = IdentifiableCache(db_file=self.db_file_defect[4])

    def tearDown(self):
        # Remove the valid cache file and all defective ones.
        os.remove(self.cache.db_file)

        for db_fn_defect in self.db_file_defect:
            os.remove(db_fn_defect)
class InvalidationTest(unittest.TestCase):
    """
    Test invalidation of cache entries.
    """

    def setUp(self):
        # Correct version:
        self.cache = IdentifiableCache(db_file=NamedTemporaryFile(delete=False).name,
                                       force_creation=True)

    def tearDown(self):
        os.remove(self.cache.db_file)

    def test_invalid(self):
        # Three records with distinct IDs and version strings.
        ent = db.Record()
        ent2 = db.Record()
        ent2.add_parent(name="Experiment")
        ent3 = db.Record()
        ent3.add_parent(name="Analysis")
        ent.id = 117
        ent2.id = 328
        ent3.id = 224
        ent.version = db.common.versioning.Version("a")
        ent2.version = db.common.versioning.Version("b")
        ent3.version = db.common.versioning.Version("a")

        el = [ent, ent2, ent3]

        for e in el:
            self.cache.insert(IdentifiableCache.hash_entity(e), e.id, e.version.id)

        # All three entries are retrievable by their hash.
        for e in el:
            res = self.cache.check_existing(IdentifiableCache.hash_entity(e))
            assert e.id == res[0]
            assert e.version.id == res[1]

        # Bump the versions of ent2 and ent3 locally.
        ent2.version.id = "c"
        ent3.version.id = "b"

        # Their hashes change with the version (the version is
        # presumably part of the serialized XML — so the new hashes
        # are unknown to the cache).
        for e in el[1:]:
            res = self.cache.check_existing(IdentifiableCache.hash_entity(e))
            assert res is None

        # validate_cache must invalidate exactly the two changed
        # entities and keep the unchanged one.
        invalidated_entries = self.cache.validate_cache(el)
        assert 328 in invalidated_entries
        assert 224 in invalidated_entries
        assert 117 not in invalidated_entries

        res = self.cache.run_sql_commands([
            ("SELECT * FROM identifiables", ())], fetchall=True)
        assert len(res) == 1
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment