From cbc6067f47b36f6934d7ef0dfd04fd1d1b8c5704 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Henrik=20tom=20W=C3=B6rden?= <henrik@trineo.org>
Date: Wed, 15 May 2019 23:39:22 +0200
Subject: [PATCH] bugfixes maint

---
 src/caosadvancedtools/cache.py |  66 +++++++++++++++++++++
 src/caosadvancedtools/cfood.py | 103 ++++++++++++++-------------------
 unittests/test_cache.py        |  59 +++++++++++++++++++
 3 files changed, 167 insertions(+), 61 deletions(-)
 create mode 100644 src/caosadvancedtools/cache.py
 create mode 100644 unittests/test_cache.py

diff --git a/src/caosadvancedtools/cache.py b/src/caosadvancedtools/cache.py
new file mode 100644
index 00000000..932af01b
--- /dev/null
+++ b/src/caosadvancedtools/cache.py
@@ -0,0 +1,66 @@
+
+# TODO this is implementing a cache on client side. Should it be on
+# server side?
+import os
+import sqlite3
+from hashlib import sha256
+
+import caosdb as db
+from lxml import etree
+
+
+def get_pretty_xml(cont):
+    if isinstance(cont, list):
+        cont = db.Container().extend(cont)
+
+    if not isinstance(cont, db.Container):
+        cont = db.Container().append(cont)
+
+    return etree.tounicode(cont.to_xml(
+        local_serialization=True), pretty_print=True)
+
+
+class Cache(object):
+    def __init__(self, db_file=None):
+        if db_file is None:
+            self.db_file = "cache.db"
+        else:
+            self.db_file = db_file
+
+        if not os.path.exists(self.db_file):
+            self.create_cache()
+
+    def create_cache(self):
+        conn = sqlite3.connect(self.db_file)
+        c = conn.cursor()
+        c.execute('''CREATE TABLE identifiables (digest text primary key, caosdb_id integer)''')
+        conn.commit()
+        conn.close()
+
+    def hash_entity(ent):
+        xml = get_pretty_xml(ent)
+        digest = sha256(xml.encode("utf-8")).hexdigest()
+
+        return digest
+
+    def insert(self, ent_hash, ent_id):
+        conn = sqlite3.connect(self.db_file)
+        c = conn.cursor()
+        c.execute('''INSERT INTO identifiables VALUES (?, ?)''',
+                  (ent_hash, ent_id))
+        conn.commit()
+        conn.close()
+
+    def check_existing(self, ent_hash):
+        conn = sqlite3.connect(self.db_file)
+        c = conn.cursor()
+        c.execute('''Select * FROM identifiables WHERE digest=?''',
+                  (ent_hash,))
+        res = c.fetchone()
+        conn.commit()
+        conn.close()
+
+        if res is None:
+            return res
+        else:
+            return res[1]
diff --git a/src/caosadvancedtools/cfood.py b/src/caosadvancedtools/cfood.py
index 012202ce..70f0a7b8 100644
--- a/src/caosadvancedtools/cfood.py
+++ b/src/caosadvancedtools/cfood.py
@@ -26,68 +26,24 @@
 
 import argparse
 import re
-from hashlib import sha256
 from argparse import RawTextHelpFormatter
+from copy import deepcopy
 from datetime import datetime
-import sqlite3
 
 import caosdb as db
+from caosadvancedtools.cache import Cache
 from caosdb.exceptions import TransactionError
 
+ENTITIES = {}
+
 
 def get_entity(name):
-    ent = db.Entity(name=name)
-    ent.retrieve()
-
-    return ent
-
-def get_pretty_xml(cont):
-    if isinstance(cont, list):
-        cont = db.Container().extend(cont)
-    if isinstance(cont, db.Entity):
-        cont = db.Container().insert(cont)
-
-    return etree.tounicode(cont.to_xml(
-        local_serialization=True), pretty_print=True)
-
-
-# TODO this is implementing a cache on client side. Should it be on
-# server side?
-class Cache(object):
-    CACHE_DB = "cache.db"
-    def __init__(self):
-        if not os.path.exists(Cache.CACHE_DB):
-            self.create_cache()
-
-    def create_cache(self):
-        conn = sqlite3.connect(Cache.CACHE_DB)
-        c = conn.cursor()
-        c.execute('''CREATE TABLE identifiables (digest text primary key, caosdb_id integer)''')
-        conn.commit()
-        conn.close()
-
-    def hash_entity(ent):
-        xml = get_pretty_xml(ent)
-        digest = sha256(xml.encode("utf-8")).hexdigest()
-
-    def insert(ent):
-        conn = sqlite3.connect(Cache.CACHE_DB)
-        c = conn.cursor()
-        c.execute('''INSERT INTO identifiables VALUES ({}, {})'''.format(
-            Cache.hash_entity(ent), ent.id)
-        )
-        conn.commit()
-        conn.close()
-
-    def check_existing(ent):
-        conn = sqlite3.connect(Cache.CACHE_DB)
-        c = conn.cursor()
-        c.execute('''Select * FROM stocks WHERE digest=?''', Cache.hash_entity(ent))
-        res = c.fetchone()
-        conn.commit()
-        conn.close()
-        return res
+    if name not in ENTITIES:
+        ent = db.Entity(name=name)
+        ent.retrieve()
+        ENTITIES[name] = ent
+    return ENTITIES[name]
 
 
 class AbstractCFood(object):
@@ -102,26 +58,51 @@ class AbstractCFood(object):
     def treat_match(self, crawled_file, match):
         entities = self.create_identifiables(crawled_file, match)
 
-        for key, ent in entities.items():
+        for key, identifiable in entities.items():
+            print(identifiable)
+
+            if identifiable is None:
+                print("THIS IS STRANGE")
+
+                continue
             existing = None
+
             if self.use_cache:
-                c = Cache()
-                existing_in_cache = c.check_existing(ent)
-                if existing_in_cache is not None:
-                    existing = db.Entity(id = existing_in_cache).retrieve()
+                identifiable_cache = Cache()
+                identifier = Cache.hash_entity(identifiable)
+                print("look for '{}' in Cache".format(identifiable.get_parents()[0]))
+                print(identifiable)
+                cached_id = identifiable_cache.check_existing(identifier)
+
+                if cached_id is not None:
+                    print(cached_id)
+                    print("found Entity '{}' in Cache".format(
+                        identifiable.get_parents()[0]))
+                    existing = db.execute_query("FIND {}".format(cached_id),
                                                unique=True)
+                else:
+                    print("did not find Entity '{}' in Cache".format(
+                        identifiable.get_parents()[0]))
+
+            # Nothing in cache or cache not used. Check in CaosDB
+            if existing is None:
+                existing = AbstractCFood.find_existing(identifiable)
 
-            existing = AbstractCFood.find_existing(ent)
+            # No record matching the identifiable was found. Insert the record
             if existing is None:
                 print(key, "does not exist")
-                ent.insert()
+                identifiable.insert()
                 print(key, "inserted")
-                entities[key] = ent
+                entities[key] = identifiable
             else:
                 print(key, "exists")
                 entities[key] = existing
 
+            if self.use_cache and cached_id is None:
+                identifiable_cache.insert(identifier, entities[key].id)
+
         self.update_identifiables(entities, crawled_file, match)
 
     def create_identifiables(self, crawled_file, match):
diff --git a/unittests/test_cache.py b/unittests/test_cache.py
new file mode 100644
index 00000000..9e26cadd
--- /dev/null
+++ b/unittests/test_cache.py
@@ -0,0 +1,59 @@
+#!/usr/bin/env python
+# encoding: utf-8
+#
+# ** header v3.0
+# This file is a part of the CaosDB Project.
+#
+# Copyright (C) 2019 Henrik tom Wörden
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <https://www.gnu.org/licenses/>.
+#
+# ** end header
+import os
+import unittest
+from copy import deepcopy
+from tempfile import NamedTemporaryFile
+
+import caosdb as db
+from caosadvancedtools.cache import Cache
+
+
+class CacheTest(unittest.TestCase):
+    def setUp(self):
+        self.cache = Cache(db_file=NamedTemporaryFile(delete=False).name)
+        self.cache.create_cache()
+
+    def test_hash(self):
+        ent = db.Record()
+        assert type(Cache.hash_entity(ent)) is str
+        assert (Cache.hash_entity(ent) !=
+                Cache.hash_entity(db.Record().add_parent("lol")))
+
+    def test_insert(self):
+        ent = db.Record()
+        ent2 = db.Record()
+        ent2.add_parent(name="Experiment")
+        ent_hash = Cache.hash_entity(ent)
+        print(ent_hash)
+        ent2_hash = Cache.hash_entity(ent2)
+        print(ent2_hash)
+        self.cache.insert(ent2_hash, 1235)
+        assert type(self.cache.check_existing(ent2_hash)) is int
+        assert self.cache.check_existing(ent_hash) is None
+
+    def test_hierarchy(self):
+        assert isinstance(db.Record(), db.Entity)
+
+    def tearDown(self):
+        os.remove(self.cache.db_file)
-- 
GitLab
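
Note (not part of the patch): a minimal usage sketch of the Cache class added above. It only uses the API introduced in this commit (hash_entity, insert, check_existing); the db_file path, the "Experiment" parent and the id 1234 are made-up examples, and no running CaosDB server is needed because nothing is retrieved.

    import caosdb as db
    from caosadvancedtools.cache import Cache

    # A throwaway cache file; Cache.__init__ creates the sqlite table
    # if the file does not exist yet.
    cache = Cache(db_file="/tmp/identifiable_cache.db")

    # Hash an identifiable (here a record with a parent) to a stable digest.
    rec = db.Record().add_parent(name="Experiment")
    digest = Cache.hash_entity(rec)

    # On a miss, store the CaosDB id under the digest; later lookups return
    # the cached id (an int) without querying the server.
    if cache.check_existing(digest) is None:
        cache.insert(digest, 1234)

    print(cache.check_existing(digest))  # -> 1234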