def get_pretty_xml(cont):
    """Return the pretty-printed XML serialization of ``cont`` as a string.

    Parameters
    ----------
    cont : list, db.Entity or db.Container
        A plain list of entities and a single entity are wrapped in a new
        ``db.Container`` first; anything else is serialized as it is.

    Returns
    -------
    str
        Result of ``etree.tounicode`` applied to
        ``cont.to_xml(local_serialization=True)`` with ``pretty_print=True``.
    """
    # NOTE(review): `etree` is not imported in the part of the file visible
    # here; `tounicode` suggests it is `lxml.etree` -- confirm it is in scope.
    if isinstance(cont, list):
        cont = db.Container().extend(cont)
    elif isinstance(cont, db.Entity):
        # `append` only adds the entity to the local container.  The former
        # `Container.insert(cont)` is the server-side insert transaction and
        # must not be triggered by a pure serialization helper.
        cont = db.Container().append(cont)

    return etree.tounicode(
        cont.to_xml(local_serialization=True),
        pretty_print=True,
    )


# TODO this is implementing a cache on client side. Should it be on
# server side?
class Cache(object):
    """Client-side cache that maps entity digests to CaosDB ids.

    The cache lives in the SQLite file ``CACHE_DB`` and consists of a single
    table ``identifiables(digest text primary key, caosdb_id integer)``.
    The digest is the SHA-256 hex digest of the entity's pretty-printed XML
    as produced by ``get_pretty_xml``.
    """

    # Path of the SQLite database file (relative to the working directory).
    CACHE_DB = "cache.db"

    def __init__(self):
        """Make sure the cache database and its table exist."""
        self.create_cache()

    @staticmethod
    def _connect():
        """Open a new connection to the cache database file."""
        return sqlite3.connect(Cache.CACHE_DB)

    def create_cache(self):
        """Create the ``identifiables`` table unless it already exists.

        ``IF NOT EXISTS`` makes the call idempotent, so no separate check
        for the database file is needed before calling it.
        """
        conn = Cache._connect()
        try:
            # `with conn` commits on success and rolls back on error; it
            # does not close the connection, hence the `finally` below.
            with conn:
                conn.execute(
                    "CREATE TABLE IF NOT EXISTS identifiables "
                    "(digest text primary key, caosdb_id integer)"
                )
        finally:
            conn.close()

    # The three methods below are static so that both call styles work:
    # `Cache.hash_entity(ent)` and `Cache().check_existing(ent)`.

    @staticmethod
    def hash_entity(ent):
        """Return the SHA-256 hex digest of the XML serialization of ``ent``."""
        xml = get_pretty_xml(ent)
        return sha256(xml.encode("utf-8")).hexdigest()

    @staticmethod
    def insert(ent):
        """Store the digest of ``ent`` together with ``ent.id``.

        Raises
        ------
        sqlite3.IntegrityError
            If the digest is already stored (``digest`` is the primary key).
        """
        digest = Cache.hash_entity(ent)
        conn = Cache._connect()
        try:
            with conn:
                # Bound parameters instead of string formatting: the digest
                # is text and would otherwise be unquoted, invalid SQL.
                conn.execute(
                    "INSERT INTO identifiables VALUES (?, ?)",
                    (digest, ent.id),
                )
        finally:
            conn.close()

    @staticmethod
    def check_existing(ent):
        """Look up ``ent`` in the cache.

        Returns
        -------
        int or None
            The stored ``caosdb_id`` for the digest of ``ent``, or ``None``
            when the digest is not in the cache.
        """
        digest = Cache.hash_entity(ent)
        conn = Cache._connect()
        try:
            row = conn.execute(
                "SELECT caosdb_id FROM identifiables WHERE digest=?",
                (digest,),
            ).fetchone()
        finally:
            conn.close()

        return None if row is None else row[0]