Skip to content
Snippets Groups Projects
Commit e8122342 authored by Henrik tom Wörden
Browse files

Merge branch 'cache' into crawler_dev

parents 78ac0afb 77a36426
No related branches found
No related tags found
No related merge requests found
......@@ -26,8 +26,10 @@
import argparse
import re
from hashlib import sha256
from argparse import RawTextHelpFormatter
from datetime import datetime
import sqlite3
import caosdb as db
from caosdb.exceptions import TransactionError
......@@ -38,16 +40,77 @@ def get_entity(name):
ent.retrieve()
return ent
def get_pretty_xml(cont):
    """Return a pretty-printed XML string for an entity, list or container.

    A ``list`` is first wrapped by extending a fresh ``db.Container`` with
    it; a single ``db.Entity`` is passed to ``db.Container().insert``.  The
    resulting object is serialized with ``to_xml(local_serialization=True)``
    and rendered via ``etree.tounicode(..., pretty_print=True)``.
    """
    if isinstance(cont, list):
        cont = db.Container().extend(cont)

    if isinstance(cont, db.Entity):
        # NOTE(review): presumably this is meant to wrap the single entity
        # in a container (i.e. ``append``); confirm that ``Container.insert``
        # does that rather than performing a server-side insert.
        cont = db.Container().insert(cont)

    xml_tree = cont.to_xml(local_serialization=True)
    return etree.tounicode(xml_tree, pretty_print=True)
# TODO this is implementing a cache on client side. Should it be on
# server side?
class Cache(object):
    """Client-side cache mapping entity digests to CaosDB entity ids.

    The cache is a single SQLite table ``identifiables`` with the columns
    ``digest`` (primary key, SHA-256 hex digest of the entity's pretty XML)
    and ``caosdb_id`` (integer id of the corresponding entity).

    ``hash_entity``, ``insert`` and ``check_existing`` are static methods, so
    they can be called on the class (``Cache.insert(ent)``) as well as on an
    instance (``Cache().insert(ent)``).
    """

    # Path of the SQLite file that backs the cache.
    CACHE_DB = "cache.db"

    def __init__(self):
        """Create the cache database if the file does not exist yet."""
        if not os.path.exists(Cache.CACHE_DB):
            self.create_cache()

    def create_cache(self):
        """Create the ``identifiables`` table in the cache database.

        The statement uses ``IF NOT EXISTS`` so that calling this method on
        an already initialised database is a no-op instead of an error.
        """
        conn = sqlite3.connect(Cache.CACHE_DB)
        try:
            conn.execute(
                '''CREATE TABLE IF NOT EXISTS identifiables
                   (digest text primary key, caosdb_id integer)''')
            conn.commit()
        finally:
            # Always release the file handle, even if the statement failed.
            conn.close()

    @staticmethod
    def hash_entity(ent):
        """Return the SHA-256 hex digest of the pretty-printed XML of ``ent``.

        The XML is produced by ``get_pretty_xml`` and encoded as UTF-8 before
        hashing.
        """
        xml = get_pretty_xml(ent)

        return sha256(xml.encode("utf-8")).hexdigest()

    @staticmethod
    def insert(ent):
        """Store the digest of ``ent`` together with ``ent.id`` in the cache.

        Raises:
            sqlite3.IntegrityError: if the digest is already cached (the
                digest column is the primary key).
        """
        digest = Cache.hash_entity(ent)
        conn = sqlite3.connect(Cache.CACHE_DB)
        try:
            # Bound parameters: the digest is text and must not be pasted
            # into the statement unquoted.
            conn.execute('''INSERT INTO identifiables VALUES (?, ?)''',
                         (digest, ent.id))
            conn.commit()
        finally:
            conn.close()

    @staticmethod
    def check_existing(ent):
        """Look up ``ent`` in the cache.

        Returns:
            The cached ``caosdb_id`` for the digest of ``ent``, or ``None``
            if no row with that digest exists.
        """
        digest = Cache.hash_entity(ent)
        conn = sqlite3.connect(Cache.CACHE_DB)
        try:
            # The bindings must be a sequence; a bare string would be
            # interpreted as one binding per character.
            row = conn.execute(
                '''SELECT caosdb_id FROM identifiables WHERE digest=?''',
                (digest,)).fetchone()
        finally:
            conn.close()

        return None if row is None else row[0]
class AbstractCFood(object):
def __init__(self, pattern):
# TODO restructure this class such that no instance is needed to check for
# a match
# instances shall be used to keep track of a match; i.e. entities can be
# object variable
def __init__(self, pattern, use_cache=False):
self.pattern = re.compile(pattern)
self.use_cache = use_cache
def treat_match(self, crawled_file, match):
entities = self.create_identifiables(crawled_file, match)
for key, ent in entities.items():
existing = None
if self.use_cache:
c = Cache()
existing_in_cache = c.check_existing(ent)
if existing_in_cache is not None:
existing = db.Entity(id = existing_in_cache).retrieve()
existing = AbstractCFood.find_existing(ent)
if existing is None:
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment