Skip to content
Snippets Groups Projects
Commit 90c5e313 authored by Henrik tom Wörden's avatar Henrik tom Wörden
Browse files

bugfixes maint

parent 58bf1590
No related branches found
No related tags found
No related merge requests found
# TODO: This implements the cache on the client side. Should it live on the
# server side instead?
import os
import sqlite3
from hashlib import sha256
import caosdb as db
from lxml import etree
def get_pretty_xml(cont):
    """Serialize ``cont`` to pretty-printed XML text.

    ``cont`` may be a list (its items are put into a new ``db.Container``),
    a ``db.Container``, or any other object, which is appended to a new
    ``db.Container`` before serialization.

    Returns the text produced by ``etree.tounicode`` for the container's
    locally serialized XML.
    """
    container = cont

    if isinstance(container, list):
        container = db.Container().extend(container)

    if not isinstance(container, db.Container):
        container = db.Container().append(container)

    xml_tree = container.to_xml(local_serialization=True)

    return etree.tounicode(xml_tree, pretty_print=True)
class Cache(object):
    """SQLite-backed store that maps entity digests to CaosDB ids.

    Each operation opens its own connection to ``db_file`` and closes it
    again, also when the SQL statement raises.
    """

    def __init__(self, db_file=None):
        """Remember the database path and create the database if it is missing.

        Parameters
        ----------
        db_file : str, optional
            Path of the SQLite file; ``"cache.db"`` is used when omitted.
        """
        self.db_file = "cache.db" if db_file is None else db_file

        # An already existing file is left alone; callers that pass an empty
        # file have to call create_cache() themselves.
        if not os.path.exists(self.db_file):
            self.create_cache()

    def create_cache(self):
        """Create the ``identifiables`` table unless it already exists."""
        conn = sqlite3.connect(self.db_file)
        try:
            conn.execute(
                "CREATE TABLE IF NOT EXISTS identifiables "
                "(digest text primary key, caosdb_id integer)")
            conn.commit()
        finally:
            conn.close()

    @staticmethod
    def hash_entity(ent):
        """Return the SHA-256 hex digest of the pretty-printed XML of ``ent``.

        Declared static so that it can be called on the class as well as on
        an instance.
        """
        xml = get_pretty_xml(ent)

        return sha256(xml.encode("utf-8")).hexdigest()

    def insert(self, ent_hash, ent_id):
        """Store the pair (``ent_hash``, ``ent_id``).

        ``digest`` is the primary key of the table, so storing a digest
        that is already present raises ``sqlite3.IntegrityError``.
        """
        conn = sqlite3.connect(self.db_file)
        try:
            conn.execute("INSERT INTO identifiables VALUES (?, ?)",
                         (ent_hash, ent_id))
            conn.commit()
        finally:
            conn.close()

    def check_existing(self, ent_hash):
        """Return the id stored for ``ent_hash`` or ``None`` if there is none."""
        conn = sqlite3.connect(self.db_file)
        try:
            row = conn.execute(
                "SELECT * FROM identifiables WHERE digest=?",
                (ent_hash,)).fetchone()
        finally:
            conn.close()

        # Columns are (digest, caosdb_id); only the id is handed back.
        return None if row is None else row[1]
......@@ -26,68 +26,24 @@
import argparse
import re
from hashlib import sha256
from argparse import RawTextHelpFormatter
from copy import deepcopy
from datetime import datetime
import sqlite3
import caosdb as db
from caosadvancedtools.cache import Cache
from caosdb.exceptions import TransactionError
ENTITIES = {}
def get_entity(name):
    """Return the entity called ``name``, retrieving it at most once.

    The retrieved entity is kept in the module-level ``ENTITIES`` dict, so
    later calls with the same name return the stored object instead of
    calling ``retrieve()`` again.
    """
    if name not in ENTITIES:
        ent = db.Entity(name=name)
        ent.retrieve()
        ENTITIES[name] = ent

    return ENTITIES[name]
def get_pretty_xml(cont):
    """Serialize ``cont`` to pretty-printed XML text.

    ``cont`` may be a list of entities, a single ``db.Entity`` or a
    ``db.Container``. Lists and single entities are put into a new
    ``db.Container`` first.

    Returns the text produced by ``etree.tounicode`` for the container's
    locally serialized XML.
    """
    if isinstance(cont, list):
        cont = db.Container().extend(cont)

    if isinstance(cont, db.Entity):
        # append() adds the entity to the local container. Container.insert()
        # is the transaction call and does not take an entity to add.
        cont = db.Container().append(cont)

    return etree.tounicode(cont.to_xml(local_serialization=True),
                           pretty_print=True)
# TODO: This implements the cache on the client side. Should it live on the
# server side instead?
class Cache(object):
    """SQLite-backed store that maps entity digests to CaosDB ids.

    The database file is fixed to ``Cache.CACHE_DB``. ``hash_entity``,
    ``insert`` and ``check_existing`` take no ``self`` and are therefore
    declared static, which keeps ``Cache.insert(ent)`` working and also
    allows calls on an instance.
    """

    CACHE_DB = "cache.db"

    def __init__(self):
        """Create the database if the file does not exist yet."""
        if not os.path.exists(Cache.CACHE_DB):
            self.create_cache()

    def create_cache(self):
        """Create the ``identifiables`` table unless it already exists."""
        conn = sqlite3.connect(Cache.CACHE_DB)
        try:
            conn.execute(
                "CREATE TABLE IF NOT EXISTS identifiables "
                "(digest text primary key, caosdb_id integer)")
            conn.commit()
        finally:
            conn.close()

    @staticmethod
    def hash_entity(ent):
        """Return the SHA-256 hex digest of the pretty-printed XML of ``ent``."""
        xml = get_pretty_xml(ent)

        return sha256(xml.encode("utf-8")).hexdigest()

    @staticmethod
    def insert(ent):
        """Store the digest of ``ent`` together with ``ent.id``.

        ``digest`` is the primary key of the table, so storing an entity
        whose digest is already present raises ``sqlite3.IntegrityError``.
        """
        conn = sqlite3.connect(Cache.CACHE_DB)
        try:
            # Bound parameters instead of str.format: the digest is text and
            # must not be spliced into the statement unquoted.
            conn.execute("INSERT INTO identifiables VALUES (?, ?)",
                         (Cache.hash_entity(ent), ent.id))
            conn.commit()
        finally:
            conn.close()

    @staticmethod
    def check_existing(ent):
        """Return the id stored for the digest of ``ent`` or ``None``."""
        conn = sqlite3.connect(Cache.CACHE_DB)
        try:
            row = conn.execute(
                "SELECT * FROM identifiables WHERE digest=?",
                (Cache.hash_entity(ent),)).fetchone()
        finally:
            conn.close()

        # Columns are (digest, caosdb_id); only the id is handed back.
        return None if row is None else row[1]
if name not in ENTITIES:
ent = db.Entity(name=name)
ent.retrieve()
ENTITIES[name] = ent
return ENTITIES[name]
class AbstractCFood(object):
......@@ -102,26 +58,51 @@ class AbstractCFood(object):
def treat_match(self, crawled_file, match):
    """Make sure every identifiable of a match exists, then update them.

    For each identifiable returned by ``create_identifiables`` the method

    1. looks its digest up in the ``Cache`` (only if ``self.use_cache``),
    2. otherwise asks ``AbstractCFood.find_existing`` for it,
    3. inserts the identifiable if neither lookup produced a record.

    The entry in ``entities`` is replaced by the record that was found or
    inserted. A record that did not come from the cache is stored there
    afterwards. Finally ``update_identifiables`` is called with the
    resulting mapping.

    ``None`` identifiables are reported and skipped.
    """
    entities = self.create_identifiables(crawled_file, match)

    for key, identifiable in entities.items():
        print(identifiable)

        if identifiable is None:
            print("THIS IS STRANGE")
            continue

        existing = None
        # Reset per iteration so a value from a previous identifiable can
        # never leak into the cache update below.
        cached_id = None

        if self.use_cache:
            identifiable_cache = Cache()
            identifier = Cache.hash_entity(identifiable)
            print("look for '{}' in Cache".format(
                identifiable.get_parents()[0]))
            print(identifiable)
            cached_id = identifiable_cache.check_existing(identifier)

            if cached_id is not None:
                print(cached_id)
                print("found Entity '{}' in Cache".format(
                    identifiable.get_parents()[0]))
                existing = db.execute_query("FIND {}".format(cached_id),
                                            unique=True)
            else:
                print("did not found Entity '{}' in Cache".format(
                    identifiable.get_parents()[0]))

        # Nothing in cache or cache not used. Check in CaosDB
        if existing is None:
            existing = AbstractCFood.find_existing(identifiable)

        # No record matching the identifiable was found. Insert the record
        if existing is None:
            print(key, "does not exist")
            identifiable.insert()
            print(key, "inserted")
            entities[key] = identifiable
        else:
            print(key, "exists")
            entities[key] = existing

        # Remember the id for the next run unless it came from the cache.
        if self.use_cache and cached_id is None:
            identifiable_cache.insert(identifier, entities[key].id)

    self.update_identifiables(entities, crawled_file, match)
def create_identifiables(self, crawled_file, match):
......
#!/usr/bin/env python
# encoding: utf-8
#
# ** header v3.0
# This file is a part of the CaosDB Project.
#
# Copyright (C) 2019 Henrik tom Wörden
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
#
# ** end header
import os
import unittest
from copy import deepcopy
from tempfile import NamedTemporaryFile
import caosdb as db
from caosadvancedtools.cache import Cache
class CacheTest(unittest.TestCase):
    """Exercise hashing, insertion and lookup of ``Cache`` on a temporary file."""

    def setUp(self):
        # Only the path is needed; the handle is closed right away so that
        # the cache is the only user of the file.
        with NamedTemporaryFile(delete=False) as tmp:
            db_file = tmp.name

        self.cache = Cache(db_file=db_file)
        # Called explicitly so that the table exists although the database
        # file was already present when the Cache was constructed.
        self.cache.create_cache()

    def test_hash(self):
        ent = db.Record()
        self.assertIsInstance(Cache.hash_entity(ent), str)
        self.assertNotEqual(
            Cache.hash_entity(ent),
            Cache.hash_entity(db.Record().add_parent("lol")))

    def test_insert(self):
        ent = db.Record()
        ent2 = db.Record()
        ent2.add_parent(name="Experiment")
        ent_hash = Cache.hash_entity(ent)
        ent2_hash = Cache.hash_entity(ent2)
        self.cache.insert(ent2_hash, 1235)

        cached_id = self.cache.check_existing(ent2_hash)
        self.assertIsInstance(cached_id, int)
        self.assertEqual(cached_id, 1235)
        # A digest that was never inserted must not be found.
        self.assertIsNone(self.cache.check_existing(ent_hash))

    def test_hirarchy(self):
        self.assertIsInstance(db.Record(), db.Entity)

    def tearDown(self):
        os.remove(self.cache.db_file)
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment