Commit 43bc5aec authored by Henrik tom Wörden

ENH: allow storing unauthorized inserts

parent 3c8e6ded
@@ -72,9 +72,11 @@ class Cache(object):
     without querying.
     """
 
-    def __init__(self, db_file=None):
+    def __init__(self, db_file=None, default_name="cache.db"):
         if db_file is None:
-            self.db_file = "cache.db"
+            tmppath = tempfile.gettempdir()
+            tmpf = os.path.join(tmppath, default_name)
+            self.db_file = tmpf
         else:
             self.db_file = db_file
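
For illustration only, a minimal sketch of what the new default_name parameter does when no db_file is passed: the cache database is now created in the system temp directory instead of the working directory, and subclasses can choose their own default file name. The import path caosadvancedtools.cache is an assumption, not part of this diff.

    # Sketch only: the import location of Cache/UpdateCache is assumed.
    import os
    import tempfile

    from caosadvancedtools.cache import Cache, UpdateCache

    # Without an explicit db_file, the base class now builds the path from
    # the temp directory and the (new) default_name argument.
    cache = Cache()
    assert cache.db_file == os.path.join(tempfile.gettempdir(), "cache.db")

    # UpdateCache passes its own default name ("crawler_insert_cache.db"
    # after this commit, previously "crawler_update_cache.db").
    update_cache = UpdateCache()
    assert update_cache.db_file == os.path.join(tempfile.gettempdir(),
                                                "crawler_insert_cache.db")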
@@ -148,18 +150,14 @@ class Cache(object):
 
 class UpdateCache(Cache):
     """
-    stores unauthorized updates
+    stores unauthorized inserts and updates
 
-    If the Guard is set to a mode that does not allow an update, the update can
-    be stored in this cache such that it can be authorized and done later.
+    If the Guard is set to a mode that does not allow an insert or update, the insert or update can
+    be stored in this cache such that it can be authorized and performed later.
     """
 
     def __init__(self, db_file=None):
-        if db_file is None:
-            tmppath = tempfile.gettempdir()
-            tmpf = os.path.join(tmppath, "crawler_update_cache.db")
-            db_file = tmpf
-        super().__init__(db_file=db_file)
+        super().__init__(db_file=db_file, default_name="crawler_insert_cache.db")
 
     @staticmethod
     def get_previous_version(cont):
@@ -174,23 +172,32 @@ class UpdateCache(Cache):
         return old_ones
 
-    def insert(self, cont, run_id):
-        """Insert a pending, unauthorized update
+    def insert(self, cont, run_id, insert=False):
+        """Insert a pending, unauthorized insert or update
 
         Parameters
         ----------
-        cont: Container with the records to be updated containing the desired
+        cont: Container with the records to be inserted or updated, containing the desired
              version, i.e. the state after the update.
        run_id: int
            The id of the crawler run
+        insert: bool
+            Whether the entities in the container shall be inserted or updated.
        """
        cont = put_in_container(cont)
-        old_ones = UpdateCache.get_previous_version(cont)
+        if insert:
+            old_ones = ""
+        else:
+            old_ones = UpdateCache.get_previous_version(cont)
        new_ones = cont
 
-        old_hash = Cache.hash_entity(old_ones)
+        if insert:
+            old_hash = ""
+        else:
+            old_hash = Cache.hash_entity(old_ones)
        new_hash = Cache.hash_entity(new_ones)
 
        conn = sqlite3.connect(self.db_file)
        c = conn.cursor()
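
As a usage sketch (not taken from the commit itself): storing an entity that the crawler was not allowed to insert directly. With insert=True no previous version is fetched and an empty old digest is written, which is what later distinguishes pending inserts from pending updates. The caosdb import alias, the example Record, and the run id value are assumptions.

    # Illustrative sketch; run_id and the Record are made up for this example.
    import caosdb as db

    from caosadvancedtools.cache import UpdateCache  # assumed import path

    run_id = 1234                                # id of the current crawler run
    cont = db.Container()
    cont.append(db.Record(name="Experiment1"))   # entity awaiting authorization

    cache = UpdateCache()
    # insert=True: skip get_previous_version() and store an empty old digest,
    # marking this cached change as a pending insert rather than an update.
    cache.insert(cont, run_id, insert=True)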
@@ -210,20 +217,40 @@ class UpdateCache(Cache):
         conn.commit()
         conn.close()
 
-    def get_updates(self, run_id):
+    def get(self, run_id, querystring):
         """ returns the pending updates for a given run id
 
        Parameters:
        -----------
        run_id: the id of the crawler run
+        querystring: the sql query
        """
        conn = sqlite3.connect(self.db_file)
        c = conn.cursor()
-        c.execute('''Select * FROM updates WHERE run_id=?''',
-                  (str(run_id),))
+        c.execute(querystring, (str(run_id),))
        res = c.fetchall()
        conn.commit()
        conn.close()
 
        return res
+
+    def get_inserts(self, run_id):
+        """ returns the pending inserts for a given run id
+
+        Parameters:
+        -----------
+        run_id: the id of the crawler run
+        """
+
+        return self.get(run_id, '''Select * FROM updates WHERE olddigest='' AND run_id=?''')
+
+    def get_updates(self, run_id):
+        """ returns the pending updates for a given run id
+
+        Parameters:
+        -----------
+        run_id: the id of the crawler run
+        """
+
+        return self.get(run_id, '''Select * FROM updates WHERE olddigest!='' AND run_id=?''')
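
A short sketch of how the two new getters split the cached rows: get_inserts() selects rows whose olddigest column is empty (stored with insert=True), get_updates() selects the rest. The five-column row layout is inferred from the tuple unpacking in the Crawler change below and should be treated as an assumption.

    # Illustrative sketch; the exact column order is an assumption based on
    # the unpacking used in the Crawler code below.
    from caosadvancedtools.cache import UpdateCache  # assumed import path

    cache = UpdateCache()
    run_id = 1234

    for row in cache.get_inserts(run_id):
        _, _, _, new_xml, _ = row      # fourth column: XML of the desired new state
        print("pending insert:", new_xml[:60])

    for _, _, old_xml, new_xml, _ in cache.get_updates(run_id):
        print("pending update:", new_xml[:60])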
@@ -208,6 +208,14 @@ class Crawler(object):
         run_id: the id of the crawler run
         """
         cache = UpdateCache()
+        inserts = cache.get_inserts(run_id)
+        for _, _, _, new, _ in inserts:
+            new_cont = db.Container()
+            new_cont = new_cont.from_xml(new)
+            new_cont.insert(unique=False)
+            logger.info("Successfully inserted {} records!".format(len(new_cont)))
+        logger.info("Finished with authorized inserts.")
+
        changes = cache.get_updates(run_id)
 
        for _, _, old, new, _ in changes: