Skip to content
Snippets Groups Projects
Commit 43bc5aec authored by Henrik tom Wörden's avatar Henrik tom Wörden
Browse files

ENH: allow to store unauthorized inserts

parent 3c8e6ded
No related branches found
No related tags found
2 merge requests!59FIX: if multiple updates for one entity exist, the retrieve would result in an...,!56F insert auth
Pipeline #28055 passed
...@@ -72,9 +72,11 @@ class Cache(object): ...@@ -72,9 +72,11 @@ class Cache(object):
without querying. without querying.
""" """
def __init__(self, db_file=None): def __init__(self, db_file=None, default_name="cache.db"):
if db_file is None: if db_file is None:
self.db_file = "cache.db" tmppath = tempfile.gettempdir()
tmpf = os.path.join(tmppath, default_name)
self.db_file = tmpf
else: else:
self.db_file = db_file self.db_file = db_file
...@@ -148,18 +150,14 @@ class Cache(object): ...@@ -148,18 +150,14 @@ class Cache(object):
class UpdateCache(Cache): class UpdateCache(Cache):
""" """
stores unauthorized updates stores unauthorized inserts and updates
If the Guard is set to a mode that does not allow an update, the update can If the Guard is set to a mode that does not allow an insert or update, the insert or update can
be stored in this cache such that it can be authorized and done later. be stored in this cache such that it can be authorized and performed later.
""" """
def __init__(self, db_file=None): def __init__(self, db_file=None):
if db_file is None: super().__init__(db_file=db_file, default_name="crawler_insert_cache.db")
tmppath = tempfile.gettempdir()
tmpf = os.path.join(tmppath, "crawler_update_cache.db")
db_file = tmpf
super().__init__(db_file=db_file)
@staticmethod @staticmethod
def get_previous_version(cont): def get_previous_version(cont):
...@@ -174,23 +172,32 @@ class UpdateCache(Cache): ...@@ -174,23 +172,32 @@ class UpdateCache(Cache):
return old_ones return old_ones
def insert(self, cont, run_id): def insert(self, cont, run_id, insert=False):
"""Insert a pending, unauthorized update """Insert a pending, unauthorized inserts
Parameters Parameters
---------- ----------
cont: Container with the records to be updated containing the desired cont: Container with the records to be inserted containing the desired
version, i.e. the state after the update. version, i.e. the state after the update.
run_id: int run_id: int
The id of the crawler run The id of the crawler run
insert: bool
Whether the entities in the container shall be inserted or updated.
""" """
cont = put_in_container(cont) cont = put_in_container(cont)
old_ones = UpdateCache.get_previous_version(cont)
if insert:
old_ones = ""
else:
old_ones = UpdateCache.get_previous_version(cont)
new_ones = cont new_ones = cont
old_hash = Cache.hash_entity(old_ones) if insert:
old_hash = ""
else:
old_hash = Cache.hash_entity(old_ones)
new_hash = Cache.hash_entity(new_ones) new_hash = Cache.hash_entity(new_ones)
conn = sqlite3.connect(self.db_file) conn = sqlite3.connect(self.db_file)
c = conn.cursor() c = conn.cursor()
...@@ -210,20 +217,40 @@ class UpdateCache(Cache): ...@@ -210,20 +217,40 @@ class UpdateCache(Cache):
conn.commit() conn.commit()
conn.close() conn.close()
def get_updates(self, run_id): def get(self, run_id, querystring):
""" returns the pending updates for a given run id """ returns the pending updates for a given run id
Parameters: Parameters:
----------- -----------
run_id: the id of the crawler run run_id: the id of the crawler run
querystring: the sql query
""" """
conn = sqlite3.connect(self.db_file) conn = sqlite3.connect(self.db_file)
c = conn.cursor() c = conn.cursor()
c.execute('''Select * FROM updates WHERE run_id=?''', c.execute(querystring, (str(run_id),))
(str(run_id),))
res = c.fetchall() res = c.fetchall()
conn.commit() conn.commit()
conn.close() conn.close()
return res return res
def get_inserts(self, run_id):
""" returns the pending updates for a given run id
Parameters:
-----------
run_id: the id of the crawler run
"""
return self.get(run_id, '''Select * FROM updates WHERE olddigest='' AND run_id=?''')
def get_updates(self, run_id):
""" returns the pending updates for a given run id
Parameters:
-----------
run_id: the id of the crawler run
"""
return self.get(run_id, '''Select * FROM updates WHERE olddigest!='' AND run_id=?''')
...@@ -208,6 +208,14 @@ class Crawler(object): ...@@ -208,6 +208,14 @@ class Crawler(object):
run_id: the id of the crawler run run_id: the id of the crawler run
""" """
cache = UpdateCache() cache = UpdateCache()
inserts = cache.get_inserts(run_id)
for _, _, _, new, _ in inserts:
new_cont = db.Container()
new_cont = new_cont.from_xml(new)
new_cont.insert(unique=False)
logger.info("Successfully inserted {} records!".format(len(new_cont)))
logger.info("Finished with authorized updates.")
changes = cache.get_updates(run_id) changes = cache.get_updates(run_id)
for _, _, old, new, _ in changes: for _, _, old, new, _ in changes:
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment