diff --git a/src/caosadvancedtools/cache.py b/src/caosadvancedtools/cache.py index ff807f2aba6210d643e675e7e3dd91d7c3b30906..a7d1e4526ab0d816f489e991be01ad08a94bc0b0 100644 --- a/src/caosadvancedtools/cache.py +++ b/src/caosadvancedtools/cache.py @@ -72,9 +72,11 @@ class Cache(object): without querying. """ - def __init__(self, db_file=None): + def __init__(self, db_file=None, default_name="cache.db"): if db_file is None: - self.db_file = "cache.db" + tmppath = tempfile.gettempdir() + tmpf = os.path.join(tmppath, default_name) + self.db_file = tmpf else: self.db_file = db_file @@ -148,18 +150,14 @@ class Cache(object): class UpdateCache(Cache): """ - stores unauthorized updates + stores unauthorized inserts and updates - If the Guard is set to a mode that does not allow an update, the update can - be stored in this cache such that it can be authorized and done later. + If the Guard is set to a mode that does not allow an insert or update, the insert or update can + be stored in this cache such that it can be authorized and performed later. """ def __init__(self, db_file=None): - if db_file is None: - tmppath = tempfile.gettempdir() - tmpf = os.path.join(tmppath, "crawler_update_cache.db") - db_file = tmpf - super().__init__(db_file=db_file) + super().__init__(db_file=db_file, default_name="crawler_insert_cache.db") @staticmethod def get_previous_version(cont): @@ -174,23 +172,32 @@ class UpdateCache(Cache): return old_ones - def insert(self, cont, run_id): - """Insert a pending, unauthorized update + def insert(self, cont, run_id, insert=False): + """Insert a pending, unauthorized insert or update + def insert(self, cont, run_id, insert=False): + """Insert a pending, unauthorized insert or update Parameters ---------- - cont: Container with the records to be updated containing the desired + cont: Container with the records to be inserted or updated, containing the desired version, i.e. the state after the update. run_id: int The id of the crawler run + insert: bool + Whether the entities in the container shall be inserted or updated. 
""" cont = put_in_container(cont) - old_ones = UpdateCache.get_previous_version(cont) + + if insert: + old_ones = "" + else: + old_ones = UpdateCache.get_previous_version(cont) new_ones = cont - old_hash = Cache.hash_entity(old_ones) + if insert: + old_hash = "" + else: + old_hash = Cache.hash_entity(old_ones) new_hash = Cache.hash_entity(new_ones) conn = sqlite3.connect(self.db_file) c = conn.cursor() @@ -210,20 +217,40 @@ class UpdateCache(Cache): conn.commit() conn.close() - def get_updates(self, run_id): + def get(self, run_id, querystring): """ returns the pending updates for a given run id Parameters: ----------- run_id: the id of the crawler run + querystring: the sql query """ conn = sqlite3.connect(self.db_file) c = conn.cursor() - c.execute('''Select * FROM updates WHERE run_id=?''', - (str(run_id),)) + c.execute(querystring, (str(run_id),)) res = c.fetchall() conn.commit() conn.close() return res + + def get_inserts(self, run_id): + """ returns the pending updates for a given run id + + Parameters: + ----------- + run_id: the id of the crawler run + """ + + return self.get(run_id, '''Select * FROM updates WHERE olddigest='' AND run_id=?''') + + def get_updates(self, run_id): + """ returns the pending updates for a given run id + + Parameters: + ----------- + run_id: the id of the crawler run + """ + + return self.get(run_id, '''Select * FROM updates WHERE olddigest!='' AND run_id=?''') diff --git a/src/caosadvancedtools/crawler.py b/src/caosadvancedtools/crawler.py index 099b8fd86656bd326c91e7754fa32a3d4ba76564..9e8f5fb324cccb095f98356b2b5e5aabc98bb383 100644 --- a/src/caosadvancedtools/crawler.py +++ b/src/caosadvancedtools/crawler.py @@ -208,6 +208,14 @@ class Crawler(object): run_id: the id of the crawler run """ cache = UpdateCache() + inserts = cache.get_inserts(run_id) + for _, _, _, new, _ in inserts: + new_cont = db.Container() + new_cont = new_cont.from_xml(new) + new_cont.insert(unique=False) + logger.info("Successfully inserted {} 
records!".format(len(new_cont))) + logger.info("Finished with authorized updates.") + changes = cache.get_updates(run_id) for _, _, old, new, _ in changes: