diff --git a/integrationtests/full_test/crawl.py b/integrationtests/full_test/crawl.py index a53472ae34219d754f7cf4ecc8f479abdee36120..622213ec8f97ca2c068665a7f49b434a2a6e1587 100755 --- a/integrationtests/full_test/crawl.py +++ b/integrationtests/full_test/crawl.py @@ -50,18 +50,21 @@ def access(path): if __name__ == "__main__": - set_log_level(logging.DEBUG) + logger = logging.getLogger("caosadvancedtools") + conlogger = logging.getLogger("connection") + conlogger.setLevel(level=logging.ERROR) + logger.setLevel(level=logging.WARN) parser = get_parser() args = parser.parse_args() - print("Starting query...") + logger.info("Starting query...") files = Crawler.query_files(args.path) - print("Query done...") + logger.info("Query done...") config = db.configuration.get_config() c = Crawler(use_cache=True, access=access, food=[ProjectCFood, ExperimentCFood, AnalysisCFood, PublicationCFood, SimulationCFood, ]) - c.crawl(files, interactive=False, security_level=UPDATE) + c.crawl(files, interactive=False, security_level=INSERT, hideKnown=True) diff --git a/src/caosadvancedtools/cfood.py b/src/caosadvancedtools/cfood.py index ca68efdaa504c3c7b1771381f4bd599fdac15925..68365d8df476699d498636c8faad01eff8f21201 100644 --- a/src/caosadvancedtools/cfood.py +++ b/src/caosadvancedtools/cfood.py @@ -38,6 +38,7 @@ import logging import re import caosdb as db +from caosdb.exceptions import EntityDoesNotExist from .guard import global_guard as guard @@ -77,7 +78,7 @@ class AbstractCFood(object): A function that takes a CaosDB path and returns a local path """ self.access = access - self.crawled_file = None + self._crawled_file = None self.crawled_path = crawled_file self.match = type(self).match(crawled_file) self.to_be_updated = db.Container() @@ -85,6 +86,25 @@ class AbstractCFood(object): self.attached_ones = [] self.attached_filenames = [] + @property + def crawled_file(self): + if self._crawled_file is None: + try: + q = "FIND FILE WHICH IS STORED AT '{}'".format( + self.crawled_path) + self._crawled_file = db.execute_query(q, unique=True) + except EntityDoesNotExist: + path = "**" + + if not self.crawled_path.startswith("/"): + path = path + "/" + q = "FIND FILE WHICH IS STORED AT '{}{}'".format(path, + self.crawled_path) + logger.debug(q) + self._crawled_file = db.execute_query(q, unique=True) + + return self._crawled_file + def collect_information(self): """ The CFood collects information for further processing. diff --git a/src/caosadvancedtools/crawler.py b/src/caosadvancedtools/crawler.py index f5715b7a517f509998ff5b3bf03e81eb4ac8f7e6..d581d7599907e3aac044c49dd65f401b1d9ffcc7 100644 --- a/src/caosadvancedtools/crawler.py +++ b/src/caosadvancedtools/crawler.py @@ -37,6 +37,7 @@ match. This occurs in basically three steps: import logging +import os import traceback from datetime import datetime @@ -55,11 +56,15 @@ def separated(text): class UnknownCache(object): - def __init__(self, interactive=False): + def __init__(self, interactive=False, load=False): + if interactive and "y" == input( "\nDo you want to load filenames that previously were not " "matched by any CFood?\nIn that case, they will not show up " "again. (y)"): + load = True + + if load and os.path.exists("known_cache.db"): with open("known_cache.db") as fi: self.filenames = [el.strip("\n") for el in fi.readlines()] else: @@ -99,7 +104,7 @@ class Crawler(object): if self.use_cache: self.cache = Cache() - def match(self, files, interactive): + def match(self, files, interactive, hideKnown=False): errors_occured = False tbs = [] cfoods = [] @@ -151,12 +156,12 @@ class Crawler(object): # possibly load previously encountered "Missing matches" and # "Multiple matches" - ucache = UnknownCache(interactive=interactive) + ucache = UnknownCache(interactive=interactive, load=hideKnown) for crawled_file in files: if len(matches[crawled_file]) == 0: msg = ("ATTENTION: No matching cfood!\n" - "Tried to match {}".format(crawled_file)) + "Tried to match {}\n".format(crawled_file)) if crawled_file in ucache.filenames: logger.debug(msg) @@ -165,28 +170,30 @@ class Crawler(object): ucache.add(crawled_file) if len(matches[crawled_file]) > 1: - msg = ("Attention: More than one matching cfood!" + msg = ("Attention: More than one matching cfood!\n" + "Tried to match {}\n".format(crawled_file) + "\tRecordTypes:\t" + ", ".join( - matches[crawled_file])) + matches[crawled_file])+"\n") if crawled_file in ucache.filenames: logger.debug(msg) else: logger.warn(msg) - ucache.add(crawled_file) + ucache.add(crawled_file) # Save the encountered prblem matches ucache.save() return cfoods, matches, tbs, errors_occured - def crawl(self, files, interactive=True, security_level=RETRIEVE): + def crawl(self, files, interactive=True, hideKnown=False, + security_level=RETRIEVE): guard.set_level(level=security_level) files = sorted([f.path for f in files]) - cfoods, matches, tbs, errors_occured = self.match(files, interactive) + cfoods, matches, tbs, errors_occured = self.match(files, interactive, + hideKnown=hideKnown) if interactive and "y" != input("Do you want to continue? (y)"): return