import logging
import sys

# Package-level logger for "caosadvancedtools": INFO by default,
# echoed to stdout so crawler progress is visible on the console.
_handler = logging.StreamHandler(sys.stdout)
logger = logging.getLogger("caosadvancedtools")
logger.setLevel(logging.INFO)
logger.addHandler(_handler)
the identifiables are update based on the date in the file structure """ +import logging import re import caosdb as db from .guard import global_guard as guard -from .verbosity import DEBUG, INFO, VERBOSE ENTITIES = {} +logger = logging.getLogger(__name__) + def get_entity(name): """ Returns the entity with a given name, preferably from a local cache. @@ -63,7 +65,7 @@ class AbstractCFood(object): # function match() _pattern = None - def __init__(self, crawled_file, access=lambda x: x, verbosity=INFO): + def __init__(self, crawled_file, access=lambda x: x): """ Abstract base class for Crawler food (CFood). Parameters @@ -80,7 +82,6 @@ class AbstractCFood(object): self.match = type(self).match(crawled_file) self.to_be_updated = db.Container() self.identifiables = db.Container() - self.verbosity = verbosity self.attached_ones = [] self.attached_filenames = [] @@ -174,16 +175,12 @@ class AbstractCFood(object): self.to_be_updated = tmp - if self.verbosity >= INFO: - print("/"*60) - print("UPDATE: updating the following entities") + logger.info("UPDATE: updating the following entities") - for el in self.to_be_updated: - print(el.name if el.name is not None else el.id) + for el in self.to_be_updated: + logger.info("\t" + el.name if el.name is not None else el.id) - if self.verbosity >= DEBUG: - print(self.to_be_updated) - print("/"*60) + logger.debug(self.to_be_updated) guard.safe_update(self.to_be_updated) def attach(self, crawled_file): @@ -239,7 +236,7 @@ class AbstractCFood(object): def assure_object_is_in_list(obj, containing_object, property_name, - to_be_updated, datatype=None, verbosity=INFO): + to_be_updated, datatype=None): """ Checks whether `obj` is one of the values in the list property `property_name` of the supplied entity containing_object`. 
@@ -289,16 +286,12 @@ def assure_object_is_in_list(obj, containing_object, property_name, break if contained: - if verbosity >= VERBOSE: - print("{} is in {} of entity {}".format( + logger.debug("{} is in {} of entity {}".format( o, property_name, containing_object.id)) else: - if verbosity >= INFO: - print("/"*60) - print("UPDATE: Appending {} to {} of entity {}".format( + logger.info("UPDATE: Appending {} to {} of entity {}".format( o, property_name, containing_object.id)) - print("/"*60) current_list.append(o) update = True @@ -307,7 +300,7 @@ def assure_object_is_in_list(obj, containing_object, property_name, def assure_has_description(entity, description, to_be_updated=None, - verbosity=INFO, force=False): + force=False): """ Checks whether `entity` has the description that is passed. @@ -320,9 +313,7 @@ def assure_has_description(entity, description, to_be_updated=None, if entity.description == description: return - print("/"*60) - print("UPDATE: set description of entity {}".format(entity.id)) - print("/"*60) + logger.info("UPDATE: set description of entity {}".format(entity.id)) entity.description = description if to_be_updated is None: @@ -334,7 +325,7 @@ def assure_has_description(entity, description, to_be_updated=None, to_be_updated.append(entity) -def assure_has_parent(entity, parent, to_be_updated=None, verbosity=INFO, +def assure_has_parent(entity, parent, to_be_updated=None, force=False, unique=True): """ Checks whether `entity` has a parent with name `parent`. 
@@ -354,16 +345,12 @@ def assure_has_parent(entity, parent, to_be_updated=None, verbosity=INFO, break if contained: - if verbosity >= VERBOSE: - print("entity {} has parent {}".format(entity.id, parent)) + logger.debug("entity {} has parent {}".format(entity.id, parent)) return - if verbosity >= INFO: - print("/"*60) - print("UPDATE: Adding parent {} to entity {}".format(parent, - entity.id)) - print("/"*60) + logger.info("UPDATE: Adding parent {} to entity {}".format(parent, + entity.id)) entity.add_parent(parent) if to_be_updated is None: @@ -381,7 +368,7 @@ def assure_has_parent(entity, parent, to_be_updated=None, verbosity=INFO, def assure_has_property(entity, name, value, to_be_updated=None, - verbosity=INFO, datatype=None): + datatype=None): """ Checks whether `entity` has a property `name` with the value `value`. @@ -395,11 +382,8 @@ def assure_has_property(entity, name, value, to_be_updated=None, if entity.description == value: return else: - print("/"*60) - print("UPDATE: Adding description with value {} to " - "entity {}".format( - value, entity.id)) - print("/"*60) + logger.info("UPDATE: Adding description with value {} to " + "entity {}".format(value, entity.id)) entity.description = value if to_be_updated is None: @@ -421,19 +405,14 @@ def assure_has_property(entity, name, value, to_be_updated=None, break if contained: - if verbosity >= VERBOSE: - print("entity {} has property {} with value {}".format( - entity.id, - name, value)) + logger.debug("entity {} has property {} with value {}".format( + entity.id, name, value)) return - if verbosity >= INFO: - print("/"*60) - print("UPDATE: Adding property {} with value {} to entity {}".format( - name, - value, entity.id)) - print("/"*60) + logger.info( + "UPDATE: Adding property {} with value {} to entity {}".format( + name, value, entity.id)) if datatype is None: entity.add_property(name=name, value=value) diff --git a/src/caosadvancedtools/crawler.py b/src/caosadvancedtools/crawler.py index 
class UnknownCache(object):
    """Remembers file paths that were not matched by any CFood.

    Known non-matches are persisted in a plain text file (one path per
    line) so that later crawler runs can report them at debug level
    instead of warning about them again.
    """

    def __init__(self, interactive=False, filename="known_cache.db"):
        """Load previously recorded unmatched filenames.

        Parameters
        ----------
        interactive : bool, optional
            If True, ask on stdin whether the previously recorded
            filenames shall be loaded; they are only loaded after an
            explicit "y". Defaults to False.
        filename : str, optional
            Path of the persistence file. Defaults to
            "known_cache.db" in the current working directory (the
            previously hard-coded location).
        """
        self.filename = filename
        self.filenames = []

        if interactive and "y" == input(
                "\nDo you want to load filenames that previously were not "
                "matched by any CFood?\nIn that case, they will not show up "
                "again. (y)"):
            # On the very first run the cache file does not exist yet;
            # treat that the same as an empty cache instead of crashing
            # with FileNotFoundError.
            try:
                with open(self.filename) as fi:
                    self.filenames = [el.strip("\n") for el in fi.readlines()]
            except FileNotFoundError:
                pass

    def save(self):
        """Write all recorded filenames to the cache file, one per line."""
        with open(self.filename, "w") as fi:
            for name in self.filenames:
                fi.write(name + "\n")

    def add(self, el):
        """Record `el` (a file path) as unmatched."""
        self.filenames.append(el)
logger.debug("Matching against {}...".format(Cfood.__name__)) for crawled_file in files: - if self.verbosity >= DEBUG: - print("Matching {}...".format(crawled_file)) - if Cfood.match(crawled_file) is not None: matches[crawled_file].append(Cfood.__name__) - if self.verbosity >= VERBOSE: - print("{} matched\n{}.".format( + logger.debug("{} matched\n{}.".format( Cfood.__class__.__name__, crawled_file)) try: - cfood = Cfood.cook(crawled_file, access=self.access, - verbosity=self.verbosity) + cfood = Cfood.cook(crawled_file, access=self.access) if cfood is not None: cfoods.append(cfood) @@ -114,43 +131,53 @@ class Crawler(object): errors_occured = True tbs.append(e) - if self.verbosity >= INFO: - print("-"*60) - print("CFoods are collecting information...") + logger.info(separated("CFoods are collecting information...")) for cfood in cfoods: cfood.collect_information() - if self.verbosity >= INFO: - print("-"*60) - print("Trying to attach files to created CFoods") + logger.info(separated("Trying to attach files to created CFoods")) for cfood in cfoods: - if self.verbosity >= VERBOSE: - print("Matching against {}...".format(Cfood.__name__)) + logger.debug("Matching against {}...".format(Cfood.__name__)) for crawled_file in files: - if self.verbosity >= DEBUG: - print("Matching {}...".format(crawled_file)) - if cfood.looking_for(crawled_file): - if self.verbosity >= VERBOSE: - print("{} matched\n{}.".format( + logger.debug("{} matched\n{}.".format( Cfood.__class__.__name__, crawled_file)) cfood.attach(crawled_file) matches[crawled_file].append(Cfood.__name__) - if self.verbosity >= INFO: - for crawled_file in files: - if len(matches[crawled_file]) == 0: - print("ATTENTION: No matching cfood!") - print("Tried to match {}".format(crawled_file)) + # possibly load previously encountered "Missing matches" and + # "Multiple matches" + ucache = UnknownCache(interactive=interactive) + + for crawled_file in files: + if len(matches[crawled_file]) == 0: + msg = ("ATTENTION: No 
matching cfood!\n" + "Tried to match {}".format(crawled_file)) + + if crawled_file in ucache.filenames: + logger.debug(msg) + else: + logger.warn(msg) + ucache.add(crawled_file) + + if len(matches[crawled_file]) > 1: + msg = ("Attention: More than one matching cfood!" + + "Tried to match {}\n".format(crawled_file) + + "\tRecordTypes:\t" + ", ".join( + matches[crawled_file])) + + if crawled_file in ucache.filenames: + logger.debug(msg) + else: + logger.warn(msg) + ucache.add(crawled_file) - if len(matches[crawled_file]) > 1: - print("Attention: More than one matching cfood!") - print("Tried to match {}:".format(crawled_file)) - print("\t\t" + ", ".join(matches[crawled_file])) + # Save the encountered prblem matches + ucache.save() return cfoods, matches, tbs, errors_occured @@ -159,14 +186,12 @@ class Crawler(object): files = sorted([f.path for f in files]) - cfoods, matches, tbs, errors_occured = self.match(files) + cfoods, matches, tbs, errors_occured = self.match(files, interactive) if interactive and "y" != input("Do you want to continue? 
(y)"): return - if self.verbosity >= INFO: - print("-"*60) - print("Creating and updating Identifiables") + logger.info(separated("Creating and updating Identifiables")) for cfood in cfoods: try: @@ -176,9 +201,7 @@ class Crawler(object): hashes = self.cache.update_ids_from_cache( cfood.identifiables) - self.find_or_insert_identifiables(cfood.identifiables, - self.verbosity, - ) + self.find_or_insert_identifiables(cfood.identifiables) if self.use_cache: self.cache.insert_list(hashes, cfood.identifiables) @@ -194,19 +217,15 @@ class Crawler(object): errors_occured = True tbs.append(e) - if self.verbosity >= INFO: - print("-"*60) - - if errors_occured: - print("Crawler terminated with failures!") - print(tbs) - else: - print("Crawler terminated successfully!") - print("-"*60) + if errors_occured: + logger.warn("Crawler terminated with failures!") + logger.warn(tbs) + else: + logger.info("Crawler terminated successfully!") # TODO remove static? @staticmethod - def find_or_insert_identifiables(identifiables, verbosity=INFO): + def find_or_insert_identifiables(identifiables): """ Sets the ids of identifiables (that do not have already an id from the cache) based on searching CaosDB and retrieves those entities. The remaining entities (those which can not be retrieved) have no @@ -215,22 +234,15 @@ class Crawler(object): # looking for matching entities in CaosDB when there is no valid id # i.e. there was none set from a cache - if verbosity >= VERBOSE: - print("-----------------------------------------------------") - for ent in identifiables: if ent.id is None or ent.id < 0: - if verbosity >= VERBOSE: - print("Looking for:") - print(ent) - existing = Crawler.find_existing(ent, verbosity=verbosity) + logger.debug("Looking for: {}".format(ent)) + existing = Crawler.find_existing(ent) if existing is not None: ent.id = existing.id else: - if verbosity >= DEBUG: - print("Id is known of:") - print(ent) + logger.debug("Id is known of: {}".format(ent)) # insert missing, i.e. 
those which are not valid missing_identifiables = db.Container() @@ -241,27 +253,22 @@ class Crawler(object): for ent in missing_identifiables: ent.id = None - if verbosity >= INFO and len(missing_identifiables) > 0: - print("Going to insert the following entities:") + if len(missing_identifiables) > 0: + logger.info("Going to insert the following entities:") for ent in missing_identifiables: - print(ent) + logger.info(ent) if len(missing_identifiables) == 0: - if verbosity >= VERBOSE: - print("No new entities to be inserted.") + logger.debug("No new entities to be inserted.") else: guard.safe_insert(missing_identifiables) - if verbosity >= VERBOSE: - print("Retrieving entities from CaosDB...") + logger.debug("Retrieving entities from CaosDB...") identifiables.retrieve(unique=True, raise_exception_on_error=False) - if verbosity >= VERBOSE: - print("-----------------------------------------------------") - @staticmethod - def find_existing(entity, verbosity=INFO): + def find_existing(entity): """searches for an entity that matches the identifiable in CaosDB Characteristics of the identifiable like, properties, name or id are @@ -281,8 +288,7 @@ class Crawler(object): else: query_string = "FIND '{}'".format(entity.name) - if verbosity >= VERBOSE: - print(query_string) + logger.debug(query_string) q = db.Query(query_string) # the identifiable should identify an object uniquely. 
def set_log_level(level):
    """Set the verbosity of the "caosadvancedtools" package logger.

    Parameters
    ----------
    level : int
        A logging level such as ``logging.DEBUG`` or ``logging.INFO``.
    """
    # Bug fix: the previous version ignored ``level`` and always
    # forced logging.DEBUG.
    logger = logging.getLogger("caosadvancedtools")
    logger.setLevel(level=level)