From 782db460721d57f8f2c5afa4bac842f80b25cfec Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Henrik=20tom=20W=C3=B6rden?= <henrik@trineo.org> Date: Thu, 8 Aug 2019 12:11:57 +0200 Subject: [PATCH] ENH: verbose mode and workaround for empty string props --- src/caosadvancedtools/cfood.py | 29 +++++++++++--- src/caosadvancedtools/crawler.py | 69 +++++++++++++++++++++++++++----- 2 files changed, 81 insertions(+), 17 deletions(-) diff --git a/src/caosadvancedtools/cfood.py b/src/caosadvancedtools/cfood.py index 76f7bce4..58cae5d0 100644 --- a/src/caosadvancedtools/cfood.py +++ b/src/caosadvancedtools/cfood.py @@ -60,7 +60,7 @@ class AbstractCFood(object): # function match() _pattern = None - def __init__(self, crawled_file, access=lambda x: x): + def __init__(self, crawled_file, access=lambda x: x, verbose=True): """ Abstract base class for Crawler food (CFood). Parameters @@ -77,6 +77,7 @@ class AbstractCFood(object): self.match = type(self).match(crawled_file.path) self.to_be_updated = db.Container() self.identifiables = db.Container() + self.verbose = verbose @staticmethod def get_re(): @@ -97,10 +98,14 @@ class AbstractCFood(object): The path of the file that shall be matched. """ - if cls._pattern is None: - cls._pattern = re.compile(cls.get_re()) + # TODO this does not quite work. Sometimes the wrong expression is in + # _pattern; FIX + # if cls._pattern is None: + # cls._pattern = re.compile(cls.get_re()) - return cls._pattern.match(string) + # return cls._pattern.match(string) + + return re.match(cls.get_re(), string) def create_identifiables(self): """ @@ -157,7 +162,7 @@ class AbstractCFood(object): def assure_object_is_in_list(obj, containing_object, property_name, - to_be_updated): + to_be_updated, verbose=True): """ Checks whether `obj` is one of the values in the list property `property_name` of the supplied entity containing_object`. @@ -183,13 +188,20 @@ def assure_object_is_in_list(obj, containing_object, property_name, break if contained: + if verbose: + print("{} is in {} of entity {}".format(obj, property_name, + containing_object.id)) + return + if verbose: + print("Appending {} to {} of entity {}".format(obj, property_name, + containing_object.id)) current_list.append(obj) to_be_updated.append(containing_object) -def assure_has_parent(entity, parent, to_be_updated): +def assure_has_parent(entity, parent, to_be_updated, verbose=True): """ Checks whether `entity` has a parent with name `parent`. @@ -207,8 +219,13 @@ def assure_has_parent(entity, parent, to_be_updated): break if contained: + if verbose: + print("entity {} has parent {}".format(entity.id, parent)) + return + if verbose: + print("Adding parent {} to entity {}".format(parent, entity.id)) entity.add_parent(parent) to_be_updated.append(entity) diff --git a/src/caosadvancedtools/crawler.py b/src/caosadvancedtools/crawler.py index 449b96ce..4d548d21 100644 --- a/src/caosadvancedtools/crawler.py +++ b/src/caosadvancedtools/crawler.py @@ -45,7 +45,7 @@ from .cache import Cache class Crawler(object): - def __init__(self, food, access=lambda x: x, use_cache=False): + def __init__(self, food, access=lambda x: x, use_cache=False, verbose=True): """ Parameters ---------- @@ -63,12 +63,13 @@ class Crawler(object): self.access = access self.report = db.Container() self.use_cache = use_cache + self.verbose = verbose if self.use_cache: self.cache = Cache() def crawl(self, files): - for crawled_file in files: + for crawled_file in sorted(files, key=lambda x: x.path): # if crawled_file.size == 0: # crawled_file.add_message( # type="Warning", description="This file is empty. Shouldn't we delete it?") @@ -76,17 +77,31 @@ class Crawler(object): # continue + if self.verbose: + msg = "Matching {}".format(crawled_file.path) + print("="*len(msg)) + print(msg) + print("="*len(msg)) + + matches = 0 + for Cfood in self.food: if Cfood.match(crawled_file.path) is not None: + matches += 1 + + if self.verbose: + print("{} matched.".format(Cfood.__name__)) try: - cfood = Cfood(crawled_file, access=self.access) + cfood = Cfood(crawled_file, access=self.access, + verbose=self.verbose) cfood.create_identifiables() if self.use_cache: hashes = self.cache.update_ids_from_cache( cfood.identifiables) - self.find_or_insert_identifiables(cfood.identifiables) + self.find_or_insert_identifiables(cfood.identifiables, + self.verbose) if self.use_cache: self.cache.insert_list(hashes, cfood.identifiables) @@ -97,8 +112,14 @@ class Crawler(object): traceback.print_exc() print(e) + if self.verbose and matches == 0: + print("ATTENTION: No matching cfood!") + + if self.verbose and matches > 1: + print("Attention: More than one matching cfood!") + @staticmethod - def find_or_insert_identifiables(identifiables): + def find_or_insert_identifiables(identifiables, verbose=True): """ Sets the ids of identifiables (that do not have already an id from the cache) based on searching CaosDB and retrieves those entities. The remaining entities (those which can not be retrieved) have no @@ -107,15 +128,22 @@ class Crawler(object): # looking for matching entities in CaosDB when there is no valid id # i.e. there was none set from a cache + if verbose: + print("-----------------------------------------------------") + for ent in identifiables: if ent.id is None or ent.id < 0: + if verbose: + print("Looking for:") + print(ent) existing = Crawler.find_existing(ent) if existing is not None: ent.id = existing.id - - # this makes entities with existing ids valid - # identifiables.retrieve(unique=True, raise_exception_on_error=False) + else: + if verbose: + print("Id is known of:") + print(ent) # insert missing, i.e. those which are not valid missing_identifiables = db.Container() @@ -126,9 +154,20 @@ class Crawler(object): for ent in missing_identifiables: ent.id = None + if verbose: + print("Going to insert the following entities:") + + for ent in missing_identifiables: + print(ent) missing_identifiables.insert() + + if verbose: + print("Updating entities from CaosDB...") identifiables.retrieve(unique=True, raise_exception_on_error=False) + if verbose: + print("-----------------------------------------------------") + @staticmethod def find_existing(entity): """searches for an entity that matches the identifiable in CaosDB @@ -140,12 +179,20 @@ class Crawler(object): if entity.name is None: # TODO multiple parents are ignored! Sufficient? query_string = "FIND Record " + entity.get_parents()[0].name - query_string += " WITH " + " AND ".join( - ["'" + p.name + "'='" - + str(get_value(p)) + "'" for p in entity.get_properties()]) + query_string += " WITH " + + for p in entity.get_properties(): + if p.value == "": + query_string += (" NOT '" + p.name + "' AND ") + else: + query_string += ("'" + p.name + "'='" + str(get_value(p)) + + "' AND ") + # remove the last AND + query_string = query_string[:-4] else: query_string = "FIND '{}'".format(entity.name) + print(query_string) q = db.Query(query_string) # the identifiable should identify an object uniquely. Thus the query # is using the unique keyword -- GitLab