diff --git a/src/caosadvancedtools/cfood.py b/src/caosadvancedtools/cfood.py index 76f7bce4244f5a6cf96f075bade8748591ef29e9..58cae5d0e65750a3e47618b5b228fabc007a097a 100644 --- a/src/caosadvancedtools/cfood.py +++ b/src/caosadvancedtools/cfood.py @@ -60,7 +60,7 @@ class AbstractCFood(object): # function match() _pattern = None - def __init__(self, crawled_file, access=lambda x: x): + def __init__(self, crawled_file, access=lambda x: x, verbose=True): """ Abstract base class for Crawler food (CFood). Parameters @@ -77,6 +77,7 @@ class AbstractCFood(object): self.match = type(self).match(crawled_file.path) self.to_be_updated = db.Container() self.identifiables = db.Container() + self.verbose = verbose @staticmethod def get_re(): @@ -97,10 +98,14 @@ class AbstractCFood(object): The path of the file that shall be matched. """ - if cls._pattern is None: - cls._pattern = re.compile(cls.get_re()) + # TODO this does not quite work. Sometimes the wrong expression is in + # _pattern; FIX + # if cls._pattern is None: + # cls._pattern = re.compile(cls.get_re()) - return cls._pattern.match(string) + # return cls._pattern.match(string) + + return re.match(cls.get_re(), string) def create_identifiables(self): """ @@ -157,7 +162,7 @@ class AbstractCFood(object): def assure_object_is_in_list(obj, containing_object, property_name, - to_be_updated): + to_be_updated, verbose=True): """ Checks whether `obj` is one of the values in the list property `property_name` of the supplied entity containing_object`. @@ -183,13 +188,20 @@ def assure_object_is_in_list(obj, containing_object, property_name, break if contained: + if verbose: + print("{} is in {} of entity {}".format(obj, property_name, + containing_object.id)) + return + if verbose: + print("Appending {} to {} of entity {}".format(obj, property_name, + containing_object.id)) current_list.append(obj) to_be_updated.append(containing_object) -def assure_has_parent(entity, parent, to_be_updated): +def assure_has_parent(entity, parent, to_be_updated, verbose=True): """ Checks whether `entity` has a parent with name `parent`. @@ -207,8 +219,13 @@ def assure_has_parent(entity, parent, to_be_updated): break if contained: + if verbose: + print("entity {} has parent {}".format(entity.id, parent)) + return + if verbose: + print("Adding parent {} to entity {}".format(parent, entity.id)) entity.add_parent(parent) to_be_updated.append(entity) diff --git a/src/caosadvancedtools/crawler.py b/src/caosadvancedtools/crawler.py index 449b96ce9fd9fc0bb941a4e51ca671ef9aed3ce2..4d548d213d70ca63f0f5ef963da7ae8435b9e883 100644 --- a/src/caosadvancedtools/crawler.py +++ b/src/caosadvancedtools/crawler.py @@ -45,7 +45,7 @@ from .cache import Cache class Crawler(object): - def __init__(self, food, access=lambda x: x, use_cache=False): + def __init__(self, food, access=lambda x: x, use_cache=False, verbose=True): """ Parameters ---------- @@ -63,12 +63,13 @@ class Crawler(object): self.access = access self.report = db.Container() self.use_cache = use_cache + self.verbose = verbose if self.use_cache: self.cache = Cache() def crawl(self, files): - for crawled_file in files: + for crawled_file in sorted(files, key=lambda x: x.path): # if crawled_file.size == 0: # crawled_file.add_message( # type="Warning", description="This file is empty. Shouldn't we delete it?") @@ -76,17 +77,31 @@ class Crawler(object): # continue + if self.verbose: + msg = "Matching {}".format(crawled_file.path) + print("="*len(msg)) + print(msg) + print("="*len(msg)) + + matches = 0 + for Cfood in self.food: if Cfood.match(crawled_file.path) is not None: + matches += 1 + + if self.verbose: + print("{} matched.".format(Cfood.__name__)) try: - cfood = Cfood(crawled_file, access=self.access) + cfood = Cfood(crawled_file, access=self.access, + verbose=self.verbose) cfood.create_identifiables() if self.use_cache: hashes = self.cache.update_ids_from_cache( cfood.identifiables) - self.find_or_insert_identifiables(cfood.identifiables) + self.find_or_insert_identifiables(cfood.identifiables, + self.verbose) if self.use_cache: self.cache.insert_list(hashes, cfood.identifiables) @@ -97,8 +112,14 @@ class Crawler(object): traceback.print_exc() print(e) + if self.verbose and matches == 0: + print("ATTENTION: No matching cfood!") + + if self.verbose and matches > 1: + print("Attention: More than one matching cfood!") + @staticmethod - def find_or_insert_identifiables(identifiables): + def find_or_insert_identifiables(identifiables, verbose=True): """ Sets the ids of identifiables (that do not have already an id from the cache) based on searching CaosDB and retrieves those entities. The remaining entities (those which can not be retrieved) have no @@ -107,15 +128,22 @@ class Crawler(object): # looking for matching entities in CaosDB when there is no valid id # i.e. there was none set from a cache + if verbose: + print("-----------------------------------------------------") + for ent in identifiables: if ent.id is None or ent.id < 0: + if verbose: + print("Looking for:") + print(ent) existing = Crawler.find_existing(ent) if existing is not None: ent.id = existing.id - - # this makes entities with existing ids valid - # identifiables.retrieve(unique=True, raise_exception_on_error=False) + else: + if verbose: + print("Id is known of:") + print(ent) # insert missing, i.e. those which are not valid missing_identifiables = db.Container() @@ -126,9 +154,20 @@ class Crawler(object): for ent in missing_identifiables: ent.id = None + if verbose: + print("Going to insert the following entities:") + + for ent in missing_identifiables: + print(ent) missing_identifiables.insert() + + if verbose: + print("Updating entities from CaosDB...") identifiables.retrieve(unique=True, raise_exception_on_error=False) + if verbose: + print("-----------------------------------------------------") + @staticmethod def find_existing(entity): """searches for an entity that matches the identifiable in CaosDB @@ -140,12 +179,20 @@ class Crawler(object): if entity.name is None: # TODO multiple parents are ignored! Sufficient? query_string = "FIND Record " + entity.get_parents()[0].name - query_string += " WITH " + " AND ".join( - ["'" + p.name + "'='" - + str(get_value(p)) + "'" for p in entity.get_properties()]) + query_string += " WITH " + + for p in entity.get_properties(): + if p.value == "": + query_string += (" NOT '" + p.name + "' AND ") + else: + query_string += ("'" + p.name + "'='" + str(get_value(p)) + + "' AND ") + # remove the last AND + query_string = query_string[:-4] else: query_string = "FIND '{}'".format(entity.name) + print(query_string) q = db.Query(query_string) # the identifiable should identify an object uniquely. Thus the query # is using the unique keyword