Skip to content
Snippets Groups Projects
Commit 782db460 authored by Henrik tom Wörden's avatar Henrik tom Wörden
Browse files

ENH: verbose mode and workaround for empty string props

parent 9dc5ab5f
No related branches found
No related tags found
No related merge requests found
...@@ -60,7 +60,7 @@ class AbstractCFood(object): ...@@ -60,7 +60,7 @@ class AbstractCFood(object):
# function match() # function match()
_pattern = None _pattern = None
def __init__(self, crawled_file, access=lambda x: x): def __init__(self, crawled_file, access=lambda x: x, verbose=True):
""" Abstract base class for Crawler food (CFood). """ Abstract base class for Crawler food (CFood).
Parameters Parameters
...@@ -77,6 +77,7 @@ class AbstractCFood(object): ...@@ -77,6 +77,7 @@ class AbstractCFood(object):
self.match = type(self).match(crawled_file.path) self.match = type(self).match(crawled_file.path)
self.to_be_updated = db.Container() self.to_be_updated = db.Container()
self.identifiables = db.Container() self.identifiables = db.Container()
self.verbose = verbose
@staticmethod @staticmethod
def get_re(): def get_re():
...@@ -97,10 +98,14 @@ class AbstractCFood(object): ...@@ -97,10 +98,14 @@ class AbstractCFood(object):
The path of the file that shall be matched. The path of the file that shall be matched.
""" """
if cls._pattern is None: # TODO this does not quite work. Sometimes the wrong expression is in
cls._pattern = re.compile(cls.get_re()) # _pattern; FIX
# if cls._pattern is None:
# cls._pattern = re.compile(cls.get_re())
return cls._pattern.match(string) # return cls._pattern.match(string)
return re.match(cls.get_re(), string)
def create_identifiables(self): def create_identifiables(self):
""" """
...@@ -157,7 +162,7 @@ class AbstractCFood(object): ...@@ -157,7 +162,7 @@ class AbstractCFood(object):
def assure_object_is_in_list(obj, containing_object, property_name, def assure_object_is_in_list(obj, containing_object, property_name,
to_be_updated): to_be_updated, verbose=True):
""" """
Checks whether `obj` is one of the values in the list property Checks whether `obj` is one of the values in the list property
`property_name` of the supplied entity `containing_object`. `property_name` of the supplied entity `containing_object`.
...@@ -183,13 +188,20 @@ def assure_object_is_in_list(obj, containing_object, property_name, ...@@ -183,13 +188,20 @@ def assure_object_is_in_list(obj, containing_object, property_name,
break break
if contained: if contained:
if verbose:
print("{} is in {} of entity {}".format(obj, property_name,
containing_object.id))
return return
if verbose:
print("Appending {} to {} of entity {}".format(obj, property_name,
containing_object.id))
current_list.append(obj) current_list.append(obj)
to_be_updated.append(containing_object) to_be_updated.append(containing_object)
def assure_has_parent(entity, parent, to_be_updated): def assure_has_parent(entity, parent, to_be_updated, verbose=True):
""" """
Checks whether `entity` has a parent with name `parent`. Checks whether `entity` has a parent with name `parent`.
...@@ -207,8 +219,13 @@ def assure_has_parent(entity, parent, to_be_updated): ...@@ -207,8 +219,13 @@ def assure_has_parent(entity, parent, to_be_updated):
break break
if contained: if contained:
if verbose:
print("entity {} has parent {}".format(entity.id, parent))
return return
if verbose:
print("Adding parent {} to entity {}".format(parent, entity.id))
entity.add_parent(parent) entity.add_parent(parent)
to_be_updated.append(entity) to_be_updated.append(entity)
......
...@@ -45,7 +45,7 @@ from .cache import Cache ...@@ -45,7 +45,7 @@ from .cache import Cache
class Crawler(object): class Crawler(object):
def __init__(self, food, access=lambda x: x, use_cache=False): def __init__(self, food, access=lambda x: x, use_cache=False, verbose=True):
""" """
Parameters Parameters
---------- ----------
...@@ -63,12 +63,13 @@ class Crawler(object): ...@@ -63,12 +63,13 @@ class Crawler(object):
self.access = access self.access = access
self.report = db.Container() self.report = db.Container()
self.use_cache = use_cache self.use_cache = use_cache
self.verbose = verbose
if self.use_cache: if self.use_cache:
self.cache = Cache() self.cache = Cache()
def crawl(self, files): def crawl(self, files):
for crawled_file in files: for crawled_file in sorted(files, key=lambda x: x.path):
# if crawled_file.size == 0: # if crawled_file.size == 0:
# crawled_file.add_message( # crawled_file.add_message(
# type="Warning", description="This file is empty. Shouldn't we delete it?") # type="Warning", description="This file is empty. Shouldn't we delete it?")
...@@ -76,17 +77,31 @@ class Crawler(object): ...@@ -76,17 +77,31 @@ class Crawler(object):
# continue # continue
if self.verbose:
msg = "Matching {}".format(crawled_file.path)
print("="*len(msg))
print(msg)
print("="*len(msg))
matches = 0
for Cfood in self.food: for Cfood in self.food:
if Cfood.match(crawled_file.path) is not None: if Cfood.match(crawled_file.path) is not None:
matches += 1
if self.verbose:
print("{} matched.".format(Cfood.__name__))
try: try:
cfood = Cfood(crawled_file, access=self.access) cfood = Cfood(crawled_file, access=self.access,
verbose=self.verbose)
cfood.create_identifiables() cfood.create_identifiables()
if self.use_cache: if self.use_cache:
hashes = self.cache.update_ids_from_cache( hashes = self.cache.update_ids_from_cache(
cfood.identifiables) cfood.identifiables)
self.find_or_insert_identifiables(cfood.identifiables) self.find_or_insert_identifiables(cfood.identifiables,
self.verbose)
if self.use_cache: if self.use_cache:
self.cache.insert_list(hashes, cfood.identifiables) self.cache.insert_list(hashes, cfood.identifiables)
...@@ -97,8 +112,14 @@ class Crawler(object): ...@@ -97,8 +112,14 @@ class Crawler(object):
traceback.print_exc() traceback.print_exc()
print(e) print(e)
if self.verbose and matches == 0:
print("ATTENTION: No matching cfood!")
if self.verbose and matches > 1:
print("Attention: More than one matching cfood!")
@staticmethod @staticmethod
def find_or_insert_identifiables(identifiables): def find_or_insert_identifiables(identifiables, verbose=True):
""" Sets the ids of identifiables (that do not have already an id from the """ Sets the ids of identifiables (that do not have already an id from the
cache) based on searching CaosDB and retrieves those entities. cache) based on searching CaosDB and retrieves those entities.
The remaining entities (those which can not be retrieved) have no The remaining entities (those which can not be retrieved) have no
...@@ -107,15 +128,22 @@ class Crawler(object): ...@@ -107,15 +128,22 @@ class Crawler(object):
# looking for matching entities in CaosDB when there is no valid id # looking for matching entities in CaosDB when there is no valid id
# i.e. there was none set from a cache # i.e. there was none set from a cache
if verbose:
print("-----------------------------------------------------")
for ent in identifiables: for ent in identifiables:
if ent.id is None or ent.id < 0: if ent.id is None or ent.id < 0:
if verbose:
print("Looking for:")
print(ent)
existing = Crawler.find_existing(ent) existing = Crawler.find_existing(ent)
if existing is not None: if existing is not None:
ent.id = existing.id ent.id = existing.id
else:
# this makes entities with existing ids valid if verbose:
# identifiables.retrieve(unique=True, raise_exception_on_error=False) print("Id is known of:")
print(ent)
# insert missing, i.e. those which are not valid # insert missing, i.e. those which are not valid
missing_identifiables = db.Container() missing_identifiables = db.Container()
...@@ -126,9 +154,20 @@ class Crawler(object): ...@@ -126,9 +154,20 @@ class Crawler(object):
for ent in missing_identifiables: for ent in missing_identifiables:
ent.id = None ent.id = None
if verbose:
print("Going to insert the following entities:")
for ent in missing_identifiables:
print(ent)
missing_identifiables.insert() missing_identifiables.insert()
if verbose:
print("Updating entities from CaosDB...")
identifiables.retrieve(unique=True, raise_exception_on_error=False) identifiables.retrieve(unique=True, raise_exception_on_error=False)
if verbose:
print("-----------------------------------------------------")
@staticmethod @staticmethod
def find_existing(entity): def find_existing(entity):
"""searches for an entity that matches the identifiable in CaosDB """searches for an entity that matches the identifiable in CaosDB
...@@ -140,12 +179,20 @@ class Crawler(object): ...@@ -140,12 +179,20 @@ class Crawler(object):
if entity.name is None: if entity.name is None:
# TODO multiple parents are ignored! Sufficient? # TODO multiple parents are ignored! Sufficient?
query_string = "FIND Record " + entity.get_parents()[0].name query_string = "FIND Record " + entity.get_parents()[0].name
query_string += " WITH " + " AND ".join( query_string += " WITH "
["'" + p.name + "'='"
+ str(get_value(p)) + "'" for p in entity.get_properties()]) for p in entity.get_properties():
if p.value == "":
query_string += (" NOT '" + p.name + "' AND ")
else:
query_string += ("'" + p.name + "'='" + str(get_value(p))
+ "' AND ")
# remove the last AND
query_string = query_string[:-4]
else: else:
query_string = "FIND '{}'".format(entity.name) query_string = "FIND '{}'".format(entity.name)
print(query_string)
q = db.Query(query_string) q = db.Query(query_string)
# the identifiable should identify an object uniquely. Thus the query # the identifiable should identify an object uniquely. Thus the query
# is using the unique keyword # is using the unique keyword
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment