Skip to content
Snippets Groups Projects
Commit 782db460 authored by Henrik tom Wörden
Browse files

ENH: verbose mode and workaround for empty string props

parent 9dc5ab5f
Branches
Tags
No related merge requests found
......@@ -60,7 +60,7 @@ class AbstractCFood(object):
# function match()
_pattern = None
def __init__(self, crawled_file, access=lambda x: x):
def __init__(self, crawled_file, access=lambda x: x, verbose=True):
""" Abstract base class for Crawler food (CFood).
Parameters
......@@ -77,6 +77,7 @@ class AbstractCFood(object):
self.match = type(self).match(crawled_file.path)
self.to_be_updated = db.Container()
self.identifiables = db.Container()
self.verbose = verbose
@staticmethod
def get_re():
......@@ -97,10 +98,14 @@ class AbstractCFood(object):
The path of the file that shall be matched.
"""
if cls._pattern is None:
cls._pattern = re.compile(cls.get_re())
# TODO: caching the compiled expression in cls._pattern does not quite work;
# sometimes the wrong expression ends up in _pattern. FIX
# if cls._pattern is None:
# cls._pattern = re.compile(cls.get_re())
return cls._pattern.match(string)
# return cls._pattern.match(string)
return re.match(cls.get_re(), string)
def create_identifiables(self):
"""
......@@ -157,7 +162,7 @@ class AbstractCFood(object):
def assure_object_is_in_list(obj, containing_object, property_name,
to_be_updated):
to_be_updated, verbose=True):
"""
Checks whether `obj` is one of the values in the list property
`property_name` of the supplied entity `containing_object`.
......@@ -183,13 +188,20 @@ def assure_object_is_in_list(obj, containing_object, property_name,
break
if contained:
if verbose:
print("{} is in {} of entity {}".format(obj, property_name,
containing_object.id))
return
if verbose:
print("Appending {} to {} of entity {}".format(obj, property_name,
containing_object.id))
current_list.append(obj)
to_be_updated.append(containing_object)
def assure_has_parent(entity, parent, to_be_updated):
def assure_has_parent(entity, parent, to_be_updated, verbose=True):
"""
Checks whether `entity` has a parent with name `parent`.
......@@ -207,8 +219,13 @@ def assure_has_parent(entity, parent, to_be_updated):
break
if contained:
if verbose:
print("entity {} has parent {}".format(entity.id, parent))
return
if verbose:
print("Adding parent {} to entity {}".format(parent, entity.id))
entity.add_parent(parent)
to_be_updated.append(entity)
......
......@@ -45,7 +45,7 @@ from .cache import Cache
class Crawler(object):
def __init__(self, food, access=lambda x: x, use_cache=False):
def __init__(self, food, access=lambda x: x, use_cache=False, verbose=True):
"""
Parameters
----------
......@@ -63,12 +63,13 @@ class Crawler(object):
self.access = access
self.report = db.Container()
self.use_cache = use_cache
self.verbose = verbose
if self.use_cache:
self.cache = Cache()
def crawl(self, files):
for crawled_file in files:
for crawled_file in sorted(files, key=lambda x: x.path):
# if crawled_file.size == 0:
# crawled_file.add_message(
# type="Warning", description="This file is empty. Shouldn't we delete it?")
......@@ -76,17 +77,31 @@ class Crawler(object):
# continue
if self.verbose:
msg = "Matching {}".format(crawled_file.path)
print("="*len(msg))
print(msg)
print("="*len(msg))
matches = 0
for Cfood in self.food:
if Cfood.match(crawled_file.path) is not None:
matches += 1
if self.verbose:
print("{} matched.".format(Cfood.__name__))
try:
cfood = Cfood(crawled_file, access=self.access)
cfood = Cfood(crawled_file, access=self.access,
verbose=self.verbose)
cfood.create_identifiables()
if self.use_cache:
hashes = self.cache.update_ids_from_cache(
cfood.identifiables)
self.find_or_insert_identifiables(cfood.identifiables)
self.find_or_insert_identifiables(cfood.identifiables,
self.verbose)
if self.use_cache:
self.cache.insert_list(hashes, cfood.identifiables)
......@@ -97,8 +112,14 @@ class Crawler(object):
traceback.print_exc()
print(e)
if self.verbose and matches == 0:
print("ATTENTION: No matching cfood!")
if self.verbose and matches > 1:
print("Attention: More than one matching cfood!")
@staticmethod
def find_or_insert_identifiables(identifiables):
def find_or_insert_identifiables(identifiables, verbose=True):
""" Sets the ids of identifiables (that do not already have an id from the
cache) based on searching CaosDB and retrieves those entities.
The remaining entities (those which can not be retrieved) have no
......@@ -107,15 +128,22 @@ class Crawler(object):
# looking for matching entities in CaosDB when there is no valid id
# i.e. there was none set from a cache
if verbose:
print("-----------------------------------------------------")
for ent in identifiables:
if ent.id is None or ent.id < 0:
if verbose:
print("Looking for:")
print(ent)
existing = Crawler.find_existing(ent)
if existing is not None:
ent.id = existing.id
# this makes entities with existing ids valid
# identifiables.retrieve(unique=True, raise_exception_on_error=False)
else:
if verbose:
print("Id is known of:")
print(ent)
# insert missing, i.e. those which are not valid
missing_identifiables = db.Container()
......@@ -126,9 +154,20 @@ class Crawler(object):
for ent in missing_identifiables:
ent.id = None
if verbose:
print("Going to insert the following entities:")
for ent in missing_identifiables:
print(ent)
missing_identifiables.insert()
if verbose:
print("Updating entities from CaosDB...")
identifiables.retrieve(unique=True, raise_exception_on_error=False)
if verbose:
print("-----------------------------------------------------")
@staticmethod
def find_existing(entity):
"""searches for an entity that matches the identifiable in CaosDB
......@@ -140,12 +179,20 @@ class Crawler(object):
if entity.name is None:
# TODO multiple parents are ignored! Sufficient?
query_string = "FIND Record " + entity.get_parents()[0].name
query_string += " WITH " + " AND ".join(
["'" + p.name + "'='"
+ str(get_value(p)) + "'" for p in entity.get_properties()])
query_string += " WITH "
for p in entity.get_properties():
if p.value == "":
query_string += (" NOT '" + p.name + "' AND ")
else:
query_string += ("'" + p.name + "'='" + str(get_value(p))
+ "' AND ")
# remove the last AND
query_string = query_string[:-4]
else:
query_string = "FIND '{}'".format(entity.name)
print(query_string)
q = db.Query(query_string)
# the identifiable should identify an object uniquely. Thus the query
# is using the unique keyword
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment