Skip to content
Snippets Groups Projects
Commit 8efdc037 authored by Henrik tom Woerden's avatar Henrik tom Woerden
Browse files

Restructure CFood and Crawler

parent dedf5146
Branches
Tags
No related merge requests found
Showing
with 346 additions and 139 deletions
File moved
File moved
File moved
...@@ -104,11 +104,11 @@ class CrawlerTest(unittest.TestCase): ...@@ -104,11 +104,11 @@ class CrawlerTest(unittest.TestCase):
######################### #########################
# # first publication # # # # first publication # #
######################### #########################
pub = db.execute_query("FIND really_cool_finding", unique=True) pub = db.execute_query("FIND *really_cool_finding", unique=True)
# There should be a Project with name TestProject which is referenced # There should be a Project with name TestProject which is referenced
########################## ##########################
# # second publication # # # # second publication # #
########################## ##########################
pub = db.execute_query("FIND paper_on_exciting_stuff ", unique=True) pub = db.execute_query("FIND *paper_on_exciting_stuff ", unique=True)
#!/usr/bin/env python
# encoding: utf-8
#
# ** header v3.0
# This file is a part of the CaosDB Project.
#
# Copyright (C) 2018 Research Group Biomedical Physics,
# Max-Planck-Institute for Dynamics and Self-Organization Göttingen
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
#
# ** end header
import unittest
from tempfile import NamedTemporaryFile
import caosdb as db
from caosadvancedtools.cfood import AbstractCFood
class CFoodTest(unittest.TestCase):
    """Placeholder test case for AbstractCFood; no checks implemented yet."""

    def setUp(self):
        """No fixtures are required yet."""
        pass

    def test_check_existence(self):
        """Placeholder: existence checking is not covered yet."""
        pass
#!/usr/bin/env python
# encoding: utf-8
#
# This file is a part of the CaosDB Project.
#
# Copyright (C) 2018 Research Group Biomedical Physics,
# Max-Planck-Institute for Dynamics and Self-Organization Göttingen
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
#
import unittest
from copy import deepcopy
from tempfile import NamedTemporaryFile
import caosdb as db
from caosadvancedtools.crawler import Crawler
def seek_and_destroy(names):
    """Delete every entity found by ``FIND <name>`` for each given name.

    Deletion failures are ignored (best-effort cleanup before a test run).
    """
    for entity_name in names:
        query = "FIND " + entity_name
        db.execute_query(query).delete(raise_exception_on_error=False)
class CrawlerTest(unittest.TestCase):
    """Integration tests for Crawler identifiable handling.

    Requires a reachable CaosDB server; setUp creates a small data model
    (Experiment/Analysis/Publication record types plus a ``species``
    property) and three unsaved records used as identifiables.
    """

    def setUp(self):
        # TODO replace by something more reasonable
        seek_and_destroy(["Experiment", "Analysis", "Publication", "species"])
        # the data model used by the identifiables below
        self.rts = db.Container().extend([
            db.RecordType(name="Experiment").insert(),
            db.RecordType(name="Analysis").insert(),
            db.RecordType(name="Publication").insert(),
            db.Property(name="species", datatype=db.TEXT).insert(),
        ])
        self.exp = db.Record()
        self.exp.add_parent(name="Experiment")
        self.exp.add_property(name="species", value="microunicorn")
        self.ana = db.Record()
        self.ana.add_parent(name="Analysis")
        self.pub = db.Record()
        self.pub.add_parent(name="Publication")

    def test_check_existence(self):
        # nothing was inserted yet, so nothing should be found
        assert Crawler.find_existing(self.exp) is None

    def test_find_or_insert_identifiables(self):
        tmpexp = db.Record()
        tmpexp.add_parent(name="Experiment")
        tmpexp.add_property(name="species", value="microunicorn")
        tmpana = db.Record()
        tmpana.add_parent(name="Analysis")
        tmpexp.insert()
        tmpana.insert()
        self.ana.id = tmpana.id
        # exp inserted/no id; ana inserted/id; pub missing
        identifiables = db.Container().extend([self.exp, self.ana, self.pub])
        old_id = id(identifiables[0])
        reference_to_first = identifiables[0]
        assert reference_to_first is identifiables[0]
        Crawler.find_or_insert_identifiables(identifiables)
        for el in identifiables:
            assert el.is_valid()
        # check whether instance is the same
        assert reference_to_first is identifiables[0]
        assert old_id == id(identifiables[0])
        # order must not be changed
        assert identifiables[0].get_parents()[0].name == "Experiment"
        assert identifiables[1].get_parents()[0].name == "Analysis"
        assert identifiables[2].get_parents()[0].name == "Publication"

    def tearDown(self):
        # best-effort cleanup; a bare except would also swallow
        # KeyboardInterrupt/SystemExit, so catch Exception only
        for el in [self.exp, self.ana, self.pub, self.rts]:
            try:
                el.delete()
            except Exception:
                pass
class CrawlerTestExist(CrawlerTest):
    """Same scenario as CrawlerTest, but the records are inserted up front."""

    def setUp(self):
        super().setUp()
        self.exp.insert()
        self.ana.insert()
        self.pub.insert()

    def test_check_existence(self):
        # the experiment was inserted, so it must be found now
        res = Crawler.find_existing(self.exp)
        assert res.id == self.exp.id

    def tearDown(self):
        # cleanup is identical to the base class; avoid duplicating it
        # (the original repeated the loop with a bare except here)
        super().tearDown()
...@@ -64,3 +64,30 @@ class Cache(object): ...@@ -64,3 +64,30 @@ class Cache(object):
return res return res
else: else:
return res[1] return res[1]
def update_ids_from_cache(self, entities):
    """Assign cached ids to those entities that are present in the cache.

    Parameters
    ----------
    entities : iterable of entities to look up.

    Returns
    -------
    list : the hash of every entity, in input order.
    """
    entity_hashes = []
    for entity in entities:
        entity_hash = Cache.hash_entity(entity)
        entity_hashes.append(entity_hash)
        cached_id = self.check_existing(entity_hash)
        if cached_id is not None:
            entity.id = cached_id
    return entity_hashes
def insert_list(self, hashes, entities):
    """Store the id of each entity in the cache under its hash.

    The two lists must correspond element-wise; hashes that already
    exist in the cache are left untouched.
    """
    for entity_hash, entity in zip(hashes, entities):
        if self.check_existing(entity_hash) is None:
            self.insert(entity_hash, entity.id)
...@@ -22,7 +22,17 @@ ...@@ -22,7 +22,17 @@
# along with this program. If not, see <https://www.gnu.org/licenses/>. # along with this program. If not, see <https://www.gnu.org/licenses/>.
# #
# ** end header # ** end header
"""does something""" """ Defines how something that shall be inserted into CaosDB is treated.
CaosDB can automatically be filled with Records based on some file structure.
The Crawler will iterate over the files and test for each file whether a CFood
exists that matches the file path. If one does, it is instantiated to treat the
match. This occurs in basically three steps:
1. create a list of identifiables, i.e. unique representation of CaosDB Records
(such as an experiment belonging to a project and a date/time)
2. the identifiables are either found in CaosDB or they are created.
3. the identifiables are updated based on the data in the file structure
"""
import argparse import argparse
import re import re
...@@ -31,32 +41,12 @@ from copy import deepcopy ...@@ -31,32 +41,12 @@ from copy import deepcopy
from datetime import datetime from datetime import datetime
import caosdb as db import caosdb as db
from caosdb.exceptions import TransactionError
from caosadvancedtools.cache import Cache from caosadvancedtools.cache import Cache
ENTITIES = {} ENTITIES = {}
def get_value(prop):
    """Return the plain value of a Property.

    Parameters
    ----------
    prop : The property whose value shall be returned.

    Returns
    -------
    out : The property's value; for entity-valued properties, the
        entity's id.
    """
    value = prop.value
    if isinstance(value, db.Entity):
        return value.id
    return value
def get_entity(name): def get_entity(name):
""" Returns the entity with a given name, preferably from a local cache. """ Returns the entity with a given name, preferably from a local cache.
...@@ -72,89 +62,57 @@ def get_entity(name): ...@@ -72,89 +62,57 @@ def get_entity(name):
class AbstractCFood(object): class AbstractCFood(object):
# TODO restructure this class such that no instance is needed to check for # contains the compiled regular expression after the first execution of the
# a match # function match()
# instances shall be used to keep track of a match; i.e. entities can be _pattern = None
# object variable
def __init__(self, pattern, use_cache=False, access=lambda x: x): def __init__(self, match, access=lambda x: x):
"""Abstract base class for Crawler food (CFood). """ Abstract base class for Crawler food (CFood).
Parameters Parameters
---------- ----------
pattern : str match : match object of a regular expression match
The regex pattern for matching against file names. the result from matching a path against the pattern of this
class
use_cache : bool, optional
Whether to use caching (not re-inserting probably existing
objects into CaosDB), defaults to False.
access : callable, optional access : callable, optional
Only used by child classes? A function that takes a CaosDB path and returns a local path
""" """
self.pattern = re.compile(pattern)
self.use_cache = use_cache
self.access = access self.access = access
self.crawled_file = match.string
self.match = match
if self.use_cache: @staticmethod
self.identifiable_cache = Cache() def get_re():
""" Returns the regular expression used to identify files that shall be
def treat_match(self, crawled_file, match): processed
print(crawled_file)
entities = self.create_identifiables(crawled_file, match)
for key, identifiable in entities.items():
if identifiable is None:
print("THIS IS STRANGE. No identifiables found in {}.".format(
crawled_file))
continue
existing = None
print("Looking for \n", identifiable)
if self.use_cache:
identifiable_cache = Cache()
identifier = Cache.hash_entity(identifiable)
cached_id = self.identifiable_cache.check_existing(identifier)
# retrieve entity for the cached id
if cached_id is not None:
existing = db.execute_query("FIND {}".format(cached_id),
unique=True)
print("Found Entity in cache; Id:", cached_id)
# Nothing in cache or cache not used. Check in CaosDB
if existing is None:
existing = AbstractCFood.find_existing(identifiable)
# No record matching the identifiable was found. Insert the record
if existing is None: This function shall be implemented by subclasses.
identifiable.insert() """
entities[key] = identifiable raise NotImplementedError()
else:
entities[key] = existing
print("Got\n", identifiable) @classmethod
def match(cls, string):
""" Matches the regular expression of this class against file names
if self.use_cache: Parameters
print("cid", cached_id) ----------
string : str
The path of the file that shall be matched.
"""
if self.use_cache and cached_id is None: if cls._pattern is None:
identifiable_cache.insert(identifier, entities[key].id) cls._pattern = re.compile(cls.get_re())
self.update_identifiables(entities, crawled_file, match) return cls._pattern.match(string)
def create_identifiables(self, crawled_file, match): def create_identifiables(self):
"""
must return a Container with the identifiables
"""
raise NotImplementedError() raise NotImplementedError()
def update_identifiables(self, entities, crawled_file, match): def update_identifiables(self):
raise NotImplementedError() raise NotImplementedError()
@staticmethod @staticmethod
...@@ -187,48 +145,3 @@ class AbstractCFood(object): ...@@ -187,48 +145,3 @@ class AbstractCFood(object):
entity.add_property(prop, value, datatype=datatype) entity.add_property(prop, value, datatype=datatype)
else: else:
entity.add_property(prop, value) entity.add_property(prop, value)
@staticmethod
def find_existing(entity):
    """Search CaosDB for an entity that matches the identifiable.

    Characteristics of the identifiable such as properties, name or id
    are used for the match. Returns the matching entity or None.
    """
    if entity.name is not None:
        query_string = "FIND '{}'".format(entity.name)
    else:
        # TODO multiple parents are ignored! Sufficient?
        conditions = " AND ".join(
            "'" + prop.name + "'='" + str(get_value(prop)) + "'"
            for prop in entity.get_properties())
        query_string = ("FIND Record " + entity.get_parents()[0].name
                        + " WITH " + conditions)
    print(query_string)
    # the identifiable should identify an object uniquely. Thus the query
    # is using the unique keyword
    query = db.Query(query_string)
    try:
        result = query.execute(unique=True)
    except TransactionError:
        result = None
    if result is not None:
        print("Found Entity with id:", result.id)
    else:
        print("Did not find an existing entity.")
    return result
def get_parser():
    """Build the command line parser for this module.

    Returns
    -------
    argparse.ArgumentParser : parser whose description is this module's
        docstring, with newlines preserved in the help output.
    """
    # Reference the formatter via the argparse module so this function
    # does not rely on a separate `from argparse import ...` line that
    # may not be present.
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawTextHelpFormatter)
    return parser
if __name__ == "__main__":
    # parse command line arguments (none are defined yet)
    args = get_parser().parse_args()
...@@ -22,16 +22,48 @@ ...@@ -22,16 +22,48 @@
# #
# ** end header # ** end header
# #
"""does something""" """ Crawls a file structure and inserts Records into CaosDB based on what is
found.
CaosDB can automatically be filled with Records based on some file structure.
The Crawler will iterate over the files and test for each file whether a CFood
exists that matches the file path. If one does, it is instantiated to treat the
match. This occurs in basically three steps:
1. create a list of identifiables, i.e. unique representation of CaosDB Records
(such as an experiment belonging to a project and a date/time)
2. the identifiables are either found in CaosDB or they are created.
3. the identifiables are updated based on the data in the file structure
"""
import caosdb as db import caosdb as db
from caosdb.exceptions import TransactionError
from .cache import Cache
class Crawler(object): class Crawler(object):
def __init__(self, food): def __init__(self, food, access=lambda x: x, use_cache=False):
"""
Parameters
----------
pattern : str
The regex pattern for matching against file names.
use_cache : bool, optional
Whether to use caching (not re-inserting probably existing
objects into CaosDB), defaults to False.
access : callable, optional
A function that takes a CaosDB path and returns a local path
"""
self.food = food self.food = food
self.access = access
self.report = db.Container() self.report = db.Container()
self.use_cache = use_cache
if self.use_cache:
self.cache = Cache()
def crawl(self, files): def crawl(self, files):
for crawled_file in files: for crawled_file in files:
...@@ -42,11 +74,82 @@ class Crawler(object): ...@@ -42,11 +74,82 @@ class Crawler(object):
# continue # continue
for cfood in self.food: for Cfood in self.food:
match = cfood.pattern.match(crawled_file.path) match = Cfood.match(crawled_file.path)
if match is not None: if match is not None:
cfood.treat_match(crawled_file, match) cfood = Cfood(match, access=self.access)
identifiables = cfood.create_identifiables()
if self.use_cache:
hashes = self.cache.update_ids_from_cache(
identifiables)
self.find_or_insert_identifiables(identifiables)
if self.use_cache:
self.cache.insert_list(hashes, identifiables)
cfood.update_identifiables()
@staticmethod
def find_or_insert_identifiables(identifiables):
    """Resolve ids for the given identifiables, inserting the unknown ones.

    Entities without a valid id (i.e. none was set from a cache) are
    looked up in CaosDB; on a match the existing id is adopted. All
    entities that remain unresolved have no correspondence in CaosDB
    and are inserted. Finally every identifiable is retrieved so the
    local objects become valid.
    """
    # look for matching entities where no valid id was set from a cache
    for entity in identifiables:
        if entity.id is None or entity.id < 0:
            match = Crawler.find_existing(entity)
            if match is not None:
                entity.id = match.id
    # insert the entities that are still unresolved
    missing = db.Container()
    missing.extend([entity for entity in identifiables
                    if entity.id is None or entity.id < 0])
    missing.insert()
    # retrieving makes the entities with existing ids valid as well
    identifiables.retrieve(unique=True, raise_exception_on_error=False)
@staticmethod
def find_existing(entity):
    """Search CaosDB for an entity that matches the identifiable.

    Characteristics of the identifiable such as properties, name or id
    are used for the match. Returns the retrieved entity, or None when
    the query did not succeed uniquely.
    """
    if entity.name is not None:
        query_string = "FIND '{}'".format(entity.name)
    else:
        # TODO multiple parents are ignored! Sufficient?
        conditions = " AND ".join(
            "'" + prop.name + "'='" + str(get_value(prop)) + "'"
            for prop in entity.get_properties())
        query_string = ("FIND Record " + entity.get_parents()[0].name
                        + " WITH " + conditions)
    # the identifiable should identify an object uniquely, hence the
    # query uses the unique keyword
    try:
        return db.Query(query_string).execute(unique=True)
    except TransactionError:
        return None
@staticmethod @staticmethod
def query_files(path): def query_files(path):
...@@ -57,3 +160,22 @@ class Crawler(object): ...@@ -57,3 +160,22 @@ class Crawler(object):
print("{} FILES TO BE PROCESSED.".format(len(files))) print("{} FILES TO BE PROCESSED.".format(len(files)))
return files return files
def get_value(prop):
    """Return the plain value of a Property.

    Parameters
    ----------
    prop : The property whose value shall be returned.

    Returns
    -------
    out : The property's value; for entity-valued properties, the
        entity's id.
    """
    raw = prop.value
    return raw.id if isinstance(raw, db.Entity) else raw
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment