Skip to content
Snippets Groups Projects
Commit 8efdc037 authored by Henrik tom Woerden's avatar Henrik tom Woerden
Browse files

Restructure CFood and Crawler

parent dedf5146
Branches
Tags
No related merge requests found
Showing
with 346 additions and 139 deletions
File moved
File moved
File moved
...@@ -104,11 +104,11 @@ class CrawlerTest(unittest.TestCase): ...@@ -104,11 +104,11 @@ class CrawlerTest(unittest.TestCase):
######################### #########################
# # first publication # # # # first publication # #
######################### #########################
pub = db.execute_query("FIND really_cool_finding", unique=True) pub = db.execute_query("FIND *really_cool_finding", unique=True)
# There should be a Project with name TestProject which is referenced # There should be a Project with name TestProject which is referenced
########################## ##########################
# # second publication # # # # second publication # #
########################## ##########################
pub = db.execute_query("FIND paper_on_exciting_stuff ", unique=True) pub = db.execute_query("FIND *paper_on_exciting_stuff ", unique=True)
#!/usr/bin/env python
# encoding: utf-8
#
# ** header v3.0
# This file is a part of the CaosDB Project.
#
# Copyright (C) 2018 Research Group Biomedical Physics,
# Max-Planck-Institute for Dynamics and Self-Organization Göttingen
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
#
# ** end header
import unittest
from tempfile import NamedTemporaryFile
import caosdb as db
from caosadvancedtools.cfood import AbstractCFood
class CFoodTest(unittest.TestCase):
    """Placeholder test case for AbstractCFood; no checks implemented yet."""

    def setUp(self):
        """No fixtures are required yet."""
        pass

    def test_check_existence(self):
        """Placeholder: existence checking is not covered yet."""
        pass
#!/usr/bin/env python
# encoding: utf-8
#
# This file is a part of the CaosDB Project.
#
# Copyright (C) 2018 Research Group Biomedical Physics,
# Max-Planck-Institute for Dynamics and Self-Organization Göttingen
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
#
import unittest
from copy import deepcopy
from tempfile import NamedTemporaryFile
import caosdb as db
from caosadvancedtools.crawler import Crawler
def seek_and_destroy(names):
    """Delete every entity found by ``FIND <name>`` for each given name.

    Deletion failures are ignored (best-effort cleanup before a test run).
    """
    for entity_name in names:
        query = "FIND " + entity_name
        db.execute_query(query).delete(raise_exception_on_error=False)
class CrawlerTest(unittest.TestCase):
    """Integration tests for Crawler identifiable handling.

    Requires a reachable CaosDB server; setUp creates a small data model
    (Experiment/Analysis/Publication record types plus a ``species``
    property) and three unsaved records used as identifiables.
    """

    def setUp(self):
        # TODO replace by something more reasonable
        seek_and_destroy(["Experiment", "Analysis", "Publication", "species"])
        # the data model used by the identifiables below
        self.rts = db.Container().extend([
            db.RecordType(name="Experiment").insert(),
            db.RecordType(name="Analysis").insert(),
            db.RecordType(name="Publication").insert(),
            db.Property(name="species", datatype=db.TEXT).insert(),
        ])
        self.exp = db.Record()
        self.exp.add_parent(name="Experiment")
        self.exp.add_property(name="species", value="microunicorn")
        self.ana = db.Record()
        self.ana.add_parent(name="Analysis")
        self.pub = db.Record()
        self.pub.add_parent(name="Publication")

    def test_check_existence(self):
        # nothing was inserted yet, so nothing should be found
        assert Crawler.find_existing(self.exp) is None

    def test_find_or_insert_identifiables(self):
        tmpexp = db.Record()
        tmpexp.add_parent(name="Experiment")
        tmpexp.add_property(name="species", value="microunicorn")
        tmpana = db.Record()
        tmpana.add_parent(name="Analysis")
        tmpexp.insert()
        tmpana.insert()
        self.ana.id = tmpana.id
        # exp inserted/no id; ana inserted/id; pub missing
        identifiables = db.Container().extend([self.exp, self.ana, self.pub])
        old_id = id(identifiables[0])
        reference_to_first = identifiables[0]
        assert reference_to_first is identifiables[0]
        Crawler.find_or_insert_identifiables(identifiables)
        for el in identifiables:
            assert el.is_valid()
        # check whether instance is the same
        assert reference_to_first is identifiables[0]
        assert old_id == id(identifiables[0])
        # order must not be changed
        assert identifiables[0].get_parents()[0].name == "Experiment"
        assert identifiables[1].get_parents()[0].name == "Analysis"
        assert identifiables[2].get_parents()[0].name == "Publication"

    def tearDown(self):
        # best-effort cleanup; a bare except would also swallow
        # KeyboardInterrupt/SystemExit, so catch Exception only
        for el in [self.exp, self.ana, self.pub, self.rts]:
            try:
                el.delete()
            except Exception:
                pass
class CrawlerTestExist(CrawlerTest):
    """Same scenario as CrawlerTest, but the records are inserted up front."""

    def setUp(self):
        super().setUp()
        self.exp.insert()
        self.ana.insert()
        self.pub.insert()

    def test_check_existence(self):
        # the experiment was inserted, so it must be found now
        res = Crawler.find_existing(self.exp)
        assert res.id == self.exp.id

    def tearDown(self):
        # cleanup is identical to the base class; avoid duplicating it
        # (the original repeated the loop with a bare except here)
        super().tearDown()
...@@ -64,3 +64,30 @@ class Cache(object): ...@@ -64,3 +64,30 @@ class Cache(object):
return res return res
else: else:
return res[1] return res[1]
def update_ids_from_cache(self, entities):
    """Assign cached ids to those entities that are present in the cache.

    Parameters
    ----------
    entities : iterable of entities to look up.

    Returns
    -------
    list : the hash of every entity, in input order.
    """
    entity_hashes = []
    for entity in entities:
        entity_hash = Cache.hash_entity(entity)
        entity_hashes.append(entity_hash)
        cached_id = self.check_existing(entity_hash)
        if cached_id is not None:
            entity.id = cached_id
    return entity_hashes
def insert_list(self, hashes, entities):
    """Store the id of each entity in the cache under its hash.

    The two lists must correspond element-wise; hashes that already
    exist in the cache are left untouched.
    """
    for entity_hash, entity in zip(hashes, entities):
        if self.check_existing(entity_hash) is None:
            self.insert(entity_hash, entity.id)
...@@ -22,7 +22,17 @@ ...@@ -22,7 +22,17 @@
# along with this program. If not, see <https://www.gnu.org/licenses/>. # along with this program. If not, see <https://www.gnu.org/licenses/>.
# #
# ** end header # ** end header
"""does something""" """ Defines how something that shall be inserted into CaosDB is treated.
CaosDB can automatically be filled with Records based on some file structure.
The Crawler will iterate over the files and test for each file whether a CFood
exists that matches the file path. If one does, it is instantiated to treat the
match. This occurs in basically three steps:
1. create a list of identifiables, i.e. unique representation of CaosDB Records
(such as an experiment belonging to a project and a date/time)
2. the identifiables are either found in CaosDB or they are created.
3. the identifiables are updated based on the data in the file structure
"""
import argparse import argparse
import re import re
...@@ -31,32 +41,12 @@ from copy import deepcopy ...@@ -31,32 +41,12 @@ from copy import deepcopy
from datetime import datetime from datetime import datetime
import caosdb as db import caosdb as db
from caosdb.exceptions import TransactionError
from caosadvancedtools.cache import Cache from caosadvancedtools.cache import Cache
ENTITIES = {} ENTITIES = {}
def get_value(prop):
    """Return the plain value of a Property.

    Parameters
    ----------
    prop : The property whose value shall be returned.

    Returns
    -------
    out : The property's value; for entity-valued properties, the
        entity's id.
    """
    value = prop.value
    if isinstance(value, db.Entity):
        return value.id
    return value
def get_entity(name): def get_entity(name):
""" Returns the entity with a given name, preferably from a local cache. """ Returns the entity with a given name, preferably from a local cache.
...@@ -72,89 +62,57 @@ def get_entity(name): ...@@ -72,89 +62,57 @@ def get_entity(name):
class AbstractCFood(object): class AbstractCFood(object):
# TODO restructure this class such that no instance is needed to check for # contains the compiled regular expression after the first execution of the
# a match # function match()
# instances shall be used to keep track of a match; i.e. entities can be _pattern = None
# object variable
def __init__(self, pattern, use_cache=False, access=lambda x: x): def __init__(self, match, access=lambda x: x):
"""Abstract base class for Crawler food (CFood). """ Abstract base class for Crawler food (CFood).
Parameters Parameters
---------- ----------
pattern : str match : match object of a regular expression match
The regex pattern for matching against file names. the result from matching a path against the pattern of this
class
use_cache : bool, optional
Whether to use caching (not re-inserting probably existing
objects into CaosDB), defaults to False.
access : callable, optional access : callable, optional
Only used by child classes? A function that takes a CaosDB path and returns a local path
""" """
self.pattern = re.compile(pattern)
self.use_cache = use_cache
self.access = access self.access = access
self.crawled_file = match.string
self.match = match
if self.use_cache: @staticmethod
self.identifiable_cache = Cache() def get_re():
""" Returns the regular expression used to identify files that shall be
def treat_match(self, crawled_file, match): processed
print(crawled_file)
entities = self.create_identifiables(crawled_file, match)
for key, identifiable in entities.items():
if identifiable is None:
print("THIS IS STRANGE. No identifiables found in {}.".format(
crawled_file))
continue
existing = None
print("Looking for \n", identifiable)
if self.use_cache:
identifiable_cache = Cache()
identifier = Cache.hash_entity(identifiable)
cached_id = self.identifiable_cache.check_existing(identifier)
# retrieve entity for the cached id
if cached_id is not None:
existing = db.execute_query("FIND {}".format(cached_id),
unique=True)
print("Found Entity in cache; Id:", cached_id)
# Nothing in cache or cache not used. Check in CaosDB
if existing is None:
existing = AbstractCFood.find_existing(identifiable)
# No record matching the identifiable was found. Insert the record
if existing is None: This function shall be implemented by subclasses.
identifiable.insert() """
entities[key] = identifiable raise NotImplementedError()
else:
entities[key] = existing
print("Got\n", identifiable) @classmethod
def match(cls, string):
""" Matches the regular expression of this class against file names
if self.use_cache: Parameters
print("cid", cached_id) ----------
string : str
The path of the file that shall be matched.
"""
if self.use_cache and cached_id is None: if cls._pattern is None:
identifiable_cache.insert(identifier, entities[key].id) cls._pattern = re.compile(cls.get_re())
self.update_identifiables(entities, crawled_file, match) return cls._pattern.match(string)
def create_identifiables(self, crawled_file, match): def create_identifiables(self):
"""
must return a Container with the identifiables
"""
raise NotImplementedError() raise NotImplementedError()
def update_identifiables(self, entities, crawled_file, match): def update_identifiables(self):
raise NotImplementedError() raise NotImplementedError()
@staticmethod @staticmethod
...@@ -187,48 +145,3 @@ class AbstractCFood(object): ...@@ -187,48 +145,3 @@ class AbstractCFood(object):
entity.add_property(prop, value, datatype=datatype) entity.add_property(prop, value, datatype=datatype)
else: else:
entity.add_property(prop, value) entity.add_property(prop, value)
@staticmethod
def find_existing(entity):
    """Search CaosDB for an entity that matches the identifiable.

    Characteristics of the identifiable such as properties, name or id
    are used for the match. Returns the matching entity or None.
    """
    if entity.name is not None:
        query_string = "FIND '{}'".format(entity.name)
    else:
        # TODO multiple parents are ignored! Sufficient?
        conditions = " AND ".join(
            "'" + prop.name + "'='" + str(get_value(prop)) + "'"
            for prop in entity.get_properties())
        query_string = ("FIND Record " + entity.get_parents()[0].name
                        + " WITH " + conditions)
    print(query_string)
    # the identifiable should identify an object uniquely. Thus the query
    # is using the unique keyword
    query = db.Query(query_string)
    try:
        result = query.execute(unique=True)
    except TransactionError:
        result = None
    if result is not None:
        print("Found Entity with id:", result.id)
    else:
        print("Did not find an existing entity.")
    return result
def get_parser():
    """Build the command line parser for this module.

    Returns
    -------
    argparse.ArgumentParser : parser whose description is this module's
        docstring, with newlines preserved in the help output.
    """
    # Reference the formatter via the argparse module so this function
    # does not rely on a separate `from argparse import ...` line that
    # may not be present.
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawTextHelpFormatter)
    return parser
if __name__ == "__main__":
    # parse command line arguments (none are defined yet)
    args = get_parser().parse_args()
...@@ -22,16 +22,48 @@ ...@@ -22,16 +22,48 @@
# #
# ** end header # ** end header
# #
"""does something""" """ Crawls a file structure and inserts Records into CaosDB based on what is
found.
CaosDB can automatically be filled with Records based on some file structure.
The Crawler will iterate over the files and test for each file whether a CFood
exists that matches the file path. If one does, it is instantiated to treat the
match. This occurs in basically three steps:
1. create a list of identifiables, i.e. unique representation of CaosDB Records
(such as an experiment belonging to a project and a date/time)
2. the identifiables are either found in CaosDB or they are created.
3. the identifiables are updated based on the data in the file structure
"""
import caosdb as db import caosdb as db
from caosdb.exceptions import TransactionError
from .cache import Cache
class Crawler(object): class Crawler(object):
def __init__(self, food): def __init__(self, food, access=lambda x: x, use_cache=False):
"""
Parameters
----------
pattern : str
The regex pattern for matching against file names.
use_cache : bool, optional
Whether to use caching (not re-inserting probably existing
objects into CaosDB), defaults to False.
access : callable, optional
A function that takes a CaosDB path and returns a local path
"""
self.food = food self.food = food
self.access = access
self.report = db.Container() self.report = db.Container()
self.use_cache = use_cache
if self.use_cache:
self.cache = Cache()
def crawl(self, files): def crawl(self, files):
for crawled_file in files: for crawled_file in files:
...@@ -42,11 +74,82 @@ class Crawler(object): ...@@ -42,11 +74,82 @@ class Crawler(object):
# continue # continue
for cfood in self.food: for Cfood in self.food:
match = cfood.pattern.match(crawled_file.path) match = Cfood.match(crawled_file.path)
if match is not None: if match is not None:
cfood.treat_match(crawled_file, match) cfood = Cfood(match, access=self.access)
identifiables = cfood.create_identifiables()
if self.use_cache:
hashes = self.cache.update_ids_from_cache(
identifiables)
self.find_or_insert_identifiables(identifiables)
if self.use_cache:
self.cache.insert_list(hashes, identifiables)
cfood.update_identifiables()
@staticmethod
def find_or_insert_identifiables(identifiables):
    """Resolve ids for the given identifiables, inserting the unknown ones.

    Entities without a valid id (i.e. none was set from a cache) are
    looked up in CaosDB; on a match the existing id is adopted. All
    entities that remain unresolved have no correspondence in CaosDB
    and are inserted. Finally every identifiable is retrieved so the
    local objects become valid.
    """
    # look for matching entities where no valid id was set from a cache
    for entity in identifiables:
        if entity.id is None or entity.id < 0:
            match = Crawler.find_existing(entity)
            if match is not None:
                entity.id = match.id
    # insert the entities that are still unresolved
    missing = db.Container()
    missing.extend([entity for entity in identifiables
                    if entity.id is None or entity.id < 0])
    missing.insert()
    # retrieving makes the entities with existing ids valid as well
    identifiables.retrieve(unique=True, raise_exception_on_error=False)
@staticmethod
def find_existing(entity):
    """Search CaosDB for an entity that matches the identifiable.

    Characteristics of the identifiable such as properties, name or id
    are used for the match. Returns the retrieved entity, or None when
    the query did not succeed uniquely.
    """
    if entity.name is not None:
        query_string = "FIND '{}'".format(entity.name)
    else:
        # TODO multiple parents are ignored! Sufficient?
        conditions = " AND ".join(
            "'" + prop.name + "'='" + str(get_value(prop)) + "'"
            for prop in entity.get_properties())
        query_string = ("FIND Record " + entity.get_parents()[0].name
                        + " WITH " + conditions)
    # the identifiable should identify an object uniquely, hence the
    # query uses the unique keyword
    try:
        return db.Query(query_string).execute(unique=True)
    except TransactionError:
        return None
@staticmethod @staticmethod
def query_files(path): def query_files(path):
...@@ -57,3 +160,22 @@ class Crawler(object): ...@@ -57,3 +160,22 @@ class Crawler(object):
print("{} FILES TO BE PROCESSED.".format(len(files))) print("{} FILES TO BE PROCESSED.".format(len(files)))
return files return files
def get_value(prop):
    """Return the plain value of a Property.

    Parameters
    ----------
    prop : The property whose value shall be returned.

    Returns
    -------
    out : The property's value; for entity-valued properties, the
        entity's id.
    """
    raw = prop.value
    return raw.id if isinstance(raw, db.Entity) else raw
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment