From a5ec4027e5b9c88f5d2579e6e1f9f0201db118c7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Henrik=20tom=20W=C3=B6rden?= <h.tomwoerden@indiscale.com>
Date: Tue, 5 May 2020 10:58:48 +0000
Subject: [PATCH] Refactor Crawler

---
 .gitignore                                    |   2 +-
 README_SETUP.md                               |  10 +-
 integrationtests/full_test/crawl.py           |  19 +-
 integrationtests/full_test/test.sh            |   1 +
 .../full_test/test_im_und_export.py           |   1 -
 integrationtests/full_test/test_table.py      |   6 +-
 src/caosadvancedtools/cfood.py                | 302 +++++++++---------
 src/caosadvancedtools/crawler.py              | 270 +++++++---------
 unittests/test_cfood.py                       |  98 +++++-
 9 files changed, 376 insertions(+), 333 deletions(-)

diff --git a/.gitignore b/.gitignore
index c68adb8f..30f02d1c 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,6 +1,6 @@
 __pycache__
 .tox
 .coverage
-cache.db
+*cache.db
 *.egg-info
 .docker/cert
diff --git a/README_SETUP.md b/README_SETUP.md
index cba784d3..526fe900 100644
--- a/README_SETUP.md
+++ b/README_SETUP.md
@@ -7,10 +7,12 @@ pip install tox --user
 tox
 
 # Run Integration Tests Locally
-1. Mount `integrationtests/full_test/extroot` to the folder that will be used as
-   extroot. E.g. `sudo mount -o bind extroot ../../../caosdb-deploy/profiles/empty/paths/extroot`
-2. Start an empty CaosDB instance
-3. run test.sh
+
+1. Change directory to `integrationtests/full_test/`.
+2. Mount `extroot` to the folder that will be used as extroot. E.g. `sudo mount
+   -o bind extroot ../../../caosdb-deploy/profiles/empty/paths/extroot`.
+3. Start an empty CaosDB instance (with the mounted extroot).
+4. Run `test.sh`.
 
 # Code Formatting
 autopep8 -i -r ./
diff --git a/integrationtests/full_test/crawl.py b/integrationtests/full_test/crawl.py
index 51ad24b9..66005fd7 100755
--- a/integrationtests/full_test/crawl.py
+++ b/integrationtests/full_test/crawl.py
@@ -25,14 +25,13 @@
 
 import argparse
 import logging
-import sys
 from argparse import RawTextHelpFormatter
 
 import caosdb as db
 
+from caosadvancedtools.cfood import fileguide
 from caosadvancedtools.crawler import FileCrawler
-from caosadvancedtools.guard import INSERT, RETRIEVE, UPDATE, Guard
-from caosadvancedtools.utils import set_log_level
+from caosadvancedtools.guard import UPDATE
 from scifolder import (AnalysisCFood, ExperimentCFood, ProjectCFood,
                        PublicationCFood, SimulationCFood)
 
@@ -54,8 +53,8 @@ if __name__ == "__main__":
     logger = logging.getLogger("caosadvancedtools")
     conlogger = logging.getLogger("connection")
     conlogger.setLevel(level=logging.ERROR)
-    logger.setLevel(level=logging.WARN)
-
+    logger.setLevel(level=logging.DEBUG)
+    fileguide.access = access
     parser = get_parser()
     args = parser.parse_args()
 
@@ -63,10 +62,10 @@ if __name__ == "__main__":
     files = FileCrawler.query_files(args.path)
     logger.info("Query done...")
     config = db.configuration.get_config()
-    c = FileCrawler(files=files, use_cache=True, access=access,
+    c = FileCrawler(files=files, use_cache=True,
                     interactive=False, hideKnown=True,
-                    food=[ProjectCFood,
-                          ExperimentCFood, AnalysisCFood,
-                          PublicationCFood, SimulationCFood,
-                          ])
+                    cfood_types=[ProjectCFood,
+                                 ExperimentCFood, AnalysisCFood,
+                                 PublicationCFood, SimulationCFood,
+                                 ])
     c.crawl(security_level=UPDATE)
diff --git a/integrationtests/full_test/test.sh b/integrationtests/full_test/test.sh
index 25906da8..249f7342 100755
--- a/integrationtests/full_test/test.sh
+++ b/integrationtests/full_test/test.sh
@@ -4,6 +4,7 @@ echo "Filling the database"
 ./filldb.sh
 echo "Testing the crawler database"
 python3 -m pytest test_crawler.py
+python3 test_table.py
 # TODO the following test deletes lots of the data inserted by the crawler
 echo "Testing im and export"
 python3 test_im_und_export.py
diff --git a/integrationtests/full_test/test_im_und_export.py b/integrationtests/full_test/test_im_und_export.py
index d6fe43eb..5c7584e6 100644
--- a/integrationtests/full_test/test_im_und_export.py
+++ b/integrationtests/full_test/test_im_und_export.py
@@ -1,6 +1,5 @@
 #!/usr/bin/env python3
 import os
-import unittest
 from tempfile import TemporaryDirectory
 
 import caosdb as db
diff --git a/integrationtests/full_test/test_table.py b/integrationtests/full_test/test_table.py
index 91b99471..15b851fb 100644
--- a/integrationtests/full_test/test_table.py
+++ b/integrationtests/full_test/test_table.py
@@ -18,17 +18,13 @@
 # You should have received a copy of the GNU Affero General Public License
 # along with this program. If not, see <https://www.gnu.org/licenses/>.
 
-import argparse
 import logging
-import sys
-from argparse import RawTextHelpFormatter
 
 import caosdb as db
 import pandas as pd
 
 from caosadvancedtools.crawler import TableCrawler
-from caosadvancedtools.guard import INSERT, RETRIEVE, UPDATE, Guard
-from caosadvancedtools.utils import set_log_level
+from caosadvancedtools.guard import UPDATE
 
 if __name__ == "__main__":
     logger = logging.getLogger("caosadvancedtools")
diff --git a/src/caosadvancedtools/cfood.py b/src/caosadvancedtools/cfood.py
index 2128d6b1..6142e68c 100644
--- a/src/caosadvancedtools/cfood.py
+++ b/src/caosadvancedtools/cfood.py
@@ -25,18 +25,21 @@
 # ** end header
 """ Defines how something that shall be inserted into CaosDB is treated.
 
-CaosDB can automatically be filled with Records based on some file structure.
-The Crawler will iterate over the files and test for each file whether a CFood
-exists that matches the file path. If one does, it is instanciated to treat the
-match. This occurs in basically three steps:
-1. create a list of identifiables, i.e. unique representation of CaosDB Records
-(such as an experiment belonging to a project and a date/time)
-2. the identifiables are either found in CaosDB or they are created.
-3. the identifiables are update based on the date in the file structure
+CaosDB can automatically be filled with Records based on some structure, a file
+structure, a table or similar.
+The Crawler will iterate over the respective items and test for each item
+whether a CFood class exists that matches the file path, i.e. whether the CFood
+class wants to treat that particular item. If one does, it is instantiated to
+treat the match. This occurs in basically three steps:
+1. Create a list of identifiables, i.e. unique representation of CaosDB Records
+(such as an experiment belonging to a project and a date/time).
+2. The identifiables are either found in CaosDB or they are created.
+3. The identifiables are updated based on the data in the file structure.
 """
 
 import logging
 import re
+from abc import ABCMeta, abstractmethod
 
 import caosdb as db
 from caosdb.exceptions import EntityDoesNotExistError
@@ -62,24 +65,37 @@ def get_entity(name):
     return ENTITIES[name]
 
 
-class AbstractCFood(object):
+class FileGuide(object):
+    @staticmethod
+    def access(path):
+        """ Replace with a callable that maps a CaosDB path to a local one."""
 
-    def __init__(self):
+        raise NotImplementedError()
+
+
+fileguide = FileGuide()
+
+
+class AbstractCFood(object, metaclass=ABCMeta):
+
+    def __init__(self, item):
         """ Abstract base class for Crawler food (CFood)."""
         self.to_be_updated = db.Container()
         self.identifiables = db.Container()
+        self.item = item
+        self.attached_items = []
 
+    @abstractmethod
     def create_identifiables(self):
         """
         should set the instance variable Container with the identifiables
         """
-        raise NotImplementedError()
 
+    @abstractmethod
     def update_identifiables(self):
         """ Changes the identifiables as needed and adds changed identifiables
         to self.to_be_updated
         """
-        raise NotImplementedError()
 
     def push_identifiables_to_CaosDB(self):
         """ Updates the self.to_be_updated Container, i.e. pushes the changes
@@ -108,6 +124,56 @@ class AbstractCFood(object):
         logger.debug(self.to_be_updated)
         guard.safe_update(self.to_be_updated)
 
+    @classmethod
+    def match_item(cls, item):
+        """ Matches an item found by the crawler against this class. Returns
+        True if the item shall be treated by this class, i.e. if this class
+        matches the item.
+
+        Parameters
+        ----------
+        item : object
+               iterated by the crawler
+
+        To be overwritten by subclasses!
+        """
+
+        return True
+
+    def collect_information(self):
+        """ The CFood collects information for further processing.
+
+        Often CFoods need information from files or even from the database in
+        order to make processing decision. It is intended that this function is
+        called after match. Thus match can be used without connecting to the
+        database.
+
+        To be overwritten by subclasses
+        """
+        pass
+
+    def attach(self, item):
+        self.attached_items.append(item)
+
+    # TODO looking for should `attach` the files itself. This would allow to
+    # group them right away and makes it unnecessary to check matches later
+    # again.
+    def looking_for(self, item):
+        """
+        returns True if item can be added to this CFood.
+
+        Typically a CFood exists for a file and defines how to deal with the
+        file. However, sometimes additional files "belong" to a CFood. E.g. an
+        experiment CFood might match against a README file but labnotes.txt
+        also shall be treated by the cfood (and not a special cfood created for
+        labnotes.txt)
+        This function can be used to define what files shall be 'attached'.
+
+        To be overwritten by subclasses
+        """
+
+        return False
+
     @staticmethod
     # move to api?
     def set_parents(entity, names):
@@ -140,55 +206,6 @@ class AbstractCFood(object):
             entity.add_property(prop, value)
 
 
-class CMeal(object):
-    """
-    CMeal groups equivalent Files and allow their collected insertion.
-
-    Sometimes there is no one file that can be used to trigger the creation of
-    some Record. E.g. if a collection of images shall be referenced from one
-    Record that groups them, it is unclear which image should trigger the
-    creation of the Record.
-
-    CMeals are grouped based on the groups in the used regular expression. If,
-    in the above example, all the images reside in one folder, all groups
-    except that for the file name should match. The groups that shall match
-    need to be listed in the matching_groups class property. Subclasses will
-    overwrite this property.
-
-    The cook function of a cfood allows this class to work. Instead of directly
-    instantiating a CFood the cook function is used. If the CFood is also a
-    child of CMeal, it will be checked (using get_suitable_cfood) in the cook
-    function whether a new CFood should be created or if the file match should
-    be added to an existing one. In order to allow this all instances of a
-    CFood class are tracked in the existing_instances class member.
-    """
-    existing_instances = []
-    matching_groups = []
-
-    def __init__(self, *args, **kwargs):
-        self.__class__.existing_instances.append(self)
-        self.crawled_files = []
-
-    def add(self, crawled_file):
-        self.crawled_files.append(crawled_file)
-
-    @classmethod
-    def get_suitable_cfood(cls, match):
-        for cfood in cls.existing_instances:
-            suitable = True
-
-            for group in cls.matching_groups:
-                if (group not in match.groupdict() or
-                        group not in cfood.match.groupdict() or
-                        match.group(group) != cfood.match.group(group)):
-                    suitable = False
-
-            if suitable:
-                return cfood
-
-        return None
-
-
 def get_entity_for_path(path):
     try:
         q = "FIND FILE WHICH IS STORED AT '{}'".format(path)
@@ -210,7 +227,7 @@ class AbstractFileCFood(AbstractCFood):
     # function match()
     _pattern = None
 
-    def __init__(self, crawled_path, access=lambda x: x):
+    def __init__(self, crawled_path, *args, **kwargs):
         """ Abstract base class for file based Crawler food (CFood).
 
         Parameters
@@ -218,15 +235,11 @@ class AbstractFileCFood(AbstractCFood):
         crawled_path : The file that the crawler is currently matching. Its
                        path should match against the pattern of this class
 
-        access : callable, optional
-                 A function that takes a CaosDB path and returns a local path
         """
-        super().__init__()
-        self.access = access
+        super().__init__(*args, item=crawled_path, **kwargs)
         self._crawled_file = None
         self.crawled_path = crawled_path
-        self.match = type(self).match_file(crawled_path)
-        self.attached_ones = []
+        self.match = re.match(type(self).get_re(), crawled_path)
         self.attached_filenames = []
 
     @property
@@ -236,18 +249,6 @@ class AbstractFileCFood(AbstractCFood):
 
         return self._crawled_file
 
-    def collect_information(self):
-        """ The CFood collects information for further processing.
-
-        Often CFoods need information from files or even from the database in
-        order to make processing decision. It is intended that this function is
-        called after match. Thus match can be used without connecting to the
-        database.
-
-        To be overwritten by subclasses
-        """
-        pass
-
     @staticmethod
     def get_re():
         """ Returns the regular expression used to identify files that shall be
@@ -258,45 +259,16 @@ class AbstractFileCFood(AbstractCFood):
         raise NotImplementedError()
 
     @classmethod
-    def cook(cls, crawled_file, **kwargs):
-        """ possibly checks for existing CFoods whether the match should be
-        added or whether a new CFood instance needs to be returned
-
-        This function should typically be used to create CFoods in order to
-        prevent the creation of unnecessary instances.
-
-        This standard implementation does not do a check but may be overwritten
-        by subclasses.
-
-        Retruns
-        -------------
-        CFood: if a new instance was created
-        None: otherwise
-        """
-
-        return cls(crawled_file, **kwargs)
-
-    @classmethod
-    def match_file(cls, string):
+    def match_item(cls, path):
         """ Matches the regular expression of this class against file names
 
         Parameters
         ----------
-        string : str
+        path : str
                  The path of the file that shall be matched.
         """
 
-        # TODO this does not quite work. Sometimes the wrong expression is in
-        # _pattern; FIX
-        # if cls._pattern is None:
-        #    cls._pattern = re.compile(cls.get_re())
-
-        # return cls._pattern.match(string)
-
-        return re.match(cls.get_re(), string)
-
-    def attach(self, crawled_file):
-        self.attached_ones.append(crawled_file)
+        return re.match(cls.get_re(), path) is not None
 
     # TODO looking for should `attach` the files itsself. This would allow to
     # group them right away and makes it unnecessary to check matches later
@@ -320,37 +292,6 @@ class AbstractFileCFood(AbstractCFood):
 
         return False
 
-    @staticmethod
-    # move to api?
-    def set_parents(entity, names):
-        entity.parents.clear()
-
-        for n in names:
-            entity.add_parent(get_entity(n))
-
-    @staticmethod
-    # move to api?
-    def remove_property(entity, prop):
-        # TODO only do something when it is necessary?
-
-        if isinstance(prop, db.Entity):
-            name = prop.name
-        else:
-            name = prop
-
-        while entity.get_property(name) is not None:
-            entity.remove_property(name)
-
-    @staticmethod
-    # move to api?
-    def set_property(entity, prop, value, datatype=None):
-        AbstractCFood.remove_property(entity, prop)
-
-        if datatype is not None:
-            entity.add_property(prop, value, datatype=datatype)
-        else:
-            entity.add_property(prop, value)
-
 
 def assure_object_is_in_list(obj, containing_object, property_name,
                              to_be_updated, datatype=None):
@@ -564,12 +505,11 @@ def get_ids_for_entities_with_names(entities):
 
 
 class RowCFood(AbstractCFood):
-    def __init__(self, row, unique_cols, recordtype):
+    def __init__(self, item, unique_cols, recordtype, **kwargs):
         """
         table : pandas table
         """
-        super().__init__()
-        self.row = row
+        super().__init__(item, **kwargs)
         self.unique_cols = unique_cols
         self.recordtype = recordtype
 
@@ -578,13 +518,79 @@ class RowCFood(AbstractCFood):
         rec.add_parent(self.recordtype)
 
         for col in self.unique_cols:
-            rec.add_property(col, self.row.loc[col])
+            rec.add_property(col, self.item.loc[col])
         self.identifiables.append(rec)
 
     def update_identifiables(self):
         rec = self.identifiables[0]
 
-        for key, value in self.row.iteritems():
+        for key, value in self.item.iteritems():
             if key in self.unique_cols:
                 continue
             rec.add_property(key, value)
+
+
+class CMeal(object):
+    """
+    CMeal groups equivalent items and allows their collected insertion.
+
+    Sometimes there is no one item that can be used to trigger the creation of
+    some Record. E.g. if a collection of image files shall be referenced from one
+    Record that groups them, it is unclear which image should trigger the
+    creation of the Record.
+
+    CMeals are grouped based on the groups in the used regular expression. If,
+    in the above example, all the images reside in one folder, all groups of
+    the match except the one for the file name should be equal.
+    The groups that shall match
+    need to be listed in the matching_groups class property. Subclasses will
+    overwrite this property.
+
+    This allows to use has_suitable_cfood in the match_item function of a CFood
+    to check whether the necessary CFood was already created.
+    In order to allow this all instances of a
+    CFood class are tracked in the existing_instances class member.
+
+    Subclasses must have a cls.get_re function and a match member variable
+    (see AbstractFileCFood)
+    """
+    existing_instances = []
+    matching_groups = []
+
+    def __init__(self):
+        self.__class__.existing_instances.append(self)
+
+    @classmethod
+    def all_groups_equal(cls, m1, m2):
+        """ True if both matches exist and agree on all matching_groups."""
+        # re.match returns None for items that do not match the pattern.
+        if m1 is None or m2 is None:
+            return False
+
+        return all(group in m1.groupdict()
+                   and group in m2.groupdict()
+                   and m1.group(group) == m2.group(group)
+                   for group in cls.matching_groups)
+
+    @classmethod
+    def has_suitable_cfood(cls, item):
+        """ checks whether the required cfood object already exists.
+
+        item : the crawled item
+        """
+        match = re.match(cls.get_re(), item)
+
+        for cfood in cls.existing_instances:
+            if cls.all_groups_equal(match, cfood.match):
+                return True
+
+        return False
+
+    def belongs_to_meal(self, item):
+        # This is already the main item
+
+        if item == self.item:
+            return False
+        match = re.match(self.get_re(), item)
+
+        return self.all_groups_equal(match, self.match)
diff --git a/src/caosadvancedtools/crawler.py b/src/caosadvancedtools/crawler.py
index db89551d..2877cd7f 100644
--- a/src/caosadvancedtools/crawler.py
+++ b/src/caosadvancedtools/crawler.py
@@ -82,44 +82,56 @@ class UnknownCache(object):
 
 
 class Crawler(object):
-    def __init__(self, food=None, access=lambda x: x, use_cache=False,
-                 abort_on_exception=True, interactive=True):
+    def __init__(self, cfood_types, use_cache=False,
+                 abort_on_exception=True, interactive=True, hideKnown=False):
         """
         Parameters
         ----------
-        food : list of CFood classes, optional
+        cfood_types : list of CFood classes
                The Crawler will use those CFoods when crawling.
-        pattern : str
-                  The regex pattern for matching against file names.
-
         use_cache : bool, optional
                     Whether to use caching (not re-inserting probably existing
                     objects into CaosDB), defaults to False.
-
-        access : callable, optional
-                 A function that takes a CaosDB path and returns a local path
+        abort_on_exception : if true, exceptions are raised.
+                    Otherwise the crawler continues if an exception occurs.
         interactive : boolean, optional
                       If true, questions will be posed during execution of the
                       crawl function.
 
         """
 
-        if food is None:
-            self.food = []
-        else:
-            self.food = food
+        self.cfood_types = cfood_types
         self.interactive = interactive
-        self.access = access
         self.report = db.Container()
         self.use_cache = use_cache
+        self.hideKnown = hideKnown
         self.abort_on_exception = abort_on_exception
 
         if self.use_cache:
             self.cache = Cache()
 
+    def iteritems(self):
+        """ generates items to be crawled with an index"""
+        yield 0, None
+
     def collect_cfoods(self):
         """
-        to be overwritten by subclasses.
+        This is the first phase of the crawl. It collects all cfoods that shall
+        be processed. The second phase is iterating over cfoods and updating
+        CaosDB. This separate first step is necessary in order to allow a
+        single cfood being influenced by multiple crawled items. E.g. the
+        FileCrawler can have a single cfood treat multiple files.
+
+        This is a very basic implementation and this function should be
+        overwritten by subclasses.
+
+        The basic structure of this function should be that whatever is
+        being processed is iterated and each cfood is checked whether the
+        item 'matches'. If it does, a cfood is instantiated passing the item
+        as an argument.
+        The match can depend on the cfoods already being created, i.e. a file
+        might no longer match because it is already treated by an earlier
+        cfood.
 
         should return cfoods, tbs and errors_occured.
         # TODO do this via logging?
@@ -129,27 +141,83 @@ class Crawler(object):
         cfoods = []
         tbs = []
         errors_occured = False
+        matches = {idx: [] for idx, _ in self.iteritems()}
 
-        for food in self.food:
-            cfoods.append(food())
+        logger.info(separated("Matching files against CFoods"))
 
-        for Cfood in self.food:
-            try:
-                cfood = Cfood()
+        for Cfood in self.cfood_types:
+            logger.debug("Matching against {}...".format(Cfood.__name__))
 
-                if cfood is not None:
-                    cfoods.append(cfood)
-            except Exception as e:
-                traceback.print_exc()
-                print(e)
+            for idx, item in self.iteritems():
+                if Cfood.match_item(item):
+                    try:
+                        cfoods.append(Cfood(item))
+                        matches[idx].append(Cfood.__name__)
+                        logger.debug("{} matched\n{}.".format(
+                                Cfood.__name__,
+                                item))
+                    except Exception as e:
+                        traceback.print_exc()
+                        print(e)
 
-                if self.abort_on_exception:
-                    raise e
-                errors_occured = True
-                tbs.append(e)
+                        if self.abort_on_exception:
+                            raise e
+                        errors_occured = True
+                        tbs.append(e)
+
+        logger.info(separated("CFoods are collecting information..."))
+
+        for cfood in cfoods:
+            cfood.collect_information()
+
+        logger.info(separated("Trying to attach further items to created CFoods"))
+
+        for cfood in cfoods:
+            # Use this instance's class; `Cfood` is stale from the loop above.
+            name = cfood.__class__.__name__
+            logger.debug("Matching against {}...".format(name))
+
+            for idx, item in self.iteritems():
+                if cfood.looking_for(item):
+                    logger.debug("{} matched\n{}.".format(name, item))
+                    cfood.attach(item)
+                    matches[idx].append(name)
+
+        self.check_matches(matches)
 
         return cfoods, tbs, errors_occured
 
+    def check_matches(self, matches):
+        # possibly load previously encountered "Missing matches" and
+        # "Multiple matches"
+        ucache = UnknownCache(interactive=self.interactive, load=self.hideKnown)
+
+        for idx, item in self.iteritems():
+            if len(matches[idx]) == 0:
+                msg = ("ATTENTION: No matching cfood!\n"
+                       "Tried to match {}\n".format(item))
+
+                if item in ucache.filenames:
+                    logger.debug(msg)
+                else:
+                    logger.warning(msg)
+                ucache.add(item)
+
+            if len(matches[idx]) > 1:
+                msg = ("Attention: More than one matching cfood!\n"
+                       + "Tried to match {}\n".format(item)
+                       + "\tRecordTypes:\t" + ", ".join(
+                            matches[idx])+"\n")
+
+                if item in ucache.filenames:
+                    logger.debug(msg)
+                else:
+                    logger.warning(msg)
+                ucache.add(item)
+
+        # Save the encountered problem matches
+        ucache.save()
+
     def cached_find_identifiables(self, identifiables):
         if self.use_cache:
             hashes = self.cache.update_ids_from_cache(identifiables)
@@ -273,122 +341,30 @@ class Crawler(object):
 
         return r
 
-    @staticmethod
-    def query_files(path):
-        query_str = "FIND FILE WHICH IS STORED AT " + \
-            (path if path.endswith("/") else path + "/") + "**"
-        logger.info("FILES QUERY: " + query_str)
-        files = db.execute_query(query_str)
-        logger.info("{} FILES TO BE PROCESSED.".format(len(files)))
-
-        return files
-
 
 class FileCrawler(Crawler):
-    def __init__(self, files, access=lambda x: x, hideKnown=False, **kwargs):
+    def __init__(self, files, **kwargs):
         """
         Parameters
         ----------
         files : files to be crawled
 
-        access : callable, optional
-                 A function that takes a CaosDB path and returns a local path
-
         """
         super().__init__(**kwargs)
         self.files = files
-        self.access = access
-        self.hideKnown = hideKnown
-
-    def match(self):
-
-        files = sorted([f.path for f in self.files])
-        errors_occured = False
-        tbs = []
-        cfoods = []
-        matches = {f: [] for f in files}
-
-        logger.info(separated("Matching files against CFoods"))
 
-        for Cfood in self.food:
-            logger.debug("Matching against {}...".format(Cfood.__name__))
-
-            for crawled_file in files:
-                if Cfood.match_file(crawled_file) is not None:
-                    matches[crawled_file].append(Cfood.__name__)
-
-                    logger.debug("{} matched\n{}.".format(
-                            Cfood.__name__,
-                            crawled_file))
-                    try:
-                        cfood = Cfood.cook(crawled_file, access=self.access)
-
-                        if cfood is not None:
-                            cfoods.append(cfood)
-                    except Exception as e:
-                        traceback.print_exc()
-                        print(e)
-
-                        if self.abort_on_exception:
-                            raise e
-                        errors_occured = True
-                        tbs.append(e)
-
-        logger.info(separated("CFoods are collecting information..."))
-
-        for cfood in cfoods:
-            cfood.collect_information()
+    def iteritems(self):
+        for idx, p in enumerate(sorted([f.path for f in self.files])):
+            yield idx, p
 
-        logger.info(separated("Trying to attach files to created CFoods"))
-
-        for cfood in cfoods:
-            logger.debug("Matching against {}...".format(Cfood.__name__))
-
-            for crawled_file in files:
-                if cfood.looking_for(crawled_file):
-                    logger.debug("{} matched\n{}.".format(
-                            cfood.__class__.__name__,
-                            crawled_file))
-                    cfood.attach(crawled_file)
-                    matches[crawled_file].append(Cfood.__name__)
-
-        # possibly load previously encountered "Missing matches" and
-        # "Multiple matches"
-        ucache = UnknownCache(interactive=self.interactive, load=self.hideKnown)
-
-        for crawled_file in files:
-            if len(matches[crawled_file]) == 0:
-                msg = ("ATTENTION: No matching cfood!\n"
-                       "Tried to match {}\n".format(crawled_file))
-
-                if crawled_file in ucache.filenames:
-                    logger.debug(msg)
-                else:
-                    logger.warning(msg)
-                ucache.add(crawled_file)
-
-            if len(matches[crawled_file]) > 1:
-                msg = ("Attention: More than one matching cfood!\n"
-                       + "Tried to match {}\n".format(crawled_file)
-                       + "\tRecordTypes:\t" + ", ".join(
-                            matches[crawled_file])+"\n")
-
-                if crawled_file in ucache.filenames:
-                    logger.debug(msg)
-                else:
-                    logger.warning(msg)
-                ucache.add(crawled_file)
-
-        # Save the encountered prblem matches
-        ucache.save()
-
-        return cfoods, tbs, errors_occured
-
-    def collect_cfoods(self):
-
-        cfoods, tbs, errors_occured = self.match()
+    @staticmethod
+    def query_files(path):
+        query_str = "FIND FILE WHICH IS STORED AT " + (path if path.endswith("/") else path + "/") + "**"
+        logger.info("FILES QUERY: " + query_str)
+        files = db.execute_query(query_str)
+        logger.info("{} FILES TO BE PROCESSED.".format(len(files)))
 
-        return cfoods, tbs, errors_occured
+        return files
 
 
 class TableCrawler(Crawler):
@@ -402,32 +378,18 @@ class TableCrawler(Crawler):
                       identifiable
         recordtype : Record Type of the Records to be created
         """
-        super().__init__(**kwargs)
         self.table = table
-        self.unique_cols = unique_cols
-        self.recordtype = recordtype
-
-    def collect_cfoods(self):
-        cfoods = []
-        tbs = []
-        errors_occured = False
-
-        for _, row in self.table.iterrows():
-            try:
-                cfood = RowCFood(row, self.unique_cols, self.recordtype)
 
-                if cfood is not None:
-                    cfoods.append(cfood)
-            except Exception as e:
-                traceback.print_exc()
-                print(e)
+        # TODO I do not like this yet, but I do not see a better way so far.
+        class ThisRowCF(RowCFood):
+            def __init__(self, item):
+                super().__init__(item, unique_cols, recordtype)
 
-                if self.abort_on_exception:
-                    raise e
-                errors_occured = True
-                tbs.append(e)
+        super().__init__(cfood_types=[ThisRowCF], **kwargs)
 
-        return cfoods, tbs, errors_occured
+    def iteritems(self):
+        """Yield the (index, row) pairs produced by ``self.table.iterrows()``."""
+        yield from self.table.iterrows()
 
 
 def get_value(prop):
diff --git a/unittests/test_cfood.py b/unittests/test_cfood.py
index dcea2cbd..bfa3443a 100644
--- a/unittests/test_cfood.py
+++ b/unittests/test_cfood.py
@@ -21,33 +21,90 @@
 # along with this program. If not, see <https://www.gnu.org/licenses/>.
 #
 # ** end header
+import re
 import unittest
 
 import caosdb as db
-
-from caosadvancedtools.cfood import (AbstractFileCFood, assure_has_parent,
+from caosadvancedtools.cfood import (AbstractCFood, AbstractFileCFood, CMeal,
+                                     assure_has_parent,
                                      assure_object_is_in_list)
 from caosadvancedtools.example_cfood import ExampleCFood
 
 PATTERN = "h.*"
 
 
-class TestCFood(AbstractFileCFood):
+class ExampleCFoodMeal(AbstractFileCFood, CMeal):
+    matching_groups = ["test"]
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        # explicitly run the constructor of CMeal as well
+        CMeal.__init__(self)
+
+    @classmethod
+    def match_item(cls, item):
+        """ standard match_item, but returns False if a suitable cfood exists """
+
+        if cls.has_suitable_cfood(item):
+            return False
+
+        return re.match(cls.get_re(), item) is not None
+
+    def looking_for(self, crawled_file):
+        """ standard looking_for, but also returns True if the file belongs
+        to this meal (see belongs_to_meal)"""
+
+        if self.belongs_to_meal(crawled_file):
+            return True
+
+        return super().looking_for(crawled_file)
+
+    @staticmethod
+    def get_re():
+        return r"/(?P<test>[a-z]*)/"
+
+    def create_identifiables(self):
+        pass
+
+    def update_identifiables(self):
+        pass
+
+
+class SimpleCFood(AbstractFileCFood):
 
     @staticmethod
     def get_re():
         return PATTERN
 
 
+class DependendCFood(AbstractCFood):
+    existing = []  # class-level list of all instances created so far
+
+    @classmethod
+    def match_item(cls, item):
+        # match only while no instance exists; always return a real bool
+        return len(cls.existing) == 0
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        DependendCFood.existing.append(self)
+
+    def create_identifiables(self):
+        pass
+
+    def update_identifiables(self):
+        pass
+
+
 class CFoodReTest(unittest.TestCase):
     def test(self):
-        self.assertEquals(TestCFood.get_re(), PATTERN)
-        self.assertEqual(TestCFood._pattern, None)
-        self.assertIsNotNone(TestCFood.match_file("hallo"))
+        self.assertEqual(SimpleCFood.get_re(), PATTERN)
+        self.assertEqual(SimpleCFood._pattern, None)
+        self.assertTrue(SimpleCFood.match_item("hallo"))
         # TODO the caching is of compiled re is disabled currently
-        # self.assertIsNotNone(TestCFood._pattern)
-        self.assertIsNotNone(TestCFood.match_file("hallo"))
-        self.assertIsNone(TestCFood.match_file("allo"))
+        # self.assertIsNotNone(SimpleCFood._pattern)
+        self.assertTrue(SimpleCFood.match_item("hallo"))
+        self.assertFalse(SimpleCFood.match_item("allo"))
 
 
 class InsertionTest(unittest.TestCase):
@@ -75,10 +132,31 @@ class InsertionTest(unittest.TestCase):
         assert len(to_be_updated) == 0
 
 
+class DependendTest(unittest.TestCase):
+    def test(self):
+        self.assertTrue(DependendCFood.match_item(None))
+        cf = DependendCFood(None)
+        self.assertFalse(DependendCFood.match_item(None))
+
+
 class ExampleTest(unittest.TestCase):
     def test(self):
         path = "/data/rabbit/2019-03-03/README.md"
         cf = ExampleCFood(crawled_path=path)
-        self.assertIsNotNone(ExampleCFood.match_file(path))
+        self.assertTrue(ExampleCFood.match_item(path))
         self.assertEqual(cf.match.group('species'), 'rabbit')
         self.assertEqual(cf.match.group('date'), '2019-03-03')
+
+
+class MealTest(unittest.TestCase):
+    def test(self):
+        # path should match
+        self.assertTrue(ExampleCFoodMeal.match_item("/this/file"))
+        # create an instance
+        c = ExampleCFoodMeal("/this/file")
+        # same prefix should no longer match
+        self.assertFalse(ExampleCFoodMeal.match_item("/this/other"))
+        # but instance should be looking for this prefix
+        self.assertTrue(c.looking_for("/this/other"))
+        # class should still match other prefixes
+        self.assertTrue(ExampleCFoodMeal.match_item("/that/file"))
-- 
GitLab