Commit aea9144a authored by Henrik tom Wörden

Merge branch 'f-refactor' into 'master'

Refactor Crawler

See merge request caosdb/caosdb-advanced-user-tools!19
parents 75b703dc a5ec4027
 __pycache__
 .tox
 .coverage
-cache.db
+*cache.db
 *.egg-info
 .docker/cert
@@ -7,10 +7,12 @@ pip install tox --user
 tox
 
 # Run Integration Tests Locally
-1. Mount `integrationtests/full_test/extroot` to the folder that will be used as
-   extroot. E.g. `sudo mount -o bind extroot ../../../caosdb-deploy/profiles/empty/paths/extroot`
-2. Start an empty CaosDB instance
-3. run test.sh
+1. Change directory to `integrationtests/full_test/`.
+2. Mount `extroot` to the folder that will be used as extroot. E.g. `sudo mount
+   -o bind extroot ../../../caosdb-deploy/profiles/empty/paths/extroot`.
+3. Start an empty CaosDB instance (with the mounted extroot).
+4. Run `test.sh`.
 
 # Code Formatting
 autopep8 -i -r ./
@@ -25,14 +25,13 @@
 import argparse
 import logging
-import sys
 from argparse import RawTextHelpFormatter
 
 import caosdb as db
+from caosadvancedtools.cfood import fileguide
 from caosadvancedtools.crawler import FileCrawler
-from caosadvancedtools.guard import INSERT, RETRIEVE, UPDATE, Guard
-from caosadvancedtools.utils import set_log_level
+from caosadvancedtools.guard import UPDATE
 from scifolder import (AnalysisCFood, ExperimentCFood, ProjectCFood,
                        PublicationCFood, SimulationCFood)
@@ -54,8 +53,8 @@ if __name__ == "__main__":
     logger = logging.getLogger("caosadvancedtools")
     conlogger = logging.getLogger("connection")
     conlogger.setLevel(level=logging.ERROR)
-    logger.setLevel(level=logging.WARN)
+    logger.setLevel(level=logging.DEBUG)
+    fileguide.access = access
 
     parser = get_parser()
     args = parser.parse_args()
@@ -63,9 +62,9 @@ if __name__ == "__main__":
     files = FileCrawler.query_files(args.path)
     logger.info("Query done...")
     config = db.configuration.get_config()
-    c = FileCrawler(files=files, use_cache=True, access=access,
+    c = FileCrawler(files=files, use_cache=True,
                     interactive=False, hideKnown=True,
-                    food=[ProjectCFood,
+                    cfood_types=[ProjectCFood,
                           ExperimentCFood, AnalysisCFood,
                           PublicationCFood, SimulationCFood,
                           ])
...
@@ -4,6 +4,7 @@ echo "Filling the database"
 ./filldb.sh
 echo "Testing the crawler database"
 python3 -m pytest test_crawler.py
+python3 test_table.py
 # TODO the following test deletes lots of the data inserted by the crawler
 echo "Testing im and export"
 python3 test_im_und_export.py
 #!/usr/bin/env python3
 import os
-import unittest
 from tempfile import TemporaryDirectory
 
 import caosdb as db
...
@@ -18,17 +18,13 @@
 # You should have received a copy of the GNU Affero General Public License
 # along with this program. If not, see <https://www.gnu.org/licenses/>.
-import argparse
 import logging
-import sys
-from argparse import RawTextHelpFormatter
 
 import caosdb as db
 import pandas as pd
 from caosadvancedtools.crawler import TableCrawler
-from caosadvancedtools.guard import INSERT, RETRIEVE, UPDATE, Guard
-from caosadvancedtools.utils import set_log_level
+from caosadvancedtools.guard import UPDATE
 
 if __name__ == "__main__":
     logger = logging.getLogger("caosadvancedtools")
...
@@ -25,18 +25,21 @@
 # ** end header
 """ Defines how something that shall be inserted into CaosDB is treated.
 
-CaosDB can automatically be filled with Records based on some file structure.
-The Crawler will iterate over the files and test for each file whether a CFood
-exists that matches the file path. If one does, it is instanciated to treat the
-match. This occurs in basically three steps:
-1. create a list of identifiables, i.e. unique representation of CaosDB Records
-(such as an experiment belonging to a project and a date/time)
-2. the identifiables are either found in CaosDB or they are created.
-3. the identifiables are update based on the date in the file structure
+CaosDB can automatically be filled with Records based on some structure: a file
+structure, a table or similar.
+The Crawler will iterate over the respective items and test for each item
+whether a CFood class exists that matches it, i.e. whether a CFood class wants
+to treat that particular item. If one does, it is instantiated to treat the
+match. This occurs in basically three steps:
+1. Create a list of identifiables, i.e. unique representations of CaosDB
+Records (such as an experiment belonging to a project and a date/time).
+2. The identifiables are either found in CaosDB or they are created.
+3. The identifiables are updated based on the data in the file structure.
 """
 
 import logging
 import re
+from abc import ABCMeta, abstractmethod
 
 import caosdb as db
 from caosdb.exceptions import EntityDoesNotExistError
@@ -62,24 +65,37 @@ def get_entity(name):
     return ENTITIES[name]
 
 
-class AbstractCFood(object):
+class FileGuide(object):
+    def access(path):
+        """ should be replaced by a function that adds
+        a prefix to paths to allow to access caosdb files locally"""
+        raise NotImplementedError()
+
+
+fileguide = FileGuide()
+
+
+class AbstractCFood(object, metaclass=ABCMeta):
 
-    def __init__(self):
+    def __init__(self, item):
         """ Abstract base class for Crawler food (CFood)."""
         self.to_be_updated = db.Container()
         self.identifiables = db.Container()
+        self.item = item
+        self.attached_items = []
 
+    @abstractmethod
     def create_identifiables(self):
         """
         should set the instance variable Container with the identifiables
         """
-        raise NotImplementedError()
 
+    @abstractmethod
     def update_identifiables(self):
         """ Changes the identifiables as needed and adds changed identifiables
         to self.to_be_updated
         """
-        raise NotImplementedError()
 
     def push_identifiables_to_CaosDB(self):
         """ Updates the self.to_be_updated Container, i.e. pushes the changes
@@ -108,6 +124,56 @@ class AbstractCFood(object):
         logger.debug(self.to_be_updated)
         guard.safe_update(self.to_be_updated)
 
+    @classmethod
+    def match_item(cls, item):
+        """ Matches an item found by the crawler against this class. Returns
+        True if the item shall be treated by this class, i.e. if this class
+        matches the item.
+
+        Parameters
+        ----------
+        item : object
+               iterated by the crawler
+
+        To be overwritten by subclasses!
+        """
+        return True
+
+    def collect_information(self):
+        """ The CFood collects information for further processing.
+
+        Often CFoods need information from files or even from the database in
+        order to make processing decisions. It is intended that this function
+        is called after match. Thus match can be used without connecting to
+        the database.
+
+        To be overwritten by subclasses.
+        """
+        pass
+
+    def attach(self, item):
+        self.attached_items.append(item)
+
+    # TODO looking_for should `attach` the files itself. This would allow to
+    # group them right away and makes it unnecessary to check matches later
+    # again.
+    def looking_for(self, item):
+        """
+        Returns True if item can be added to this CFood.
+
+        Typically a CFood exists for a file and defines how to deal with that
+        file. However, sometimes additional files "belong" to a CFood. E.g. an
+        experiment CFood might match against a README file, but labnotes.txt
+        shall also be treated by that cfood (and not by a special cfood
+        created for labnotes.txt).
+        This function can be used to define which files shall be 'attached'.
+
+        To be overwritten by subclasses.
+        """
+        return False
+
     @staticmethod
     # move to api?
     def set_parents(entity, names):
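The looking_for/attach pair lets one CFood pull in additional files after the
initial match. A sketch of the README/labnotes example from the docstring
(class name and pattern are invented):

from caosadvancedtools.cfood import AbstractFileCFood

class ReadmeCFoodSketch(AbstractFileCFood):
    @staticmethod
    def get_re():
        return r"(?P<dir>.*)/README\.md"

    def looking_for(self, item):
        # also claim the labnotes.txt next to the matched README
        return item == self.match.group("dir") + "/labnotes.txt"

    def create_identifiables(self):
        pass

    def update_identifiables(self):
        # the crawler has called attach() for every item we were looking for,
        # so the labnotes path is now available in self.attached_items
        pass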
@@ -140,55 +206,6 @@ class AbstractCFood(object):
         entity.add_property(prop, value)
 
 
-class CMeal(object):
-    """
-    CMeal groups equivalent Files and allow their collected insertion.
-
-    Sometimes there is no one file that can be used to trigger the creation of
-    some Record. E.g. if a collection of images shall be referenced from one
-    Record that groups them, it is unclear which image should trigger the
-    creation of the Record.
-
-    CMeals are grouped based on the groups in the used regular expression. If,
-    in the above example, all the images reside in one folder, all groups
-    except that for the file name should match. The groups that shall match
-    need to be listed in the matching_groups class property. Subclasses will
-    overwrite this property.
-
-    The cook function of a cfood allows this class to work. Instead of directly
-    instantiating a CFood the cook function is used. If the CFood is also a
-    child of CMeal, it will be checked (using get_suitable_cfood) in the cook
-    function whether a new CFood should be created or if the file match should
-    be added to an existing one. In order to allow this all instances of a
-    CFood class are tracked in the existing_instances class member.
-    """
-    existing_instances = []
-    matching_groups = []
-
-    def __init__(self, *args, **kwargs):
-        self.__class__.existing_instances.append(self)
-        self.crawled_files = []
-
-    def add(self, crawled_file):
-        self.crawled_files.append(crawled_file)
-
-    @classmethod
-    def get_suitable_cfood(cls, match):
-        for cfood in cls.existing_instances:
-            suitable = True
-
-            for group in cls.matching_groups:
-                if (group not in match.groupdict() or
-                        group not in cfood.match.groupdict() or
-                        match.group(group) != cfood.match.group(group)):
-                    suitable = False
-
-            if suitable:
-                return cfood
-
-        return None
-
-
 def get_entity_for_path(path):
     try:
         q = "FIND FILE WHICH IS STORED AT '{}'".format(path)
@@ -210,7 +227,7 @@ class AbstractFileCFood(AbstractCFood):
     # function match()
     _pattern = None
 
-    def __init__(self, crawled_path, access=lambda x: x):
+    def __init__(self, crawled_path, *args, **kwargs):
         """ Abstract base class for file based Crawler food (CFood).
 
         Parameters
@@ -218,15 +235,11 @@
         crawled_path : The file that the crawler is currently matching. Its
                        path should match against the pattern of this class
-        access : callable, optional
-                 A function that takes a CaosDB path and returns a local path
         """
-        super().__init__()
-        self.access = access
+        super().__init__(*args, item=crawled_path, **kwargs)
         self._crawled_file = None
         self.crawled_path = crawled_path
-        self.match = type(self).match_file(crawled_path)
-        self.attached_ones = []
+        self.match = re.match(type(self).get_re(), crawled_path)
         self.attached_filenames = []
 
     @property
@@ -236,18 +249,6 @@ class AbstractFileCFood(AbstractCFood):
         return self._crawled_file
 
-    def collect_information(self):
-        """ The CFood collects information for further processing.
-
-        Often CFoods need information from files or even from the database in
-        order to make processing decision. It is intended that this function is
-        called after match. Thus match can be used without connecting to the
-        database.
-
-        To be overwritten by subclasses
-        """
-        pass
-
     @staticmethod
     def get_re():
         """ Returns the regular expression used to identify files that shall be
@@ -258,45 +259,16 @@ class AbstractFileCFood(AbstractCFood):
         raise NotImplementedError()
 
     @classmethod
-    def cook(cls, crawled_file, **kwargs):
-        """ possibly checks for existing CFoods whether the match should be
-        added or whether a new CFood instance needs to be returned
-
-        This function should typically be used to create CFoods in order to
-        prevent the creation of unnecessary instances.
-
-        This standard implementation does not do a check but may be overwritten
-        by subclasses.
-
-        Retruns
-        -------------
-        CFood: if a new instance was created
-        None: otherwise
-        """
-
-        return cls(crawled_file, **kwargs)
-
-    @classmethod
-    def match_file(cls, string):
+    def match_item(cls, path):
         """ Matches the regular expression of this class against file names
 
         Parameters
         ----------
-        string : str
+        path : str
                  The path of the file that shall be matched.
         """
 
-        # TODO this does not quite work. Sometimes the wrong expression is in
-        # _pattern; FIX
-        # if cls._pattern is None:
-        #     cls._pattern = re.compile(cls.get_re())
-        # return cls._pattern.match(string)
-        return re.match(cls.get_re(), string)
-
-    def attach(self, crawled_file):
-        self.attached_ones.append(crawled_file)
+        return re.match(cls.get_re(), path) is not None
 
     # TODO looking_for should `attach` the files itself. This would allow to
     # group them right away and makes it unnecessary to check matches later
@@ -320,37 +292,6 @@ class AbstractFileCFood(AbstractCFood):
         return False
 
-    @staticmethod
-    # move to api?
-    def set_parents(entity, names):
-        entity.parents.clear()
-
-        for n in names:
-            entity.add_parent(get_entity(n))
-
-    @staticmethod
-    # move to api?
-    def remove_property(entity, prop):
-        # TODO only do something when it is necessary?
-
-        if isinstance(prop, db.Entity):
-            name = prop.name
-        else:
-            name = prop
-
-        while entity.get_property(name) is not None:
-            entity.remove_property(name)
-
-    @staticmethod
-    # move to api?
-    def set_property(entity, prop, value, datatype=None):
-        AbstractCFood.remove_property(entity, prop)
-
-        if datatype is not None:
-            entity.add_property(prop, value, datatype=datatype)
-        else:
-            entity.add_property(prop, value)
-
 
 def assure_object_is_in_list(obj, containing_object, property_name,
                              to_be_updated, datatype=None):
@@ -564,12 +505,11 @@ def get_ids_for_entities_with_names(entities):
 
 class RowCFood(AbstractCFood):
 
-    def __init__(self, row, unique_cols, recordtype):
+    def __init__(self, item, unique_cols, recordtype, **kwargs):
         """
         table : pandas table
         """
-        super().__init__()
-        self.row = row
+        super().__init__(item, **kwargs)
         self.unique_cols = unique_cols
         self.recordtype = recordtype
@@ -578,13 +518,79 @@ class RowCFood(AbstractCFood):
         rec.add_parent(self.recordtype)
 
         for col in self.unique_cols:
-            rec.add_property(col, self.row.loc[col])
+            rec.add_property(col, self.item.loc[col])
         self.identifiables.append(rec)
 
     def update_identifiables(self):
         rec = self.identifiables[0]
 
-        for key, value in self.row.iteritems():
+        for key, value in self.item.iteritems():
             if key in self.unique_cols:
                 continue
             rec.add_property(key, value)
+
+
+class CMeal(object):
+    """
+    CMeal groups equivalent items and allows their collected insertion.
+
+    Sometimes there is no single item that can be used to trigger the creation
+    of some Record. E.g. if a collection of image files shall be referenced
+    from one Record that groups them, it is unclear which image should trigger
+    the creation of the Record.
+
+    CMeals are grouped based on the named groups in the used regular
+    expression. If, in the above example, all the images reside in one folder,
+    all groups of the filename match except the one for the file name itself
+    should agree. The groups that shall match need to be listed in the
+    matching_groups class property. Subclasses will overwrite this property.
+
+    This allows has_suitable_cfood to be used in the match_item function of a
+    CFood to check whether the necessary CFood was already created.
+    In order to allow this, all instances of a CFood class are tracked in the
+    existing_instances class member.
+
+    Subclasses must have a cls.get_re function and a match member variable
+    (see AbstractFileCFood).
+    """
+    existing_instances = []
+    matching_groups = []
+
+    def __init__(self):
+        self.__class__.existing_instances.append(self)
+
+    @classmethod
+    def all_groups_equal(cls, m1, m2):
+        equal = True
+
+        for group in cls.matching_groups:
+            if (group not in m1.groupdict() or
+                    group not in m2.groupdict() or
+                    m1.group(group) != m2.group(group)):
+                equal = False
+
+        return equal
+
+    @classmethod
+    def has_suitable_cfood(cls, item):
+        """ checks whether the required cfood object already exists.
+
+        item : the crawled item
+        """
+        match = re.match(cls.get_re(), item)
+
+        for cfood in cls.existing_instances:
+            if cls.all_groups_equal(match, cfood.match):
+                return True
+
+        return False
+
+    def belongs_to_meal(self, item):
+        # This is already the main item
+        if item == self.item:
+            return False
+        match = re.match(self.get_re(), item)
+
+        return self.all_groups_equal(match, self.match)
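The grouping that CMeal performs can be seen in isolation in a small sketch
(pattern and group names are invented): two matches whose listed groups agree
belong to the same meal, regardless of the file name group.

import re
from caosadvancedtools.cfood import CMeal

class ImageMealSketch(CMeal):
    matching_groups = ["project", "date"]

pattern = r"/(?P<project>[^/]+)/(?P<date>\d{4}-\d{2}-\d{2})/(?P<name>[^/]+)"
m1 = re.match(pattern, "/climate/2020-01-01/image_001.png")
m2 = re.match(pattern, "/climate/2020-01-01/image_002.png")

assert ImageMealSketch.all_groups_equal(m1, m2)  # same folder, same meal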
@@ -82,44 +82,56 @@ class UnknownCache(object):
 
 class Crawler(object):
-    def __init__(self, food=None, access=lambda x: x, use_cache=False,
-                 abort_on_exception=True, interactive=True):
+    def __init__(self, cfood_types, use_cache=False,
+                 abort_on_exception=True, interactive=True, hideKnown=False):
         """
         Parameters
        ----------
-        food : list of CFood classes, optional
+        cfood_types : list of CFood classes
              The Crawler will use those CFoods when crawling.
-        pattern : str
-                  The regex pattern for matching against file names.
         use_cache : bool, optional
                     Whether to use caching (not re-inserting probably existing
                     objects into CaosDB), defaults to False.
-        access : callable, optional
-                 A function that takes a CaosDB path and returns a local path
+        abort_on_exception : if true, exceptions are raised.
+                 Otherwise the crawler continues if an exception occurs.
         interactive : boolean, optional
                       If true, questions will be posed during execution of the
                       crawl function.
         """
 
-        if food is None:
-            self.food = []
-        else:
-            self.food = food
+        self.cfood_types = cfood_types
         self.interactive = interactive
-        self.access = access
         self.report = db.Container()
         self.use_cache = use_cache
+        self.hideKnown = hideKnown
        self.abort_on_exception = abort_on_exception
 
         if self.use_cache:
            self.cache = Cache()
 
+    def iteritems(self):
+        """ generates items to be crawled with an index"""
+        yield 0, None
+
     def collect_cfoods(self):
         """
-        to be overwritten by subclasses.
+        This is the first phase of the crawl. It collects all cfoods that
+        shall be processed. The second phase is iterating over the cfoods and
+        updating CaosDB. This separate first step is necessary in order to
+        allow a single cfood to be influenced by multiple crawled items. E.g.
+        the FileCrawler can have a single cfood treat multiple files.
+
+        This is a very basic implementation and this function should be
+        overwritten by subclasses.
+
+        The basic structure of this function should be that whatever is
+        being processed is iterated and each cfood is checked whether the
+        item 'matches'. If it does, a cfood is instantiated passing the item
+        as an argument.
+        The match can depend on the cfoods already created, i.e. a file
+        might no longer match because it is already treated by an earlier
+        cfood.
 
         should return cfoods, tbs and errors_occured.
         # TODO do this via logging?
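A subclass therefore only has to provide the items and the CFood classes; the
two-phase logic lives in the base class. A minimal sketch (the class name and
the data source are invented):

from caosadvancedtools.crawler import Crawler

class DictCrawlerSketch(Crawler):
    """Crawls an in-memory list of dicts instead of files."""

    def __init__(self, records, cfood_types, **kwargs):
        super().__init__(cfood_types=cfood_types, **kwargs)
        self.records = records

    def iteritems(self):
        # yield (index, item); the index is used to record which CFoods
        # matched which item
        for idx, rec in enumerate(self.records):
            yield idx, rec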
@@ -129,16 +141,21 @@
         cfoods = []
         tbs = []
         errors_occured = False
+        matches = {idx: [] for idx, _ in self.iteritems()}
 
-        for food in self.food:
-            cfoods.append(food())
+        logger.info(separated("Matching files against CFoods"))
 
-        for Cfood in self.food:
-            try:
-                cfood = Cfood()
-                if cfood is not None:
-                    cfoods.append(cfood)
+        for Cfood in self.cfood_types:
+            logger.debug("Matching against {}...".format(Cfood.__name__))
+
+            for idx, item in self.iteritems():
+                if Cfood.match_item(item):
+                    try:
+                        cfoods.append(Cfood(item))
+                        matches[idx].append(Cfood.__name__)
+                        logger.debug("{} matched\n{}.".format(
+                            Cfood.__name__,
+                            item))
                     except Exception as e:
                         traceback.print_exc()
                         print(e)
@@ -148,8 +165,59 @@
                         errors_occured = True
                         tbs.append(e)
 
+        logger.info(separated("CFoods are collecting information..."))
+
+        for cfood in cfoods:
+            cfood.collect_information()
+
+        logger.info(separated("Trying to attach further items to created CFoods"))
+
+        for cfood in cfoods:
+            logger.debug("Matching against {}...".format(Cfood.__name__))
+
+            for idx, item in self.iteritems():
+                if cfood.looking_for(item):
+                    logger.debug("{} matched\n{}.".format(
+                        cfood.__class__.__name__,
+                        item))
+                    cfood.attach(item)
+                    matches[idx].append(Cfood.__name__)
+
+        self.check_matches(matches)
+
         return cfoods, tbs, errors_occured
 
+    def check_matches(self, matches):
+        # possibly load previously encountered "Missing matches" and
+        # "Multiple matches"
+        ucache = UnknownCache(interactive=self.interactive, load=self.hideKnown)
+
+        for idx, item in self.iteritems():
+            if len(matches[idx]) == 0:
+                msg = ("ATTENTION: No matching cfood!\n"
+                       "Tried to match {}\n".format(item))
+
+                if item in ucache.filenames:
+                    logger.debug(msg)
+                else:
+                    logger.warning(msg)
+
+                ucache.add(item)
+
+            if len(matches[idx]) > 1:
+                msg = ("Attention: More than one matching cfood!\n"
+                       + "Tried to match {}\n".format(item)
+                       + "\tRecordTypes:\t" + ", ".join(
+                           matches[idx]) + "\n")
+
+                if item in ucache.filenames:
+                    logger.debug(msg)
+                else:
+                    logger.warning(msg)
+
+                ucache.add(item)
+
+        # Save the encountered problem matches
+        ucache.save()
+
     def cached_find_identifiables(self, identifiables):
         if self.use_cache:
             hashes = self.cache.update_ids_from_cache(identifiables)
@@ -273,122 +341,30 @@ class Crawler(object):
         return r
 
-    @staticmethod
-    def query_files(path):
-        query_str = "FIND FILE WHICH IS STORED AT " + \
-            (path if path.endswith("/") else path + "/") + "**"
-        logger.info("FILES QUERY: " + query_str)
-        files = db.execute_query(query_str)
-        logger.info("{} FILES TO BE PROCESSED.".format(len(files)))
-
-        return files
-
 
 class FileCrawler(Crawler):
-    def __init__(self, files, access=lambda x: x, hideKnown=False, **kwargs):
+    def __init__(self, files, **kwargs):
         """
         Parameters
         ----------
         files : files to be crawled
-        access : callable, optional
-                 A function that takes a CaosDB path and returns a local path
         """
 
         super().__init__(**kwargs)
         self.files = files
-        self.access = access
-        self.hideKnown = hideKnown
 
-    def match(self):
-
-        files = sorted([f.path for f in self.files])
-        errors_occured = False
-        tbs = []
-        cfoods = []
-        matches = {f: [] for f in files}
-
-        logger.info(separated("Matching files against CFoods"))
-
-        for Cfood in self.food:
-            logger.debug("Matching against {}...".format(Cfood.__name__))
-
-            for crawled_file in files:
-                if Cfood.match_file(crawled_file) is not None:
-                    matches[crawled_file].append(Cfood.__name__)
-                    logger.debug("{} matched\n{}.".format(
-                        Cfood.__name__,
-                        crawled_file))
-
-                    try:
-                        cfood = Cfood.cook(crawled_file, access=self.access)
-
-                        if cfood is not None:
-                            cfoods.append(cfood)
-                    except Exception as e:
-                        traceback.print_exc()
-                        print(e)
-
-                        if self.abort_on_exception:
-                            raise e
-                        errors_occured = True
-                        tbs.append(e)
-
-        logger.info(separated("CFoods are collecting information..."))
-
-        for cfood in cfoods:
-            cfood.collect_information()
-
-        logger.info(separated("Trying to attach files to created CFoods"))
-
-        for cfood in cfoods:
-            logger.debug("Matching against {}...".format(Cfood.__name__))
-
-            for crawled_file in files:
-                if cfood.looking_for(crawled_file):
-                    logger.debug("{} matched\n{}.".format(
-                        cfood.__class__.__name__,
-                        crawled_file))
-                    cfood.attach(crawled_file)
-                    matches[crawled_file].append(Cfood.__name__)
-
-        # possibly load previously encountered "Missing matches" and
-        # "Multiple matches"
-        ucache = UnknownCache(interactive=self.interactive, load=self.hideKnown)
-
-        for crawled_file in files:
-            if len(matches[crawled_file]) == 0:
-                msg = ("ATTENTION: No matching cfood!\n"
-                       "Tried to match {}\n".format(crawled_file))
-
-                if crawled_file in ucache.filenames:
-                    logger.debug(msg)
-                else:
-                    logger.warning(msg)
-
-                ucache.add(crawled_file)
-
-            if len(matches[crawled_file]) > 1:
-                msg = ("Attention: More than one matching cfood!\n"
-                       + "Tried to match {}\n".format(crawled_file)
-                       + "\tRecordTypes:\t" + ", ".join(
-                           matches[crawled_file]) + "\n")
-
-                if crawled_file in ucache.filenames:
-                    logger.debug(msg)
-                else:
-                    logger.warning(msg)
-
-                ucache.add(crawled_file)
-
-        # Save the encountered prblem matches
-        ucache.save()
-
-        return cfoods, tbs, errors_occured
-
-    def collect_cfoods(self):
-        cfoods, tbs, errors_occured = self.match()
-
-        return cfoods, tbs, errors_occured
+    def iteritems(self):
+        for idx, p in enumerate(sorted([f.path for f in self.files])):
+            yield idx, p
+
+    @staticmethod
+    def query_files(path):
+        query_str = "FIND FILE WHICH IS STORED AT " + (path if path.endswith("/") else path + "/") + "**"
+        logger.info("FILES QUERY: " + query_str)
+        files = db.execute_query(query_str)
+        logger.info("{} FILES TO BE PROCESSED.".format(len(files)))
+
+        return files
 
 
 class TableCrawler(Crawler):
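Putting the refactored pieces together, a crawl script now looks roughly like
crawl.py above (a sketch; the extroot prefix and the query path are
placeholders, the CFood list is left empty, and a configured CaosDB connection
is assumed):

from caosadvancedtools.cfood import fileguide
from caosadvancedtools.crawler import FileCrawler

fileguide.access = lambda path: "/opt/caosdb/mnt/extroot" + path

files = FileCrawler.query_files("/ExperimentalData")
crawler = FileCrawler(files=files, use_cache=True, interactive=False,
                      hideKnown=True, cfood_types=[])  # add CFood classes here
cfoods, tbs, errors_occured = crawler.collect_cfoods()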
@@ -402,32 +378,18 @@ class TableCrawler(Crawler):
                        identifiable
         recordtype : Record Type of the Records to be created
         """
-        super().__init__(**kwargs)
         self.table = table
-        self.unique_cols = unique_cols
-        self.recordtype = recordtype
 
-    def collect_cfoods(self):
-        cfoods = []
-        tbs = []
-        errors_occured = False
-
-        for _, row in self.table.iterrows():
-            try:
-                cfood = RowCFood(row, self.unique_cols, self.recordtype)
-
-                if cfood is not None:
-                    cfoods.append(cfood)
-            except Exception as e:
-                traceback.print_exc()
-                print(e)
-
-                if self.abort_on_exception:
-                    raise e
-                errors_occured = True
-                tbs.append(e)
-
-        return cfoods, tbs, errors_occured
+        # TODO I do not like this yet, but I do not see a better way so far.
+        class ThisRowCF(RowCFood):
+            def __init__(self, item):
+                super().__init__(item, unique_cols, recordtype)
+
+        super().__init__(cfood_types=[ThisRowCF], **kwargs)
+
+    def iteritems(self):
+        for idx, row in self.table.iterrows():
+            yield idx, row
 
 def get_value(prop):
...
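The refactored TableCrawler can then be fed a pandas table directly, roughly
like this (a sketch; column names, the record type and the connection setup
are assumptions, and the constructor arguments follow the docstring shown
above):

import pandas as pd
from caosadvancedtools.crawler import TableCrawler

table = pd.DataFrame([{"experiment_id": 1, "temperature": 23.5},
                      {"experiment_id": 2, "temperature": 24.1}])
crawler = TableCrawler(table=table, unique_cols=["experiment_id"],
                       recordtype="Measurement", interactive=False)
cfoods, tbs, errors_occured = crawler.collect_cfoods()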
@@ -21,33 +21,90 @@
 # along with this program. If not, see <https://www.gnu.org/licenses/>.
 #
 # ** end header
+import re
 import unittest
 
 import caosdb as db
-from caosadvancedtools.cfood import (AbstractFileCFood, assure_has_parent,
+from caosadvancedtools.cfood import (AbstractCFood, AbstractFileCFood, CMeal,
+                                     assure_has_parent,
                                      assure_object_is_in_list)
 from caosadvancedtools.example_cfood import ExampleCFood
 
 PATTERN = "h.*"
 
 
-class TestCFood(AbstractFileCFood):
+class ExampleCFoodMeal(AbstractFileCFood, CMeal):
+    matching_groups = ["test"]
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        # add the constructor of CMeal
+        CMeal.__init__(self)
+
+    @classmethod
+    def match_item(cls, item):
+        """ standard match_item, but returns False if a suitable cfood exists """
+
+        if cls.has_suitable_cfood(item):
+            return False
+
+        return re.match(cls.get_re(), item) is not None
+
+    def looking_for(self, crawled_file):
+        """ standard looking_for, but returns True if the file matches all
+        groups"""
+
+        if self.belongs_to_meal(crawled_file):
+            return True
+
+        return super().looking_for(crawled_file)
+
+    @staticmethod
+    def get_re():
+        return r"/(?P<test>[a-z]*)/"
+
+    def create_identifiables(self):
+        pass
+
+    def update_identifiables(self):
+        pass
+
+
+class SimpleCFood(AbstractFileCFood):
 
     @staticmethod
     def get_re():
         return PATTERN
 
 
+class DependendCFood(AbstractCFood):
+    existing = []
+
+    @classmethod
+    def match_item(cls, item):
+        if len(cls.existing) == 0:
+            return True
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        DependendCFood.existing.append(self)
+
+    def create_identifiables(self):
+        pass
+
+    def update_identifiables(self):
+        pass
+
+
 class CFoodReTest(unittest.TestCase):
     def test(self):
-        self.assertEquals(TestCFood.get_re(), PATTERN)
-        self.assertEqual(TestCFood._pattern, None)
-        self.assertIsNotNone(TestCFood.match_file("hallo"))
+        self.assertEqual(SimpleCFood.get_re(), PATTERN)
+        self.assertEqual(SimpleCFood._pattern, None)
+        self.assertTrue(SimpleCFood.match_item("hallo"))
         # TODO the caching of compiled re is disabled currently
-        # self.assertIsNotNone(TestCFood._pattern)
-        self.assertIsNotNone(TestCFood.match_file("hallo"))
-        self.assertIsNone(TestCFood.match_file("allo"))
+        # self.assertIsNotNone(SimpleCFood._pattern)
+        self.assertTrue(SimpleCFood.match_item("hallo"))
+        self.assertFalse(SimpleCFood.match_item("allo"))
 
 
 class InsertionTest(unittest.TestCase):
@@ -75,10 +132,31 @@ class InsertionTest(unittest.TestCase):
         assert len(to_be_updated) == 0
 
 
+class DependendTest(unittest.TestCase):
+    def test(self):
+        self.assertTrue(DependendCFood.match_item(None))
+        cf = DependendCFood(None)
+        self.assertFalse(DependendCFood.match_item(None))
+
+
 class ExampleTest(unittest.TestCase):
     def test(self):
         path = "/data/rabbit/2019-03-03/README.md"
         cf = ExampleCFood(crawled_path=path)
-        self.assertIsNotNone(ExampleCFood.match_file(path))
+        self.assertIsNotNone(ExampleCFood.match_item(path))
         self.assertEqual(cf.match.group('species'), 'rabbit')
         self.assertEqual(cf.match.group('date'), '2019-03-03')
+
+
+class MealTest(unittest.TestCase):
+    def test(self):
+        # the path should match
+        self.assertTrue(ExampleCFoodMeal.match_item("/this/file"))
+        # create an instance
+        c = ExampleCFoodMeal("/this/file")
+        # the same prefix should no longer match
+        self.assertFalse(ExampleCFoodMeal.match_item("/this/other"))
+        # but the instance should be looking for this prefix
+        self.assertTrue(c.looking_for("/this/other"))
+        # the class should still match other prefixes
+        self.assertTrue(ExampleCFoodMeal.match_item("/that/file"))