From eb752ae96198b8cbb76f8e1970c1fd6bea741e4b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Henrik=20tom=20W=C3=B6rden?= <h.tomwoerden@indiscale.com>
Date: Wed, 29 Apr 2020 10:28:20 +0000
Subject: [PATCH] Create a non file based crawler

---
 .docker/Dockerfile                            |   3 +
 .gitlab-ci.yml                                |  13 +-
 README_SETUP.md                               |   2 +-
 integrationtests/full_test/crawl.py           |  18 +-
 integrationtests/full_test/model.yml          |   5 +-
 integrationtests/full_test/test.sh            |   3 +-
 integrationtests/full_test/test_table.py      |  51 +++
 .../test_labfolder_import.py                  |   2 +-
 .../test_labfolder_retrieve.py                |   0
 src/caosadvancedtools/cfood.py                | 285 ++++++++++-------
 .../converter/labfolder_api.py                |   1 -
 src/caosadvancedtools/crawler.py              | 292 ++++++++++++------
 src/caosadvancedtools/example_cfood.py        |   4 +-
 unittests/test_cfood.py                       |  15 +-
 14 files changed, 474 insertions(+), 220 deletions(-)
 create mode 100644 integrationtests/full_test/test_table.py
 rename {unittests => manual_tests}/test_labfolder_import.py (95%)
 rename {unittests => manual_tests}/test_labfolder_retrieve.py (100%)

diff --git a/.docker/Dockerfile b/.docker/Dockerfile
index 9d2c5aed..9daad27e 100644
--- a/.docker/Dockerfile
+++ b/.docker/Dockerfile
@@ -4,6 +4,9 @@ RUN apt-get update && \
 	 curl \
 	 python3 \
 	 python3-pip \
+	 python3-requests \
+	 python3-pandas \
+	 python3-html2text \
 	 git \
 	 openjdk-11-jdk-headless \
 	 python-autopep8 \
diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 4cf9b51f..699b03e2 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -31,11 +31,12 @@ stages:
   - setup
   - cert
   - style
-  - test
+  - unittest
+  - integrationtest
 
 test:
   tags: [docker]
-  stage: test
+  stage: integrationtest
   image: $CI_REGISTRY_IMAGE_BASE
   script:
       - if [[ "$CAOSDB_TAG" == "" ]]; then
@@ -106,3 +107,11 @@ style:
   script:
       - autopep8 -ar --diff --exit-code .
   allow_failure: true
+
+unittest:
+  tags: [docker]
+  stage: unittest
+  image: $CI_REGISTRY_IMAGE
+  script:
+      - cd src
+      - python3 -m pytest ../unittests
diff --git a/README_SETUP.md b/README_SETUP.md
index afea7527..cba784d3 100644
--- a/README_SETUP.md
+++ b/README_SETUP.md
@@ -8,7 +8,7 @@ tox
 
 # Run Integration Tests Locally
 1. Mount `integrationtests/full_test/extroot` to the folder that will be used as
-   extroot. E.g. `sudo mount -o bind extroot ../../../caosdb-deploy/profiles/empty/custom/extroot`
+   extroot. E.g. `sudo mount -o bind extroot ../../../caosdb-deploy/profiles/empty/paths/extroot`
 2. Start an empty CaosDB instance
 3. run test.sh
 
diff --git a/integrationtests/full_test/crawl.py b/integrationtests/full_test/crawl.py
index 622213ec..51ad24b9 100755
--- a/integrationtests/full_test/crawl.py
+++ b/integrationtests/full_test/crawl.py
@@ -29,7 +29,8 @@ import sys
 from argparse import RawTextHelpFormatter
 
 import caosdb as db
-from caosadvancedtools.crawler import Crawler
+
+from caosadvancedtools.crawler import FileCrawler
 from caosadvancedtools.guard import INSERT, RETRIEVE, UPDATE, Guard
 from caosadvancedtools.utils import set_log_level
 from scifolder import (AnalysisCFood, ExperimentCFood, ProjectCFood,
@@ -59,12 +60,13 @@ if __name__ == "__main__":
     args = parser.parse_args()
 
     logger.info("Starting query...")
-    files = Crawler.query_files(args.path)
+    files = FileCrawler.query_files(args.path)
     logger.info("Query done...")
     config = db.configuration.get_config()
-    c = Crawler(use_cache=True, access=access,
-                food=[ProjectCFood,
-                      ExperimentCFood, AnalysisCFood,
-                      PublicationCFood, SimulationCFood,
-                      ])
-    c.crawl(files, interactive=False, security_level=INSERT, hideKnown=True)
+    c = FileCrawler(files=files, use_cache=True, access=access,
+                    interactive=False, hideKnown=True,
+                    food=[ProjectCFood,
+                          ExperimentCFood, AnalysisCFood,
+                          PublicationCFood, SimulationCFood,
+                          ])
+    c.crawl(security_level=UPDATE)
diff --git a/integrationtests/full_test/model.yml b/integrationtests/full_test/model.yml
index e698909b..a01836b3 100644
--- a/integrationtests/full_test/model.yml
+++ b/integrationtests/full_test/model.yml
@@ -18,7 +18,10 @@ Person:
     lastName:
       datatype: TEXT 
       description: 'LastName of a Person.'
-    responsible:
+  recommended_properties:
+    email:
+      datatype: TEXT 
+      description: 'Email of a Person.'
 responsible:
   datatype: REFERENCE
 revisionOf:
diff --git a/integrationtests/full_test/test.sh b/integrationtests/full_test/test.sh
index f7e7191b..25906da8 100755
--- a/integrationtests/full_test/test.sh
+++ b/integrationtests/full_test/test.sh
@@ -3,6 +3,7 @@ rm -rf cache.db
 echo "Filling the database"
 ./filldb.sh
 echo "Testing the crawler database"
-py.test-3 test_crawler.py
+python3 -m pytest test_crawler.py
+# TODO the following test deletes lots of the data inserted by the crawler
 echo "Testing im and export"
 python3 test_im_und_export.py
diff --git a/integrationtests/full_test/test_table.py b/integrationtests/full_test/test_table.py
new file mode 100644
index 00000000..91b99471
--- /dev/null
+++ b/integrationtests/full_test/test_table.py
@@ -0,0 +1,51 @@
+#!/usr/bin/env python3
+# encoding: utf-8
+#
+# This file is a part of the CaosDB Project.
+#
+# Copyright (C) 2020 Henrik tom Wörden
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <https://www.gnu.org/licenses/>.
+
+import argparse
+import logging
+import sys
+from argparse import RawTextHelpFormatter
+
+import caosdb as db
+import pandas as pd
+
+from caosadvancedtools.crawler import TableCrawler
+from caosadvancedtools.guard import INSERT, RETRIEVE, UPDATE, Guard
+from caosadvancedtools.utils import set_log_level
+
+if __name__ == "__main__":
+    logger = logging.getLogger("caosadvancedtools")
+    conlogger = logging.getLogger("connection")
+    conlogger.setLevel(level=logging.ERROR)
+    logger.setLevel(level=logging.DEBUG)
+
+    table = pd.read_csv("example_table.csv")
+
+    assert 0 == len(db.execute_query("FIND Person with firstname=Henrik"))
+    first = table.loc[table.firstName == "Henrik"]
+    tcr = TableCrawler(table=first, unique_cols=["firstName", "lastName"],
+                       recordtype="Person", interactive=False)
+    tcr.crawl(security_level=UPDATE)
+    assert 1 == len(db.execute_query("FIND Person with firstname=Henrik"))
+    tcr = TableCrawler(table=table, unique_cols=["firstName", "lastName"],
+                       recordtype="Person", interactive=False)
+    tcr.crawl(security_level=UPDATE)
+    assert 1 == len(db.execute_query("FIND Person with firstname=Henrik"))
+    assert 1 == len(db.execute_query("FIND Person with firstname=Max"))
diff --git a/unittests/test_labfolder_import.py b/manual_tests/test_labfolder_import.py
similarity index 95%
rename from unittests/test_labfolder_import.py
rename to manual_tests/test_labfolder_import.py
index 0508f3e2..e1e9d326 100644
--- a/unittests/test_labfolder_import.py
+++ b/manual_tests/test_labfolder_import.py
@@ -27,7 +27,7 @@ import sys
 import caosmodels
 from caosmodels.parser import parse_model_from_yaml
 
-from caosadvancedtools.converter import labfolder
+from caosadvancedtools.converter import labfolder_export as labfolder
 
 
 def main(args):
diff --git a/unittests/test_labfolder_retrieve.py b/manual_tests/test_labfolder_retrieve.py
similarity index 100%
rename from unittests/test_labfolder_retrieve.py
rename to manual_tests/test_labfolder_retrieve.py
diff --git a/src/caosadvancedtools/cfood.py b/src/caosadvancedtools/cfood.py
index 59c3a4ee..2128d6b1 100644
--- a/src/caosadvancedtools/cfood.py
+++ b/src/caosadvancedtools/cfood.py
@@ -7,6 +7,7 @@
 # Copyright (C) 2018 Research Group Biomedical Physics,
 # Max-Planck-Institute for Dynamics and Self-Organization Göttingen
 # Copyright (C) 2019 Henrik tom Wörden
+# Copyright (C) 2020 Henrik tom Wörden
 #
 # This program is free software: you can redistribute it and/or modify
 # it under the terms of the GNU Affero General Public License as
@@ -62,46 +63,176 @@ def get_entity(name):
 
 
 class AbstractCFood(object):
+
+    def __init__(self):
+        """ Abstract base class for Crawler food (CFood)."""
+        self.to_be_updated = db.Container()
+        self.identifiables = db.Container()
+
+    def create_identifiables(self):
+        """
+        should fill the instance variable self.identifiables (a Container)
+        """
+        raise NotImplementedError()
+
+    def update_identifiables(self):
+        """ Changes the identifiables as needed and adds changed identifiables
+        to self.to_be_updated
+        """
+        raise NotImplementedError()
+
+    def push_identifiables_to_CaosDB(self):
+        """ Updates the self.to_be_updated Container, i.e. pushes the changes
+        to CaosDB
+        """
+
+        if len(self.to_be_updated) == 0:
+            return
+
+        get_ids_for_entities_with_names(self.to_be_updated)
+
+        # remove duplicates
+        tmp = db.Container()
+
+        for el in self.to_be_updated:
+            if el not in tmp:
+                tmp.append(el)
+
+        self.to_be_updated = tmp
+
+        logger.info("UPDATE: updating the following entities")
+
+        for el in self.to_be_updated:
+            logger.info("\t" + el.name if el.name is not None else el.id)
+
+        logger.debug(self.to_be_updated)
+        guard.safe_update(self.to_be_updated)
+
+    @staticmethod
+    # move to api?
+    def set_parents(entity, names):
+        entity.parents.clear()
+
+        for n in names:
+            entity.add_parent(get_entity(n))
+
+    @staticmethod
+    # move to api?
+    def remove_property(entity, prop):
+        # TODO only do something when it is necessary?
+
+        if isinstance(prop, db.Entity):
+            name = prop.name
+        else:
+            name = prop
+
+        while entity.get_property(name) is not None:
+            entity.remove_property(name)
+
+    @staticmethod
+    # move to api?
+    def set_property(entity, prop, value, datatype=None):
+        AbstractCFood.remove_property(entity, prop)
+
+        if datatype is not None:
+            entity.add_property(prop, value, datatype=datatype)
+        else:
+            entity.add_property(prop, value)
+
+
+class CMeal(object):
+    """
+    CMeal groups equivalent Files and allows their collected insertion.
+
+    Sometimes there is no one file that can be used to trigger the creation of
+    some Record. E.g. if a collection of images shall be referenced from one
+    Record that groups them, it is unclear which image should trigger the
+    creation of the Record.
+
+    CMeals are grouped based on the groups in the used regular expression. If,
+    in the above example, all the images reside in one folder, all groups
+    except that for the file name should match. The groups that shall match
+    need to be listed in the matching_groups class property. Subclasses will
+    overwrite this property.
+
+    The cook function of a cfood allows this class to work. Instead of directly
+    instantiating a CFood the cook function is used. If the CFood is also a
+    child of CMeal, it will be checked (using get_suitable_cfood) in the cook
+    function whether a new CFood should be created or if the file match should
+    be added to an existing one. In order to allow this, all instances of a
+    CFood class are tracked in the existing_instances class member.
+    """
+    existing_instances = []
+    matching_groups = []
+
+    def __init__(self, *args, **kwargs):
+        self.__class__.existing_instances.append(self)
+        self.crawled_files = []
+
+    def add(self, crawled_file):
+        self.crawled_files.append(crawled_file)
+
+    @classmethod
+    def get_suitable_cfood(cls, match):
+        for cfood in cls.existing_instances:
+            suitable = True
+
+            for group in cls.matching_groups:
+                if (group not in match.groupdict() or
+                        group not in cfood.match.groupdict() or
+                        match.group(group) != cfood.match.group(group)):
+                    suitable = False
+
+            if suitable:
+                return cfood
+
+        return None
+
+
+def get_entity_for_path(path):
+    try:
+        q = "FIND FILE WHICH IS STORED AT '{}'".format(path)
+
+        return db.execute_query(q, unique=True)
+    except EntityDoesNotExistError:
+        path_prefix = "**"
+
+        if not path.startswith("/"):
+            path_prefix = path_prefix + "/"
+        q = "FIND FILE WHICH IS STORED AT '{}{}'".format(path_prefix, path)
+        logger.debug(q)
+
+        return db.execute_query(q, unique=True)
+
+
+class AbstractFileCFood(AbstractCFood):
     # contains the compiled regular expression after the first execution of the
     # function match()
     _pattern = None
 
-    def __init__(self, crawled_file, access=lambda x: x):
-        """ Abstract base class for Crawler food (CFood).
+    def __init__(self, crawled_path, access=lambda x: x):
+        """ Abstract base class for file based Crawler food (CFood).
 
         Parameters
         ----------
-        crawled_file : The file that the crawler is currently matching. Its
+        crawled_path : The file that the crawler is currently matching. Its
                        path should match against the pattern of this class
 
         access : callable, optional
                  A function that takes a CaosDB path and returns a local path
         """
+        super().__init__()
         self.access = access
         self._crawled_file = None
-        self.crawled_path = crawled_file
-        self.match = type(self).match(crawled_file)
-        self.to_be_updated = db.Container()
-        self.identifiables = db.Container()
+        self.crawled_path = crawled_path
+        self.match = type(self).match_file(crawled_path)
         self.attached_ones = []
         self.attached_filenames = []
 
     @property
     def crawled_file(self):
         if self._crawled_file is None:
-            try:
-                q = "FIND FILE WHICH IS STORED AT '{}'".format(
-                    self.crawled_path)
-                self._crawled_file = db.execute_query(q, unique=True)
-            except EntityDoesNotExistError:
-                path = "**"
-
-                if not self.crawled_path.startswith("/"):
-                    path = path + "/"
-                q = "FIND FILE WHICH IS STORED AT '{}{}'".format(path,
-                                                                 self.crawled_path)
-                logger.debug(q)
-                self._crawled_file = db.execute_query(q, unique=True)
+            self._crawled_file = get_entity_for_path(self.crawled_path)
 
         return self._crawled_file
 
@@ -146,7 +277,7 @@ class AbstractCFood(object):
         return cls(crawled_file, **kwargs)
 
     @classmethod
-    def match(cls, string):
+    def match_file(cls, string):
         """ Matches the regular expression of this class against file names
 
         Parameters
@@ -164,48 +295,12 @@ class AbstractCFood(object):
 
         return re.match(cls.get_re(), string)
 
-    def create_identifiables(self):
-        """
-        should set the instance variable Container with the identifiables
-        """
-        raise NotImplementedError()
-
-    def update_identifiables(self):
-        """ Changes the identifiables as needed and adds changed identifiables
-        to self.to_be_updated
-        """
-        raise NotImplementedError()
-
-    def push_identifiables_to_CaosDB(self):
-        """ Updates the self.to_be_updated Container, i.e. pushes the changes
-        to CaosDB
-        """
-
-        if len(self.to_be_updated) == 0:
-            return
-
-        get_ids_for_entities_with_names(self.to_be_updated)
-
-        # remove duplicates
-        tmp = db.Container()
-
-        for el in self.to_be_updated:
-            if el not in tmp:
-                tmp.append(el)
-
-        self.to_be_updated = tmp
-
-        logger.info("UPDATE: updating the following entities")
-
-        for el in self.to_be_updated:
-            logger.info("\t" + el.name if el.name is not None else el.id)
-
-        logger.debug(self.to_be_updated)
-        guard.safe_update(self.to_be_updated)
-
     def attach(self, crawled_file):
         self.attached_ones.append(crawled_file)
 
+    # TODO `looking_for` should `attach` the files itself. This would allow
+    # grouping them right away and make it unnecessary to check matches later
+    # again.
     def looking_for(self, crawled_file):
         """
         returns True if crawled_file can be added to this CFood.
@@ -218,6 +313,8 @@ class AbstractCFood(object):
         This function can be used to define what files shall be 'attached'.
         """
 
+        # TODO rename to filenames_to_be_attached
+
         if crawled_file in self.attached_filenames:
             return True
 
@@ -280,7 +377,6 @@ def assure_object_is_in_list(obj, containing_object, property_name,
 
     if not isinstance(containing_object.get_property(property_name).value, list):
         containing_object.get_property(property_name).value = [containing_object.get_property(property_name).value]
-        containing_object.get_property(property_name).value
         containing_object.get_property(property_name).datatype = datatype
     current_list = containing_object.get_property(property_name).value
 
@@ -418,6 +514,9 @@ def assure_has_property(entity, name, value, to_be_updated=None,
                            name.lower()]
     contained = False
 
+    if isinstance(value, db.Entity):
+        value = value.id
+
     for el in possible_properties:
         if el.value == value:
             contained = True
@@ -464,50 +563,28 @@ def get_ids_for_entities_with_names(entities):
             insert_id_based_on_name(ent)
 
 
-class CMeal(object):
-    """
-    CMeal groups equivalent CFoods and allow their collected insertion.
-
-    Sometimes there is no one file that can be used to trigger the creation of
-    some Record. E.g. if a collection of images shall be referenced from one
-    Record that groups them, it is unclear which image should trigger the
-    creation of the Record.
-
-    CMeals are grouped based on the groups in the used regular expression. If,
-    in the above example, all the images reside in one folder, all groups
-    except that for the file name should match. The groups that shall match
-    need to be listed in the matching_groups class property. Subclasses will
-    overwrite this property.
-
-    The cook function of a cfood allows this class to work. Instead of directly
-    instantiating a CFood the cook function is used. If the CFood is also a
-    child of CMeal, it will be checked (using get_suitable_cfood) in the cook
-    function whether a new CFood should be created or if the file match should
-    be added to an existing one. In order to allow this all instances of a
-    CFood class are tracked in the existing_instances class member.
-    """
-    existing_instances = []
-    matching_groups = []
-
-    def __init__(self, *args, **kwargs):
-        self.existing_instances.append(self)
-        self.crawled_files = []
-
-    def add(self, crawled_file):
-        self.crawled_files.append(crawled_file)
+class RowCFood(AbstractCFood):
+    def __init__(self, row, unique_cols, recordtype):
+        """
+        row : pandas Series (one row of a table)
+        """
+        super().__init__()
+        self.row = row
+        self.unique_cols = unique_cols
+        self.recordtype = recordtype
 
-    @classmethod
-    def get_suitable_cfood(cls, match):
-        for cfood in cls.existing_instances:
-            suitable = True
+    def create_identifiables(self):
+        rec = db.Record()
+        rec.add_parent(self.recordtype)
 
-            for group in cls.matching_groups:
-                if (group not in match.groupdict() or
-                        group not in cfood.match.groupdict() or
-                        match.group(group) != cfood.match.group(group)):
-                    suitable = False
+        for col in self.unique_cols:
+            rec.add_property(col, self.row.loc[col])
+        self.identifiables.append(rec)
 
-            if suitable:
-                return cfood
+    def update_identifiables(self):
+        rec = self.identifiables[0]
 
-        return None
+        for key, value in self.row.iteritems():
+            if key in self.unique_cols:
+                continue
+            rec.add_property(key, value)
diff --git a/src/caosadvancedtools/converter/labfolder_api.py b/src/caosadvancedtools/converter/labfolder_api.py
index 82b2f4f4..567ee5a8 100644
--- a/src/caosadvancedtools/converter/labfolder_api.py
+++ b/src/caosadvancedtools/converter/labfolder_api.py
@@ -28,7 +28,6 @@ import time
 import html2text
 
 import caosdb as db
-import labfolder.connection
 from labfolder.connection import configure_connection
 
 
diff --git a/src/caosadvancedtools/crawler.py b/src/caosadvancedtools/crawler.py
index d581d759..db89551d 100644
--- a/src/caosadvancedtools/crawler.py
+++ b/src/caosadvancedtools/crawler.py
@@ -6,6 +6,7 @@
 #
 # Copyright (C) 2018 Research Group Biomedical Physics,
 # Max-Planck-Institute for Dynamics and Self-Organization Göttingen
+# Copyright (C) 2020 Henrik tom Wörden
 #
 # This program is free software: you can redistribute it and/or modify
 # it under the terms of the GNU Affero General Public License as
@@ -45,7 +46,8 @@ import caosdb as db
 from caosdb.exceptions import TransactionError
 
 from .cache import Cache
-from .guard import INSERT, RETRIEVE, UPDATE
+from .cfood import RowCFood
+from .guard import RETRIEVE
 from .guard import global_guard as guard
 
 logger = logging.getLogger(__name__)
@@ -80,11 +82,13 @@ class UnknownCache(object):
 
 
 class Crawler(object):
-    def __init__(self, food, access=lambda x: x, use_cache=False,
-                 abort_on_exception=True):
+    def __init__(self, food=None, access=lambda x: x, use_cache=False,
+                 abort_on_exception=True, interactive=True):
         """
         Parameters
         ----------
+        food : list of CFood classes, optional
+               The Crawler will use those CFoods when crawling.
         pattern : str
                   The regex pattern for matching against file names.
 
@@ -94,8 +98,17 @@ class Crawler(object):
 
         access : callable, optional
                  A function that takes a CaosDB path and returns a local path
+        interactive : boolean, optional
+                      If true, questions will be posed during execution of the
+                      crawl function.
+
         """
-        self.food = food
+
+        if food is None:
+            self.food = []
+        else:
+            self.food = food
+        self.interactive = interactive
         self.access = access
         self.report = db.Container()
         self.use_cache = use_cache
@@ -104,98 +117,54 @@ class Crawler(object):
         if self.use_cache:
             self.cache = Cache()
 
-    def match(self, files, interactive, hideKnown=False):
-        errors_occured = False
-        tbs = []
+    def collect_cfoods(self):
+        """
+        to be overwritten by subclasses.
+
+        should return cfoods, tbs and errors_occured.
+        # TODO do this via logging?
+        tbs text returned from traceback
+        errors_occured True if at least one error occurred
+        """
         cfoods = []
-        matches = {f: [] for f in files}
+        tbs = []
+        errors_occured = False
 
-        logger.info(separated("Matching files against CFoods"))
+        for food in self.food:
+            cfoods.append(food())
 
         for Cfood in self.food:
-            logger.debug("Matching against {}...".format(Cfood.__name__))
-
-            for crawled_file in files:
-                if Cfood.match(crawled_file) is not None:
-                    matches[crawled_file].append(Cfood.__name__)
-
-                    logger.debug("{} matched\n{}.".format(
-                            Cfood.__class__.__name__,
-                            crawled_file))
-                    try:
-                        cfood = Cfood.cook(crawled_file, access=self.access)
-
-                        if cfood is not None:
-                            cfoods.append(cfood)
-                    except Exception as e:
-                        traceback.print_exc()
-                        print(e)
-
-                        if self.abort_on_exception:
-                            raise e
-                        errors_occured = True
-                        tbs.append(e)
-
-        logger.info(separated("CFoods are collecting information..."))
-
-        for cfood in cfoods:
-            cfood.collect_information()
-
-        logger.info(separated("Trying to attach files to created CFoods"))
-
-        for cfood in cfoods:
-            logger.debug("Matching against {}...".format(Cfood.__name__))
-
-            for crawled_file in files:
-                if cfood.looking_for(crawled_file):
-                    logger.debug("{} matched\n{}.".format(
-                            Cfood.__class__.__name__,
-                            crawled_file))
-                    cfood.attach(crawled_file)
-                    matches[crawled_file].append(Cfood.__name__)
-
-        # possibly load previously encountered "Missing matches" and
-        # "Multiple matches"
-        ucache = UnknownCache(interactive=interactive, load=hideKnown)
+            try:
+                cfood = Cfood()
 
-        for crawled_file in files:
-            if len(matches[crawled_file]) == 0:
-                msg = ("ATTENTION: No matching cfood!\n"
-                       "Tried to match {}\n".format(crawled_file))
+                if cfood is not None:
+                    cfoods.append(cfood)
+            except Exception as e:
+                traceback.print_exc()
+                print(e)
 
-                if crawled_file in ucache.filenames:
-                    logger.debug(msg)
-                else:
-                    logger.warn(msg)
-                ucache.add(crawled_file)
+                if self.abort_on_exception:
+                    raise e
+                errors_occured = True
+                tbs.append(e)
 
-            if len(matches[crawled_file]) > 1:
-                msg = ("Attention: More than one matching cfood!\n"
-                       + "Tried to match {}\n".format(crawled_file)
-                       + "\tRecordTypes:\t" + ", ".join(
-                            matches[crawled_file])+"\n")
+        return cfoods, tbs, errors_occured
 
-                if crawled_file in ucache.filenames:
-                    logger.debug(msg)
-                else:
-                    logger.warn(msg)
-                ucache.add(crawled_file)
+    def cached_find_identifiables(self, identifiables):
+        if self.use_cache:
+            hashes = self.cache.update_ids_from_cache(identifiables)
 
-        # Save the encountered prblem matches
-        ucache.save()
+        self.find_or_insert_identifiables(identifiables)
 
-        return cfoods, matches, tbs, errors_occured
+        if self.use_cache:
+            self.cache.insert_list(hashes, identifiables)
 
-    def crawl(self, files, interactive=True, hideKnown=False,
-              security_level=RETRIEVE):
+    def crawl(self, security_level=RETRIEVE):
         guard.set_level(level=security_level)
 
-        files = sorted([f.path for f in files])
-
-        cfoods, matches, tbs, errors_occured = self.match(files, interactive,
-                                                          hideKnown=hideKnown)
+        cfoods, tbs, errors_occured = self.collect_cfoods()
 
-        if interactive and "y" != input("Do you want to continue? (y)"):
+        if self.interactive and "y" != input("Do you want to continue? (y)"):
             return
 
         logger.info(separated("Creating and updating Identifiables"))
@@ -204,14 +173,7 @@ class Crawler(object):
             try:
                 cfood.create_identifiables()
 
-                if self.use_cache:
-                    hashes = self.cache.update_ids_from_cache(
-                        cfood.identifiables)
-
-                self.find_or_insert_identifiables(cfood.identifiables)
-
-                if self.use_cache:
-                    self.cache.insert_list(hashes, cfood.identifiables)
+                self.cached_find_identifiables(cfood.identifiables)
 
                 cfood.update_identifiables()
                 cfood.push_identifiables_to_CaosDB()
@@ -225,8 +187,8 @@ class Crawler(object):
                 tbs.append(e)
 
         if errors_occured:
-            logger.warn("Crawler terminated with failures!")
-            logger.warn(tbs)
+            logger.warning("Crawler terminated with failures!")
+            logger.warning(tbs)
         else:
             logger.info("Crawler terminated successfully!")
 
@@ -322,6 +284,152 @@ class Crawler(object):
         return files
 
 
+class FileCrawler(Crawler):
+    def __init__(self, files, access=lambda x: x, hideKnown=False, **kwargs):
+        """Crawler that matches CFoods against a collection of files.
+
+        Parameters
+        ----------
+        files : files to be crawled; each needs a ``path`` attribute
+        access : callable, optional; maps a CaosDB path to a local path
+        hideKnown : bool, optional; if True, previously recorded match
+                 problems are logged at DEBUG instead of WARNING level
+        """
+        super().__init__(**kwargs)
+        self.files = files
+        self.access = access
+        self.hideKnown = hideKnown
+
+    def match(self):
+        """Match files against CFoods; return (cfoods, tbs, errors_occured)."""
+        files = sorted([f.path for f in self.files])  # deterministic order
+        errors_occured = False
+        tbs = []  # collected exceptions
+        cfoods = []
+        matches = {f: [] for f in files}  # file path -> matching CFood names
+
+        logger.info(separated("Matching files against CFoods"))
+
+        for Cfood in self.food:
+            logger.debug("Matching against {}...".format(Cfood.__name__))
+
+            for crawled_file in files:
+                if Cfood.match_file(crawled_file) is not None:
+                    matches[crawled_file].append(Cfood.__name__)
+
+                    logger.debug("{} matched\n{}.".format(
+                            Cfood.__name__,
+                            crawled_file))
+                    try:
+                        cfood = Cfood.cook(crawled_file, access=self.access)
+
+                        if cfood is not None:  # cook may decline with None
+                            cfoods.append(cfood)
+                    except Exception as e:
+                        traceback.print_exc()
+                        print(e)
+
+                        if self.abort_on_exception:
+                            raise e
+                        errors_occured = True  # best effort: record, go on
+                        tbs.append(e)
+
+        logger.info(separated("CFoods are collecting information..."))
+
+        for cfood in cfoods:
+            cfood.collect_information()
+
+        logger.info(separated("Trying to attach files to created CFoods"))
+
+        for cfood in cfoods:
+            logger.debug("Matching against {}...".format(cfood.__class__.__name__))
+
+            for crawled_file in files:
+                if cfood.looking_for(crawled_file):
+                    logger.debug("{} matched\n{}.".format(
+                            cfood.__class__.__name__,
+                            crawled_file))
+                    cfood.attach(crawled_file)
+                    matches[crawled_file].append(cfood.__class__.__name__)
+
+        # possibly load previously encountered "Missing matches" and
+        # "Multiple matches"
+        ucache = UnknownCache(interactive=self.interactive, load=self.hideKnown)
+
+        for crawled_file in files:
+            if len(matches[crawled_file]) == 0:
+                msg = ("ATTENTION: No matching cfood!\n"
+                       "Tried to match {}\n".format(crawled_file))
+
+                if crawled_file in ucache.filenames:
+                    logger.debug(msg)  # already-known problem: demote
+                else:
+                    logger.warning(msg)
+                ucache.add(crawled_file)
+
+            if len(matches[crawled_file]) > 1:
+                msg = ("Attention: More than one matching cfood!\n"
+                       + "Tried to match {}\n".format(crawled_file)
+                       + "\tRecordTypes:\t" + ", ".join(
+                            matches[crawled_file])+"\n")
+
+                if crawled_file in ucache.filenames:
+                    logger.debug(msg)  # already-known problem: demote
+                else:
+                    logger.warning(msg)
+                ucache.add(crawled_file)
+
+        # Save the encountered problem matches
+        ucache.save()
+
+        return cfoods, tbs, errors_occured
+
+    def collect_cfoods(self):
+        """Entry point used by Crawler.crawl(); delegates to match()."""
+        cfoods, tbs, errors_occured = self.match()
+
+        return cfoods, tbs, errors_occured
+
+
+class TableCrawler(Crawler):
+
+    def __init__(self, table, unique_cols, recordtype, **kwargs):
+        """Crawler that creates one RowCFood per row of a table.
+
+        Parameters
+        ----------
+        table : pandas DataFrame whose rows become Records
+        unique_cols : the columns that provide the properties for the
+                      identifiable
+        recordtype : Record Type of the Records to be created
+        """
+        super().__init__(**kwargs)
+        self.table = table
+        self.unique_cols = unique_cols
+        self.recordtype = recordtype
+
+    def collect_cfoods(self):  # entry point used by Crawler.crawl()
+        cfoods = []
+        tbs = []  # collected exceptions
+        errors_occured = False
+
+        for _, row in self.table.iterrows():
+            try:
+                cfood = RowCFood(row, self.unique_cols, self.recordtype)
+
+                if cfood is not None:  # NOTE(review): vestigial; ctor is never None
+                    cfoods.append(cfood)
+            except Exception as e:
+                traceback.print_exc()
+                print(e)
+
+                if self.abort_on_exception:
+                    raise e
+                errors_occured = True  # best effort: record and continue
+                tbs.append(e)
+
+        return cfoods, tbs, errors_occured
+
+
 def get_value(prop):
     """ Returns the value of a Property
 
diff --git a/src/caosadvancedtools/example_cfood.py b/src/caosadvancedtools/example_cfood.py
index 07456df0..6111d95d 100644
--- a/src/caosadvancedtools/example_cfood.py
+++ b/src/caosadvancedtools/example_cfood.py
@@ -22,10 +22,10 @@
 
 import caosdb as db
 
-from .cfood import AbstractCFood, assure_has_property
+from .cfood import AbstractFileCFood, assure_has_property
 
 
-class ExampleCFood(AbstractCFood):
+class ExampleCFood(AbstractFileCFood):
     @staticmethod
     def get_re():
         return (r".*/(?P<species>[^/]+)/"
diff --git a/unittests/test_cfood.py b/unittests/test_cfood.py
index 9122856d..dcea2cbd 100644
--- a/unittests/test_cfood.py
+++ b/unittests/test_cfood.py
@@ -24,14 +24,15 @@
 import unittest
 
 import caosdb as db
-from caosadvancedtools.cfood import (AbstractCFood, assure_has_parent,
+
+from caosadvancedtools.cfood import (AbstractFileCFood, assure_has_parent,
                                      assure_object_is_in_list)
 from caosadvancedtools.example_cfood import ExampleCFood
 
 PATTERN = "h.*"
 
 
-class TestCFood(AbstractCFood):
+class TestCFood(AbstractFileCFood):
 
     @staticmethod
     def get_re():
@@ -42,11 +43,11 @@ class CFoodReTest(unittest.TestCase):
     def test(self):
         self.assertEquals(TestCFood.get_re(), PATTERN)
         self.assertEqual(TestCFood._pattern, None)
-        self.assertIsNotNone(TestCFood.match("hallo"))
+        self.assertIsNotNone(TestCFood.match_file("hallo"))
         # TODO the caching is of compiled re is disabled currently
         # self.assertIsNotNone(TestCFood._pattern)
-        self.assertIsNotNone(TestCFood.match("hallo"))
-        self.assertIsNone(TestCFood.match("allo"))
+        self.assertIsNotNone(TestCFood.match_file("hallo"))
+        self.assertIsNone(TestCFood.match_file("allo"))
 
 
 class InsertionTest(unittest.TestCase):
@@ -77,7 +78,7 @@ class InsertionTest(unittest.TestCase):
 class ExampleTest(unittest.TestCase):
     def test(self):
         path = "/data/rabbit/2019-03-03/README.md"
-        cf = ExampleCFood(crawled_file=path)
-        self.assertIsNotNone(ExampleCFood.match(path))
+        cf = ExampleCFood(crawled_path=path)
+        self.assertIsNotNone(ExampleCFood.match_file(path))
         self.assertEqual(cf.match.group('species'), 'rabbit')
         self.assertEqual(cf.match.group('date'), '2019-03-03')
-- 
GitLab