From 334b60297041e866350702c78960720e489c87c7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Henrik=20tom=20W=C3=B6rden?= <h.tomwoerden@indiscale.com>
Date: Tue, 5 Jan 2021 16:14:23 +0000
Subject: [PATCH] ENH: Include Scifolder

---
 .docker/Dockerfile                            |   9 -
 .gitlab-ci.yml                                |   1 +
 CHANGELOG.md                                  |   1 +
 README_SETUP.md                               |   5 -
 integrationtests/crawl.py                     |   5 +-
 integrationtests/insert_model.py              |   2 +-
 .../test_crawl_with_datamodel_problems.py     |   6 +-
 src/caosadvancedtools/import_from_xml.py      |   2 +-
 src/caosadvancedtools/scifolder/__init__.py   |   5 +
 .../scifolder/analysis_cfood.py               | 129 +++++++++
 .../scifolder/experiment_cfood.py             | 105 +++++++
 .../scifolder/generic_pattern.py              |  35 +++
 .../scifolder/publication_cfood.py            | 112 ++++++++
 .../scifolder/simulation_cfood.py             | 109 +++++++
 .../scifolder/software_cfood.py               | 115 ++++++++
 src/caosadvancedtools/scifolder/utils.py      | 204 +++++++++++++
 src/caosadvancedtools/scifolder/withreadme.py | 270 ++++++++++++++++++
 src/doc/crawler.rst                           |   5 +-
 .../2019-02-03_something/README.md            |  15 +
 .../2019-02-03_something/README.md            |   9 +
 .../Posters/2019-02-03_something/README.md    |  11 +
 unittests/data/README.md                      |  14 +
 unittests/data/README.xlsx                    | Bin 0 -> 5338 bytes
 .../2019-02-03_something/README.md            |  12 +
 unittests/test_cfoods.py                      |  54 ++++
 unittests/test_scifolder_utils.py             |  67 +++++
 26 files changed, 1278 insertions(+), 24 deletions(-)
 create mode 100644 src/caosadvancedtools/scifolder/__init__.py
 create mode 100644 src/caosadvancedtools/scifolder/analysis_cfood.py
 create mode 100644 src/caosadvancedtools/scifolder/experiment_cfood.py
 create mode 100644 src/caosadvancedtools/scifolder/generic_pattern.py
 create mode 100644 src/caosadvancedtools/scifolder/publication_cfood.py
 create mode 100644 src/caosadvancedtools/scifolder/simulation_cfood.py
 create mode 100644 src/caosadvancedtools/scifolder/software_cfood.py
 create mode 100644 src/caosadvancedtools/scifolder/utils.py
 create mode 100644 src/caosadvancedtools/scifolder/withreadme.py
 create mode 100644 unittests/data/DataAnalysis/2010_TestProject/2019-02-03_something/README.md
 create mode 100644 unittests/data/ExperimentalData/2010_TestProject/2019-02-03_something/README.md
 create mode 100644 unittests/data/Publications/Posters/2019-02-03_something/README.md
 create mode 100644 unittests/data/README.md
 create mode 100644 unittests/data/README.xlsx
 create mode 100644 unittests/data/SimulationData/2010_TestProject/2019-02-03_something/README.md
 create mode 100644 unittests/test_cfoods.py
 create mode 100644 unittests/test_scifolder_utils.py

diff --git a/.docker/Dockerfile b/.docker/Dockerfile
index ca59395a..d5d2fe66 100644
--- a/.docker/Dockerfile
+++ b/.docker/Dockerfile
@@ -20,15 +20,6 @@ ADD https://gitlab.com/api/v4/projects/13656973/repository/branches/dev \
    pylib_version.json
 RUN git clone https://gitlab.com/caosdb/caosdb-pylib.git && \
    cd caosdb-pylib && git checkout dev && pip3 install .
-ADD https://gitlab.com/api/v4/projects/13656965/repository/branches/master \
-   model_version.json
-RUN git clone https://gitlab.com/caosdb/caosdb-models.git && \
-   cd caosdb-models && pip3 install .
-ADD https://gitlab.com/api/v4/projects/13601752/repository/branches/master \
-   scifolder_version.json
-RUN git clone \
-    https://gitlab.com/henrik_indiscale/scifolder.git && \
-    cd scifolder && pip3 install .
 COPY . /git
 RUN rm -r /git/.git \
     && mv /git/.docker/pycaosdb.ini /git/integrationtests
diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 9f746e47..9b573a53 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -78,6 +78,7 @@ build-testenv:
   stage: setup
   only:
       - schedules
+      - web
   script: 
       - df -h
       - docker login -u gitlab-ci-token -p $CI_JOB_TOKEN $CI_REGISTRY
diff --git a/CHANGELOG.md b/CHANGELOG.md
index cfebbbcf..106d703c 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -8,6 +8,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ### Added ###
 
+- included the scifolder module
 - included the caosmodels module
 * `send_mail` function in `caosadvancedtools.serverside.helper` module
 - New class to collect possible problems with the data model
diff --git a/README_SETUP.md b/README_SETUP.md
index b9db16a9..243fba2d 100644
--- a/README_SETUP.md
+++ b/README_SETUP.md
@@ -15,17 +15,12 @@ Dependencies will be installed automatically if you use the below described proc
 
 For testing:
 - `tox` 
-- `scifolder`from https://gitlab.com/henrik_indiscale/scifolder
 
 
 ## Installation
 - `pip install . --user`
 - `pip install tox --user`
 
-In order to run the tests you need to install the [scifolder
-package](https://gitlab.com/henrik_indiscale/scifolder) by Henrik tom
-Wörden.
-
 ## Run Unit Tests
 `tox`
 
diff --git a/integrationtests/crawl.py b/integrationtests/crawl.py
index e4bf311e..bf72b5f7 100755
--- a/integrationtests/crawl.py
+++ b/integrationtests/crawl.py
@@ -32,8 +32,9 @@ import caosdb as db
 from caosadvancedtools.cfood import fileguide
 from caosadvancedtools.crawler import FileCrawler
 from caosadvancedtools.guard import INSERT, UPDATE
-from scifolder import (AnalysisCFood, ExperimentCFood, PublicationCFood,
-                       SimulationCFood, SoftwareCFood)
+from caosadvancedtools.scifolder import (AnalysisCFood, ExperimentCFood,
+                                         PublicationCFood, SimulationCFood,
+                                         SoftwareCFood)
 
 try:
     from sss_helper import get_argument_parser, print_success
diff --git a/integrationtests/insert_model.py b/integrationtests/insert_model.py
index 2289f72e..270a08a3 100755
--- a/integrationtests/insert_model.py
+++ b/integrationtests/insert_model.py
@@ -1,6 +1,6 @@
 #!/usr/bin/env python3
 import caosdb as db
-from caosmodels.parser import parse_model_from_yaml
+from caosadvancedtools.models.parser import parse_model_from_yaml
 
 model = parse_model_from_yaml("model.yml")
 model.sync_data_model(noquestion=True)
diff --git a/integrationtests/test_crawl_with_datamodel_problems.py b/integrationtests/test_crawl_with_datamodel_problems.py
index 3089bf4c..6c212e36 100644
--- a/integrationtests/test_crawl_with_datamodel_problems.py
+++ b/integrationtests/test_crawl_with_datamodel_problems.py
@@ -30,9 +30,9 @@ from caosadvancedtools.cfood import fileguide
 from caosadvancedtools.crawler import FileCrawler
 from caosadvancedtools.datamodel_problems import DataModelProblems
 from caosadvancedtools.guard import INSERT
-from caosmodels.parser import parse_model_from_yaml
-from scifolder import (AnalysisCFood, ExperimentCFood, PublicationCFood,
-                       SimulationCFood)
+from caosadvancedtools.models.parser import parse_model_from_yaml
+from caosadvancedtools.scifolder import (AnalysisCFood, ExperimentCFood,
+                                         PublicationCFood, SimulationCFood)
 
 
 def setup_module():
diff --git a/src/caosadvancedtools/import_from_xml.py b/src/caosadvancedtools/import_from_xml.py
index 9942a9a9..0bf9b1c0 100755
--- a/src/caosadvancedtools/import_from_xml.py
+++ b/src/caosadvancedtools/import_from_xml.py
@@ -33,7 +33,7 @@ from tempfile import NamedTemporaryFile
 
 import caosdb as db
 from caosdb.apiutils import apply_to_ids
-from caosmodels.data_model import DataModel
+from caosadvancedtools.models.data_model import DataModel
 
 
 def create_dummy_file(text="Please ask the administrator for this file."):
diff --git a/src/caosadvancedtools/scifolder/__init__.py b/src/caosadvancedtools/scifolder/__init__.py
new file mode 100644
index 00000000..d7d67937
--- /dev/null
+++ b/src/caosadvancedtools/scifolder/__init__.py
@@ -0,0 +1,5 @@
+from .analysis_cfood import AnalysisCFood
+from .experiment_cfood import ExperimentCFood
+from .publication_cfood import PublicationCFood
+from .simulation_cfood import SimulationCFood
+from .software_cfood import SoftwareCFood
diff --git a/src/caosadvancedtools/scifolder/analysis_cfood.py b/src/caosadvancedtools/scifolder/analysis_cfood.py
new file mode 100644
index 00000000..27cb871a
--- /dev/null
+++ b/src/caosadvancedtools/scifolder/analysis_cfood.py
@@ -0,0 +1,129 @@
+#!/usr/bin/env python
+# encoding: utf-8
+#
+# Copyright (C) 2019 Henrik tom Wörden
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <https://www.gnu.org/licenses/>.
+
+import os
+from itertools import chain
+
+import caosdb as db
+from caosadvancedtools.cfood import (AbstractFileCFood, assure_has_parent,
+                                     assure_has_property,
+                                     assure_object_is_in_list, get_entity)
+from caosadvancedtools.read_md_header import get_header
+
+from .generic_pattern import full_pattern
+from .utils import (get_files_referenced_by_field, parse_responsibles,
+                    reference_records_corresponding_to_files)
+from .withreadme import DATAMODEL as dm
+from .withreadme import (RESULTS, REVISIONOF, SCRIPTS, SOURCES, WithREADME,
+                         get_glob)
+
+
+class AnalysisCFood(AbstractFileCFood, WithREADME):
+    _prefix = ".*/DataAnalysis/"
+
+    # win_paths can be used to define fields that contain Windows-style
+    # paths instead of the default Unix-style ones. Possible fields are:
+    # ["results", "sources", "scripts", "revisionOf"]
+    win_paths = []
+
+    def __init__(self,  *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        WithREADME.__init__(self)
+
+    def collect_information(self):
+        self.find_referenced_files([RESULTS, SOURCES, SCRIPTS])
+
+    @staticmethod
+    def name_beautifier(name):
+        """ a function that can be used to rename the project. I.e. if
+        the project in CaosDB shall be named differently than in the folder
+        structure.
+        Use discouraged.
+        """
+
+        return name
+
+    @staticmethod
+    def get_re():
+        return AnalysisCFood._prefix + full_pattern
+
+    def create_identifiables(self):
+        # create the project identifiable
+        name = AnalysisCFood.name_beautifier(
+            self.match.group("project_identifier"))
+        self.project = db.Record(name=name)
+        self.project.add_parent(name=dm.Project)
+        self.identifiables.append(self.project)
+
+        # create the Analysis identifiable
+        self.analysis = db.Record()
+        self.analysis.add_parent(name=dm.Analysis)
+        self.analysis.add_property(name=dm.date, value=self.match.group("date"))
+
+        self.analysis.add_property(name=dm.Project, value=self.project)
+        self.identifiables.append(self.analysis)
+
+        if self.match.group("suffix") is not None:
+            self.analysis.add_property(name=dm.identifier,
+                                       value=self.match.group("suffix"))
+        else:
+            # TODO empty string causes an error in search
+            self.analysis.add_property(name=dm.identifier,
+                                       value="empty_identifier")
+
+        # parse people and add them to identifiables
+        # TODO People are currently 'identifiable' due to their first and
+        # last names. There will be conflicts
+        self.people = parse_responsibles(self.header)
+        self.identifiables.extend(self.people)
+
+    def update_identifiables(self):
+        assure_has_property(self.analysis, "description",
+                            self.header["description"][0],
+                            to_be_updated=self.to_be_updated)
+        assure_object_is_in_list(obj=self.people,
+                                 containing_object=self.analysis,
+                                 property_name=dm.responsible,
+                                 to_be_updated=self.to_be_updated,
+                                 datatype=db.LIST(db.REFERENCE)
+                                 )
+        self.reference_included_records(self.analysis,
+                                        [RESULTS, SOURCES, SCRIPTS],
+                                        to_be_updated=self.to_be_updated
+                                        )
+
+        if SOURCES.key in self.header:
+            reference_records_corresponding_to_files(
+                    record=self.analysis,
+                    recordtypes=[dm.Experiment, dm.Publication, dm.Simulation,
+                                 dm.Analysis],
+                    globs=get_glob(self.header[SOURCES.key]),
+                    property_name=dm.sources,
+                    path=self.crawled_path,
+                    to_be_updated=self.to_be_updated)
+
+        self.reference_files_from_header(record=self.analysis)
+
+        if REVISIONOF.key in self.header:
+            reference_records_corresponding_to_files(
+                record=self.analysis,
+                recordtypes=[dm.Analysis],
+                property_name=dm.revisionOf,
+                globs=get_glob(self.header[REVISIONOF.key]),
+                path=self.crawled_path,
+                to_be_updated=self.to_be_updated)
diff --git a/src/caosadvancedtools/scifolder/experiment_cfood.py b/src/caosadvancedtools/scifolder/experiment_cfood.py
new file mode 100644
index 00000000..0eccd18d
--- /dev/null
+++ b/src/caosadvancedtools/scifolder/experiment_cfood.py
@@ -0,0 +1,105 @@
+#!/usr/bin/env python
+# encoding: utf-8
+#
+# Copyright (C) 2019 Henrik tom Wörden
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <https://www.gnu.org/licenses/>.
+
+import caosdb as db
+from caosadvancedtools.cfood import (AbstractFileCFood, assure_has_description,
+                                     assure_has_parent, assure_has_property,
+                                     assure_object_is_in_list, get_entity)
+from caosadvancedtools.read_md_header import get_header
+
+from .generic_pattern import full_pattern
+from .utils import parse_responsibles, reference_records_corresponding_to_files
+from .withreadme import DATAMODEL as dm
+from .withreadme import RESULTS, REVISIONOF, SCRIPTS, WithREADME, get_glob
+
+
+class ExperimentCFood(AbstractFileCFood, WithREADME):
+
+    # win_paths can be used to define fields that contain Windows-style
+    # paths instead of the default Unix-style ones. Possible fields are:
+    # ["results", "revisionOf"]
+    win_paths = []
+
+    @staticmethod
+    def name_beautifier(x): return x
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        WithREADME.__init__(self)
+        # a plain dict; a trailing comma here would wrap it in a 1-tuple
+        self.name_map = {}
+
+    @staticmethod
+    def get_re():
+        return ".*/ExperimentalData/"+full_pattern
+
+    def collect_information(self):
+        self.find_referenced_files([RESULTS])
+
+    @staticmethod
+    def create_identifiable_experiment(match):
+        # create the project identifiable
+        name = ExperimentCFood.name_beautifier(
+            match.group("project_identifier"))
+        project = db.Record(name=name)
+        project.add_parent(name=dm.Project)
+
+        experiment = db.Record()
+        experiment.add_parent(name=dm.Experiment)
+        experiment.add_property(
+            name=dm.date, value=match.group("date"))
+        experiment.add_property(name=dm.Project, value=project)
+
+        if match.group("suffix") is None:
+            experiment.add_property(
+                name="identifier", value="empty_identifier")
+        else:
+            experiment.add_property(name="identifier",
+                                    value=match.group("suffix"))
+
+        return [experiment, project]
+
+    def create_identifiables(self):
+        self.experiment, self.project = (
+            ExperimentCFood.create_identifiable_experiment(self.match))
+
+        self.identifiables.extend([self.experiment, self.project])
+        self.people = parse_responsibles(self.header)
+        self.identifiables.extend(self.people)
+
+    def update_identifiables(self):
+        """Set description, responsibles and references on the experiment."""
+        assure_has_property(self.experiment, "description",
+                            self.header["description"][0],
+                            to_be_updated=self.to_be_updated)
+
+        # set responsible people
+        assure_object_is_in_list(self.people, self.experiment, dm.responsible,
+                                 to_be_updated=self.to_be_updated,
+                                 datatype=db.LIST(db.REFERENCE))
+
+        self.reference_files_from_header(record=self.experiment)
+
+        if REVISIONOF.key in self.header:
+            reference_records_corresponding_to_files(
+                record=self.experiment,
+                recordtypes=[dm.Experiment],
+                globs=get_glob(self.header[REVISIONOF.key]),
+                path=self.crawled_path,
+                property_name=dm.revisionOf,
+                to_be_updated=self.to_be_updated)
diff --git a/src/caosadvancedtools/scifolder/generic_pattern.py b/src/caosadvancedtools/scifolder/generic_pattern.py
new file mode 100644
index 00000000..0b5a4df2
--- /dev/null
+++ b/src/caosadvancedtools/scifolder/generic_pattern.py
@@ -0,0 +1,35 @@
+#!/usr/bin/env python
+# encoding: utf-8
+#
+# Copyright (C) 2020 IndiScale GmbH <info@indiscale.com>
+# Copyright (C) 2020 Henrik tom Wörden <h.tomwoerden@indiscale.com>
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <https://www.gnu.org/licenses/>.
+
+""" this module contains regular expressions needed for the standard file
+structure """
+
+
+project_pattern = (r"(?P<project_identifier>"
+                   r"(?P<project_year>\d{4})_?(?P<project_name>((?!/).)*))/")
+date_pattern = r"(?P<date>\d{2,4}[-_]\d{1,2}[-_]\d{1,2})"
+date_suffix_pattern = r"(_(?P<suffix>(((?!/).)*)))?/"
+readme_pattern = r"(readme.md|README.md|readme.xlsx|README.xlsx)$"
+
+full_pattern = (project_pattern + date_pattern + date_suffix_pattern
+                # TODO: Additional levels are not allowed according to the
+                # specification. This should be removed or enabled via a
+                # configuration
+                + "(.*)"
+                + readme_pattern)
diff --git a/src/caosadvancedtools/scifolder/publication_cfood.py b/src/caosadvancedtools/scifolder/publication_cfood.py
new file mode 100644
index 00000000..fc78e5b7
--- /dev/null
+++ b/src/caosadvancedtools/scifolder/publication_cfood.py
@@ -0,0 +1,112 @@
+#!/usr/bin/env python
+# encoding: utf-8
+#
+# Copyright (C) 2019 Henrik tom Wörden
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <https://www.gnu.org/licenses/>.
+
+import os
+from itertools import chain
+
+import caosdb as db
+from caosadvancedtools.cfood import (AbstractFileCFood,
+                                     assure_object_is_in_list, fileguide,
+                                     get_entity)
+from caosadvancedtools.read_md_header import get_header
+from caosadvancedtools.utils import find_records_that_reference_ids
+
+from .generic_pattern import date_suffix_pattern, readme_pattern
+from .utils import (get_files_referenced_by_field, parse_responsibles,
+                    reference_records_corresponding_to_files)
+from .withreadme import DATAMODEL as dm
+from .withreadme import (RESULTS, REVISIONOF, SCRIPTS, SOURCES, WithREADME,
+                         get_glob)
+
+
+def folder_to_type(name):
+    """Return the RecordType name for a publication folder name.
+
+    Raises a ValueError if `name` is not a known publication folder.
+    """
+    types = {"Theses": "Thesis", "Articles": "Article",
+             "Posters": "Poster", "Presentations": "Presentation",
+             "Reports": "Report"}
+
+    if name not in types:
+        raise ValueError("Unknown publication folder: {}".format(name))
+
+    return types[name]
+
+
+class PublicationCFood(AbstractFileCFood, WithREADME):
+    # win_paths can be used to define fields that contain Windows-style
+    # paths instead of the default Unix-style ones. Possible fields are:
+    # ["results", "sources", "scripts", "revisionOf"]
+    win_paths = []
+
+    def __init__(self,  *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        WithREADME.__init__(self)
+
+    def collect_information(self):
+        self.find_referenced_files([RESULTS, SOURCES, SCRIPTS])
+
+    @staticmethod
+    def get_re():
+        # matches <...>/Publications/<type>/<date>[_<suffix>]/<readme file>
+        _prefix = ".*/Publications/"
+        _type = r"(?P<type>Theses|Articles|Posters|Presentations|Reports)/"
+        _partial_date = r"(?P<date>\d{2,4}([-_]\d{1,2}[-_]\d{1,2})?)"
+
+        return _prefix+_type+_partial_date+date_suffix_pattern+readme_pattern
+
+    def create_identifiables(self):
+        header = get_header(fileguide.access(self.crawled_path))
+        # "suffix" is an optional group and is None for bare date folders
+        kind, date, suffix = self.match.group("type", "date", "suffix")
+        name = date if suffix is None else date + "_" + suffix
+        self.publication = db.Record(name=name)
+        self.publication.add_parent(name=folder_to_type(kind))
+        self.identifiables.append(self.publication)
+        self.people = parse_responsibles(header)
+        self.identifiables.extend(self.people)
+
+    def update_identifiables(self):
+        header = get_header(fileguide.access(self.crawled_path))
+        self.publication.description = header["description"][0]
+
+        assure_object_is_in_list(self.people, self.publication,
+                                 "responsible",
+                                 self.to_be_updated,
+                                 datatype=db.LIST(db.REFERENCE))
+
+        if SOURCES.key in self.header:
+            reference_records_corresponding_to_files(
+                    record=self.publication,
+                    recordtypes=[dm.Experiment, dm.Publication, dm.Simulation,
+                                 dm.Analysis],
+                    globs=get_glob(self.header[SOURCES.key]),
+                    property_name=dm.sources,
+                    path=self.crawled_path,
+                    to_be_updated=self.to_be_updated)
+        self.reference_files_from_header(record=self.publication)
+
+        if REVISIONOF.key in self.header:
+            reference_records_corresponding_to_files(
+                record=self.publication,
+                recordtypes=[dm.Publication],
+                property_name=dm.revisionOf,
+                globs=get_glob(self.header[REVISIONOF.key]),
+                path=self.crawled_path,
+                to_be_updated=self.to_be_updated)
diff --git a/src/caosadvancedtools/scifolder/simulation_cfood.py b/src/caosadvancedtools/scifolder/simulation_cfood.py
new file mode 100644
index 00000000..ae129e6a
--- /dev/null
+++ b/src/caosadvancedtools/scifolder/simulation_cfood.py
@@ -0,0 +1,109 @@
+#!/usr/bin/env python
+# encoding: utf-8
+#
+# Copyright (C) 2019 Henrik tom Wörden
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <https://www.gnu.org/licenses/>.
+
+import os
+from itertools import chain
+
+import caosdb as db
+from caosadvancedtools.cfood import (AbstractFileCFood, assure_has_parent,
+                                     assure_has_property,
+                                     assure_object_is_in_list, get_entity)
+from caosadvancedtools.read_md_header import get_header
+
+from .generic_pattern import full_pattern
+from .utils import (get_files_referenced_by_field, parse_responsibles,
+                    reference_records_corresponding_to_files)
+from .withreadme import DATAMODEL as dm
+from .withreadme import (RESULTS, REVISIONOF, SCRIPTS, SOURCES, WithREADME,
+                         get_glob)
+
+
+class SimulationCFood(AbstractFileCFood, WithREADME):
+    # win_paths can be used to define fields that contain Windows-style
+    # paths instead of the default Unix-style ones. Possible fields are:
+    # ["results", "sources", "scripts", "revisionOf"]
+    win_paths = []
+
+    def __init__(self,  *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        WithREADME.__init__(self)
+
+    def collect_information(self):
+        self.find_referenced_files([RESULTS, SOURCES, SCRIPTS])
+
+    @staticmethod
+    def get_re():
+        return ".*/SimulationData/" + full_pattern
+
+    def create_identifiables(self):
+        # create the project identifiable
+        self.project = db.Record(name=self.match.group("project_identifier"))
+        self.project.add_parent(name="Project")
+        self.identifiables.append(self.project)
+
+        self.simulation = db.Record()
+        # the Simulation record is identified via its date, its project and
+        # the identifier taken from the folder suffix
+        self.simulation.add_parent(name="Simulation")
+        self.simulation.add_property(
+            name="date", value=self.match.group("date"))
+
+        self.simulation.add_property(name="Project", value=self.project)
+
+        if self.match.group("suffix") is not None:
+            self.simulation.add_property(
+                name="identifier", value=self.match.group("suffix"))
+        else:
+            # TODO empty string causes an error in search
+            self.simulation.add_property(name="identifier",
+                                              value="empty_identifier")
+        self.identifiables.append(self.simulation)
+        self.people = parse_responsibles(self.header)
+        self.identifiables.extend(self.people)
+
+    def update_identifiables(self):
+        """Set description, responsibles and references on the simulation."""
+        assure_has_property(self.simulation, "description",
+                            self.header["description"][0],
+                            to_be_updated=self.to_be_updated)
+
+        # TODO why is db.LIST("Person") not possible here?
+        assure_object_is_in_list(self.people, self.simulation,
+                                 "responsible",
+                                 self.to_be_updated,
+                                 datatype=db.LIST(db.REFERENCE))
+
+        if SOURCES.key in self.header:
+            reference_records_corresponding_to_files(
+                    record=self.simulation,
+                    recordtypes=["Experiment", "Publication", "Simulation",
+                                 "Analysis"],
+                    globs=get_glob(self.header[SOURCES.key]),
+                    property_name=dm.sources,
+                    path=self.crawled_path,
+                    to_be_updated=self.to_be_updated)
+        self.reference_files_from_header(record=self.simulation)
+
+        if REVISIONOF.key in self.header:
+            reference_records_corresponding_to_files(
+                record=self.simulation,
+                recordtypes=[dm.Software],  # NOTE(review): dm.Simulation?
+                property_name=dm.revisionOf,
+                globs=get_glob(self.header[REVISIONOF.key]),
+                path=self.crawled_path,
+                to_be_updated=self.to_be_updated)
diff --git a/src/caosadvancedtools/scifolder/software_cfood.py b/src/caosadvancedtools/scifolder/software_cfood.py
new file mode 100644
index 00000000..77fb4652
--- /dev/null
+++ b/src/caosadvancedtools/scifolder/software_cfood.py
@@ -0,0 +1,115 @@
+#!/usr/bin/env python
+# encoding: utf-8
+#
+# Copyright (C) 2019 IndiScale GmbH <info@indiscale.com>
+# Copyright (C) 2019 Henrik tom Wörden <h.tomwoerden@indiscale.com>
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <https://www.gnu.org/licenses/>.
+
+import os
+from itertools import chain
+
+import caosdb as db
+from caosadvancedtools.cfood import (AbstractFileCFood, assure_has_parent,
+                                     assure_has_property, assure_name_is,
+                                     assure_object_is_in_list, get_entity)
+from caosadvancedtools.guard import global_guard as guard
+from caosadvancedtools.read_md_header import get_header
+
+from .generic_pattern import full_pattern
+from .utils import get_files_referenced_by_field, parse_responsibles
+from .withreadme import BINARIES
+from .withreadme import DATAMODEL as dm
+from .withreadme import SOURCECODE, WithREADME
+
+
+class SoftwareCFood(AbstractFileCFood, WithREADME):
+    _prefix = ".*/Software/"
+    # win_paths can be used to define fields that will contain windows style
+    # path instead of the default unix ones. Possible fields are:
+    # ["binaries", "sourceCode","revisionOf"]
+    win_paths = []
+
+    def __init__(self,  *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        WithREADME.__init__(self)
+
+    def collect_information(self):
+        self.find_referenced_files([BINARIES, SOURCECODE])
+
+    @staticmethod
+    def get_re():
+
+        return SoftwareCFood._prefix + full_pattern
+
+    def create_identifiables(self):
+        """Set up the Software RecordType, the version record and people."""
+        self.software = db.execute_query(
+            "FIND RecordType Software with name = {}".format(
+                self.match.group("project_identifier")))
+
+        if len(self.software) == 0:
+            # Software not found insert if allowed
+            self.software = db.RecordType(
+                name=self.match.group("project_identifier"))
+            self.software.add_parent(name="Software")
+            self.software.add_property(name="alias",
+                                       value=self.match.group("project_name"))
+            guard.safe_insert(self.software)
+        elif len(self.software) == 1:
+            self.software = self.software[0]
+        else:
+            raise RuntimeError("Cannot identify software record type. Multiple "
+                               "matches for {}".format(
+                                   self.match.group("project_identifier")))
+
+        # create the software version
+        # identifiable is made from parent and date and suffix
+        self.softwareversion = db.Record()
+        self.softwareversion.add_parent(self.software)
+        self.softwareversion.add_property("date", self.match.group("date"))
+
+        if self.match.group("suffix"):
+            self.softwareversion.add_property(
+                "version", self.match.group("suffix"))
+
+        self.identifiables.append(self.softwareversion)
+
+        # parse people and add them to identifiables
+        # TODO People are currently 'identifiable' with their first and last
+        # names. There will be conflicts
+        self.people = parse_responsibles(self.header)
+        self.identifiables.extend(self.people)
+
+    def update_identifiables(self):
+        version_name = self.match.group("project_name")
+
+        if self.match.group("suffix"):
+            version_name += "_"+self.match.group("suffix")
+        else:
+            version_name += "_"+self.match.group("date")
+
+        assure_name_is(self.softwareversion, version_name,
+                       to_be_updated=self.to_be_updated)
+        assure_has_property(self.softwareversion, "description",
+                            self.header["description"][0],
+                            to_be_updated=self.to_be_updated)
+        assure_object_is_in_list(obj=self.people,
+                                 containing_object=self.softwareversion,
+                                 property_name="responsible",
+                                 to_be_updated=self.to_be_updated,
+                                 datatype=db.LIST(db.REFERENCE)
+                                 )
+
+        self.reference_files_from_header(record=self.softwareversion)
diff --git a/src/caosadvancedtools/scifolder/utils.py b/src/caosadvancedtools/scifolder/utils.py
new file mode 100644
index 00000000..3241764f
--- /dev/null
+++ b/src/caosadvancedtools/scifolder/utils.py
@@ -0,0 +1,204 @@
+#!/usr/bin/env python
+# encoding: utf-8
+#
+# Copyright (C) 2020 IndiScale GmbH <info@indiscale.com>
+# Copyright (C) 2020 Henrik tom Wörden <h.tomwoerden@indiscale.com>
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <https://www.gnu.org/licenses/>.
+
+import os
+from itertools import chain
+
+import caosdb as db
+import pandas as pd
+from caosadvancedtools.cfood import assure_object_is_in_list, fileguide
+from caosadvancedtools.utils import (find_records_that_reference_ids,
+                                     read_field_as_list,
+                                     return_field_or_property,
+                                     string_to_person)
+
+
+def parse_responsibles(header):
+    """
+    Extract the responsible person(s) from the yaml header.
+
+    The ``responsible`` field is read as a list: when it is a list, each
+    entry becomes one person; a simple string becomes the only person.
+    Currently only the format <Firstname> <Lastname> <*> is supported.
+
+    Returns a list with the result of ``string_to_person`` for each entry.
+    """
+
+    responsible_entries = read_field_as_list(header["responsible"])
+    persons = [string_to_person(entry) for entry in responsible_entries]
+
+    return persons
+
+
+def get_files_referenced_by_field(globs, prefix="", final_glob=None):
+    """
+    returns all file entities at paths described by given globs
+
+    This function assumes that the supplied globs is a list of
+    filenames, directories or globs.
+
+    prefix should be the path of the crawled file to supply a context for
+    relative paths.
+    """
+    referenced_files = []
+    globs = [g for g in globs if g is not None]
+
+    for glob in globs:
+        # TODO extract glob manipulation
+
+        if final_glob is not None and not glob.endswith(final_glob):
+            glob += final_glob
+
+        if not glob.startswith("/"):
+            glob = os.path.normpath(os.path.join(prefix, glob))
+        else:
+            glob = os.path.normpath(glob)
+
+        query_string = "FIND file which is stored at {}".format(glob)
+
+        el = db.execute_query(query_string)
+
+        referenced_files.append(el)
+
+    return referenced_files
+
+
+def is_filename_allowed(path, recordtype):
+    """Tell whether ``path`` contains the folder name tied to ``recordtype``.
+
+    The record type is matched case-insensitively; any record type other
+    than the four listed below yields False.
+    """
+    folder_by_recordtype = {
+        "experiment": "ExperimentalData",
+        "analysis": "DataAnalysis",
+        "publication": "Publication",
+        "simulation": "Simulation",
+    }
+    folder = folder_by_recordtype.get(recordtype.lower())
+
+    return folder is not None and folder in path
+
+
+def get_entity_ids_from_include_file(prefix, file_path):
+    """reads version ids from  include file """
+
+    if not file_path.startswith("/"):
+        file_path = os.path.normpath(os.path.join(prefix, file_path))
+    else:
+        file_path = os.path.normpath(file_path)
+    df = pd.read_csv(fileguide.access(file_path), sep="\t", comment="#")
+
+    if "ID" not in df.columns:
+        raise ValueError("Include file must have an ID column")
+
+    return list(df.ID)
+
+
+def reference_records_corresponding_to_files(record, recordtypes, globs, path,
+                                             to_be_updated, property_name):
+    """Reference records of the given types that belong to the globbed files.
+
+    Files are queried for each glob as given and with "**" appended, relative
+    to the folder of ``path``; per record type, records referencing the
+    allowed files are added to the list property ``property_name``.
+    """
+    # TODO this function needs to be refactored:
+    # the treatment of keys like 'results' should be separated from searching
+    # entities (see setting of globs and includes below).
+    folder = os.path.dirname(path)
+    # The file queries do not depend on the record type: run them only once.
+    candidates = list(chain(
+        *get_files_referenced_by_field(globs, prefix=folder),
+        *get_files_referenced_by_field(globs, prefix=folder,
+                                       final_glob="**")))
+
+    for recordtype in recordtypes:
+        file_ids = {fi.id for fi in candidates
+                    if is_filename_allowed(fi.path, recordtype=recordtype)}
+        entities = find_records_that_reference_ids(list(file_ids),
+                                                   rt=recordtype)
+
+        if len(entities) == 0:
+            continue
+        assure_object_is_in_list(entities,
+                                 record,
+                                 property_name,
+                                 to_be_updated,
+                                 datatype=db.LIST(db.REFERENCE))
+
+
+def create_files_list(df, ftype):
+    """Collect non-null entries of row ``ftype`` with their descriptions."""
+    files = []
+    for indx, src in df.loc[ftype,
+                            pd.notnull(df.loc[ftype])].items():
+        desc = df.loc[ftype+" description", indx]
+
+        if pd.notnull(desc):
+            files.append({'file': src, 'description': desc})
+        else:
+            files.append(src)
+
+    return files
+
+
+def add_value_list(header, df, name):
+    if name in df.index:
+        header[name] = list(df.loc[name, pd.notnull(df.loc[name])])
+
+
+def get_xls_header(filepath):
+    """
+    This function reads an xlsx file and creates a dictionary analogous to the
+    one created by the yaml headers in README.md files read with the get_header
+    function of caosdb-advancedtools.
+    As xlsx files lack the hierarchical structure, the information that can be
+    provided is less complex. See the possibility to use the xlsx files as a
+    less powerful version for people who are not comfortable with the
+    README.md files.
+
+    The xlsx file has a defined set of rows. In each row a list of entries can
+    be given. This structure is converted to a dictionary of fixed structure.
+    """
+
+    header = {}
+
+    df = pd.read_excel(filepath, index_col=0, header=None)
+    add_value_list(header, df, "responsible")
+    add_value_list(header, df, "description")
+    assert len(header["description"]) <= 1
+
+    for ftype in ["sources", "scripts", "results", "sourceCode", "binaries"]:
+        if ftype not in df.index:
+            continue
+        files = create_files_list(df, ftype)
+
+        if len(files) > 0:
+            header[ftype] = files
+
+    add_value_list(header, df, "revisionOf")
+    # there should be only one revision of
+
+    if "revisionOf" in header:
+        if len(header["revisionOf"]) > 0:
+            header["revisionOf"] = header["revisionOf"][0]
+    add_value_list(header, df, "tags")
+
+    return header
diff --git a/src/caosadvancedtools/scifolder/withreadme.py b/src/caosadvancedtools/scifolder/withreadme.py
new file mode 100644
index 00000000..b3eb1095
--- /dev/null
+++ b/src/caosadvancedtools/scifolder/withreadme.py
@@ -0,0 +1,270 @@
+#!/usr/bin/env python
+# encoding: utf-8
+#
+# Copyright (C) 2020 IndiScale GmbH <info@indiscale.com>
+# Copyright (C) 2020 Henrik tom Wörden <h.tomwoerden@indiscale.com>
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <https://www.gnu.org/licenses/>.
+
+
+import logging
+import os
+from dataclasses import dataclass
+
+import caosdb as db
+from caosadvancedtools.cfood import (assure_has_description, assure_has_parent,
+                                     assure_object_is_in_list, fileguide)
+from caosadvancedtools.read_md_header import get_header as get_md_header
+from caosadvancedtools.table_importer import (win_path_converter,
+                                              win_path_list_converter)
+from caosadvancedtools.utils import return_field_or_property
+
+from .utils import (get_entity_ids_from_include_file,
+                    get_files_referenced_by_field, get_xls_header)
+
+LOGGER = logging.getLogger("withreadme")
+LOGGER.setLevel(level=logging.ERROR)
+
+
+@dataclass
+class DataModel(object):
+    """Names of the record types and properties used by the scifolder cfoods."""
+    results: str = "results"
+    scripts: str = "scripts"
+    sources: str = "sources"
+    date: str = "date"
+    Project: str = "Project"
+    Analysis: str = "Analysis"
+    identifier: str = "identifier"
+    responsible: str = "responsible"
+    revisionOf: str = "revisionOf"
+    Experiment: str = "Experiment"
+    Publication: str = "Publication"
+    Simulation: str = "Simulation"
+    Software: str = "Software"
+    binaries: str = "binaries"
+    sourcecode: str = "sourceCode"
+    description: str = "description"
+
+
+DATAMODEL = DataModel()
+dm = DATAMODEL
+
+
+class HeaderField(object):
+    def __init__(self, key, model):
+        self.key = key
+        self.model = model
+
+
+RESULTS = HeaderField("results", dm.results)
+SCRIPTS = HeaderField("scripts", dm.scripts)
+SOURCES = HeaderField("sources", dm.sources)
+FILE = HeaderField("file", None)
+INCLUDE = HeaderField("include", None)
+REVISIONOF = HeaderField("revisionOf", dm.revisionOf)
+BINARIES = HeaderField("binaries", dm.binaries)
+SOURCECODE = HeaderField("sourceCode", dm.sourcecode)
+DESCRIPTION = HeaderField("description", dm.description)
+RECORDTYPE = HeaderField("recordtype", None)
+
+
+def get_glob(field):
+    """ takes a field which must be a list of globs or dicts.
+
+    if it is a dict, it must have either an include or a file key"""
+    globs = []
+
+    for value in field:
+
+        if isinstance(value, dict) and INCLUDE.key in value:
+            continue
+
+        globs.append(return_field_or_property(value, FILE.key))
+
+    return globs
+
+
+def get_description(value):
+    """Return the description of a dict entry; None if there is none."""
+    if not isinstance(value, dict):
+        return None
+    return value.get(DESCRIPTION.key)
+
+
+def get_rt(value):
+    """Return the record type given in a dict entry; None if there is none."""
+    if not isinstance(value, dict):
+        return None
+    return value.get(RECORDTYPE.key)
+
+
+class WithREADME(object):
+    def __init__(self):
+        self._header = None
+        self.ref_files = {}
+
+    @property
+    def header(self):
+        if self._header is None:
+            if self.crawled_path.lower().endswith(".md"):
+                self._header = get_md_header(
+                    fileguide.access(self.crawled_path))
+            elif self.crawled_path.lower().endswith(".xlsx"):
+                self._header = get_xls_header(
+                    fileguide.access(self.crawled_path))
+            else:
+                raise RuntimeError("Readme format not recognized.")
+            self.convert_win_paths()
+
+        return self._header
+
+    def find_referenced_files(self, fields):
+        """ looks up the files referenced by the given header fields
+
+        They are stored in ref_files; their paths in attached_filenames."""
+
+        for field in fields:
+
+            if field.key not in self.header:
+                continue
+
+            # get_glob skips include entries; skip them here as well so that
+            # files, descriptions and record types stay aligned in zip
+            entries = [val for val in self.header[field.key] if not (
+                isinstance(val, dict) and INCLUDE.key in val)]
+            files = get_files_referenced_by_field(
+                get_glob(entries), prefix=os.path.dirname(self.crawled_path))
+            self.ref_files[field.model] = [
+                (f, get_description(val), get_rt(val))
+                for f, val in zip(files, entries)]
+            # flatten returned list of file lists
+            flat_list = [f.path for sublist in files
+                         for f in sublist]
+
+            if len(flat_list) == 0:
+                LOGGER.warning("ATTENTION: the field {} does not reference "
+                               "any known files".format(field.key))
+
+            self.attached_filenames.extend(flat_list)
+
+    def convert_path(self, el):
+        """ converts the path in el to unix type
+
+        el can be a dict or a string. If el is dict it must have a file key
+
+        returns: same type as el
+        """
+
+        if isinstance(el, dict):
+            if INCLUDE.key in el:
+                el[INCLUDE.key] = win_path_converter(el[INCLUDE.key])
+
+                return el
+
+            if FILE.key not in el:
+                raise ValueError("field should have a 'file' attribute")
+            el[FILE.key] = win_path_converter(el[FILE.key])
+
+            return el
+        else:
+            return win_path_converter(el)
+
+    def convert_win_paths(self):
+        for field in self.win_paths:
+            if field in self.header:
+
+                if isinstance(self.header[field], list):
+                    self.header[field] = [
+                        self.convert_path(el) for el in self.header[field]]
+                else:
+                    self.header[field] = self.convert_path(self.header[field])
+
+    def reference_files_from_header(self, record):
+        """adds properties that reference the files collected in ref_files
+
+        ref_files is expected to map property names to lists of (files,
+        description, recordtype) tuples, where files is the list of file
+        entities, description the description that shall be added to each and
+        recordtype the recordtype that the files shall get as parent. files may
+        be an empty list and description and recordtype may be None.
+
+        The files will be grouped according to the keys used in ref_files and
+        the record types. The record types take precedence.
+        """
+        references = {}
+
+        for prop_name, ref_tuple in self.ref_files.items():
+            generic_references = []
+
+            for files, description, recordtype in ref_tuple:
+                if len(files) == 0:
+                    continue
+
+                if description is not None:
+                    for fi in files:
+                        assure_has_description(fi, description, force=True)
+
+                if recordtype is None:
+                    generic_references.extend(files)
+                else:
+                    for fi in files:
+                        # fix parent
+                        assure_has_parent(fi, recordtype, force=True,
+                                          unique=False)
+
+                    if recordtype not in references:
+                        references[recordtype] = []
+                    references[recordtype].extend(files)
+
+            if len(generic_references) > 0:
+                assure_object_is_in_list(
+                    generic_references,
+                    record,
+                    prop_name,
+                    to_be_updated=self.to_be_updated,
+                    datatype=db.LIST(db.REFERENCE),
+                )
+
+        for ref_type in references.keys():
+            assure_object_is_in_list(
+                references[ref_type],
+                record,
+                ref_type,
+                to_be_updated=self.to_be_updated,
+            )
+
+    def reference_included_records(self, record, fields, to_be_updated):
+        """ adds references to the entities listed in include files
+
+        for each given field, the IDs are read from its 'include' entries"""
+        for field in fields:
+
+            if field.key not in self.header:
+                continue
+            included = []
+
+            for item in self.header[field.key]:
+                # a plain string is a path, not an include entry
+                if isinstance(item, dict) and INCLUDE.key in item:
+                    included.extend(
+                        get_entity_ids_from_include_file(
+                            os.path.dirname(self.crawled_path),
+                            item[INCLUDE.key]))
+
+            assure_object_is_in_list(included,
+                                     record,
+                                     field.model,
+                                     to_be_updated,
+                                     datatype=db.LIST(db.REFERENCE))
diff --git a/src/doc/crawler.rst b/src/doc/crawler.rst
index 2380cdbd..92a624bb 100644
--- a/src/doc/crawler.rst
+++ b/src/doc/crawler.rst
@@ -36,9 +36,8 @@ different components of the CaosDB Crawler can be found in the
 `developers’ information <#extending-the-crawlers>`__ below.
 
 In case you are happy with our suggestion of a standard crawler, feel
-free to use the standard crawler. The standard crawler lives in this git
-repository maintained by Henrik tom Wörden:
-https://gitlab.com/henrik_indiscale/scifolder
+free to use the standard crawler. The standard crawler lives in the submodule
+``caosadvancedtools.scifolder``.
 
 Usage
 =====
diff --git a/unittests/data/DataAnalysis/2010_TestProject/2019-02-03_something/README.md b/unittests/data/DataAnalysis/2010_TestProject/2019-02-03_something/README.md
new file mode 100644
index 00000000..71454e89
--- /dev/null
+++ b/unittests/data/DataAnalysis/2010_TestProject/2019-02-03_something/README.md
@@ -0,0 +1,15 @@
+---
+responsible:	
+- Only Responsible
+description: 	A description of another example analysis.
+
+sources:
+- file:	"/ExperimentalData/2010_TestProject/2019-02-03/*.dat"
+  description:  an example reference to a results file
+
+scripts:
+- file: plot.py
+  description: a plotting script
+results:
+- file: results.pdf
+...
diff --git a/unittests/data/ExperimentalData/2010_TestProject/2019-02-03_something/README.md b/unittests/data/ExperimentalData/2010_TestProject/2019-02-03_something/README.md
new file mode 100644
index 00000000..b7e5051c
--- /dev/null
+++ b/unittests/data/ExperimentalData/2010_TestProject/2019-02-03_something/README.md
@@ -0,0 +1,9 @@
+---
+responsible:	
+- Only Responsible
+description: 	A description of another example experiment.
+
+results:
+- file:	"/ExperimentalData/2010_TestProject/2019-02-03/*.dat"
+  description:  an example reference to a results file
+...
diff --git a/unittests/data/Publications/Posters/2019-02-03_something/README.md b/unittests/data/Publications/Posters/2019-02-03_something/README.md
new file mode 100644
index 00000000..c95e37ec
--- /dev/null
+++ b/unittests/data/Publications/Posters/2019-02-03_something/README.md
@@ -0,0 +1,11 @@
+---
+responsible:	
+- Only Responsible
+description: 	A description of another example experiment.
+
+sources:
+- /DataAnalysis/2010_TestProject/2019-02-03/results.pdf
+
+results:
+- "*.pdf"
+...
diff --git a/unittests/data/README.md b/unittests/data/README.md
new file mode 100644
index 00000000..a2e0ce6e
--- /dev/null
+++ b/unittests/data/README.md
@@ -0,0 +1,14 @@
+---
+responsible: Ana Lytic
+description: An examplary analysis of very exciting research. The analysis was conducted following state of the art best practices of scientific methodology.
+sources: 
+  - /ExperimentalData/2010_TestProject/2019-02-03_something/
+  - file: /ExperimentalData/2010_TestProject/2019-02-03_something/
+    description: An example reference to an experiment. The experimental data was analysed with statistical methods using proper error calculations.
+scripts: 
+  - file: scripts
+    description: all the files needed to run the analysis
+results: 
+  - file: results.pdf
+    description: a plot of the statistical analysis
+...
diff --git a/unittests/data/README.xlsx b/unittests/data/README.xlsx
new file mode 100644
index 0000000000000000000000000000000000000000..a909347789edc1d5a1bbaacd998744cee83d5f6b
GIT binary patch
literal 5338
zcmaJ_1yq#X)*e8IhEYHoksf+LYG@FUl5Xji7#!Lml$I`$ltw{P8X2T}DCtH)r8|`R
zhwr=V%Ju&D+q2fpylcICpLO<r&U2o<)s?YskOFXUZ~)Bu>KcG+hL8Sl3b%H2<>9{i
zE{Pvh!h(Q8w|&E75uP<<53DMYVMA#TeV-@yUziJ}@KU<_A_8#?3-hsK6#e#5Lb9oI
zw`~r?(-p$yVMT@<k<n)qh}=w~<8^AJgC2@;Kv#*0*s`~1wB7N5KHeGQ12ymDV~tjQ
z6X+MSe46@Dt2rw~kdv_~)}_^tqp-rGdAs@3DBaNGpx5Pv>++$V%O;reBEw(wa$>oT
z+2ncn8*ix%KG4$1qI&evk;n5V;t>wpxKYJo*wc(U?aIjHPwry%q3{ezlJ7yK7p@|8
zIPx(Ujq$3heRgBb!>o3a0Ci<-9Gjwm+C}uXMKJ;B=l_)$V)Pqc=rv$)2MZX?f!oW;
zF-oh)d4?B+Jm$DD@5E*DggOaJFFF=52YTC+5-)$pA)K^(H!0_G{+4w<cv_ZNPy~OW
zF~!?(KHXuJwnaZ%r@z&_Sxmj2J7~INW5yDi3>jq5yZPEsx}V!@3bi)3mxq%0{A_y|
zVQr~<PY{2h*88SFt?b<MoQH*~s5q6%eJ9K3<TaShhQBmGD(9*A9nhfQpLYq2lNdsy
z28adNHlzq1Mz_!F=2SRVa|Q>!Exi{bgt9du*$nDS|M^~T+BE9*u8+T+W^c4XPyWp(
zRU>bCraZgjpkFlk$`)~!#3+uz{!(^%D(tcY<GW0YR{mW^pYP4vGn!P%aM#ukYjzT#
zmJFl9a=I_crfLG8EvIZ|e7DWU^9`pg`LXxS4?bb5VJ3&49&Ph=-V@tG&%EDeNz8(7
zZzG#lyPpL^oFlm6N4PNQqYRoCYt?rImOrg_Zx&06uK?FfLb!CG+xSHrZULlFJPtm^
zNR*6v^DKFs?1uTfx%6*5Ac7UvUs+)?lW;zDzmc2L#kiYP{>B4i^UohVVhWW8)s(w*
z#@w0n)`=f=yHr-yv;soUg~5<*vj_ctjhXVFtpuZh!%!N+)*TPN6J9Dc;Hp#V$YQ)I
zpPGMsG0j?lpQ6G-PBv^Aqjxh{7qP`u<B{moyLLg#|4FZXjr<BT5nlY8#~1*BGuA(0
zhVXC9xVm{eTDx8$W>eo7mc~o!yHHks)+D2uT0@e(2DP5<(=I90wx15f0v7+mqI^qu
ze)zF7NNUFIiJ((qXTs(>BAU2MkRsHUqh*+Oj9CjFu7E8euieRR&X@LO{pZFxu6+JI
zzI%@qp&evgy9Ni`oT?qGI*$sao9X+&15scJFB8h|+Od05WZE)nMOE~`ROo)Mj8=T&
z9i}>=qw|-nI0?ddE1La5V2$FWk>`<uL63KtUo0{;cMluCyOHNp5dWmLf;`wn78G0<
zPyL2*15-m^B1^ROMSJaPJkHwm_f(OPM4V0za+--mqB^YE1^$y(cM4=)4a}Bft<|t!
z7{*0sNgmc3Kp|V5E{!E{=&jQ%rl>C*T`+Je=TV?2Ky}w0;EwT?JMocOwg>r3{Oq7O
z6~)%F2=F6Z9<Z})<|y(pc5dwFQ9y-_fwOWh-e5dM?hWm63ehS{XAWK;6_HOL-e0iH
zi6M#e-XE;8zjDVCOc<Njrq%jPFW2%u<pa}UdtGxEG@l&$oiiaoLaA}IcW81nt%gY9
zo&jslWBVh(s0U`HO~RzOv%aJ3mN$gY&AJIJOM`FMFp-xmTP_gSlSfDKtM@U#x2PWf
zxJRI>90%4@vV{>VE<RgaDbbX9cHia1PrLS2@71CPy5!y4jNgv;EpCQU5?j=SO?krz
zEr5`~Wa)l-8*)g?l7uffaM+{}_ecrll{uPzVF@ugKG`DL&v5lAUx)a;lF*qS+fN%=
zm`-W3HQIsBT+9X$mx_O%PWsfPmj;wX@j$<I{zQZz;hq;fA>JsCQrD-$O<OE=ByvZ1
zcHy6<(*)*s9V@J6c~=eRwVH!(2NdrsD2O*N=>mRjUD`<}(An*A+7$}CdOBtijYwW(
zR)G~K&~2TF%rcKb`lazjHl=4@jdrToIdV(qW?rFvlY*<i63vc^WdDSAir)<BYG-Zj
z=F0Qe59A8?9eT#DX~JmWo3`u>y)A*%>?(xMDW)HK*p3L3yq!t$O^~Kg4SK+A-u3ek
z!HmJO=)o`gG3u^kT|+uztG=72HAaon{Yd{|>(uNa!gr#D^9#iMWBuuh`Jvw$RP){)
z#$L)d9~C&~Vj@givB+ciI`l;V&C9c@uX=gJ)a1bfu^YQI{Q8P@-LU~$75i4)yx}N0
z%0uA!#bh7Iyi~3ZRMxVnD8t}`%4KL%F#x>)Yf70GFuf+5);eQv)X&1IGTgOSZ>-@E
zcq6&CXw7u{@!o?&hBvQ98;l1Ury?XOe+abmG0{bwi37e5%_gbmM2F;l#34M%aJel*
zb|=%Dz*Q0SieemI)ka<1v}|K6z9WjG-8av0&>KN+t|BEP?xC<~wPo^)ysvP87uWx`
zPi&N>Lcv)%rr%Au4*~ow3pZcFaN0mRIc$N)XXVQWBsdtJx~@w+1mQXu(jH*m#+H{w
z_sFK5c{}1-pYAufXS0z`(NuE16_AP~(N@m~ONnq7ozPJX8W*hO0eW1GEnb4hoEjeG
zxn2U!cDvo*hztte%At>6*cKXqp;k8ECG7)gz0Iw_dWSZX@QZh{RJCQHRt-aY-yXu!
z<Ac}f!?r4{T?L%p%qY|TL=h}Md6xPb=uD@VYmg!els9I_VE19JP<rut0YBu$-AHWd
z5)zdalJV5Gy)5kfOmixoywOafY}gK+R<}@4D&(En>And_KZw5Q{(_-NIJ^*BKR`ZM
zTZ|)R>Q+8eG(^;Be6)Xvq&&#9wZBYTfu8|1(@K!l5~us)6PE~=GSOQ6dBgJLHcCZt
zP|)F$HC84<eJ7BSK-_3@O>+XOIL}~S>CcO$_31O6TS?z)nn7xjOGVHRzhPn|l11O<
z@l>QAV_O-&n3=KG-Q;%jk1sT#!gwsmUCEu=p0c+v2O_FvAlEN}_fSi(H-4cfm&Ir0
zxo7X>d?z&|yizJQY$6ARgq{UXMM5i6s;Fq(ZTOu8PI4p334xN{&!laiRqDD*+X`0h
zk9NU$)1KN2`XLr#L`O~+rcM$vi_i0Nkqh&O30s-to;BwOgrk`rW3cUWd%g3lvF8(B
z&z1vts|TW9x@)?qnu)1#J9xtNH*tiK4F=rZ{I+<*%)*wORD&|*$Kem=ZVpa&3_jez
zAuZ?(Jwzm|GO?kiR!KJ^yv1>V3BRz6o$E1~2TSrg&oThF2Xwy9zAt$a7bx3XF&S&?
zH9a6zA407h7OR9Pone@<I1Li0srewXV!!F|SfQ;nxR7=q3-J(Q6o|=dE-eMvFXHud
z<xZ5kaW<Kh^J6!@!&EEh$=i+Q2<f(0@{0pWVxPBZG^v$=6*6CR(u@(L6Cy8Wq=Yn4
zi=;2T9+gkwW(DgwSl@?R#53`-77$##)Fd<M+J&8d-fYD2rZFl*dWC*CYa8+TbgJue
zxP<Yz$<4W?WnrJ7!gPC=h_kUgf@Sb-a%SI<Xp7wg<!=l3o~Y8e`q-K&7Y{ic;l4dI
zTK)pNA}yILJ8jqv0N^FjKam#I-=y`o<h?HL{+7RSst#?u1R>k!Wb(%yirAKs!pKPJ
zvhx#+PK_@^?Sk>+)t7$U)JAyL8$$_06Z=gM57+KTt%bGA5&(;GZUnb<c;FykoNsL|
zQR<>{<`?alW6JN|e&0wk`J7yl$FOo=*nr!jHCEI(uSe-o;Y3eE;HCyo-=5;<bPN=k
zR*4Z-l{1I>N7vTNa?1%qBGX(%CT}+(E(%T;$3LtYW3kA=!-sv~^m9dgMUsJblCife
zFjG^|6L-wihHm>AJ`8uJMBQ)WjA?(bHbt(aY<|uM^C~lHJ3pU|p!d!EkAj!<U*g5i
z9Byr;<p#HRw*5=PRBNj`%wDNpJ%XmPl|0b&qC4BU@ZnI~5e=z*qi!6t4*x*urLV$K
z2oSHsi0y<v=FLK*w;KG^g-%?|l7+mjsi80pk2l%QSa|FGb|}wBPcqQDS#=D?@mQp`
zxEsun0r>O&P!m=1XCzv<(Flw|%wsbozG+l8e#^~C)G(p~c^|CR*l&%#Gk(|H4lb{H
z#iHdR$$I7mp4Bl{E_Ha#a-3KdH{U9goLRAE6=y+zyj-C(TU}&!8E^@@?~^f=qG}gm
z^+B04$<Yd1Y#rjAVLK#!nnEMJ5m8p@NZDn<O4>MJ^KOZE5jLxsa01$}qTo6hKJ{XE
zd74pC?jmg)!b&=*H7@_HIXE-2t+~__D8GDf`p4I&r9v+Ygsrouf^;$maVx3wQ;st4
zJM`=nc#DN)!9*^k+VXe&CT6!%Yhy?wC@7_!7|I-~B`zy_tPM}T+i84RhxE-!EwZgD
zp37}fu{FgL>mKm*ltPrtS-aZe3mJRV=p8I>ki4qWN<t)HxJFZ2&8V1=zp8?utc#rN
zP0IBU*+|m?i|s~XN>E88v&k~J*!BRPBNW$Tu}6KgY!W(kLAAF2J#d-v(`?;@#<@sJ
z{H5tJ{uLhPAjc~MXfZ2(^Pf)2Ei@jiV3z7|n2RfqB@Ax;R|}-6q1MU^@;lb!^j&Dc
zWDvAu3KvtfV37Z6<P@VZ*HFMkaDklb1hNye_|}~^l`YI2ruHAn(QP#A8x(;VG$Jw8
z8%Z^-wGKWQ29c2JcO{y67n+3=nGJIymFfx_@9$VC5q2t;zYx+)f{V*hf4|=`xkpIa
zFK?19R?n^4lCGv2BJkCe6H6@YD}=il(n8Z#ZT=3S;8sQ2SUYNzAOsAs?9K+PaI)NL
zVJ*SZnv5W)`le#=&~wN|V>%A`3}($!T*O`!5;c0PwllN0TTdyFSmeh}#HJb@rSAM(
z1oT2E%c(xvn|6Ls4?mIUi!{w`e|ZkR><JPmRU_-k#QAi2lybuO26(pu&=)m2uwBmV
z50Ds4>8-hF(ZX0d%Z@uxNA{*0)E+3CDuX<+x1VCHL)oJz{pG33aI$v^nQLts-GQps
zSfU1(NViea&Cd-EAJ0#2Y)!}gYVUf~Kx0^ke|gq*Kz&sv7M1*NJ9L?5{_)s?u9wN&
z#pS9>8L_A*ZM>kMZC^Y_cy)gAVk$jHRpJ}W%@!^srH&yh`>!@b?C+cJYhssYXNW3&
zKJxq23LB-HDY2YH7jwwt?XcT@C8qUW&+JPg#m%1>0TR=S$TEmN?J|7}GMB+B?7oo+
zg+GWiQc8dZBoy}63=UARutNG)rP_q4ZmLP6h{Q%xD?(ZW76pFrnLJ;PsS{uF&xXFs
zyc|LJN2joQ5px^9q!&g;`%2j(zDkW!kK26D`6(MY&i1mNv_dG8r18uo`<U$gxDIlf
zT9T(tVTJ%zP-j2^59d*TNNFTK(7E>E+rgdA)oS5g!mCW3etm7VfzE?3I$Qs{qbEVX
zVI&K4cC&VNGu8HXv351SN)}RER1KZ1`BM&tGB%IC=OCr&7hO27*!cN&nITbkKg^9>
zAO^tNI=JAlu$~XwO|R8F1ZUo{7zFN*{76^%k<65Z&zM&V->a%q=IL>S_TDkVmuNBB
zbkz(sE2G$#BMe78;j}AG>;%)fth3vAS1gm2@9&-jkjY~Y-6M<(RH>w9`&oqBvk8px
zY`|gdIV|N-S_v|m=?=i=tkk#85XO$$#we{8hQIa`&JA-!_;8g-o|?3*tv!?~O6Gjy
z$!NmmCOeI3LATS8K^og%kAM8w919VxNI@&C_Ijki9M;KY8fh?c;jpdFZw<x=73hP5
z)M2zPOAgPosHRJ`pBAe9I4yT-CN7zezo7TV-zhd`!AhIl&)lh?K3<}^us>+}M$~TU
za?*sI*?iA4lem2_qk_&Ki2X@uH$^l@E8@Xoz{cHklUs$C=~Q2g$myG&#PE&#C4Tn2
zs!Yt$il70yDtdoW4F)DD;P)8n`W)tJ4E3LOZ78L#{GX23oAIkL&2OVbFZIXJ=1=GA
zz0Fm>{@Ya1y*=9bclZ9M>ve^3)%^T6QQ*JV{jXN&&nVY#<101t+jj8&it<0S$)9nq
zv-s8B{oC*e{@%!+H~F6tt`p1^+y6Eyw1)aG?*G&M`l-8Oq2JbmuFAjN|HVmvdS6Gz
h)ye#A>*!$r&pB0BzKPB<0Dz1BNTMk*`1aM){{g;w5Pbju

literal 0
HcmV?d00001

diff --git a/unittests/data/SimulationData/2010_TestProject/2019-02-03_something/README.md b/unittests/data/SimulationData/2010_TestProject/2019-02-03_something/README.md
new file mode 100644
index 00000000..fba1bd48
--- /dev/null
+++ b/unittests/data/SimulationData/2010_TestProject/2019-02-03_something/README.md
@@ -0,0 +1,12 @@
+---
+responsible:	
+- Only Responsible
+description: 	A description of another example experiment.
+
+results:
+- file:	"*.dat"
+  description:  an example reference to a results file
+
+scripts:
+- sim.py
+...
diff --git a/unittests/test_cfoods.py b/unittests/test_cfoods.py
new file mode 100644
index 00000000..87e6d6d2
--- /dev/null
+++ b/unittests/test_cfoods.py
@@ -0,0 +1,54 @@
+#!/usr/bin/env python
+# encoding: utf-8
+#
+# Copyright (C) 2019 Henrik tom Wörden
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <https://www.gnu.org/licenses/>.
+import os
+import unittest
+
+from caosadvancedtools.scifolder import (AnalysisCFood, ExperimentCFood,
+                                         PublicationCFood, SimulationCFood)
+
+data_path = os.path.join(os.path.dirname(os.path.realpath(__file__)),
+                         "data")
+
+
+class CFoodTest(unittest.TestCase):
+    """Smoke tests for the scifolder CFoods using the example README files."""
+
+    def _check(self, cfood, relative_readme):
+        """Assert ``cfood`` rejects a bogus path but accepts the README."""
+        # A string that is not a scifolder path must not match.
+        self.assertFalse(cfood.match_item("nopath"))
+        readme = data_path + relative_readme
+        self.assertTrue(cfood.match_item(readme))
+        # No assertion on the instance: the test fails only if this raises.
+        cfood(readme)
+
+    def test_analysis(self):
+        self._check(AnalysisCFood, "/DataAnalysis/2010_TestProject/"
+                    "2019-02-03_something/README.md")
+
+    def test_experiment(self):
+        self._check(ExperimentCFood, "/ExperimentalData/2010_TestProject/"
+                    "2019-02-03_something/README.md")
+
+    def test_publication(self):
+        self._check(PublicationCFood,
+                    "/Publications/Posters/2019-02-03_something/README.md")
+
+    def test_simulation(self):
+        self._check(SimulationCFood, "/SimulationData/2010_TestProject/"
+                    "2019-02-03_something/README.md")
diff --git a/unittests/test_scifolder_utils.py b/unittests/test_scifolder_utils.py
new file mode 100644
index 00000000..30e211d9
--- /dev/null
+++ b/unittests/test_scifolder_utils.py
@@ -0,0 +1,67 @@
+#!/usr/bin/env python
+# encoding: utf-8
+#
+# Copyright (C) 2020 IndiScale GmbH <info@indiscale.com>
+# Copyright (C) 2020 Henrik tom Wörden <h.tomwoerden@indiscale.com>
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <https://www.gnu.org/licenses/>.
+
+import os
+import unittest
+
+from caosadvancedtools.scifolder.utils import get_xls_header
+
+
+class XLSTest(unittest.TestCase):
+    """Check the header that get_xls_header reads from data/README.xlsx."""
+
+    def test_read(self):
+        here = os.path.dirname(__file__)
+        xlsx_path = os.path.join(here, "data/README.xlsx")
+        assert os.path.exists(xlsx_path)
+
+        parsed = get_xls_header(xlsx_path)
+        assert parsed is not None
+        assert isinstance(parsed, dict)
+
+        # responsible: a list holding exactly one name
+        assert parsed['responsible'] == ["Ana Lytic"]
+
+        # description: a single string, longer than 20 characters
+        descriptions = parsed['description']
+        assert len(descriptions) == 1
+        assert isinstance(descriptions[0], str)
+        assert len(descriptions[0]) > 20
+        assert "exciting" in descriptions[0]
+
+        # sources: a list of dicts with "file" and "description" entries
+        assert isinstance(parsed['sources'], list)
+        for entry in parsed['sources']:
+            assert isinstance(entry, dict)
+            assert "TestProject" in entry["file"]
+            assert "example" in entry["description"]
+
+        # scripts: same layout, the file entry must equal "scripts"
+        assert isinstance(parsed['scripts'], list)
+        for entry in parsed['scripts']:
+            assert isinstance(entry, dict)
+            assert "scripts" == entry["file"]
+            assert "all the files" in entry["description"]
+
+        # results: same layout, the file entry must equal "result.pdf"
+        assert isinstance(parsed['results'], list)
+        for entry in parsed['results']:
+            assert isinstance(entry, dict)
+            assert "result.pdf" == entry["file"]
+            assert "plot" in entry["description"]
-- 
GitLab