Commit 5b9ca55f authored by Henrik tom Wörden

Merge branch 'dev' into f-update-parents-mitigation

parents 8e9a9128 d34f0bfa
2 merge requests: !53 Release 0.1, !19 F update parents mitigation
Pipeline #29019 failed
Showing changed files with 623 additions and 148 deletions
@@ -14,3 +14,6 @@ provenance.yml
*.tar.gz
*.sql
/integrationtests/test-profile/custom/other/cert/
src/doc/_apidoc/
start_caosdb_docker.sh
src/doc/_apidoc
@@ -152,11 +152,11 @@ inttest:
- CAOSDB_TAG=$CAOSDB_TAG docker-compose up -d
# Store versions of CaosDB parts
- docker exec -u 0 -t docker_caosdb-server_1 cat /opt/caosdb/git/caosdb_pylib_commit > hash_pylib
- docker exec -u 0 -t docker_caosdb-server_1 cat /opt/caosdb/git/caosdb_webui_commit > hash_webui
- docker exec -u 0 -t docker_caosdb-server_1 cat /opt/caosdb/git/caosdb_server_commit > hash_server
- docker exec -u 0 -t docker_caosdb-server_1 cat /opt/caosdb/git/caosdb_mysqlbackend_commit > hash_mysql
- docker exec -u 0 -t docker_caosdb-server_1 cat /opt/caosdb/git/caosdb_proto_commit > hash_proto
- docker exec -u 0 -t docker-caosdb-server-1 cat /opt/caosdb/git/caosdb_pylib_commit > hash_pylib
- docker exec -u 0 -t docker-caosdb-server-1 cat /opt/caosdb/git/caosdb_webui_commit > hash_webui
- docker exec -u 0 -t docker-caosdb-server-1 cat /opt/caosdb/git/caosdb_server_commit > hash_server
- docker exec -u 0 -t docker-caosdb-server-1 cat /opt/caosdb/git/caosdb_mysqlbackend_commit > hash_mysql
- docker exec -u 0 -t docker-caosdb-server-1 cat /opt/caosdb/git/caosdb_proto_commit > hash_proto
- cat hash_server
- cat hash_proto
- cat hash_mysql
@@ -167,8 +167,8 @@ inttest:
- /bin/sh ./run.sh
# Save logs
- docker logs docker_caosdb-server_1 &> ../caosdb_log.txt
- docker logs docker_sqldb_1 &> ../mariadb_log.txt
- docker logs docker-caosdb-server-1 &> ../caosdb_log.txt
- docker logs docker-sqldb-1 &> ../mariadb_log.txt
- cd ..
# Stop the server
@@ -13,11 +13,15 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
* Added new converters for tables: CSVTableConverter and XLSXTableConverter
* Possibility to authorize updates as in the old crawler
* Allow authorization of inserts
* Allow splitting cfoods into multiple yaml documents
* Implemented macros
* Converters can now filter the list of children
* You can now crawl data with name conflicts: `synchronize(unique_names=False)`
### Changed
* Renamed module from `newcrawler` to `caoscrawler`
* MAINT: Renamed module from `newcrawler` to `caoscrawler`
* MAINT: Removed global converters from `crawl.py`
### Deprecated
@@ -30,6 +34,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
* FIX: #35 Parent cannot be set from value
* [#6](https://gitlab.com/caosdb/caosdb-crawler/-/issues/6): Fixed many type
hints to be compatible with Python 3.8
* [#9](https://gitlab.com/caosdb/caosdb-crawler/-/issues/9): Scalars of types
other than string can now be given in cfood definitions
### Security
@@ -36,7 +36,7 @@ from .structure_elements import (StructureElement, Directory, File, Dict, JSONFi
DictFloatElement, DictDictElement,
TextElement, DictTextElement, DictElement, DictListElement)
from typing import Dict as Dict_t, List, Optional, Tuple, Union
from abc import abstractmethod
from abc import ABCMeta, abstractmethod
from string import Template
import yaml_header_tools
@@ -156,15 +156,21 @@ def handle_value(value: Union[dict, str, list], values: GeneralStore):
propvalue = value
# variables replacement:
propvalue = [replace_variables(i, values) for i in propvalue]
propvalue = list()
for element in value:
# Do the element-wise replacement only when the element's type is string:
if type(element) == str:
propvalue.append(replace_variables(element, values))
else:
propvalue.append(element)
return (propvalue, collection_mode)
else:
# value is another simple type
# collection_mode = "single"
# propvalue = value["value"]
# return (propvalue, collection_mode)
raise RuntimeError()
collection_mode = "single"
propvalue = value
# Return it immediately, otherwise variable substitution would be done and fail:
return (propvalue, collection_mode)
propvalue = replace_variables(propvalue, values)
return (propvalue, collection_mode)
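# Illustrative examples (assuming `values` is a populated GeneralStore):
#   handle_value(23, values)             -> (23, "single"); non-string scalars
#                                           are returned without substitution
#   handle_value(["a", "$var"], values)  -> element-wise substitution, applied
#                                           only to elements of type str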
@@ -255,7 +261,7 @@ def create_records(values: GeneralStore,
return keys_modified
class Converter(object):
class Converter(object, metaclass=ABCMeta):
"""
Converters treat StructureElements contained in the hierarchical structure.
"""
@@ -283,6 +289,10 @@ class Converter(object):
def converter_factory(definition: dict,
name: str,
converter_registry: dict):
"""creates a Converter instance of the appropriate class.
The `type` key in the `definition` defines the Converter class which is being used.
"""
if "type" not in definition:
raise RuntimeError(
@@ -535,6 +545,7 @@ class DictConverter(Converter):
return {}
# TODO: difference to SimpleFileConverter? Do we need both?
class FileConverter(Converter):
def typecheck(self, element: StructureElement):
return isinstance(element, File)
@@ -566,6 +577,8 @@ class JSONFileConverter(DictConverter):
def create_children(self, generalStore: GeneralStore, element: StructureElement):
if not self.typecheck(element):
raise RuntimeError("A JSON file is needed to create children")
# TODO: either add an explicit type check for the File structure element here,
# or add a comment to suppress the mypy type warning.
with open(element.path, 'r') as json_file:
json_data = json.load(json_file)
if not isinstance(json_data, dict):
@@ -55,12 +55,17 @@ from caosdb.apiutils import compare_entities, merge_entities
from copy import deepcopy
from jsonschema import validate
logger = logging.getLogger(__name__)
from .macros import defmacro_constructor, macro_constructor
logger = logging.getLogger(__name__)
SPECIAL_PROPERTIES_STRICT = ("description", "name", "id", "path")
SPECIAL_PROPERTIES_NOT_STRICT = ("file", "checksum", "size")
# Register the macro functions from the submodule:
yaml.SafeLoader.add_constructor("!defmacro", defmacro_constructor)
yaml.SafeLoader.add_constructor("!macro", macro_constructor)
def check_identical(record1: db.Entity, record2: db.Entity, ignore_id=False):
"""
@@ -160,7 +165,6 @@ class Crawler(object):
"""
def __init__(self,
converters: List[Converter] = [],
generalStore: Optional[GeneralStore] = None,
debug: bool = False,
identifiableAdapter: IdentifiableAdapter = None,
@@ -171,15 +175,14 @@
Parameters
----------
converters : List[Converter]
The set of converters used for this crawler.
generalStore : GeneralStore
An initial GeneralStore which might store e.g. environment variables.
debug : bool
Create a debugging information tree when set to True.
The debugging information tree is a variable stored in
self.debug_tree. It is a dictionary mapping directory entries
to a tuple of general stores and record stores which are valid for the directory scope.
to a tuple of general stores and record stores which are valid for
the directory scope.
Furthermore, a second tree named self.debug_copied stores whether the
objects in debug_tree have been copied from a higher level in the hierarchy
of the StructureElements.
@@ -191,7 +194,6 @@
"""
# TODO: check if this feature is really needed
self.global_converters = converters
self.identified_cache = IdentifiedCache()
self.recordStore = RecordStore()
@@ -225,7 +227,16 @@
# Load the cfood from a yaml file:
with open(crawler_definition_path, "r") as f:
crawler_definition = yaml.safe_load(f)
crawler_definitions = list(yaml.safe_load_all(f))
if len(crawler_definitions) == 1:
# Simple case, just one document:
crawler_definition = crawler_definitions[0]
elif len(crawler_definitions) == 2:
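# Two documents: the first holds metadata and macro definitions,
# the second holds the converter tree (see the CFood documentation).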
crawler_definition = crawler_definitions[1]
else:
raise RuntimeError(
"Crawler definition must not contain more than two documents.")
# TODO: at this point this function can already load the cfood schema extensions
# from the crawler definition and add them to the yaml schema that will be
@@ -376,9 +387,13 @@
converter_registry)
@staticmethod
def create_local_converters(crawler_definition: dict,
converter_registry: dict):
local_converters = []
def initialize_converters(crawler_definition: dict, converter_registry: dict):
"""
Takes the cfood as a dict (`crawler_definition`) and creates the Converter objects that
are defined at the top level. Child Converters are in turn created during the
initialization of their parent Converters.
"""
converters = []
for key, value in crawler_definition.items():
# Definitions and Converters are reserved keywords
@@ -390,10 +405,9 @@
continue
elif key == "Converters":
continue
local_converters.append(Converter.converter_factory(
value, key, converter_registry))
converters.append(Converter.converter_factory(value, key, converter_registry))
return local_converters
return converters
def start_crawling(self, items: Union[List[StructureElement], StructureElement],
crawler_definition: dict,
@@ -425,20 +439,19 @@
items = [items]
self.run_id = uuid.uuid1()
local_converters = Crawler.create_local_converters(crawler_definition,
converter_registry)
local_converters = Crawler.initialize_converters(
crawler_definition, converter_registry)
# This recursive crawling procedure generates the update list:
self.target_data: List[db.Record] = []
self._crawl(items,
self.global_converters, local_converters, self.generalStore, self.recordStore,
[], [])
self._crawl(items, local_converters, self.generalStore,
self.recordStore, [], [])
if self.debug:
self.debug_converters = self.global_converters + local_converters
self.debug_converters = local_converters
return self.target_data
def synchronize(self, commit_changes: bool = True):
def synchronize(self, commit_changes: bool = True, unique_names=True):
"""
Carry out the actual synchronization.
"""
@@ -446,7 +459,7 @@
# After the crawling, the actual synchronization with the database, based on the
# update list is carried out:
return self._synchronize(self.target_data, commit_changes)
return self._synchronize(self.target_data, commit_changes, unique_names=unique_names)
def can_be_checked_externally(self, record: db.Record):
"""
@@ -807,7 +820,8 @@
return db.Entity(name=name).retrieve()
@staticmethod
def execute_inserts_in_list(to_be_inserted, securityMode, run_id: int = None):
def execute_inserts_in_list(to_be_inserted, securityMode, run_id: int = None,
unique_names=True):
for record in to_be_inserted:
for prop in record.properties:
entity = Crawler._get_entity_by_name(prop.name)
@@ -816,7 +830,7 @@
logger.debug(to_be_inserted)
if len(to_be_inserted) > 0:
if securityMode.value > SecurityMode.RETRIEVE.value:
db.Container().extend(to_be_inserted).insert()
db.Container().extend(to_be_inserted).insert(unique=unique_names)
elif run_id is not None:
update_cache = UpdateCache()
update_cache.insert(to_be_inserted, run_id, insert=True)
@@ -834,18 +848,20 @@
_resolve_datatype(prop, entity)
@staticmethod
def execute_updates_in_list(to_be_updated, securityMode, run_id: int = None):
def execute_updates_in_list(to_be_updated, securityMode, run_id: int = None,
unique_names=True):
Crawler.set_ids_and_datatype_of_parents_and_properties(to_be_updated)
logger.debug("UPDATE")
logger.debug(to_be_updated)
if len(to_be_updated) > 0:
if securityMode.value > SecurityMode.INSERT.value:
db.Container().extend(to_be_updated).update()
db.Container().extend(to_be_updated).update(unique=unique_names)
elif run_id is not None:
update_cache = UpdateCache()
update_cache.insert(to_be_updated, run_id)
def _synchronize(self, target_data: List[db.Record], commit_changes: bool = True):
def _synchronize(self, target_data: List[db.Record], commit_changes: bool = True,
unique_names=True):
"""
This function applies several stages:
1) Retrieve identifiables for all records in target_data.
@@ -884,9 +900,9 @@
self.execute_parent_updates_in_list(to_be_updated)
self.execute_inserts_in_list(
to_be_inserted, self.securityMode, self.run_id)
to_be_inserted, self.securityMode, self.run_id, unique_names=unique_names)
self.execute_updates_in_list(
to_be_updated, self.securityMode, self.run_id)
to_be_updated, self.securityMode, self.run_id, unique_names=unique_names)
update_cache = UpdateCache()
pending_inserts = update_cache.get_inserts(self.run_id)
@@ -969,7 +985,6 @@ ____________________\n""".format(i + 1, len(pending_changes)) + str(el[3]))
f.write(yaml.dump(paths, sort_keys=False))
def _crawl(self, items: List[StructureElement],
global_converters: List[Converter],
local_converters: List[Converter],
generalStore: GeneralStore,
recordStore: RecordStore,
@@ -978,7 +993,7 @@ ____________________\n""".format(i + 1, len(pending_changes)) + str(el[3]))
Crawl a list of StructureElements and apply any matching converters.
items: structure_elements (e.g. files and folders on one level of the hierarchy)
global_converters and local_converters: globally or locally defined converters for
local_converters: locally defined converters for
treating structure elements. A locally defined converter could be
one that is only valid for a specific subtree of the originally
crawled StructureElement structure.
@@ -986,7 +1001,8 @@ ____________________\n""".format(i + 1, len(pending_changes)) + str(el[3]))
global stores of the Crawler object.
"""
for element in items:
for converter in global_converters + local_converters:
for converter in local_converters:
# type is something like "matches files", replace isinstance with "type_matches"
# match function tests regexp for example
if (converter.typecheck(element) and
@@ -1012,7 +1028,8 @@ ____________________\n""".format(i + 1, len(pending_changes)) + str(el[3]))
self.debug_tree[str(element)] = (
generalStore_copy.get_storage(), recordStore_copy.get_storage())
self.debug_metadata["copied"][str(element)] = (
generalStore_copy.get_dict_copied(), recordStore_copy.get_dict_copied())
generalStore_copy.get_dict_copied(),
recordStore_copy.get_dict_copied())
self.debug_metadata["usage"][str(element)].add(
"/".join(converters_path + [converter.name]))
mod_info = self.debug_metadata["provenance"]
@@ -1023,10 +1040,11 @@ ____________________\n""".format(i + 1, len(pending_changes)) + str(el[3]))
record_identifier = record_name + \
"_" + str(internal_id)
converter.metadata["usage"].add(record_identifier)
mod_info[record_identifier][prop_name] = (structure_elements_path + [element.get_name()],
mod_info[record_identifier][prop_name] = (
structure_elements_path + [element.get_name()],
converters_path + [converter.name])
self._crawl(children, global_converters, converter.converters,
self._crawl(children, converter.converters,
generalStore_copy, recordStore_copy,
structure_elements_path + [element.get_name()],
converters_path + [converter.name])
@@ -1058,7 +1076,9 @@ def crawler_main(crawled_directory_path: str,
provenance_file: str = None,
dry_run: bool = False,
prefix: str = "",
securityMode: int = SecurityMode.UPDATE):
securityMode: int = SecurityMode.UPDATE,
unique_names=True,
):
"""
Parameters
@@ -1079,6 +1099,8 @@ def crawler_main(crawled_directory_path: str,
remove the given prefix from file paths
securityMode : int
securityMode of Crawler
unique_names : bool
whether or not to update or insert entities in spite of name conflicts
Returns
-------
@@ -1110,6 +1132,8 @@ def crawler_main(crawled_directory_path: str,
if isinstance(elem, db.File):
# correct the file path:
# elem.file = os.path.join(args.path, elem.file)
if prefix is None:
raise RuntimeError("No prefix set. Prefix must be set if files are used.")
if elem.path.startswith(prefix):
elem.path = elem.path[len(prefix):]
elem.file = None
@@ -1136,7 +1160,7 @@ def crawler_main(crawled_directory_path: str,
raise RuntimeError("Missing RecordTypes: {}".
format(", ".join(notfound)))
crawler.synchronize(commit_changes=True)
crawler.synchronize(commit_changes=True, unique_names=unique_names)
return 0
@@ -1154,6 +1178,7 @@ def parse_args():
help="The subtree of files below the given path will "
"be considered. Use '/' for everything.")
parser.add_argument("-s", "--security-mode", choices=["retrieve", "insert", "update"],
default="retrieve",
help="Determines whether entities may only be read from the server, or "
"whether inserts or even updates may be done.")
parser.add_argument("-n", "--dry-run", action="store_true",
@@ -1162,9 +1187,9 @@ def parse_args():
# TODO: load identifiables is a dirty implementation currently
parser.add_argument("-i", "--load-identifiables",
help="Load identifiables from "
"the given yaml file.")
help="Load identifiables from the given yaml file.")
parser.add_argument("-u", "--unique-names",
help="Insert or updates entities even if name conflicts exist.")
parser.add_argument("-p", "--prefix",
help="Remove the given prefix from the paths "
"of all file objects.")
@@ -1186,16 +1211,17 @@ def main():
logger.setLevel(logging.INFO)
sys.exit(crawler_main(
args.crawled_directory_path,
args.cfood_file_name,
args.load_identifiables,
args.debug,
args.provenance,
args.dry_run,
args.prefix,
{"retrieve": SecurityMode.RETRIEVE,
crawled_directory_path=args.crawled_directory_path,
cfood_file_name=args.cfood_file_name,
identifiables_definition_file=args.load_identifiables,
debug=args.debug,
provenance_file=args.provenance,
dry_run=args.dry_run,
prefix=args.prefix,
securityMode={"retrieve": SecurityMode.RETRIEVE,
"insert": SecurityMode.INSERT,
"update": SecurityMode.UPDATE}[args.security_mode]
"update": SecurityMode.UPDATE}[args.security_mode],
unique_names=args.unique_names,
))
from .macro_yaml_object import defmacro_constructor, macro_constructor
#!/usr/bin/env python3
# encoding: utf-8
#
# ** header v3.0
# This file is a part of the CaosDB Project.
#
# Copyright (C) 2022 Alexander Schlemmer
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
#
# ** end header
#
# Function to expand a macro in yaml
# A. Schlemmer, 05/2022
from dataclasses import dataclass
from typing import Any, Dict
from copy import deepcopy
from string import Template
@dataclass
class MacroDefinition:
"""
Stores a macro definition.
name: Name of the macro
params: variables and default values to be substituted in keys or values
definition: A dictionary that will be substituted including parameters
"""
name: str
params: Dict[str, Any]
definition: Any
# This dictionary stores the macro definitions
macro_store: Dict[str, MacroDefinition] = dict()
def substitute(propvalue, values: dict):
"""
Substitution of variables in strings, using the Template class
from Python's standard library.
"""
propvalue_template = Template(propvalue)
return propvalue_template.safe_substitute(**values)
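# Example (illustrative): substitute("$name.dat", {"name": "run1"}) returns
# "run1.dat"; safe_substitute leaves unknown variables such as "$other" in place.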
def substitute_dict(sourced: Dict[str, Any], values: Dict[str, Any]):
"""
Create a copy of sourced.
Afterwards recursively do variable substitution on all keys and values.
"""
d = deepcopy(sourced)
# Changes in keys:
replace: Dict[str, str] = dict()
for k in d:
replacement = substitute(k, values)
if replacement != k:
replace[k] = replacement
for k, v in replace.items():
d[v] = d[k]
del d[k]
# Changes in values:
for k, v in d.items():
if isinstance(v, str):
d[k] = substitute(v, values)
elif isinstance(v, list):
subst_list = list()
for i in d[k]:
if isinstance(i, str):
subst_list.append(substitute(i, values))
elif isinstance(i, dict):
subst_list.append(substitute_dict(i, values))
else:
subst_list.append(i)
d[k] = subst_list
elif isinstance(v, dict):
d[k] = substitute_dict(v, values)
else:
pass
return d
def defmacro_constructor(loader, node):
"""
Function for registering macros in yaml files.
It can be registered in PyYAML using:
yaml.SafeLoader.add_constructor("!defmacro", defmacro_constructor)
"""
value = loader.construct_mapping(node, deep=True)
params = {}
if "params" in value:
params = value["params"]
macro = MacroDefinition(
value["name"], params,
value["definition"])
macro_store[macro.name] = macro
return {}
def macro_constructor(loader, node):
"""
Function for substituting macros in yaml files.
It can be registered in PyYAML using:
yaml.SafeLoader.add_constructor("!macro", macro_constructor)
"""
res = dict()
value = loader.construct_mapping(node, deep=True)
for name, params_setter in value.items():
if name in macro_store:
# If params_setter is a list, run this for every element:
if params_setter is not None and isinstance(params_setter, list):
for el in params_setter:
macro = macro_store[name]
params = deepcopy(macro.params)
if el is not None:
if isinstance(el, dict):
params.update(el)
else:
raise RuntimeError("params type not supported")
else:
raise RuntimeError("params type must not be None")
definition = substitute_dict(macro.definition, params)
res.update(definition)
else:
# This is just a single macro:
macro = macro_store[name]
params = deepcopy(macro.params)
if params_setter is not None:
if isinstance(params_setter, dict):
params.update(params_setter)
else:
raise RuntimeError("params type not supported")
definition = substitute_dict(macro.definition, params)
res.update(definition)
else:
# If there is no macro with that name, just keep that node:
res[name] = params_setter
return res
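# Usage sketch in a cfood yaml file (macro name and parameters are
# hypothetical; the !defmacro structure follows the documentation below):
#
#   macros:
#   - !defmacro
#     name: MyMacro
#     params:
#       a: default_value
#     definition:
#       node_$a:
#         type: Directory
#         match: $a
#
#   # ... later, expand the macro, optionally overriding its params:
#   subtree: !macro
#     MyMacro:
#       a: run1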
# Getting started with the CaosDB Crawler #
## Installation ##
### Requirements ###
### How to install ###
#### Linux ####
Make sure that Python (at least version 3.8) and pip are installed, using your system tools and
documentation.
Then open a terminal and continue in the [Generic installation](#generic-installation) section.
#### Windows ####
If a Python distribution is not yet installed, we recommend Anaconda Python, which you can download
for free from [https://www.anaconda.com](https://www.anaconda.com). The "Anaconda Individual Edition" provides most of the
packages you will ever need out of the box. If you prefer, you may instead install the leaner
"Miniconda" installer, which allows you to install packages as you need them.
After installation, open an Anaconda prompt from the Windows menu and continue in the [Generic
installation](#generic-installation) section.
#### MacOS ####
If there is no Python 3 installed yet, there are two main ways to
obtain it: Either get the binary package from
[python.org](https://www.python.org/downloads/) or, for advanced
users, install via [Homebrew](https://brew.sh/). After installation
from python.org, it is recommended to also update the TLS certificates
for Python (this requires administrator rights for your user):
```sh
# Replace this with your Python version number:
cd /Applications/Python\ 3.9/
# This needs administrator rights:
sudo ./Install\ Certificates.command
```
After these steps, you may continue with the [Generic
installation](#generic-installation).
#### Generic installation ####
---
Obtain the sources from GitLab and install from there (`git` must be installed for
this option):
```sh
git clone https://gitlab.com/caosdb/caosdb-crawler
cd caosdb-crawler
pip3 install --user .
```
**Note**: In the near future, this package will also be made available on PyPI.
## Configuration ##
## Try it out ##
## Run Unit Tests ##
## Documentation ##
Build documentation in `src/doc` with `make html`.
### Requirements ###
- `sphinx`
- `sphinx-autoapi`
- `recommonmark`
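These can be installed, for example, with `pip3 install --user sphinx sphinx-autoapi recommonmark` (package names as listed above; adapt the command to your Python setup).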
### Troubleshooting ###
newcrawler
==========
.. toctree::
:maxdepth: 4
newcrawler
newcrawler.converters module
============================
.. automodule:: newcrawler.converters
:members:
:undoc-members:
:show-inheritance:
newcrawler.crawl module
=======================
.. automodule:: newcrawler.crawl
:members:
:undoc-members:
:show-inheritance:
newcrawler.identifiable\_adapters module
========================================
.. automodule:: newcrawler.identifiable_adapters
:members:
:undoc-members:
:show-inheritance:
newcrawler.identified\_cache module
===================================
.. automodule:: newcrawler.identified_cache
:members:
:undoc-members:
:show-inheritance:
newcrawler package
==================
Submodules
----------
.. toctree::
:maxdepth: 4
newcrawler.converters
newcrawler.crawl
newcrawler.identifiable_adapters
newcrawler.identified_cache
newcrawler.stores
newcrawler.structure_elements
newcrawler.utils
Module contents
---------------
.. automodule:: newcrawler
:members:
:undoc-members:
:show-inheritance:
newcrawler.stores module
========================
.. automodule:: newcrawler.stores
:members:
:undoc-members:
:show-inheritance:
newcrawler.structure\_elements module
=====================================
.. automodule:: newcrawler.structure_elements
:members:
:undoc-members:
:show-inheritance:
newcrawler.utils module
=======================
.. automodule:: newcrawler.utils
:members:
:undoc-members:
:show-inheritance:
CFood-Definition
================
The crawler specification is called CFood-definition. It is stored in a yaml file, or, more precisely, in one or two yaml documents inside a yaml file.
The specification consists of three separate parts:
#. Metadata and macro definitions
#. Custom converter registrations
#. The converter tree specification
In the simplest case, there is just one yaml file with a single document including at least
the converter tree specification (see :ref:`example 1<example_1>`). The custom converter part may also be included in
this single document (for historical reasons, see :ref:`example 2<example_2>`), but it is recommended to include it in a separate
document together with the metadata and :doc:`macro<macros>` definitions (see :ref:`below<example_4>`).
If metadata and macro definitions are provided, there **must** be a second document preceding the
converter tree specification, containing these definitions.
Examples
++++++++
A single document with a converter tree specification:
.. _example_1:
.. code-block:: yaml
extroot:
type: Directory
match: ^extroot$
subtree:
DataAnalysis:
type: Directory
match: DataAnalysis
# (...)
A single document with a converter tree specification, but also including a custom converters section:
.. _example_2:
.. code-block:: yaml
Converters:
CustomConverter_1:
package: mypackage.converters
converter: CustomConverter1
CustomConverter_2:
package: mypackage.converters
converter: CustomConverter2
extroot:
type: Directory
match: ^extroot$
subtree:
DataAnalysis:
type: Directory
match: DataAnalysis
# (...)
A yaml multi-document, defining metadata and some macros in the first document and declaring
two custom converters in the second document (**not recommended**, see the recommended version :ref:`below<example_4>`). Please note that the two separate yaml documents are separated using the ``---`` syntax:
.. _example_3:
.. code-block:: yaml
---
metadata:
name: Datascience CFood
description: CFood for data from the local data science work group
macros:
- !defmacro
name: SimulationDatasetFile
params:
match: null
recordtype: null
nodename: null
definition:
# (...)
---
Converters:
CustomConverter_1:
package: mypackage.converters
converter: CustomConverter1
CustomConverter_2:
package: mypackage.converters
converter: CustomConverter2
extroot:
type: Directory
match: ^extroot$
subtree:
DataAnalysis:
type: Directory
match: DataAnalysis
# (...)
The **recommended way** of defining metadata, custom converters, macros and the main cfood specification is shown in the following code example:
.. _example_4:
.. code-block:: yaml
---
metadata:
name: Datascience CFood
description: CFood for data from the local data science work group
macros:
- !defmacro
name: SimulationDatasetFile
params:
match: null
recordtype: null
nodename: null
definition:
# (...)
Converters:
CustomConverter_1:
package: mypackage.converters
converter: CustomConverter1
CustomConverter_2:
package: mypackage.converters
converter: CustomConverter2
---
extroot:
type: Directory
match: ^extroot$
subtree:
DataAnalysis:
type: Directory
match: DataAnalysis
# (...)
List Mode
---------
When specifying property values, two special characters can be used to automatically
create lists or multi properties instead of single values:
.. code-block:: yaml
Experiment1:
Measurement: +Measurement <- Element in List (list is cleared before run)
*Measurement <- Multi Property (properties are removed before run)
Measurement <- Overwrite
Concepts
========
Structure Elements
++++++++++++++++++
This hierarchical structure is assumed to form a tree of
StructureElements. The tree is created on the fly by so-called Converters, which
are defined in a yaml file. The tree of StructureElements is a model
of the existing data (for example, a tree of Python file objects
(StructureElements) could represent a file tree that exists on some file server).
Relevant sources in:
src/structure_elements.py
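A minimal sketch of such a tree, assuming that ``Directory`` and ``File`` take a name and a
path (constructor signatures assumed for illustration; in a real run the Converters build
the tree on the fly):

.. code-block:: python

   from newcrawler.structure_elements import Directory, File

   # Hypothetical, hand-built fragment of a StructureElement tree:
   root = Directory("extroot", "/data/extroot")
   readme = File("readme.md", "/data/extroot/readme.md")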
Converters
++++++++++
Converters treat StructureElements and thereby create the StructureElements that
are the children of the treated StructureElement. Converters therefore create
the above-named tree. The definition of a Converter also specifies which
Converters shall be used to treat the generated child StructureElements. The
definition is therefore a tree itself.
See :doc:`converters<converters>` for details.
Relevant sources in:
src/converters.py
Identifiables
+++++++++++++
Relevant sources in:
src/identifiable_adapters.py
The Crawler
+++++++++++
The crawler can be considered the main program doing the synchronization in basically two steps:
#. Based on a yaml specification, scan the file system (or other sources) and create a set of CaosDB Entities that are supposed to be inserted or updated in a CaosDB instance.
#. Compare the current state of the CaosDB instance with the set of CaosDB Entities created in step 1, taking into account the :ref:`registered identifiables<Identifiables>`. Insert or update entities accordingly.
Relevant sources in:
src/crawl.py
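A minimal sketch of starting this synchronization programmatically, based on the
``crawler_main`` signature shown in the diff above (paths are placeholders and the
import location is an assumption):

.. code-block:: python

   from newcrawler.crawl import crawler_main, SecurityMode  # import path assumed

   crawler_main(
       crawled_directory_path="/data/extroot",             # directory to scan
       cfood_file_name="cfood.yml",                        # CFood definition
       identifiables_definition_file="identifiables.yml",
       dry_run=True,                                       # do not write to CaosDB
       securityMode=SecurityMode.RETRIEVE,
       unique_names=True,
   )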
Special Cases
=============
Variable Precedence
+++++++++++++++++++
Let's assume the following situation:
.. code-block:: yaml
description:
type: DictTextElement
match_value: (?P<description>.*)
match_name: description
The variable ``$description`` could refer to either of two different variables created here:
1. The structure element path.
2. The value of the matched expression.
The matched expression does take precedence over the structure element path and shadows it.
If you want to be able to use the structure element path, make sure to give the
variables unique names, like:
.. code-block:: yaml
description_text_block:
type: DictTextElement
match_value: (?P<description>.*)
match_name: description
Scopes
========
Example:
.. code-block:: yaml
DicomFile:
type: SimpleDicomFile
match: (?P<filename>.*)\.dicom
records:
DicomRecord:
name: $filename
subtree: # header of dicom file
PatientID:
type: DicomHeaderElement
match_name: PatientName
match_value: (?P<patient>.*)
records:
Patient:
name: $patient
dicom_name: $filename # $filename is in same scope!
ExperimentFile:
type: MarkdownFile
match: ^readme.md$
records:
Experiment:
dicom_name: $filename # does NOT work, because $filename is out of scope!
# can variables be used within regexp?
File Objects
============
@@ -53,6 +53,7 @@ extensions = [
'sphinx.ext.autosectionlabel',
'sphinx.ext.intersphinx',
'sphinx.ext.napoleon', # For Google style docstrings
"recommonmark", # For markdown files.
"sphinx_rtd_theme",
]
@@ -61,7 +62,7 @@ templates_path = ['_templates']
# The suffix(es) of source filenames.
# You can specify multiple suffix as a list of string:
source_suffix = ['.rst']
source_suffix = ['.rst', '.md']
# The master toctree document.
master_doc = 'index'
@@ -71,7 +72,7 @@ master_doc = 'index'
#
# This is also used if you do content translation via gettext catalogs.
# Usually you set "language" from the command line for these cases.
language = None
language = "en"
# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
@@ -99,7 +100,7 @@ html_theme = "sphinx_rtd_theme"
# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
html_static_path = ['_static']
html_static_path = [] # ['_static']
# Custom sidebar templates, must be a dictionary that maps document names
# to template names.