Commit 88e63b9f authored by Florian Spreckelsen

Merge branch 'release-0.2.0' into 'main'

REL: RElease v0.2.0

See merge request !71
parents 692b2df7 e92e87e8
Tags v0.2.0
1 merge request !71 REL: RElease v0.2.0
Pipeline #30648 passed
Showing with 4086 additions and 298 deletions
......@@ -34,6 +34,7 @@ services:
DEBUG: 1
CAOSDB_CONFIG_AUTHTOKEN_CONFIG: "conf/core/authtoken.example.yaml"
CAOSDB_CONFIG_TRANSACTION_BENCHMARK_ENABLED: "TRUE"
CAOSDB_CONFIG__CAOSDB_INTEGRATION_TEST_SUITE_KEY: 10b128cf8a1372f30aa3697466bb55e76974e0c16a599bb44ace88f19c8f61e2
volumes:
scripting:
authtoken:
......
......@@ -3,7 +3,6 @@ src/caoscrawler.egg-info/
__pycache__
.tox
TAGS
src/.coverage
build/
*~
.pdbrc
......@@ -17,3 +16,4 @@ provenance.yml
src/doc/_apidoc/
start_caosdb_docker.sh
src/doc/_apidoc
/dist/
......@@ -113,14 +113,34 @@ info:
script:
- *env
unittest:
unittest_py3.9:
tags: [cached-dind]
image: docker:20.10
stage: test
image: $CI_REGISTRY_IMAGE
script:
- tox
unittest_py3.8:
tags: [cached-dind]
stage: test
image: python:3.8
script: &python_test_script
# install dependencies
- pip install pytest pytest-cov
# TODO: Use f-branch logic here
- pip install git+https://gitlab.indiscale.com/caosdb/src/caosdb-pylib.git@dev
- pip install git+https://gitlab.indiscale.com/caosdb/src/caosdb-advanced-user-tools.git@dev
- pip install .
# actual test
- caosdb-crawler --help
- pytest --cov=caosdb -vv ./unittests
unittest_py3.10:
tags: [cached-dind]
stage: test
image: python:3.10
script: *python_test_script
inttest:
tags: [docker]
services:
......
......@@ -5,6 +5,42 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
## [0.2.0] - 2022-11-18 ##
(Florian Spreckelsen)
### Added ###
- the -c/--add-cwd-to-path option adds the current working directory (cwd) to
  the Python path, so that, for example, custom converter modules can be placed
  in the cwd.
### Changed ###
- Converters often used in dicts (DictFloatElementConverter,
  DictIntegerElementConverter, ...) now accept other StructureElements by
  default. For example, a DictIntegerElement is accepted by default instead of a
  DictFloatElement. This behavior can be changed (see the converter documentation).
  **Note** This might lead to additional matches compared to previous versions.
- `_AbstractDictElementConverter` uses `re.DOTALL` for `match_value`
- The "fallback" parent (the name of the element in the cfood) is only used
  when the object is created, and only if no parents are given.
### Deprecated ###
### Removed ###
### Fixed ###
* [#31](https://gitlab.com/caosdb/caosdb-crawler/-/issues/31) Identified cache:
Hash is the same for Records without IDs
* [#30](https://gitlab.com/caosdb/caosdb-crawler/-/issues/30)
* [#23](https://gitlab.com/caosdb/caosdb-crawler/-/issues/23) Crawler may
overwrite and delete existing data in case of manually added properties
* [#10](https://gitlab.com/caosdb/caosdb-crawler/-/issues/10) floats can be
  interpreted as integers and vice versa; there are defaults for allowing other
  types, and this can be changed per converter
### Security ###
## [0.1.0] - 2022-10-11
(Florian Spreckelsen)
......
#!/usr/bin/env python3
# encoding: utf-8
#
# ** header v3.0
# This file is a part of the CaosDB Project.
#
# Copyright (C) 2021 Indiscale GmbH <info@indiscale.com>
......@@ -21,11 +20,10 @@
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
#
# ** end header
#
"""
module description
an integration test module that does basic integration tests
"""
from caosadvancedtools.crawler import Crawler as OldCrawler
......@@ -41,9 +39,8 @@ import pytest
from caosadvancedtools.models.parser import parse_model_from_yaml
import yaml
# TODO is not yet merged in caosadvancedtools
#from caosadvancedtools.testutils import clear_database, set_test_key
# set_test_key("10b128cf8a1372f30aa3697466bb55e76974e0c16a599bb44ace88f19c8f61e2")
from caosdb.utils.register_tests import clear_database, set_test_key
set_test_key("10b128cf8a1372f30aa3697466bb55e76974e0c16a599bb44ace88f19c8f61e2")
def rfp(*pathcomponents):
......@@ -54,11 +51,6 @@ def rfp(*pathcomponents):
return os.path.join(os.path.dirname(__file__), *pathcomponents)
@pytest.fixture
def clear_database():
db.execute_query("FIND Entity").delete()
@pytest.fixture
def usemodel():
model = parse_model_from_yaml(rfp("model.yml"))
......@@ -108,7 +100,7 @@ def crawler_extended(ident):
cr = Crawler(debug=True, identifiableAdapter=ident)
crawl_standard_test_directory(cr, cfood="scifolder_extended.yml")
# correct paths for current working directory
file_list = [r for r in cr.target_data if r.role == "File"]
file_list = [r for r in cr.crawled_data if r.role == "File"]
for f in file_list:
f.file = rfp("..", "..", "unittests", "test_directories", f.file)
return cr
......@@ -160,7 +152,7 @@ def test_insertion(clear_database, usemodel, ident, crawler):
# Do a second run on the same data, there should a new insert:
cr = Crawler(debug=True, identifiableAdapter=ident)
crawl_standard_test_directory(cr, "example_insert")
assert len(cr.target_data) == 3
assert len(cr.crawled_data) == 3
ins, ups = cr.synchronize()
assert len(ins) == 1
assert len(ups) == 0
......@@ -168,7 +160,7 @@ def test_insertion(clear_database, usemodel, ident, crawler):
# Do it again to check whether nothing is changed:
cr = Crawler(debug=True, identifiableAdapter=ident)
crawl_standard_test_directory(cr, "example_insert")
assert len(cr.target_data) == 3
assert len(cr.crawled_data) == 3
ins, ups = cr.synchronize()
assert len(ins) == 0
assert len(ups) == 0
......@@ -180,7 +172,7 @@ def test_insert_auth(clear_database, usemodel, ident, crawler):
# Do a second run on the same data, there should a new insert:
cr = Crawler(debug=True, identifiableAdapter=ident, securityMode=SecurityMode.RETRIEVE)
crawl_standard_test_directory(cr, "example_insert")
assert len(cr.target_data) == 3
assert len(cr.crawled_data) == 3
ins, ups = cr.synchronize()
assert len(ins) == 1
assert not ins[0].is_valid()
......@@ -190,7 +182,7 @@ def test_insert_auth(clear_database, usemodel, ident, crawler):
# Do it again to check whether nothing is changed:
cr = Crawler(debug=True, identifiableAdapter=ident)
crawl_standard_test_directory(cr, "example_insert")
assert len(cr.target_data) == 3
assert len(cr.crawled_data) == 3
ins, ups = cr.synchronize()
assert len(ins) == 0
assert len(ups) == 0
......@@ -205,9 +197,9 @@ def test_insertion_and_update(clear_database, usemodel, ident, crawler):
cr = Crawler(debug=True, identifiableAdapter=ident)
crawl_standard_test_directory(cr, "example_overwrite_1")
# print(cr.target_data)
# print(cr.crawled_data)
# cr.save_debug_data(rfp("provenance.yml"))
assert len(cr.target_data) == 3
assert len(cr.crawled_data) == 3
ins, ups = cr.synchronize()
assert len(ins) == 0
assert len(ups) == 1
......@@ -222,7 +214,7 @@ def test_identifiable_update(clear_database, usemodel, ident, crawler):
crawl_standard_test_directory(cr)
# Test the addition of a single property:
l = cr.target_data
l = cr.crawled_data
for record in l:
if (record.parents[0].name == "Measurement" and
record.get_property("date").value == "2020-01-03"):
......@@ -238,7 +230,7 @@ def test_identifiable_update(clear_database, usemodel, ident, crawler):
# Test the change within one property:
cr = Crawler(debug=True, identifiableAdapter=ident)
crawl_standard_test_directory(cr)
l = cr.target_data
l = cr.crawled_data
for record in l:
if (record.parents[0].name == "Measurement" and
record.get_property("date").value == "2020-01-03"):
......@@ -252,7 +244,7 @@ def test_identifiable_update(clear_database, usemodel, ident, crawler):
# Changing the date should result in a new insertion:
cr = Crawler(debug=True, identifiableAdapter=ident)
crawl_standard_test_directory(cr)
l = cr.target_data
l = cr.crawled_data
for record in l:
if (record.parents[0].name == "Measurement" and
record.get_property("date").value == "2020-01-03"):
......@@ -269,7 +261,7 @@ def test_file_insertion_dry(clear_database, usemodel, ident):
crawler_extended = Crawler(debug=True, identifiableAdapter=ident)
crawl_standard_test_directory(
crawler_extended, cfood="scifolder_extended.yml")
file_list = [r for r in crawler_extended.target_data if r.role == "File"]
file_list = [r for r in crawler_extended.crawled_data if r.role == "File"]
assert len(file_list) == 11
for f in file_list:
......@@ -305,7 +297,7 @@ def test_file_update(clear_database, usemodel, ident, crawler_extended):
cr = Crawler(debug=True, identifiableAdapter=ident)
crawl_standard_test_directory(cr, cfood="scifolder_extended.yml")
file_list = [r for r in cr.target_data if r.role == "File"]
file_list = [r for r in cr.crawled_data if r.role == "File"]
for f in file_list:
f.file = rfp("..", "..", "unittests", "test_directories", f.file)
ins2, ups2 = cr.synchronize(commit_changes=True)
......@@ -320,7 +312,7 @@ def test_file_update(clear_database, usemodel, ident, crawler_extended):
cr2 = Crawler(debug=True, identifiableAdapter=ident)
crawl_standard_test_directory(cr2, cfood="scifolder_extended2.yml")
file_list = [r for r in cr2.target_data if r.role == "File"]
file_list = [r for r in cr2.crawled_data if r.role == "File"]
for f in file_list:
f.file = rfp("..", "..", "unittests", "test_directories", f.file)
ins3, ups3 = cr2.synchronize(commit_changes=True)
......
File added
......@@ -136,7 +136,7 @@ default:
# grpc_server_port_https: 8443
# HTTP port of the grpc end-point
# grpc_server_port_http: 8080
_CAOSDB_INTEGRATION_TEST_SUITE_KEY: 10b128cf8a1372f30aa3697466bb55e76974e0c16a599bb44ace88f19c8f61e2
# Development configuration options
# devel:
# Copy the caosdb-server jar from this location into the Docker container.
......
......@@ -5,7 +5,7 @@
{
"longitude": 18.445078548041533,
"start_datetime": "2022-02-10T16:36:48+01:00",
"latitude": 53.10833068997861,
"latitude": 53,
"elevation": 2,
"location": "Bremen, Germany"
}
......
# This file is a part of the CaosDB Project.
#
# Copyright (C) 2022 Indiscale GmbH <info@indiscale.com>
# 2022 Florian Spreckelsen <f.spreckelsen@indiscale.com>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
#
from pytest import fixture, mark
import caosdb as db
from caoscrawler.crawl import Crawler
from caoscrawler.identifiable_adapters import CaosDBIdentifiableAdapter
from caoscrawler.structure_elements import Dict
from caosdb.utils.register_tests import clear_database, set_test_key
set_test_key("10b128cf8a1372f30aa3697466bb55e76974e0c16a599bb44ace88f19c8f61e2")
def test_issue_23(clear_database):
"""Test that an update leaves existing properties, that were not found by
the crawler, unchanged.
See issue https://gitlab.com/caosdb/caosdb-crawler/-/issues/23
"""
# insert a simplistic model and a record of type TestType with an identifying
# property and prop_a, but not prop_b.
prop_ident = db.Property(name="identifying_prop", datatype=db.TEXT)
prop_a = db.Property(name="prop_a", datatype=db.TEXT)
prop_b = db.Property(name="prop_b", datatype=db.TEXT)
rt = db.RecordType(name="TestType")
rec = db.Record(name="TestRec").add_parent(rt)
rec.add_property(name="identifying_prop", value="identifier")
rec.add_property(name="prop_a", value="something")
db.Container().extend([prop_ident, prop_a, prop_b, rt, rec]).insert()
# set up crawler, first cfood defining a TestType record with
# identifying_prop and prop_b, but not prop_a ...
crawler_definition = {
"DictTest": {
"type": "Dict",
"match": "(.*)",
"records": {
"TestType": {}
},
"subtree": {
"identifying_element": {
"type": "DictTextElement",
"match_name": "ident",
"match_value": "(?P<ident_value>.*)",
"records": {
"TestType": {
"identifying_prop": "$ident_value"
}
}
},
"other_element": {
"type": "DictTextElement",
"match_name": "prop_b",
"match_value": "(?P<other_value>.*)",
"records": {
"TestType": {
"prop_b": "$other_value"
}
}
}
}
}
}
# register identifiable for TestType
ident = CaosDBIdentifiableAdapter()
ident.register_identifiable("TestType", db.RecordType().add_parent(
name="TestType").add_property(name="identifying_prop"))
crawler = Crawler(debug=True, identifiableAdapter=ident)
converter_registry = crawler.load_converters(crawler_definition)
# the dictionary to be crawled...
test_dict = {
"ident": "identifier",
"prop_b": "something_else"
}
records = crawler.start_crawling(
Dict("TestDict", test_dict), crawler_definition, converter_registry)
assert len(records) == 1
rec_crawled = records[0]
assert rec_crawled.parents[0].name == "TestType"
assert rec_crawled.get_property("identifying_prop") is not None
assert rec_crawled.get_property("identifying_prop").value == "identifier"
assert rec_crawled.get_property("prop_b") is not None
assert rec_crawled.get_property("prop_b").value == "something_else"
# no interaction with the database yet, so the record shouldn't have a prop_a yet
assert rec_crawled.get_property("prop_a") is None
# synchronize with database and update the record
ins, ups = crawler.synchronize()
assert len(ins) == 0
assert len(ups) == 1
# retrieve and check that name and properties have been combined correctly
rec_retrieved = db.Record(id=rec.id).retrieve()
assert rec_retrieved.name == rec.name
assert rec_retrieved.get_property(
"identifying_prop").value == rec.get_property("identifying_prop").value
assert rec_retrieved.get_property(
"prop_a").value == rec.get_property("prop_a").value
assert rec_retrieved.get_property(
"identifying_prop").value == rec_crawled.get_property("identifying_prop").value
assert rec_retrieved.get_property(
"prop_b").value == rec_crawled.get_property("prop_b").value
......@@ -22,8 +22,9 @@
#
"""
module description
an integration test module that runs a test against a (close to) real world example
"""
from caosdb.utils.register_tests import clear_database, set_test_key
import json
import os
......@@ -36,11 +37,9 @@ from caoscrawler.structure_elements import File, JSONFile, Directory
import pytest
from caosadvancedtools.models.parser import parse_model_from_json_schema, parse_model_from_yaml
#from caosadvancedtools.testutils import clear_database, set_test_key
import sys
# TODO is not yet merged in caosadvancedtools
# set_test_key("10b128cf8a1372f30aa3697466bb55e76974e0c16a599bb44ace88f19c8f61e2")
set_test_key("10b128cf8a1372f30aa3697466bb55e76974e0c16a599bb44ace88f19c8f61e2")
def rfp(*pathcomponents):
......@@ -83,19 +82,7 @@ def clear_database():
def create_identifiable_adapter():
ident = CaosDBIdentifiableAdapter()
ident.register_identifiable("license", (
db.RecordType()
.add_parent("license")
.add_property("name")))
ident.register_identifiable("project_type", (
db.RecordType()
.add_parent("project_type")
.add_property("name")))
ident.register_identifiable("Person", (
db.RecordType()
.add_parent("Person")
.add_property("full_name")))
ident.load_from_yaml_definition(os.path.join(DATADIR, "identifiables.yml"))
return ident
......@@ -131,6 +118,7 @@ def test_dataset(clear_database, usemodel):
"") == 1
assert db.execute_query(f"COUNT RECORD with id={dataset.id} AND WHICH REFERENCES Event WITH "
"start_datetime='2022-02-10T16:36:48+01:00'") == 1
assert db.execute_query(f"FIND Event WITH latitude=53", unique=True)
def test_event_update(clear_database, usemodel):
......
......@@ -22,9 +22,6 @@
# ** end header
#
"""
module description
"""
import os
import pytest
from subprocess import run
......@@ -33,25 +30,13 @@ import caosdb as db
from caosadvancedtools.loadFiles import loadpath
from caosadvancedtools.models import parser as parser
from caoscrawler.crawl import crawler_main
from caosdb.utils.register_tests import clear_database, set_test_key
# TODO: wait for release of this feature in pylib
# from caosdb.utils.register_tests import clear_database, set_test_key
# set_test_key("10b128cf8a1372f30aa3697466bb55e76974e0c16a599bb44ace88f19c8f61e2")
set_test_key("10b128cf8a1372f30aa3697466bb55e76974e0c16a599bb44ace88f19c8f61e2")
DATADIR = os.path.join(os.path.dirname(__file__), "test_data",
"extroot", "use_case_simple_presentation")
# TODO: remove this
@pytest.fixture
def clear_database():
# TODO(fspreck): Remove once the corresponding advancedtools function can be
# used.
ents = db.execute_query("FIND ENTITY WITH ID>99")
if ents:
ents.delete()
def test_complete_crawler(
clear_database
......
[metadata]
name = caoscrawler
version = 0.1.0
version = 0.2.0
author = Alexander Schlemmer
author_email = alexander.schlemmer@ds.mpg.de
description = A new crawler for caosdb
......@@ -20,7 +20,7 @@ packages = find:
python_requires = >=3.8
install_requires =
importlib-resources
caosdb
caosdb > 0.9.0
caosadvancedtools >= 0.6.0
yaml-header-tools >= 0.2.1
pyyaml
......
......@@ -205,6 +205,10 @@ def create_records(values: GeneralStore,
# additionally add the new record to the general store:
values[name] = c_record
# add the "fallback" parent only for Records, not for Files:
if (role == "Record" and "parents" not in record):
c_record.add_parent(name)
c_record = records[name]
for key, value in record.items():
......@@ -252,12 +256,6 @@ def create_records(values: GeneralStore,
var_replaced_parent = replace_variables(parent, values)
if not has_parent(c_record, var_replaced_parent):
c_record.add_parent(var_replaced_parent)
else:
# add the "fallback" parent only for Records, not for Files:
if role == "Record":
# if not has_parent(c_record, name):
if len(c_record.parents) == 0:
c_record.add_parent(name)
return keys_modified
......@@ -369,10 +367,21 @@ class Converter(object, metaclass=ABCMeta):
@abstractmethod
def typecheck(self, element: StructureElement):
"""
Check whether the current structure element can be converted using
this converter.
"""
pass
@abstractmethod
def match(self, element: StructureElement) -> Optional[dict]:
"""
This method is used to implement detailed checks for matching compatibility
of the current structure element with this converter.
The return value is a dictionary providing possible matched variables from the
structure elements information.
"""
pass
......@@ -577,7 +586,7 @@ class JSONFileConverter(DictConverter):
def create_children(self, generalStore: GeneralStore, element: StructureElement):
if not self.typecheck(element):
raise RuntimeError("A JSON file is needed to create children")
# TODO: either add explicit time check for File structure element here,
# TODO: either add explicit type check for File structure element here,
# or add a comment to suppress mypy type warning.
with open(element.path, 'r') as json_file:
json_data = json.load(json_file)
......@@ -605,6 +614,12 @@ class JSONFileConverter(DictConverter):
class _AbstractDictElementConverter(Converter):
default_matches = {
"accept_text": False,
"accept_bool": False,
"accept_int": False,
"accept_float": False,
}
def create_children(self, generalStore: GeneralStore, element: StructureElement):
return []
......@@ -627,7 +642,7 @@ class _AbstractDictElementConverter(Converter):
m1 = re.match(self.definition["match_name"], element.name)
if m1 is None:
return None
m2 = re.match(self.definition["match_value"], str(element.value))
m2 = re.match(self.definition["match_value"], str(element.value), re.DOTALL)
if m2 is None:
return None
values = dict()
......@@ -635,25 +650,85 @@ class _AbstractDictElementConverter(Converter):
values.update(m2.groupdict())
return values
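The two-regex matching above (one pattern for the element name, one for its value, with the named groups of both matches merged) can be sketched as a standalone function. This is an illustrative sketch, not the library's API; the regexes and the key/value pair below are made up:

```python
import re

def match_pair(match_name: str, match_value: str, name, value):
    """Sketch of the dict-element matching: both regexes must match,
    and the named groups of both matches are merged into one dict."""
    m1 = re.match(match_name, name)
    if m1 is None:
        return None
    # re.DOTALL lets '.' in match_value also match newlines in multi-line values
    m2 = re.match(match_value, str(value), re.DOTALL)
    if m2 is None:
        return None
    values = dict(m1.groupdict())
    values.update(m2.groupdict())
    return values

# hypothetical key/value pair from a crawled dict:
print(match_pair(r"(?P<key>exp_.*)", r"(?P<val>\d+)", "exp_id", 42))
```

Note that the value is converted to a string before matching, which is why an integer like `42` can be matched by a text pattern.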
def _typecheck(self, element: StructureElement, allowed_matches: Dict):
"""
returns whether the type of StructureElement is accepted.
Parameters:
element: StructureElement, the element that is checked
allowed_matches: Dict, a dictionary that defines what types are allowed. It must have the
keys 'accept_text', 'accept_bool', 'accept_int', and 'accept_float'.
returns: whether or not the converter allows the type of element
"""
if (bool(allowed_matches["accept_text"]) and isinstance(element, DictTextElement)):
return True
elif (bool(allowed_matches["accept_bool"]) and isinstance(element, DictBooleanElement)):
return True
elif (bool(allowed_matches["accept_int"]) and isinstance(element, DictIntegerElement)):
return True
elif (bool(allowed_matches["accept_float"]) and isinstance(element, DictFloatElement)):
return True
else:
return False
def _merge_match_definition_with_default(self, default: Dict, definition: Dict):
"""
returns a dict with the same keys as the default dict, but with values taken
from the definition wherever it defines the same key
"""
result = {}
for key in default:
if key in definition:
result[key] = definition[key]
else:
result[key] = default[key]
return result
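A minimal standalone sketch of this merge behavior (the function name and dict contents below are illustrative, not the source's):

```python
def merge_with_default(default: dict, definition: dict) -> dict:
    """Keep every key of `default`; take the value from `definition`
    when it defines the same key, otherwise fall back to the default."""
    return {key: definition.get(key, default[key]) for key in default}

default = {"accept_text": False, "accept_int": True, "accept_float": True}
# a converter definition that only overrides one flag:
merged = merge_with_default(default, {"accept_int": False, "match_name": "foo"})
# keys not present in `default` (like "match_name") do not appear in the result
```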
class DictBooleanElementConverter(_AbstractDictElementConverter):
def typecheck(self, element: StructureElement):
return isinstance(element, DictBooleanElement)
"""
returns whether the type of StructureElement is accepted by this converter instance.
"""
allowed_matches = self._merge_match_definition_with_default(self.default_matches,
self.definition)
return self._typecheck(element, allowed_matches)
class DictBooleanElementConverter(_AbstractDictElementConverter):
default_matches = {
"accept_text": False,
"accept_bool": True,
"accept_int": True,
"accept_float": False,
}
class DictFloatElementConverter(_AbstractDictElementConverter):
def typecheck(self, element: StructureElement):
return isinstance(element, DictFloatElement)
default_matches = {
"accept_text": False,
"accept_bool": False,
"accept_int": True,
"accept_float": True,
}
class DictTextElementConverter(_AbstractDictElementConverter):
def typecheck(self, element: StructureElement):
return isinstance(element, DictTextElement)
default_matches = {
"accept_text": True,
"accept_bool": True,
"accept_int": True,
"accept_float": True,
}
class DictIntegerElementConverter(_AbstractDictElementConverter):
def typecheck(self, element: StructureElement):
return isinstance(element, DictIntegerElement)
default_matches = {
"accept_text": False,
"accept_bool": False,
"accept_int": True,
"accept_float": False,
}
class DictListElementConverter(Converter):
......
......@@ -4,8 +4,8 @@
# ** header v3.0
# This file is a part of the CaosDB Project.
#
# Copyright (C) 2021 Henrik tom Wörden
# 2021 Alexander Schlemmer
# Copyright (C) 2021-2022 Henrik tom Wörden
# 2021-2022 Alexander Schlemmer
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as
......@@ -208,18 +208,10 @@ class IdentifiableAdapter(metaclass=ABCMeta):
# TODO: how to handle missing values in identifiables
# raise an exception?
raise NotImplementedError(
f"RECORD\n{record}\nPROPERTY\n{prop.name}"
f"The following record is missing an identifying property:"
f"RECORD\n{record}\nIdentifying PROPERTY\n{prop.name}"
)
newval = record_prop.value
if isinstance(record_prop.value, db.Entity):
newval = self.resolve_reference(record_prop.value)
elif isinstance(record_prop.value, list):
newval = list()
for element in record_prop.value:
if isinstance(element, db.Entity):
newval.append(self.resolve_reference(element))
else:
newval.append(element)
record_prop_new = db.Property(name=record_prop.name,
id=record_prop.id,
description=record_prop.description,
......@@ -371,16 +363,13 @@ class LocalStorageIdentifiableAdapter(IdentifiableAdapter):
# a) prop_record.value has a registered identifiable:
# in this case, fetch the identifiable and set the value accordingly
if isinstance(prop.value, db.Entity): # lists are not checked here
registered = self.get_registered_identifiable(prop.value)
if registered is None:
raise NotImplementedError("Non-identifiable references cannot"
" be used as properties in identifiables.")
raise RuntimeError("The identifiable which is used as property"
" here has to be inserted first.")
otherid = prop_record.value
if isinstance(prop_record.value, db.Entity):
otherid = prop_record.value.id
if prop.value.id != otherid:
return False
if prop.value != prop_record.value:
elif prop.value != prop_record.value:
return False
return True
......
......@@ -23,13 +23,52 @@
# ** end header
#
"""
stores identified records and is able to detect duplicates
This module is a cache for Records whose existence in a remote server was checked using
identifiables. If the Record was found, the corresponding Record in the remote server has been
identified and the ID of the local object can be set.
To prevent querying the server again and again for the same objects, this cache allows storing
Records that were found on a remote server and those that were not (typically in separate caches).
The lookup in the cache is done using a hash of a string representation.
TODO: We need a general review:
- How are entities identified with each other?
- What happens if the identification fails?
Check how this was done in the old crawler.
"""
import caosdb as db
from hashlib import sha256
from datetime import datetime
def _value_representation(value):
"""returns the string representation of property values to be used in the hash function """
# TODO: (for review)
# This expansion of the hash function was introduced recently
# to allow the special case of Files as values of properties.
# We need to review the completeness of all the cases here, as the cache
# is crucial for correct identification of insertion and updates.
if value is None:
return "None"
elif isinstance(value, db.File):
return str(value.path)
elif isinstance(value, db.Entity):
if value.id is not None:
return str(value.id)
else:
return "PyID="+str(id(value))
elif isinstance(value, list):
return "["+", ".join([_value_representation(el) for el in value])+"]"
elif (isinstance(value, str) or isinstance(value, int) or isinstance(value, float)
or isinstance(value, datetime)):
return str(value)
else:
raise ValueError(f"Unknown datatype of the value: {value}")
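For plain (non-entity) values, the representation above reduces to simple string forms. A trimmed, caosdb-free sketch of that subset, together with how such a string would feed a sha256 cache key (function name chosen here for illustration):

```python
from datetime import datetime
from hashlib import sha256

def value_repr(value):
    """Plain-value subset of _value_representation (no caosdb entities)."""
    if value is None:
        return "None"
    if isinstance(value, list):
        # lists are represented recursively, element by element
        return "[" + ", ".join(value_repr(el) for el in value) + "]"
    if isinstance(value, (str, int, float, datetime)):
        return str(value)
    raise ValueError(f"Unknown datatype of the value: {value}")

# a stable string representation yields a stable cache key:
key = sha256(value_repr([1, 2.5, "a", None]).encode("utf-8")).hexdigest()
```

The point of the recursion is that two identifiables with equal property values always hash to the same key, while `None` and missing values stay distinguishable from the string "None" only by position in the full record string.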
def _create_hashable_string(identifiable: db.Record):
......@@ -46,28 +85,11 @@ def _create_hashable_string(identifiable: db.Record):
# sorted([p.name for p in identifiable.parents])
raise RuntimeError("Cache entry can only be generated for entities with 1 parent.")
rec_string = "P<{}>N<{}>".format(identifiable.parents[0].name, identifiable.name)
# TODO this structure neglects Properties if multiple exist for the same name
for pname in sorted([p.name for p in identifiable.properties]):
value = str(identifiable.get_property(pname).value)
# TODO: (for review)
# This expansion of the hash function was introduced recently
# to allow the special case of Files as values of properties.
# We need to review the completeness of all the cases here, as the cache
# is crucial for correct identification of insertion and updates.
if isinstance(identifiable.get_property(pname).value, db.File):
value = str(identifiable.get_property(pname).value.path)
elif isinstance(identifiable.get_property(pname).value, db.Entity):
value = str(identifiable.get_property(pname).value.id)
elif isinstance(identifiable.get_property(pname).value, list):
tmplist = []
for val in identifiable.get_property(pname).value:
if isinstance(val, db.Entity):
tmplist.append(val.id)
else:
tmplist.append(val)
value = str(tmplist)
rec_string += "{}:".format(pname) + value
rec_string += ("{}:".format(pname) +
_value_representation(identifiable.get_property(pname).value))
return rec_string
......
......@@ -33,10 +33,10 @@ copyright = '2021, MPIDS'
author = 'Alexander Schlemmer'
# The short X.Y version
version = '0.1'
version = '0.2'
# The full version, including alpha/beta/rc tags
# release = '0.5.2-rc2'
release = '0.1'
release = '0.2'
# -- General configuration ---------------------------------------------------
......
......@@ -77,12 +77,23 @@ Dict Converter
Typical Subtree converters
--------------------------
DictBooleanElementConverter
DictFloatElementConverter
DictTextElementConverter
DictIntegerElementConverter
DictListElementConverter
DictDictElementConverter
These converters expect `match_name` and `match_value` in their definition,
which allow matching the key and the value, respectively.
Note that there are defaults for accepting other types. For example,
DictFloatElementConverter also accepts DictIntegerElements. The default
behavior can be adjusted with the fields `accept_text`, `accept_int`,
`accept_float`, and `accept_bool`.
The following denotes what kind of StructureElements are accepted by default
(they are defined in `src/caoscrawler/converters.py`):
- DictBooleanElementConverter: bool, int
- DictFloatElementConverter: int, float
- DictTextElementConverter: text, bool, int, float
- DictIntegerElementConverter: int
- DictListElementConverter: list
- DictDictElementConverter: dict
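Assuming the defaults listed above should be changed for a single converter, the `accept_*` fields can be set directly in that converter's definition. A hedged yaml sketch (the converter name and regexes below are made up for illustration):

.. code-block:: yaml

   temperature:
     type: DictFloatElement
     match_name: temperature
     match_value: (?P<temp>.*)
     accept_int: false  # this converter no longer matches DictIntegerElements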
YAMLFileConverter
=================
......@@ -208,7 +219,7 @@ Now we need to create a class called "SourceResolver" in the file "sources.py".
Furthermore we will customize the method :py:meth:`~caoscrawler.converters.Converter.create_records` that allows us to specify a more complex record generation procedure than the standard implementation provides. One specific limitation of the standard implementation is that only a fixed
number of records can be generated by the yaml definition. So for any application - like here - that requires an arbitrary number of records to be created, a customized implementation of :py:meth:`~caoscrawler.converters.Converter.create_records` is recommended.
In this context it is recommended to make use of the function :func:`caoscrawler.converters.create_records` that implements creation of record objects from python dictionaries of the same structure
that would be given using a yaml definition.
that would be given using a yaml definition (see next section below).
.. code-block:: python
......@@ -307,3 +318,151 @@ The following yaml block will register the converter in a yaml file:
SourceResolver:
  package: scifolder.converters.sources
  converter: SourceResolver
Using the `create_records` API function
=======================================
The function :func:`caoscrawler.converters.create_records` was already mentioned above; it is
the recommended way to create new records from custom converters. Let's have a look at the
function signature:
.. code-block:: python
def create_records(values: GeneralStore,  # <- pass the current variables store here
                   records: RecordStore,  # <- pass the current store of CaosDB records here
                   def_records: dict):    # <- This is the actual definition of new records!
`def_records` is the actual definition of new records according to the yaml cfood specification
(work in progress, in the docs). Essentially you can do everything here that you could do
in the yaml document as well, but using python source code.
Let's have a look at a few examples:
.. code-block:: yaml
DirConverter:
  type: Directory
  match: (?P<dir_name>.*)
  records:
    Experiment:
      identifier: $dir_name
This block will just create a new record with parent `Experiment` and one property
`identifier` with a value derived from the matching regular expression.
Let's formulate that using `create_records`:
.. code-block:: python
dir_name = "directory name"

record_def = {
    "Experiment": {
        "identifier": dir_name
    }
}

keys_modified = create_records(values, records,
                               record_def)
The `dir_name` is set explicitly here; everything else is identical to the yaml statements.
The role of `keys_modified`
===========================
You have probably already noticed that :func:`caoscrawler.converters.create_records` returns
`keys_modified`, which is a list of tuples. Each element of `keys_modified` has two elements:
- Element 0 is the name of the record that is modified (as used in the record store `records`).
- Element 1 is the name of the property that is modified.
It is important that the correct list of modified keys is returned by
:py:meth:`~caoscrawler.converters.Converter.create_records` to make the crawler process work.
So, a sketch of a typical implementation within a custom converter could look like this:
.. code-block:: python
def create_records(self, values: GeneralStore,
                   records: RecordStore,
                   element: StructureElement,
                   file_path_prefix: str):
    # Modify some records:
    record_def = {
        # ...
    }

    keys_modified = create_records(values, records,
                                   record_def)

    # You can of course do it multiple times:
    keys_modified.extend(create_records(values, records,
                                        record_def))

    # You can also process the records section of the yaml definition:
    keys_modified.extend(
        super().create_records(values, records, element, file_path_prefix))
    # This essentially allows users of your converter to customize the creation of records
    # by providing a custom "records" section additionally to the modifications provided
    # in this implementation of the Converter.

    # Important: Return the list of modified keys!
    return keys_modified
More complex example
====================
Let's have a look at a more complex example, defining multiple records:
.. code-block:: yaml
DirConverter:
  type: Directory
  match: (?P<dir_name>.*)
  records:
    Project:
      identifier: project_name
    Experiment:
      identifier: $dir_name
      Project: $Project
    ProjectGroup:
      projects: +$Project
This block will create two new Records:
- A project with a constant identifier
- An experiment with an identifier, derived from a regular expression and a reference to the new project.
Furthermore a Record `ProjectGroup` will be edited (its initial definition is not given in the
yaml block): The project that was just created will be added as a list element to the property
`projects`.
Let's formulate that using `create_records` (again, `dir_name` is constant here):
.. code-block:: python
dir_name = "directory name"

record_def = {
    "Project": {
        "identifier": "project_name",
    },
    "Experiment": {
        "identifier": dir_name,
        "Project": "$Project",
    },
    "ProjectGroup": {
        "projects": "+$Project",
    },
}

keys_modified = create_records(values, records,
                               record_def)