From 78a98364f0df4f262103c7a5ed564c09334efff5 Mon Sep 17 00:00:00 2001
From: Alexander Schlemmer <alexander@mail-schlemmer.de>
Date: Mon, 10 Oct 2022 10:30:04 +0200
Subject: [PATCH 01/17] DOC: added draft for referencing json schema documents
 from rst documents with SPHINX

---
 src/doc/cfood-schema.yml        | 67 +++++++++++++++++++++++++++++++++
 src/doc/cfood-specification.rst |  6 +++
 src/doc/conf.py                 |  2 +
 src/doc/index.rst               |  1 +
 4 files changed, 76 insertions(+)
 create mode 100644 src/doc/cfood-schema.yml
 create mode 100644 src/doc/cfood-specification.rst

diff --git a/src/doc/cfood-schema.yml b/src/doc/cfood-schema.yml
new file mode 100644
index 00000000..d7b5abfd
--- /dev/null
+++ b/src/doc/cfood-schema.yml
@@ -0,0 +1,67 @@
+cfood:
+  type: object
+  additionalProperties:
+    $ref:
+      "#/$defs/converter"
+  $defs:
+    converter:
+      properties:
+        type:
+          enum:
+          - Directory
+          - File
+          - DictTextElement
+          - TextElement
+          - SimpleFile
+          - YamlFileCaosDBRecord
+          - MarkdownFile
+          - DictListElement
+          - DictDictElement
+          - DictFloatElement
+          - DictIntegerElement
+          - DictBooleanElement
+          - Definitions
+          - Dict
+          - JSONFile
+          - CSVTableConverter
+          - XLSXTableConverter
+          description: Type of this converter node.
+        match:
+          description: typically a regexp which is matched to a structure element name
+          type: string
+        match_name:
+          description: a regexp that is matched to the key of a key-value pair
+          type: string
+        match_value:
+          description: a regexp that is matched to the value of a key-value pair
+          type: string
+        records:
+          description: This field is used to define new records or to modify records which have been defined on a higher level.
+          type: object
+          properties:
+            parents:
+              description: Parents for this record are given here as a list of names.
+              type: array
+              items:
+                type: string
+            additionalProperties:
+              oneOf:
+              - type: object
+                properties:
+                  value:
+                    description: Dictionary notation for variable values. Values can be given by a variable which is indicated by an initial "$". Use "$$" for setting values actually starting with a dollar sign.
+                    type: string
+                  collection_mode:
+                    description: The collection mode defines whether the resulting property will be a single property or whether the values of multiple structure elements will be collected either into a list or a multiproperty.
+                    enum:
+                    - single
+                    - list
+                    - multiproperty
+                additionalProperties: false
+              - type: string
+                description: The short notation for values. Values can be given by a variable which is indicated by an initial "$". Use "$$" for setting values actually starting with a dollar sign. Multiproperties can be set using an initial "*" and list properties using an initial "+".
+        subtree:
+          type: object
+          additionalProperties:
+            $ref:
+              "#/$defs/converter"
diff --git a/src/doc/cfood-specification.rst b/src/doc/cfood-specification.rst
new file mode 100644
index 00000000..453921f9
--- /dev/null
+++ b/src/doc/cfood-specification.rst
@@ -0,0 +1,6 @@
+CFood-Specification
+(((((((((((((((((((
+
+
+
+.. jsonschema:: cfood-schema.yml#/cfood/$defs/converter
diff --git a/src/doc/conf.py b/src/doc/conf.py
index 30ce670e..897a94b6 100644
--- a/src/doc/conf.py
+++ b/src/doc/conf.py
@@ -55,6 +55,8 @@ extensions = [
     'sphinx.ext.napoleon',     # For Google style docstrings
     "recommonmark",            # For markdown files.
     "sphinx_rtd_theme",
+    'sphinx.ext.autodoc',
+    'sphinx-jsonschema',
 ]
 
 # Add any paths that contain templates here, relative to this directory.
diff --git a/src/doc/index.rst b/src/doc/index.rst
index 724bcc54..e72b8a00 100644
--- a/src/doc/index.rst
+++ b/src/doc/index.rst
@@ -11,6 +11,7 @@ Crawler 2.0 Documentation
    Concepts<concepts>
    Converters<converters>
    CFoods (Crawler Definitions)<cfood>
+   CFood-Specification<cfood-specification>
    Macros<macros>
    Tutorials<tutorials/index>
    API documentation<_apidoc/modules>
-- 
GitLab


From fadfde5bcd25a52058ed309bc02bdc11adc117c9 Mon Sep 17 00:00:00 2001
From: Alexander Schlemmer <alexander@mail-schlemmer.de>
Date: Fri, 27 Jan 2023 12:32:21 +0100
Subject: [PATCH 02/17] DOC: new document describing the typical crawler
 workflow

---
 src/doc/workflow.rst | 60 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 60 insertions(+)
 create mode 100644 src/doc/workflow.rst

diff --git a/src/doc/workflow.rst b/src/doc/workflow.rst
new file mode 100644
index 00000000..0ffd50ec
--- /dev/null
+++ b/src/doc/workflow.rst
@@ -0,0 +1,60 @@
+Crawler Workflow
+================
+
+The CaosDB crawler aims to provide a very flexible framework for synchronizing
+data on file systems (or potentially other sources of information) with a
+running CaosDB instance. The workflow that is used in the scientific environment
+should be choosen according to the users needs. It is also possible to combine multiple workflow or use them in parallel.
+
+In this document we will describe several workflows for crawler operation.
+
+Local Crawler Operation
+-----------------------
+
+A very simple setup that can also reliably used for testing (e.g. in local
+docker containers) sets up the crawler on a local computer. The files that
+are being crawled need to be visible to both, the local computer and the
+machine, running the CaosDB.
+
+Prerequisites
++++++++++++++
+
+- Make sure that CaosDB is running, that your computer has a network connection to CaosDB and
+  that your pycaosdb.ini is pointing to the correct instance of CaosDB. Please refer to the
+  pylib manual for questions related to the configuration in pycaosdb.ini
+  (https://docs.indiscale.com/caosdb-pylib/README_SETUP.html).
+- Make sure that caosdb-crawler and caosdb-advanced-user-tools are installed (e.g. using pip).
+- Make sure that you have created:
+  - The data model, needed for the crawler.
+  - A file "identifiables.yml" describing the identifiables.
+  - A cfood file, e.g. cfood.yml.
+
+Running the crawler
++++++++++++++++++++
+
+Running the crawler currently involves two steps:
+- Inserting the files
+- Running the crawler program
+
+Inserting the files
+)))))))))))))))))))
+
+This can be done using the module "loadFiles" from caosadvancedtools.
+(See https://docs.indiscale.com/caosdb-advanced-user-tools/ for installation.)
+
+The generic syntax is:
+
+python3 -m caosadvancedtools.loadFiles -p <prefix-in-caosdb-file-system> <path-to-crawled-folder>
+
+Important: The <path-to-crawled-folder> is the location of the files **as seen by CaosDB**, e.g. for a CaosDB instance running in a docker container (e.g. see: https://gitlab.com/caosdb/caosdb-docker) the command line could look like:
+
+python3 -m caosadvancedtools.loadFiles -p / /opt/caosdb/mnt/extroot/ExperimentalData
+
+This command line would load the folder "ExperimentalData" contained in the extroot folder within the docker container to the CaosDB-prefix "/" which is the root prefix.
+
+Running the crawler
+)))))))))))))))))))
+
+The following command line assumes that the extroot folder visible in the CaosDB docker container is located in "../extroot":
+
+caosdb-crawler -i identifiables.yml --prefix /extroot --debug --provenance=provenance.yml -s update cfood.yml ../extroot/ExperimentalData/
-- 
GitLab


From d54561189e0166f2e597c1e5ddc61358df946be5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Henrik=20tom=20W=C3=B6rden?= <h.tomwoerden@indiscale.com>
Date: Sat, 4 Feb 2023 19:26:21 +0100
Subject: [PATCH 03/17] duplication and out of date

---
 src/doc/cfood-schema.yml | 67 ----------------------------------------
 1 file changed, 67 deletions(-)
 delete mode 100644 src/doc/cfood-schema.yml

diff --git a/src/doc/cfood-schema.yml b/src/doc/cfood-schema.yml
deleted file mode 100644
index d7b5abfd..00000000
--- a/src/doc/cfood-schema.yml
+++ /dev/null
@@ -1,67 +0,0 @@
-cfood:
-  type: object
-  additionalProperties:
-    $ref:
-      "#/$defs/converter"
-  $defs:
-    converter:
-      properties:
-        type:
-          enum:
-          - Directory
-          - File
-          - DictTextElement
-          - TextElement
-          - SimpleFile
-          - YamlFileCaosDBRecord
-          - MarkdownFile
-          - DictListElement
-          - DictDictElement
-          - DictFloatElement
-          - DictIntegerElement
-          - DictBooleanElement
-          - Definitions
-          - Dict
-          - JSONFile
-          - CSVTableConverter
-          - XLSXTableConverter
-          description: Type of this converter node.
-        match:
-          description: typically a regexp which is matched to a structure element name
-          type: string
-        match_name:
-          description: a regexp that is matched to the key of a key-value pair
-          type: string
-        match_value:
-          description: a regexp that is matched to the value of a key-value pair
-          type: string
-        records:
-          description: This field is used to define new records or to modify records which have been defined on a higher level.
-          type: object
-          properties:
-            parents:
-              description: Parents for this record are given here as a list of names.
-              type: array
-              items:
-                type: string
-            additionalProperties:
-              oneOf:
-              - type: object
-                properties:
-                  value:
-                    description: Dictionary notation for variable values. Values can be given by a variable which is indicated by an initial "$". Use "$$" for setting values actually starting with a dollar sign.
-                    type: string
-                  collection_mode:
-                    description: The collection mode defines whether the resulting property will be a single property or whether the values of multiple structure elements will be collected either into a list or a multiproperty.
-                    enum:
-                    - single
-                    - list
-                    - multiproperty
-                additionalProperties: false
-              - type: string
-                description: The short notation for values. Values can be given by a variable which is indicated by an initial "$". Use "$$" for setting values actually starting with a dollar sign. Multiproperties can be set using an initial "*" and list properties using an initial "+".
-        subtree:
-          type: object
-          additionalProperties:
-            $ref:
-              "#/$defs/converter"
-- 
GitLab


From 3e0e3e82bcca0cd9c03d288e97a8247103e21864 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Henrik=20tom=20W=C3=B6rden?= <h.tomwoerden@indiscale.com>
Date: Sat, 4 Feb 2023 19:44:47 +0100
Subject: [PATCH 04/17] WIP: add explanation

---
 src/doc/cfood-schema.yml        | 1 +
 src/doc/cfood-specification.rst | 4 ++++
 2 files changed, 5 insertions(+)
 create mode 120000 src/doc/cfood-schema.yml

diff --git a/src/doc/cfood-schema.yml b/src/doc/cfood-schema.yml
new file mode 120000
index 00000000..c34280ed
--- /dev/null
+++ b/src/doc/cfood-schema.yml
@@ -0,0 +1 @@
+../caoscrawler/cfood-schema.yml
\ No newline at end of file
diff --git a/src/doc/cfood-specification.rst b/src/doc/cfood-specification.rst
index 453921f9..89588b2d 100644
--- a/src/doc/cfood-specification.rst
+++ b/src/doc/cfood-specification.rst
@@ -2,5 +2,9 @@ CFood-Specification
 (((((((((((((((((((
 
 
+CFoods are defined using a YAML file that has to abide by the following
+specification. The specification is defined using a JSON schema (see
+``src/caoscrawler/cfood-schema.yml``). A CFood is basically composed of converter
+definitions. A converter definition must have the following structure:
 
 .. jsonschema:: cfood-schema.yml#/cfood/$defs/converter
-- 
GitLab


From 7a6252ad31dbbb043d4eb63d28b6e254dc75a9c3 Mon Sep 17 00:00:00 2001
From: Florian Spreckelsen <f.spreckelsen@indiscale.com>
Date: Wed, 5 Mar 2025 14:40:31 +0100
Subject: [PATCH 05/17] REL: Begin next release cycle

---
 CHANGELOG.md    | 16 ++++++++++++++++
 setup.cfg       |  2 +-
 src/doc/conf.py |  4 ++--
 3 files changed, 19 insertions(+), 3 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index f54c2a25..8e88f71b 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -5,6 +5,22 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## [Unreleased] ##
+
+### Added ###
+
+### Changed ###
+
+### Deprecated ###
+
+### Removed ###
+
+### Fixed ###
+
+### Security ###
+
+### Documentation ###
+
 ## [0.11.0] - 2025-03-05 ##
 
 ### Added ###
diff --git a/setup.cfg b/setup.cfg
index da645c0d..f6f95d6d 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -1,6 +1,6 @@
 [metadata]
 name = caoscrawler
-version = 0.11.0
+version = 0.11.1
 author = Alexander Schlemmer
 author_email = alexander.schlemmer@ds.mpg.de
 description = A new crawler for LinkAhead
diff --git a/src/doc/conf.py b/src/doc/conf.py
index a1e9dbde..6583364e 100644
--- a/src/doc/conf.py
+++ b/src/doc/conf.py
@@ -33,10 +33,10 @@ copyright = '2024, IndiScale'
 author = 'Alexander Schlemmer'
 
 # The short X.Y version
-version = '0.11.0'
+version = '0.11.1'
 # The full version, including alpha/beta/rc tags
 # release = '0.5.2-rc2'
-release = '0.11.0'
+release = '0.11.1-dev'
 
 
 # -- General configuration ---------------------------------------------------
-- 
GitLab


From d89954bb14d716510839da2c4ed7e3a63a7a3b96 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Henrik=20tom=20W=C3=B6rden?= <h.tomwoerden@indiscale.com>
Date: Wed, 19 Mar 2025 16:02:15 +0100
Subject: [PATCH 06/17] FIX: Multiple registered identifiables error when
 multiple parents exist

---
 src/caoscrawler/identifiable_adapters.py | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/src/caoscrawler/identifiable_adapters.py b/src/caoscrawler/identifiable_adapters.py
index 6169a99e..444b73f5 100644
--- a/src/caoscrawler/identifiable_adapters.py
+++ b/src/caoscrawler/identifiable_adapters.py
@@ -672,11 +672,15 @@ class CaosDBIdentifiableAdapter(IdentifiableAdapter):
         registered = []
         for parent in rt.parents:
             prt = _retrieve_RecordType(id=parent.id, name=parent.name)
-            registered.append(self._get_registered_for_rt(prt))
+            reg = self._get_registered_for_rt(prt)
+            if reg is not None:
+                registered.append(reg)
         # TODO we might in future want to check whether the registered identifiables are the same
         if len(registered) > 1:
-            raise RuntimeError("Multiple registered identifiables found for the RecordType "
-                               f" {rt.name} with the following parents: {rt.parents}")
+            ri_names = [i.name for i in registered]
+            raise RuntimeError(f"Multiple registered identifiables found for the RecordType "
+                               f" {rt.name} with the following parents: {rt.parents}\n"
+                               f"Registered identifiables: {', '.join(ri_names)}")
         elif len(registered) == 1:
             return registered[0]
         else:
-- 
GitLab


From 58b17c0c41f8c029748d01efc05524be47966c87 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Henrik=20tom=20W=C3=B6rden?= <h.tomwoerden@indiscale.com>
Date: Wed, 19 Mar 2025 16:13:26 +0100
Subject: [PATCH 07/17] TST: add test for multiple parents

---
 unittests/test_identifiable_adapters.py | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/unittests/test_identifiable_adapters.py b/unittests/test_identifiable_adapters.py
index 5108e83c..5b6b880f 100644
--- a/unittests/test_identifiable_adapters.py
+++ b/unittests/test_identifiable_adapters.py
@@ -54,7 +54,7 @@ def mock_retrieve_RecordType(id, name):
         "Lab": db.RecordType(name="Lab"),
         "Analysis": db.RecordType(name="Analysis"),
         "MetaAnalysis": db.RecordType(name="MetaAnalysis").add_parent("Analysis"),
-        "Measurement": db.RecordType(name="Measurement").add_parent("Experiment")
+        "Measurement": db.RecordType(name="Measurement").add_parent("Experiment").add_parent("A")
     }[name]
 
 
@@ -338,3 +338,11 @@ def test_get_registered_identifiable():
     registered = ident.get_registered_identifiable(rec)
     assert registered is not None
     assert registered.parents[0].name == "Experiment"
+
+    # Same again, but with two grand parents
+    ident = CaosDBIdentifiableAdapter()
+    ident.load_from_yaml_definition(UNITTESTDIR / "example_identifiables.yml")
+    rec = db.Record().add_parent(name="Measurement")
+    registered = ident.get_registered_identifiable(rec)
+    assert registered is not None
+    assert registered.parents[0].name == "Experiment"
-- 
GitLab


From 63a01e6b8c4f082de53f57e0808806f62b949d44 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Henrik=20tom=20W=C3=B6rden?= <h.tomwoerden@indiscale.com>
Date: Wed, 19 Mar 2025 16:26:39 +0100
Subject: [PATCH 08/17] DOC: update cl

---
 CHANGELOG.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 8e88f71b..33fdff70 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -16,6 +16,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 ### Removed ###
 
 ### Fixed ###
+- A RecordType with multiple Parents no longer causes an error during
+  collection of identifiables
 
 ### Security ###
 
-- 
GitLab


From 3a5b3cf7399915ef30f0eaa1642b8eb9fe28a1c8 Mon Sep 17 00:00:00 2001
From: Florian Spreckelsen <f.spreckelsen@indiscale.com>
Date: Wed, 19 Mar 2025 17:53:01 +0100
Subject: [PATCH 09/17] TEST(identifiable): Remove unused test case

---
 unittests/test_identifiable_adapters.py | 16 ++++++----------
 1 file changed, 6 insertions(+), 10 deletions(-)

diff --git a/unittests/test_identifiable_adapters.py b/unittests/test_identifiable_adapters.py
index 5b6b880f..1c7733ac 100644
--- a/unittests/test_identifiable_adapters.py
+++ b/unittests/test_identifiable_adapters.py
@@ -54,6 +54,8 @@ def mock_retrieve_RecordType(id, name):
         "Lab": db.RecordType(name="Lab"),
         "Analysis": db.RecordType(name="Analysis"),
         "MetaAnalysis": db.RecordType(name="MetaAnalysis").add_parent("Analysis"),
+        # Test that two parents are possible; only one of them
+        # (Experiment) has an identifiable.
         "Measurement": db.RecordType(name="Measurement").add_parent("Experiment").add_parent("A")
     }[name]
 
@@ -330,16 +332,10 @@ def test_get_registered_identifiable():
     with pytest.raises(RuntimeError):
         registered = ident.get_registered_identifiable(rec)
 
-    # Test the case that the record has a parent for which no identifiable is registered
-    # and there is a registered identifiable for a grand parent
-    ident = CaosDBIdentifiableAdapter()
-    ident.load_from_yaml_definition(UNITTESTDIR / "example_identifiables.yml")
-    rec = db.Record().add_parent(name="Measurement")
-    registered = ident.get_registered_identifiable(rec)
-    assert registered is not None
-    assert registered.parents[0].name == "Experiment"
-
-    # Same again, but with two grand parents
+    # Test the case that the record has a parent for which no
+    # identifiable is registered and there is a registered
+    # identifiable for a grand parent. Note that this also tests the
+    # case of two grandparents, only one of which has an identifiable.
     ident = CaosDBIdentifiableAdapter()
     ident.load_from_yaml_definition(UNITTESTDIR / "example_identifiables.yml")
     rec = db.Record().add_parent(name="Measurement")
-- 
GitLab


From 650087410670d43994bf3b8fc2b3ca1b4d576770 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Henrik=20tom=20W=C3=B6rden?= <h.tomwoerden@indiscale.com>
Date: Fri, 21 Mar 2025 09:18:22 +0100
Subject: [PATCH 10/17] DOC: include workflow documentation

---
 src/doc/index.rst    |  1 +
 src/doc/workflow.rst | 16 ++++++++--------
 2 files changed, 9 insertions(+), 8 deletions(-)

diff --git a/src/doc/index.rst b/src/doc/index.rst
index fdb99d4d..4cf6fd8c 100644
--- a/src/doc/index.rst
+++ b/src/doc/index.rst
@@ -9,6 +9,7 @@ CaosDB-Crawler Documentation
 
    Getting started<getting_started/index>
    Tutorials<tutorials/index>
+   Workflow<workflow>
    Concepts<concepts>
    Converters<converters/index>
    CFoods (Crawler Definitions)<cfood>
diff --git a/src/doc/workflow.rst b/src/doc/workflow.rst
index 0ffd50ec..9116f85e 100644
--- a/src/doc/workflow.rst
+++ b/src/doc/workflow.rst
@@ -1,9 +1,9 @@
 Crawler Workflow
 ================
 
-The CaosDB crawler aims to provide a very flexible framework for synchronizing
+The LinkAhead crawler aims to provide a very flexible framework for synchronizing
 data on file systems (or potentially other sources of information) with a
-running CaosDB instance. The workflow that is used in the scientific environment
+running LinkAhead instance. The workflow that is used in the scientific environment
 should be choosen according to the users needs. It is also possible to combine multiple workflow or use them in parallel.
 
 In this document we will describe several workflows for crawler operation.
@@ -14,13 +14,13 @@ Local Crawler Operation
 A very simple setup that can also reliably used for testing (e.g. in local
 docker containers) sets up the crawler on a local computer. The files that
 are being crawled need to be visible to both, the local computer and the
-machine, running the CaosDB.
+machine, running the LinkAhead.
 
 Prerequisites
 +++++++++++++
 
-- Make sure that CaosDB is running, that your computer has a network connection to CaosDB and
-  that your pycaosdb.ini is pointing to the correct instance of CaosDB. Please refer to the
+- Make sure that LinkAhead is running, that your computer has a network connection to LinkAhead and
+  that your pycaosdb.ini is pointing to the correct instance of LinkAhead. Please refer to the
   pylib manual for questions related to the configuration in pycaosdb.ini
   (https://docs.indiscale.com/caosdb-pylib/README_SETUP.html).
 - Make sure that caosdb-crawler and caosdb-advanced-user-tools are installed (e.g. using pip).
@@ -46,15 +46,15 @@ The generic syntax is:
 
 python3 -m caosadvancedtools.loadFiles -p <prefix-in-caosdb-file-system> <path-to-crawled-folder>
 
-Important: The <path-to-crawled-folder> is the location of the files **as seen by CaosDB**, e.g. for a CaosDB instance running in a docker container (e.g. see: https://gitlab.com/caosdb/caosdb-docker) the command line could look like:
+Important: The <path-to-crawled-folder> is the location of the files **as seen by LinkAhead**, e.g. for a LinkAhead instance running in a docker container (e.g. see: https://gitlab.com/caosdb/caosdb-docker) the command line could look like:
 
 python3 -m caosadvancedtools.loadFiles -p / /opt/caosdb/mnt/extroot/ExperimentalData
 
-This command line would load the folder "ExperimentalData" contained in the extroot folder within the docker container to the CaosDB-prefix "/" which is the root prefix.
+This command line would load the folder "ExperimentalData" contained in the extroot folder within the docker container to the LinkAhead-prefix "/" which is the root prefix.
 
 Running the crawler
 )))))))))))))))))))
 
-The following command line assumes that the extroot folder visible in the CaosDB docker container is located in "../extroot":
+The following command line assumes that the extroot folder visible in the LinkAhead docker container is located in "../extroot":
 
 caosdb-crawler -i identifiables.yml --prefix /extroot --debug --provenance=provenance.yml -s update cfood.yml ../extroot/ExperimentalData/
-- 
GitLab


From b3669164a7ca8b55220bc83c5335284022223d2e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Henrik=20tom=20W=C3=B6rden?= <h.tomwoerden@indiscale.com>
Date: Fri, 21 Mar 2025 09:22:53 +0100
Subject: [PATCH 11/17] DOC: minor rephrasing

---
 src/doc/workflow.rst | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/src/doc/workflow.rst b/src/doc/workflow.rst
index 9116f85e..b8d48f1a 100644
--- a/src/doc/workflow.rst
+++ b/src/doc/workflow.rst
@@ -4,17 +4,18 @@ Crawler Workflow
 The LinkAhead crawler aims to provide a very flexible framework for synchronizing
 data on file systems (or potentially other sources of information) with a
 running LinkAhead instance. The workflow that is used in the scientific environment
-should be choosen according to the users needs. It is also possible to combine multiple workflow or use them in parallel.
+should be chosen according to the users' needs. It is also possible to combine
+multiple workflows or use them in parallel.
 
 In this document we will describe several workflows for crawler operation.
 
 Local Crawler Operation
 -----------------------
 
-A very simple setup that can also reliably used for testing (e.g. in local
-docker containers) sets up the crawler on a local computer. The files that
-are being crawled need to be visible to both, the local computer and the
-machine, running the LinkAhead.
+A very simple setup that can also reliably be used for testing
+sets up the crawler on a local computer. The files that
+are being crawled need to be visible to both the locally running crawler and
+the LinkAhead server.
 
 Prerequisites
 +++++++++++++
@@ -58,3 +59,7 @@ Running the crawler
 The following command line assumes that the extroot folder visible in the LinkAhead docker container is located in "../extroot":
 
 caosdb-crawler -i identifiables.yml --prefix /extroot --debug --provenance=provenance.yml -s update cfood.yml ../extroot/ExperimentalData/
+
+Server Side Crawler Operation
+-----------------------------
+To be filled.
-- 
GitLab


From ad27ac01f13bda0a0d20f139c449cf56eab40240 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Henrik=20tom=20W=C3=B6rden?= <h.tomwoerden@indiscale.com>
Date: Fri, 21 Mar 2025 15:58:30 +0100
Subject: [PATCH 12/17] DOC: CaosDB->LinkAhead

---
 README_SETUP.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README_SETUP.md b/README_SETUP.md
index 32f0bb89..be38b727 100644
--- a/README_SETUP.md
+++ b/README_SETUP.md
@@ -1,4 +1,4 @@
-# Getting started with the CaosDB Crawler #
+# Getting started with the LinkAhead Crawler #
 
 ## Installation
 see INSTALL.md
-- 
GitLab


From 4e021ace318cdd8528b581ccc97c2881dbee2c57 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Henrik=20tom=20W=C3=B6rden?= <h.tomwoerden@indiscale.com>
Date: Fri, 21 Mar 2025 16:06:07 +0100
Subject: [PATCH 13/17] REFACTOR: use more cached queries

---
 src/caoscrawler/crawl.py | 11 ++++-------
 1 file changed, 4 insertions(+), 7 deletions(-)

diff --git a/src/caoscrawler/crawl.py b/src/caoscrawler/crawl.py
index e0d24397..3d1e9f75 100644
--- a/src/caoscrawler/crawl.py
+++ b/src/caoscrawler/crawl.py
@@ -873,15 +873,12 @@ def _update_status_record(run_id, n_inserts, n_updates, status):
         cr_rec = db.execute_query(f"FIND RECORD CrawlerRun WITH run_id={run_id}", unique=True)
         cr_rec.get_property('status').value = status
         (cr_rec
-            .add_property(db.execute_query(
-                "FIND Property with name='number_of_inserted_entities'", unique=True).id,
-                n_inserts)
             .add_property(
-                db.execute_query("FIND Property with name='number_of_updated_entities'",
-                                 unique=True).id, n_updates)
+                cached_get_entity_by(name='number_of_inserted_entities').id, n_inserts)
             .add_property(
-                db.execute_query("FIND Property with name='finished'",
-                                 unique=True).id, datetime.now().isoformat()))
+                cached_get_entity_by(name='number_of_updated_entities').id, n_updates)
+            .add_property(
+                cached_get_entity_by(name='finished').id, datetime.now().isoformat()))
         cr_rec.update()
 
 
-- 
GitLab


From 3c32b1f0222621b7bfd173b5a6f4c7b195974bea Mon Sep 17 00:00:00 2001
From: Florian Spreckelsen <f.spreckelsen@indiscale.com>
Date: Mon, 24 Mar 2025 14:58:49 +0100
Subject: [PATCH 14/17] FEAT(scanner): Auto-generate converter_registry and
 registered_transformer_functions

---
 CHANGELOG.md               |  6 ++++++
 src/caoscrawler/scanner.py | 15 ++++++++++++++-
 2 files changed, 20 insertions(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 33fdff70..6d15568e 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -11,11 +11,17 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ### Changed ###
 
+- `scanner.scan_structure_elements` now auto-generates the
+  `converter_registry` and the `registered_transformer_functions` from
+  the `crawler_definition` if none are given. Therefore, the
+  `converter_registry` argument is now optional.
+
 ### Deprecated ###
 
 ### Removed ###
 
 ### Fixed ###
+
 - A RecordType with multiple Parents no longer causes an error during
   collection of identifiables
 
diff --git a/src/caoscrawler/scanner.py b/src/caoscrawler/scanner.py
index af1f4173..cb8dfdf2 100644
--- a/src/caoscrawler/scanner.py
+++ b/src/caoscrawler/scanner.py
@@ -486,7 +486,7 @@ def scan_directory(dirname: Union[str, list[str]], crawler_definition_path: str,
 
 def scan_structure_elements(items: Union[list[StructureElement], StructureElement],
                             crawler_definition: dict,
-                            converter_registry: dict,
+                            converter_registry: Optional[dict] = None,
                             restricted_path: Optional[list[str]] = None,
                             debug_tree: Optional[DebugTree] = None,
                             registered_transformer_functions: Optional[dict] = None) -> (
@@ -508,6 +508,15 @@ def scan_structure_elements(items: Union[list[StructureElement], StructureElemen
          Traverse the data tree only along the given path. When the end of the
          given path is reached, traverse the full tree as normal. See docstring
          of 'scanner' for more details.
+    converter_registry: dict, optional
+         Optional dictionary containing the converter definitions
+         needed for the crawler definition. If none is given, it will
+         be generated from the `crawler_definition`. Default is None.
+    registered_transformer_functions: dict, optional
+         Optional dictionary containing the transformer function
+         definitions needed for the crawler definition. If none is
+         given, it will be generated from the
+         `crawler_definition`. Default is None.
 
     Returns
     -------
@@ -519,6 +528,10 @@ def scan_structure_elements(items: Union[list[StructureElement], StructureElemen
     if not isinstance(items, list):
         items = [items]
 
+    if converter_registry is None:
+        converter_registry = create_converter_registry(crawler_definition)
+    if registered_transformer_functions is None:
+        registered_transformer_functions = create_transformer_registry(crawler_definition)
     # TODO: needs to be covered somewhere else
     # self.run_id = uuid.uuid1()
     converters = initialize_converters(crawler_definition, converter_registry)
-- 
GitLab


From e9e6bb4523dfa2e61de87fb68a58813a4c5cf107 Mon Sep 17 00:00:00 2001
From: Florian Spreckelsen <f.spreckelsen@indiscale.com>
Date: Tue, 25 Mar 2025 09:46:18 +0100
Subject: [PATCH 15/17] TEST(scanner): Extend unit test for new behavior of
 scan_structure_elements

---
 unittests/test_scanner.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/unittests/test_scanner.py b/unittests/test_scanner.py
index c531f66f..80c7b3e9 100644
--- a/unittests/test_scanner.py
+++ b/unittests/test_scanner.py
@@ -61,6 +61,12 @@ def test_scan_structure_elements():
                                    converter_registry)
     assert len(recs) == 4
 
+    # When no converter registry is passed, scan_structure_elements
+    # generates it automatically from the crawler definition.
+    recs_without_converter_registry = scan_structure_elements(DictElement(name="", value=data),
+                                                              crawler_definition)
+    assert len(recs) == len(recs_without_converter_registry)
+
 
 def test_provenance_debug_data():
     # TODO rewrite the test to use a smaller example setup
-- 
GitLab


From c7ceb3a3e2fa54cf84fc00acdfdd919ab4f3abe8 Mon Sep 17 00:00:00 2001
From: Florian Spreckelsen <f.spreckelsen@indiscale.com>
Date: Tue, 25 Mar 2025 14:58:42 +0100
Subject: [PATCH 16/17] BUILD: Increase version numbers for release 0.12.0

---
 CHANGELOG.md    | 12 +-----------
 CITATION.cff    |  4 ++--
 setup.cfg       |  2 +-
 src/doc/conf.py |  4 ++--
 4 files changed, 6 insertions(+), 16 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 6d15568e..ddd0bbe3 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -5,9 +5,7 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
-## [Unreleased] ##
-
-### Added ###
+## [0.12.0] - 2025-03-25 ##
 
 ### Changed ###
 
@@ -16,19 +14,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
   the `crawler_definition` if none are given. Therefore, the
   `converter_registry` argument is now optional.
 
-### Deprecated ###
-
-### Removed ###
-
 ### Fixed ###
 
 - A RecordType with multiple Parents no longer causes an error during
   collection of identifiables
 
-### Security ###
-
-### Documentation ###
-
 ## [0.11.0] - 2025-03-05 ##
 
 ### Added ###
diff --git a/CITATION.cff b/CITATION.cff
index 8f4e22a4..2a883a07 100644
--- a/CITATION.cff
+++ b/CITATION.cff
@@ -17,6 +17,6 @@ authors:
     given-names: Alexander
     orcid: https://orcid.org/0000-0003-4124-9649
 title: CaosDB - Crawler
-version: 0.11.0
+version: 0.12.0
 doi: 10.3390/data9020024
-date-released: 2025-03-05
\ No newline at end of file
+date-released: 2025-03-25
\ No newline at end of file
diff --git a/setup.cfg b/setup.cfg
index f6f95d6d..f7c6df7b 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -1,6 +1,6 @@
 [metadata]
 name = caoscrawler
-version = 0.11.1
+version = 0.12.0
 author = Alexander Schlemmer
 author_email = alexander.schlemmer@ds.mpg.de
 description = A new crawler for LinkAhead
diff --git a/src/doc/conf.py b/src/doc/conf.py
index 2a783dec..85de34ab 100644
--- a/src/doc/conf.py
+++ b/src/doc/conf.py
@@ -33,10 +33,10 @@ copyright = '2024, IndiScale'
 author = 'Alexander Schlemmer'
 
 # The short X.Y version
-version = '0.11.1'
+version = '0.12.0'
 # The full version, including alpha/beta/rc tags
 # release = '0.5.2-rc2'
-release = '0.11.1-dev'
+release = '0.12.0'
 
 
 # -- General configuration ---------------------------------------------------
-- 
GitLab


From 1e3daf77d0cda76d236eb1e0e33c69253220f6a6 Mon Sep 17 00:00:00 2001
From: Florian Spreckelsen <f.spreckelsen@indiscale.com>
Date: Tue, 25 Mar 2025 15:08:22 +0100
Subject: [PATCH 17/17] CI: Add missing sphinx dependency

---
 .docker/Dockerfile | 1 +
 README_SETUP.md    | 1 +
 2 files changed, 2 insertions(+)

diff --git a/.docker/Dockerfile b/.docker/Dockerfile
index 14c3c1ef..ab4c5ca9 100644
--- a/.docker/Dockerfile
+++ b/.docker/Dockerfile
@@ -37,6 +37,7 @@ RUN pip3 install --break-system-packages \
   pylint \
   recommonmark \
   sphinx-rtd-theme \
+  sphinx-jsonschema \
   ;
 COPY .docker/wait-for-it.sh /wait-for-it.sh
 ARG PYLIB
diff --git a/README_SETUP.md b/README_SETUP.md
index be38b727..3ad25de7 100644
--- a/README_SETUP.md
+++ b/README_SETUP.md
@@ -24,6 +24,7 @@ dependencies, i.e., `pip install .[h5-crawler,spss]`.
 - `sphinx-autoapi`
 - `recommonmark`
 - `sphinx-rtd-theme`
+- `sphinx-jsonschema`
 
 ### How to contribute ###
 
-- 
GitLab