Commit f200440c authored by Alexander Schlemmer's avatar Alexander Schlemmer

Merge branch 'dev' into f-new-debug-tree

parents 4c925e19 96ae0ada
Pipeline #59484 failed
Showing changed files with 2609 additions and 1120 deletions
@@ -18,6 +18,8 @@
 # along with this program. If not, see <https://www.gnu.org/licenses/>.
 #
+from typing import Optional
+
 try:
     import h5py
 except ModuleNotFoundError:
@@ -26,16 +28,16 @@ except ModuleNotFoundError:
         "its optional `h5-crawler` dependency?"
     )
+import numpy as np
 from typing import Union
 import linkahead as db
-import numpy as np
-from .converters import (convert_basic_element, Converter, DictElementConverter,
-                         match_name_and_value, SimpleFileConverter)
-from .stores import GeneralStore, RecordStore
-from .structure_elements import DictElement, File, FloatElement, IntegerElement, StructureElement
+from ..stores import GeneralStore, RecordStore
+from ..structure_elements import (DictElement, File, FloatElement,
+                                  IntegerElement, StructureElement)
+from .converters import (Converter, DictElementConverter, SimpleFileConverter,
+                         convert_basic_element, match_name_and_value)

 def convert_attributes(elt: Union[h5py.File, h5py.Group, h5py.Dataset]):
@@ -94,8 +96,8 @@ def convert_h5_element(elt: Union[h5py.Group, h5py.Dataset], name: str):
     raise ValueError("The given element must be either a HDF5 Group or Dataset object.")

-def convert_basic_element_with_nd_array(value, name: str = None,
-                                        internal_path: str = None, msg_prefix: str = ""):
+def convert_basic_element_with_nd_array(value, name: Optional[str] = None,
+                                        internal_path: Optional[str] = None, msg_prefix: str = ""):
     """Convert a given object either to an ndarray structure element or to a
     basic scalar structure element.
...
# encoding: utf-8
#
# This file is a part of the LinkAhead Project.
#
# Copyright (C) 2024 Indiscale GmbH <info@indiscale.com>
# Copyright (C) 2024 Alexander Schlemmer
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
"""Converters take structure elements and create Records and new structure elements from them.
This converter converts ro-crate files which may also be .eln-files.
"""
from __future__ import annotations
import os
import re
import tempfile
from typing import Optional
from zipfile import ZipFile
import rocrate
from rocrate.rocrate import ROCrate
from ..stores import GeneralStore
from ..structure_elements import (Directory, File, ROCrateEntity,
StructureElement)
from .converters import Converter, SimpleFileConverter, convert_basic_element
class ROCrateConverter(SimpleFileConverter):
"""Convert ro-crate files / directories.
"""
def setup(self):
self._tempdir = None
def cleanup(self):
self._tempdir.cleanup()
def typecheck(self, element: StructureElement):
"""
Check whether the current structure element can be converted using
this converter.
"""
return isinstance(element, File) or isinstance(element, Directory)
def match(self, element: StructureElement) -> Optional[dict]:
m = re.match(self.definition["match"], element.name)
if m is None:
return None
return m.groupdict()
def create_children(self, generalStore: GeneralStore, element: StructureElement):
"""
Loads a ROCrate from a rocrate file or directory.
Arguments:
----------
element must be a File or Directory (structure element).
Returns:
--------
A list of ROCrateEntity objects representing the contents of the file, or None
in case of errors.
"""
if isinstance(element, File):
self._tempdir = tempfile.TemporaryDirectory()
with ZipFile(element.path) as zipf:
zipf.extractall(self._tempdir.name)
crate_path = self._tempdir.name
crate = ROCrate(crate_path)
entity_ls = []
for ent in crate.get_entities():
entity_ls.append(ROCrateEntity(crate_path, ent))
return entity_ls
elif isinstance(element, Directory):
# This would be an unzipped .eln file
# As this is possible for rocrate files, I think it is reasonable
# to support it as well.
raise NotImplementedError()
else:
raise ValueError("create_children was called with wrong type of StructureElement")
return None
class ELNFileConverter(ROCrateConverter):
"""Convert .eln-Files
See: https://github.com/TheELNConsortium/TheELNFileFormat
These files are basically RO-Crates with some minor differences:
- The ro-crate metadata file is not on top-level within the .eln-zip-container,
but in a top-level subdirectory.
"""
def create_children(self, generalStore: GeneralStore, element: StructureElement):
"""
Loads an ROCrate from an .eln-file or directory.
This involves unzipping the .eln-file to a temporary folder and creating an ROCrate object
from its contents.
Arguments:
----------
element must be a File or Directory (structure element).
Returns:
--------
A list of ROCrateEntity objects representing the contents of the .eln-file, or None
in case of errors.
"""
if isinstance(element, File):
self._tempdir = tempfile.TemporaryDirectory()
with ZipFile(element.path) as zipf:
zipf.extractall(self._tempdir.name)
cratep = os.listdir(self._tempdir.name)
if len(cratep) != 1:
raise RuntimeError(".eln file must contain exactly one folder")
crate_path = os.path.join(self._tempdir.name, cratep[0])
crate = ROCrate(crate_path)
entity_ls = []
for ent in crate.get_entities():
entity_ls.append(ROCrateEntity(crate_path, ent))
return entity_ls
elif isinstance(element, Directory):
# This would be an unzipped .eln file
# As this is possible for rocrate files, I think it is reasonable
# to support it as well.
raise NotImplementedError()
else:
raise ValueError("create_children was called with wrong type of StructureElement")
return None
class ROCrateEntityConverter(Converter):
def typecheck(self, element: StructureElement):
"""
Check whether the current structure element can be converted using
this converter.
"""
return isinstance(element, ROCrateEntity)
def match(self, element: StructureElement) -> Optional[dict]:
# See https://gitlab.indiscale.com/caosdb/src/caosdb-crawler/-/issues/145
# for a suggestion for the design of the matching algorithm.
if not isinstance(element, ROCrateEntity):
raise TypeError("Element must be an instance of ROCrateEntity.")
# Store the result of all individual regexp variable results:
vardict = {}
# TODO: I accidentally used "match_type" instead
# of "match_entity_type". This was completely
# unnoticed. So add it to schema and adapt tests.
if "match_entity_type" in self.definition:
entity_type = element.entity.type
if isinstance(entity_type, list):
# TODO: this seems to be a bug in kadi4mat RO-Crates
# ./ has type ['Dataset']
# instead of type 'Dataset'
entity_type = entity_type[0]
m_type = re.match(self.definition["match_entity_type"], entity_type)
if m_type is None:
return None
vardict.update(m_type.groupdict())
if not self.match_properties(element.entity.properties(), vardict):
return None
return vardict
def create_children(self, generalStore: GeneralStore, element: StructureElement):
children = []
eprops = element.entity.properties()
# Add the properties:
for name, value in eprops.items():
children.append(convert_basic_element(value, name))
# Add the files:
if isinstance(element.entity, rocrate.model.file.File):
path, name = os.path.split(eprops["@id"])
children.append(File(name, os.path.join(element.folder, path, name)))
# Parts of this entity are added as child entities:
if "hasPart" in eprops:
for p in eprops["hasPart"]:
children.append(
ROCrateEntity(element.folder, element.entity.crate.dereference(
p["@id"])))
return children
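How these converters combine is easiest to see in a cfood fragment. The following is a minimal sketch; it assumes the converters are registered under the names ``ELNFile`` and ``ROCrateEntity`` (the registration is not shown in this diff), and the record and variable names are hypothetical:

# Hypothetical cfood fragment: crawl an .eln export and create a Record
# for every Dataset entity in the contained RO-Crate.
ELNExport:
  type: ELNFile                     # assumed registry name for ELNFileConverter
  match: ^.*\.eln$
  subtree:
    Dataset:
      type: ROCrateEntity           # assumed registry name for ROCrateEntityConverter
      match_entity_type: ^Dataset$  # regexp matched against the entity's @type
      match_properties:
        "@id": (?P<dataset_id>.*)
      records:
        DatasetRecord:              # hypothetical RecordType
          identifier: $dataset_id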
# This file is a part of the LinkAhead Project.
#
# Copyright (C) 2024 IndiScale GmbH <info@indiscale.com>
# Copyright (C) 2024 Daniel Hornung <d.hornung@indiscale.com>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
"""Converter for SAV files (stored by SPSS)."""
from __future__ import annotations # Can be removed with 3.10.
import argparse
from collections import OrderedDict
from typing import Any, Optional
import numpy as np
import pandas as pd
import pyreadstat
import yaml
from ..stores import GeneralStore
from ..structure_elements import File, StructureElement
from . import converters
READSTAT_TYPES = {
"double": "DOUBLE",
"string": "TEXT",
}
ORIGINAL_TYPES = {
"EDATE8": "DATETIME",
}
class SPSSConverter(converters.TableConverter):
"""Converter for SAV files (stored by SPSS)."""
def create_children(self, values: GeneralStore, element: StructureElement) -> list:
assert isinstance(element, File)
# The default dtype backend "numpy_nullable" does not handle dates well.
# Note that pandas.ArrowDtype is considered experimental (in Pandas 2.2).
df = pd.io.spss.read_spss(element.path, dtype_backend="pyarrow")
dtypes = read_column_types(element.path)
# Fix datetime columns
for name, dtype in dtypes.items():
if dtype != "DATETIME":
continue
col = df.loc[:, name]
col.fillna(np.nan, inplace=True)
col.replace([np.nan], [None], inplace=True)
return self._children_from_dataframe(df)
def read_column_types(savfile: Optional[str] = None, meta: Optional[Any] = None) -> dict[str, str]:
"""Read SAV file and return the column types.
Optionally, take data from a previous reading.
Parameters
----------
savfile : Optional[str]
The SAV file to read.
meta : Optional
The meta data result from `pyreadstat.read_sav(...)`.
Returns
-------
out : dict[str, str]
The column names and types.
"""
if not meta:
_, meta = pyreadstat.read_sav(savfile, metadataonly=True)
elif savfile is not None:
raise ValueError("Only one of `savfile` and `meta` must be given.")
dtypes: dict[str, str] = {}
for name in meta.column_names:
datatype = ORIGINAL_TYPES.get(meta.original_variable_types[name],
READSTAT_TYPES[meta.readstat_variable_types[name]])
dtypes[name] = datatype
return dtypes
def spss_to_yaml(savfile: str, yamlfile: str, cfood: Optional[str] = None) -> None:
"""Parse the *.sav and create basic datamodel in ``yamlfile``.
Parameters
----------
cfood: str
If given, also create a cfood skeleton.
"""
_, meta = pyreadstat.read_sav(savfile, metadataonly=True)
dtypes = read_column_types(meta=meta)
cfood_str = """
---
metadata:
macros:
- !defmacro
# Simple column value -> property rule
name: ColumnValue
params:
name: null
belongsto: BaseElement
type: TextElement
definition:
${name}:
type: ${type}
match_name: ^${name}$$
match_value: (?P<val>.*)
records:
${belongsto}:
${name}: $$val
- !defmacro
# column value -> reference property
name: ColumnValueReference
params:
name: null
reftype: null # RecordType of the reference
belongsto: BaseElement
type: TextElement # References are always text, right?
definition:
${name}:
type: ${type}
match_name: ^${name}$$
match_value: (?P<val>.*)
records:
${reftype}:
name: $$val
${belongsto}:
${name}: $$${reftype}
- !defmacro
# Same as "ColumnValue", but also give name of property.
name: ColumnValuePropname
params:
name: null
propname: null
belongsto: BaseElement
type: TextElement
definition:
${name}:
type: ${type}
match_name: ^${name}$$
match_value: (?P<val>.*)
records:
${belongsto}:
${propname}: $$val
---
directory: # corresponds to the directory given to the crawler
type: Directory
match: .* # we do not care how it is named here
subtree:
# This is the file
thisfile:
type: SPSSFile
match: ".*sav"
subtree:
entry:
type: Dict
match: .* # Name is irrelevant
records:
MyParent:
subtree: !macro
"""
enums: dict[str, list[str]] = {}
properties = OrderedDict()
for name in meta.column_names:
prop = {
"datatype": dtypes[name],
}
desc = meta.column_names_to_labels.get(name)
if desc and desc != name:
prop["description"] = desc
# Handle categorial variables
if var_label := meta.variable_to_label.get(name):
vvl = meta.variable_value_labels[name]
# reproducible (and sensible) order
label_values = [vvl[key] for key in sorted(vvl.keys())]
if label_values not in enums.values():
enums[var_label] = label_values
else:
var_label = [key for key, value in enums.items() if value == label_values][0]
prop["datatype"] = var_label
properties[name] = prop
output = f"""# auto-generated data model from file "{savfile}".
# To insert a datamodel into LinkAhead, run:
#
# python3 -m caosadvancedtools.models.parser datamodel.yaml --sync
"""
# Actual datamodel
output += """
#########
# Enums #
#########
"""
for name, values in enums.items():
output += f"""{name}:
description:
# possible values: {values}\n"""
output += ("""
###############
# RecordTypes #
###############
DummyRT:
description: "Note: Change name and enter description."
recommended_properties:
"""
+ " ".join(yaml.dump(dict(properties), # from OrderedDict to dict
allow_unicode=True,
sort_keys=False).splitlines(keepends=True)))
# Experimental: Enum creation
output += """
###############
# Enum values #
###############
"""
for name, values in enums.items():
output += f"\n# ### {name} ###\n"
for value in values:
output += f"""
{value}:
role: Record
inherit_from_suggested:
- {name}
"""
with open(yamlfile, encoding="utf-8", mode="w") as myfile:
myfile.write(output)
if cfood:
defs_col_value: list[str] = []
defs_col_value_ref: list[str] = []
prefix = " " * 14
for name, propdef in properties.items():
def_str = prefix + f"- name: {name}\n"
dtype = None
reftype = None
defs = defs_col_value
# Which type?
if propdef["datatype"] == "DOUBLE":
dtype = "FloatElement"
elif propdef["datatype"] in ("TEXT", "DATETIME"):
dtype = None
else:
reftype = propdef["datatype"]
defs = defs_col_value_ref
# Append according to types:
if reftype:
def_str += prefix + f" reftype: {reftype}\n"
if dtype:
def_str += prefix + f" type: {dtype}\n"
# Store result
defs.append(def_str)
del defs
cfood_str += (prefix[2:] + "ColumnValue:\n" + "".join(defs_col_value)
+ prefix[2:] + "ColumnValueReference:\n" + "".join(defs_col_value_ref)
)
with open(cfood, encoding="utf-8", mode="w") as myfile:
myfile.write(cfood_str)
def _parse_arguments():
"""Parse the arguments."""
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument('-i', '--input', help="The *.sav file.", required=True)
parser.add_argument('-o', '--outfile', help="Yaml filename to save the result", required=True)
parser.add_argument('--cfood', help="Yaml filename to create cfood output in", required=False)
return parser.parse_args()
def spss_to_datamodel_main():
"""The main function of this script."""
args = _parse_arguments()
spss_to_yaml(savfile=args.input, yamlfile=args.outfile, cfood=args.cfood)
print(f"Written datamodel to: {args.outfile}")
if args.cfood:
print(f"Written cfood to: {args.cfood}")
# encoding: utf-8
#
# This file is a part of the LinkAhead Project.
#
# Copyright (C) 2024 Indiscale GmbH <info@indiscale.com>
# Copyright (C) 2024 Alexander Schlemmer
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
"""Converters take structure elements and create Records and new structure elements from them."""
from __future__ import annotations
import re
from typing import Optional
import lxml.etree
from ..stores import GeneralStore
from ..structure_elements import (File, StructureElement, XMLAttributeNode,
XMLTagElement, XMLTextNode)
from .converters import (Converter, ConverterValidationError,
SimpleFileConverter)
class XMLFileConverter(SimpleFileConverter):
"""Convert XML files. See
https://gitlab.indiscale.com/caosdb/src/caosdb-crawler/-/issues/145
for the current suggestion for the specification.
"""
def create_children(self, generalStore: GeneralStore, element: StructureElement):
# TODO: See comment on types and inheritance
if not isinstance(element, File):
raise ValueError("create_children was called with wrong type of StructureElement")
with open(element.path, 'r') as xml_file:
xml = lxml.etree.parse(xml_file)
if "validate" in self.definition and self.definition["validate"]:
try:
raise NotImplementedError("XML validation not implemented yet.")
except ConverterValidationError as err:
raise ConverterValidationError(
"Error during the validation of the XML file:\n"
f"{element.path}\n" + err.message)
return [XMLTagElement(xml.getroot())]
class XMLTagConverter(Converter):
def create_children(self, generalStore: GeneralStore, element: StructureElement):
"""Children that are generated by this function are the
result of the xpath query given in the yaml property
``xpath``. Its default (when not given) is ``child::*``, so the
direct children of the current xml node. The xpath expression
must be designed in a way that it returns xml tags (and no
attributes or texts). That means, that the axis ``attribute::``
and the function ``text()`` must not be used.
The following yaml properties can be used to generate other
types of nodes (text nodes and attribute nodes) as subtree
structure elements:
::
# _*_ marks the default:
attribs_as_children: true # true / _false_
text_as_children: true # true / _false_
tags_as_children: true # _true_ / false
The default is to generate the tags matched by the xpath expression only.
- When text_as_children is set to true, text nodes will be generated that contain the text
contained in the matched tags.
- When attribs_as_children is set to true, attribute nodes will be generated from the attributes
of the matched tags.
Notes
-----
The default is to take the namespace map from the current node and use it in xpath queries.
Because default namespaces cannot be handled by xpath, it is possible to remap the default namespace
using the key ``default_namespace``.
The key ``nsmap`` can be used to define additional nsmap entries.
"""
if not isinstance(element, XMLTagElement):
raise TypeError("Element must be an instance of XMLTagElement.")
# Get the namespace map from the element:
nsmap = element.tag.nsmap
# The default name of the default namespace is "default".
# You can overwrite it using the attribute "default_namespace" in the converter definition:
default_namespace = self.definition.get("default_namespace", "default")
if None in nsmap:
nsmap[default_namespace] = nsmap[None]
del nsmap[None]
# Set additional nsmap entries from the converter definition:
if "nsmap" in self.definition:
for key, value in self.definition["nsmap"].items():
nsmap[key] = value
xpath = self.definition.get("xpath", "child::*")
children = element.tag.xpath(xpath, namespaces=nsmap)
el_lst = []
for el in children:
if isinstance(el, str):
raise RuntimeError(
"Only standard xml nodes are supported as results of xpath queries.")
elif isinstance(el, lxml.etree._Element):
if self.definition.get("tags_as_children", True):
el_lst.append(XMLTagElement(el))
if self.definition.get("attribs_as_children", False):
for attrib in el.attrib:
el_lst.append(XMLAttributeNode(el, attrib))
if self.definition.get("text_as_children", False):
el_lst.append(XMLTextNode(el))
else:
raise RuntimeError("Unsupported child type.")
return el_lst
def typecheck(self, element: StructureElement):
"""
Check whether the current structure element can be converted using
this converter.
"""
return isinstance(element, XMLTagElement)
def match(self, element: StructureElement) -> Optional[dict]:
# See https://gitlab.indiscale.com/caosdb/src/caosdb-crawler/-/issues/145
# for a suggestion for the design of the matching algorithm.
if not isinstance(element, XMLTagElement):
raise TypeError("Element must be an instance of XMLTagElement.")
# Store the result of all individual regexp variable results:
vardict = {}
if "match_tag" in self.definition:
m_tag = re.match(self.definition["match_tag"], element.tag.tag)
if m_tag is None:
return None
vardict.update(m_tag.groupdict())
if "match_text" in self.definition:
tagtext = element.tag.text
if element.tag.text is None:
tagtext = ""
m_text = re.match(self.definition["match_text"], tagtext, re.DOTALL)
if m_text is None:
return None
vardict.update(m_text.groupdict())
if not self.match_properties(element.tag.attrib, vardict, "match_attrib"):
return None
return vardict
class XMLTextNodeConverter(Converter):
def create_children(self, generalStore: GeneralStore, element: StructureElement):
"""
This converter does not create children.
"""
return []
def typecheck(self, element: StructureElement):
"""
Check whether the current structure element can be converted using
this converter.
"""
return isinstance(element, XMLTextNode)
def match(self, element: StructureElement) -> Optional[dict]:
# See https://gitlab.indiscale.com/caosdb/src/caosdb-crawler/-/issues/145
# for a suggestion for the design of the matching algorithm.
if not isinstance(element, XMLTextNode):
raise TypeError("Element must be an instance of XMLTextNode.")
vardict = {}
m_text = re.match(self.definition["match_text"], element.value,
re.DOTALL)
if m_text is None:
return None
vardict.update(m_text.groupdict())
return vardict
class XMLAttributeNodeConverter(Converter):
def create_children(self, generalStore: GeneralStore, element: StructureElement):
"""
This converter does not create children.
"""
return []
def typecheck(self, element: StructureElement):
"""
Check whether the current structure element can be converted using
this converter.
"""
return isinstance(element, XMLAttributeNode)
def match(self, element: StructureElement) -> Optional[dict]:
# See https://gitlab.indiscale.com/caosdb/src/caosdb-crawler/-/issues/145
# for a suggestion for the design of the matching algorithm.
if not isinstance(element, XMLAttributeNode):
raise TypeError("Element must be an instance of XMLAttributeNode.")
vardict = {}
m_name = re.match(self.definition["match_name"], element.key)
if m_name is None:
return None
vardict.update(m_name.groupdict())
m_value = re.match(self.definition["match_value"], element.value)
if m_value is None:
return None
vardict.update(m_value.groupdict())
return vardict
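Chained together in a cfood, these converters walk from file to tags to attributes. A sketch using the registry names added in this commit (``XMLFile``, ``XMLTag``, ``XMLAttributeNode``); the tag, record and variable names are hypothetical:

ExperimentXML:
  type: XMLFile
  match: ^.*\.xml$
  subtree:
    SampleTag:
      type: XMLTag
      match_tag: ^sample$
      xpath: child::*            # the documented default
      attribs_as_children: true  # also emit XMLAttributeNode children
      subtree:
        IdAttrib:
          type: XMLAttributeNode
          match_name: ^id$
          match_value: (?P<sample_id>.*)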
# encoding: utf-8
#
# This file is a part of the LinkAhead Project.
#
# Copyright (C) 2024 Indiscale GmbH <info@indiscale.com>
# Copyright (C) 2024 Alexander Schlemmer
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
"""Converters take structure elements and create Records and new structure elements from them.
This converter opens zip files, unzips them into a temporary directory and
exposes its contents as File structure elements.
"""
from __future__ import annotations
import os
import tempfile
from os.path import isdir, join
from zipfile import ZipFile
from ..stores import GeneralStore
from ..structure_elements import Directory, File, StructureElement
from .converters import SimpleFileConverter
class ZipFileConverter(SimpleFileConverter):
"""Convert zipfiles.
"""
def setup(self):
self._tempdir = None
def cleanup(self):
self._tempdir.cleanup()
def create_children(self, generalStore: GeneralStore, element: StructureElement):
"""
Unzip the given zip file into a temporary directory and expose its contents
as structure elements.
Arguments:
----------
element must be a File (structure element).
Returns:
--------
A list of File and Directory structure elements representing the contents
of the zip file, or None in case of errors.
"""
if isinstance(element, File):
self._tempdir = tempfile.TemporaryDirectory()
unzd_path = self._tempdir.name
with ZipFile(element.path) as zipf:
zipf.extractall(unzd_path)
entity_ls = []
for el in os.listdir(unzd_path):
path = join(unzd_path, el)
if isdir(path):
entity_ls.append(Directory(el, path))
else:
entity_ls.append(File(el, path))
return entity_ls
else:
raise ValueError("create_children was called with wrong type of StructureElement")
return None
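A sketch of driving the converter directly, assuming the base ``Converter`` constructor takes ``(definition, name, converter_registry)``; the definition dict, file path and empty registry are made up for illustration:

conv = ZipFileConverter({"match": r".*\.zip$"}, "zip", converter_registry={})
conv.setup()
# Turns data.zip into File/Directory structure elements, one per top-level entry:
children = conv.create_children(GeneralStore(), File("data.zip", "/tmp/data.zip"))
for child in children:
    print(type(child).__name__, child.get_name())
conv.cleanup()  # removes the temporary unzip directory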
@@ -29,35 +29,20 @@ A structure containing debug tree information.
 from __future__ import annotations

-import argparse
-import importlib
-import logging
-import os
-import sys
-import warnings
-import yaml
-from argparse import RawTextHelpFormatter
 from collections import defaultdict
-from copy import deepcopy
-from enum import Enum
-from importlib_resources import files
-from jsonschema import validate
-from typing import Any, Optional, Type, Union

 import linkahead as db
+import yaml
-from caosadvancedtools.cache import UpdateCache, Cache
-from caosadvancedtools.crawler import Crawler as OldCrawler
-from linkahead.apiutils import (compare_entities, EntityMergeConflictError,
-                                merge_entities)
+from importlib_resources import files
+from jsonschema import validate
+from linkahead.apiutils import (EntityMergeConflictError, compare_entities,
+                                merge_entities)
 from linkahead.common.datatype import is_reference

-from .converters import Converter, DirectoryConverter, ConverterValidationError
+from .converters import Converter, ConverterValidationError, DirectoryConverter
 from .macros import defmacro_constructor, macro_constructor
-from .stores import Store, GeneralStore, RecordStore
+from .stores import GeneralStore, RecordStore, Store
-from .structure_elements import StructureElement, Directory, NoneElement
+from .structure_elements import Directory, NoneElement, StructureElement
 from .version import check_cfood_version
...
@@ -8,9 +8,15 @@ BooleanElement:
 Date:
   converter: DateElementConverter
   package: caoscrawler.converters
+Datetime:
+  converter: DatetimeElementConverter
+  package: caoscrawler.converters
 Dict:
   converter: DictElementConverter
   package: caoscrawler.converters
+PropertiesFromDictElement:
+  converter: PropertiesFromDictConverter
+  package: caoscrawler.converters
 FloatElement:
   converter: FloatElementConverter
   package: caoscrawler.converters
@@ -81,6 +87,31 @@ CSVTableConverter:
   converter: CSVTableConverter
   package: caoscrawler.converters
+SPSSFile:
+  converter: SPSSConverter
+  package: caoscrawler.converters
 XLSXTableConverter:
   converter: XLSXTableConverter
   package: caoscrawler.converters
+
+# -------------------------
+# XML
+# -------------------------
+XMLFile:
+  converter: XMLFileConverter
+  package: caoscrawler.converters
+XMLTag:
+  converter: XMLTagConverter
+  package: caoscrawler.converters
+XMLTextNode:
+  converter: XMLTextNodeConverter
+  package: caoscrawler.converters
+XMLAttributeNode:
+  converter: XMLAttributeNodeConverter
+  package: caoscrawler.converters

+# Lookup table for matching functions and cfood yaml node names.
 submatch:
   package: caoscrawler.transformer_functions
@@ -9,3 +9,21 @@ split:
 replace:
   package: caoscrawler.transformer_functions
   function: replace
+date_parse:
+  package: caoscrawler.transformer_functions
+  function: date_parse
+datetime_parse:
+  package: caoscrawler.transformer_functions
+  function: datetime_parse
+cast_to_int:
+  package: caoscrawler.transformer_functions
+  function: cast_to_int
+cast_to_float:
+  package: caoscrawler.transformer_functions
+  function: cast_to_float
+cast_to_bool:
+  package: caoscrawler.transformer_functions
+  function: cast_to_bool
+cast_to_str:
+  package: caoscrawler.transformer_functions
+  function: cast_to_str
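These transformer functions are referenced by their registered names inside a converter's ``transform`` block; a sketch assuming the usual ``in``/``out``/``functions`` layout, with hypothetical variable names:

transform:
  to_int:
    in: $count_str       # hypothetical input variable
    out: $count          # hypothetical output variable
    functions:
      - cast_to_int: {}  # one of the functions registered above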
#!/usr/bin/env python3
# encoding: utf-8
#
# This file is a part of the LinkAhead Project.
#
# Copyright (C) 2024 Indiscale GmbH <info@indiscale.com>
# Copyright (C) 2024 Henrik tom Wörden <h.tomwoerden@indiscale.com>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
#
from typing import Any
class ForbiddenTransaction(Exception):
"""Thrown if an transactions is needed that is not allowed.
For example an update of an entity if the security level is INSERT
"""
pass
class ImpossibleMergeError(Exception):
"""Thrown if due to identifying information, two SyncNodes or two Properties of SyncNodes
should be merged, but there is conflicting information that prevents this.
Parameters
----------
msg : str
A case-specific error message describing where the merge error occurred.
pname : str
The name of the property whose values caused the merge error.
value_a, value_b : Any
The two values that couldn't be merged.
Attributes
----------
message : str
A case-specific error message describing where the merge error occurred.
values : tuple[Any]
The two values that couldn't be merged.
pname : str
The name of the property whose values caused the merge error.
"""
def __init__(self, msg: str, pname: str, value_a: Any, value_b: Any):
self.pname = pname
self.values = (value_a, value_b)
self.message = msg
super().__init__(msg)
def __str__(self):
return (
f"{self.message}\n\nThe problematic property is '{self.pname}' with "
f"values '{self.values[0]}' and '{self.values[1]}'."
)
def __repr__(self):
return self.__str__()
class InvalidIdentifiableYAML(Exception):
"""Thrown if the identifiable definition is invalid."""
pass
class MissingIdentifyingProperty(Exception):
"""Thrown if a SyncNode does not have the properties required by the corresponding registered
identifiable
"""
pass
class MissingRecordType(Exception):
"""Thrown if an record type can not be found although it is expected that it exists on the
server.
"""
pass
class MissingReferencingEntityError(Exception):
"""Thrown if the identifiable requires that some entity references the given entity but there
is no such reference."""
def __init__(self, *args, rts=None, **kwargs):
self.rts = rts
super().__init__(*args, **kwargs)
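The ``__str__`` of ``ImpossibleMergeError`` combines the message with the offending property; an illustration with made-up values:

err = ImpossibleMergeError(
    "Cannot merge the two SyncNodes.",  # made-up message
    pname="temperature",
    value_a=20.0,
    value_b=22.5,
)
print(err)
# Cannot merge the two SyncNodes.
#
# The problematic property is 'temperature' with values '20.0' and '22.5'.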
 #!/usr/bin/env python3
 # encoding: utf-8
 #
-# This file is a part of the CaosDB Project.
+# This file is a part of the LinkAhead Project.
 #
 # Copyright (C) 2022 Henrik tom Wörden
 #
@@ -20,23 +20,27 @@
 #
 from __future__ import annotations

-import linkahead as db
-from datetime import datetime
 import json
-from hashlib import sha256
-from typing import Union
 import logging
+from datetime import datetime
+from hashlib import sha256
+from typing import Optional, Union
+
+import linkahead as db
+
+from .exceptions import MissingIdentifyingProperty
+from .sync_node import SyncNode

 logger = logging.getLogger(__name__)

 class Identifiable():
     """
-    The fingerprint of a Record in CaosDB.
+    The fingerprint of a Record in LinkAhead.

-    This class contains the information that is used by the CaosDB Crawler to identify Records.
-    On one hand, this can be the ID or a Record or the path of a File.
-    On the other hand, in order to check whether a Record exists in the CaosDB Server, a query can
+    This class contains the information that is used by the LinkAhead Crawler to identify Records.
+    In order to check whether a Record exists in the LinkAhead Server, a query can
     be created using the information contained in the Identifiable.

     Parameters
@@ -46,23 +50,22 @@ class Identifiable():
     properties: dict, keys are names of Properties; values are Property values
                Note that lists are not checked for equality but are interpreted as multiple
                conditions for a single Property.
-    path: str, In case of files: The path where the file is stored.
     backrefs: list, TODO future
     """

-    def __init__(self, record_id: int = None, path: str = None, record_type: str = None,
-                 name: str = None, properties: dict = None,
-                 backrefs: list[Union[int, str]] = None):
+    def __init__(self, record_id: Optional[int] = None, record_type: Optional[str] = None,
+                 name: Optional[str] = None, properties: Optional[dict] = None,
+                 backrefs: Optional[list[Union[int, str]]] = None):
-        if (record_id is None and path is None and name is None
+        if (record_id is None and name is None
                 and (backrefs is None or len(backrefs) == 0)
                 and (properties is None or len(properties) == 0)):
-            raise ValueError("There is no identifying information. You need to add a path or "
-                             "properties or other identifying attributes.")
+            raise ValueError(
+                "There is no identifying information. You need to add "
+                "properties or other identifying attributes.")
         if properties is not None and 'name' in [k.lower() for k in properties.keys()]:
             raise ValueError("Please use the separate 'name' keyword instead of the properties "
                              "dict for name")
         self.record_id = record_id
-        self.path = path
         self.record_type = record_type
         self.name = name
         if name == "":
@@ -81,20 +84,17 @@ class Identifiable():
     def _value_representation(value) -> str:
         """returns the string representation of property values to be used in the hash function

-        The string is the path of a File Entity, the CaosDB ID or Python ID of other Entities
-        (Python Id only if there is no CaosDB ID) and the string representation of bool, float, int
-        and str.
+        The string is the LinkAhead ID in case of SyncNode objects (SyncNode objects must have an ID)
+        and the string representation of None, bool, float, int, datetime and str.
         """
         if value is None:
             return "None"
-        elif isinstance(value, db.File):
-            return str(value.path)
-        elif isinstance(value, db.Entity):
+        elif isinstance(value, SyncNode):
             if value.id is not None:
                 return str(value.id)
             else:
-                return "PyID=" + str(id(value))
+                raise RuntimeError("Python Entity (SyncNode) without ID not allowed")
         elif isinstance(value, list):
             return "[" + ", ".join([Identifiable._value_representation(el) for el in value]) + "]"
         elif (isinstance(value, str) or isinstance(value, int) or isinstance(value, float)
@@ -120,27 +120,20 @@ class Identifiable():
         return rec_string

     def __eq__(self, other) -> bool:
-        """
-        Identifiables are equal if they belong to the same Record. Since ID and path are on their
-        own enough to identify the Record it is sufficient if those attributes are equal.
-        1. both IDs are set (not None) -> equal if IDs are equal
-        2. both paths are set (not None) -> equal if paths are equal
-        3. equal if attribute representations are equal
-        """
+        """ Identifiables are equal if they share the same ID or if the representation is equal """
         if not isinstance(other, Identifiable):
             raise ValueError("Identifiable can only be compared to other Identifiable objects.")
-        elif self.record_id is not None and other.record_id is not None:
+        if self.record_id is not None and other.record_id is not None:
             return self.record_id == other.record_id
-        elif self.path is not None and other.path is not None:
-            return self.path == other.path
         elif self.get_representation() == other.get_representation():
             return True
         else:
             return False

     def __repr__(self):
-        pstring = json.dumps(self.properties)
+        """ deterministic text representation of the identifiable """
+        pstring = json.dumps({k: str(v) for k, v in self.properties.items()})
         return (f"{self.__class__.__name__} for RT {self.record_type}: id={self.record_id}; "
-                f"name={self.name}\n\tpath={self.path}\n"
+                f"name={self.name}\n"
                 f"\tproperties:\n{pstring}\n"
                 f"\tbackrefs:\n{self.backrefs}")
@@ -20,29 +20,46 @@
 # along with this program. If not, see <https://www.gnu.org/licenses/>.

 import logging
+import sys

-from caosadvancedtools.webui_formatter import WebUI_Formatter
 from caosadvancedtools.serverside.helper import get_shared_filename
-import sys
+from caosadvancedtools.webui_formatter import WebUI_Formatter

-def configure_server_side_logging():
+def configure_server_side_logging(max_log_level: int = logging.INFO):
     """
     Set logging up to save one plain debugging log file, one plain info log
     file (for users) and a stdout stream with messages wrapped in html elements

     returns the path to the file with debugging output
+
+    Parameters
+    ----------
+    max_log_level : int, optional
+        The maximum log level to use for SSS-logs. Default is
+        ``logging.INFO``.
+
+    Returns
+    -------
+    userlog_public, htmluserlog_public, debuglog_public: str
+        Public paths of the respective log files.
     """
     adv_logger = logging.getLogger("caosadvancedtools")
-    adv_logger.setLevel(level=logging.DEBUG)
+    # The max_<level> variables will be used to set the logger levels
+    # to the respective maximum of intended level and max_log_level,
+    # effectively cutting off logging above the specified
+    # max_log_level.
+    max_info = max(logging.INFO, max_log_level)
+    max_debug = max(logging.DEBUG, max_log_level)
+    adv_logger.setLevel(level=max_debug)

     cr_logger = logging.getLogger("caoscrawler")
-    cr_logger.setLevel(level=logging.DEBUG)
+    cr_logger.setLevel(level=max_debug)

     userlog_public, userlog_internal = get_shared_filename("userlog.txt")

     root_logger = logging.getLogger()
-    root_logger.setLevel(level=logging.INFO)
+    root_logger.setLevel(level=max_info)

     # this is a log file with INFO level for the user
     user_file_handler = logging.FileHandler(filename=userlog_internal)
...
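With the new ``max_log_level`` parameter, all server-side loggers are capped at the given severity; a usage sketch:

import logging

# Discard DEBUG and INFO messages in all server-side logs:
userlog_public, htmluserlog_public, debuglog_public = \
    configure_server_side_logging(max_log_level=logging.WARNING)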
@@ -25,10 +25,14 @@
 # Function to expand a macro in yaml
 # A. Schlemmer, 05/2022

-from dataclasses import dataclass
-from typing import Any, Dict
+import re
 from copy import deepcopy
+from dataclasses import dataclass
 from string import Template
+from typing import Any, Dict
+
+_SAFE_SUBST_PAT = re.compile(r"^\$(?P<key>\w+)$")
+_SAFE_SUBST_PAT_BRACES = re.compile(r"^\$\{(?P<key>\w+)}$")

 @dataclass
@@ -53,6 +57,12 @@ def substitute(propvalue, values: dict):
     Substitution of variables in strings using the variable substitution
     library from python's standard library.
     """
+    # Simple matches are simply replaced by the raw dict entry.
+    if match := (_SAFE_SUBST_PAT.fullmatch(propvalue)
+                 or _SAFE_SUBST_PAT_BRACES.fullmatch(propvalue)):
+        key = match.group("key")
+        if key in values:
+            return values[key]
     propvalue_template = Template(propvalue)
     return propvalue_template.safe_substitute(**values)
...
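The effect of the new fast path: a property value that consists of exactly one variable reference is replaced by the raw stored object, preserving its type, while mixed strings still go through ``Template.safe_substitute``. A behavior sketch:

substitute("$counts", {"counts": [1, 2, 3]})    # -> [1, 2, 3] (raw list, type preserved)
substitute("${counts}", {"counts": [1, 2, 3]})  # -> [1, 2, 3] (braced form, same result)
substitute("n=$counts", {"counts": [1, 2, 3]})  # -> "n=[1, 2, 3]" (string interpolation)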
@@ -25,7 +25,9 @@
 #
 """
-This is the scanner, the original "_crawl" function from crawl.py.
+This is the scanner.
+
+Formerly, this was the ``_crawl(...)`` function from ``crawl.py``.
 This is just the functionality that extracts data from the file system.
 """
@@ -37,7 +39,7 @@ import logging
 import os
 import warnings
 from collections.abc import Callable
-from typing import Any, Optional, Type, Union
+from typing import Any, Optional, Union

 import jinja2
@@ -58,29 +60,45 @@ import pathlib
 logger = logging.getLogger(__name__)

-def load_definition(crawler_definition_path: str):
+def load_definition(crawler_definition_path: str) -> dict:
     """
     Load a cfood from a crawler definition defined by
     crawler definition path and validate it using cfood-schema.yml.
+
+    Arguments:
+    ----------
+    crawler_definition_path: str
+        Path to the crawler definition file in yaml format.
+
+    Returns:
+    --------
+    dict containing the crawler definition.
     """
     # Load the cfood from a yaml file:
-    with open(crawler_definition_path, "r") as f:
+    with open(crawler_definition_path, encoding="utf-8") as f:
         crawler_definitions = list(yaml.safe_load_all(f))

-    crawler_definition = _load_definition_from_yaml_dict(
-        crawler_definitions)
+    crawler_definition = _load_definition_from_yaml_dict(crawler_definitions)

     return _resolve_validator_paths(crawler_definition, crawler_definition_path)

-def _load_definition_from_yaml_dict(crawler_definitions: list[dict]):
+def _load_definition_from_yaml_dict(crawler_definitions: list[dict]) -> dict:
     """Load crawler definitions from a list of (yaml) dicts `crawler_definitions` which
     contains either one or two documents.

     Doesn't resolve the validator paths in the cfood definition, so for
     internal and testing use only.
+
+    Arguments:
+    ----------
+    crawler_definitions: list[dict]
+        List of one or two dicts containing (optionally) metadata and the crawler definition.
+
+    Returns:
+    --------
+    dict containing the crawler definition.
     """
     if len(crawler_definitions) == 1:
         # Simple case, just one document:
@@ -134,7 +152,8 @@ def _resolve_validator_paths(definition: dict, definition_path: str):
             # Validator is given by a path
             if not value.startswith('/'):
                 # Not an absolute path
-                definition[key] = os.path.join(os.path.dirname(definition_path), value)
+                definition[key] = os.path.join(
+                    os.path.dirname(definition_path), value)
                 if not os.path.isfile(definition[key]):
                     # TODO(henrik) capture this in `crawler_main` similar to
                     # `ConverterValidationError`.
@@ -263,31 +282,31 @@ def scanner(items: list[StructureElement],
             crawled_data: Optional[list[db.Record]] = None,
             debug_tree: Optional[DebugTree] = None,
             registered_transformer_functions: Optional[dict] = None,
-            new_debug_tree: Optional[dict] = None):
+            new_debug_tree: Optional[dict] = None) -> list[db.Record]:
     """Crawl a list of StructureElements and apply any matching converters.

-    Formerly known as "_crawl".
+    Formerly known as ``_crawl(...)``.

     Parameters
     ----------
-    items:
+    items: list[StructureElement]
        structure_elements (e.g. files and folders on one level on the hierarchy)
-    converters:
+    converters: list[Converter]
        locally defined converters for treating structure elements. A locally
        defined converter could be one that is only valid for a specific subtree
        of the originally crawled StructureElement structure.
-    general_store, record_store:
+    general_store, record_store: GeneralStore, RecordStore, optional
       This recursion of the crawl function should only operate on copies of
       the global stores of the Crawler object.
-    restricted_path : list of strings, optional
+    restricted_path : list[str], optional
       traverse the data tree only along the given path. For example, when a
-      directory contains files a, b and c and b is given as restricted_path, a
-      and c will be ignroed by the crawler. When the end of the given path is
+      directory contains files a, b and c, and b is given as ``restricted_path``, a
+      and c will be ignored by the crawler. When the end of the given path is
       reached, traverse the full tree as normal. The first element of the list
-      provided by restricted_path should be the name of the StructureElement
+      provided by ``restricted_path`` should be the name of the StructureElement
       at this level, i.e. denoting the respective element in the items
       argument.
@@ -298,7 +317,8 @@ def scanner(items: list[StructureElement],
       Each function is a dictionary:

-      - The key is the name of the function to be looked up in the dictionary of registered transformer functions.
+      - The key is the name of the function to be looked up in the dictionary of registered
+        transformer functions.
      - The value is the function which needs to be of the form:
          def func(in_value: Any, in_parameters: dict) -> Any:
              pass
@@ -332,7 +352,8 @@ def scanner(items: list[StructureElement],
         pass

     for element in items:
-        element_path = os.path.join(*(structure_elements_path + [str(element.get_name())]))
+        element_path = os.path.join(
+            *(structure_elements_path + [str(element.get_name())]))
         logger.debug(f"Dealing with {element_path}")
         # Store whether this element was matched by at least one converter:
         at_least_one_matched = False
@@ -368,7 +389,8 @@ def scanner(items: list[StructureElement],
                 keys_modified = converter.create_records(
                     general_store_copy, record_store_copy, element)
-                children = converter.create_children(general_store_copy, element)
+                children = converter.create_children(
+                    general_store_copy, element)
                 if debug_tree is not None:
                     # add provenance information for each variable
@@ -380,6 +402,9 @@ def scanner(items: list[StructureElement],
                     debug_tree.debug_metadata["usage"][str(element)].add(
                         "/".join(converters_path + [converter.name]))
                     mod_info = debug_tree.debug_metadata["provenance"]
+                    # TODO: actually keys_modified must not be None. create_records should
+                    # always return a list.
+                    if keys_modified is not None:
                         for record_name, prop_name in keys_modified:
                             # TODO: check
                             internal_id = record_store_copy.get_internal_id(
@@ -445,6 +470,9 @@ def scanner(items: list[StructureElement],
             element_dictionary["matching_converters"] = matching_converters
             new_debug_tree.append(element_dictionary)

+        # Clean up converter:
+        converter.cleanup()
+
     if restricted_path and not path_found:
         raise RuntimeError("A 'restricted_path' argument was given that is not contained in "
                            "the data tree")
@@ -463,7 +491,7 @@ def scanner(items: list[StructureElement],
 # --------------------------------------------------------------------------------

-def scan_directory(dirname: str, crawler_definition_path: str,
+def scan_directory(dirname: Union[str, list[str]], crawler_definition_path: str,
                    restricted_path: Optional[list[str]] = None,
                    debug_tree: Optional[DebugTree] = None,
                    new_debug_tree: Optional[dict] = None):
@@ -477,10 +505,12 @@ def scan_directory(dirname: str, crawler_definition_path: str,
     Parameters
     ----------
+    dirname: str or list[str]
+        directory or list of directories to be scanned
     restricted_path: optional, list of strings
-        Traverse the data tree only along the given path. When the end of the given path
-        is reached, traverse the full tree as normal. See docstring of 'scanner' for
-        more details.
+        Traverse the data tree only along the given path. When the end
+        of the given path is reached, traverse the full tree as
+        normal. See docstring of 'scanner' for more details.

     Returns
     -------
@@ -493,23 +523,31 @@ def scan_directory(dirname: str, crawler_definition_path: str,
     converter_registry = create_converter_registry(crawler_definition)

     # Load and register transformer functions:
-    registered_transformer_functions = create_transformer_registry(crawler_definition)
+    registered_transformer_functions = create_transformer_registry(
+        crawler_definition)

     if not dirname:
         raise ValueError(
             "You have to provide a non-empty path for crawling.")
-    dir_structure_name = os.path.basename(dirname)
+    if not isinstance(dirname, list):
+        dirname = [dirname]
+    dir_element_list = []
+    for dname in dirname:
+        dir_structure_name = os.path.basename(dname)

-    # TODO: needs to be covered somewhere else
-    crawled_directory = dirname
-    if not dir_structure_name and dirname.endswith('/'):
-        if dirname == '/':
+        # TODO: needs to be covered somewhere else
+        crawled_directory = dname
+        if not dir_structure_name and dname.endswith(os.path.sep):
+            if dname == os.path.sep:
                 # Crawling the entire file system
                 dir_structure_name = "root"
             else:
                 # dirname had a trailing '/'
-            dir_structure_name = os.path.basename(dirname[:-1])
+                dir_structure_name = os.path.basename(dname[:-1])
+        dir_element_list.append(Directory(dir_structure_name, dname))
+
+<<<<<<< HEAD
     dir_structure_element = Directory(dir_structure_name, dirname)
     return scan_structure_elements(dir_structure_element,
@@ -520,6 +558,16 @@ def scan_directory(dirname: str, crawler_definition_path: str,
                                    registered_transformer_functions=registered_transformer_functions,
                                    new_debug_tree=new_debug_tree
                                    )
+=======
+    return scan_structure_elements(
+        dir_element_list,
+        crawler_definition,
+        converter_registry,
+        restricted_path=restricted_path,
+        debug_tree=debug_tree,
+        registered_transformer_functions=registered_transformer_functions
+    )
+>>>>>>> dev

 def scan_structure_elements(items: Union[list[StructureElement], StructureElement],
@@ -527,8 +575,15 @@ def scan_structure_elements(items: Union[list[StructureElement], StructureElemen
                             converter_registry: dict,
                             restricted_path: Optional[list[str]] = None,
                             debug_tree: Optional[DebugTree] = None,
+<<<<<<< HEAD
                             registered_transformer_functions: Optional[dict] = None,
                             new_debug_tree: Optional[dict] = None):
+=======
+                            registered_transformer_functions: Optional[dict] = None) -> (
+                                list[db.Record]):
+>>>>>>> dev
     """
     Start point of the crawler recursion.
@@ -542,14 +597,14 @@ def scan_structure_elements(items: Union[list[StructureElement], StructureElemen
     crawler_definition : dict
         A dictionary representing the crawler definition, possibly from a yaml
         file.
-    restricted_path: optional, list of strings
+    restricted_path: list[str], optional
         Traverse the data tree only along the given path. When the end of the
         given path is reached, traverse the full tree as normal. See docstring
         of 'scanner' for more details.

     Returns
     -------
-    crawled_data : list
+    crawled_data : list[db.Record]
         the final list with the target state of Records.
     """
...
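On the ``dev`` side of the conflict above, ``scan_directory`` also accepts a list of directories; a sketch with made-up paths and cfood file name:

# Scan several directories with one crawler definition:
records = scan_directory(["/data/run1", "/data/run2"], "cfood.yml")
# The single-directory call keeps working as before:
records = scan_directory("/data/run1", "cfood.yml")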
# encoding: utf-8
#
# This file is a part of the LinkAhead Project.
#
# Copyright (C) 2024 Indiscale GmbH <info@indiscale.com>
# Copyright (C) 2024 Daniel Hornung <d.hornung@indiscale.com>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
"""Submdule containing all default and optional converters."""
from .. import utils
from .structure_elements import *
try:
from .rocrate_structure_elements import ROCrateEntity
except ImportError as err:
ROCrateEntity: type = utils.MissingImport(
name="ROCrateEntity", hint="Try installing with the `rocrate` extra option.",
err=err)
#!/usr/bin/env python3
# encoding: utf-8
#
# ** header v3.0
# This file is a part of the CaosDB Project.
#
# Copyright (C) 2024 Alexander Schlemmer
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
#
# ** end header
#
from rocrate.model.entity import Entity
from .structure_elements import StructureElement
class ROCrateEntity(StructureElement):
"""
Store entities contained in ROCrates.
"""
def __init__(self, folder: str, entity: Entity):
"""
Initializes this ROCrateEntity.
Arguments:
----------
folder: str
The folder that contains the ROCrate data. In case of a zipped ROCrate, this
is a temporary folder that the ROCrate was unzipped to.
The folder is the folder containing the ro-crate-metadata.json.
entity: Entity
The ROCrate entity that is stored in this structure element.
The entity automatically contains an attribute ".crate"
that stores the ROCrate that this entity belongs to. It can be used
e.g. to look up links to other entities (ROCrate.dereference).
"""
super().__init__(entity.properties()["@id"])
self.folder = folder
self.entity = entity
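A construction sketch: wrap every entity of an unpacked RO-Crate (the path is made up and must contain ro-crate-metadata.json):

from rocrate.rocrate import ROCrate

crate_path = "./my_crate"  # hypothetical unzipped crate
crate = ROCrate(crate_path)
entities = [ROCrateEntity(crate_path, ent) for ent in crate.get_entities()]
for ent in entities:
    print(ent.get_name(), ent.entity.type)  # the name is the entity's @id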