Skip to content
Snippets Groups Projects
Commit 7a178ca1 authored by Henrik tom Wörden's avatar Henrik tom Wörden
Browse files

init int test

parent b325d6fb
Branches
Tags
2 merge requests!53Release 0.1,!18Add integrationtests based on a real world example
# This file is a part of the CaosDB Project.
#
# Copyright (C) 2022 IndiScale GmbH <info@indiscale.com>
# Copyright (C) 2022 Florian Spreckelsen <f.spreckelsen@indiscale.com>
#
# This program is free software: you can redistribute it and/or modify it under
# the terms of the GNU Affero General Public License as published by the Free
# Software Foundation, either version 3 of the License, or (at your option) any
# later version.
#
# This program is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
# details.
#
# You should have received a copy of the GNU Affero General Public License along
# with this program. If not, see <https://www.gnu.org/licenses/>.
#
Data:
type: Directory
match: data
subtree:
dataspace_dir:
type: Directory
match: (?P<dataspace_dir_number>[0-9]+)
records:
Dataspace:
name: $dataspace_dir_number
subtree:
dataspace_json:
type: JSONFile
match: .dataspace.json
validate: schema/dataspace.schema.json
subtree:
dataspace_id_element:
type: DictIntegerElement
match_name: "dataspace_id"
match_value: "(?P<id>[0-9]+)"
records:
Dataspace:
dataspace_id: $id
archived_element:
type: DictBooleanElement
match_name: "archived"
match_value: "(?P<archived>.*)"
records:
Dataspace:
archived: $archived
url_element:
type: DictTextElement
match_name: "url"
match_value: "(?P<url>.*)"
records:
Dataspace:
url: $url
coordinator_element:
type: DictDictElement
match_name: "coordinator"
records:
Person:
parents:
- Person
Dataspace:
Person: $Person
subtree: &person_subtree
full_name_element:
type: DictTextElement
match_name: "full_name"
match_value: "(?P<full_name>.*)"
records:
Person:
full_name: $full_name
full_name_nonlatin_element:
type: DictTextElement
match_name: "full_name_nonlatin"
match_value: "(?P<full_name_nonlatin>.*)"
records:
Person:
full_name_nonlatin: $full_name_nonlatin
family_name_element:
type: DictTextElement
match_name: "family_name"
match_value: "(?P<family_name>.*)"
records:
Person:
family_name: $family_name
given_name_element:
type: DictTextElement
match_name: "given_name"
match_value: "(?P<given_name>.*)"
records:
Person:
given_name: $given_name
email_element:
type: DictTextElement
match_name: "email"
match_value: "(?P<email>.*)"
records:
Person:
email: $email
affiliation_element:
type: DictTextElement
match_name: "affiliation"
match_value: "(?P<affiliation>.*)"
records:
Person:
affiliation: $affiliation
ORCID_element:
type: DictTextElement
match_name: "ORCID"
match_value: "(?P<ORCID>.*)"
records:
Person:
ORCID: $ORCID
start_date_element:
type: DictTextElement
match_name: "start_date"
match_value: "(?P<start_date>.*)"
records:
Dataspace:
start_date: $start_date
end_date_element:
type: DictTextElement
match_name: "end_date"
match_value: "(?P<end_date>.*)"
records:
Dataspace:
end_date: $end_date
comment:
type: DictTextElement
match_name: "comment"
match_value: "(?P<comment>.*)"
records:
Dataspace:
comment: $comment
raw_data_dir:
type: Directory
match: 03_raw_data
subtree: &template
# TODO collect info from metadata.json and look into sub-directories
# (only one level) for metadata.json
dataset_dir:
match: (?P<dataset_dir_name>.*)
type: Directory
records:
Dataset:
Dataspace: $Dataspace
subtree:
metadata_json: &metadata_json_template
type: JSONFile
match: metadata.json
validate: schema/dataset.schema.json
subtree:
title_element:
type: DictTextElement
match_name: "title"
match_value: "(?P<title>.*)"
records:
Dataset:
name: $title
authors_element:
type: DictListElement
match_name: "authors"
subtree:
author_element:
type: Dict
records:
Person:
parents:
- Person
Dataset:
authors: +$Person
subtree: *person_subtree
abstract_element:
type: DictTextElement
match_name: "abstract"
match_value: "(?P<abstract>.*)"
records:
Dataset:
abstract: $abstract
comment_element:
type: DictTextElement
match_name: "comment"
match_value: "(?P<comment>.*)"
records:
Dataset:
comment: $comment
license_element:
type: DictTextElement
match_name: "license"
match_value: "(?P<license_name>.*)"
records:
license:
# TODO: As soon as such things can be validated, a
# creation of a new license has to be forbidden here
# (although this is effectively done already by
# validating against the above schema.)
name: $license_name
Dataset:
license: $license
dataset_doi_element:
type: DictTextElement
match_name: "dataset_doi"
match_value: "(?P<dataset_doi>.*)"
records:
Dataset:
dataset_doi: $dataset_doi
related_to_dois_element:
type: DictListElement
match_name: "related_to_dois"
subtree:
related_to_doi_element:
type: TextElement
match: "(?P<related_to_doi>.*)"
records:
Dataset:
related_to_dois: +$related_to_doi
Keywords_element:
type: DictListElement
match_name: "Keyword"
Events_element:
type: DictListElement
match_name: "Event"
subtree:
Event_element:
type: Dict
records:
Event:
parents:
- Event
Dataset:
Event: +$Event
subtree:
label_element:
type: DictTextElement
match_name: "label"
match_value: "(?P<label>.*)"
records:
Event:
label: $label
comment_element:
type: DictTextElement
match_name: "comment"
match_value: "(?P<comment>.*)"
records:
Event:
comment: $comment
start_datetime_element:
type: DictTextElement
match_name: start_datetime
match_value: "(?P<start_datetime>.*)"
records:
Event:
start_datetime: $start_datetime
end_datetime_element:
type: DictTextElement
match_name: end_datetime
match_value: "(?P<end_datetime>.*)"
records:
Event:
end_datetime: $end_datetime
longitude_element:
type: DictFloatElement
match_name: "longitude"
match_value: "(?P<longitude>.*)"
records:
Event:
longitude: $longitude
latitude_element:
type: DictFloatElement
match_name: "latitude"
match_value: "(?P<latitude>.*)"
records:
Event:
latitude: $latitude
elevation_element:
type: DictFloatElement
match_name: "elevation"
match_value: "(?P<elevation>.*)"
records:
Event:
elevation: $elevation
location_element:
type: DictTextElement
match_name: location
match_value: "(?P<location>.*)"
records:
Event:
location: $location
igsn_element:
type: DictTextElement
match_name: igsn
match_value: "(?P<igsn>.*)"
records:
Event:
igsn: $igsn
events_in_data_element:
type: DictBooleanElement
match_name: "events_in_data"
match_value: "(?P<events_in_data>.*)"
records:
Dataset:
events_in_data: $events_in_data
geojson_element:
type: DictTextElement
match_name: "geojson"
match_value: "(?P<geojson>.*)"
records:
Dataset:
geojson: $geojson
project_element:
type: DictDictElement
match_name: "project"
records:
Project:
parents:
- Project
Dataset:
Project: $Project
subtree:
full_name_element:
type: DictTextElement
match_name: "full_name"
match_value: "(?P<full_name>.*)"
records:
Project:
full_name: $full_name
project_id_element:
type: DictTextElement
match_name: "project_id"
match_value: "(?P<project_id>.*)"
records:
Project:
project_id: $project_id
project_type_element:
type: DictTextElement
match_name: "project_type"
match_value: "(?P<project_type_name>.*)"
records:
project_type:
name: $project_type_name
Project:
project_type: $project_type
institute_element:
type: DictTextElement
match_name: "institute"
match_value: "(?P<institute>.*)"
records:
Project:
institute: $institute
start_date_element:
type: DictTextElement
match_name: "start_date"
match_value: "(?P<start_date>.*)"
records:
Project:
start_date: $start_date
end_date_element:
type: DictTextElement
match_name: "end_date"
match_value: "(?P<end_date>.*)"
records:
Project:
end_date: $end_date
url_element:
type: DictTextElement
match_name: "url"
match_value: "(?P<url>.*)"
records:
Project:
url: $url
coordinators_element:
type: DictListElement
match_name: "coordinators"
subtree:
coordinator_element:
type: Dict
records:
Person:
parents:
- Person
Project:
coordinators: +$Person
subtree: *person_subtree
campaign_element:
type: DictDictElement
match_name: "campaign"
records:
Campaign:
parents:
- Campaign
Dataset:
Campaign: $Campaign
subtree:
label_element:
type: DictTextElement
match_name: "label"
match_value: "(?P<label>.*)"
records:
Campaign:
label: $label
optional_label_element:
type: DictTextElement
match_name: "optional_label"
match_value: "(?P<optional_label>.*)"
records:
Campaign:
optional_label: $optional_label
start_date_element:
type: DictTextElement
match_name: "start_date"
match_value: "(?P<start_date>.*)"
records:
Campaign:
start_date: $start_date
end_date_element:
type: DictTextElement
match_name: "end_date"
match_value: "(?P<end_date>.*)"
records:
Campaign:
end_date: $end_date
responsible_scientists_element:
type: DictListElement
match_name: "responsible_scientists"
subtree:
responsible_scientist_element:
type: Dict
records:
Person:
parents:
- Person
Campaign:
responsible_scientists: +$Person
subtree: *person_subtree
Methods_element:
type: DictListElement
match_name: "Method"
subtree:
Method_element:
type: Dict
records:
Method:
parents:
- Method
Dataset:
Method: +$Method
subtree:
method_name_element:
type: DictTextElement
match_name: "method_name"
match_value: "(?P<method_name>.*)"
records:
Method:
name: $method_name
abbreviation_element:
type: DictTextElement
match_name: "abbreviation"
match_value: "(?P<abbreviation>.*)"
records:
Method:
abbreviation: $abbreviation
url_element:
type: DictTextElement
match_name: "url"
match_value: "(?P<url>.*)"
records:
Method:
url: $url
Taxa_element:
type: DictListElement
match_name: "Taxon"
subtree:
Taxon_element:
type: Dict
records:
Taxon:
parents:
- Taxon
Dataset:
Taxon: +$Taxon
subtree:
taxon_name_element:
type: DictTextElement
match_name: "taxon_name"
match_value: "(?P<taxon_name>.*)"
records:
Taxon:
name: $taxon_name
archived_element:
type: DictBooleanElement
match_name: "archived"
match_value: "(?P<archived>.*)"
records:
Dataset:
archived: $archived
publication_date_element:
type: DictTextElement
match_name: "publication_date"
match_value: "(?P<publication_date>.*)"
records:
Dataset:
publication_date: $publication_date
max_files_element:
type: DictIntegerElement
match_name: "max_files"
match_value: "(?P<max_files>.*)"
records:
Dataset:
max_files: $max_files
auxiliary_file: &aux_file_template
type: File
match: "(?P<aux_file_name>(?!metadata.json).*)"
# TODO File, path and reference dataset in file record
child_dataset_dir:
type: Directory
match: (?P<child_dataset_dir_name>.*)
subtree:
metadata_json: *metadata_json_template
auxiliary_file: *aux_file_template
data_processing_dir:
type: Directory
match: 04_data_processing
subtree: *template
results_dir:
type: Directory
match: 05_results
subtree: *template
#!/usr/bin/env python3
# encoding: utf-8
#
# This file is a part of the CaosDB Project.
#
# Copyright (C) 2022 Indiscale GmbH <info@indiscale.com>
# Copyright (C) 2022 Henrik tom Wörden <h.tomwoerden@indiscale.com>
# Copyright (C) 2022 Florian Spreckelsen <f.spreckelsen@indiscale.com>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
#
import sys
from caosadvancedtools.models.parser import parse_model_from_json_schema, parse_model_from_yaml
def main():
    """Synchronize the CaosDB data model with the schema definitions.

    The two JSON-schema models (dataspace and dataset) are synced first so
    that their record types exist before the inheritance relations from the
    yaml definition are layered on top of them.

    Returns
    -------
    None
        Passed to ``sys.exit``, i.e. exit status 0 on success.
    """
    # Order matters: the yaml inheritance file references record types
    # that are created by the two JSON schemata.
    model_sources = [
        (parse_model_from_json_schema, "schema/dataspace.schema.json"),
        (parse_model_from_json_schema, "schema/dataset.schema.json"),
        (parse_model_from_yaml, "schema/dataset-inheritance.yml"),
    ]
    for parse, path in model_sources:
        definitions = parse(path)
        definitions.sync_data_model(noquestion=True)


if __name__ == "__main__":
    sys.exit(main())
extern:
- Keyword
- Taxon
- full_name
- full_name_nonlatin
- name
full_name:
inherit_from_obligatory:
- name
full_name_nonlatin:
inherit_from_obligatory:
- name
Taxon:
inherit_from_obligatory:
- Keyword
{
"title": "Dataset",
"description": "",
"type": "object",
"properties": {
"title": {
"type": "string",
"description": "full dataset title"
},
"authors": {
"type": "array",
"items": {
"type": "object",
"title": "Person",
"properties": {
"full_name": {
"type": "string",
"description": "Full name (latin transcription, all UFT-8 characters allowed)"
},
"full_name_nonlatin": {
"type": "string",
"description": "Full name (non-latin alphabet)"
},
"family_name": {
"type": "string",
"description": "Family name (latin transcription)"
},
"given_name": {
"type": "string",
"description": "Given/other names (latin transcription)"
},
"affiliation": {
"type": "string"
},
"ORCID": {
"type": "string",
"description": "ORCID identifier as 16-digit number, e.g. 0000-0001-6233-1866",
"pattern": "^\\d{4}-\\d{4}-\\d{4}-\\d{4}$"
},
"email": {
"type": "string",
"format": "email"
}
},
"required": [
"full_name",
"email"
]
},
"minItems": 1,
"uniqueItems": true
},
"abstract": {
"type": "string",
"minLength": 80,
"maxLength": 1000,
"description": "Abstract with at least 80 characters"
},
"comment": {
"type": "string"
},
"license": {
"type": "string",
"enum": [
"CC-BY",
"CC-BY-SA",
"CC0",
"restricted access"
]
},
"dataset_doi": {
"type": "string",
"pattern": "(10[.][0-9]{4,}(?:[.][0-9]+)*/(?:(?![%\"#? ])\\S)+)",
"description": "Dataset DOI, e.g. 10.1594/PANGAEA.938740"
},
"related_to_dois": {
"type": "array",
"items": {
"type": "string",
"pattern": "(10[.][0-9]{4,}(?:[.][0-9]+)*/(?:(?![%\"#? ])\\S)+)"
},
"description": "DOIs of related publications and/or datasets, e.g. 10.1000/182"
},
"Keyword": {
"type": "array",
"items": {
"type": "object",
"properties": {
"name": {
"type": "string"
}
}
}
},
"Event": {
"type": "array",
"description": "https://wiki.pangaea.de/wiki/Event",
"items": {
"type": "object",
"properties": {
"label": {
"type": "string"
},
"comment": {
"type": "string"
},
"start_datetime": {
"type": "string",
"format": "date-time"
},
"end_datetime": {
"type": "string",
"format": "date-time"
},
"longitude": {
"type": "number",
"minimum": -180,
"maximum": 180,
"description": "longitude (W/E) in decimal degree (-180 to 180)"
},
"latitude": {
"type": "number",
"minimum": -90,
"maximum": 90,
"description": "latitude (N/S) in decimal degree (-90 to 90)"
},
"elevation": {
"type": "number",
"minimum": -10000,
"maximum": 20000,
"description": "elevation in m"
},
"location": {
"type": "string",
"description": "geographical location as text (e.g., North Sea; Espoo, Finland)"
},
"igsn": {
"type": "string",
"description": "International Geo Sample Number (http://www.geosamples.org/aboutigsn)"
}
},
"required": [
"longitude",
"latitude",
"start_date"
]
}
},
"events_in_data": {
"type": "boolean",
"description": "Does the data contain additional information about timepoints and locations?"
},
"geojson": {
"type": "string",
"pattern": "",
"description": "GeoJSON for complex geographic structures"
},
"project": {
"title": "Project",
"description": "https://wiki.pangaea.de/wiki/Project",
"type": "object",
"properties": {
"name": {
"type": "string",
"description": "short name of project"
},
"full_name": {
"type": "string",
"description": "Full name (latin transcription, all UTF-8 characters allowed)"
},
"project_id": {
"type": "string",
"description": "Project ID"
},
"project_type": {
"type": "string",
"enum": [
"DFG",
"EU",
"BMBF",
"national",
"international"
]
},
"institute": {
"type": "string",
"description": "place of coordination or project office",
"default": "Centre for Research"
},
"start_date": {
"type": "string",
"format": "date"
},
"end_date": {
"type": "string",
"format": "date"
},
"url": {
"type": "string",
"format": "uri"
},
"coordinators": {
"type": "array",
"items": {
"type": "object",
"title": "Person",
"properties": {
"full_name": {
"type": "string",
"description": "Full name (latin transcription, all UTF-8 characters allowed)"
},
"full_name_nonlatin": {
"type": "string",
"description": "Full name (non-latin alphabet)"
},
"family_name": {
"type": "string",
"description": "Family name (latin transcription)"
},
"given_name": {
"type": "string",
"description": "Given/other names (latin transcription)"
},
"affiliation": {
"type": "string"
},
"ORCID": {
"type": "string",
"description": "ORCID identifier as 16-digit number, e.g. 0000-0001-6233-1866",
"pattern": "^\\d{4}-\\d{4}-\\d{4}-\\d{4}$"
},
"email": {
"type": "string",
"format": "email"
}
},
"required": [
"full_name",
"email"
]
},
"minItems": 1,
"uniqueItems": true
}
},
"required": ["name", "full_name"]
},
"campaign": {
"title": "Campaign",
"description": "https://wiki.pangaea.de/wiki/Campaign, synonyms: cruise, expedition, leg, ",
"type": "object",
"properties": {
"label": {
"type": "string",
"description": "is unique and does not contain blanks; uses abbreviations instead of full names"
},
"optional_label": {
"type": "string"
},
"start_date": {
"type": "string",
"format": "date"
},
"end_date": {
"type": "string",
"format": "date"
},
"responsible_scientists": {
"type": "array",
"items": {
"type": "object",
"title": "Person",
"properties": {
"full_name": {
"type": "string",
"description": "Full name (latin transcription, all UFT-8 characters allowed)"
},
"full_name_nonlatin": {
"type": "string",
"description": "Full name (non-latin alphabet)"
},
"family_name": {
"type": "string",
"description": "Family name (latin transcription)"
},
"given_name": {
"type": "string",
"description": "Given/other names (latin transcription)"
},
"affiliation": {
"type": "string"
},
"ORCID": {
"type": "string",
"description": "ORCID identifier as 16-digit number, e.g. 0000-0001-6233-1866",
"pattern": "^\\d{4}-\\d{4}-\\d{4}-\\d{4}$"
},
"email": {
"type": "string",
"format": "email"
}
},
"required": [
"full_name",
"email"
]
},
"minItems": 1,
"uniqueItems": true
}
}
},
"Method": {
"type": "array",
"items": {
"type": "object",
"description": "https://wiki.pangaea.de/wiki/Method",
"properties": {
"method_name": {
"type": "string",
"description": "full official name of tool/instrument/device/gear"
},
"abbreviation": {
"type": "string",
"description": "may be used for import in an event list to avoid misspellings"
},
"url": {
"type": "string",
"description": "should contain a web address, where an official description of the device can be found"
}
}
}
},
"Taxon": {
"type": "array",
"items": {
"type": "object",
"properties": {
"name": {
"type": "string"
}
}
}
},
"archived": {
"type": "boolean",
"description": "Has the dataset been archived?",
"default": false
},
"publication_date": {
"type": "string",
"format": "date"
},
"max_files": {
"type": "integer",
"description": "Maximum number of files to included by the CaosDB crawler",
"default": 100
}
},
"required": [
"title",
"authors",
"abstract"
]
}
{
"title": "Dataspace",
"description": "A Dataspace is a folder in the DataCloud with a pre-defined structure",
"type": "object",
"properties": {
"dataspace_id": {
"type": "integer",
"description": "Integer ID of Dataspace (matches LDAP GID)",
"minimum": 20000
},
"archived": { "type": "boolean" },
"url": {
"type": "string",
"description": "link to folder on file system (CaosDB or cloud folder)"
},
"coordinator": {
"type": "object",
"title": "Person",
"properties": {
"full_name": {
"type": "string",
"description": "Full name (latin transcription, all UFT-8 characters allowed)"
},
"full_name_nonlatin": {
"type": "string",
"description": "Full name (non-latin alphabet)"
},
"family_name": {
"type": "string",
"description": "Family name (latin transcription)"
},
"given_name": {
"type": "string",
"description": "Given/other names (latin transcription)"
},
"email": { "type": "string", "format": "email" }
},
"required": ["full_name", "email"]
},
"start_date": { "type": "string", "format": "date" },
"end_date": { "type": "string", "format": "date" },
"comment": { "type": "string" }
},
"required": ["dataspace_id", "url", "coordinator"]
}
extern:
- name
- url
- Dataset
german_name:
datatype: TEXT
inherit_from_obligatory:
- name
Department:
recommended_properties:
url:
german_name:
WorkingGroup:
recommended_properties:
Department:
german_name:
url:
Dataset:
recommended_properties:
WorkingGroup:
#!/usr/bin/env python3
# encoding: utf-8
#
# This file is a part of the CaosDB Project.
#
# Copyright (C) 2022 Indiscale GmbH <info@indiscale.com>
# Copyright (C) 2022 Henrik tom Wörden <h.tomwoerden@indiscale.com>
# Copyright (C) 2022 Florian Spreckelsen <f.spreckelsen@indiscale.com>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
#
"""
module description
"""
import json
import os
import caosdb as db
from newcrawler.crawl import Crawler
from newcrawler.converters import JSONFileConverter, DictConverter
from newcrawler.identifiable_adapters import CaosDBIdentifiableAdapter
from newcrawler.structure_elements import File, JSONFile, Directory
def test_dataset():
    """Crawl the example ``data`` directory and synchronize the results.

    Registers identifiables for ``license``, ``project_type`` and ``Person``,
    runs the crawler with the dataset cfood definition and pushes the
    resulting records to the connected CaosDB server.
    """
    crawler_definition_path = "./dataset_cfoods.yml"

    # Records of these types are identified by their name / full_name
    # instead of by a server-side id.
    ident = CaosDBIdentifiableAdapter()
    ident.register_identifiable(
        "license", db.RecordType().add_parent("license").add_property("name"))
    ident.register_identifiable("project_type", db.RecordType(
    ).add_parent("project_type").add_property("name"))
    ident.register_identifiable("Person", db.RecordType(
    ).add_parent("Person").add_property("full_name"))

    crawler = Crawler(debug=True, identifiableAdapter=ident)
    crawler_definition = crawler.load_definition(crawler_definition_path)

    # Load and register converter packages:
    converter_registry = crawler.load_converters(crawler_definition)

    records = crawler.start_crawling(
        Directory('data', "data"),
        crawler_definition,
        converter_registry
    )

    # Debug information collected during the crawl; kept around so it can
    # be inspected in a debugger or extended assertions later.
    subd = crawler.debug_tree
    subc = crawler.debug_metadata

    ins, ups = crawler.synchronize()


if __name__ == "__main__":
    test_dataset()
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment