Skip to content
Snippets Groups Projects
Commit 7a178ca1 authored by Henrik tom Wörden's avatar Henrik tom Wörden
Browse files

init int test

parent b325d6fb
Branches
Tags
2 merge requests!53Release 0.1,!18Add integrationtests based on a real world example
# This file is a part of the CaosDB Project.
#
# Copyright (C) 2022 IndiScale GmbH <info@indiscale.com>
# Copyright (C) 2022 Florian Spreckelsen <f.spreckelsen@indiscale.com>
#
# This program is free software: you can redistribute it and/or modify it under
# the terms of the GNU Affero General Public License as published by the Free
# Software Foundation, either version 3 of the License, or (at your option) any
# later version.
#
# This program is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
# details.
#
# You should have received a copy of the GNU Affero General Public License along
# with this program. If not, see <https://www.gnu.org/licenses/>.
#
Data:
type: Directory
match: data
subtree:
dataspace_dir:
type: Directory
match: (?P<dataspace_dir_number>[0-9]+)
records:
Dataspace:
name: $dataspace_dir_number
subtree:
dataspace_json:
type: JSONFile
match: .dataspace.json
validate: schema/dataspace.schema.json
subtree:
dataspace_id_element:
type: DictIntegerElement
match_name: "dataspace_id"
match_value: "(?P<id>[0-9]+)"
records:
Dataspace:
dataspace_id: $id
archived_element:
type: DictBooleanElement
match_name: "archived"
match_value: "(?P<archived>.*)"
records:
Dataspace:
archived: $archived
url_element:
type: DictTextElement
match_name: "url"
match_value: "(?P<url>.*)"
records:
Dataspace:
url: $url
coordinator_element:
type: DictDictElement
match_name: "coordinator"
records:
Person:
parents:
- Person
Dataspace:
Person: $Person
subtree: &person_subtree
full_name_element:
type: DictTextElement
match_name: "full_name"
match_value: "(?P<full_name>.*)"
records:
Person:
full_name: $full_name
full_name_nonlatin_element:
type: DictTextElement
match_name: "full_name_nonlatin"
match_value: "(?P<full_name_nonlatin>.*)"
records:
Person:
full_name_nonlatin: $full_name_nonlatin
family_name_element:
type: DictTextElement
match_name: "family_name"
match_value: "(?P<family_name>.*)"
records:
Person:
family_name: $family_name
given_name_element:
type: DictTextElement
match_name: "given_name"
match_value: "(?P<given_name>.*)"
records:
Person:
given_name: $given_name
email_element:
type: DictTextElement
match_name: "email"
match_value: "(?P<email>.*)"
records:
Person:
email: $email
affiliation_element:
type: DictTextElement
match_name: "affiliation"
match_value: "(?P<affiliation>.*)"
records:
Person:
affiliation: $affiliation
ORCID_element:
type: DictTextElement
match_name: "ORCID"
match_value: "(?P<ORCID>.*)"
records:
Person:
ORCID: $ORCID
start_date_element:
type: DictTextElement
match_name: "start_date"
match_value: "(?P<start_date>.*)"
records:
Dataspace:
start_date: $start_date
end_date_element:
type: DictTextElement
match_name: "end_date"
match_value: "(?P<end_date>.*)"
records:
Dataspace:
end_date: $end_date
comment:
type: DictTextElement
match_name: "comment"
match_value: "(?P<comment>.*)"
records:
Dataspace:
comment: $comment
raw_data_dir:
type: Directory
match: 03_raw_data
subtree: &template
# TODO collect info from metadata.json and look into sub-directories
# (only one level) for metadata.json
dataset_dir:
match: (?P<dataset_dir_name>.*)
type: Directory
records:
Dataset:
Dataspace: $Dataspace
subtree:
metadata_json: &metadata_json_template
type: JSONFile
match: metadata.json
validate: schema/dataset.schema.json
subtree:
title_element:
type: DictTextElement
match_name: "title"
match_value: "(?P<title>.*)"
records:
Dataset:
name: $title
authors_element:
type: DictListElement
match_name: "authors"
subtree:
author_element:
type: Dict
records:
Person:
parents:
- Person
Dataset:
authors: +$Person
subtree: *person_subtree
abstract_element:
type: DictTextElement
match_name: "abstract"
match_value: "(?P<abstract>.*)"
records:
Dataset:
abstract: $abstract
comment_element:
type: DictTextElement
match_name: "comment"
match_value: "(?P<comment>.*)"
records:
Dataset:
comment: $comment
license_element:
type: DictTextElement
match_name: "license"
match_value: "(?P<license_name>.*)"
records:
license:
# TODO: As soon as such things can be validated, a
# creation of a new license has to be forbidden here
# (although this is effectively done already by
# validating against the above schema.)
name: $license_name
Dataset:
license: $license
dataset_doi_element:
type: DictTextElement
match_name: "dataset_doi"
match_value: "(?P<dataset_doi>.*)"
records:
Dataset:
dataset_doi: $dataset_doi
related_to_dois_element:
type: DictListElement
match_name: "related_to_dois"
subtree:
related_to_doi_element:
type: TextElement
match: "(?P<related_to_doi>.*)"
records:
Dataset:
related_to_dois: +$related_to_doi
Keywords_element:
type: DictListElement
match_name: "Keyword"
Events_element:
type: DictListElement
match_name: "Event"
subtree:
Event_element:
type: Dict
records:
Event:
parents:
- Event
Dataset:
Event: +$Event
subtree:
label_element:
type: DictTextElement
match_name: "label"
match_value: "(?P<label>.*)"
records:
Event:
label: $label
comment_element:
type: DictTextElement
match_name: "comment"
match_value: "(?P<comment>.*)"
records:
Event:
comment: $comment
start_datetime_element:
type: DictTextElement
match_name: start_datetime
match_value: "(?P<start_datetime>.*)"
records:
Event:
start_datetime: $start_datetime
end_datetime_element:
type: DictTextElement
match_name: end_datetime
match_value: "(?P<end_datetime>.*)"
records:
Event:
end_datetime: $end_datetime
longitude_element:
type: DictFloatElement
match_name: "longitude"
match_value: "(?P<longitude>.*)"
records:
Event:
longitude: $longitude
latitude_element:
type: DictFloatElement
match_name: "latitude"
match_value: "(?P<latitude>.*)"
records:
Event:
latitude: $latitude
elevation_element:
type: DictFloatElement
match_name: "elevation"
match_value: "(?P<elevation>.*)"
records:
Event:
elevation: $elevation
location_element:
type: DictTextElement
match_name: location
match_value: "(?P<location>.*)"
records:
Event:
location: $location
igsn_element:
type: DictTextElement
match_name: igsn
match_value: "(?P<igsn>.*)"
records:
Event:
igsn: $igsn
events_in_data_element:
type: DictBooleanElement
match_name: "events_in_data"
match_value: "(?P<events_in_data>.*)"
records:
Dataset:
events_in_data: $events_in_data
geojson_element:
type: DictTextElement
match_name: "geojson"
match_value: "(?P<geojson>.*)"
records:
Dataset:
geojson: $geojson
project_element:
type: DictDictElement
match_name: "project"
records:
Project:
parents:
- Project
Dataset:
Project: $Project
subtree:
full_name_element:
type: DictTextElement
match_name: "full_name"
match_value: "(?P<full_name>.*)"
records:
Project:
full_name: $full_name
project_id_element:
type: DictTextElement
match_name: "project_id"
match_value: "(?P<project_id>.*)"
records:
Project:
project_id: $project_id
project_type_element:
type: DictTextElement
match_name: "project_type"
match_value: "(?P<project_type_name>.*)"
records:
project_type:
name: $project_type_name
Project:
project_type: $project_type
institute_element:
type: DictTextElement
match_name: "institute"
match_value: "(?P<institute>.*)"
records:
Project:
institute: $institute
start_date_element:
type: DictTextElement
match_name: "start_date"
match_value: "(?P<start_date>.*)"
records:
Project:
start_date: $start_date
end_date_element:
type: DictTextElement
match_name: "end_date"
match_value: "(?P<end_date>.*)"
records:
Project:
end_date: $end_date
url_element:
type: DictTextElement
match_name: "url"
match_value: "(?P<url>.*)"
records:
Project:
url: $url
coordinators_element:
type: DictListElement
match_name: "coordinators"
subtree:
coordinator_element:
type: Dict
records:
Person:
parents:
- Person
Project:
coordinators: +$Person
subtree: *person_subtree
campaign_element:
type: DictDictElement
match_name: "campaign"
records:
Campaign:
parents:
- Campaign
Dataset:
Campaign: $Campaign
subtree:
label_element:
type: DictTextElement
match_name: "label"
match_value: "(?P<label>.*)"
records:
Campaign:
label: $label
optional_label_element:
type: DictTextElement
match_name: "optional_label"
match_value: "(?P<optional_label>.*)"
records:
Campaign:
optional_label: $optional_label
start_date_element:
type: DictTextElement
match_name: "start_date"
match_value: "(?P<start_date>.*)"
records:
Campaign:
start_date: $start_date
end_date_element:
type: DictTextElement
match_name: "end_date"
match_value: "(?P<end_date>.*)"
records:
Campaign:
end_date: $end_date
responsible_scientists_element:
type: DictListElement
match_name: "responsible_scientists"
subtree:
responsible_scientist_element:
type: Dict
records:
Person:
parents:
- Person
Campaign:
responsible_scientists: +$Person
subtree: *person_subtree
Methods_element:
type: DictListElement
match_name: "Method"
subtree:
Method_element:
type: Dict
records:
Method:
parents:
- Method
Dataset:
Method: +$Method
subtree:
method_name_element:
type: DictTextElement
match_name: "method_name"
match_value: "(?P<method_name>.*)"
records:
Method:
name: $method_name
abbreviation_element:
type: DictTextElement
match_name: "abbreviation"
match_value: "(?P<abbreviation>.*)"
records:
Method:
abbreviation: $abbreviation
url_element:
type: DictTextElement
match_name: "url"
match_value: "(?P<url>.*)"
records:
Method:
url: $url
Taxa_element:
type: DictListElement
match_name: "Taxon"
subtree:
Taxon_element:
type: Dict
records:
Taxon:
parents:
- Taxon
Dataset:
Taxon: +$Taxon
subtree:
taxon_name_element:
type: DictTextElement
match_name: "taxon_name"
match_value: "(?P<taxon_name>.*)"
records:
Taxon:
name: $taxon_name
archived_element:
type: DictBooleanElement
match_name: "archived"
match_value: "(?P<archived>.*)"
records:
Dataset:
archived: $archived
publication_date_element:
type: DictTextElement
match_name: "publication_date"
match_value: "(?P<publication_date>.*)"
records:
Dataset:
publication_date: $publication_date
max_files_element:
type: DictIntegerElement
match_name: "max_files"
match_value: "(?P<max_files>.*)"
records:
Dataset:
max_files: $max_files
auxiliary_file: &aux_file_template
type: File
match: "(?P<aux_file_name>(?!metadata.json).*)"
# TODO File, path and reference dataset in file record
child_dataset_dir:
type: Directory
match: (?P<child_dataset_dir_name>.*)
subtree:
metadata_json: *metadata_json_template
auxiliary_file: *aux_file_template
data_processing_dir:
type: Directory
match: 04_data_processing
subtree: *template
results_dir:
type: Directory
match: 05_results
subtree: *template
#!/usr/bin/env python3
# encoding: utf-8
#
# This file is a part of the CaosDB Project.
#
# Copyright (C) 2022 Indiscale GmbH <info@indiscale.com>
# Copyright (C) 2022 Henrik tom Wörden <h.tomwoerden@indiscale.com>
# Copyright (C) 2022 Florian Spreckelsen <f.spreckelsen@indiscale.com>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
#
import sys
from caosadvancedtools.models.parser import parse_model_from_json_schema, parse_model_from_yaml
def main():
    """Synchronize the CaosDB data model with the schema definitions.

    The two JSON-schema models (dataspace and dataset) are synced first so
    that their record types exist before the inheritance relations from the
    yaml definition are layered on top of them.

    Returns
    -------
    None
        Passed to ``sys.exit``, i.e. exit status 0 on success.
    """
    # Order matters: the yaml inheritance file references record types
    # that are created by the two JSON schemata.
    model_sources = [
        (parse_model_from_json_schema, "schema/dataspace.schema.json"),
        (parse_model_from_json_schema, "schema/dataset.schema.json"),
        (parse_model_from_yaml, "schema/dataset-inheritance.yml"),
    ]
    for parse, path in model_sources:
        definitions = parse(path)
        definitions.sync_data_model(noquestion=True)


if __name__ == "__main__":
    sys.exit(main())
extern:
- Keyword
- Taxon
- full_name
- full_name_nonlatin
- name
full_name:
inherit_from_obligatory:
- name
full_name_nonlatin:
inherit_from_obligatory:
- name
Taxon:
inherit_from_obligatory:
- Keyword
{
"title": "Dataset",
"description": "",
"type": "object",
"properties": {
"title": {
"type": "string",
"description": "full dataset title"
},
"authors": {
"type": "array",
"items": {
"type": "object",
"title": "Person",
"properties": {
"full_name": {
"type": "string",
"description": "Full name (latin transcription, all UFT-8 characters allowed)"
},
"full_name_nonlatin": {
"type": "string",
"description": "Full name (non-latin alphabet)"
},
"family_name": {
"type": "string",
"description": "Family name (latin transcription)"
},
"given_name": {
"type": "string",
"description": "Given/other names (latin transcription)"
},
"affiliation": {
"type": "string"
},
"ORCID": {
"type": "string",
"description": "ORCID identifier as 16-digit number, e.g. 0000-0001-6233-1866",
"pattern": "^\\d{4}-\\d{4}-\\d{4}-\\d{4}$"
},
"email": {
"type": "string",
"format": "email"
}
},
"required": [
"full_name",
"email"
]
},
"minItems": 1,
"uniqueItems": true
},
"abstract": {
"type": "string",
"minLength": 80,
"maxLength": 1000,
"description": "Abstract with at least 80 characters"
},
"comment": {
"type": "string"
},
"license": {
"type": "string",
"enum": [
"CC-BY",
"CC-BY-SA",
"CC0",
"restricted access"
]
},
"dataset_doi": {
"type": "string",
"pattern": "(10[.][0-9]{4,}(?:[.][0-9]+)*/(?:(?![%\"#? ])\\S)+)",
"description": "Dataset DOI, e.g. 10.1594/PANGAEA.938740"
},
"related_to_dois": {
"type": "array",
"items": {
"type": "string",
"pattern": "(10[.][0-9]{4,}(?:[.][0-9]+)*/(?:(?![%\"#? ])\\S)+)"
},
"description": "DOIs of related publications and/or datasets, e.g. 10.1000/182"
},
"Keyword": {
"type": "array",
"items": {
"type": "object",
"properties": {
"name": {
"type": "string"
}
}
}
},
"Event": {
"type": "array",
"description": "https://wiki.pangaea.de/wiki/Event",
"items": {
"type": "object",
"properties": {
"label": {
"type": "string"
},
"comment": {
"type": "string"
},
"start_datetime": {
"type": "string",
"format": "date-time"
},
"end_datetime": {
"type": "string",
"format": "date-time"
},
"longitude": {
"type": "number",
"minimum": -180,
"maximum": 180,
"description": "longitude (W/E) in decimal degree (-180 to 180)"
},
"latitude": {
"type": "number",
"minimum": -90,
"maximum": 90,
"description": "latitude (N/S) in decimal degree (-90 to 90)"
},
"elevation": {
"type": "number",
"minimum": -10000,
"maximum": 20000,
"description": "elevation in m"
},
"location": {
"type": "string",
"description": "geographical location as text (e.g., North Sea; Espoo, Finland)"
},
"igsn": {
"type": "string",
"description": "International Geo Sample Number (http://www.geosamples.org/aboutigsn)"
}
},
"required": [
"longitude",
"latitude",
"start_date"
]
}
},
"events_in_data": {
"type": "boolean",
"description": "Does the data contain additional information about timepoints and locations?"
},
"geojson": {
"type": "string",
"pattern": "",
"description": "GeoJSON for complex geographic structures"
},
"project": {
"title": "Project",
"description": "https://wiki.pangaea.de/wiki/Project",
"type": "object",
"properties": {
"name": {
"type": "string",
"description": "short name of project"
},
"full_name": {
"type": "string",
"description": "Full name (latin transcription, all UTF-8 characters allowed)"
},
"project_id": {
"type": "string",
"description": "Project ID"
},
"project_type": {
"type": "string",
"enum": [
"DFG",
"EU",
"BMBF",
"national",
"international"
]
},
"institute": {
"type": "string",
"description": "place of coordination or project office",
"default": "Centre for Research"
},
"start_date": {
"type": "string",
"format": "date"
},
"end_date": {
"type": "string",
"format": "date"
},
"url": {
"type": "string",
"format": "uri"
},
"coordinators": {
"type": "array",
"items": {
"type": "object",
"title": "Person",
"properties": {
"full_name": {
"type": "string",
"description": "Full name (latin transcription, all UTF-8 characters allowed)"
},
"full_name_nonlatin": {
"type": "string",
"description": "Full name (non-latin alphabet)"
},
"family_name": {
"type": "string",
"description": "Family name (latin transcription)"
},
"given_name": {
"type": "string",
"description": "Given/other names (latin transcription)"
},
"affiliation": {
"type": "string"
},
"ORCID": {
"type": "string",
"description": "ORCID identifier as 16-digit number, e.g. 0000-0001-6233-1866",
"pattern": "^\\d{4}-\\d{4}-\\d{4}-\\d{4}$"
},
"email": {
"type": "string",
"format": "email"
}
},
"required": [
"full_name",
"email"
]
},
"minItems": 1,
"uniqueItems": true
}
},
"required": ["name", "full_name"]
},
"campaign": {
"title": "Campaign",
"description": "https://wiki.pangaea.de/wiki/Campaign, synonyms: cruise, expedition, leg, ",
"type": "object",
"properties": {
"label": {
"type": "string",
"description": "is unique and does not contain blanks; uses abbreviations instead of full names"
},
"optional_label": {
"type": "string"
},
"start_date": {
"type": "string",
"format": "date"
},
"end_date": {
"type": "string",
"format": "date"
},
"responsible_scientists": {
"type": "array",
"items": {
"type": "object",
"title": "Person",
"properties": {
"full_name": {
"type": "string",
"description": "Full name (latin transcription, all UFT-8 characters allowed)"
},
"full_name_nonlatin": {
"type": "string",
"description": "Full name (non-latin alphabet)"
},
"family_name": {
"type": "string",
"description": "Family name (latin transcription)"
},
"given_name": {
"type": "string",
"description": "Given/other names (latin transcription)"
},
"affiliation": {
"type": "string"
},
"ORCID": {
"type": "string",
"description": "ORCID identifier as 16-digit number, e.g. 0000-0001-6233-1866",
"pattern": "^\\d{4}-\\d{4}-\\d{4}-\\d{4}$"
},
"email": {
"type": "string",
"format": "email"
}
},
"required": [
"full_name",
"email"
]
},
"minItems": 1,
"uniqueItems": true
}
}
},
"Method": {
"type": "array",
"items": {
"type": "object",
"description": "https://wiki.pangaea.de/wiki/Method",
"properties": {
"method_name": {
"type": "string",
"description": "full official name of tool/instrument/device/gear"
},
"abbreviation": {
"type": "string",
"description": "may be used for import in an event list to avoid misspellings"
},
"url": {
"type": "string",
"description": "should contain a web address, where an official description of the device can be found"
}
}
}
},
"Taxon": {
"type": "array",
"items": {
"type": "object",
"properties": {
"name": {
"type": "string"
}
}
}
},
"archived": {
"type": "boolean",
"description": "Has the dataset been archived?",
"default": false
},
"publication_date": {
"type": "string",
"format": "date"
},
"max_files": {
"type": "integer",
"description": "Maximum number of files to included by the CaosDB crawler",
"default": 100
}
},
"required": [
"title",
"authors",
"abstract"
]
}
{
"title": "Dataspace",
"description": "A Dataspace is a folder in the DataCloud with a pre-defined structure",
"type": "object",
"properties": {
"dataspace_id": {
"type": "integer",
"description": "Integer ID of Dataspace (matches LDAP GID)",
"minimum": 20000
},
"archived": { "type": "boolean" },
"url": {
"type": "string",
"description": "link to folder on file system (CaosDB or cloud folder)"
},
"coordinator": {
"type": "object",
"title": "Person",
"properties": {
"full_name": {
"type": "string",
"description": "Full name (latin transcription, all UFT-8 characters allowed)"
},
"full_name_nonlatin": {
"type": "string",
"description": "Full name (non-latin alphabet)"
},
"family_name": {
"type": "string",
"description": "Family name (latin transcription)"
},
"given_name": {
"type": "string",
"description": "Given/other names (latin transcription)"
},
"email": { "type": "string", "format": "email" }
},
"required": ["full_name", "email"]
},
"start_date": { "type": "string", "format": "date" },
"end_date": { "type": "string", "format": "date" },
"comment": { "type": "string" }
},
"required": ["dataspace_id", "url", "coordinator"]
}
extern:
- name
- url
- Dataset
german_name:
datatype: TEXT
inherit_from_obligatory:
- name
Department:
recommended_properties:
url:
german_name:
WorkingGroup:
recommended_properties:
Department:
german_name:
url:
Dataset:
recommended_properties:
WorkingGroup:
#!/usr/bin/env python3
# encoding: utf-8
#
# This file is a part of the CaosDB Project.
#
# Copyright (C) 2022 Indiscale GmbH <info@indiscale.com>
# Copyright (C) 2022 Henrik tom Wörden <h.tomwoerden@indiscale.com>
# Copyright (C) 2022 Florian Spreckelsen <f.spreckelsen@indiscale.com>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
#
"""
module description
"""
import json
import os
import caosdb as db
from newcrawler.crawl import Crawler
from newcrawler.converters import JSONFileConverter, DictConverter
from newcrawler.identifiable_adapters import CaosDBIdentifiableAdapter
from newcrawler.structure_elements import File, JSONFile, Directory
def test_dataset():
    """Crawl the example ``data`` directory and synchronize the results.

    Registers identifiables for ``license``, ``project_type`` and ``Person``,
    runs the crawler with the dataset cfood definition and pushes the
    resulting records to the connected CaosDB server.
    """
    crawler_definition_path = "./dataset_cfoods.yml"

    # Records of these types are identified by their name / full_name
    # instead of by a server-side id.
    ident = CaosDBIdentifiableAdapter()
    ident.register_identifiable(
        "license", db.RecordType().add_parent("license").add_property("name"))
    ident.register_identifiable("project_type", db.RecordType(
    ).add_parent("project_type").add_property("name"))
    ident.register_identifiable("Person", db.RecordType(
    ).add_parent("Person").add_property("full_name"))

    crawler = Crawler(debug=True, identifiableAdapter=ident)
    crawler_definition = crawler.load_definition(crawler_definition_path)

    # Load and register converter packages:
    converter_registry = crawler.load_converters(crawler_definition)

    records = crawler.start_crawling(
        Directory('data', "data"),
        crawler_definition,
        converter_registry
    )

    # Debug information collected during the crawl; kept around so it can
    # be inspected in a debugger or extended assertions later.
    subd = crawler.debug_tree
    subc = crawler.debug_metadata

    ins, ups = crawler.synchronize()


if __name__ == "__main__":
    test_dataset()
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment