Skip to content
Snippets Groups Projects
Commit 7a178ca1 authored by Henrik tom Wörden's avatar Henrik tom Wörden
Browse files

init int test

parent b325d6fb
No related branches found
No related tags found
2 merge requests!53Release 0.1,!18Add integrationtests based on a real world example
# This file is a part of the CaosDB Project.
#
# Copyright (C) 2022 IndiScale GmbH <info@indiscale.com>
# Copyright (C) 2022 Florian Spreckelsen <f.spreckelsen@indiscale.com>
#
# This program is free software: you can redistribute it and/or modify it under
# the terms of the GNU Affero General Public License as published by the Free
# Software Foundation, either version 3 of the License, or (at your option) any
# later version.
#
# This program is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
# details.
#
# You should have received a copy of the GNU Affero General Public License along
# with this program. If not, see <https://www.gnu.org/licenses/>.
#
Data:
type: Directory
match: data
subtree:
dataspace_dir:
type: Directory
match: (?P<dataspace_dir_number>[0-9]+)
records:
Dataspace:
name: $dataspace_dir_number
subtree:
dataspace_json:
type: JSONFile
match: .dataspace.json
validate: schema/dataspace.schema.json
subtree:
dataspace_id_element:
type: DictIntegerElement
match_name: "dataspace_id"
match_value: "(?P<id>[0-9]+)"
records:
Dataspace:
dataspace_id: $id
archived_element:
type: DictBooleanElement
match_name: "archived"
match_value: "(?P<archived>.*)"
records:
Dataspace:
archived: $archived
url_element:
type: DictTextElement
match_name: "url"
match_value: "(?P<url>.*)"
records:
Dataspace:
url: $url
coordinator_element:
type: DictDictElement
match_name: "coordinator"
records:
Person:
parents:
- Person
Dataspace:
Person: $Person
subtree: &person_subtree
full_name_element:
type: DictTextElement
match_name: "full_name"
match_value: "(?P<full_name>.*)"
records:
Person:
full_name: $full_name
full_name_nonlatin_element:
type: DictTextElement
match_name: "full_name_nonlatin"
match_value: "(?P<full_name_nonlatin>.*)"
records:
Person:
full_name_nonlatin: $full_name_nonlatin
family_name_element:
type: DictTextElement
match_name: "family_name"
match_value: "(?P<family_name>.*)"
records:
Person:
family_name: $family_name
given_name_element:
type: DictTextElement
match_name: "given_name"
match_value: "(?P<given_name>.*)"
records:
Person:
given_name: $given_name
email_element:
type: DictTextElement
match_name: "email"
match_value: "(?P<email>.*)"
records:
Person:
email: $email
affiliation_element:
type: DictTextElement
match_name: "affiliation"
match_value: "(?P<affiliation>.*)"
records:
Person:
affiliation: $affiliation
ORCID_element:
type: DictTextElement
match_name: "ORCID"
match_value: "(?P<ORCID>.*)"
records:
Person:
ORCID: $ORCID
start_date_element:
type: DictTextElement
match_name: "start_date"
match_value: "(?P<start_date>.*)"
records:
Dataspace:
start_date: $start_date
end_date_element:
type: DictTextElement
match_name: "end_date"
match_value: "(?P<end_date>.*)"
records:
Dataspace:
end_date: $end_date
comment:
type: DictTextElement
match_name: "comment"
match_value: "(?P<comment>.*)"
records:
Dataspace:
comment: $comment
raw_data_dir:
type: Directory
match: 03_raw_data
subtree: &template
# TODO collect info from metadata.json and look into sub-directories
# (only one level) for metadata.json
dataset_dir:
match: (?P<dataset_dir_name>.*)
type: Directory
records:
Dataset:
Dataspace: $Dataspace
subtree:
metadata_json: &metadata_json_template
type: JSONFile
match: metadata.json
validate: schema/dataset.schema.json
subtree:
title_element:
type: DictTextElement
match_name: "title"
match_value: "(?P<title>.*)"
records:
Dataset:
name: $title
authors_element:
type: DictListElement
match_name: "authors"
subtree:
author_element:
type: Dict
records:
Person:
parents:
- Person
Dataset:
authors: +$Person
subtree: *person_subtree
abstract_element:
type: DictTextElement
match_name: "abstract"
match_value: "(?P<abstract>.*)"
records:
Dataset:
abstract: $abstract
comment_element:
type: DictTextElement
match_name: "comment"
match_value: "(?P<comment>.*)"
records:
Dataset:
comment: $comment
license_element:
type: DictTextElement
match_name: "license"
match_value: "(?P<license_name>.*)"
records:
license:
# TODO: As soon as such things can be validated, a
# creation of a new license has to be forbidden here
# (although this is effectively done already by
# validating against the above schema.)
name: $license_name
Dataset:
license: $license
dataset_doi_element:
type: DictTextElement
match_name: "dataset_doi"
match_value: "(?P<dataset_doi>.*)"
records:
Dataset:
dataset_doi: $dataset_doi
related_to_dois_element:
type: DictListElement
match_name: "related_to_dois"
subtree:
related_to_doi_element:
  type: TextElement
  # The wildcard has to be inside the named group; with it outside, the
  # group captures an empty string and no DOI value is appended.
  match: "(?P<related_to_doi>.*)"
  records:
    Dataset:
      related_to_dois: +$related_to_doi
Keywords_element:
type: DictListElement
match_name: "Keyword"
Events_element:
type: DictListElement
match_name: "Event"
subtree:
Event_element:
type: Dict
records:
Event:
parents:
- Event
Dataset:
Event: +$Event
subtree:
label_element:
type: DictTextElement
match_name: "label"
match_value: "(?P<label>.*)"
records:
Event:
label: $label
comment_element:
type: DictTextElement
match_name: "comment"
match_value: "(?P<comment>.*)"
records:
Event:
comment: $comment
start_datetime_element:
type: DictTextElement
match_name: start_datetime
match_value: "(?P<start_datetime>.*)"
records:
Event:
start_datetime: $start_datetime
end_datetime_element:
type: DictTextElement
match_name: end_datetime
match_value: "(?P<end_datetime>.*)"
records:
Event:
end_datetime: $end_datetime
longitude_element:
type: DictFloatElement
match_name: "longitude"
match_value: "(?P<longitude>.*)"
records:
Event:
longitude: $longitude
latitude_element:
type: DictFloatElement
match_name: "latitude"
match_value: "(?P<latitude>.*)"
records:
Event:
latitude: $latitude
elevation_element:
type: DictFloatElement
match_name: "elevation"
match_value: "(?P<elevation>.*)"
records:
Event:
elevation: $elevation
location_element:
type: DictTextElement
match_name: location
match_value: "(?P<location>.*)"
records:
Event:
location: $location
igsn_element:
type: DictTextElement
match_name: igsn
match_value: "(?P<igsn>.*)"
records:
Event:
igsn: $igsn
events_in_data_element:
type: DictBooleanElement
match_name: "events_in_data"
match_value: "(?P<events_in_data>.*)"
records:
Dataset:
events_in_data: $events_in_data
geojson_element:
type: DictTextElement
match_name: "geojson"
match_value: "(?P<geojson>.*)"
records:
Dataset:
geojson: $geojson
project_element:
type: DictDictElement
match_name: "project"
records:
Project:
parents:
- Project
Dataset:
Project: $Project
subtree:
full_name_element:
type: DictTextElement
match_name: "full_name"
match_value: "(?P<full_name>.*)"
records:
Project:
full_name: $full_name
project_id_element:
type: DictTextElement
match_name: "project_id"
match_value: "(?P<project_id>.*)"
records:
Project:
project_id: $project_id
project_type_element:
type: DictTextElement
match_name: "project_type"
match_value: "(?P<project_type_name>.*)"
records:
project_type:
name: $project_type_name
Project:
project_type: $project_type
institute_element:
type: DictTextElement
match_name: "institute"
match_value: "(?P<institute>.*)"
records:
Project:
institute: $institute
start_date_element:
type: DictTextElement
match_name: "start_date"
match_value: "(?P<start_date>.*)"
records:
Project:
start_date: $start_date
end_date_element:
type: DictTextElement
match_name: "end_date"
match_value: "(?P<end_date>.*)"
records:
Project:
end_date: $end_date
url_element:
type: DictTextElement
match_name: "url"
match_value: "(?P<url>.*)"
records:
Project:
url: $url
coordinators_element:
type: DictListElement
match_name: "coordinators"
subtree:
coordinator_element:
type: Dict
records:
Person:
parents:
- Person
Project:
coordinators: +$Person
subtree: *person_subtree
campaign_element:
type: DictDictElement
match_name: "campaign"
records:
Campaign:
parents:
- Campaign
Dataset:
Campaign: $Campaign
subtree:
label_element:
type: DictTextElement
match_name: "label"
match_value: "(?P<label>.*)"
records:
Campaign:
label: $label
optional_label_element:
type: DictTextElement
match_name: "optional_label"
match_value: "(?P<optional_label>.*)"
records:
Campaign:
optional_label: $optional_label
start_date_element:
type: DictTextElement
match_name: "start_date"
match_value: "(?P<start_date>.*)"
records:
Campaign:
start_date: $start_date
end_date_element:
type: DictTextElement
match_name: "end_date"
match_value: "(?P<end_date>.*)"
records:
Campaign:
end_date: $end_date
responsible_scientists_element:
type: DictListElement
match_name: "responsible_scientists"
subtree:
responsible_scientist_element:
type: Dict
records:
Person:
parents:
- Person
Campaign:
responsible_scientists: +$Person
subtree: *person_subtree
Methods_element:
type: DictListElement
match_name: "Method"
subtree:
Method_element:
type: Dict
records:
Method:
parents:
- Method
Dataset:
Method: +$Method
subtree:
method_name_element:
type: DictTextElement
match_name: "method_name"
match_value: "(?P<method_name>.*)"
records:
Method:
name: $method_name
abbreviation_element:
type: DictTextElement
match_name: "abbreviation"
match_value: "(?P<abbreviation>.*)"
records:
Method:
abbreviation: $abbreviation
url_element:
type: DictTextElement
match_name: "url"
match_value: "(?P<url>.*)"
records:
Method:
url: $url
Taxa_element:
type: DictListElement
match_name: "Taxon"
subtree:
Taxon_element:
type: Dict
records:
Taxon:
parents:
- Taxon
Dataset:
Taxon: +$Taxon
subtree:
taxon_name_element:
type: DictTextElement
match_name: "taxon_name"
match_value: "(?P<taxon_name>.*)"
records:
Taxon:
name: $taxon_name
archived_element:
  type: DictBooleanElement
  match_name: "archived"
  # Named groups need the "(?P<name>...)" form; without the "?" the
  # pattern only matches the literal text "P<archived>" and $archived
  # is never defined.
  match_value: "(?P<archived>.*)"
  records:
    Dataset:
      archived: $archived
publication_date_element:
  type: DictTextElement
  match_name: "publication_date"
  # Named groups need the "(?P<name>...)" form; without the "?" the
  # pattern only matches the literal text "P<publication_date>" and
  # $publication_date is never defined.
  match_value: "(?P<publication_date>.*)"
  records:
    Dataset:
      publication_date: $publication_date
max_files_element:
  type: DictIntegerElement
  match_name: "max_files"
  # Named groups need the "(?P<name>...)" form; without the "?" the
  # pattern only matches the literal text "P<max_files>" and $max_files
  # is never defined.
  match_value: "(?P<max_files>.*)"
  records:
    Dataset:
      max_files: $max_files
auxiliary_file: &aux_file_template
type: File
match: "(?P<aux_file_name>(?!metadata.json).*)"
# TODO File, path and reference dataset in file record
child_dataset_dir:
type: Directory
match: (?P<child_dataset_dir_name>.*)
subtree:
metadata_json: *metadata_json_template
auxiliary_file: *aux_file_template
data_processing_dir:
type: Directory
match: 04_data_processing
subtree: *template
results_dir:
type: Directory
match: 05_results
subtree: *template
#!/usr/bin/env python3
# encoding: utf-8
#
# This file is a part of the CaosDB Project.
#
# Copyright (C) 2022 Indiscale GmbH <info@indiscale.com>
# Copyright (C) 2022 Henrik tom Wörden <h.tomwoerden@indiscale.com>
# Copyright (C) 2022 Florian Spreckelsen <f.spreckelsen@indiscale.com>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
#
import sys
from caosadvancedtools.models.parser import parse_model_from_json_schema, parse_model_from_yaml
def main():
    """Parse the example data models and synchronize them one by one.

    The models are handled in a fixed order: the dataspace JSON schema
    first, then the general dataset JSON schema, and finally the YAML file
    that adds the inheritances. Each parsed model is synchronized with
    ``noquestion=True`` before the next one is loaded.
    """
    model_sources = (
        (parse_model_from_json_schema, "schema/dataspace.schema.json"),
        (parse_model_from_json_schema, "schema/dataset.schema.json"),
        (parse_model_from_yaml, "schema/dataset-inheritance.yml"),
    )
    for parse_model, model_path in model_sources:
        parse_model(model_path).sync_data_model(noquestion=True)


if __name__ == "__main__":
    sys.exit(main())
extern:
- Keyword
- Taxon
- full_name
- full_name_nonlatin
- name
full_name:
inherit_from_obligatory:
- name
full_name_nonlatin:
inherit_from_obligatory:
- name
Taxon:
inherit_from_obligatory:
- Keyword
{
"title": "Dataset",
"description": "",
"type": "object",
"properties": {
"title": {
"type": "string",
"description": "full dataset title"
},
"authors": {
"type": "array",
"items": {
"type": "object",
"title": "Person",
"properties": {
"full_name": {
"type": "string",
"description": "Full name (latin transcription, all UFT-8 characters allowed)"
},
"full_name_nonlatin": {
"type": "string",
"description": "Full name (non-latin alphabet)"
},
"family_name": {
"type": "string",
"description": "Family name (latin transcription)"
},
"given_name": {
"type": "string",
"description": "Given/other names (latin transcription)"
},
"affiliation": {
"type": "string"
},
"ORCID": {
"type": "string",
"description": "ORCID identifier as 16-digit number, e.g. 0000-0001-6233-1866",
"pattern": "^\\d{4}-\\d{4}-\\d{4}-\\d{4}$"
},
"email": {
"type": "string",
"format": "email"
}
},
"required": [
"full_name",
"email"
]
},
"minItems": 1,
"uniqueItems": true
},
"abstract": {
"type": "string",
"minLength": 80,
"maxLength": 1000,
"description": "Abstract with at least 80 characters"
},
"comment": {
"type": "string"
},
"license": {
"type": "string",
"enum": [
"CC-BY",
"CC-BY-SA",
"CC0",
"restricted access"
]
},
"dataset_doi": {
"type": "string",
"pattern": "(10[.][0-9]{4,}(?:[.][0-9]+)*/(?:(?![%\"#? ])\\S)+)",
"description": "Dataset DOI, e.g. 10.1594/PANGAEA.938740"
},
"related_to_dois": {
"type": "array",
"items": {
"type": "string",
"pattern": "(10[.][0-9]{4,}(?:[.][0-9]+)*/(?:(?![%\"#? ])\\S)+)"
},
"description": "DOIs of related publications and/or datasets, e.g. 10.1000/182"
},
"Keyword": {
"type": "array",
"items": {
"type": "object",
"properties": {
"name": {
"type": "string"
}
}
}
},
"Event": {
"type": "array",
"description": "https://wiki.pangaea.de/wiki/Event",
"items": {
"type": "object",
"properties": {
"label": {
"type": "string"
},
"comment": {
"type": "string"
},
"start_datetime": {
"type": "string",
"format": "date-time"
},
"end_datetime": {
"type": "string",
"format": "date-time"
},
"longitude": {
"type": "number",
"minimum": -180,
"maximum": 180,
"description": "longitude (W/E) in decimal degree (-180 to 180)"
},
"latitude": {
"type": "number",
"minimum": -90,
"maximum": 90,
"description": "latitude (N/S) in decimal degree (-90 to 90)"
},
"elevation": {
"type": "number",
"minimum": -10000,
"maximum": 20000,
"description": "elevation in m"
},
"location": {
"type": "string",
"description": "geographical location as text (e.g., North Sea; Espoo, Finland)"
},
"igsn": {
"type": "string",
"description": "International Geo Sample Number (http://www.geosamples.org/aboutigsn)"
}
},
"required": [
"longitude",
"latitude",
"start_date"
]
}
},
"events_in_data": {
"type": "boolean",
"description": "Does the data contain additional information about timepoints and locations?"
},
"geojson": {
"type": "string",
"pattern": "",
"description": "GeoJSON for complex geographic structures"
},
"project": {
"title": "Project",
"description": "https://wiki.pangaea.de/wiki/Project",
"type": "object",
"properties": {
"name": {
"type": "string",
"description": "short name of project"
},
"full_name": {
"type": "string",
"description": "Full name (latin transcription, all UTF-8 characters allowed)"
},
"project_id": {
"type": "string",
"description": "Project ID"
},
"project_type": {
"type": "string",
"enum": [
"DFG",
"EU",
"BMBF",
"national",
"international"
]
},
"institute": {
"type": "string",
"description": "place of coordination or project office",
"default": "Centre for Research"
},
"start_date": {
"type": "string",
"format": "date"
},
"end_date": {
"type": "string",
"format": "date"
},
"url": {
"type": "string",
"format": "uri"
},
"coordinators": {
"type": "array",
"items": {
"type": "object",
"title": "Person",
"properties": {
"full_name": {
"type": "string",
"description": "Full name (latin transcription, all UTF-8 characters allowed)"
},
"full_name_nonlatin": {
"type": "string",
"description": "Full name (non-latin alphabet)"
},
"family_name": {
"type": "string",
"description": "Family name (latin transcription)"
},
"given_name": {
"type": "string",
"description": "Given/other names (latin transcription)"
},
"affiliation": {
"type": "string"
},
"ORCID": {
"type": "string",
"description": "ORCID identifier as 16-digit number, e.g. 0000-0001-6233-1866",
"pattern": "^\\d{4}-\\d{4}-\\d{4}-\\d{4}$"
},
"email": {
"type": "string",
"format": "email"
}
},
"required": [
"full_name",
"email"
]
},
"minItems": 1,
"uniqueItems": true
}
},
"required": ["name", "full_name"]
},
"campaign": {
"title": "Campaign",
"description": "https://wiki.pangaea.de/wiki/Campaign, synonyms: cruise, expedition, leg, ",
"type": "object",
"properties": {
"label": {
"type": "string",
"description": "is unique and does not contain blanks; uses abbreviations instead of full names"
},
"optional_label": {
"type": "string"
},
"start_date": {
"type": "string",
"format": "date"
},
"end_date": {
"type": "string",
"format": "date"
},
"responsible_scientists": {
"type": "array",
"items": {
"type": "object",
"title": "Person",
"properties": {
"full_name": {
"type": "string",
"description": "Full name (latin transcription, all UFT-8 characters allowed)"
},
"full_name_nonlatin": {
"type": "string",
"description": "Full name (non-latin alphabet)"
},
"family_name": {
"type": "string",
"description": "Family name (latin transcription)"
},
"given_name": {
"type": "string",
"description": "Given/other names (latin transcription)"
},
"affiliation": {
"type": "string"
},
"ORCID": {
"type": "string",
"description": "ORCID identifier as 16-digit number, e.g. 0000-0001-6233-1866",
"pattern": "^\\d{4}-\\d{4}-\\d{4}-\\d{4}$"
},
"email": {
"type": "string",
"format": "email"
}
},
"required": [
"full_name",
"email"
]
},
"minItems": 1,
"uniqueItems": true
}
}
},
"Method": {
"type": "array",
"items": {
"type": "object",
"description": "https://wiki.pangaea.de/wiki/Method",
"properties": {
"method_name": {
"type": "string",
"description": "full official name of tool/instrument/device/gear"
},
"abbreviation": {
"type": "string",
"description": "may be used for import in an event list to avoid misspellings"
},
"url": {
"type": "string",
"description": "should contain a web address, where an official description of the device can be found"
}
}
}
},
"Taxon": {
"type": "array",
"items": {
"type": "object",
"properties": {
"name": {
"type": "string"
}
}
}
},
"archived": {
"type": "boolean",
"description": "Has the dataset been archived?",
"default": false
},
"publication_date": {
"type": "string",
"format": "date"
},
"max_files": {
"type": "integer",
"description": "Maximum number of files to included by the CaosDB crawler",
"default": 100
}
},
"required": [
"title",
"authors",
"abstract"
]
}
{
"title": "Dataspace",
"description": "A Dataspace is a folder in the DataCloud with a pre-defined structure",
"type": "object",
"properties": {
"dataspace_id": {
"type": "integer",
"description": "Integer ID of Dataspace (matches LDAP GID)",
"minimum": 20000
},
"archived": { "type": "boolean" },
"url": {
"type": "string",
"description": "link to folder on file system (CaosDB or cloud folder)"
},
"coordinator": {
"type": "object",
"title": "Person",
"properties": {
"full_name": {
"type": "string",
"description": "Full name (latin transcription, all UFT-8 characters allowed)"
},
"full_name_nonlatin": {
"type": "string",
"description": "Full name (non-latin alphabet)"
},
"family_name": {
"type": "string",
"description": "Family name (latin transcription)"
},
"given_name": {
"type": "string",
"description": "Given/other names (latin transcription)"
},
"email": { "type": "string", "format": "email" }
},
"required": ["full_name", "email"]
},
"start_date": { "type": "string", "format": "date" },
"end_date": { "type": "string", "format": "date" },
"comment": { "type": "string" }
},
"required": ["dataspace_id", "url", "coordinator"]
}
extern:
- name
- url
- Dataset
german_name:
datatype: TEXT
inherit_from_obligatory:
- name
Department:
recommended_properties:
url:
german_name:
WorkingGroup:
recommended_properties:
Department:
german_name:
url:
Dataset:
recommended_properties:
WorkingGroup:
#!/usr/bin/env python3
# encoding: utf-8
#
# This file is a part of the CaosDB Project.
#
# Copyright (C) 2022 Indiscale GmbH <info@indiscale.com>
# Copyright (C) 2022 Henrik tom Wörden <h.tomwoerden@indiscale.com>
# Copyright (C) 2022 Florian Spreckelsen <f.spreckelsen@indiscale.com>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
#
"""
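Integration test that crawls the example ``data`` directory using the
crawler definition in ``dataset_cfoods.yml`` and synchronizes the result.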
"""
import json
import os
import caosdb as db
from newcrawler.crawl import Crawler
from newcrawler.converters import JSONFileConverter, DictConverter
from newcrawler.identifiable_adapters import CaosDBIdentifiableAdapter
from newcrawler.structure_elements import File, JSONFile, Directory
def test_dataset():
    """Crawl the ``data`` directory and synchronize the crawled records.

    An identifiable is registered for ``license``, ``project_type`` and
    ``Person``. The crawler definition is then loaded from
    ``./dataset_cfoods.yml``, its converters are loaded, the ``data``
    directory is crawled and ``synchronize()`` is called.
    """
    cfood_path = "./dataset_cfoods.yml"

    # (record type, property) pairs used to build the identifiables.
    identifying_properties = (
        ("license", "name"),
        ("project_type", "name"),
        ("Person", "full_name"),
    )
    adapter = CaosDBIdentifiableAdapter()
    for type_name, property_name in identifying_properties:
        identifiable = db.RecordType().add_parent(
            type_name).add_property(property_name)
        adapter.register_identifiable(type_name, identifiable)

    crawler = Crawler(debug=True, identifiableAdapter=adapter)
    definition = crawler.load_definition(cfood_path)

    # Load and register converter packages:
    registry = crawler.load_converters(definition)

    root_directory = Directory('data', "data")
    records = crawler.start_crawling(root_directory, definition, registry)

    # Not asserted on yet; kept at hand for inspection while debugging.
    debug_tree = crawler.debug_tree
    debug_metadata = crawler.debug_metadata

    ins, ups = crawler.synchronize()


if __name__ == "__main__":
    test_dataset()
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment