diff --git a/integrationtests/dataset_cfoods.yml b/integrationtests/dataset_cfoods.yml new file mode 100644 index 0000000000000000000000000000000000000000..5b5e08a5421e4117e2949e65a9e42da8e74eb10c --- /dev/null +++ b/integrationtests/dataset_cfoods.yml @@ -0,0 +1,528 @@ +# This file is a part of the CaosDB Project. +# +# Copyright (C) 2022 IndiScale GmbH <info@indiscale.com> +# Copyright (C) 2022 Florian Spreckelsen <f.spreckelsen@indiscale.com> +# +# This program is free software: you can redistribute it and/or modify it under +# the terms of the GNU Affero General Public License as published by the Free +# Software Foundation, either version 3 of the License, or (at your option) any +# later version. +# +# This program is distributed in the hope that it will be useful, but WITHOUT +# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +# FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more +# details. +# +# You should have received a copy of the GNU Affero General Public License along +# with this program. If not, see <https://www.gnu.org/licenses/>. 
+# +Data: + type: Directory + match: data + subtree: + dataspace_dir: + type: Directory + match: (?P<dataspace_dir_number>[0-9]+) + records: + Dataspace: + name: $dataspace_dir_number + subtree: + dataspace_json: + type: JSONFile + match: .dataspace.json + validate: schema/dataspace.schema.json + subtree: + dataspace_id_element: + type: DictIntegerElement + match_name: "dataspace_id" + match_value: "(?P<id>[0-9]+)" + records: + Dataspace: + dataspace_id: $id + archived_element: + type: DictBooleanElement + match_name: "archived" + match_value: "(?P<archived>.*)" + records: + Dataspace: + archived: $archived + url_element: + type: DictTextElement + match_name: "url" + match_value: "(?P<url>.*)" + records: + Dataspace: + url: $url + coordinator_element: + type: DictDictElement + match_name: "coordinator" + records: + Person: + parents: + - Person + Dataspace: + Person: $Person + subtree: &person_subtree + full_name_element: + type: DictTextElement + match_name: "full_name" + match_value: "(?P<full_name>.*)" + records: + Person: + full_name: $full_name + full_name_nonlatin_element: + type: DictTextElement + match_name: "full_name_nonlatin" + match_value: "(?P<full_name_nonlatin>.*)" + records: + Person: + full_name_nonlatin: $full_name_nonlatin + family_name_element: + type: DictTextElement + match_name: "family_name" + match_value: "(?P<family_name>.*)" + records: + Person: + family_name: $family_name + given_name_element: + type: DictTextElement + match_name: "given_name" + match_value: "(?P<given_name>.*)" + records: + Person: + given_name: $given_name + email_element: + type: DictTextElement + match_name: "email" + match_value: "(?P<email>.*)" + records: + Person: + email: $email + affiliation_element: + type: DictTextElement + match_name: "affiliation" + match_value: "(?P<affiliation>.*)" + records: + Person: + affiliation: $affiliation + ORCID_element: + type: DictTextElement + match_name: "ORCID" + match_value: "(?P<ORCID>.*)" + records: + Person: + ORCID: 
$ORCID + start_date_element: + type: DictTextElement + match_name: "start_date" + match_value: "(?P<start_date>.*)" + records: + Dataspace: + start_date: $start_date + end_date_element: + type: DictTextElement + match_name: "end_date" + match_value: "(?P<end_date>.*)" + records: + Dataspace: + end_date: $end_date + comment: + type: DictTextElement + match_name: "comment" + match_value: "(?P<comment>.*)" + records: + Dataspace: + comment: $comment + raw_data_dir: + type: Directory + match: 03_raw_data + subtree: &template + # TODO collect info from metadata.json and look into sub-directories + # (only one level) for metadata.json + dataset_dir: + match: (?P<dataset_dir_name>.*) + type: Directory + records: + Dataset: + Dataspace: $Dataspace + subtree: + metadata_json: &metadata_json_template + type: JSONFile + match: metadata.json + validate: schema/dataset.schema.json + subtree: + title_element: + type: DictTextElement + match_name: "title" + match_value: "(?P<title>.*)" + records: + Dataset: + name: $title + authors_element: + type: DictListElement + match_name: "authors" + subtree: + author_element: + type: Dict + records: + Person: + parents: + - Person + Dataset: + authors: +$Person + subtree: *person_subtree + abstract_element: + type: DictTextElement + match_name: "abstract" + match_value: "(?P<abstract>.*)" + records: + Dataset: + abstract: $abstract + comment_element: + type: DictTextElement + match_name: "comment" + match_value: "(?P<comment>.*)" + records: + Dataset: + comment: $comment + license_element: + type: DictTextElement + match_name: "license" + match_value: "(?P<license_name>.*)" + records: + license: + # TODO: As soon as such things can be validated, a + # creation of a new license has to be forbidden here + # (although this is effectively done already by + # validating against the above schema.) 
+ name: $license_name + Dataset: + license: $license + dataset_doi_element: + type: DictTextElement + match_name: "dataset_doi" + match_value: "(?P<dataset_doi>.*)" + records: + Dataset: + dataset_doi: $dataset_doi + related_to_dois_element: + type: DictListElement + match_name: "related_to_dois" + subtree: + related_to_doi_element: + type: TextElement + match: "(?P<related_to_doi>.*)" # the named group must enclose .* so the DOI text is captured + records: + Dataset: + related_to_dois: +$related_to_doi + Keywords_element: + type: DictListElement + match_name: "Keyword" + Events_element: + type: DictListElement + match_name: "Event" + subtree: + Event_element: + type: Dict + records: + Event: + parents: + - Event + Dataset: + Event: +$Event + subtree: + label_element: + type: DictTextElement + match_name: "label" + match_value: "(?P<label>.*)" + records: + Event: + label: $label + comment_element: + type: DictTextElement + match_name: "comment" + match_value: "(?P<comment>.*)" + records: + Event: + comment: $comment + start_datetime_element: + type: DictTextElement + match_name: start_datetime + match_value: "(?P<start_datetime>.*)" + records: + Event: + start_datetime: $start_datetime + end_datetime_element: + type: DictTextElement + match_name: end_datetime + match_value: "(?P<end_datetime>.*)" + records: + Event: + end_datetime: $end_datetime + longitude_element: + type: DictFloatElement + match_name: "longitude" + match_value: "(?P<longitude>.*)" + records: + Event: + longitude: $longitude + latitude_element: + type: DictFloatElement + match_name: "latitude" + match_value: "(?P<latitude>.*)" + records: + Event: + latitude: $latitude + elevation_element: + type: DictFloatElement + match_name: "elevation" + match_value: "(?P<elevation>.*)" + records: + Event: + elevation: $elevation + location_element: + type: DictTextElement + match_name: location + match_value: "(?P<location>.*)" + records: + Event: + location: $location + igsn_element: + type: DictTextElement + match_name: igsn + match_value: "(?P<igsn>.*)" + records: + 
Event: + igsn: $igsn + events_in_data_element: + type: DictBooleanElement + match_name: "events_in_data" + match_value: "(?P<events_in_data>.*)" + records: + Dataset: + events_in_data: $events_in_data + geojson_element: + type: DictTextElement + match_name: "geojson" + match_value: "(?P<geojson>.*)" + records: + Dataset: + geojson: $geojson + project_element: + type: DictDictElement + match_name: "project" + records: + Project: + parents: + - Project + Dataset: + Project: $Project + subtree: + full_name_element: + type: DictTextElement + match_name: "full_name" + match_value: "(?P<full_name>.*)" + records: + Project: + full_name: $full_name + project_id_element: + type: DictTextElement + match_name: "project_id" + match_value: "(?P<project_id>.*)" + records: + Project: + project_id: $project_id + project_type_element: + type: DictTextElement + match_name: "project_type" + match_value: "(?P<project_type_name>.*)" + records: + project_type: + name: $project_type_name + Project: + project_type: $project_type + institute_element: + type: DictTextElement + match_name: "institute" + match_value: "(?P<institute>.*)" + records: + Project: + institute: $institute + start_date_element: + type: DictTextElement + match_name: "start_date" + match_value: "(?P<start_date>.*)" + records: + Project: + start_date: $start_date + end_date_element: + type: DictTextElement + match_name: "end_date" + match_value: "(?P<end_date>.*)" + records: + Project: + end_date: $end_date + url_element: + type: DictTextElement + match_name: "url" + match_value: "(?P<url>.*)" + records: + Project: + url: $url + coordinators_element: + type: DictListElement + match_name: "coordinators" + subtree: + coordinator_element: + type: Dict + records: + Person: + parents: + - Person + Project: + coordinators: +$Person + subtree: *person_subtree + campaign_element: + type: DictDictElement + match_name: "campaign" + records: + Campaign: + parents: + - Campaign + Dataset: + Campaign: $Campaign + subtree: + 
label_element: + type: DictTextElement + match_name: "label" + match_value: "(?P<label>.*)" + records: + Campaign: + label: $label + optional_label_element: + type: DictTextElement + match_name: "optional_label" + match_value: "(?P<optional_label>.*)" + records: + Campaign: + optional_label: $optional_label + start_date_element: + type: DictTextElement + match_name: "start_date" + match_value: "(?P<start_date>.*)" + records: + Campaign: + start_date: $start_date + end_date_element: + type: DictTextElement + match_name: "end_date" + match_value: "(?P<end_date>.*)" + records: + Campaign: + end_date: $end_date + responsible_scientists_element: + type: DictListElement + match_name: "responsible_scientists" + subtree: + responsible_scientist_element: + type: Dict + records: + Person: + parents: + - Person + Campaign: + responsible_scientists: +$Person + subtree: *person_subtree + Methods_element: + type: DictListElement + match_name: "Method" + subtree: + Method_element: + type: Dict + records: + Method: + parents: + - Method + Dataset: + Method: +$Method + subtree: + method_name_element: + type: DictTextElement + match_name: "method_name" + match_value: "(?P<method_name>.*)" + records: + Method: + name: $method_name + abbreviation_element: + type: DictTextElement + match_name: "abbreviation" + match_value: "(?P<abbreviation>.*)" + records: + Method: + abbreviation: $abbreviation + url_element: + type: DictTextElement + match_name: "url" + match_value: "(?P<url>.*)" + records: + Method: + url: $url + Taxa_element: + type: DictListElement + match_name: "Taxon" + subtree: + Taxon_element: + type: Dict + records: + Taxon: + parents: + - Taxon + Dataset: + Taxon: +$Taxon + subtree: + taxon_name_element: + type: DictTextElement + match_name: "taxon_name" + match_value: "(?P<taxon_name>.*)" + records: + Taxon: + name: $taxon_name + archived_element: + type: DictBooleanElement + match_name: "archived" + match_value: "(?P<archived>.*)" # named group needs the ?P prefix, otherwise $archived is never bound + records: + Dataset: + archived: $archived 
+ publication_date_element: + type: DictTextElement + match_name: "publication_date" + match_value: "(?P<publication_date>.*)" # named group needs the ?P prefix + records: + Dataset: + publication_date: $publication_date + max_files_element: + type: DictIntegerElement + match_name: "max_files" + match_value: "(?P<max_files>.*)" # named group needs the ?P prefix + records: + Dataset: + max_files: $max_files + auxiliary_file: &aux_file_template + type: File + match: "(?P<aux_file_name>(?!metadata.json).*)" + # TODO File, path and reference dataset in file record + child_dataset_dir: + type: Directory + match: (?P<child_dataset_dir_name>.*) + subtree: + metadata_json: *metadata_json_template + auxiliary_file: *aux_file_template + data_processing_dir: + type: Directory + match: 04_data_processing + subtree: *template + results_dir: + type: Directory + match: 05_results + subtree: *template diff --git a/integrationtests/load_and_insert_json_models.py b/integrationtests/load_and_insert_json_models.py new file mode 100644 index 0000000000000000000000000000000000000000..682fd9c77531e63ed18dd13417399ad0d18a8de2 --- /dev/null +++ b/integrationtests/load_and_insert_json_models.py @@ -0,0 +1,47 @@ +#!/usr/bin/env python3 +# encoding: utf-8 +# +# This file is a part of the CaosDB Project. +# +# Copyright (C) 2022 Indiscale GmbH <info@indiscale.com> +# Copyright (C) 2022 Henrik tom Wörden <h.tomwoerden@indiscale.com> +# Copyright (C) 2022 Florian Spreckelsen <f.spreckelsen@indiscale.com> +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. 
+# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <https://www.gnu.org/licenses/>. +# +import sys + +from caosadvancedtools.models.parser import parse_model_from_json_schema, parse_model_from_yaml + + +def main(): + # Step 1 of 3: parse the Dataspace data model from its JSON schema and sync it + dataspace_definitions = parse_model_from_json_schema( + "schema/dataspace.schema.json") + dataspace_definitions.sync_data_model(noquestion=True) + + # Step 2 of 3: parse the general Dataset data model from its JSON schema and sync it + dataset_definitions = parse_model_from_json_schema( + "schema/dataset.schema.json") + dataset_definitions.sync_data_model(noquestion=True) + + # Step 3 of 3: parse the inheritance definitions from YAML and sync them last + dataset_inherits = parse_model_from_yaml( + "schema/dataset-inheritance.yml") + dataset_inherits.sync_data_model(noquestion=True) + + +if __name__ == "__main__": + + sys.exit(main()) diff --git a/integrationtests/schema/dataset-inheritance.yml b/integrationtests/schema/dataset-inheritance.yml new file mode 100644 index 0000000000000000000000000000000000000000..3d12053a0007cdea1005e7673db69f46b35a063d --- /dev/null +++ b/integrationtests/schema/dataset-inheritance.yml @@ -0,0 +1,18 @@ +extern: +- Keyword +- Taxon +- full_name +- full_name_nonlatin +- name + +full_name: + inherit_from_obligatory: + - name + +full_name_nonlatin: + inherit_from_obligatory: + - name + +Taxon: + inherit_from_obligatory: + - Keyword diff --git a/integrationtests/schema/dataset.schema.json b/integrationtests/schema/dataset.schema.json new file mode 100644 index 0000000000000000000000000000000000000000..81b0df9c48182e596d4cc52f87140537d3746722 --- /dev/null +++ b/integrationtests/schema/dataset.schema.json @@ -0,0 +1,365 @@ +{ + "title": "Dataset", + "description": "", + "type": "object", + "properties": { + "title": { + "type": "string", + "description": "full dataset title" + }, + "authors": { + "type": "array", + "items": { + "type": "object", + "title": "Person", + "properties": { + "full_name": { + 
"type": "string", + "description": "Full name (latin transcription, all UFT-8 characters allowed)" + }, + "full_name_nonlatin": { + "type": "string", + "description": "Full name (non-latin alphabet)" + }, + "family_name": { + "type": "string", + "description": "Family name (latin transcription)" + }, + "given_name": { + "type": "string", + "description": "Given/other names (latin transcription)" + }, + "affiliation": { + "type": "string" + }, + "ORCID": { + "type": "string", + "description": "ORCID identifier as 16-digit number, e.g. 0000-0001-6233-1866", + "pattern": "^\\d{4}-\\d{4}-\\d{4}-\\d{4}$" + }, + "email": { + "type": "string", + "format": "email" + } + }, + "required": [ + "full_name", + "email" + ] + }, + "minItems": 1, + "uniqueItems": true + }, + "abstract": { + "type": "string", + "minLength": 80, + "maxLength": 1000, + "description": "Abstract with at least 80 characters" + }, + "comment": { + "type": "string" + }, + "license": { + "type": "string", + "enum": [ + "CC-BY", + "CC-BY-SA", + "CC0", + "restricted access" + ] + }, + "dataset_doi": { + "type": "string", + "pattern": "(10[.][0-9]{4,}(?:[.][0-9]+)*/(?:(?![%\"#? ])\\S)+)", + "description": "Dataset DOI, e.g. 10.1594/PANGAEA.938740" + }, + "related_to_dois": { + "type": "array", + "items": { + "type": "string", + "pattern": "(10[.][0-9]{4,}(?:[.][0-9]+)*/(?:(?![%\"#? ])\\S)+)" + }, + "description": "DOIs of related publications and/or datasets, e.g. 
10.1000/182" + }, + "Keyword": { + "type": "array", + "items": { + "type": "object", + "properties": { + "name": { + "type": "string" + } + } + } + }, + "Event": { + "type": "array", + "description": "https://wiki.pangaea.de/wiki/Event", + "items": { + "type": "object", + "properties": { + "label": { + "type": "string" + }, + "comment": { + "type": "string" + }, + "start_datetime": { + "type": "string", + "format": "date-time" + }, + "end_datetime": { + "type": "string", + "format": "date-time" + }, + "longitude": { + "type": "number", + "minimum": -180, + "maximum": 180, + "description": "longitude (W/E) in decimal degree (-180 to 180)" + }, + "latitude": { + "type": "number", + "minimum": -90, + "maximum": 90, + "description": "latitude (N/S) in decimal degree (-90 to 90)" + }, + "elevation": { + "type": "number", + "minimum": -10000, + "maximum": 20000, + "description": "elevation in m" + }, + "location": { + "type": "string", + "description": "geographical location as text (e.g., North Sea; Espoo, Finland)" + }, + "igsn": { + "type": "string", + "description": "International Geo Sample Number (http://www.geosamples.org/aboutigsn)" + } + }, + "required": [ + "longitude", + "latitude", + "start_date" + ] + } + }, + "events_in_data": { + "type": "boolean", + "description": "Does the data contain additional information about timepoints and locations?" 
+ }, + "geojson": { + "type": "string", + "pattern": "", + "description": "GeoJSON for complex geographic structures" + }, + "project": { + "title": "Project", + "description": "https://wiki.pangaea.de/wiki/Project", + "type": "object", + "properties": { + "name": { + "type": "string", + "description": "short name of project" + }, + "full_name": { + "type": "string", + "description": "Full name (latin transcription, all UTF-8 characters allowed)" + }, + "project_id": { + "type": "string", + "description": "Project ID" + }, + "project_type": { + "type": "string", + "enum": [ + "DFG", + "EU", + "BMBF", + "national", + "international" + ] + }, + "institute": { + "type": "string", + "description": "place of coordination or project office", + "default": "Centre for Research" + }, + "start_date": { + "type": "string", + "format": "date" + }, + "end_date": { + "type": "string", + "format": "date" + }, + "url": { + "type": "string", + "format": "uri" + }, + "coordinators": { + "type": "array", + "items": { + "type": "object", + "title": "Person", + "properties": { + "full_name": { + "type": "string", + "description": "Full name (latin transcription, all UTF-8 characters allowed)" + }, + "full_name_nonlatin": { + "type": "string", + "description": "Full name (non-latin alphabet)" + }, + "family_name": { + "type": "string", + "description": "Family name (latin transcription)" + }, + "given_name": { + "type": "string", + "description": "Given/other names (latin transcription)" + }, + "affiliation": { + "type": "string" + }, + "ORCID": { + "type": "string", + "description": "ORCID identifier as 16-digit number, e.g. 
0000-0001-6233-1866", + "pattern": "^\\d{4}-\\d{4}-\\d{4}-\\d{4}$" + }, + "email": { + "type": "string", + "format": "email" + } + }, + "required": [ + "full_name", + "email" + ] + }, + "minItems": 1, + "uniqueItems": true + } + }, + "required": ["name", "full_name"] + }, + "campaign": { + "title": "Campaign", + "description": "https://wiki.pangaea.de/wiki/Campaign, synonyms: cruise, expedition, leg, ", + "type": "object", + "properties": { + "label": { + "type": "string", + "description": "is unique and does not contain blanks; uses abbreviations instead of full names" + }, + "optional_label": { + "type": "string" + }, + "start_date": { + "type": "string", + "format": "date" + }, + "end_date": { + "type": "string", + "format": "date" + }, + "responsible_scientists": { + "type": "array", + "items": { + "type": "object", + "title": "Person", + "properties": { + "full_name": { + "type": "string", + "description": "Full name (latin transcription, all UTF-8 characters allowed)" + }, + "full_name_nonlatin": { + "type": "string", + "description": "Full name (non-latin alphabet)" + }, + "family_name": { + "type": "string", + "description": "Family name (latin transcription)" + }, + "given_name": { + "type": "string", + "description": "Given/other names (latin transcription)" + }, + "affiliation": { + "type": "string" + }, + "ORCID": { + "type": "string", + "description": "ORCID identifier as 16-digit number, e.g. 
0000-0001-6233-1866", + "pattern": "^\\d{4}-\\d{4}-\\d{4}-\\d{4}$" + }, + "email": { + "type": "string", + "format": "email" + } + }, + "required": [ + "full_name", + "email" + ] + }, + "minItems": 1, + "uniqueItems": true + } + } + }, + "Method": { + "type": "array", + "items": { + "type": "object", + "description": "https://wiki.pangaea.de/wiki/Method", + "properties": { + "method_name": { + "type": "string", + "description": "full official name of tool/instrument/device/gear" + }, + "abbreviation": { + "type": "string", + "description": "may be used for import in an event list to avoid misspellings" + }, + "url": { + "type": "string", + "description": "should contain a web address, where an official description of the device can be found" + } + } + } + }, + "Taxon": { + "type": "array", + "items": { + "type": "object", + "properties": { + "name": { + "type": "string" + } + } + } + }, + "archived": { + "type": "boolean", + "description": "Has the dataset been archived?", + "default": false + }, + "publication_date": { + "type": "string", + "format": "date" + }, + "max_files": { + "type": "integer", + "description": "Maximum number of files to be included by the CaosDB crawler", + "default": 100 + } + }, + "required": [ + "title", + "authors", + "abstract" + ] +} diff --git a/integrationtests/schema/dataspace.schema.json b/integrationtests/schema/dataspace.schema.json new file mode 100644 index 0000000000000000000000000000000000000000..01653bfa821e0a0acbb5a481bfd458e2ed784fb9 --- /dev/null +++ b/integrationtests/schema/dataspace.schema.json @@ -0,0 +1,45 @@ +{ + "title": "Dataspace", + "description": "A Dataspace is a folder in the DataCloud with a pre-defined structure", + "type": "object", + "properties": { + "dataspace_id": { + "type": "integer", + "description": "Integer ID of Dataspace (matches LDAP GID)", + "minimum": 20000 + }, + "archived": { "type": "boolean" }, + "url": { + "type": "string", + "description": "link to folder on file system (CaosDB or cloud 
folder)" + }, + "coordinator": { + "type": "object", + "title": "Person", + "properties": { + "full_name": { + "type": "string", + "description": "Full name (latin transcription, all UTF-8 characters allowed)" + }, + "full_name_nonlatin": { + "type": "string", + "description": "Full name (non-latin alphabet)" + }, + "family_name": { + "type": "string", + "description": "Family name (latin transcription)" + }, + "given_name": { + "type": "string", + "description": "Given/other names (latin transcription)" + }, + "email": { "type": "string", "format": "email" } + }, + "required": ["full_name", "email"] + }, + "start_date": { "type": "string", "format": "date" }, + "end_date": { "type": "string", "format": "date" }, + "comment": { "type": "string" } + }, + "required": ["dataspace_id", "url", "coordinator"] +} diff --git a/integrationtests/schema/zmt-organisation.yml b/integrationtests/schema/zmt-organisation.yml new file mode 100644 index 0000000000000000000000000000000000000000..7e251eeced7bf626e77364fc5555b1cb10dd3afb --- /dev/null +++ b/integrationtests/schema/zmt-organisation.yml @@ -0,0 +1,26 @@ +extern: +- name +- url +- Dataset + +german_name: + datatype: TEXT + inherit_from_obligatory: + - name + +Department: + recommended_properties: + url: + german_name: + + +WorkingGroup: + recommended_properties: + Department: + german_name: + url: + +Dataset: + recommended_properties: + WorkingGroup: + diff --git a/integrationtests/test_dataset_crawler.py b/integrationtests/test_dataset_crawler.py new file mode 100644 index 0000000000000000000000000000000000000000..3c8e486a2330baf3ccc1eabeec58c049299566b7 --- /dev/null +++ b/integrationtests/test_dataset_crawler.py @@ -0,0 +1,73 @@ +#!/usr/bin/env python3 +# encoding: utf-8 +# +# This file is a part of the CaosDB Project. 
#
# Copyright (C) 2022 Indiscale GmbH <info@indiscale.com>
# Copyright (C) 2022 Henrik tom Wörden <h.tomwoerden@indiscale.com>
# Copyright (C) 2022 Florian Spreckelsen <f.spreckelsen@indiscale.com>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
#

"""Integration test that crawls ``data`` with ``dataset_cfoods.yml``."""
import json
import os

import caosdb as db

from newcrawler.crawl import Crawler
from newcrawler.converters import JSONFileConverter, DictConverter
from newcrawler.identifiable_adapters import CaosDBIdentifiableAdapter
from newcrawler.structure_elements import File, JSONFile, Directory


def test_dataset():
    """Crawl the ``data`` directory with the dataset cfood and synchronize.

    Three identifiables are registered first: ``license`` and
    ``project_type`` (each declared with the property ``name``) and
    ``Person`` (declared with the property ``full_name``).  The crawler is
    then run in debug mode and ``synchronize`` is called.  The function makes
    no assertions on the crawl result.

    NOTE(review): all paths are relative, so this presumably has to be run
    from the directory that contains ``dataset_cfoods.yml`` -- confirm.
    """
    cfood_path = "./dataset_cfoods.yml"

    # (record type name, property used to declare the identifiable)
    identifiable_specs = (
        ("license", "name"),
        ("project_type", "name"),
        ("Person", "full_name"),
    )
    adapter = CaosDBIdentifiableAdapter()
    for rt_name, prop_name in identifiable_specs:
        adapter.register_identifiable(
            rt_name,
            db.RecordType().add_parent(rt_name).add_property(prop_name),
        )

    crawler = Crawler(debug=True, identifiableAdapter=adapter)
    definition = crawler.load_definition(cfood_path)
    # Load and register converter packages.
    registry = crawler.load_converters(definition)

    root_element = Directory('data', "data")
    records = crawler.start_crawling(root_element, definition, registry)

    # The debug attributes are read exactly as before, although the values
    # are not inspected any further.
    debug_tree = crawler.debug_tree
    debug_metadata = crawler.debug_metadata

    inserted, updated = crawler.synchronize()


if __name__ == "__main__":
    test_dataset()