Skip to content
Snippets Groups Projects
Select Git revision
  • 2cc517ee59f7d48af45b490b598c6e90e85e2ee3
  • main default protected
  • dev
  • f-spss-value-label-name
  • f-unmod
  • f-checkidentical
  • f-simple-breakpoint
  • f-new-debug-tree
  • f-existing-file-id
  • f-no-ident
  • f-collect-problems
  • f-refactor-debug-tree
  • v0.13.0
  • v0.12.0
  • v0.11.0
  • v0.10.1
  • v0.10.0
  • v0.9.1
  • v0.9.0
  • v0.8.0
  • v0.7.1
  • v0.7.0
  • v0.6.0
  • v0.5.0
  • v0.4.0
  • v0.3.0
  • v0.2.0
  • v0.1.0
28 results

test_dataset_crawler.py

Blame
  • Code owners
    Assign users and groups as approvers for specific file changes. Learn more.
    test_dataset_crawler.py 3.61 KiB
    #!/usr/bin/env python3
    # encoding: utf-8
    #
    # This file is a part of the CaosDB Project.
    #
    # Copyright (C) 2022 Indiscale GmbH <info@indiscale.com>
    # Copyright (C) 2022 Henrik tom Wörden <h.tomwoerden@indiscale.com>
    # Copyright (C) 2022 Florian Spreckelsen <f.spreckelsen@indiscale.com>
    #
    # This program is free software: you can redistribute it and/or modify
    # it under the terms of the GNU Affero General Public License as
    # published by the Free Software Foundation, either version 3 of the
    # License, or (at your option) any later version.
    #
    # This program is distributed in the hope that it will be useful,
    # but WITHOUT ANY WARRANTY; without even the implied warranty of
    # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    # GNU Affero General Public License for more details.
    #
    # You should have received a copy of the GNU Affero General Public License
    # along with this program. If not, see <https://www.gnu.org/licenses/>.
    #
    
    """
    module description
    """
import json
import os
import sys

import caosdb as db
import pytest
from caosadvancedtools.models.parser import parse_model_from_json_schema, parse_model_from_yaml
from caosadvancedtools.testutils import clear_database, set_test_key

from newcrawler.crawl import Crawler
from newcrawler.converters import JSONFileConverter, DictConverter
from newcrawler.identifiable_adapters import CaosDBIdentifiableAdapter
from newcrawler.structure_elements import File, JSONFile, Directory
    set_test_key("10b128cf8a1372f30aa3697466bb55e76974e0c16a599bb44ace88f19c8f61e2")
    
    
    def rfp(*pathcomponents):
        """
        Return full path.
        Shorthand convenience function.
        """
        return os.path.join(os.path.dirname(__file__), *pathcomponents)
    
    
    DATADIR = rfp("..", "test_data", "extroot", "realworld_example")
    
    
    @pytest.fixture
    def usemodel():
        # First load dataspace data model
        dataspace_definitions = parse_model_from_json_schema(
            os.path.join(DATADIR, "schema", "dataspace.schema.json"))
        dataspace_definitions.sync_data_model(noquestion=True)
    
        # Then general dataset definitions
        dataset_definitions = parse_model_from_json_schema(
            os.path.join(DATADIR, "schema", "dataset.schema.json"))
        dataset_definitions.sync_data_model(noquestion=True)
    
        # Finally, add inheritances as defined in yaml
        dataset_inherits = parse_model_from_yaml(
            os.path.join(DATADIR, "schema", "dataset-inheritance.yml"))
        dataset_inherits.sync_data_model(noquestion=True)
    
    
    def test_dataset(clear_database, usemodel):
        # json_file_path = rfp("test_directories", "single_file_test_data", "testjson.json")
    
        ident = CaosDBIdentifiableAdapter()
        ident.register_identifiable(
            "license", db.RecordType().add_parent("license").add_property("name"))
        ident.register_identifiable("project_type", db.RecordType(
        ).add_parent("project_type").add_property("name"))
        ident.register_identifiable("Person", db.RecordType(
        ).add_parent("Person").add_property("full_name"))
    
        crawler = Crawler(debug=True, identifiableAdapter=ident)
        crawler_definition = crawler.load_definition(os.path.join(DATADIR, "dataset_cfoods.yml"))
        # print(json.dumps(crawler_definition, indent=3))
        # Load and register converter packages:
        converter_registry = crawler.load_converters(crawler_definition)
        # print("DictIntegerElement" in converter_registry)
    
        records = crawler.start_crawling(
            Directory(os.path.join(DATADIR, 'data'), "data"),
            crawler_definition,
            converter_registry
        )
        subd = crawler.debug_tree
        subc = crawler.debug_metadata
        # print(json.dumps(subc, indent=3))
        # print(subd)
        # print(subc)
        # print(records)
        ins, ups = crawler.synchronize()