Skip to content
Snippets Groups Projects
Select Git revision
  • 2cc517ee59f7d48af45b490b598c6e90e85e2ee3
  • main default protected
  • dev
  • f-spss-value-label-name
  • f-unmod
  • f-checkidentical
  • f-simple-breakpoint
  • f-new-debug-tree
  • f-existing-file-id
  • f-no-ident
  • f-collect-problems
  • f-refactor-debug-tree
  • v0.13.0
  • v0.12.0
  • v0.11.0
  • v0.10.1
  • v0.10.0
  • v0.9.1
  • v0.9.0
  • v0.8.0
  • v0.7.1
  • v0.7.0
  • v0.6.0
  • v0.5.0
  • v0.4.0
  • v0.3.0
  • v0.2.0
  • v0.1.0
28 results

test_dataset_crawler.py

Blame
  • Code owners
    Assign users and groups as approvers for specific file changes. Learn more.
    test_dataset_crawler.py 3.61 KiB
    #!/usr/bin/env python3
    # encoding: utf-8
    #
    # This file is a part of the CaosDB Project.
    #
    # Copyright (C) 2022 Indiscale GmbH <info@indiscale.com>
    # Copyright (C) 2022 Henrik tom Wörden <h.tomwoerden@indiscale.com>
    # Copyright (C) 2022 Florian Spreckelsen <f.spreckelsen@indiscale.com>
    #
    # This program is free software: you can redistribute it and/or modify
    # it under the terms of the GNU Affero General Public License as
    # published by the Free Software Foundation, either version 3 of the
    # License, or (at your option) any later version.
    #
    # This program is distributed in the hope that it will be useful,
    # but WITHOUT ANY WARRANTY; without even the implied warranty of
    # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    # GNU Affero General Public License for more details.
    #
    # You should have received a copy of the GNU Affero General Public License
    # along with this program. If not, see <https://www.gnu.org/licenses/>.
    #
    
    """
    module description
    """
import json
import os
import sys

import caosdb as db
import pytest
from caosadvancedtools.models.parser import parse_model_from_json_schema, parse_model_from_yaml
from caosadvancedtools.testutils import clear_database, set_test_key

from newcrawler.crawl import Crawler
from newcrawler.converters import JSONFileConverter, DictConverter
from newcrawler.identifiable_adapters import CaosDBIdentifiableAdapter
from newcrawler.structure_elements import File, JSONFile, Directory
    set_test_key("10b128cf8a1372f30aa3697466bb55e76974e0c16a599bb44ace88f19c8f61e2")
    
    
    def rfp(*pathcomponents):
        """
        Return full path.
        Shorthand convenience function.
        """
        return os.path.join(os.path.dirname(__file__), *pathcomponents)
    
    
    DATADIR = rfp("..", "test_data", "extroot", "realworld_example")
    
    
    @pytest.fixture
    def usemodel():
        # First load dataspace data model
        dataspace_definitions = parse_model_from_json_schema(
            os.path.join(DATADIR, "schema", "dataspace.schema.json"))
        dataspace_definitions.sync_data_model(noquestion=True)
    
        # Then general dataset definitions
        dataset_definitions = parse_model_from_json_schema(
            os.path.join(DATADIR, "schema", "dataset.schema.json"))
        dataset_definitions.sync_data_model(noquestion=True)
    
        # Finally, add inheritances as defined in yaml
        dataset_inherits = parse_model_from_yaml(
            os.path.join(DATADIR, "schema", "dataset-inheritance.yml"))
        dataset_inherits.sync_data_model(noquestion=True)
    
    
    def test_dataset(clear_database, usemodel):
        # json_file_path = rfp("test_directories", "single_file_test_data", "testjson.json")
    
        ident = CaosDBIdentifiableAdapter()
        ident.register_identifiable(
            "license", db.RecordType().add_parent("license").add_property("name"))
        ident.register_identifiable("project_type", db.RecordType(
        ).add_parent("project_type").add_property("name"))
        ident.register_identifiable("Person", db.RecordType(
        ).add_parent("Person").add_property("full_name"))
    
        crawler = Crawler(debug=True, identifiableAdapter=ident)
        crawler_definition = crawler.load_definition(os.path.join(DATADIR, "dataset_cfoods.yml"))
        # print(json.dumps(crawler_definition, indent=3))
        # Load and register converter packages:
        converter_registry = crawler.load_converters(crawler_definition)
        # print("DictIntegerElement" in converter_registry)
    
        records = crawler.start_crawling(
            Directory(os.path.join(DATADIR, 'data'), "data"),
            crawler_definition,
            converter_registry
        )
        subd = crawler.debug_tree
        subc = crawler.debug_metadata
        # print(json.dumps(subc, indent=3))
        # print(subd)
        # print(subc)
        # print(records)
        ins, ups = crawler.synchronize()