diff --git a/CHANGELOG.md b/CHANGELOG.md index c498b9286e0977295066340a2a4172093ac10bfe..87e33c543e6126024504687e945bfb9bb41b4148 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,28 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## [0.5.0] - 2023-03-28 ## +(Florian Spreckelsen) + +### Changed ### + +- Refactoring of the crawl.py module: Now there is a separate scanner module handling the + collecting of information that is independent of CaosDB itself. +- The signature of the function ``save_debug_data`` was changed to explicitely + take the ``debug_tree`` as its first argument. This change was necessary, as + the ``debug_tree`` is no longer saved as member field of the Crawler class. + + +### Deprecated ### + +- The functions ``load_definition``, ``initialize_converters`` and + ``load_converters`` are deprecated. Please use the functions + ``load_definition``, ``initialize_converters`` and + ``create_converter_registry`` from the scanner module instead. +- The function ``start_crawling`` is deprecated. The function + ``scan_structure_elements`` in the scanner module mostly covers its + functionality. + ## [0.4.0] - 2023-03-22 ## (Florian Spreckelsen) @@ -48,7 +70,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Added ### - Identifiable class to represent the information used to identify Records. -- Added some StructureElements: BooleanElement, FloatElement, IntegerElement, +- Added some StructureElements: BooleanElement, FloatElement, IntegerElement, ListElement, DictElement - String representation for Identifiables - [#43](https://gitlab.com/caosdb/caosdb-crawler/-/issues/43) the crawler diff --git a/CITATION.cff b/CITATION.cff index 9c8bf551c41a6a3447b076914741b349a8c72b9c..834f57db4521c983947ed4b960b2877c914b5bb2 100644 --- a/CITATION.cff +++ b/CITATION.cff @@ -20,6 +20,6 @@ authors: given-names: Stefan orcid: https://orcid.org/0000-0001-7214-8125 title: CaosDB - Crawler -version: 0.4.0 +version: 0.5.0 doi: 10.3390/data4020083 -date-released: 2023-03-22 \ No newline at end of file +date-released: 2023-03-28 \ No newline at end of file diff --git a/integrationtests/basic_example/test_basic.py b/integrationtests/basic_example/test_basic.py index 0c847b08a729f3b112cbdf3c38bac31309cda125..b33974d9c2c5600bf2a91cbf14d7c8799ffc2644 100755 --- a/integrationtests/basic_example/test_basic.py +++ b/integrationtests/basic_example/test_basic.py @@ -27,6 +27,7 @@ an integration test module that does basic integration tests """ from caosadvancedtools.crawler import Crawler as OldCrawler +from caoscrawler.debug_tree import DebugTree import os from caosdb import EmptyUniqueQueryError import argparse @@ -36,6 +37,7 @@ from caoscrawler import Crawler, SecurityMode from caoscrawler.identifiable import Identifiable import caosdb as db from caoscrawler.identifiable_adapters import CaosDBIdentifiableAdapter +from caoscrawler.scanner import scan_directory import pytest from caosadvancedtools.models.parser import parse_model_from_yaml import yaml @@ -82,42 +84,46 @@ def ident(): return ident -def crawl_standard_test_directory(cr: Crawler, - subdir: str = "examples_article", - cfood: str = "scifolder_cfood.yml"): - cr.crawl_directory(rfp("..", "..", "unittests", "test_directories", subdir), - rfp("..", "..", "unittests", cfood)) +def crawl_standard_test_directory(subdir: 
str = "examples_article", + cfood: str = "scifolder_cfood.yml", + debug_tree=None): + return scan_directory(rfp("..", "..", "unittests", "test_directories", subdir), + rfp("..", "..", "unittests", cfood), + debug_tree=debug_tree) @pytest.fixture def crawler(ident): - cr = Crawler(debug=True, identifiableAdapter=ident) - crawl_standard_test_directory(cr) - return cr + cr = Crawler(identifiableAdapter=ident) + debug_tree = DebugTree() + crawled_data = crawl_standard_test_directory(debug_tree=debug_tree) + return cr, crawled_data, debug_tree @pytest.fixture def crawler_extended(ident): - cr = Crawler(debug=True, identifiableAdapter=ident) - crawl_standard_test_directory(cr, cfood="scifolder_extended.yml") + cr = Crawler(identifiableAdapter=ident) + debug_tree = DebugTree() + crawled_data = crawl_standard_test_directory( + cfood="scifolder_extended.yml", debug_tree=debug_tree) # correct paths for current working directory - file_list = [r for r in cr.crawled_data if r.role == "File"] + file_list = [r for r in crawled_data if r.role == "File"] for f in file_list: f.file = rfp("..", "..", "unittests", "test_directories", f.file) - return cr + return cr, crawled_data, debug_tree def test_ambigious_lookup(clear_database, usemodel, crawler, ident): - ins, ups = crawler.synchronize() + ins, ups = crawler[0].synchronize(crawled_data=crawler[1]) proj = db.execute_query("FIND Project WITH identifier='SpeedOfLight'", unique=True) with pytest.raises(RuntimeError, match=".*unambigiously.*"): - print(crawler.identifiableAdapter.retrieve_identified_record_for_identifiable( + print(crawler[0].identifiableAdapter.retrieve_identified_record_for_identifiable( Identifiable(properties={'project': proj.id}))) def test_single_insertion(clear_database, usemodel, crawler, ident): - ins, ups = crawler.synchronize() + ins, ups = crawler[0].synchronize(crawled_data=crawler[1]) # This test also generates the file records.xml used in some of the unittesets: res = db.execute_query("FIND Record") @@ -138,94 +144,93 @@ def test_single_insertion(clear_database, usemodel, crawler, ident): assert len(ups) == 0 # Do a second run on the same data, there should be no changes: - crawler = Crawler(debug=True, identifiableAdapter=ident) - crawler.crawl_directory(rfp("../../unittests/test_directories", "examples_article"), - rfp("../../unittests/scifolder_cfood.yml")) - ins, ups = crawler.synchronize() + crawler = Crawler(identifiableAdapter=ident) + crawled_data = scan_directory(rfp("../../unittests/test_directories", "examples_article"), + rfp("../../unittests/scifolder_cfood.yml")) + ins, ups = crawler.synchronize(crawled_data=crawled_data) assert len(ins) == 0 assert len(ups) == 0 def test_multiple_insertions(clear_database, usemodel, ident, crawler): - ins, ups = crawler.synchronize() + ins, ups = crawler[0].synchronize(crawled_data=crawler[1]) # Do a second run on the same data, there should be no changes: - cr = Crawler(debug=True, identifiableAdapter=ident) - crawl_standard_test_directory(cr) - ins, ups = cr.synchronize() + cr = Crawler(identifiableAdapter=ident) + crawled_data = crawl_standard_test_directory() + ins, ups = cr.synchronize(crawled_data=crawled_data) assert len(ins) == 0 assert len(ups) == 0 def test_insertion(clear_database, usemodel, ident, crawler): - ins, ups = crawler.synchronize() + ins, ups = crawler[0].synchronize(crawled_data=crawler[1]) # Do a second run on the same data, there should a new insert: - cr = Crawler(debug=True, identifiableAdapter=ident) - crawl_standard_test_directory(cr, 
"example_insert") - assert len(cr.crawled_data) == 3 - ins, ups = cr.synchronize() + cr = Crawler(identifiableAdapter=ident) + crawled_data = crawl_standard_test_directory("example_insert") + assert len(crawled_data) == 3 + ins, ups = cr.synchronize(crawled_data=crawled_data) assert len(ins) == 1 assert len(ups) == 0 # Do it again to check whether nothing is changed: - cr = Crawler(debug=True, identifiableAdapter=ident) - crawl_standard_test_directory(cr, "example_insert") - assert len(cr.crawled_data) == 3 - ins, ups = cr.synchronize() + cr = Crawler(identifiableAdapter=ident) + crawled_data = crawl_standard_test_directory("example_insert") + assert len(crawled_data) == 3 + ins, ups = cr.synchronize(crawled_data=crawled_data) assert len(ins) == 0 assert len(ups) == 0 def test_insert_auth(clear_database, usemodel, ident, crawler): - ins, ups = crawler.synchronize() + ins, ups = crawler[0].synchronize(crawled_data=crawler[1]) # Do a second run on the same data, there should a new insert: - cr = Crawler(debug=True, identifiableAdapter=ident, securityMode=SecurityMode.RETRIEVE) - crawl_standard_test_directory(cr, "example_insert") - assert len(cr.crawled_data) == 3 - ins, ups = cr.synchronize() + cr = Crawler(identifiableAdapter=ident, securityMode=SecurityMode.RETRIEVE) + crawled_data = crawl_standard_test_directory("example_insert") + assert len(crawled_data) == 3 + ins, ups = cr.synchronize(crawled_data=crawled_data) assert len(ins) == 1 assert not ins[0].is_valid() nins, nups = OldCrawler.update_authorized_changes(cr.run_id) assert nins == 1 # Do it again to check whether nothing is changed: - cr = Crawler(debug=True, identifiableAdapter=ident) - crawl_standard_test_directory(cr, "example_insert") - assert len(cr.crawled_data) == 3 - ins, ups = cr.synchronize() + cr = Crawler(identifiableAdapter=ident) + crawled_data = crawl_standard_test_directory("example_insert") + assert len(crawled_data) == 3 + ins, ups = cr.synchronize(crawled_data=crawled_data) assert len(ins) == 0 assert len(ups) == 0 def test_insertion_and_update(clear_database, usemodel, ident, crawler): - ins, ups = crawler.synchronize() + ins, ups = crawler[0].synchronize(crawled_data=crawler[1]) - cr = Crawler(debug=True, identifiableAdapter=ident) - crawl_standard_test_directory(cr, "example_insert") - ins, ups = cr.synchronize() + cr = Crawler(identifiableAdapter=ident) + crawled_data = crawl_standard_test_directory("example_insert") + ins, ups = cr.synchronize(crawled_data=crawled_data) - cr = Crawler(debug=True, identifiableAdapter=ident) - crawl_standard_test_directory(cr, "example_overwrite_1") - # print(cr.crawled_data) + cr = Crawler(identifiableAdapter=ident) + crawled_data = crawl_standard_test_directory("example_overwrite_1") # cr.save_debug_data(rfp("provenance.yml")) - assert len(cr.crawled_data) == 3 - ins, ups = cr.synchronize() + assert len(crawled_data) == 3 + ins, ups = cr.synchronize(crawled_data=crawled_data) assert len(ins) == 0 assert len(ups) == 1 def test_identifiable_update(clear_database, usemodel, ident, crawler): - ins, ups = crawler.synchronize() + ins, ups = crawler[0].synchronize(crawled_data=crawler[1]) # Do a second run on the same data with a change in one # of the identifiables: - cr = Crawler(debug=True, identifiableAdapter=ident) - crawl_standard_test_directory(cr) + cr = Crawler(identifiableAdapter=ident) + crawled_data = crawl_standard_test_directory() # Test the addition of a single property: - l = cr.crawled_data + l = crawled_data for record in l: if (record.parents[0].name == 
"Measurement" and record.get_property("date").value == "2020-01-03"): @@ -234,28 +239,28 @@ def test_identifiable_update(clear_database, usemodel, ident, crawler): name="email", value="testperson@testaccount.test") print("one change") break - ins, ups = cr.synchronize() + ins, ups = cr.synchronize(crawled_data=crawled_data) assert len(ins) == 0 assert len(ups) == 1 # Test the change within one property: - cr = Crawler(debug=True, identifiableAdapter=ident) - crawl_standard_test_directory(cr) - l = cr.crawled_data + cr = Crawler(identifiableAdapter=ident) + crawled_data = crawl_standard_test_directory() + l = crawled_data for record in l: if (record.parents[0].name == "Measurement" and record.get_property("date").value == "2020-01-03"): record.add_property(name="email", value="testperson@coolmail.test") print("one change") break - ins, ups = cr.synchronize() + ins, ups = cr.synchronize(crawled_data=crawled_data) assert len(ins) == 0 assert len(ups) == 1 # Changing the date should result in a new insertion: - cr = Crawler(debug=True, identifiableAdapter=ident) - crawl_standard_test_directory(cr) - l = cr.crawled_data + cr = Crawler(identifiableAdapter=ident) + crawled_data = crawl_standard_test_directory() + l = crawled_data for record in l: if (record.parents[0].name == "Measurement" and record.get_property("date").value == "2020-01-03"): @@ -263,30 +268,31 @@ def test_identifiable_update(clear_database, usemodel, ident, crawler): record.get_property("date").value = "2012-01-02" print("one change") break - ins, ups = cr.synchronize() + ins, ups = cr.synchronize(crawled_data=crawled_data) assert len(ins) == 1 assert len(ups) == 0 def test_file_insertion_dry(clear_database, usemodel, ident): - crawler_extended = Crawler(debug=True, identifiableAdapter=ident) - crawl_standard_test_directory( - crawler_extended, cfood="scifolder_extended.yml") - file_list = [r for r in crawler_extended.crawled_data if r.role == "File"] + crawler_extended = Crawler(identifiableAdapter=ident) + crawled_data = crawl_standard_test_directory( + cfood="scifolder_extended.yml") + file_list = [r for r in crawled_data if r.role == "File"] assert len(file_list) == 11 for f in file_list: assert f.path.endswith("README.md") assert f.path[1:] == f.file - ins, ups = crawler_extended.synchronize(commit_changes=False) + ins, ups = crawler_extended.synchronize(crawled_data=crawled_data, commit_changes=False) assert len(ups) == 0 file_list_ins = [r for r in ins if r.role == "File"] assert len(file_list_ins) == 11 def test_file_insertion(clear_database, usemodel, ident, crawler_extended): - ins, ups = crawler_extended.synchronize(commit_changes=True) + ins, ups = crawler_extended[0].synchronize( + crawled_data=crawler_extended[1], commit_changes=True) file_list_ins = [r for r in ins if r.role == "File"] assert len(file_list_ins) == 11 @@ -302,16 +308,17 @@ def test_file_insertion(clear_database, usemodel, ident, crawler_extended): def test_file_update(clear_database, usemodel, ident, crawler_extended): - ins1, ups1 = crawler_extended.synchronize(commit_changes=True) + ins1, ups1 = crawler_extended[0].synchronize( + crawled_data=crawler_extended[1], commit_changes=True) file_list_ins = [r for r in ins1 if r.role == "File"] - cr = Crawler(debug=True, identifiableAdapter=ident) - crawl_standard_test_directory(cr, cfood="scifolder_extended.yml") + cr = Crawler(identifiableAdapter=ident) + crawled_data = crawl_standard_test_directory(cfood="scifolder_extended.yml") - file_list = [r for r in cr.crawled_data if r.role == "File"] + 
file_list = [r for r in crawled_data if r.role == "File"] for f in file_list: f.file = rfp("..", "..", "unittests", "test_directories", f.file) - ins2, ups2 = cr.synchronize(commit_changes=True) + ins2, ups2 = cr.synchronize(crawled_data=crawled_data, commit_changes=True) assert len(ups1) == 0 assert len(ups2) == 0 @@ -320,13 +327,13 @@ def test_file_update(clear_database, usemodel, ident, crawler_extended): assert len(res) == 11 assert len(res[0].parents) == 0 - cr2 = Crawler(debug=True, identifiableAdapter=ident) - crawl_standard_test_directory(cr2, cfood="scifolder_extended2.yml") + cr2 = Crawler(identifiableAdapter=ident) + crawled_data = crawl_standard_test_directory(cfood="scifolder_extended2.yml") - file_list = [r for r in cr2.crawled_data if r.role == "File"] + file_list = [r for r in crawled_data if r.role == "File"] for f in file_list: f.file = rfp("..", "..", "unittests", "test_directories", f.file) - ins3, ups3 = cr2.synchronize(commit_changes=True) + ins3, ups3 = cr2.synchronize(crawled_data=crawled_data, commit_changes=True) assert len(ups3) == 11 res = db.execute_query("Find File") diff --git a/integrationtests/test_issues.py b/integrationtests/test_issues.py index 527b4c0cf67f483d5b61972a0104ff4fb673402d..08e254daf4052670fcec18760626c460604efe15 100644 --- a/integrationtests/test_issues.py +++ b/integrationtests/test_issues.py @@ -24,6 +24,8 @@ from caoscrawler.crawl import Crawler from caoscrawler.identifiable_adapters import CaosDBIdentifiableAdapter from caoscrawler.structure_elements import DictElement +from caoscrawler.scanner import create_converter_registry, scan_structure_elements + from caosdb.utils.register_tests import clear_database, set_test_key set_test_key("10b128cf8a1372f30aa3697466bb55e76974e0c16a599bb44ace88f19c8f61e2") @@ -86,8 +88,8 @@ def test_issue_23(clear_database): ident.register_identifiable("TestType", db.RecordType().add_parent( name="TestType").add_property(name="identifying_prop")) - crawler = Crawler(debug=True, identifiableAdapter=ident) - converter_registry = crawler.load_converters(crawler_definition) + crawler = Crawler(identifiableAdapter=ident) + converter_registry = create_converter_registry(crawler_definition) # the dictionary to be crawled... 
test_dict = { @@ -95,7 +97,8 @@ def test_issue_23(clear_database): "prop_b": "something_else" } - records = crawler.start_crawling( + crawler.generate_run_id() + records = scan_structure_elements( DictElement("TestDict", test_dict), crawler_definition, converter_registry) assert len(records) == 1 @@ -109,7 +112,7 @@ def test_issue_23(clear_database): assert rec_crawled.get_property("prop_a") is None # synchronize with database and update the record - ins, ups = crawler.synchronize() + ins, ups = crawler.synchronize(crawled_data=records) assert len(ins) == 0 assert len(ups) == 1 diff --git a/integrationtests/test_realworld_example.py b/integrationtests/test_realworld_example.py index cb5ed2c769945af033bc56a2d6af3bf1cec86de4..45873ddeb8b4f4a23fbcbc9225cbeea60b213cc4 100644 --- a/integrationtests/test_realworld_example.py +++ b/integrationtests/test_realworld_example.py @@ -38,6 +38,8 @@ import pytest from caosadvancedtools.models.parser import parse_model_from_json_schema, parse_model_from_yaml from caosadvancedtools.loadFiles import loadpath +from caoscrawler.scanner import load_definition, scan_structure_elements, create_converter_registry + import sys set_test_key("10b128cf8a1372f30aa3697466bb55e76974e0c16a599bb44ace88f19c8f61e2") @@ -103,12 +105,11 @@ def test_dataset(clear_database, usemodel, addfiles, caplog): identifiable_path = os.path.join(DATADIR, "identifiables.yml") crawler_definition_path = os.path.join(DATADIR, "dataset_cfoods.yml") crawler_main( - os.path.join(DATADIR, 'data'), - crawler_definition_path, - identifiable_path, - True, - os.path.join(DATADIR, "provenance.yml"), - False, + crawled_directory_path=os.path.join(DATADIR, 'data'), + cfood_file_name=crawler_definition_path, + identifiables_definition_file=identifiable_path, + provenance_file=os.path.join(DATADIR, "provenance.yml"), + dry_run=False, remove_prefix=DATADIR, # this test will fail without this prefix since the crawler would try to create new files add_prefix="/extroot/realworld_example" @@ -143,12 +144,11 @@ def test_event_update(clear_database, usemodel, addfiles): crawler_definition_path = os.path.join(DATADIR, "dataset_cfoods.yml") crawler_main( - os.path.join(DATADIR, 'data'), - crawler_definition_path, - identifiable_path, - True, - os.path.join(DATADIR, "provenance.yml"), - False, + crawled_directory_path=os.path.join(DATADIR, 'data'), + cfood_file_name=crawler_definition_path, + identifiables_definition_file=identifiable_path, + provenance_file=os.path.join(DATADIR, "provenance.yml"), + dry_run=False, remove_prefix=DATADIR, # this test will fail without this prefix since the crawler would try to create new files add_prefix="/extroot/realworld_example" @@ -169,10 +169,11 @@ def test_event_update(clear_database, usemodel, addfiles): ident.load_from_yaml_definition(identifiable_path) second_crawler = Crawler(identifiableAdapter=ident) - crawler_definition = second_crawler.load_definition( + second_crawler.generate_run_id() + crawler_definition = load_definition( crawler_definition_path) - converter_registry = second_crawler.load_converters(crawler_definition) - records = second_crawler.start_crawling( + converter_registry = create_converter_registry(crawler_definition) + records = scan_structure_elements( Directory("data", os.path.join(DATADIR, "data")), crawler_definition, converter_registry @@ -190,7 +191,7 @@ def test_event_update(clear_database, usemodel, addfiles): "latitude").value = 0.0 rec.get_property("Event").value[0].get_property( "location").value = "Origin" - second_crawler.synchronize() + 
second_crawler.synchronize(crawled_data=records) # Dataset is still the same Record, but with an updated event new_dataset_rec = db.Record(id=old_dataset_rec.id).retrieve() diff --git a/integrationtests/test_use_case_simple_presentation.py b/integrationtests/test_use_case_simple_presentation.py index 5fc0f6c7d85a0fce4490c72952e711fe241a0099..0f48677d4bf64158374a0eb0865eb2b85ea715db 100644 --- a/integrationtests/test_use_case_simple_presentation.py +++ b/integrationtests/test_use_case_simple_presentation.py @@ -57,22 +57,24 @@ def test_complete_crawler(clear_database): # test that a bad value for "remove_prefix" leads to runtime error with pytest.raises(RuntimeError) as re: - crawler_main(DATADIR, - os.path.join(DATADIR, "cfood.yml"), - os.path.join(DATADIR, "identifiables.yml"), - True, - os.path.join(DATADIR, "provenance.yml"), - False, - remove_prefix="sldkfjsldf") + crawler_main( + crawled_directory_path=os.path.join(DATADIR), + cfood_file_name=os.path.join(DATADIR, "cfood.yml"), + identifiables_definition_file=os.path.join(DATADIR, "identifiables.yml"), + provenance_file=os.path.join(DATADIR, "provenance.yml"), + dry_run=False, + remove_prefix="sldkfjsldf", + ) assert "path does not start with the prefix" in str(re.value) - crawler_main(DATADIR, - os.path.join(DATADIR, "cfood.yml"), - os.path.join(DATADIR, "identifiables.yml"), - True, - os.path.join(DATADIR, "provenance.yml"), - False, - remove_prefix=os.path.abspath(DATADIR)) + crawler_main( + crawled_directory_path=os.path.join(DATADIR), + cfood_file_name=os.path.join(DATADIR, "cfood.yml"), + identifiables_definition_file=os.path.join(DATADIR, "identifiables.yml"), + provenance_file=os.path.join(DATADIR, "provenance.yml"), + dry_run=False, + remove_prefix=os.path.abspath(DATADIR), + ) res = db.execute_query("FIND Record Experiment") assert len(res) == 1 diff --git a/setup.cfg b/setup.cfg index fbdd9d7119312e2831c77fe3e8b24bd16b5826b4..0323d979854656d33e29cd760113fcb259a77f6e 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,6 +1,6 @@ [metadata] name = caoscrawler -version = 0.4.0 +version = 0.5.0 author = Alexander Schlemmer author_email = alexander.schlemmer@ds.mpg.de description = A new crawler for caosdb @@ -21,7 +21,7 @@ python_requires = >=3.7 install_requires = importlib-resources caosdb >= 0.11.0 - caosadvancedtools >= 0.6.0 + caosadvancedtools >= 0.7.0 yaml-header-tools >= 0.2.1 pyyaml odfpy #make optional diff --git a/src/caoscrawler/__init__.py b/src/caoscrawler/__init__.py index 044d8f0bf53c4c80dab9b492919fa64ab321a60d..05bad0b54d9098c0b7f165d8295a0faa2966fa32 100644 --- a/src/caoscrawler/__init__.py +++ b/src/caoscrawler/__init__.py @@ -1,2 +1,4 @@ from .crawl import Crawler, SecurityMode -from .version import CfoodRequiredVersionError, version as __version__ +from .version import CfoodRequiredVersionError, get_caoscrawler_version + +__version__ = get_caoscrawler_version() diff --git a/src/caoscrawler/crawl.py b/src/caoscrawler/crawl.py index c77dcee1f29eac69732ce353e0271761eca2df13..bacc5356b7b14f43d44db25c461c717fa9c39bc9 100644 --- a/src/caoscrawler/crawl.py +++ b/src/caoscrawler/crawl.py @@ -1,11 +1,12 @@ #!/usr/bin/env python3 # encoding: utf-8 # -# ** header v3.0 # This file is a part of the CaosDB Project. 
# -# Copyright (C) 2021 Henrik tom Wörden -# 2021 Alexander Schlemmer +# Copyright (C) 2021 Henrik tom Wörden <h.tomwoerden@indiscale.com> +# 2021-2023 Research Group Biomedical Physics, +# Max-Planck-Institute for Dynamics and Self-Organization Göttingen +# Alexander Schlemmer <alexander.schlemmer@ds.mpg.de> # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU Affero General Public License as @@ -67,6 +68,13 @@ from .stores import GeneralStore, RecordStore from .structure_elements import StructureElement, Directory, NoneElement from .version import check_cfood_version +from .scanner import (scan_directory, + load_definition, + create_converter_registry, + initialize_converters, + scan_structure_elements) +from .debug_tree import DebugTree + logger = logging.getLogger(__name__) SPECIAL_PROPERTIES_STRICT = ("description", "name", "id", "path") @@ -176,26 +184,19 @@ class Crawler(object): def __init__(self, generalStore: Optional[GeneralStore] = None, - debug: bool = False, - identifiableAdapter: IdentifiableAdapter = None, - securityMode: SecurityMode = SecurityMode.UPDATE - ): + debug: Optional[bool] = None, + identifiableAdapter: Optional[IdentifiableAdapter] = None, + securityMode: SecurityMode = SecurityMode.UPDATE): """ Create a new crawler and initialize an empty RecordStore and GeneralStore. + Deprecated arguments: + - The debug argument does not have an effect anymore. + - generalStore: This argument does not have an effect anymore. It might be added to the scanning + functions in the scanner module in the future, if needed. + Parameters ---------- - recordStore : GeneralStore - An initial GeneralStore which might store e.g. environment variables. - debug : bool - Create a debugging information tree when set to True. - The debugging information tree is a variable stored in - self.debug_tree. It is a dictionary mapping directory entries - to a tuple of general stores and record stores which are valid for - the directory scope. - Furthermore, it is stored in a second tree named self.debug_copied whether the - objects in debug_tree had been copied from a higher level in the hierarchy - of the structureelements. identifiableAdapter : IdentifiableAdapter TODO describe securityMode : int @@ -203,278 +204,93 @@ class Crawler(object): Please use SecurityMode Enum """ + # Remove this once the property `crawled_data` is no longer needed for compatibility + # reasons + self._crawled_data = None + # The following caches store records, where we checked whether they exist on the remote # server. Since, it is important to know whether they exist or not, we store them into two # different caches. 
self.remote_existing_cache = IdentifiedCache() self.remote_missing_cache = IdentifiedCache() - self.recordStore = RecordStore() self.securityMode = securityMode - self.generalStore = generalStore - if generalStore is None: - self.generalStore = GeneralStore() - self.identifiableAdapter: IdentifiableAdapter = LocalStorageIdentifiableAdapter() if identifiableAdapter is not None: self.identifiableAdapter = identifiableAdapter - # If a directory is crawled this may hold the path to that directory - self.crawled_directory: Optional[str] = None - self.debug = debug - if self.debug: - # order in the tuple: - # 0: generalStore - # 1: recordStore - self.debug_tree: dict[str, tuple] = dict() - self.debug_metadata: dict[str, dict] = dict() - self.debug_metadata["copied"] = dict() - self.debug_metadata["provenance"] = defaultdict(lambda: dict()) - self.debug_metadata["usage"] = defaultdict(lambda: set()) - def load_definition(self, crawler_definition_path: str): - """ - Load a cfood from a crawler definition defined by - crawler definition path and validate it using cfood-schema.yml. - """ + if debug is not None: + warnings.warn(DeprecationWarning( + "The debug argument of the Crawler class is deprecated and has no effect.")) - # Load the cfood from a yaml file: - with open(crawler_definition_path, "r") as f: - crawler_definitions = list(yaml.safe_load_all(f)) - - crawler_definition = self._load_definition_from_yaml_dict( - crawler_definitions) - - return self._resolve_validator_paths(crawler_definition, crawler_definition_path) - - def _load_definition_from_yaml_dict(self, crawler_definitions: list[dict]): - """Load crawler definitions from a list of (yaml) dicts `crawler_definitions` which - contains either one or two documents. - - Doesn't resolve the validator paths in the cfood definition, so for - internal and testing use only. - - """ - if len(crawler_definitions) == 1: - # Simple case, just one document: - crawler_definition = crawler_definitions[0] - metadata = {} - elif len(crawler_definitions) == 2: - metadata = crawler_definitions[0]["metadata"] if "metadata" in crawler_definitions[0] else { - } - crawler_definition = crawler_definitions[1] - else: - raise RuntimeError( - "Crawler definition must not contain more than two documents.") - - check_cfood_version(metadata) - - # TODO: at this point this function can already load the cfood schema extensions - # from the crawler definition and add them to the yaml schema that will be - # tested in the next lines of code: - - # Load the cfood schema: - with open(str(files('caoscrawler').joinpath('cfood-schema.yml')), "r") as f: - schema = yaml.safe_load(f) - - # Add custom converters to converter enum in schema: - if "Converters" in crawler_definition: - for key in crawler_definition["Converters"]: - schema["cfood"]["$defs"]["converter"]["properties"]["type"]["enum"].append( - key) - if len(crawler_definitions) == 2: - if "Converters" in metadata: - for key in metadata["Converters"]: - schema["cfood"]["$defs"]["converter"]["properties"]["type"]["enum"].append( - key) - - # Validate the cfood schema: - validate(instance=crawler_definition, schema=schema["cfood"]) - - return crawler_definition - - def _resolve_validator_paths(self, definition: dict, definition_path: str): - """Resolve path to validation files with respect to the file in which - the crawler was defined. 
- - """ - - for key, value in definition.items(): - - if key == "validate" and isinstance(value, str): - # Validator is given by a path - if not value.startswith('/'): - # Not an absolute path - definition[key] = os.path.join(os.path.dirname(definition_path), value) - if not os.path.isfile(definition[key]): - # TODO(henrik) capture this in `crawler_main` similar to - # `ConverterValidationError`. - raise FileNotFoundError( - f"Couldn't find validation file {definition[key]}") - elif isinstance(value, dict): - # Recursively resolve all validators - definition[key] = self._resolve_validator_paths(value, definition_path) - - return definition + if generalStore is not None: + warnings.warn(DeprecationWarning( + "The generalStore argument of the Crawler class is deprecated and has no effect.")) def load_converters(self, definition: dict): - """ - Currently the converter registry is a dictionary containing for each converter: - - key is the short code, abbreviation for the converter class name - - module is the name of the module to be imported which must be installed - - class is the converter class to load and associate with this converter entry - - all other info for the converter needs to be included in the converter plugin - directory: - schema.yml file - README.md documentation - - TODO: this function does not make use of self, so it could become static. - """ - - # Defaults for the converter registry: - with open(str(files('caoscrawler').joinpath('default_converters.yml')), "r") as f: - converter_registry: dict[str, dict[str, str]] = yaml.safe_load(f) - - # More converters from definition file: - if "Converters" in definition: - for key, entry in definition["Converters"].items(): - if key in ["Dict", "DictTextElement", "DictIntegerElement", "DictBooleanElement", - "DictDictElement", "DictListElement", "DictFloatElement"]: - warnings.warn(DeprecationWarning(f"{key} is deprecated. Please use the new" - " variant; without 'Dict' prefix or " - "'DictElement' in case of 'Dict'")) - - converter_registry[key] = { - "converter": entry["converter"], - "package": entry["package"] - } - - # Load modules and associate classes: - for key, value in converter_registry.items(): - module = importlib.import_module(value["package"]) - value["class"] = getattr(module, value["converter"]) - return converter_registry - - def crawl_directory(self, dirname: str, crawler_definition_path: str, - restricted_path: Optional[list[str]] = None): - """ Crawl a single directory. + warnings.warn(DeprecationWarning( + "The function load_converters in the crawl module is deprecated. " + "Please use create_converter_registry from the scanner module.")) + return create_converter_registry(definition) - Convenience function that starts the crawler (calls start_crawling) - with a single directory as the StructureElement. - - restricted_path: optional, list of strings - Traverse the data tree only along the given path. When the end of the given path - is reached, traverse the full tree as normal. 
- """ - - crawler_definition = self.load_definition(crawler_definition_path) - # Load and register converter packages: - converter_registry = self.load_converters(crawler_definition) - - if not dirname: - raise ValueError( - "You have to provide a non-empty path for crawling.") - dir_structure_name = os.path.basename(dirname) - self.crawled_directory = dirname - if not dir_structure_name and dirname.endswith('/'): - if dirname == '/': - # Crawling the entire file system - dir_structure_name = "root" - else: - # dirname had a trailing '/' - dir_structure_name = os.path.basename(dirname[:-1]) - - self.start_crawling(Directory(dir_structure_name, - dirname), - crawler_definition, - converter_registry, - restricted_path=restricted_path - ) - - @staticmethod - def initialize_converters(crawler_definition: dict, converter_registry: dict): - """ - takes the cfood as dict (`crawler_definition`) and creates the converter objects that - are defined on the highest level. Child Converters will in turn be created during the - initialization of the Converters. - """ - converters = [] - - for key, value in crawler_definition.items(): - # Definitions and Converters are reserved keywords - # on the top level of the yaml file. - # TODO: there should also be a top level keyword for the actual - # CFood to avoid confusion between top level keywords - # and the CFood. - if key == "Definitions": - continue - elif key == "Converters": - continue - converters.append(Converter.converter_factory( - value, key, converter_registry)) - - return converters + def load_definition(self, crawler_definition_path: str): + warnings.warn(DeprecationWarning( + "The function load_definition in the crawl module is deprecated. " + "Please use load_definition from the scanner module.")) + return load_definition(crawler_definition_path) + + def initialize_converters(self, crawler_definition: dict, converter_registry: dict): + warnings.warn(DeprecationWarning( + "The function initialize_converters in the crawl module is deprecated. " + "Please use initialize_converters from the scanner module.")) + return initialize_converters(crawler_definition, converter_registry) + + def generate_run_id(self): + self.run_id = uuid.uuid1() def start_crawling(self, items: Union[list[StructureElement], StructureElement], crawler_definition: dict, converter_registry: dict, restricted_path: Optional[list[str]] = None): - """ - Start point of the crawler recursion. - Parameters - ---------- - items: list - A list of structure elements (or a single StructureElement) that is used for - generating the initial items for the crawler. This could e.g. be a Directory. - crawler_definition : dict - A dictionary representing the crawler definition, possibly from a yaml - file. - restricted_path: optional, list of strings - Traverse the data tree only along the given path. When the end of the given path - is reached, traverse the full tree as normal. - - Returns - ------- - crawled_data : list - the final list with the target state of Records. - """ - - # This function builds the tree of converters out of the crawler definition. 
- - if self.generalStore is None: - raise RuntimeError("Should not happen.") - - if not isinstance(items, list): - items = [items] - - self.run_id = uuid.uuid1() - local_converters = Crawler.initialize_converters(crawler_definition, converter_registry) - - # This recursive crawling procedure generates the update list: - self.crawled_data: list[db.Record] = [] - self._crawl( - items=items, - local_converters=local_converters, - generalStore=self.generalStore, - recordStore=self.recordStore, - structure_elements_path=[], - converters_path=[], - restricted_path=restricted_path) - if self.debug: - self.debug_converters = local_converters - - return self.crawled_data - - def synchronize(self, commit_changes: bool = True, unique_names=True): + warnings.warn(DeprecationWarning( + "The function start_crawling in the crawl module is deprecated. " + "Please use scan_structure_elements from the scanner module.")) + + data = scan_structure_elements( + items, crawler_definition, converter_registry, restricted_path) + self.crawled_data = data + return data + + @property + def crawled_data(self): + warnings.warn(DeprecationWarning( + "The use of self.crawled_data is depricated. You should not access this variable. " + "Instead, create the data with the scanner and then pass it as argument to Crawler " + "functions")) + return self._crawled_data + + @crawled_data.setter + def crawled_data(self, arg): + self._crawled_data = arg + + def crawl_directory(self, + crawled_directory: str, + crawler_definition_path: str, + restricted_path: Optional[list[str]] = None): """ - Carry out the actual synchronization. + The new main function to run the crawler on a directory. """ - # After the crawling, the actual synchronization with the database, based on the - # update list is carried out: + warnings.warn(DeprecationWarning( + "The function crawl_directory in the crawl module is deprecated. 
" + "Please use scan_directory from the scanner module.")) - return self._synchronize(self.crawled_data, commit_changes, unique_names=unique_names) + data = scan_directory(crawled_directory, + crawler_definition_path, + restricted_path) + self.crawled_data = data + return data def _has_reference_value_without_id(self, ident: Identifiable) -> bool: """ @@ -712,7 +528,7 @@ class Crawler(object): # TODO: can the following be removed at some point for ent in flat: if ent.role == "Record" and len(ent.parents) == 0: - raise RuntimeError("Records must have a parent.") + raise RuntimeError(f"Records must have a parent.\n{ent}") resolved_references = True # flat contains Entities which could not yet be checked against the remote server @@ -947,7 +763,8 @@ class Crawler(object): return db.Entity(id=id).retrieve() @staticmethod - def execute_inserts_in_list(to_be_inserted, securityMode, run_id: uuid.UUID = None, + def execute_inserts_in_list(to_be_inserted, securityMode, + run_id: Optional[uuid.UUID] = None, unique_names=True): for record in to_be_inserted: for prop in record.properties: @@ -975,7 +792,8 @@ class Crawler(object): _resolve_datatype(prop, entity) @staticmethod - def execute_updates_in_list(to_be_updated, securityMode, run_id: uuid.UUID = None, + def execute_updates_in_list(to_be_updated, securityMode, + run_id: Optional[uuid.UUID] = None, unique_names=True): Crawler.set_ids_and_datatype_of_parents_and_properties(to_be_updated) logger.debug("UPDATE") @@ -987,8 +805,11 @@ class Crawler(object): update_cache = UpdateCache() update_cache.insert(to_be_updated, run_id) - def _synchronize(self, crawled_data: list[db.Record], commit_changes: bool = True, - unique_names=True): + def synchronize(self, + commit_changes: bool = True, + unique_names: bool = True, + crawled_data: Optional[list[db.Record]] = None, + ): """ This function applies several stages: 1) Retrieve identifiables for all records in crawled_data. @@ -1003,6 +824,13 @@ class Crawler(object): Return the final to_be_inserted and to_be_updated as tuple. """ + if crawled_data is None: + warnings.warn(DeprecationWarning( + "Calling synchronize without the data to be synchronized is depricated. 
Please " + "use for example the Scanner to create this data.")) + crawled_data = self.crawled_data + + self.generate_run_id() to_be_inserted, to_be_updated = self.split_into_inserts_and_updates(crawled_data) referencing_entities = self.create_reference_mapping(to_be_updated + to_be_inserted) @@ -1023,7 +851,7 @@ class Crawler(object): to_be_updated = self.remove_unnecessary_updates(to_be_updated, identified_records) logger.info(f"Going to insert {len(to_be_inserted)} Entities and update " - f"{len(to_be_inserted)} Entities.") + f"{len(to_be_updated)} Entities.") if commit_changes: self.execute_parent_updates_in_list(to_be_updated, securityMode=self.securityMode, run_id=self.run_id, unique_names=unique_names) @@ -1041,12 +869,14 @@ class Crawler(object): pending_inserts = update_cache.get_inserts(self.run_id) if pending_inserts: Crawler.inform_about_pending_changes( - pending_inserts, self.run_id, self.crawled_directory) + # TODO crawled_directory is no longer available + pending_inserts, self.run_id, "missing crawled_directory") pending_updates = update_cache.get_updates(self.run_id) if pending_updates: Crawler.inform_about_pending_changes( - pending_updates, self.run_id, self.crawled_directory) + # TODO crawled_directory is no longer available + pending_updates, self.run_id, "missing crawled_directory") return (to_be_inserted, to_be_updated) @@ -1110,11 +940,15 @@ ____________________\n""".format(i + 1, len(pending_changes)) + str(el[3])) res[converter.name]["subtree"][k[0]] = d[k[0]] return res - def save_debug_data(self, filename: str): + def save_debug_data(self, filename: str, debug_tree: DebugTree = None): + """ + Save the information contained in a debug_tree to a file named filename. + """ + paths: dict[str, Union[dict, list]] = dict() def flatten_debug_info(key): - mod_info = self.debug_metadata[key] + mod_info = debug_tree.debug_metadata[key] paths[key] = dict() for record_name in mod_info: if key == "provenance": @@ -1130,125 +964,19 @@ ____________________\n""".format(i + 1, len(pending_changes)) + str(el[3])) for key in ("provenance", "usage"): flatten_debug_info(key) - paths["converters_usage"] = [self.debug_build_usage_tree( - cv) for cv in self.debug_converters] + # TODO: clarify what this was used for + # paths["converters_usage"] = [self.debug_build_usage_tree( + # cv) for cv in self.debug_converters] with open(filename, "w") as f: f.write(yaml.dump(paths, sort_keys=False)) - def _crawl(self, - items: list[StructureElement], - local_converters: list[Converter], - generalStore: GeneralStore, - recordStore: RecordStore, - structure_elements_path: list[str], - converters_path: list[str], - restricted_path: Optional[list[str]] = None): - """ - Crawl a list of StructureElements and apply any matching converters. - - items: structure_elements (e.g. files and folders on one level on the hierarchy) - local_converters: locally defined converters for - treating structure elements. A locally defined converter could be - one that is only valid for a specific subtree of the originally - cralwed StructureElement structure. - generalStore and recordStore: This recursion of the crawl function should only operate on - copies of the global stores of the Crawler object. - restricted_path: optional, list of strings, traverse the data tree only along the given - path. For example, when a directory contains files a, b and c and b is - given in restricted_path, a and c will be ignroed by the crawler. - When the end of the given path is reached, traverse the full tree as - normal. 
The first element of the list provided by restricted_path should - be the name of the StructureElement at this level, i.e. denoting the - respective element in the items argument. - """ - # This path_found variable stores wether the path given by restricted_path was found in the - # data tree - path_found = False - if restricted_path is not None and len(restricted_path) == 0: - restricted_path = None - - for element in items: - for converter in local_converters: - - # type is something like "matches files", replace isinstance with "type_matches" - # match function tests regexp for example - if (converter.typecheck(element) and ( - restricted_path is None or element.name == restricted_path[0]) - and converter.match(element) is not None): - path_found = True - generalStore_copy = generalStore.create_scoped_copy() - recordStore_copy = recordStore.create_scoped_copy() - - # Create an entry for this matched structure element that contains the path: - generalStore_copy[converter.name] = ( - os.path.join(*(structure_elements_path + [element.get_name()]))) - - # extracts values from structure element and stores them in the - # variable store - converter.create_values(generalStore_copy, element) - - keys_modified = converter.create_records( - generalStore_copy, recordStore_copy, element) - - children = converter.create_children(generalStore_copy, element) - - if self.debug: - # add provenance information for each variable - self.debug_tree[str(element)] = ( - generalStore_copy.get_storage(), recordStore_copy.get_storage()) - self.debug_metadata["copied"][str(element)] = ( - generalStore_copy.get_dict_copied(), - recordStore_copy.get_dict_copied()) - self.debug_metadata["usage"][str(element)].add( - "/".join(converters_path + [converter.name])) - mod_info = self.debug_metadata["provenance"] - for record_name, prop_name in keys_modified: - # TODO: check - internal_id = recordStore_copy.get_internal_id( - record_name) - record_identifier = record_name + \ - "_" + str(internal_id) - converter.metadata["usage"].add(record_identifier) - mod_info[record_identifier][prop_name] = ( - structure_elements_path + [element.get_name()], - converters_path + [converter.name]) - - self._crawl(children, converter.converters, - generalStore_copy, recordStore_copy, - structure_elements_path + [element.get_name()], - converters_path + [converter.name], - restricted_path[1:] if restricted_path is not None else None) - - if restricted_path and not path_found: - raise RuntimeError("A 'restricted_path' argument was given that is not contained in " - "the data tree") - # if the crawler is running out of scope, copy all records in - # the recordStore, that were created in this scope - # to the general update container. - scoped_records = recordStore.get_records_current_scope() - for record in scoped_records: - self.crawled_data.append(record) - - # TODO: the scoped variables should be cleaned up as soon if the variables - # are no longer in the current scope. This can be implemented as follows, - # but this breaks the test "test_record_structure_generation", because - # some debug info is also deleted. This implementation can be used as soon - # as the remaining problems with the debug_tree are fixed. 
- # Delete the variables that are no longer needed: - # scoped_names = recordStore.get_names_current_scope() - # for name in scoped_names: - # del recordStore[name] - # del generalStore[name] - - return self.crawled_data - def crawler_main(crawled_directory_path: str, cfood_file_name: str, - identifiables_definition_file: str = None, + identifiables_definition_file: Optional[str] = None, debug: bool = False, - provenance_file: str = None, + provenance_file: Optional[str] = None, dry_run: bool = False, prefix: str = "", securityMode: SecurityMode = SecurityMode.UPDATE, @@ -1292,14 +1020,17 @@ def crawler_main(crawled_directory_path: str, return_value : int 0 if successful """ - crawler = Crawler(debug=debug, securityMode=securityMode) + crawler = Crawler(securityMode=securityMode) try: - crawler.crawl_directory(crawled_directory_path, cfood_file_name, restricted_path) + + debug_tree = DebugTree() + crawled_data = scan_directory( + crawled_directory_path, cfood_file_name, restricted_path, debug_tree=debug_tree) except ConverterValidationError as err: logger.error(err) return 1 if provenance_file is not None and debug: - crawler.save_debug_data(provenance_file) + crawler.save_debug_data(debug_tree, provenance_file) if identifiables_definition_file is not None: ident = CaosDBIdentifiableAdapter() @@ -1316,7 +1047,7 @@ def crawler_main(crawled_directory_path: str, remove_prefix = prefix if dry_run: - ins, upd = crawler.synchronize(commit_changes=False) + ins, upd = crawler.synchronize(commit_changes=False, crawled_data=crawled_data) inserts = [str(i) for i in ins] updates = [str(i) for i in upd] with open("dry.yml", "w") as f: @@ -1325,7 +1056,7 @@ def crawler_main(crawled_directory_path: str, "update": updates})) else: rtsfinder = dict() - for elem in crawler.crawled_data: + for elem in crawled_data: if isinstance(elem, db.File): # correct the file path: # elem.file = os.path.join(args.path, elem.file) @@ -1362,7 +1093,8 @@ def crawler_main(crawled_directory_path: str, raise RuntimeError("Missing RecordTypes: {}". format(", ".join(notfound))) - crawler.synchronize(commit_changes=True, unique_names=unique_names) + crawler.synchronize(commit_changes=True, unique_names=unique_names, + crawled_data=crawled_data) return 0 diff --git a/src/caoscrawler/debug_tree.py b/src/caoscrawler/debug_tree.py new file mode 100644 index 0000000000000000000000000000000000000000..79701773a5cece1747878c45bff2e394ec0f7f6b --- /dev/null +++ b/src/caoscrawler/debug_tree.py @@ -0,0 +1,78 @@ +#!/usr/bin/env python3 +# encoding: utf-8 +# +# ** header v3.0 +# This file is a part of the CaosDB Project. +# +# Copyright (C) 2023 Alexander Schlemmer <alexander.schlemmer@ds.mpg.de> +# +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <https://www.gnu.org/licenses/>. +# +# ** end header +# + +""" +A structure containing debug tree information. 
+""" + +from __future__ import annotations + +import argparse +import importlib +import logging +import os +import sys +import warnings +import yaml + +from argparse import RawTextHelpFormatter +from collections import defaultdict +from copy import deepcopy +from enum import Enum +from importlib_resources import files +from jsonschema import validate +from typing import Any, Optional, Type, Union + +import caosdb as db + +from caosadvancedtools.cache import UpdateCache, Cache +from caosadvancedtools.crawler import Crawler as OldCrawler +from caosdb.apiutils import (compare_entities, EntityMergeConflictError, + merge_entities) +from caosdb.common.datatype import is_reference + +from .converters import Converter, DirectoryConverter, ConverterValidationError + +from .macros import defmacro_constructor, macro_constructor +from .stores import Store, GeneralStore, RecordStore +from .structure_elements import StructureElement, Directory, NoneElement +from .version import check_cfood_version + +from caosdb.high_level_api import convert_to_python_object + + +class DebugTree(object): + + def __init__(self): + # order in the tuple: + # 0: general_store + # 1: record_store + self.debug_tree: dict[str, tuple] = dict() + self.debug_metadata: dict[str, dict] = dict() + self.debug_metadata["copied"] = dict() + self.debug_metadata["provenance"] = defaultdict(lambda: dict()) + self.debug_metadata["usage"] = defaultdict(lambda: set()) + + # TODO: turn the tuple into two individual elements diff --git a/src/caoscrawler/scanner.py b/src/caoscrawler/scanner.py new file mode 100644 index 0000000000000000000000000000000000000000..ff6156aed3bde639435219a705d6d7d2124f7f38 --- /dev/null +++ b/src/caoscrawler/scanner.py @@ -0,0 +1,444 @@ +#!/usr/bin/env python3 +# encoding: utf-8 +# +# This file is a part of the CaosDB Project. +# +# Copyright (C) 2023 Research Group Biomedical Physics, +# Max-Planck-Institute for Dynamics and Self-Organization Göttingen +# Alexander Schlemmer <alexander.schlemmer@ds.mpg.de> +# +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <https://www.gnu.org/licenses/>. +# +# ** end header +# + +""" +This is the scanner, the original "_crawl" function from crawl.py. +This is just the functionality, that extracts data from the file system. 
+""" + +from __future__ import annotations + +import argparse +import importlib +import logging +import os +import sys +import warnings +import yaml + +from argparse import RawTextHelpFormatter +from collections import defaultdict +from copy import deepcopy +from enum import Enum +from importlib_resources import files +from jsonschema import validate +from typing import Any, Optional, Type, Union + +import caosdb as db + +from caosadvancedtools.cache import UpdateCache, Cache +from caosadvancedtools.crawler import Crawler as OldCrawler +from caosdb.apiutils import (compare_entities, EntityMergeConflictError, + merge_entities) +from caosdb.common.datatype import is_reference + +from .converters import Converter, DirectoryConverter, ConverterValidationError + +from .macros import defmacro_constructor, macro_constructor +from .stores import Store, GeneralStore, RecordStore +from .structure_elements import StructureElement, Directory, NoneElement +from .version import check_cfood_version + +from caosdb.high_level_api import convert_to_python_object + +from .debug_tree import DebugTree + +logger = logging.getLogger(__name__) + + +def load_definition(crawler_definition_path: str): + """ + Load a cfood from a crawler definition defined by + crawler definition path and validate it using cfood-schema.yml. + """ + + # Load the cfood from a yaml file: + with open(crawler_definition_path, "r") as f: + crawler_definitions = list(yaml.safe_load_all(f)) + + crawler_definition = _load_definition_from_yaml_dict( + crawler_definitions) + + return _resolve_validator_paths(crawler_definition, crawler_definition_path) + + +def _load_definition_from_yaml_dict(crawler_definitions: list[dict]): + """Load crawler definitions from a list of (yaml) dicts `crawler_definitions` which + contains either one or two documents. + + Doesn't resolve the validator paths in the cfood definition, so for + internal and testing use only. + + """ + if len(crawler_definitions) == 1: + # Simple case, just one document: + crawler_definition = crawler_definitions[0] + metadata = {} + elif len(crawler_definitions) == 2: + metadata = crawler_definitions[0]["metadata"] if "metadata" in crawler_definitions[0] else { + } + crawler_definition = crawler_definitions[1] + else: + raise RuntimeError( + "Crawler definition must not contain more than two documents.") + + check_cfood_version(metadata) + + # TODO: at this point this function can already load the cfood schema extensions + # from the crawler definition and add them to the yaml schema that will be + # tested in the next lines of code: + + # Load the cfood schema: + with open(str(files('caoscrawler').joinpath('cfood-schema.yml')), "r") as f: + schema = yaml.safe_load(f) + + # Add custom converters to converter enum in schema: + if "Converters" in crawler_definition: + for key in crawler_definition["Converters"]: + schema["cfood"]["$defs"]["converter"]["properties"]["type"]["enum"].append( + key) + if len(crawler_definitions) == 2: + if "Converters" in metadata: + for key in metadata["Converters"]: + schema["cfood"]["$defs"]["converter"]["properties"]["type"]["enum"].append( + key) + + # Validate the cfood schema: + validate(instance=crawler_definition, schema=schema["cfood"]) + + return crawler_definition + + +def _resolve_validator_paths(definition: dict, definition_path: str): + """Resolve path to validation files with respect to the file in which + the crawler was defined. 
+ + """ + + for key, value in definition.items(): + + if key == "validate" and isinstance(value, str): + # Validator is given by a path + if not value.startswith('/'): + # Not an absolute path + definition[key] = os.path.join(os.path.dirname(definition_path), value) + if not os.path.isfile(definition[key]): + # TODO(henrik) capture this in `crawler_main` similar to + # `ConverterValidationError`. + raise FileNotFoundError( + f"Couldn't find validation file {definition[key]}") + elif isinstance(value, dict): + # Recursively resolve all validators + definition[key] = _resolve_validator_paths(value, definition_path) + + return definition + + +def create_converter_registry(definition: dict): + """ + Currently the converter registry is a dictionary containing for each converter: + - key is the short code, abbreviation for the converter class name + - module is the name of the module to be imported which must be installed + - class is the converter class to load and associate with this converter entry + + Formerly known as "load_converters". + + all other info for the converter needs to be included in the converter plugin + directory: + schema.yml file + README.md documentation + """ + + # Defaults for the converter registry: + with open(str(files('caoscrawler').joinpath('default_converters.yml')), "r") as f: + converter_registry: dict[str, dict[str, str]] = yaml.safe_load(f) + + # More converters from definition file: + if "Converters" in definition: + for key, entry in definition["Converters"].items(): + if key in ["Dict", "DictTextElement", "DictIntegerElement", "DictBooleanElement", + "DictDictElement", "DictListElement", "DictFloatElement"]: + warnings.warn(DeprecationWarning(f"{key} is deprecated. Please use the new" + " variant; without 'Dict' prefix or " + "'DictElement' in case of 'Dict'")) + + converter_registry[key] = { + "converter": entry["converter"], + "package": entry["package"] + } + + # Load modules and associate classes: + for key, value in converter_registry.items(): + module = importlib.import_module(value["package"]) + value["class"] = getattr(module, value["converter"]) + return converter_registry + + +def initialize_converters(crawler_definition: dict, converter_registry: dict): + """ + takes the cfood as dict (`crawler_definition`) and creates the converter objects that + are defined on the highest level. Child Converters will in turn be created during the + initialization of the Converters. + """ + converters = [] + + for key, value in crawler_definition.items(): + # Definitions and Converters are reserved keywords + # on the top level of the yaml file. + # TODO: there should also be a top level keyword for the actual + # CFood to avoid confusion between top level keywords + # and the CFood. 
+        if key == "Definitions":
+            continue
+        elif key == "Converters":
+            continue
+        converters.append(Converter.converter_factory(
+            value, key, converter_registry))
+
+    return converters
+
+# --------------------------------------------------------------------------------
+# Main scanner function:
+# --------------------------------------------------------------------------------
+
+
+def scanner(items: list[StructureElement],
+            converters: list[Converter],
+            general_store: Optional[GeneralStore] = None,
+            record_store: Optional[RecordStore] = None,
+            structure_elements_path: Optional[list[str]] = None,
+            converters_path: Optional[list[str]] = None,
+            restricted_path: Optional[list[str]] = None,
+            crawled_data: Optional[list[db.Record]] = None,
+            debug_tree: Optional[DebugTree] = None):
+    """
+    Crawl a list of StructureElements and apply any matching converters.
+
+    Formerly known as "_crawl".
+
+    items: structure_elements (e.g. files and folders on one level of the hierarchy)
+    converters: locally defined converters for
+            treating structure elements. A locally defined converter could be
+            one that is only valid for a specific subtree of the originally
+            crawled StructureElement structure.
+    general_store and record_store: This recursion of the crawl function should only operate on
+                                    copies of the global stores of the Crawler object.
+    restricted_path: optional, list of strings, traverse the data tree only along the given
+                     path. For example, when a directory contains files a, b and c and b is
+                     given in restricted_path, a and c will be ignored by the crawler.
+                     When the end of the given path is reached, traverse the full tree as
+                     normal. The first element of the list provided by restricted_path should
+                     be the name of the StructureElement at this level, i.e. denoting the
+                     respective element in the items argument.
+    """
+    # This path_found variable stores whether the path given by restricted_path was found in the
+    # data tree
+    path_found = False
+    if restricted_path is not None and len(restricted_path) == 0:
+        restricted_path = None
+
+    if crawled_data is None:
+        crawled_data = []
+
+    if general_store is None:
+        general_store = GeneralStore()
+
+    if record_store is None:
+        record_store = RecordStore()
+
+    if structure_elements_path is None:
+        structure_elements_path = []
+
+    if converters_path is None:
+        converters_path = []
+
+    for element in items:
+        for converter in converters:
+
+            # type is something like "matches files", replace isinstance with "type_matches"
+            # match function tests regexp for example
+            if (converter.typecheck(element) and (
+                    restricted_path is None or element.name == restricted_path[0])
+                    and converter.match(element) is not None):
+                path_found = True
+                general_store_copy = general_store.create_scoped_copy()
+                record_store_copy = record_store.create_scoped_copy()
+
+                # Create an entry for this matched structure element that contains the path:
+                general_store_copy[converter.name] = (
+                    os.path.join(*(structure_elements_path + [element.get_name()])))
+
+                # extracts values from structure element and stores them in the
+                # variable store
+                converter.create_values(general_store_copy, element)
+
+                keys_modified = converter.create_records(
+                    general_store_copy, record_store_copy, element)
+
+                children = converter.create_children(general_store_copy, element)
+
+                if debug_tree is not None:
+                    # add provenance information for each variable
+                    debug_tree.debug_tree[str(element)] = (
+                        general_store_copy.get_storage(), record_store_copy.get_storage())
+                    debug_tree.debug_metadata["copied"][str(element)] = (
+                        general_store_copy.get_dict_copied(),
+                        record_store_copy.get_dict_copied())
+                    debug_tree.debug_metadata["usage"][str(element)].add(
+                        "/".join(converters_path + [converter.name]))
+                    mod_info = debug_tree.debug_metadata["provenance"]
+                    for record_name, prop_name in keys_modified:
+                        # TODO: check
+                        internal_id = record_store_copy.get_internal_id(
+                            record_name)
+                        record_identifier = record_name + \
+                            "_" + str(internal_id)
+                        converter.metadata["usage"].add(record_identifier)
+                        mod_info[record_identifier][prop_name] = (
+                            structure_elements_path + [element.get_name()],
+                            converters_path + [converter.name])
+
+                scanner(children, converter.converters,
+                        general_store_copy, record_store_copy,
+                        structure_elements_path + [element.get_name()],
+                        converters_path + [converter.name],
+                        restricted_path[1:] if restricted_path is not None else None,
+                        crawled_data, debug_tree)
+
+    if restricted_path and not path_found:
+        raise RuntimeError("A 'restricted_path' argument was given that is not contained in "
+                           "the data tree")
+    # if the crawler is running out of scope, copy all records in
+    # the record_store that were created in this scope
+    # to the general update container.
+    scoped_records = record_store.get_records_current_scope()
+    for record in scoped_records:
+        crawled_data.append(record)
+
+    # TODO: the scoped variables should be cleaned up as soon as the variables
+    # are no longer in the current scope. This can be implemented as follows,
+    # but this breaks the test "test_record_structure_generation", because
+    # some debug info is also deleted. This implementation can be used as soon
+    # as the remaining problems with the debug_tree are fixed.
+    # Delete the variables that are no longer needed:
+    # scoped_names = record_store.get_names_current_scope()
+    # for name in scoped_names:
+    #     del record_store[name]
+    #     del general_store[name]
+
+    return crawled_data
+
+
+# --------------------------------------------------------------------------------
+# Main scanning interface functions:
+# --------------------------------------------------------------------------------
+
+
+def scan_directory(dirname: str, crawler_definition_path: str,
+                   restricted_path: Optional[list[str]] = None,
+                   debug_tree: Optional[DebugTree] = None):
+    """ Crawl a single directory.
+
+    Formerly known as "crawl_directory".
+
+    Convenience function that starts the crawler (calls scan_structure_elements)
+    with a single directory as the StructureElement.
+
+    restricted_path: optional, list of strings
+            Traverse the data tree only along the given path. When the end of the given path
+            is reached, traverse the full tree as normal.
+    """
+
+    crawler_definition = load_definition(crawler_definition_path)
+    # Load and register converter packages:
+    converter_registry = create_converter_registry(crawler_definition)
+
+    if not dirname:
+        raise ValueError(
+            "You have to provide a non-empty path for crawling.")
+    dir_structure_name = os.path.basename(dirname)
+
+    # TODO: needs to be covered somewhere else
+    crawled_directory = dirname
+    if not dir_structure_name and dirname.endswith('/'):
+        if dirname == '/':
+            # Crawling the entire file system
+            dir_structure_name = "root"
+        else:
+            # dirname had a trailing '/'
+            dir_structure_name = os.path.basename(dirname[:-1])
+
+    return scan_structure_elements(Directory(dir_structure_name,
+                                             dirname),
+                                   crawler_definition,
+                                   converter_registry,
+                                   restricted_path=restricted_path,
+                                   debug_tree=debug_tree
+                                   )
+
+
+def scan_structure_elements(items: Union[list[StructureElement], StructureElement],
+                            crawler_definition: dict,
+                            converter_registry: dict,
+                            restricted_path: Optional[list[str]] = None,
+                            debug_tree: Optional[DebugTree] = None):
+    """
+    Start point of the crawler recursion.
+
+    Formerly known as "start_crawling".
+
+    Parameters
+    ----------
+    items: list
+       A list of structure elements (or a single StructureElement) that is used for
+       generating the initial items for the crawler. This could e.g. be a Directory.
+    crawler_definition : dict
+        A dictionary representing the crawler definition, possibly from a yaml
+        file.
+    converter_registry : dict
+        The converter registry, as created by `create_converter_registry` from the
+        crawler definition.
+    restricted_path: optional, list of strings
+            Traverse the data tree only along the given path. When the end of the given path
+            is reached, traverse the full tree as normal.
+
+    Returns
+    -------
+    crawled_data : list
+        The final list with the target state of Records.
+    """
+
+    # This function builds the tree of converters out of the crawler definition.
+    if not isinstance(items, list):
+        items = [items]
+
+    # TODO: needs to be covered somewhere else
+    # self.run_id = uuid.uuid1()
+    converters = initialize_converters(crawler_definition, converter_registry)
+
+    return scanner(
+        items=items,
+        converters=converters,
+        restricted_path=restricted_path,
+        debug_tree=debug_tree
+    )
diff --git a/src/caoscrawler/version.py b/src/caoscrawler/version.py
index e73905dcd25673eae88f718a7e45b7b4d0665e47..fdc8323452cd190cc3628efa57c15992f30fabeb 100644
--- a/src/caoscrawler/version.py
+++ b/src/caoscrawler/version.py
@@ -25,8 +25,10 @@ except ImportError:  # Python<3.8 dowesn"t support this so use
 from packaging.version import parse as parse_version
 from warnings import warn
 
-# Read in version of locally installed caoscrawler package
-version = importlib_metadata.version("caoscrawler")
+
+def get_caoscrawler_version():
+    """ Read in version of locally installed caoscrawler package"""
+    return importlib_metadata.version("caoscrawler")
 
 
 class CfoodRequiredVersionError(RuntimeError):
@@ -51,7 +53,7 @@ as expected with the installed version of the crawler.
         warn(msg, UserWarning)
         return
 
-    installed_version = parse_version(version)
+    installed_version = parse_version(get_caoscrawler_version())
     cfood_version = parse_version(metadata["crawler-version"])
 
     if cfood_version > installed_version:
diff --git a/src/doc/README_SETUP.md b/src/doc/README_SETUP.md
index 952a8c94a7dfa24110f320f5dd32b0ad2ac1df01..5f5161d0d672ff3ad14db5c5b49f5c65550b06d7 100644
--- a/src/doc/README_SETUP.md
+++ b/src/doc/README_SETUP.md
@@ -17,6 +17,7 @@ Build documentation in `src/doc` with `make html`.
 - `sphinx`
 - `sphinx-autoapi`
 - `recommonmark`
+- `sphinx-rtd-theme`
 
 ### How to contribute ###
 
diff --git a/src/doc/conf.py b/src/doc/conf.py
index 7719a920328c46b4453cd59413b939fcf2d45f5a..544f7292a766d59891d23235bb380ed90ce0d226 100644
--- a/src/doc/conf.py
+++ b/src/doc/conf.py
@@ -33,10 +33,10 @@ copyright = '2021, MPIDS'
 author = 'Alexander Schlemmer'
 
 # The short X.Y version
-version = '0.4.0'
+version = '0.5.0'
 # The full version, including alpha/beta/rc tags
 # release = '0.5.2-rc2'
-release = '0.4.0'
+release = '0.5.0'
 
 
 # -- General configuration ---------------------------------------------------
diff --git a/src/doc/how-to-upgrade.md b/src/doc/how-to-upgrade.md
index 931fa0cd2f2d621c89c35046d6df4ba6ac9b7a1e..4efc78280ca9ddbb893f166ee3530b3363684081 100644
--- a/src/doc/how-to-upgrade.md
+++ b/src/doc/how-to-upgrade.md
@@ -1,6 +1,18 @@
 # How to upgrade
 
+## 0.4.0 to 0.5.0
+The crawler was split into two modules: the scanner and the crawler. The scanner creates a Record
+structure from the data and the crawler synchronizes this with the server. Due to this change you
+should:
+- Remove the `debug` argument from the Crawler constructor. For debugging, supply a DebugTree as
+  an argument to functions like the scanner.
+- Remove the `generalStore` argument from the Crawler constructor. A store can no longer be
+  provided to the crawler.
+- `load_definition` and `initialize_converters` are now part of the scanner module.
+- `crawl_directory` is replaced by `scan_directory` of the scanner module.
+- `start_crawling` is replaced by `scan_structure_elements` of the scanner module.
+
 ## 0.2.x to 0.3.0
 DictElementConverter (old: DictConverter) now can use "match" keywords. If none are
 in the definition, the behavior is as before.
If you had "match", diff --git a/unittests/scifolder_cfood.yml b/unittests/scifolder_cfood.yml index dce219b751c3e980662a1eaa4904e1163d9836a0..9d6e8cf3ea325ad14641530f2e6cafd43f0dc1bb 100644 --- a/unittests/scifolder_cfood.yml +++ b/unittests/scifolder_cfood.yml @@ -2,6 +2,10 @@ # The full scifolder cfood will be developed here: # https://gitlab.indiscale.com/caosdb/src/crawler-cfoods/scifolder-cfood +--- +metadata: + crawler-version: 0.3.1 +--- Definitions: type: Definitions #include "description.yml" diff --git a/unittests/test_cfood_metadata.py b/unittests/test_cfood_metadata.py index 09d6c88bdc27e1066ed18a9c5865cbfb95270c3a..494bd383d95b4a845b5ea6f86ccff0f9a1db257f 100644 --- a/unittests/test_cfood_metadata.py +++ b/unittests/test_cfood_metadata.py @@ -21,20 +21,12 @@ import pytest import yaml from tempfile import NamedTemporaryFile +from unittest.mock import patch +from unittest.mock import MagicMock, Mock import caoscrawler -CRAWLER_VERSION = "" - - -def setup_function(function): - """Store original crawler version in case it is altered for tests.""" - CRAWLER_VERSION = caoscrawler.version.version - - -def teardown_function(function): - """Reset version""" - caoscrawler.version.version = CRAWLER_VERSION +from caoscrawler.scanner import load_definition def _temp_file_load(txt: str): @@ -46,8 +38,7 @@ def _temp_file_load(txt: str): with NamedTemporaryFile() as f: f.write(txt.encode()) f.flush() - c = caoscrawler.Crawler() - definition = c.load_definition(f.name) + definition = load_definition(f.name) return definition @@ -68,9 +59,12 @@ SimulationData: with pytest.warns(UserWarning) as uw: _temp_file_load(definition_text) - assert len(uw) == 1 - assert "No crawler version specified in cfood definition" in uw[0].message.args[0] - assert "Specifying a version is highly recommended" in uw[0].message.args[0] + found = False + for w in uw: + if ("No crawler version specified in cfood definition" in w.message.args[0] and + "Specifying a version is highly recommended" in w.message.args[0]): + found = True + assert found # metadata section is missing alltogether definition_text = """ @@ -82,12 +76,16 @@ SimulationData: with pytest.warns(UserWarning) as uw: _temp_file_load(definition_text) - assert len(uw) == 1 - assert "No crawler version specified in cfood definition" in uw[0].message.args[0] - assert "Specifying a version is highly recommended" in uw[0].message.args[0] + found = False + for w in uw: + if ("No crawler version specified in cfood definition" in w.message.args[0] and + "Specifying a version is highly recommended" in w.message.args[0]): + found = True + assert found -def test_warning_if_version_too_old(): +@patch("caoscrawler.version.get_caoscrawler_version") +def test_warning_if_version_too_old(get_version): """Warn if the cfood was written for an older crawler version.""" definition_text = """ @@ -102,31 +100,38 @@ SimulationData: match: SimulationData """ - # higher minor - caoscrawler.version.version = "0.3.0" + get_version.side_effect = lambda: "0.3.0" with pytest.warns(UserWarning) as uw: _temp_file_load(definition_text) - assert len(uw) == 1 - assert "cfood was written for a previous crawler version" in uw[0].message.args[0] - assert "version specified in cfood: 0.2.0" in uw[0].message.args[0] - assert "version installed on your system: 0.3.0" in uw[0].message.args[0] + found = False + for w in uw: + if ("cfood was written for a previous crawler version" in w.message.args[0] and + "version specified in cfood: 0.2.0" in w.message.args[0] and + "version installed on your 
system: 0.3.0" in w.message.args[0]): + found = True + assert found # higher major - caoscrawler.version.version = "1.1.0" + get_version.side_effect = lambda: "1.1.0" with pytest.warns(UserWarning) as uw: _temp_file_load(definition_text) - assert len(uw) == 1 - assert "cfood was written for a previous crawler version" in uw[0].message.args[0] - assert "version specified in cfood: 0.2.0" in uw[0].message.args[0] - assert "version installed on your system: 1.1.0" in uw[0].message.args[0] + found = False + for w in uw: + if ("cfood was written for a previous crawler version" in w.message.args[0] and + "version specified in cfood: 0.2.0" in w.message.args[0] and + "version installed on your system: 1.1.0" in w.message.args[0]): + found = True + assert found -def test_error_if_version_too_new(): +@patch("caoscrawler.version.get_caoscrawler_version") +def test_error_if_version_too_new(get_version): """Raise error if the cfood requires a newer crawler version.""" # minor too old + get_version.side_effect = lambda: "0.1.5" definition_text = """ --- metadata: @@ -138,7 +143,6 @@ SimulationData: type: Directory match: SimulationData """ - caoscrawler.version.version = "0.1.5" with pytest.raises(caoscrawler.CfoodRequiredVersionError) as cre: _temp_file_load(definition_text) @@ -166,7 +170,7 @@ SimulationData: assert "version installed on your system: 0.1.5" in str(cre.value) # patch to old - caoscrawler.version.version = "1.0.0" + get_version.side_effect = lambda: "1.0.0" with pytest.raises(caoscrawler.CfoodRequiredVersionError) as cre: _temp_file_load(definition_text) @@ -176,7 +180,8 @@ SimulationData: assert "version installed on your system: 1.0.0" in str(cre.value) -def test_matching_version(): +@patch("caoscrawler.version.get_caoscrawler_version") +def test_matching_version(get_version): """Test that there is no warning or error in case the version matches.""" definition_text = """ @@ -190,10 +195,10 @@ SimulationData: type: Directory match: SimulationData """ - caoscrawler.version.version = "0.2.1" + get_version.side_effect = lambda: "0.2.1" assert _temp_file_load(definition_text) # The version is also considered a match if the patch version of the # installed crawler is newer than the one specified in the cfood metadata - caoscrawler.version.version = "0.2.7" + get_version.side_effect = lambda: "0.2.7" assert _temp_file_load(definition_text) diff --git a/unittests/test_converters.py b/unittests/test_converters.py index 4d3791fce3ceffaafe529423e4020ebd6a4231ba..154724be6d126aefb430c7d0600b86a5ec721812 100644 --- a/unittests/test_converters.py +++ b/unittests/test_converters.py @@ -45,6 +45,7 @@ from caoscrawler.stores import GeneralStore from caoscrawler.structure_elements import (File, TextElement, ListElement, DictElement, BooleanElement, IntegerElement, FloatElement, Directory) +from caoscrawler.scanner import load_definition, _load_definition_from_yaml_dict, create_converter_registry from test_tool import rfp @@ -437,6 +438,7 @@ def test_filter_children_of_directory(converter_registry, capsys): children = dc.create_children(None, test_dir) +@pytest.mark.filterwarnings("ignore::UserWarning") def test_validate_custom_converters(): one_doc_yaml = """ Converters: @@ -447,8 +449,7 @@ MyElement: type: MyNewType match: something """ - crawler1 = Crawler() - one_doc_definitions = crawler1._load_definition_from_yaml_dict( + one_doc_definitions = _load_definition_from_yaml_dict( [yaml.load(one_doc_yaml, Loader=yaml.SafeLoader)]) assert "MyElement" in one_doc_definitions assert 
one_doc_definitions["MyElement"]["type"] == "MyNewType" @@ -457,6 +458,7 @@ MyElement: two_doc_yaml = """ --- metadata: + crawler-version: 0.3.1 Converters: MyNewType: converter: MyNewTypeConverter @@ -466,8 +468,7 @@ MyElement: type: MyNewType match: something """ - crawler2 = Crawler() - two_doc_definitions = crawler2._load_definition_from_yaml_dict( + two_doc_definitions = _load_definition_from_yaml_dict( list(yaml.safe_load_all(two_doc_yaml))) assert "MyElement" in two_doc_definitions assert two_doc_definitions["MyElement"]["type"] == one_doc_definitions["MyElement"]["type"] @@ -588,8 +589,7 @@ def test_date_converter(): def test_load_converters(): - c = Crawler() - converter_registry = c.load_converters({}) + converter_registry = create_converter_registry({}) # The previous function call actually already asserts that all defined # converter classes can be loaded from their respective packages. diff --git a/unittests/test_crawler.py b/unittests/test_crawler.py new file mode 100644 index 0000000000000000000000000000000000000000..f3ad73c5d75acea5fd3e92954e3899983ea73a2a --- /dev/null +++ b/unittests/test_crawler.py @@ -0,0 +1,76 @@ +#!/usr/bin/env python3 +# encoding: utf-8 +# +# This file is a part of the CaosDB Project. +# +# Copyright (C) 2023 Indiscale GmbH <info@indiscale.com> +# Copyright (C) 2023 Henrik tom Wörden <h.tomwoerden@indiscale.com> +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <https://www.gnu.org/licenses/>. +# + +""" +test the Crawler class +""" +import json +import os + +from pytest import raises + +import caosdb as db + +from caoscrawler.stores import GeneralStore +from caoscrawler.crawl import Crawler +import warnings + +from test_tool import rfp +import pytest + + +@pytest.mark.filterwarnings("ignore::DeprecationWarning") +def test_constructor(): + with warnings.catch_warnings(record=True) as w: + # Cause all warnings to always be triggered. + warnings.filterwarnings("ignore") + warnings.filterwarnings("always", category=DeprecationWarning) + + Crawler(debug=True) + assert issubclass(w[-1].category, DeprecationWarning) + assert "The debug argument of the Crawler class" in str(w[-1].message) + + Crawler(generalStore=GeneralStore()) + assert issubclass(w[-1].category, DeprecationWarning) + assert "The generalStore argument of the Crawler" in str(w[-1].message) + + +@pytest.mark.filterwarnings("ignore::DeprecationWarning") +def test_deprecated_functions(): + with warnings.catch_warnings(record=True) as w: + # Cause all warnings to always be triggered. 
+ warnings.filterwarnings("ignore") + warnings.filterwarnings("always", category=DeprecationWarning) + cr = Crawler() + cr.crawl_directory(".", rfp("scifolder_cfood.yml")) + print(w) + print(w[0].message) + assert issubclass(w[-1].category, DeprecationWarning) + assert "The function crawl_directory in the crawl" in str(w[-1].message) + + cr.start_crawling([], {}, {}) + assert issubclass(w[-1].category, DeprecationWarning) + assert "The function start_crawling in the crawl module" in str(w[-1].message) + + cr.crawled_data + assert issubclass(w[-1].category, DeprecationWarning) + assert "The use of self.crawled_data is depricated" in str(w[-1].message) diff --git a/unittests/test_issues.py b/unittests/test_issues.py index a1724e5a989190977a7ec0d86846fc2b7433ab5d..46157af9225c11b79e76dd3ef856d60519a6eb9d 100644 --- a/unittests/test_issues.py +++ b/unittests/test_issues.py @@ -30,6 +30,8 @@ from caoscrawler.identifiable_adapters import CaosDBIdentifiableAdapter from caoscrawler.structure_elements import DictElement from test_tool import rfp +from caoscrawler.scanner import create_converter_registry, scan_structure_elements + def test_issue_10(): """Test integer-to-float conversion in dictionaries""" @@ -55,14 +57,13 @@ def test_issue_10(): } } - crawler = Crawler(debug=True) - converter_registry = crawler.load_converters(crawler_definition) + converter_registry = create_converter_registry(crawler_definition) test_dict = { "float_value": 4 } - records = crawler.start_crawling( + records = scan_structure_elements( DictElement("TestDict", test_dict), crawler_definition, converter_registry) assert len(records) == 1 assert records[0].parents[0].name == "TestRec" @@ -94,7 +95,7 @@ def test_list_datatypes(): } } - crawler = Crawler(debug=True) + crawler = Crawler() converter_registry = crawler.load_converters(crawler_definition) test_dict = { diff --git a/unittests/test_json.py b/unittests/test_json.py index 41fd31a43389148ad6fbc4167fd3fbd4f7f2ee9f..3c120be174ff819baeeaa49ddf142cf40dba751e 100644 --- a/unittests/test_json.py +++ b/unittests/test_json.py @@ -36,6 +36,7 @@ import caosdb as db from caoscrawler.converters import JSONFileConverter from caoscrawler.crawl import Crawler from caoscrawler.structure_elements import File, JSONFile +from caoscrawler.scanner import load_definition, create_converter_registry, scan_structure_elements from test_tool import rfp, dircheckstr @@ -44,12 +45,11 @@ def test_json(): "jsontest_cfood.yml") json_file_path = rfp("test_directories", "examples_json", "testjson.json") - crawler = Crawler(debug=True) - crawler_definition = crawler.load_definition(crawler_definition_path) + crawler_definition = load_definition(crawler_definition_path) # Load and register converter packages: - converter_registry = crawler.load_converters(crawler_definition) + converter_registry = create_converter_registry(crawler_definition) - records = crawler.start_crawling( + records = scan_structure_elements( JSONFile(os.path.basename(json_file_path), json_file_path), crawler_definition, converter_registry @@ -70,8 +70,7 @@ def test_json(): def test_broken_validation(): crawler_definition_path = rfp( "broken_cfoods", "broken_validation_path.yml") - crawler = Crawler() with raises(FileNotFoundError) as err: - crawler_definition = crawler.load_definition(crawler_definition_path) + crawler_definition = load_definition(crawler_definition_path) assert str(err.value).startswith("Couldn't find validation file") diff --git a/unittests/test_macros.py b/unittests/test_macros.py index 
b5ea5d84846f5f33853910c292132d7b5026600e..5244307db8e694ffb4864380d33936ebb76ae715 100644 --- a/unittests/test_macros.py +++ b/unittests/test_macros.py @@ -25,6 +25,7 @@ from caoscrawler.macros import defmacro_constructor, macro_constructor from caoscrawler.macros.macro_yaml_object import macro_store from caoscrawler.crawl import Crawler +from caoscrawler.scanner import load_definition from tempfile import NamedTemporaryFile @@ -52,8 +53,7 @@ def _temp_file_load(txt: str): with NamedTemporaryFile() as f: f.write(txt.encode()) f.flush() - c = Crawler() - definition = c.load_definition(f.name) + definition = load_definition(f.name) return definition @@ -142,6 +142,7 @@ def test_multi_macros_toplevel(register_macros, macro_store_reset): dat_loader = list(yaml.safe_load_all(""" --- metadata: + crawler-version: 0.3.1 macros: - !defmacro name: test_one @@ -168,6 +169,10 @@ testnode: !macro def test_load_definition(register_macros, macro_store_reset): txt = """ +--- +metadata: + crawler-version: 0.3.1 +--- extroot: type: Directory match: extroot @@ -183,6 +188,7 @@ extroot: cfood = _temp_file_load(""" --- metadata: + crawler-version: 0.3.1 macros: - !defmacro name: test_one @@ -256,6 +262,7 @@ def test_circular_macro_definition(register_macros, macro_store_reset): cfood = _temp_file_load(""" --- metadata: + crawler-version: 0.3.1 macros: - !defmacro name: test_one @@ -304,6 +311,7 @@ def test_use_macro_twice(): cfood = _temp_file_load(""" --- metadata: + crawler-version: 0.3.1 macros: - !defmacro name: test_twice @@ -337,6 +345,7 @@ def test_documentation_example_2(): cfood = _temp_file_load(""" --- metadata: + crawler-version: 0.3.1 macros: - !defmacro name: MarkdownFile @@ -374,6 +383,7 @@ def test_documentation_example_1(): cfood = _temp_file_load(""" --- metadata: + crawler-version: 0.3.1 macros: - !defmacro name: SimulationDatasetFile @@ -422,6 +432,7 @@ def test_def_replacements(): cfood = _temp_file_load(""" --- metadata: + crawler-version: 0.3.1 macros: - !defmacro name: test_def_replacements diff --git a/unittests/test_scalars_cfood.py b/unittests/test_scalars_cfood.py index ac408b2dab0fa151c370d3ec6ffd1dced22c77d7..89d94fc74ebda6aedfbee422294e99eab2216d73 100644 --- a/unittests/test_scalars_cfood.py +++ b/unittests/test_scalars_cfood.py @@ -10,16 +10,11 @@ from caoscrawler.converters import handle_value from caoscrawler.crawl import Crawler # We need the store for the above function from caoscrawler.stores import GeneralStore - -from test_tool import dircheckstr, rfp +from caoscrawler.scanner import scan_directory +from caoscrawler.debug_tree import DebugTree -@pytest.fixture -def crawler(): - crawler = Crawler(debug=True) - crawler.crawl_directory(rfp("test_directories", "examples_article"), - rfp("cfoods_scalar.yml")) - return crawler +from test_tool import dircheckstr, rfp def test_handle_value(): @@ -38,8 +33,11 @@ def test_handle_value(): assert handle_value([4, 3, 2], store) == ([4, 3, 2], "single") -def test_record_structure_generation(crawler): - subd = crawler.debug_tree[dircheckstr("DataAnalysis")] +def test_record_structure_generation(): + dbt = DebugTree() + scan_directory(rfp("test_directories", "examples_article"), rfp("cfoods_scalar.yml"), + debug_tree=dbt) + subd = dbt.debug_tree[dircheckstr("DataAnalysis")] assert len(subd) == 2 # variables store on Data Analysis node of debug tree if "Data" in subd[0]: diff --git a/unittests/test_schema.py b/unittests/test_schema.py index 0736698eb32146fb3cfbee6acbcf11f5436df27e..0d5bebce98fbc8c789c1080bcf3919f128bdbf54 100644 --- 
a/unittests/test_schema.py +++ b/unittests/test_schema.py @@ -13,6 +13,8 @@ from pytest import raises from jsonschema.exceptions import ValidationError +from caoscrawler.scanner import load_definition + def rfp(*pathcomponents): """ @@ -23,9 +25,8 @@ def rfp(*pathcomponents): def test_schema_validation(): - cr = Crawler() - cr.load_definition(rfp("scifolder_cfood.yml")) - cr.load_definition(rfp("scifolder_extended.yml")) + load_definition(rfp("scifolder_cfood.yml")) + load_definition(rfp("scifolder_extended.yml")) with raises(ValidationError, match=".*enum.*"): - cr.load_definition(rfp("broken_cfoods", "broken1.yml")) + load_definition(rfp("broken_cfoods", "broken1.yml")) diff --git a/unittests/test_table_converter.py b/unittests/test_table_converter.py index abe4ac85ec4fc0a78e71c177222817e1b84e9e56..d739695fc4c6a019f28f3c3697e3f134e0f1755e 100644 --- a/unittests/test_table_converter.py +++ b/unittests/test_table_converter.py @@ -28,6 +28,8 @@ test the converters module from caoscrawler.converters import Converter from caoscrawler.stores import GeneralStore +from caoscrawler.scanner import scan_directory +from caoscrawler.debug_tree import DebugTree from caoscrawler.converters import (ConverterValidationError, DictConverter, XLSXTableConverter, CSVTableConverter) from caoscrawler.structure_elements import Directory @@ -91,14 +93,6 @@ def dircheckstr(*pathcomponents): return "caoscrawler.structure_elements.File: " + basename(join(*pathcomponents)) + ", " + rfp("test_directories", "examples_tables", "ExperimentalData", *pathcomponents) -@pytest.fixture -def crawler(): - crawler = Crawler(debug=True) - crawler.crawl_directory(rfp("test_directories", "examples_tables", "ExperimentalData"), - rfp("test_directories", "examples_tables", "crawler_for_tables.yml")) - return crawler - - def test_convert_table(converter_registry): extentions = ["xlsx", "csv", "tsv"] if importlib.util.find_spec("odf") is not None: @@ -151,9 +145,13 @@ def test_convert_table(converter_registry): assert res[0].name == "jdsfkljadskf" -def test_crawl_csv_table(crawler): +def test_crawl_csv_table(): + dbt = DebugTree() + scan_directory(rfp("test_directories", "examples_tables", "ExperimentalData"), + rfp("test_directories", "examples_tables", "crawler_for_tables.yml"), + debug_tree=dbt) for file_ext in ["xlsx", "csv"]: - subd = crawler.debug_tree[dircheckstr("test1." + file_ext)] + subd = dbt.debug_tree[dircheckstr("test1." 
+ file_ext)] record_experiment = subd[1]["Experiment"] assert isinstance(record_experiment, db.Record) assert isinstance(record_experiment.get_property("Measurements").value, list) diff --git a/unittests/test_tool.py b/unittests/test_tool.py index e15d7cb777ced4b92566df2b25b375e90be39295..08b3a0e4f9623e996540746ac408801090b97aa3 100755 --- a/unittests/test_tool.py +++ b/unittests/test_tool.py @@ -32,6 +32,8 @@ import os from caoscrawler.crawl import Crawler, SecurityMode, split_restricted_path from caoscrawler.identifiable import Identifiable from caoscrawler.structure_elements import File, DictTextElement, DictListElement, DictElement +from caoscrawler.scanner import scan_directory +from caoscrawler.debug_tree import DebugTree from caoscrawler.identifiable_adapters import IdentifiableAdapter, LocalStorageIdentifiableAdapter from simulated_server_data import full_data from functools import partial @@ -48,6 +50,8 @@ from caosdb.apiutils import compare_entities import pytest from pytest import raises +from caoscrawler.scanner import create_converter_registry, scan_structure_elements + def rfp(*pathcomponents): """ @@ -74,16 +78,18 @@ def dircheckstr(*pathcomponents): @pytest.fixture def crawler(): - crawler = Crawler(debug=True) - crawler.crawl_directory(rfp("test_directories", "examples_article"), - rfp("scifolder_cfood.yml")) - return crawler + crawler = Crawler() + debug_tree = DebugTree() + crawled_data = scan_directory( + rfp("test_directories", "examples_article"), + rfp("scifolder_cfood.yml"), debug_tree=debug_tree) + return crawler, crawled_data, debug_tree @pytest.fixture def ident(crawler): ident = LocalStorageIdentifiableAdapter() - crawler.identifiableAdapter = ident + crawler[0].identifiableAdapter = ident # The records.xml file is constructed as follows: # To a full run of the crawler, resolve all identifiables and insert all resulting entities. @@ -109,11 +115,16 @@ def ident(crawler): return ident -def test_record_structure_generation(crawler): +def test_record_structure_generation(): # TODO How does this test relate to the test function in test_scalars_cfood with the same name? 
# There seems to be code duplication - subd = crawler.debug_tree[dircheckstr("DataAnalysis")] - subc = crawler.debug_metadata["copied"][dircheckstr("DataAnalysis")] + + dbt = DebugTree() + scan_directory(rfp("test_directories", "examples_article"), + rfp("scifolder_cfood.yml"), + debug_tree=dbt) + subd = dbt.debug_tree[dircheckstr("DataAnalysis")] + subc = dbt.debug_metadata["copied"][dircheckstr("DataAnalysis")] assert len(subd) == 2 # variables store on Data Analysis node of debug tree assert len(subd[0]) == 4 @@ -127,9 +138,9 @@ def test_record_structure_generation(crawler): assert subd[0]["DataAnalysis"] == "examples_article/DataAnalysis" assert subc[0]["DataAnalysis"] is False - subd = crawler.debug_tree[dircheckstr( + subd = dbt.debug_tree[dircheckstr( "DataAnalysis", "2020_climate-model-predict")] - subc = crawler.debug_metadata["copied"][dircheckstr( + subc = dbt.debug_metadata["copied"][dircheckstr( "DataAnalysis", "2020_climate-model-predict")] assert len(subd[1]) == 1 @@ -157,12 +168,12 @@ def test_record_structure_generation(crawler): assert subc[0]["date"] is False assert subc[0]["identifier"] is False - subd = crawler.debug_tree[dircheckstr("DataAnalysis", - "2020_climate-model-predict", - "2020-02-08_prediction-errors")] - subc = crawler.debug_metadata["copied"][dircheckstr("DataAnalysis", - "2020_climate-model-predict", - "2020-02-08_prediction-errors")] + subd = dbt.debug_tree[dircheckstr("DataAnalysis", + "2020_climate-model-predict", + "2020-02-08_prediction-errors")] + subc = dbt.debug_metadata["copied"][dircheckstr("DataAnalysis", + "2020_climate-model-predict", + "2020-02-08_prediction-errors")] assert len(subd[0]) == 12 assert subd[0]["date"] == "2020-02-08" assert subd[0]["identifier"] == "prediction-errors" @@ -213,6 +224,7 @@ def test_record_structure_generation(crawler): def test_crawler_update_list(crawler, ident): + crawled_data = crawler[1] # If the following assertions fail, that is a hint, that the test file records.xml has changed # and this needs to be updated: assert len(ident.get_records()) == 18 @@ -227,7 +239,7 @@ def test_crawler_update_list(crawler, ident): ) == 2 # The crawler contains lots of duplicates, because identifiables have not been resolved yet: - assert len(ident.get_records()) != len(crawler.crawled_data) + assert len(ident.get_records()) != len(crawled_data) # Check consistency: # Check whether identifiables retrieved from current identifiable store return @@ -283,7 +295,7 @@ def test_crawler_update_list(crawler, ident): def test_synchronization(crawler, ident): - insl, updl = crawler.synchronize(commit_changes=False) + insl, updl = crawler[0].synchronize(commit_changes=False, crawled_data=crawler[1]) assert len(insl) == 0 assert len(updl) == 0 @@ -332,16 +344,16 @@ def test_remove_unnecessary_updates(): @pytest.mark.xfail def test_identifiable_adapter_no_identifiable(crawler, ident): del ident._registered_identifiables["Person"] - insl, updl = crawler.synchronize() + insl, updl = crawler[0].synchronize() assert len(updl) == 0 - pers = [r for r in crawler.crawled_data if r.parents[0].name == "Person"] + pers = [r for r in crawler[0].crawled_data if r.parents[0].name == "Person"] # All persons are inserted, because they are not identifiable: assert len(insl) == len(pers) def test_provenance_debug_data(crawler): - crawler.save_debug_data(rfp("provenance.yml")) + crawler[0].save_debug_data(rfp("provenance.yml"), debug_tree=crawler[2]) with open(rfp("provenance.yml"), "r") as f: provenance = yaml.load(f, Loader=yaml.SafeLoader) @@ -356,7 
+368,7 @@ def test_provenance_debug_data(crawler): def test_split_into_inserts_and_updates_trivial(crawler): - crawler.split_into_inserts_and_updates([]) + crawler[0].split_into_inserts_and_updates([]) def basic_retrieve_by_name_mock_up(rec, referencing_entities=None, known=None): @@ -370,21 +382,21 @@ def basic_retrieve_by_name_mock_up(rec, referencing_entities=None, known=None): @pytest.fixture def crawler_mocked_identifiable_retrieve(crawler): # mock retrieval of registered identifiabls: return Record with just a parent - crawler.identifiableAdapter.get_registered_identifiable = Mock( + crawler[0].identifiableAdapter.get_registered_identifiable = Mock( side_effect=lambda x: db.Record().add_parent(x.parents[0].name)) # Simulate remote server content by using the names to identify records # There is only a single known Record with name A - crawler.identifiableAdapter.retrieve_identified_record_for_record = Mock(side_effect=partial( + crawler[0].identifiableAdapter.retrieve_identified_record_for_record = Mock(side_effect=partial( basic_retrieve_by_name_mock_up, known={"A": db.Record(id=1111, name="A")})) - crawler.identifiableAdapter.retrieve_identified_record_for_identifiable = Mock( + crawler[0].identifiableAdapter.retrieve_identified_record_for_identifiable = Mock( side_effect=partial( basic_retrieve_by_name_mock_up, known={"A": db.Record(id=1111, name="A")})) return crawler def test_split_into_inserts_and_updates_single(crawler_mocked_identifiable_retrieve): - crawler = crawler_mocked_identifiable_retrieve + crawler = crawler_mocked_identifiable_retrieve[0] identlist = [Identifiable(name="A", record_type="C"), Identifiable(name="B", record_type="C")] entlist = [db.Record(name="A").add_parent( "C"), db.Record(name="B").add_parent("C")] @@ -409,7 +421,7 @@ def test_split_into_inserts_and_updates_single(crawler_mocked_identifiable_retri def test_split_into_inserts_and_updates_with_duplicate(crawler_mocked_identifiable_retrieve): - crawler = crawler_mocked_identifiable_retrieve + crawler = crawler_mocked_identifiable_retrieve[0] a = db.Record(name="A").add_parent("C") b = db.Record(name="B").add_parent("C") b.add_property("A", a) @@ -427,7 +439,7 @@ def test_split_into_inserts_and_updates_with_duplicate(crawler_mocked_identifiab def test_split_into_inserts_and_updates_with_ref(crawler_mocked_identifiable_retrieve): - crawler = crawler_mocked_identifiable_retrieve + crawler = crawler_mocked_identifiable_retrieve[0] # try it with a reference a = db.Record(name="A").add_parent("C") b = db.Record(name="B").add_parent("C") @@ -454,7 +466,7 @@ def test_split_into_inserts_and_updates_with_circ(crawler): def test_split_into_inserts_and_updates_with_complex(crawler_mocked_identifiable_retrieve): - crawler = crawler_mocked_identifiable_retrieve + crawler = crawler_mocked_identifiable_retrieve[0] # A # ^ # | @@ -481,7 +493,7 @@ def test_split_into_inserts_and_updates_with_complex(crawler_mocked_identifiable def test_split_into_inserts_and_updates_with_copy_attr(crawler_mocked_identifiable_retrieve): - crawler = crawler_mocked_identifiable_retrieve + crawler = crawler_mocked_identifiable_retrieve[0] # assume identifiable is only the name a = db.Record(name="A").add_parent("C") a.add_property("foo", 1) @@ -500,7 +512,7 @@ def test_split_into_inserts_and_updates_with_copy_attr(crawler_mocked_identifiab def test_has_missing_object_in_references(crawler): # Simulate remote server content by using the names to identify records # There are only two known Records with name A and B - 
crawler.identifiableAdapter.get_registered_identifiable = Mock(side_effect=partial( + crawler[0].identifiableAdapter.get_registered_identifiable = Mock(side_effect=partial( basic_retrieve_by_name_mock_up, known={"C": db.Record(name="C").add_parent("RTC") .add_property("d"), "D": db.Record(name="D").add_parent("RTD") @@ -508,56 +520,56 @@ def test_has_missing_object_in_references(crawler): })) # one reference with id -> check - assert not crawler._has_missing_object_in_references( + assert not crawler[0]._has_missing_object_in_references( Identifiable(name="C", record_type="RTC", properties={'d': 123}), []) # one ref with Entity with id -> check - assert not crawler._has_missing_object_in_references( + assert not crawler[0]._has_missing_object_in_references( Identifiable(name="C", record_type="RTC", properties={'d': db.Record(id=123) .add_parent("C")}), []) # one ref with id one with Entity with id (mixed) -> check - assert not crawler._has_missing_object_in_references( + assert not crawler[0]._has_missing_object_in_references( Identifiable(name="C", record_type="RTD", properties={'d': 123, 'b': db.Record(id=123).add_parent("RTC")}), []) # entity to be referenced in the following a = db.Record(name="C").add_parent("C").add_property("d", 12311) # one ref with id one with Entity without id (but not identifying) -> fail - assert not crawler._has_missing_object_in_references( + assert not crawler[0]._has_missing_object_in_references( Identifiable(name="C", record_type="RTC", properties={'d': 123, 'e': a}), []) # one ref with id one with Entity without id (mixed) -> fail - assert not crawler._has_missing_object_in_references( + assert not crawler[0]._has_missing_object_in_references( Identifiable(name="D", record_type="RTD", properties={'d': 123, 'e': a}), []) - crawler.add_to_remote_missing_cache(a, Identifiable(name="C", record_type="RTC", - properties={'d': 12311})) + crawler[0].add_to_remote_missing_cache(a, Identifiable(name="C", record_type="RTC", + properties={'d': 12311})) # one ref with id one with Entity without id but in cache -> check - assert crawler._has_missing_object_in_references( + assert crawler[0]._has_missing_object_in_references( Identifiable(name="D", record_type="RTD", properties={'d': 123, 'e': a}), []) # if this ever fails, the mock up may be removed - crawler.identifiableAdapter.get_registered_identifiable.assert_called() + crawler[0].identifiableAdapter.get_registered_identifiable.assert_called() @pytest.mark.xfail() def test_references_entities_without_ids(crawler, ident): - assert not crawler._has_reference_value_without_id(db.Record().add_parent("Person") - .add_property('last_name', 123) - .add_property('first_name', 123)) + assert not crawler[0]._has_reference_value_without_id(db.Record().add_parent("Person") + .add_property('last_name', 123) + .add_property('first_name', 123)) # id and rec with id - assert not crawler._has_reference_value_without_id(db.Record().add_parent("Person") - .add_property('first_name', 123) - .add_property('last_name', - db.Record(id=123))) + assert not crawler[0]._has_reference_value_without_id(db.Record().add_parent("Person") + .add_property('first_name', 123) + .add_property('last_name', + db.Record(id=123))) # id and rec with id and one unneeded prop - assert crawler._has_reference_value_without_id(db.Record().add_parent("Person") - .add_property('first_name', 123) - .add_property('stuff', db.Record()) - .add_property('last_name', db.Record(id=123))) + assert 
crawler[0]._has_reference_value_without_id(db.Record().add_parent("Person") + .add_property('first_name', 123) + .add_property('stuff', db.Record()) + .add_property('last_name', db.Record(id=123))) # one identifying prop is missing - assert crawler._has_reference_value_without_id(db.Record().add_parent("Person") - .add_property('first_name', 123) - .add_property('last_name', db.Record())) + assert crawler[0]._has_reference_value_without_id(db.Record().add_parent("Person") + .add_property('first_name', 123) + .add_property('last_name', db.Record())) def test_replace_entities_with_ids(crawler): @@ -565,7 +577,7 @@ def test_replace_entities_with_ids(crawler): .add_property("B", db.Record(id=12345)) .add_property("C", [db.Record(id=12345), 233324])) - crawler.replace_entities_with_ids(a) + crawler[0].replace_entities_with_ids(a) assert a.get_property("A").value == 12345 assert a.get_property("B").value == 12345 assert a.get_property("C").value == [12345, 233324] @@ -589,12 +601,14 @@ def mock_get_entity_by_name(name): def prepare_crawler_with_sec_mode(mode, ident): - crawler = Crawler(debug=True, securityMode=mode) - crawler.crawl_directory(rfp("test_directories", "examples_article"), - rfp("scifolder_cfood.yml")) + crawler = Crawler(securityMode=mode) + debug_tree = DebugTree() + crawled_data = scan_directory( + rfp("test_directories", "examples_article"), + rfp("scifolder_cfood.yml"), debug_tree=debug_tree) crawler.identifiableAdapter = ident - return crawler + return crawler, crawled_data, debug_tree def reset_mocks(mocks): @@ -647,19 +661,19 @@ def test_security_mode(updateCacheMock, upmock, insmock, ident): records_backup = deepcopy(ident._records) # trivial case: nothing to do - crawler = prepare_crawler_with_sec_mode(SecurityMode.RETRIEVE, ident) - crawler.synchronize(commit_changes=True) + crawler, crawled_data, debug_tree = prepare_crawler_with_sec_mode(SecurityMode.RETRIEVE, ident) + crawler.synchronize(commit_changes=True, crawled_data=crawled_data) assert crawler.run_id is not None insmock.assert_not_called() upmock.assert_not_called() updateCacheMock.assert_not_called() # RETRIEVE: insert only - crawler = prepare_crawler_with_sec_mode(SecurityMode.RETRIEVE, ident) + crawler, crawled_data, debug_tree = prepare_crawler_with_sec_mode(SecurityMode.RETRIEVE, ident) # remove one element del ident._records[-1] # insert forbidden - crawler.synchronize(commit_changes=True) + crawler.synchronize(commit_changes=True, crawled_data=crawled_data) assert crawler.run_id is not None insmock.assert_not_called() upmock.assert_not_called() @@ -670,10 +684,10 @@ def test_security_mode(updateCacheMock, upmock, insmock, ident): ident._records = deepcopy(records_backup) # RETRIEVE: update only - crawler = prepare_crawler_with_sec_mode(SecurityMode.RETRIEVE, ident) + crawler, crawled_data, debug_tree = prepare_crawler_with_sec_mode(SecurityMode.RETRIEVE, ident) # change one element change_non_identifiable_prop(ident) - crawler.synchronize(commit_changes=True) + crawler.synchronize(commit_changes=True, crawled_data=crawled_data) assert crawler.run_id is not None insmock.assert_not_called() upmock.assert_not_called() @@ -684,10 +698,10 @@ def test_security_mode(updateCacheMock, upmock, insmock, ident): ident._records = deepcopy(records_backup) # INSERT: insert only - crawler = prepare_crawler_with_sec_mode(SecurityMode.INSERT, ident) + crawler, crawled_data, debug_tree = prepare_crawler_with_sec_mode(SecurityMode.INSERT, ident) # remove one element del ident._records[-1] - 
crawler.synchronize(commit_changes=True) + crawler.synchronize(commit_changes=True, crawled_data=crawled_data) assert crawler.run_id is not None insmock.assert_called_once() upmock.assert_not_called() @@ -698,10 +712,10 @@ def test_security_mode(updateCacheMock, upmock, insmock, ident): ident._records = deepcopy(records_backup) # INSERT: update only - crawler = prepare_crawler_with_sec_mode(SecurityMode.INSERT, ident) + crawler, crawled_data, debug_tree = prepare_crawler_with_sec_mode(SecurityMode.INSERT, ident) # change one element change_non_identifiable_prop(ident) - crawler.synchronize(commit_changes=True) + crawler.synchronize(commit_changes=True, crawled_data=crawled_data) assert crawler.run_id is not None insmock.assert_not_called() upmock.assert_not_called() @@ -712,11 +726,11 @@ def test_security_mode(updateCacheMock, upmock, insmock, ident): ident._records = deepcopy(records_backup) # INSERT: insert and update - crawler = prepare_crawler_with_sec_mode(SecurityMode.INSERT, ident) + crawler, crawled_data, debug_tree = prepare_crawler_with_sec_mode(SecurityMode.INSERT, ident) # change two elements change_non_identifiable_prop(ident) change_identifiable_prop(ident) - crawler.synchronize(commit_changes=True) + crawler.synchronize(commit_changes=True, crawled_data=crawled_data) assert crawler.run_id is not None insmock.asser_called_once() upmock.assert_not_called() @@ -769,14 +783,14 @@ def crawler_mocked_for_backref_test(crawler): "is_referenced_by", value=["BR", "BR2"]) else: return db.Record().add_parent(x.parents[0].name) - crawler.identifiableAdapter.get_registered_identifiable = Mock(side_effect=get_reg_ident) + crawler[0].identifiableAdapter.get_registered_identifiable = Mock(side_effect=get_reg_ident) # Simulate remote server content by using the names to identify records # There is only a single known Record with name A - crawler.identifiableAdapter.retrieve_identified_record_for_record = Mock(side_effect=partial( + crawler[0].identifiableAdapter.retrieve_identified_record_for_record = Mock(side_effect=partial( basic_retrieve_by_name_mock_up, known={"A": db.Record(id=1111, name="A").add_parent("BR")})) - crawler.identifiableAdapter.retrieve_identified_record_for_identifiable = Mock( + crawler[0].identifiableAdapter.retrieve_identified_record_for_identifiable = Mock( side_effect=partial( basic_retrieve_by_name_mock_up, known={"A": db.Record(id=1111, name="A").add_parent("BR")})) @@ -800,7 +814,7 @@ def test_validation_error_print(caplog): def test_split_into_inserts_and_updates_backref(crawler_mocked_for_backref_test): - crawler = crawler_mocked_for_backref_test + crawler = crawler_mocked_for_backref_test[0] identlist = [Identifiable(name="A", record_type="BR"), Identifiable(name="B", record_type="C", backrefs=[db.Entity()])] referenced = db.Record(name="B").add_parent("C") @@ -835,7 +849,7 @@ def test_split_into_inserts_and_updates_backref(crawler_mocked_for_backref_test) def test_split_into_inserts_and_updates_mult_backref(crawler_mocked_for_backref_test): # test whether multiple references of the same record type are correctly used - crawler = crawler_mocked_for_backref_test + crawler = crawler_mocked_for_backref_test[0] referenced = db.Record(name="B").add_parent("C") entlist = [referenced, db.Record(name="A").add_parent("BR").add_property("ref", referenced), @@ -855,7 +869,7 @@ def test_split_into_inserts_and_updates_mult_backref(crawler_mocked_for_backref_ def test_split_into_inserts_and_updates_diff_backref(crawler_mocked_for_backref_test): # test whether multiple 
references of the different record types are correctly used - crawler = crawler_mocked_for_backref_test + crawler = crawler_mocked_for_backref_test[0] referenced = db.Record(name="B").add_parent("D") entlist = [referenced, db.Record(name="A").add_parent("BR").add_property("ref", referenced), @@ -911,8 +925,8 @@ def test_restricted_path(create_mock): } } - crawler = Crawler(debug=True) - converter_registry = crawler.load_converters(crawler_definition) + crawler = Crawler() + converter_registry = create_converter_registry(crawler_definition) # This structure is crawled test_dict = { @@ -927,7 +941,7 @@ def test_restricted_path(create_mock): } # first test without a restricted_path restricted_path = None - records = crawler.start_crawling( + records = scan_structure_elements( DictElement("TestDict", test_dict), crawler_definition, converter_registry, restricted_path ) @@ -937,7 +951,7 @@ def test_restricted_path(create_mock): # test with a restricted_path but one that has no effect (single root element) # this also tests that the remainder of the tree is fully traversed restricted_path = ["TestDict"] - records = crawler.start_crawling( + records = scan_structure_elements( DictElement("TestDict", test_dict), crawler_definition, converter_registry, restricted_path ) @@ -946,7 +960,7 @@ def test_restricted_path(create_mock): # test with a restricted_path that restricts the tree (single root element) restricted_path = ["TestDict", "v2"] - records = crawler.start_crawling( + records = scan_structure_elements( DictElement("TestDict", test_dict), crawler_definition, converter_registry, restricted_path ) @@ -956,7 +970,7 @@ def test_restricted_path(create_mock): # test with a restricted_path that contains a bad element restricted_path = ["TestDict", "v3"] with raises(RuntimeError): - records = crawler.start_crawling( + records = scan_structure_elements( DictElement("TestDict", test_dict), crawler_definition, converter_registry, restricted_path ) @@ -968,6 +982,9 @@ def test_split_restricted_path(): assert ["el", "el"] == split_restricted_path("/el/el") +# Filter the warning because we want to have it here and this way it does not hinder running +# tests with -Werror. +@pytest.mark.filterwarnings("ignore:The prefix:DeprecationWarning") def test_deprecated_prefix_option(): """Test that calling the crawler's main function with the deprecated `prefix` option raises the correct errors and warnings. 
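The test updates above repeat one pattern: instead of constructing `Crawler(debug=True)` and reading `crawler.debug_tree`, they create a `DebugTree`, pass it to `scan_directory`, and inspect it afterwards. A condensed sketch of that pattern outside of pytest follows; the paths are the ones used by the unittests and are only illustrative, and it assumes that `DebugTree.debug_tree` is a plain mapping from `str(element)` to store snapshots, as the assignments in `scanner()` suggest.

```python
from caoscrawler.debug_tree import DebugTree
from caoscrawler.scanner import scan_directory

dbt = DebugTree()
crawled_data = scan_directory("unittests/test_directories/examples_article",
                              "unittests/scifolder_cfood.yml",
                              debug_tree=dbt)

# scanner() stores, per matched structure element, a snapshot of the
# general store and the record store under str(element):
for element_str, (general_store, record_store) in dbt.debug_tree.items():
    print(element_str, len(general_store), len(record_store))
```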
diff --git a/unittests/test_tool_extended.py b/unittests/test_tool_extended.py index d0b431a539a15e3e83906540c69becff437742ec..7dd4282e4c6d206c8c360424d865b9f736b5e582 100644 --- a/unittests/test_tool_extended.py +++ b/unittests/test_tool_extended.py @@ -6,7 +6,9 @@ from caoscrawler import Crawler from caoscrawler.structure_elements import File, DictTextElement, DictListElement from caoscrawler.identifiable_adapters import IdentifiableAdapter, LocalStorageIdentifiableAdapter +from caoscrawler.scanner import scan_directory from functools import partial +from caoscrawler.debug_tree import DebugTree from copy import deepcopy from unittest.mock import MagicMock, Mock from os.path import join, dirname, basename @@ -69,10 +71,14 @@ def crawler(): # return ident -def test_file_structure_generation(crawler): - sd = crawler.debug_tree[dircheckstr("SimulationData", - "2020_climate-model-predict", "2020-02-01", - "README.md", structure_element_type="File")] +def test_file_structure_generation(): + dbt = DebugTree() + scan_directory(rfp("test_directories", "examples_article"), + rfp("scifolder_extended.yml"), + debug_tree=dbt) + sd = dbt.debug_tree[dircheckstr("SimulationData", + "2020_climate-model-predict", "2020-02-01", + "README.md", structure_element_type="File")] assert sd[1]["ReadmeFile"].role == "File" assert len(sd[1]["ReadmeFile"].path) > 0 assert len(sd[1]["ReadmeFile"].file) > 0 diff --git a/unittests/test_variable_substitutions.py b/unittests/test_variable_substitutions.py index f6c3b6375a3111faff9d746779805ba16af260b7..f13e759982e8102bbf37e65311ff4073ba52e5a2 100644 --- a/unittests/test_variable_substitutions.py +++ b/unittests/test_variable_substitutions.py @@ -2,7 +2,9 @@ # Tests for variable substitutions # A. Schlemmer, 05/2022 +from caoscrawler.debug_tree import DebugTree from caoscrawler import Crawler +from caoscrawler.scanner import scan_directory from caoscrawler.structure_elements import File, DictTextElement, DictListElement from caoscrawler.identifiable_adapters import IdentifiableAdapter, LocalStorageIdentifiableAdapter from functools import partial @@ -32,33 +34,21 @@ def dircheckstr(element_type, *pathcomponents): return "caoscrawler.structure_elements." 
+ element_type + ": " + basename(join(*pathcomponents)) + ", " + rfp("test_directories", "example_substitutions", *pathcomponents) -@pytest.fixture -def crawler(): - crawler = Crawler(debug=True) - crawler.crawl_directory(rfp("test_directories", "example_substitutions", "ExperimentalData"), - rfp("test_directories", "example_substitutions", "substitutions.yml")) - return crawler +def test_substitutions(): - -@pytest.fixture -def crawler_2(): - crawler = Crawler(debug=True) - crawler.crawl_directory(rfp("test_directories", "example_substitutions", "ExperimentalData"), - rfp("test_directories", "example_substitutions", - "substitutions_parents.yml")) - return crawler - - -def test_substitutions(crawler): + dbt = DebugTree() + scan_directory(rfp("test_directories", "example_substitutions", "ExperimentalData"), + rfp("test_directories", "example_substitutions", "substitutions.yml"), + debug_tree=dbt) # @review Florian Spreckelsen 2022-05-13 for i in range(2): - subd = crawler.debug_tree[dircheckstr( + subd = dbt.debug_tree[dircheckstr( "File", "ExperimentalData", "220512_data.dat")] assert subd[i]["Experiment"].get_property("date").value == "2022-05-12" assert isinstance(subd[i]["ExperimentSeries"].get_property( "Experiment").value, db.Record) - subd = crawler.debug_tree[dircheckstr("Directory", "ExperimentalData")] + subd = dbt.debug_tree[dircheckstr("Directory", "ExperimentalData")] assert subd[i]["Project"].name == "project" assert isinstance(subd[i]["Project"].get_property( "Experiments").value, list) @@ -70,11 +60,16 @@ def test_substitutions(crawler): "dates").value[0] == "2022-05-12" -def test_substitutions_parents(crawler_2): +def test_substitutions_parents(): + dbt = DebugTree() + scan_directory(rfp("test_directories", "example_substitutions", "ExperimentalData"), + rfp("test_directories", "example_substitutions", + "substitutions_parents.yml"), + debug_tree=dbt) # This is a test for: # https://gitlab.indiscale.com/caosdb/src/caosdb-crawler/-/issues/35 # ... testing whether variable substitutions can be used in parent declarations. - subd = crawler_2.debug_tree[dircheckstr( + subd = dbt.debug_tree[dircheckstr( "File", "ExperimentalData", "220512_data.dat")] # subd[0] <- generalStore # subd[1] <- recordStore @@ -85,11 +80,16 @@ def test_substitutions_parents(crawler_2): assert parents[1].name == "Month_05" -def test_empty_parents(crawler_2): +def test_empty_parents(): + dbt = DebugTree() + scan_directory(rfp("test_directories", "example_substitutions", "ExperimentalData"), + rfp("test_directories", "example_substitutions", + "substitutions_parents.yml"), + debug_tree=dbt) # This is a test for: # https://gitlab.com/caosdb/caosdb-crawler/-/issues/8 - subd = crawler_2.debug_tree[dircheckstr( + subd = dbt.debug_tree[dircheckstr( "File", "ExperimentalData", "220512_data.dat")] parents = subd[1]["RecordWithoutParents"].get_parents()
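To tie the pieces of this changeset together: the how-to-upgrade.md section above describes the 0.4.0 to 0.5.0 migration in words, and the following is a minimal sketch of what it looks like in a typical crawl-and-synchronize script. The file names `cfood.yml` and `data/` are hypothetical placeholders, and `synchronize` still talks to a CaosDB server, so treat this as an illustration rather than part of the shipped code.

```python
from caoscrawler import Crawler
from caoscrawler.debug_tree import DebugTree
from caoscrawler.scanner import scan_directory

# 0.4.0 style (now deprecated):
#     crawler = Crawler(debug=True)
#     crawler.crawl_directory("data/", "cfood.yml")
#     inserts, updates = crawler.synchronize()

# 0.5.0 style: scanning and synchronizing are separate steps.
debug_tree = DebugTree()  # optional, only needed if you want debug information
crawled_data = scan_directory("data/", "cfood.yml", debug_tree=debug_tree)

crawler = Crawler()  # no debug or generalStore arguments anymore
inserts, updates = crawler.synchronize(crawled_data=crawled_data, commit_changes=False)
```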