diff --git a/CHANGELOG.md b/CHANGELOG.md index 7df6439d5ee38b236a0731cf5ca09b82c7fcf002..ae54ee305c850655b6132b45fc6980aa9ad14d2a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -18,6 +18,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 the correct loading behavior of that file. ### Deprecated ### +- The command line argument '--prefix' and the 'prefix' argument of + `crawler_main` are deprecated. Use the new argument "--remove-prefix". ### Removed ### diff --git a/integrationtests/test_data/extroot/realworld_example/dataset_cfoods.yml b/integrationtests/test_data/extroot/realworld_example/dataset_cfoods.yml index 7a64d708667182b80b739812e5fdf3369fc5b462..e149c0ccc0008a20d96ee782a853d345d541f3ab 100644 --- a/integrationtests/test_data/extroot/realworld_example/dataset_cfoods.yml +++ b/integrationtests/test_data/extroot/realworld_example/dataset_cfoods.yml @@ -31,6 +31,11 @@ Data: type: JSONFile match: .dataspace.json validate: schema/dataspace.schema.json + records: + JSONFile: + role: File + path: $dataspace_json + file: $dataspace_json subtree: jsondict: type: DictElement diff --git a/integrationtests/test_data/extroot/realworld_example/schema/dataspace.schema.json b/integrationtests/test_data/extroot/realworld_example/schema/dataspace.schema.json index 01653bfa821e0a0acbb5a481bfd458e2ed784fb9..c6376ee44d248ce5ea9e24e34bd245255e888e39 100644 --- a/integrationtests/test_data/extroot/realworld_example/schema/dataspace.schema.json +++ b/integrationtests/test_data/extroot/realworld_example/schema/dataspace.schema.json @@ -9,6 +9,7 @@ "minimum": 20000 }, "archived": { "type": "boolean" }, + "JSONFile": { "type": "string" }, "url": { "type": "string", "description": "link to folder on file system (CaosDB or cloud folder)" diff --git a/integrationtests/test_realworld_example.py b/integrationtests/test_realworld_example.py index 4158ed22278ef5c871a22d45885e58fbfa84ea3b..5542596e2751a5de8a34a9e19628266e8c753ad3 100644 --- 
a/integrationtests/test_realworld_example.py +++ b/integrationtests/test_realworld_example.py @@ -35,6 +35,7 @@ from caoscrawler.identifiable_adapters import CaosDBIdentifiableAdapter from caoscrawler.structure_elements import Directory import pytest from caosadvancedtools.models.parser import parse_model_from_json_schema, parse_model_from_yaml +from caosadvancedtools.loadFiles import loadpath import sys @@ -52,6 +53,17 @@ def rfp(*pathcomponents): DATADIR = rfp("test_data", "extroot", "realworld_example") +@pytest.fixture +def addfiles(): + loadpath(path='/opt/caosdb/mnt/extroot/', + include=None, + exclude=None, + prefix="", + dryrun=False, + forceAllowSymlinks=True, + ) + + @pytest.fixture def usemodel(): # First load dataspace data model @@ -85,22 +97,18 @@ def create_identifiable_adapter(): return ident -def test_dataset(clear_database, usemodel): - ident = create_identifiable_adapter() - crawler = Crawler(identifiableAdapter=ident) - crawler_definition = crawler.load_definition( - os.path.join(DATADIR, "dataset_cfoods.yml")) - # print(json.dumps(crawler_definition, indent=3)) - # Load and register converter packages: - converter_registry = crawler.load_converters(crawler_definition) - # print("DictIntegerElement" in converter_registry) - - records = crawler.start_crawling( - Directory("data", os.path.join(DATADIR, 'data')), - crawler_definition, - converter_registry +def test_dataset(clear_database, usemodel, addfiles): + identifiable_path = os.path.join(DATADIR, "identifiables.yml") + crawler_definition_path = os.path.join(DATADIR, "dataset_cfoods.yml") + crawler_main( + os.path.join(DATADIR, 'data'), + crawler_definition_path, + identifiable_path, + True, + os.path.join(DATADIR, "provenance.yml"), + False, + add_prefix="/data" ) - crawler.synchronize() dataspace = db.execute_query("FIND RECORD Dataspace WITH name=35 AND dataspace_id=20002 AND " "archived=FALSE AND url='https://datacloud.de/index.php/f/7679'" @@ -120,12 +128,11 @@ def 
test_dataset(clear_database, usemodel): assert db.execute_query(f"FIND Event WITH latitude=53", unique=True) -def test_event_update(clear_database, usemodel): +def test_event_update(clear_database, usemodel, addfiles): identifiable_path = os.path.join(DATADIR, "identifiables.yml") crawler_definition_path = os.path.join(DATADIR, "dataset_cfoods.yml") - # TODO(fspreck): Use crawler_main crawler_main( os.path.join(DATADIR, 'data'), crawler_definition_path, @@ -133,7 +140,7 @@ def test_event_update(clear_database, usemodel): True, os.path.join(DATADIR, "provenance.yml"), False, - "" + add_prefix="/data" ) old_dataset_rec = db.execute_query( diff --git a/integrationtests/test_use_case_simple_presentation.py b/integrationtests/test_use_case_simple_presentation.py index 91c523be90a4d0117a7cc54217cae0b911511957..ff2f2a43c5f4f923202218f5cd4eb89b82db04a7 100644 --- a/integrationtests/test_use_case_simple_presentation.py +++ b/integrationtests/test_use_case_simple_presentation.py @@ -63,7 +63,7 @@ def test_complete_crawler( True, os.path.join(DATADIR, "provenance.yml"), False, - "/use_case_simple_presentation") + remove_prefix="/use_case_simple_presentation") res = db.execute_query("FIND Record Experiment") assert len(res) == 1 diff --git a/src/caoscrawler/crawl.py b/src/caoscrawler/crawl.py index fe6c63ff7af86a76fd6649323c96b8977b9fcc2f..60face1706dabbe40b78026d637154f5c97e13c4 100644 --- a/src/caoscrawler/crawl.py +++ b/src/caoscrawler/crawl.py @@ -1229,7 +1229,9 @@ def crawler_main(crawled_directory_path: str, prefix: str = "", securityMode: SecurityMode = SecurityMode.UPDATE, unique_names=True, - restricted_path: Optional[list[str]] = None + restricted_path: Optional[list[str]] = None, + remove_prefix: Optional[str] = None, + add_prefix: Optional[str] = None, ): """ @@ -1248,7 +1250,7 @@ def crawler_main(crawled_directory_path: str, dry_run : bool do not commit any chnages to the server prefix : str - remove the given prefix from file paths + DEPRECATED, remove the given 
prefix from file paths securityMode : int securityMode of Crawler unique_names : bool @@ -1256,6 +1258,10 @@ def crawler_main(crawled_directory_path: str, restricted_path: optional, list of strings Traverse the data tree only along the given path. When the end of the given path is reached, traverse the full tree as normal. + remove_prefix : Optional[str] + remove the given prefix from file paths + add_prefix : Optional[str] + add the given prefix to file paths Returns ------- @@ -1272,11 +1278,17 @@ def crawler_main(crawled_directory_path: str, crawler.save_debug_data(provenance_file) if identifiables_definition_file is not None: - ident = CaosDBIdentifiableAdapter() ident.load_from_yaml_definition(identifiables_definition_file) crawler.identifiableAdapter = ident + if prefix != "": + warnings.warn(DeprecationWarning("The prefix argument is derpicated. Please use " + "remove_prefix.")) + if remove_prefix is not None: + raise ValueError("Please do not supply prefix argument. Only remove_prefix") + remove_prefix = prefix + if dry_run: ins, upd = crawler.synchronize(commit_changes=False) inserts = [str(i) for i in ins] @@ -1291,11 +1303,14 @@ def crawler_main(crawled_directory_path: str, if isinstance(elem, db.File): # correct the file path: # elem.file = os.path.join(args.path, elem.file) - if prefix is None: - raise RuntimeError( - "No prefix set. Prefix must be set if files are used.") - if elem.path.startswith(prefix): - elem.path = elem.path[len(prefix):] + if remove_prefix: + if elem.path.startswith(remove_prefix): + elem.path = elem.path[len(remove_prefix):] + else: + raise RuntimeError("Prefix shall be removed from file path but the path " + "does not start with the prefix") + if add_prefix: + elem.path = add_prefix + elem.path elem.file = None # TODO: as long as the new file backend is not finished # we are using the loadFiles function to insert symlinks. 
@@ -1363,8 +1378,12 @@ def parse_args(): parser.add_argument("-u", "--unique-names", help="Insert or updates entities even if name conflicts exist.") parser.add_argument("-p", "--prefix", - help="Remove the given prefix from the paths " + help="DEPRICATED. Remove the given prefix from the paths " "of all file objects.") + parser.add_argument("--remove-prefix", + help="Remove the given prefix from the paths of all file objects.") + parser.add_argument("--add-prefix", + help="Add the given prefix to the paths of all file objects.") return parser.parse_args() @@ -1384,6 +1403,10 @@ def main(): conlogger = logging.getLogger("connection") conlogger.setLevel(level=logging.ERROR) + if args.prefix: + print("Please use '--remove-prefix' option instead of '--prefix' or '-p'.") + return -1 + # logging config for local execution logger.addHandler(logging.StreamHandler(sys.stdout)) if args.debug: @@ -1406,12 +1429,13 @@ def main(): debug=args.debug, provenance_file=args.provenance, dry_run=args.dry_run, - prefix=args.prefix, securityMode={"retrieve": SecurityMode.RETRIEVE, "insert": SecurityMode.INSERT, "update": SecurityMode.UPDATE}[args.security_mode], unique_names=args.unique_names, - restricted_path=restricted_path + restricted_path=restricted_path, + remove_prefix=args.remove_prefix, + add_prefix=args.add_prefix, )) diff --git a/unittests/test_tool.py b/unittests/test_tool.py index 23b35f2dc9228eeda9137945198c49c19bf5c474..735c33125fcf024d1cb4bd9b83ad222ce66ba2d0 100755 --- a/unittests/test_tool.py +++ b/unittests/test_tool.py @@ -792,8 +792,7 @@ def test_validation_error_print(caplog): os.path.join(DATADIR, "identifiables.yml"), True, None, - False, - "/use_case_simple_presentation") + False) assert "Couldn't validate" in caplog.text caplog.clear()