diff --git a/CHANGELOG.md b/CHANGELOG.md
index 0c68188c6cc140fa49c6cb4b8f1f58189b45f8c9..95189b50054033f54054a21388a67ca47c8356ea 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -8,11 +8,15 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased] ##
 
 ### Added ###
+
 - DateElementConverter: allows interpreting text as a date object
 - the restricted_path argument allows crawling only a subtree
 - logging that provides a summary of what is inserted and updated
 - You can now access the file system path of a structure element (if it has
   one) using the variable name ``<converter name>.path``
+- ``add_prefix`` and ``remove_prefix`` arguments for the command line interface
+  and the ``crawler_main`` function for the adding/removal of path prefixes when
+  creating file entities.
 
 ### Changed ###
 
@@ -25,9 +29,14 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ### Deprecated ###
 
+- The ``prefix`` argument of `crawler_main` is deprecated. Use the new argument
+  ``remove_prefix`` instead.
+
 ### Removed ###
 
+- The command line argument ``--prefix``. Use the new argument ``--remove-prefix`` instead.
 ### Fixed ###
+
 - an empty string as name is treated as no name (as does the server). This
   fixes queries for identifiables, since they would otherwise contain
   "WITH name=''", which is an impossible condition. If your cfoods contained this case, they were ill-defined.
diff --git a/integrationtests/test_data/extroot/realworld_example/dataset_cfoods.yml b/integrationtests/test_data/extroot/realworld_example/dataset_cfoods.yml
index 7a64d708667182b80b739812e5fdf3369fc5b462..37a34d125dcff1d121b1bded2fe959c4d30ff403 100644
--- a/integrationtests/test_data/extroot/realworld_example/dataset_cfoods.yml
+++ b/integrationtests/test_data/extroot/realworld_example/dataset_cfoods.yml
@@ -153,6 +153,13 @@ Data:
           metadata_json: &metadata_json_template
             type: JSONFile
             match: metadata.json
+            records:
+              JSONFile:
+                parents:
+                  - JSONFile
+                role: File
+                path: ${metadata_json.path}
+                file: ${metadata_json.path}
             validate: schema/dataset.schema.json
             subtree:
               jsondict:
diff --git a/integrationtests/test_data/extroot/realworld_example/schema/dataspace.schema.json b/integrationtests/test_data/extroot/realworld_example/schema/dataspace.schema.json
index 01653bfa821e0a0acbb5a481bfd458e2ed784fb9..36233230ae05f9df58ae4e492ff1f709322f6e51 100644
--- a/integrationtests/test_data/extroot/realworld_example/schema/dataspace.schema.json
+++ b/integrationtests/test_data/extroot/realworld_example/schema/dataspace.schema.json
@@ -9,6 +9,7 @@
       "minimum": 20000
     },
     "archived": { "type": "boolean" },
+    "JSONFile": { "type": "object" },
     "url": {
       "type": "string",
       "description": "link to folder on file system (CaosDB or cloud folder)"
diff --git a/integrationtests/test_realworld_example.py b/integrationtests/test_realworld_example.py
index 3e4bec3f5465d176b03792253b2e776a48acf1e7..cb5ed2c769945af033bc56a2d6af3bf1cec86de4 100644
--- a/integrationtests/test_realworld_example.py
+++ b/integrationtests/test_realworld_example.py
@@ -36,6 +36,7 @@ from caoscrawler.identifiable_adapters import CaosDBIdentifiableAdapter
 from caoscrawler.structure_elements import Directory
 import pytest
 from caosadvancedtools.models.parser import parse_model_from_json_schema, parse_model_from_yaml
+from caosadvancedtools.loadFiles import loadpath
 
 import sys
 
@@ -53,6 +54,17 @@ def rfp(*pathcomponents):
 DATADIR = rfp("test_data", "extroot", "realworld_example")
 
 
+@pytest.fixture
+def addfiles():
+    loadpath(path='/opt/caosdb/mnt/extroot/',
+             include=None,
+             exclude=None,
+             prefix="",
+             dryrun=False,
+             forceAllowSymlinks=True,
+             )
+
+
 @pytest.fixture
 def usemodel():
     # First load dataspace data model
@@ -86,23 +98,21 @@ def create_identifiable_adapter():
     return ident
 
 
-def test_dataset(clear_database, usemodel, caplog):
+def test_dataset(clear_database, usemodel, addfiles, caplog):
     caplog.set_level(logging.DEBUG, logger="caoscrawler")
-    ident = create_identifiable_adapter()
-    crawler = Crawler(identifiableAdapter=ident)
-    crawler_definition = crawler.load_definition(
-        os.path.join(DATADIR, "dataset_cfoods.yml"))
-    # print(json.dumps(crawler_definition, indent=3))
-    # Load and register converter packages:
-    converter_registry = crawler.load_converters(crawler_definition)
-    # print("DictIntegerElement" in converter_registry)
-
-    records = crawler.start_crawling(
-        Directory("data", os.path.join(DATADIR, 'data')),
-        crawler_definition,
-        converter_registry
+    identifiable_path = os.path.join(DATADIR, "identifiables.yml")
+    crawler_definition_path = os.path.join(DATADIR, "dataset_cfoods.yml")
+    crawler_main(
+        os.path.join(DATADIR, 'data'),
+        crawler_definition_path,
+        identifiable_path,
+        True,
+        os.path.join(DATADIR, "provenance.yml"),
+        False,
+        remove_prefix=DATADIR,
+        # Without add_prefix this test would fail since the crawler would try to create new files.
+        add_prefix="/extroot/realworld_example"
     )
-    crawler.synchronize()
 
     dataspace = db.execute_query("FIND RECORD Dataspace WITH name=35 AND dataspace_id=20002 AND "
                                  "archived=FALSE AND url='https://datacloud.de/index.php/f/7679'"
@@ -127,12 +137,11 @@
     assert "Executed updates" in caplog.text
 
 
-def test_event_update(clear_database, usemodel):
+def test_event_update(clear_database, usemodel, addfiles):
 
     identifiable_path = os.path.join(DATADIR, "identifiables.yml")
     crawler_definition_path = os.path.join(DATADIR, "dataset_cfoods.yml")
 
-    # TODO(fspreck): Use crawler_main
     crawler_main(
         os.path.join(DATADIR, 'data'),
         crawler_definition_path,
@@ -140,7 +149,9 @@
         True,
         os.path.join(DATADIR, "provenance.yml"),
         False,
-        ""
+        remove_prefix=DATADIR,
+        # Without add_prefix this test would fail since the crawler would try to create new files.
+        add_prefix="/extroot/realworld_example"
     )
 
     old_dataset_rec = db.execute_query(
diff --git a/integrationtests/test_use_case_simple_presentation.py b/integrationtests/test_use_case_simple_presentation.py
index 463253b6a85cdbc95088a0fa3f64c831459e5b9e..5fc0f6c7d85a0fce4490c72952e711fe241a0099 100644
--- a/integrationtests/test_use_case_simple_presentation.py
+++ b/integrationtests/test_use_case_simple_presentation.py
@@ -38,9 +38,7 @@ DATADIR = os.path.join(os.path.dirname(__file__), "test_data",
                        "extroot", "use_case_simple_presentation")
 
 
-def test_complete_crawler(
-        clear_database
-):
+def test_complete_crawler(clear_database):
     # Setup the data model:
     model = parser.parse_model_from_yaml(os.path.join(DATADIR, "model.yml"))
     model.sync_data_model(noquestion=True, verbose=False)
@@ -57,13 +55,24 @@
              dryrun=False,
              forceAllowSymlinks=False)
 
+    # Test that a bad value for "remove_prefix" leads to a RuntimeError.
+    with pytest.raises(RuntimeError) as re:
+        crawler_main(DATADIR,
+                     os.path.join(DATADIR, "cfood.yml"),
+                     os.path.join(DATADIR, "identifiables.yml"),
+                     True,
+                     os.path.join(DATADIR, "provenance.yml"),
+                     False,
+                     remove_prefix="sldkfjsldf")
+    assert "path does not start with the prefix" in str(re.value)
+
     crawler_main(DATADIR,
                  os.path.join(DATADIR, "cfood.yml"),
                  os.path.join(DATADIR, "identifiables.yml"),
                  True,
                  os.path.join(DATADIR, "provenance.yml"),
                  False,
-                 os.path.abspath(DATADIR))
+                 remove_prefix=os.path.abspath(DATADIR))
 
     res = db.execute_query("FIND Record Experiment")
     assert len(res) == 1
diff --git a/src/caoscrawler/crawl.py b/src/caoscrawler/crawl.py
index 18d2fcb368935abd6cd51acdc9f05fbc27fd46e0..03a1c314a528af4b3802fda1269a36656d995624 100644
--- a/src/caoscrawler/crawl.py
+++ b/src/caoscrawler/crawl.py
@@ -1254,7 +1254,9 @@ def crawler_main(crawled_directory_path: str,
                  prefix: str = "",
                  securityMode: SecurityMode = SecurityMode.UPDATE,
                  unique_names=True,
-                 restricted_path: Optional[list[str]] = None
+                 restricted_path: Optional[list[str]] = None,
+                 remove_prefix: Optional[str] = None,
+                 add_prefix: Optional[str] = None,
                  ):
     """
 
@@ -1273,7 +1275,7 @@
     dry_run : bool
        do not commit any changes to the server
     prefix : str
-       remove the given prefix from file paths
+       DEPRECATED, remove the given prefix from file paths
     securityMode : int
        securityMode of Crawler
     unique_names : bool
@@ -1281,6 +1283,10 @@
     restricted_path: optional, list of strings
             Traverse the data tree only along the given path. When the end of the given path
             is reached, traverse the full tree as normal.
+    remove_prefix : Optional[str]
+            remove the given prefix from file paths
+    add_prefix : Optional[str]
+            add the given prefix to file paths
 
     Returns
     -------
@@ -1297,11 +1303,19 @@
     crawler.save_debug_data(provenance_file)
 
     if identifiables_definition_file is not None:
-
         ident = CaosDBIdentifiableAdapter()
         ident.load_from_yaml_definition(identifiables_definition_file)
         crawler.identifiableAdapter = ident
 
+    if prefix != "":
+        warnings.warn(DeprecationWarning("The prefix argument is deprecated and will be removed "
+                                         "in the future. Please use `remove_prefix` instead."))
+        if remove_prefix is not None:
+            raise ValueError("Please do not supply the (deprecated) `prefix` and the "
+                             "`remove_prefix` argument at the same time. Only use "
+                             "`remove_prefix` instead.")
+        remove_prefix = prefix
+
     if dry_run:
         ins, upd = crawler.synchronize(commit_changes=False)
         inserts = [str(i) for i in ins]
@@ -1316,11 +1330,15 @@
         if isinstance(elem, db.File):
             # correct the file path:
             # elem.file = os.path.join(args.path, elem.file)
-            if prefix is None:
-                raise RuntimeError(
-                    "No prefix set. Prefix must be set if files are used.")
-            if elem.path.startswith(prefix):
-                elem.path = elem.path[len(prefix):]
+            if remove_prefix:
+                if elem.path.startswith(remove_prefix):
+                    elem.path = elem.path[len(remove_prefix):]
+                else:
+                    raise RuntimeError("Prefix shall be removed from file path but the path "
+                                       "does not start with the prefix:"
+                                       f"\n{remove_prefix}\n{elem.path}")
+            if add_prefix:
+                elem.path = add_prefix + elem.path
             elem.file = None
             # TODO: as long as the new file backend is not finished
             # we are using the loadFiles function to insert symlinks.
@@ -1388,8 +1406,12 @@
     parser.add_argument("-u", "--unique-names",
                         help="Insert or update entities even if name conflicts exist.")
     parser.add_argument("-p", "--prefix",
-                        help="Remove the given prefix from the paths "
-                        "of all file objects.")
+                        help="DEPRECATED, use --remove-prefix instead. Remove the given prefix "
+                        "from the paths of all file objects.")
+    parser.add_argument("--remove-prefix",
+                        help="Remove the given prefix from the paths of all file objects.")
+    parser.add_argument("--add-prefix",
+                        help="Add the given prefix to the paths of all file objects.")
 
     return parser.parse_args()
 
@@ -1409,6 +1431,10 @@
     conlogger = logging.getLogger("connection")
     conlogger.setLevel(level=logging.ERROR)
 
+    if args.prefix:
+        print("Please use '--remove-prefix' option instead of '--prefix' or '-p'.")
+        return -1
+
     # logging config for local execution
     logger.addHandler(logging.StreamHandler(sys.stdout))
     if args.debug:
@@ -1431,12 +1457,13 @@
         debug=args.debug,
         provenance_file=args.provenance,
         dry_run=args.dry_run,
-        prefix=args.prefix,
         securityMode={"retrieve": SecurityMode.RETRIEVE,
                       "insert": SecurityMode.INSERT,
                       "update": SecurityMode.UPDATE}[args.security_mode],
         unique_names=args.unique_names,
-        restricted_path=restricted_path
+        restricted_path=restricted_path,
+        remove_prefix=args.remove_prefix,
+        add_prefix=args.add_prefix,
     ))
 
 
diff --git a/unittests/test_tool.py b/unittests/test_tool.py
index ca9d074e4a65b83d02347c2f7773e8a9b963886c..e15d7cb777ced4b92566df2b25b375e90be39295 100755
--- a/unittests/test_tool.py
+++ b/unittests/test_tool.py
@@ -794,8 +794,7 @@
         os.path.join(DATADIR, "identifiables.yml"),
         True,
         None,
-        False,
-        "/use_case_simple_presentation")
+        False)
     assert "Couldn't validate" in caplog.text
     caplog.clear()
 
@@ -969,6 +968,22 @@
     assert ["el", "el"] == split_restricted_path("/el/el")
 
 
+def test_deprecated_prefix_option():
+    """Test that calling the crawler's main function with the deprecated
+    `prefix` option raises the correct errors and warnings.
+
+    """
+
+    with pytest.deprecated_call():
+        crawler_main("./", rfp("scifolder_cfood.yml"), prefix="to/be/removed")
+
+    with raises(ValueError) as ve:
+        crawler_main("./", rfp("scifolder_cfood.yml"), prefix="to/be/removed",
+                     remove_prefix="to/be/removed")
+
+    assert "(deprecated) `prefix` and the `remove_prefix`" in str(ve.value)
+
+
 def test_create_entity_summary():
     assert "" == Crawler.create_entity_summary([]).strip()
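
For illustration, a minimal usage sketch of the new arguments introduced by this patch. The positional argument order mirrors the calls in the integration tests above; all paths and file names are hypothetical examples, and the per-argument comments are inferred from the docstring and tests rather than quoted from the signature.

    from caoscrawler.crawl import crawler_main

    crawler_main(
        "/local/mount/extroot/data",   # crawled_directory_path (hypothetical path)
        "cfood.yml",                   # crawler definition file (hypothetical)
        "identifiables.yml",           # identifiables definition file (hypothetical)
        True,                          # debug flag (position inferred from the tests)
        "provenance.yml",              # provenance file (hypothetical)
        False,                         # dry_run (position inferred from the tests)
        # Strip the local mount point from the paths of created file entities ...
        remove_prefix="/local/mount/extroot",
        # ... then prepend the path under which the server sees those files.
        add_prefix="/extroot",
    )

As implemented above, `remove_prefix` raises a RuntimeError when a file path does not start with the given prefix, and `add_prefix` is applied after the removal, so the two combine to translate between a local mount point and the server-side path. The deprecated `prefix` argument is mapped onto `remove_prefix` and may not be combined with it; on the command line, `-p`/`--prefix` now makes `main()` return with a hint to use `--remove-prefix` instead.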