Skip to content
Snippets Groups Projects
Commit 580f3122 authored by Florian Spreckelsen's avatar Florian Spreckelsen
Browse files

Merge branch 'f-prefix' into 'dev'

ENH: add the 'add_prefix' argument

See merge request !101
parents 169cf927 baa032a8
No related branches found
No related tags found
2 merge requests!105REL: v0.4.0,!101ENH: add the 'add_prefix' argument
Pipeline #34107 passed
......@@ -8,11 +8,15 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
## [Unreleased] ##
### Added ###
- DateElementConverter: allows text to be interpreted as a date object
- the restricted_path argument allows crawling only a subtree
- logging that provides a summary of what is inserted and updated
- You can now access the file system path of a structure element (if it has one) using the variable
name ``<converter name>.path``
- ``add_prefix`` and ``remove_prefix`` arguments for the command line interface
and the ``crawler_main`` function for the adding/removal of path prefixes when
creating file entities.
### Changed ###
......@@ -25,9 +29,14 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
### Deprecated ###
- The ``prefix`` argument of `crawler_main` is deprecated. Use the new argument
``remove_prefix`` instead.
### Removed ###
- The command line argument ``--prefix``. Use the new argument ``--remove-prefix`` instead.
### Fixed ###
- an empty string as name is treated as no name (as the server does). This fixes
queries for identifiables, since they would otherwise contain "WITH name=''",
which is an impossible condition. If your cfoods contained this case, they were ill-defined.
......
......@@ -153,6 +153,13 @@ Data:
metadata_json: &metadata_json_template
type: JSONFile
match: metadata.json
records:
JSONFile:
parents:
- JSONFile
role: File
path: ${metadata_json.path}
file: ${metadata_json.path}
validate: schema/dataset.schema.json
subtree:
jsondict:
......
......@@ -9,6 +9,7 @@
"minimum": 20000
},
"archived": { "type": "boolean" },
"JSONFile": { "type": "object" },
"url": {
"type": "string",
"description": "link to folder on file system (CaosDB or cloud folder)"
......
......@@ -36,6 +36,7 @@ from caoscrawler.identifiable_adapters import CaosDBIdentifiableAdapter
from caoscrawler.structure_elements import Directory
import pytest
from caosadvancedtools.models.parser import parse_model_from_json_schema, parse_model_from_yaml
from caosadvancedtools.loadFiles import loadpath
import sys
......@@ -53,6 +54,17 @@ def rfp(*pathcomponents):
DATADIR = rfp("test_data", "extroot", "realworld_example")
@pytest.fixture
def addfiles():
    """Register the extroot files with the server before a test runs.

    Calls ``loadpath`` (from ``caosadvancedtools.loadFiles``) on the mounted
    extroot directory so that file entities created by the crawler can refer
    to files that already exist server-side.

    NOTE(review): assumes '/opt/caosdb/mnt/extroot/' is mounted in the test
    environment — confirm against the integration-test setup.
    """
    loadpath(path='/opt/caosdb/mnt/extroot/',
             include=None,
             exclude=None,
             prefix="",   # keep the full path; no prefix is stripped here
             dryrun=False,
             forceAllowSymlinks=True,
             )
@pytest.fixture
def usemodel():
# First load dataspace data model
......@@ -86,23 +98,21 @@ def create_identifiable_adapter():
return ident
def test_dataset(clear_database, usemodel, caplog):
def test_dataset(clear_database, usemodel, addfiles, caplog):
caplog.set_level(logging.DEBUG, logger="caoscrawler")
ident = create_identifiable_adapter()
crawler = Crawler(identifiableAdapter=ident)
crawler_definition = crawler.load_definition(
os.path.join(DATADIR, "dataset_cfoods.yml"))
# print(json.dumps(crawler_definition, indent=3))
# Load and register converter packages:
converter_registry = crawler.load_converters(crawler_definition)
# print("DictIntegerElement" in converter_registry)
records = crawler.start_crawling(
Directory("data", os.path.join(DATADIR, 'data')),
crawler_definition,
converter_registry
identifiable_path = os.path.join(DATADIR, "identifiables.yml")
crawler_definition_path = os.path.join(DATADIR, "dataset_cfoods.yml")
crawler_main(
os.path.join(DATADIR, 'data'),
crawler_definition_path,
identifiable_path,
True,
os.path.join(DATADIR, "provenance.yml"),
False,
remove_prefix=DATADIR,
# this test will fail without this prefix since the crawler would try to create new files
add_prefix="/extroot/realworld_example"
)
crawler.synchronize()
dataspace = db.execute_query("FIND RECORD Dataspace WITH name=35 AND dataspace_id=20002 AND "
"archived=FALSE AND url='https://datacloud.de/index.php/f/7679'"
......@@ -127,12 +137,11 @@ def test_dataset(clear_database, usemodel, caplog):
assert "Executed updates" in caplog.text
def test_event_update(clear_database, usemodel):
def test_event_update(clear_database, usemodel, addfiles):
identifiable_path = os.path.join(DATADIR, "identifiables.yml")
crawler_definition_path = os.path.join(DATADIR, "dataset_cfoods.yml")
# TODO(fspreck): Use crawler_main
crawler_main(
os.path.join(DATADIR, 'data'),
crawler_definition_path,
......@@ -140,7 +149,9 @@ def test_event_update(clear_database, usemodel):
True,
os.path.join(DATADIR, "provenance.yml"),
False,
""
remove_prefix=DATADIR,
# this test will fail without this prefix since the crawler would try to create new files
add_prefix="/extroot/realworld_example"
)
old_dataset_rec = db.execute_query(
......
......@@ -38,9 +38,7 @@ DATADIR = os.path.join(os.path.dirname(__file__), "test_data",
"extroot", "use_case_simple_presentation")
def test_complete_crawler(
clear_database
):
def test_complete_crawler(clear_database):
# Setup the data model:
model = parser.parse_model_from_yaml(os.path.join(DATADIR, "model.yml"))
model.sync_data_model(noquestion=True, verbose=False)
......@@ -57,13 +55,24 @@ def test_complete_crawler(
dryrun=False,
forceAllowSymlinks=False)
# test that a bad value for "remove_prefix" leads to runtime error
with pytest.raises(RuntimeError) as re:
crawler_main(DATADIR,
os.path.join(DATADIR, "cfood.yml"),
os.path.join(DATADIR, "identifiables.yml"),
True,
os.path.join(DATADIR, "provenance.yml"),
False,
os.path.abspath(DATADIR))
remove_prefix="sldkfjsldf")
assert "path does not start with the prefix" in str(re.value)
crawler_main(DATADIR,
os.path.join(DATADIR, "cfood.yml"),
os.path.join(DATADIR, "identifiables.yml"),
True,
os.path.join(DATADIR, "provenance.yml"),
False,
remove_prefix=os.path.abspath(DATADIR))
res = db.execute_query("FIND Record Experiment")
assert len(res) == 1
......
......@@ -1254,7 +1254,9 @@ def crawler_main(crawled_directory_path: str,
prefix: str = "",
securityMode: SecurityMode = SecurityMode.UPDATE,
unique_names=True,
restricted_path: Optional[list[str]] = None
restricted_path: Optional[list[str]] = None,
remove_prefix: Optional[str] = None,
add_prefix: Optional[str] = None,
):
"""
......@@ -1273,7 +1275,7 @@ def crawler_main(crawled_directory_path: str,
dry_run : bool
do not commit any changes to the server
prefix : str
remove the given prefix from file paths
DEPRECATED, remove the given prefix from file paths
securityMode : int
securityMode of Crawler
unique_names : bool
......@@ -1281,6 +1283,10 @@ def crawler_main(crawled_directory_path: str,
restricted_path: optional, list of strings
Traverse the data tree only along the given path. When the end of the given path
is reached, traverse the full tree as normal.
remove_prefix : Optional[str]
remove the given prefix from file paths
add_prefix : Optional[str]
add the given prefix to file paths
Returns
-------
......@@ -1297,11 +1303,19 @@ def crawler_main(crawled_directory_path: str,
crawler.save_debug_data(provenance_file)
if identifiables_definition_file is not None:
ident = CaosDBIdentifiableAdapter()
ident.load_from_yaml_definition(identifiables_definition_file)
crawler.identifiableAdapter = ident
if prefix != "":
warnings.warn(DeprecationWarning("The prefix argument is deprecated and will be removed "
"in the future. Please use `remove_prefix` instead."))
if remove_prefix is not None:
raise ValueError("Please do not supply the (deprecated) `prefix` and the "
"`remove_prefix` argument at the same time. Only use "
"`remove_prefix` instead.")
remove_prefix = prefix
if dry_run:
ins, upd = crawler.synchronize(commit_changes=False)
inserts = [str(i) for i in ins]
......@@ -1316,11 +1330,15 @@ def crawler_main(crawled_directory_path: str,
if isinstance(elem, db.File):
# correct the file path:
# elem.file = os.path.join(args.path, elem.file)
if prefix is None:
raise RuntimeError(
"No prefix set. Prefix must be set if files are used.")
if elem.path.startswith(prefix):
elem.path = elem.path[len(prefix):]
if remove_prefix:
if elem.path.startswith(remove_prefix):
elem.path = elem.path[len(remove_prefix):]
else:
raise RuntimeError("Prefix shall be removed from file path but the path "
"does not start with the prefix:"
f"\n{remove_prefix}\n{elem.path}")
if add_prefix:
elem.path = add_prefix + elem.path
elem.file = None
# TODO: as long as the new file backend is not finished
# we are using the loadFiles function to insert symlinks.
......@@ -1388,8 +1406,12 @@ def parse_args():
parser.add_argument("-u", "--unique-names",
help="Insert or updates entities even if name conflicts exist.")
parser.add_argument("-p", "--prefix",
help="Remove the given prefix from the paths "
"of all file objects.")
help="DEPRECATED, use --remove-prefix instead. Remove the given prefix "
"from the paths of all file objects.")
parser.add_argument("--remove-prefix",
help="Remove the given prefix from the paths of all file objects.")
parser.add_argument("--add-prefix",
help="Add the given prefix to the paths of all file objects.")
return parser.parse_args()
......@@ -1409,6 +1431,10 @@ def main():
conlogger = logging.getLogger("connection")
conlogger.setLevel(level=logging.ERROR)
if args.prefix:
print("Please use '--remove-prefix' option instead of '--prefix' or '-p'.")
return -1
# logging config for local execution
logger.addHandler(logging.StreamHandler(sys.stdout))
if args.debug:
......@@ -1431,12 +1457,13 @@ def main():
debug=args.debug,
provenance_file=args.provenance,
dry_run=args.dry_run,
prefix=args.prefix,
securityMode={"retrieve": SecurityMode.RETRIEVE,
"insert": SecurityMode.INSERT,
"update": SecurityMode.UPDATE}[args.security_mode],
unique_names=args.unique_names,
restricted_path=restricted_path
restricted_path=restricted_path,
remove_prefix=args.remove_prefix,
add_prefix=args.add_prefix,
))
......
......@@ -794,8 +794,7 @@ def test_validation_error_print(caplog):
os.path.join(DATADIR, "identifiables.yml"),
True,
None,
False,
"/use_case_simple_presentation")
False)
assert "Couldn't validate" in caplog.text
caplog.clear()
......@@ -969,6 +968,22 @@ def test_split_restricted_path():
assert ["el", "el"] == split_restricted_path("/el/el")
def test_deprecated_prefix_option():
    """Check the deprecation behavior of ``crawler_main``'s ``prefix`` argument.

    Supplying ``prefix`` alone must emit a DeprecationWarning; combining it
    with its replacement ``remove_prefix`` must raise a ValueError.
    """
    cfood = rfp("scifolder_cfood.yml")

    # The deprecated argument on its own only warns.
    with pytest.deprecated_call():
        crawler_main("./", cfood, prefix="to/be/removed")

    # Deprecated and replacement arguments together are rejected.
    with raises(ValueError) as exc_info:
        crawler_main("./", cfood, prefix="to/be/removed",
                     remove_prefix="to/be/removed")
    assert "(deprecated) `prefix` and the `remove_prefix`" in str(exc_info.value)
def test_create_entity_summary():
assert "" == Crawler.create_entity_summary([]).strip()
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment