diff --git a/src/caoscrawler/crawl.py b/src/caoscrawler/crawl.py index dd8edd3a7e62c892ab142bc489619c64bd6dc77f..da721ec3535fba306e2db3eacf76a6b700cba9fd 100644 --- a/src/caoscrawler/crawl.py +++ b/src/caoscrawler/crawl.py @@ -1165,11 +1165,29 @@ def _treat_deprecated_prefix(prefix, remove_prefix): return remove_prefix -def _fix_file_paths(crawled_data, add_prefix, remove_prefix): - """adjust the path according to add_/remove_prefix +def _fix_file_paths(crawled_data: list[db.Entity], + add_prefix: Optional[str], + remove_prefix: Optional[str]): + """ + Adjust the path according to add_/remove_prefix Also remove the `file` attribute from File entities (because inserts need currently be done by loadfiles. + + Arguments: + ------------ + + crawled_data: list[db.Entity] + A list of entities. This list will be searched for instances of db.File. + + add_prefix: Optional[str] + If add_prefix is not None, the given prefix will be added in front of elem.path. + + remove_prefix: Optional[str] + If remove_prefix is not None, the given prefix will be removed from the front of + elem.path. In this case a RuntimeError will be raised if any path of a file does + not begin with "remove_prefix". + """ for elem in crawled_data: if isinstance(elem, db.File): @@ -1265,11 +1283,14 @@ def crawler_main(crawled_directory_path: str, whether or not to update or insert entities inspite of name conflicts restricted_path: optional, list of strings Traverse the data tree only along the given path. When the end of the given path - is reached, traverse the full tree as normal. + is reached, traverse the full tree as normal. See docstring of 'scanner' in + module 'scanner' for more details. remove_prefix : Optional[str] - remove the given prefix from file paths + Remove the given prefix from file paths. + See docstring of '_fix_file_paths' for more details. add_prefix : Optional[str] - add the given prefix to file paths + Add the given prefix to file paths. 
+ See docstring of '_fix_file_paths' for more details. Returns ------- @@ -1382,12 +1403,18 @@ def parse_args(): def split_restricted_path(path): - elements = [] - while path != "/": - path, el = os.path.split(path) - if el != "": - elements.insert(0, el) - return elements + """ + Split a path string into components separated by the OS path separator (os.path.sep). + Empty elements will be removed. + """ + # This implementation leads to infinite loops + # for "ill-posed" paths (see test_utilities.py): + # elements = [] + # while path != "/": + # path, el = os.path.split(path) + # if el != "": + # elements.insert(0, el) + return [i for i in path.split(os.path.sep) if i != ""] def main(): diff --git a/src/caoscrawler/scanner.py b/src/caoscrawler/scanner.py index c5e078c582a22477e4bddfdae3048bdbc1e0fe06..5bd662d3fb8efd77564066eae353a17c499d62e8 100644 --- a/src/caoscrawler/scanner.py +++ b/src/caoscrawler/scanner.py @@ -235,7 +235,7 @@ def scanner(items: list[StructureElement], restricted_path: optional, list of strings, traverse the data tree only along the given path. For example, when a directory contains files a, b and c and b is - given in restricted_path, a and c will be ignroed by the crawler. + given as restricted_path, a and c will be ignored by the crawler. When the end of the given path is reached, traverse the full tree as normal. The first element of the list provided by restricted_path should be the name of the StructureElement at this level, i.e. denoting the @@ -357,7 +357,8 @@ def scan_directory(dirname: str, crawler_definition_path: str, restricted_path: optional, list of strings Traverse the data tree only along the given path. When the end of the given path - is reached, traverse the full tree as normal. + is reached, traverse the full tree as normal. See docstring of 'scanner' for + more details. 
""" crawler_definition = load_definition(crawler_definition_path) @@ -408,7 +409,8 @@ def scan_structure_elements(items: Union[list[StructureElement], StructureElemen file. restricted_path: optional, list of strings Traverse the data tree only along the given path. When the end of the given path - is reached, traverse the full tree as normal. + is reached, traverse the full tree as normal. See docstring of 'scanner' for + more details. Returns ------- diff --git a/unittests/test_utilities.py b/unittests/test_utilities.py new file mode 100644 index 0000000000000000000000000000000000000000..5a80ab9b230db4540d741bf8fa4f9d11b5158aab --- /dev/null +++ b/unittests/test_utilities.py @@ -0,0 +1,35 @@ +#!/usr/bin/env python3 +# encoding: utf-8 +# +# This file is a part of the CaosDB Project. +# +# Copyright (C) 2023 Alexander Schlemmer <alexander.schlemmer@ds.mpg.de> +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <https://www.gnu.org/licenses/>. 
+# + +from caoscrawler.crawl import split_restricted_path + + +def test_split_restricted_path(): + assert split_restricted_path("") == [] + assert split_restricted_path("/") == [] + assert split_restricted_path("test/") == ["test"] + assert split_restricted_path("/test/") == ["test"] + assert split_restricted_path("test/bla") == ["test", "bla"] + assert split_restricted_path("/test/bla") == ["test", "bla"] + assert split_restricted_path("/test1/test2/bla") == ["test1", "test2", "bla"] + assert split_restricted_path("/test//bla") == ["test", "bla"] + assert split_restricted_path("//test/bla") == ["test", "bla"] + assert split_restricted_path("///test//bla////") == ["test", "bla"]