diff --git a/src/caoscrawler/crawl.py b/src/caoscrawler/crawl.py index dd8edd3a7e62c892ab142bc489619c64bd6dc77f..70bdb495c5ff48489570ab45557601664cdd37f5 100644 --- a/src/caoscrawler/crawl.py +++ b/src/caoscrawler/crawl.py @@ -1165,11 +1165,29 @@ def _treat_deprecated_prefix(prefix, remove_prefix): return remove_prefix -def _fix_file_paths(crawled_data, add_prefix, remove_prefix): - """adjust the path according to add_/remove_prefix +def _fix_file_paths(crawled_data: list[db.Entity], + add_prefix: Optional[str], + remove_prefix: Optional[str]): + """ + Adjust the path according to add_/remove_prefix. Also remove the `file` attribute from File entities (because inserts need currently be done by loadfiles. + + Arguments: + ------------ + + crawled_data: list[db.Entity] + A list of entities. This list will be searched for instances of db.File. + + add_prefix: Optional[str] + If add_prefix is not None, the given prefix will be added in front of elem.path. + + remove_prefix: Optional[str] + If remove_prefix is not None, the given prefix will be removed from the front of + elem.path. In this case a RuntimeError will be raised if any path of a file does + not begin with the given prefix. + """ for elem in crawled_data: if isinstance(elem, db.File): @@ -1267,9 +1285,11 @@ def crawler_main(crawled_directory_path: str, Traverse the data tree only along the given path. When the end of the given path is reached, traverse the full tree as normal. remove_prefix : Optional[str] - remove the given prefix from file paths + Remove the given prefix from file paths. + See docstring of '_fix_file_paths' for more details. add_prefix : Optional[str] - add the given prefix to file paths + Add the given prefix to file paths. + See docstring of '_fix_file_paths' for more details. Returns -------