From 0cd3c0c17221791adf22493204c83e8bd42142a7 Mon Sep 17 00:00:00 2001 From: Florian Spreckelsen <f.spreckelsen@indiscale.com> Date: Thu, 5 Dec 2024 17:26:04 +0100 Subject: [PATCH] ENH: Support list of directories for crawler_main and scan_directory --- src/caoscrawler/crawl.py | 20 ++++++++++---- src/caoscrawler/scanner.py | 55 +++++++++++++++++++++----------------- 2 files changed, 46 insertions(+), 29 deletions(-) diff --git a/src/caoscrawler/crawl.py b/src/caoscrawler/crawl.py index 8ca84502..fd0beaa2 100644 --- a/src/caoscrawler/crawl.py +++ b/src/caoscrawler/crawl.py @@ -621,7 +621,7 @@ one with the entities that need to be updated and the other with entities to be crawled_data: Optional[list[db.Record]] = None, no_insert_RTs: Optional[list[str]] = None, no_update_RTs: Optional[list[str]] = None, - path_for_authorized_run: Optional[str] = "", + path_for_authorized_run: Optional[Union[str, list[str]]] = "", ): """ This function applies several stages: @@ -643,7 +643,7 @@ one with the entities that need to be updated and the other with entities to be no_update_RTs : list[str], optional list of RecordType names. Records that have one of those RecordTypes as parent will not be updated - path_for_authorized_run : str, optional + path_for_authorized_run : str or list[str], optional only used if there are changes that need authorization before being applied. The form for rerunning the crawler with the authorization of these changes will be generated with this path. See @@ -718,11 +718,21 @@ one with the entities that need to be updated and the other with entities to be update_cache = UpdateCache() pending_inserts = update_cache.get_inserts(self.run_id) if pending_inserts: + if isinstance(path_for_authorized_run, list): + raise NotImplementedError( + "Authorization of inserts is currently implemented only for single paths, " + "not for lists of paths." + ) Crawler.inform_about_pending_changes( pending_inserts, self.run_id, path_for_authorized_run) pending_updates = update_cache.get_updates(self.run_id) if pending_updates: + if isinstance(path_for_authorized_run, list): + raise NotImplementedError( + "Authorization of updates is currently implemented only for single paths, " + "not for lists of paths." + ) Crawler.inform_about_pending_changes( pending_updates, self.run_id, path_for_authorized_run) @@ -1004,7 +1014,7 @@ def _store_dry_run_data(ins, upd): "update": updates})) -def crawler_main(crawled_directory_path: str, +def crawler_main(crawled_directory_path: Union[str, list[str]], cfood_file_name: str, identifiables_definition_file: Optional[str] = None, debug: bool = False, @@ -1022,8 +1032,8 @@ def crawler_main(crawled_directory_path: str, Parameters ---------- - crawled_directory_path : str - path to be crawled + crawled_directory_path : str or list[str] + path(s) to be crawled cfood_file_name : str filename of the cfood to be used identifiables_definition_file : str diff --git a/src/caoscrawler/scanner.py b/src/caoscrawler/scanner.py index 89bd1c04..6b4d7a12 100644 --- a/src/caoscrawler/scanner.py +++ b/src/caoscrawler/scanner.py @@ -421,7 +421,7 @@ def scanner(items: list[StructureElement], # -------------------------------------------------------------------------------- -def scan_directory(dirname: str, crawler_definition_path: str, +def scan_directory(dirname: Union[str, list[str]], crawler_definition_path: str, restricted_path: Optional[list[str]] = None, debug_tree: Optional[DebugTree] = None): """ Crawl a single directory. @@ -434,10 +434,12 @@ def scan_directory(dirname: str, crawler_definition_path: str, Parameters ---------- + dirname: str or list[str] + directory or list of directories to be scanned restricted_path: optional, list of strings - Traverse the data tree only along the given path. When the end of the given path - is reached, traverse the full tree as normal. See docstring of 'scanner' for - more details. + Traverse the data tree only along the given path. When the end + of the given path is reached, traverse the full tree as + normal. See docstring of 'scanner' for more details. Returns ------- @@ -455,26 +457,31 @@ def scan_directory(dirname: str, crawler_definition_path: str, if not dirname: raise ValueError( "You have to provide a non-empty path for crawling.") - dir_structure_name = os.path.basename(dirname) - - # TODO: needs to be covered somewhere else - crawled_directory = dirname - if not dir_structure_name and dirname.endswith('/'): - if dirname == '/': - # Crawling the entire file system - dir_structure_name = "root" - else: - # dirname had a trailing '/' - dir_structure_name = os.path.basename(dirname[:-1]) - - return scan_structure_elements(Directory(dir_structure_name, - dirname), - crawler_definition, - converter_registry, - restricted_path=restricted_path, - debug_tree=debug_tree, - registered_transformer_functions=registered_transformer_functions - ) + if not isinstance(dirname, list): + dirname = [dirname] + dir_element_list = [] + for dname in dirname: + dir_structure_name = os.path.basename(dname) + + # TODO: needs to be covered somewhere else + crawled_directory = dname + if not dir_structure_name and dname.endswith('/'): + if dname == '/': + # Crawling the entire file system + dir_structure_name = "root" + else: + # dirname had a trailing '/' + dir_structure_name = os.path.basename(dname[:-1]) + dir_element_list.append(Directory(dir_structure_name, dname)) + + return scan_structure_elements( + dir_element_list, + crawler_definition, + converter_registry, + restricted_path=restricted_path, + debug_tree=debug_tree, + registered_transformer_functions=registered_transformer_functions + ) def scan_structure_elements(items: Union[list[StructureElement], StructureElement], -- GitLab