Commit 0cd3c0c1 authored by Florian Spreckelsen

ENH: Support list of directories for crawler_main and scan_directory

parent 444f9831
2 merge requests: !217 TST: Make NamedTemporaryFiles Windows-compatible, !208 ENH: Allow crawler_main to operate on a list of paths
@@ -621,7 +621,7 @@ one with the entities that need to be updated and the other with entities to be
                     crawled_data: Optional[list[db.Record]] = None,
                     no_insert_RTs: Optional[list[str]] = None,
                     no_update_RTs: Optional[list[str]] = None,
-                    path_for_authorized_run: Optional[str] = "",
+                    path_for_authorized_run: Optional[Union[str, list[str]]] = "",
                     ):
         """
         This function applies several stages:
@@ -643,7 +643,7 @@ one with the entities that need to be updated and the other with entities to be
         no_update_RTs : list[str], optional
             list of RecordType names. Records that have one of those RecordTypes
             as parent will not be updated
-        path_for_authorized_run : str, optional
+        path_for_authorized_run : str or list[str], optional
             only used if there are changes that need authorization before being
             applied. The form for rerunning the crawler with the authorization
             of these changes will be generated with this path. See
@@ -718,11 +718,21 @@ one with the entities that need to be updated and the other with entities to be
             update_cache = UpdateCache()
             pending_inserts = update_cache.get_inserts(self.run_id)
             if pending_inserts:
+                if isinstance(path_for_authorized_run, list):
+                    raise NotImplementedError(
+                        "Authorization of inserts is currently implemented only for single paths, "
+                        "not for lists of paths."
+                    )
                 Crawler.inform_about_pending_changes(
                     pending_inserts, self.run_id, path_for_authorized_run)
             pending_updates = update_cache.get_updates(self.run_id)
             if pending_updates:
+                if isinstance(path_for_authorized_run, list):
+                    raise NotImplementedError(
+                        "Authorization of updates is currently implemented only for single paths, "
+                        "not for lists of paths."
+                    )
                 Crawler.inform_about_pending_changes(
                     pending_updates, self.run_id, path_for_authorized_run)
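The guard above only fires when a run leaves pending inserts or updates that need interactive authorization; the authorization form can so far only be generated for a single path. A minimal caller-side sketch of the new failure mode, assuming a configured Crawler instance and scan results (placeholders here); only the synchronize parameters and the NotImplementedError come from the diff:

# Sketch only: `crawler` is an already configured Crawler instance and
# `crawled_records` the result of a scan (both placeholders).
try:
    crawler.synchronize(
        crawled_data=crawled_records,
        path_for_authorized_run=["/data/a", "/data/b"],  # new list form
    )
except NotImplementedError:
    # Raised only if pending changes require authorization: as of this
    # commit the authorization form is generated for a single path
    # only, so pass a plain str path when authorization may be needed.
    raise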
@@ -1004,7 +1014,7 @@ def _store_dry_run_data(ins, upd):
                             "update": updates}))

-def crawler_main(crawled_directory_path: str,
+def crawler_main(crawled_directory_path: Union[str, list[str]],
                  cfood_file_name: str,
                  identifiables_definition_file: Optional[str] = None,
                  debug: bool = False,
@@ -1022,8 +1032,8 @@ def crawler_main(crawled_directory_path: str,
     Parameters
     ----------
-    crawled_directory_path : str
-        path to be crawled
+    crawled_directory_path : str or list[str]
+        path(s) to be crawled
     cfood_file_name : str
         filename of the cfood to be used
     identifiables_definition_file : str
...
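For the high-level entry point the change means crawler_main now accepts either a single path or a list of paths. A hedged usage sketch, assuming the import location caoscrawler.crawler and an example cfood file name; a reachable LinkAhead/CaosDB instance is required for a real run:

from caoscrawler.crawler import crawler_main

# As before: crawl a single directory.
crawler_main("/data/experiment_1", "cfood.yml")

# New with this commit: crawl several directories in one run; each
# entry is scanned and the results are synchronized together.
crawler_main(["/data/experiment_1", "/data/experiment_2"], "cfood.yml")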
@@ -421,7 +421,7 @@ def scanner(items: list[StructureElement],

 # --------------------------------------------------------------------------------
-def scan_directory(dirname: str, crawler_definition_path: str,
+def scan_directory(dirname: Union[str, list[str]], crawler_definition_path: str,
                    restricted_path: Optional[list[str]] = None,
                    debug_tree: Optional[DebugTree] = None):
     """ Crawl a single directory.
@@ -434,10 +434,12 @@ def scan_directory(dirname: str, crawler_definition_path: str,
     Parameters
     ----------
+    dirname: str or list[str]
+        directory or list of directories to be scanned
     restricted_path: optional, list of strings
-        Traverse the data tree only along the given path. When the end of the given path
-        is reached, traverse the full tree as normal. See docstring of 'scanner' for
-        more details.
+        Traverse the data tree only along the given path. When the end
+        of the given path is reached, traverse the full tree as
+        normal. See docstring of 'scanner' for more details.

     Returns
     -------
@@ -455,26 +457,31 @@ def scan_directory(dirname: str, crawler_definition_path: str,
     if not dirname:
         raise ValueError(
             "You have to provide a non-empty path for crawling.")
-    dir_structure_name = os.path.basename(dirname)
+    if not isinstance(dirname, list):
+        dirname = [dirname]

-    # TODO: needs to be covered somewhere else
-    crawled_directory = dirname
-    if not dir_structure_name and dirname.endswith('/'):
-        if dirname == '/':
-            # Crawling the entire file system
-            dir_structure_name = "root"
-        else:
-            # dirname had a trailing '/'
-            dir_structure_name = os.path.basename(dirname[:-1])
+    dir_element_list = []
+    for dname in dirname:
+        dir_structure_name = os.path.basename(dname)
+
+        # TODO: needs to be covered somewhere else
+        crawled_directory = dname
+        if not dir_structure_name and dname.endswith('/'):
+            if dname == '/':
+                # Crawling the entire file system
+                dir_structure_name = "root"
+            else:
+                # dirname had a trailing '/'
+                dir_structure_name = os.path.basename(dname[:-1])
+        dir_element_list.append(Directory(dir_structure_name, dname))

-    return scan_structure_elements(Directory(dir_structure_name,
-                                             dirname),
-                                   crawler_definition,
-                                   converter_registry,
-                                   restricted_path=restricted_path,
-                                   debug_tree=debug_tree,
-                                   registered_transformer_functions=registered_transformer_functions
-                                   )
+    return scan_structure_elements(
+        dir_element_list,
+        crawler_definition,
+        converter_registry,
+        restricted_path=restricted_path,
+        debug_tree=debug_tree,
+        registered_transformer_functions=registered_transformer_functions
+    )

 def scan_structure_elements(items: Union[list[StructureElement], StructureElement],
...
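The only subtle part of the new loop is the name derivation: os.path.basename returns an empty string for paths with a trailing slash, so the code strips the slash before taking the basename and special-cases the filesystem root. A self-contained sketch of just that logic (the helper name _dir_structure_name is invented for illustration):

import os

def _dir_structure_name(dname: str) -> str:
    """Mirrors the name derivation inside scan_directory above."""
    name = os.path.basename(dname)        # '' for paths ending in '/'
    if not name and dname.endswith('/'):
        if dname == '/':
            return "root"                 # crawling the entire file system
        return os.path.basename(dname[:-1])   # drop the trailing '/'
    return name

assert _dir_structure_name("/data/run1") == "run1"
assert _dir_structure_name("/data/run1/") == "run1"   # trailing slash
assert _dir_structure_name("/") == "root"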