From 90620c94e089b9e3c189bcb06cf79f9723c624be Mon Sep 17 00:00:00 2001
From: Alexander Schlemmer <alexander@mail-schlemmer.de>
Date: Wed, 8 Mar 2023 14:35:07 +0100
Subject: [PATCH] MAIN: refactor crawl_directory and start_crawling into
 scan_directory and scan_structure_elements

---
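This patch turns the scanner entry points into plain module-level
functions that return the scanned data instead of storing it on the
Crawler instance. In short:

    load_converters -> create_converter_registry
    crawl_directory -> scan_directory
    start_crawling  -> scan_structure_elements

A minimal migration sketch for callers, assuming the signatures as they
appear below (the directory path and cfood file name are placeholders):

    from caoscrawler.scanner import scan_directory

    # start_crawling() used to collect the generated records on the
    # crawler instance (self.crawled_data); the scan_* functions now
    # return them directly:
    records = scan_directory("/path/to/data", "cfood.yml")
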
 src/caoscrawler/scanner.py | 66 ++++++++++++++++++++++++++++++++----------------------------------
 1 file changed, 32 insertions(+), 34 deletions(-)

diff --git a/src/caoscrawler/scanner.py b/src/caoscrawler/scanner.py
index 328c8933..cc0a50b5 100644
--- a/src/caoscrawler/scanner.py
+++ b/src/caoscrawler/scanner.py
@@ -164,6 +164,8 @@ def create_converter_registry(definition: dict):
     - module is the name of the module to be imported which must be installed
     - class is the converter class to load and associate with this converter entry
 
+    Formerly known as "load_converters".
+
-    all other info for the converter needs to be included in the converter plugin
-    directory:
+    All other information for the converter needs to be included in the
+    converter plugin directory:
     schema.yml file
@@ -337,10 +339,12 @@ def _crawl(self,
 # --------------------------------------------------------------------------------
 
 
-def crawl_directory(dirname: str, crawler_definition_path: str,
-                    restricted_path: Optional[list[str]] = None):
-    """ Crawl a single directory.
+def scan_directory(dirname: str, crawler_definition_path: str,
+                   restricted_path: Optional[list[str]] = None):
+    """ Scan a single directory.
 
-    Convenience function that starts the crawler (calls start_crawling)
-    with a single directory as the StructureElement.
+    Formerly known as "crawl_directory".
+
+    Convenience function that scans a single directory (by calling
+    scan_structure_elements with the directory as the StructureElement).
 
@@ -351,7 +355,7 @@ def crawl_directory(dirname: str, crawler_definition_path: str,
 
     crawler_definition = load_definition(crawler_definition_path)
     # Load and register converter packages:
-    converter_registry = load_converters(crawler_definition)
+    converter_registry = create_converter_registry(crawler_definition)
 
     if not dirname:
         raise ValueError(
@@ -366,21 +370,22 @@ def crawl_directory(dirname: str, crawler_definition_path: str,
             # dirname had a trailing '/'
             dir_structure_name = os.path.basename(dirname[:-1])
 
-    start_crawling(Directory(dir_structure_name,
-                                  dirname),
-                        crawler_definition,
-                        converter_registry,
-                        restricted_path=restricted_path
-                        )
+    return scan_structure_elements(Directory(dir_structure_name,
+                                             dirname),
+                                   crawler_definition,
+                                   converter_registry,
+                                   restricted_path=restricted_path)
     
 
-def start_crawling(items: Union[list[StructureElement], StructureElement],
-                   crawler_definition: dict,
-                   converter_registry: dict,
-                   restricted_path: Optional[list[str]] = None):
+def scan_structure_elements(items: Union[list[StructureElement], StructureElement],
+                            crawler_definition: dict,
+                            converter_registry: dict,
+                            restricted_path: Optional[list[str]] = None):
     """
     Start point of the crawler recursion.
 
+    Formerly known as "start_crawling".
+
     Parameters
     ----------
     items: list
@@ -400,27 +405,20 @@ def start_crawling(items: Union[list[StructureElement], StructureElement],
     """
 
     # This function builds the tree of converters out of the crawler definition.
-
-    if self.generalStore is None:
-        raise RuntimeError("Should not happen.")
-
     if not isinstance(items, list):
         items = [items]
 
-    self.run_id = uuid.uuid1()
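+    # Note: no `self` in a module-level function; setting the run id is
+    # left to the Crawler.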
-    local_converters = Crawler.initialize_converters(crawler_definition, converter_registry)
+    converters = initialize_converters(crawler_definition, converter_registry)
 
-    # This recursive crawling procedure generates the update list:
-    self.crawled_data: list[db.Record] = []
-    self._crawl(
+    return _crawl(
         items=items,
-        local_converters=local_converters,
+        local_converters=converters,
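+        # Assumption: each scan starts from fresh, empty stores
+        # (GeneralStore/RecordStore), as there is no `self` to read them from.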
-        generalStore=self.generalStore,
-        recordStore=self.recordStore,
+        generalStore=GeneralStore(),
+        recordStore=RecordStore(),
         structure_elements_path=[],
         converters_path=[],
         restricted_path=restricted_path)
-    if self.debug:
-        self.debug_converters = local_converters
-
-    return self.crawled_data
-- 
GitLab