Commit 90620c94 authored by Alexander Schlemmer

MAIN: refactored scan_structure_elements and scan_directory functions

parent 31a6b372
2 merge requests: !108 Release 0.5.0, !104 Create a new scanner module and move functions from crawl module there
@@ -164,6 +164,8 @@ def create_converter_registry(definition: dict):
     - module is the name of the module to be imported which must be installed
     - class is the converter class to load and associate with this converter entry
 
+    Formerly known as "load_converters".
+
     all other info for the converter needs to be included in the converter plugin
     directory:
     schema.yml file
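Going only by the retained docstring above, a converter-registry entry might look roughly like the following sketch. Only the roles of the "module" and "class" keys come from the docstring; the entry name and the example values are illustrative and not taken from this commit.

# Hypothetical registry entry; key roles follow the docstring above,
# names and values are assumptions for illustration only.
converter_registry = {
    "SimpleFile": {                          # short code for the converter (assumed example)
        "module": "caoscrawler.converters",  # module that must be importable (assumed value)
        "class": "SimpleFileConverter",      # converter class associated with this entry (assumed value)
    },
}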
@@ -337,10 +339,12 @@ def _crawl(self,
 
 # --------------------------------------------------------------------------------
 
-def crawl_directory(dirname: str, crawler_definition_path: str,
+def scan_directory(dirname: str, crawler_definition_path: str,
                     restricted_path: Optional[list[str]] = None):
     """ Crawl a single directory.
 
+    Formerly known as "crawl_directory".
+
     Convenience function that starts the crawler (calls start_crawling)
     with a single directory as the StructureElement.
 
@@ -351,7 +355,7 @@ def crawl_directory(dirname: str, crawler_definition_path: str,
     crawler_definition = load_definition(crawler_definition_path)
 
     # Load and register converter packages:
-    converter_registry = load_converters(crawler_definition)
+    converter_registry = create_converter_registry(crawler_definition)
 
     if not dirname:
         raise ValueError(
@@ -366,21 +370,22 @@ def crawl_directory(dirname: str, crawler_definition_path: str,
             # dirname had a trailing '/'
             dir_structure_name = os.path.basename(dirname[:-1])
 
-    start_crawling(Directory(dir_structure_name,
+    return scan_structure_elements(Directory(dir_structure_name,
                    dirname),
                    crawler_definition,
                    converter_registry,
-                   restricted_path=restricted_path
-                   )
+                   restricted_path=restricted_path)
 
 
-def start_crawling(items: Union[list[StructureElement], StructureElement],
+def scan_structure_elements(items: Union[list[StructureElement], StructureElement],
                    crawler_definition: dict,
                    converter_registry: dict,
                    restricted_path: Optional[list[str]] = None):
     """
     Start point of the crawler recursion.
 
+    Formerly known as "start_crawling".
+
     Parameters
     ----------
     items: list
@@ -400,27 +405,17 @@ def start_crawling(items: Union[list[StructureElement], StructureElement],
     """
     # This function builds the tree of converters out of the crawler definition.
 
-    if self.generalStore is None:
-        raise RuntimeError("Should not happen.")
-
     if not isinstance(items, list):
         items = [items]
 
     self.run_id = uuid.uuid1()
-    local_converters = Crawler.initialize_converters(crawler_definition, converter_registry)
-
-    # This recursive crawling procedure generates the update list:
-    self.crawled_data: list[db.Record] = []
-    self._crawl(
+    converters = initialize_converters(crawler_definition, converter_registry)
+
+    return _crawl(
         items=items,
-        local_converters=local_converters,
+        local_converters=converters,
         generalStore=self.generalStore,
         recordStore=self.recordStore,
         structure_elements_path=[],
         converters_path=[],
         restricted_path=restricted_path)
-
-    if self.debug:
-        self.debug_converters = local_converters
-
-    return self.crawled_data
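For readers following the rename, a short usage sketch of the new entry points. The import paths and the example file paths are assumptions (the merge request only states that the functions move to a new scanner module); the call signatures themselves follow the diff above, and the return value of the scan functions is not shown in this commit.

# Sketch only: module paths and file paths are assumed for illustration,
# the calls mirror the diff above.
from caoscrawler.scanner import (create_converter_registry, load_definition,
                                 scan_directory, scan_structure_elements)
from caoscrawler.structure_elements import Directory

# Convenience entry point (formerly crawl_directory): load the cfood definition,
# build the converter registry and scan a directory tree.
result = scan_directory("/data/experiment_01", "cfood.yml")

# Equivalent lower-level call (formerly start_crawling), passing the
# StructureElement explicitly, as scan_directory now does internally:
definition = load_definition("cfood.yml")
registry = create_converter_registry(definition)
result = scan_structure_elements(Directory("experiment_01", "/data/experiment_01"),
                                 definition, registry)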