Create a new scanner module and move functions from crawl module there

Merged Alexander Schlemmer requested to merge f-refactor-scanner-crawler into dev
1 file  +20  −25
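The change is essentially a set of renames plus a move of the scanning functions into a new scanner module. A minimal sketch of how calling code might import the renamed functions afterwards; the package name "caoscrawler" and the module name "scanner" are assumptions, since the diff itself does not show where the new module lives:

    # Hypothetical imports after the refactor; package and module names are
    # assumptions, only the function names are taken from the diff below.
    from caoscrawler.scanner import (create_converter_registry,
                                     scan_directory,
                                     scan_structure_elements)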
@@ -164,6 +164,8 @@ def create_converter_registry(definition: dict):
- module is the name of the module to be imported which must be installed
- class is the converter class to load and associate with this converter entry
Formerly known as "load_converters".
all other info for the converter needs to be included in the converter plugin
directory:
schema.yml file
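For orientation, a hedged sketch of what a converter entry and the registry call described in this docstring could look like; only the "module" and "class" keys are taken from the bullet points above, the surrounding structure and names are illustrative:

    # Hypothetical converter definition; "module" and "class" follow the
    # docstring above, everything else is made up for illustration.
    definition = {
        "Converters": {
            "CSVTableConverter": {
                "module": "my_converter_plugin",   # module to import; must be installed
                "class": "CSVTableConverter",      # converter class in that module
            },
        },
    }
    converter_registry = create_converter_registry(definition)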
@@ -337,10 +339,12 @@ def _crawl(self,
# --------------------------------------------------------------------------------
def crawl_directory(dirname: str, crawler_definition_path: str,
def scan_directory(dirname: str, crawler_definition_path: str,
restricted_path: Optional[list[str]] = None):
""" Crawl a single directory.
Formerly known as "crawl_directory".
Convenience function that starts the crawler (calls start_crawling)
with a single directory as the StructureElement.
@@ -351,7 +355,7 @@ def crawl_directory(dirname: str, crawler_definition_path: str,
crawler_definition = load_definition(crawler_definition_path)
# Load and register converter packages:
converter_registry = load_converters(crawler_definition)
converter_registry = create_converter_registry(crawler_definition)
if not dirname:
raise ValueError(
@@ -366,21 +370,22 @@ def crawl_directory(dirname: str, crawler_definition_path: str,
# dirname had a trailing '/'
dir_structure_name = os.path.basename(dirname[:-1])
start_crawling(Directory(dir_structure_name,
dirname),
crawler_definition,
converter_registry,
restricted_path=restricted_path
)
return scan_structure_elements(Directory(dir_structure_name,
dirname),
crawler_definition,
converter_registry,
restricted_path=restricted_path)
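A hedged usage sketch for the renamed directory entry point; the directory path, cfood file name, and restricted path are placeholders:

    # scan_directory replaces crawl_directory; judging from the return
    # statement above it now hands back the result of scan_structure_elements
    # instead of storing crawled data on a Crawler instance.
    records = scan_directory("/data/experiment_runs", "cfood.yml",
                             restricted_path=["2023", "run_0001"])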
def start_crawling(items: Union[list[StructureElement], StructureElement],
crawler_definition: dict,
converter_registry: dict,
restricted_path: Optional[list[str]] = None):
def scan_structure_elements(items: Union[list[StructureElement], StructureElement],
crawler_definition: dict,
converter_registry: dict,
restricted_path: Optional[list[str]] = None):
"""
Start point of the crawler recursion.
Formerly known as "start_crawling".
Parameters
----------
items: list
@@ -400,27 +405,17 @@ def start_crawling(items: Union[list[StructureElement], StructureElement],
"""
# This function builds the tree of converters out of the crawler definition.
if self.generalStore is None:
raise RuntimeError("Should not happen.")
if not isinstance(items, list):
items = [items]
self.run_id = uuid.uuid1()
local_converters = Crawler.initialize_converters(crawler_definition, converter_registry)
converters = initialize_converters(crawler_definition, converter_registry)
# This recursive crawling procedure generates the update list:
self.crawled_data: list[db.Record] = []
self._crawl(
return _crawl(
items=items,
local_converters=local_converters,
local_converters=converters,
generalStore=self.generalStore,
recordStore=self.recordStore,
structure_elements_path=[],
converters_path=[],
restricted_path=restricted_path)
if self.debug:
self.debug_converters = local_converters
return self.crawled_data
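For completeness, a hedged sketch of calling the renamed recursion entry point directly, mirroring what scan_directory does above; the directory name and path are placeholders:

    # scan_structure_elements replaces start_crawling and is a module-level
    # function, so no Crawler instance is needed.
    crawler_definition = load_definition("cfood.yml")
    converter_registry = create_converter_registry(crawler_definition)
    records = scan_structure_elements(
        Directory("experiment_runs", "/data/experiment_runs"),
        crawler_definition,
        converter_registry)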