Commit 31a6b372 authored by Alexander Schlemmer

MAINT: moved main scanner function to scanner module

parent f547fa39
2 merge requests: !108 Release 0.5.0, !104 Create a new scanner module and move functions from crawl module there
@@ -903,113 +903,6 @@ ____________________\n""".format(i + 1, len(pending_changes)) + str(el[3]))
with open(filename, "w") as f:
f.write(yaml.dump(paths, sort_keys=False))
def _crawl(self,
items: list[StructureElement],
local_converters: list[Converter],
generalStore: GeneralStore,
recordStore: RecordStore,
structure_elements_path: list[str],
converters_path: list[str],
restricted_path: Optional[list[str]] = None):
"""
Crawl a list of StructureElements and apply any matching converters.
items: structure_elements (e.g. files and folders on one level of the hierarchy)
local_converters: locally defined converters for
treating structure elements. A locally defined converter could be
one that is only valid for a specific subtree of the originally
crawled StructureElement structure.
generalStore and recordStore: This recursion of the crawl function should only operate on
copies of the global stores of the Crawler object.
restricted_path: optional, list of strings, traverse the data tree only along the given
path. For example, when a directory contains files a, b and c and b is
given in restricted_path, a and c will be ignored by the crawler.
When the end of the given path is reached, traverse the full tree as
normal. The first element of the list provided by restricted_path should
be the name of the StructureElement at this level, i.e. denoting the
respective element in the items argument.
"""
# This path_found variable stores whether the path given by restricted_path was found in the
# data tree
path_found = False
if restricted_path is not None and len(restricted_path) == 0:
restricted_path = None
for element in items:
for converter in local_converters:
# The type check is something like "matches files"; isinstance could be replaced by "type_matches".
# The match function tests e.g. a regular expression against the element.
if (converter.typecheck(element) and (
restricted_path is None or element.name == restricted_path[0])
and converter.match(element) is not None):
path_found = True
generalStore_copy = generalStore.create_scoped_copy()
recordStore_copy = recordStore.create_scoped_copy()
# Create an entry for this matched structure element that contains the path:
generalStore_copy[converter.name] = (
os.path.join(*(structure_elements_path + [element.get_name()])))
# extracts values from structure element and stores them in the
# variable store
converter.create_values(generalStore_copy, element)
keys_modified = converter.create_records(
generalStore_copy, recordStore_copy, element)
children = converter.create_children(generalStore_copy, element)
if self.debug:
# add provenance information for each variable
self.debug_tree[str(element)] = (
generalStore_copy.get_storage(), recordStore_copy.get_storage())
self.debug_metadata["copied"][str(element)] = (
generalStore_copy.get_dict_copied(),
recordStore_copy.get_dict_copied())
self.debug_metadata["usage"][str(element)].add(
"/".join(converters_path + [converter.name]))
mod_info = self.debug_metadata["provenance"]
for record_name, prop_name in keys_modified:
# TODO: check
internal_id = recordStore_copy.get_internal_id(
record_name)
record_identifier = record_name + \
"_" + str(internal_id)
converter.metadata["usage"].add(record_identifier)
mod_info[record_identifier][prop_name] = (
structure_elements_path + [element.get_name()],
converters_path + [converter.name])
self._crawl(children, converter.converters,
generalStore_copy, recordStore_copy,
structure_elements_path + [element.get_name()],
converters_path + [converter.name],
restricted_path[1:] if restricted_path is not None else None)
if restricted_path and not path_found:
raise RuntimeError("A 'restricted_path' argument was given that is not contained in "
"the data tree")
# if the crawler is running out of scope, copy all records in
# the recordStore that were created in this scope
# to the general update container.
scoped_records = recordStore.get_records_current_scope()
for record in scoped_records:
self.crawled_data.append(record)
# TODO: the scoped variables should be cleaned up as soon as the variables
# are no longer in the current scope. This can be implemented as follows,
# but this breaks the test "test_record_structure_generation", because
# some debug info is also deleted. This implementation can be used as soon
# as the remaining problems with the debug_tree are fixed.
# Delete the variables that are no longer needed:
# scoped_names = recordStore.get_names_current_scope()
# for name in scoped_names:
# del recordStore[name]
# del generalStore[name]
return self.crawled_data
def crawler_main(crawled_directory_path: str,
cfood_file_name: str,
......
@@ -218,13 +218,126 @@ def initialize_converters(crawler_definition: dict, converter_registry: dict):
return converters
# --------------------------------------------------------------------------------
# Main scanner function:
# --------------------------------------------------------------------------------
def _crawl(self,
items: list[StructureElement],
local_converters: list[Converter],
generalStore: GeneralStore,
recordStore: RecordStore,
structure_elements_path: list[str],
converters_path: list[str],
restricted_path: Optional[list[str]] = None):
"""
Crawl a list of StructureElements and apply any matching converters.
items: structure_elements (e.g. files and folders on one level of the hierarchy)
local_converters: locally defined converters for
treating structure elements. A locally defined converter could be
one that is only valid for a specific subtree of the originally
crawled StructureElement structure.
generalStore and recordStore: This recursion of the crawl function should only operate on
copies of the global stores of the Crawler object.
restricted_path: optional, list of strings, traverse the data tree only along the given
path. For example, when a directory contains files a, b and c and b is
given in restricted_path, a and c will be ignored by the crawler.
When the end of the given path is reached, traverse the full tree as
normal. The first element of the list provided by restricted_path should
be the name of the StructureElement at this level, i.e. denoting the
respective element in the items argument.
"""
# This path_found variable stores whether the path given by restricted_path was found in the
# data tree
path_found = False
if restricted_path is not None and len(restricted_path) == 0:
restricted_path = None
for element in items:
for converter in local_converters:
# The type check is something like "matches files"; isinstance could be replaced by "type_matches".
# The match function tests e.g. a regular expression against the element.
if (converter.typecheck(element) and (
restricted_path is None or element.name == restricted_path[0])
and converter.match(element) is not None):
path_found = True
generalStore_copy = generalStore.create_scoped_copy()
recordStore_copy = recordStore.create_scoped_copy()
# Create an entry for this matched structure element that contains the path:
generalStore_copy[converter.name] = (
os.path.join(*(structure_elements_path + [element.get_name()])))
# extracts values from structure element and stores them in the
# variable store
converter.create_values(generalStore_copy, element)
keys_modified = converter.create_records(
generalStore_copy, recordStore_copy, element)
children = converter.create_children(generalStore_copy, element)
if self.debug:
# add provenance information for each variable
self.debug_tree[str(element)] = (
generalStore_copy.get_storage(), recordStore_copy.get_storage())
self.debug_metadata["copied"][str(element)] = (
generalStore_copy.get_dict_copied(),
recordStore_copy.get_dict_copied())
self.debug_metadata["usage"][str(element)].add(
"/".join(converters_path + [converter.name]))
mod_info = self.debug_metadata["provenance"]
for record_name, prop_name in keys_modified:
# TODO: check
internal_id = recordStore_copy.get_internal_id(
record_name)
record_identifier = record_name + \
"_" + str(internal_id)
converter.metadata["usage"].add(record_identifier)
mod_info[record_identifier][prop_name] = (
structure_elements_path + [element.get_name()],
converters_path + [converter.name])
self._crawl(children, converter.converters,
generalStore_copy, recordStore_copy,
structure_elements_path + [element.get_name()],
converters_path + [converter.name],
restricted_path[1:] if restricted_path is not None else None)
if restricted_path and not path_found:
raise RuntimeError("A 'restricted_path' argument was given that is not contained in "
"the data tree")
# if the crawler is running out of scope, copy all records in
# the recordStore that were created in this scope
# to the general update container.
scoped_records = recordStore.get_records_current_scope()
for record in scoped_records:
self.crawled_data.append(record)
# TODO: the scoped variables should be cleaned up as soon as the variables
# are no longer in the current scope. This can be implemented as follows,
# but this breaks the test "test_record_structure_generation", because
# some debug info is also deleted. This implementation can be used as soon
# as the remaining problems with the debug_tree are fixed.
# Delete the variables that are no longer needed:
# scoped_names = recordStore.get_names_current_scope()
# for name in scoped_names:
# del recordStore[name]
# del generalStore[name]
return self.crawled_data
# --------------------------------------------------------------------------------
# Main scanning functions:
# Main scanning interface functions:
# --------------------------------------------------------------------------------
def crawl_directory(self, dirname: str, crawler_definition_path: str,
def crawl_directory(dirname: str, crawler_definition_path: str,
restricted_path: Optional[list[str]] = None):
""" Crawl a single directory.
@@ -236,15 +349,15 @@ def crawl_directory(self, dirname: str, crawler_definition_path: str,
is reached, traverse the full tree as normal.
"""
crawler_definition = self.load_definition(crawler_definition_path)
crawler_definition = load_definition(crawler_definition_path)
# Load and register converter packages:
converter_registry = self.load_converters(crawler_definition)
converter_registry = load_converters(crawler_definition)
if not dirname:
raise ValueError(
"You have to provide a non-empty path for crawling.")
dir_structure_name = os.path.basename(dirname)
self.crawled_directory = dirname
crawled_directory = dirname
if not dir_structure_name and dirname.endswith('/'):
if dirname == '/':
# Crawling the entire file system
@@ -253,7 +366,7 @@ def crawl_directory(self, dirname: str, crawler_definition_path: str,
# dirname had a trailing '/'
dir_structure_name = os.path.basename(dirname[:-1])
self.start_crawling(Directory(dir_structure_name,
start_crawling(Directory(dir_structure_name,
dirname),
crawler_definition,
converter_registry,
@@ -261,7 +374,7 @@ def crawl_directory(self, dirname: str, crawler_definition_path: str,
)
def start_crawling(self, items: Union[list[StructureElement], StructureElement],
def start_crawling(items: Union[list[StructureElement], StructureElement],
crawler_definition: dict,
converter_registry: dict,
restricted_path: Optional[list[str]] = None):
......
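After this commit the scanner entry points are module-level functions instead of Crawler methods. A minimal usage sketch under that assumption (the import path and the file names below are illustrative placeholders; only the function signature is taken from the diff above):

from caoscrawler.scanner import crawl_directory  # assumed module path

# Crawl a data directory using a crawler (cfood) definition file;
# both paths here are placeholders.
crawl_directory("/data/project", "cfood.yml")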