diff --git a/src/caoscrawler/scanner.py b/src/caoscrawler/scanner.py index ded56aad1efe6ada80c1f2b5f55b6611d1f1b558..8a64416f7e0d28441fb1e7df6de0f4299698e03f 100644 --- a/src/caoscrawler/scanner.py +++ b/src/caoscrawler/scanner.py @@ -227,11 +227,11 @@ def initialize_converters(crawler_definition: dict, converter_registry: dict): def scanner(self, items: list[StructureElement], converters: list[Converter], - generalStore: GeneralStore, - recordStore: RecordStore, - structure_elements_path: list[str], - converters_path: list[str], - restricted_path: Optional[list[str]] = None): + general_store: Optional[GeneralStore] = None, + record_store: Optional[RecordStore] = None, + structure_elements_path: Optional[list[str]] = None, + restricted_path: Optional[list[str]] = None, + crawled_data: Optional[list[db.Record]] = None): """ Crawl a list of StructureElements and apply any matching converters. @@ -242,7 +242,7 @@ def scanner(self, treating structure elements. A locally defined converter could be one that is only valid for a specific subtree of the originally cralwed StructureElement structure. - generalStore and recordStore: This recursion of the crawl function should only operate on + general_store and record_store: This recursion of the crawl function should only operate on copies of the global stores of the Crawler object. restricted_path: optional, list of strings, traverse the data tree only along the given path. For example, when a directory contains files a, b and c and b is @@ -258,6 +258,18 @@ def scanner(self, if restricted_path is not None and len(restricted_path) == 0: restricted_path = None + if crawled_data is None: + crawled_data = [] + + if general_store is None: + general_store = GeneralStore() + + if record_store is None: + record_store = RecordStore() + + if structure_elements_path is None: + structure_elements_path = [] + for element in items: for converter in converters: @@ -267,35 +279,35 @@ def scanner(self, restricted_path is None or element.name == restricted_path[0]) and converter.match(element) is not None): path_found = True - generalStore_copy = generalStore.create_scoped_copy() - recordStore_copy = recordStore.create_scoped_copy() + general_store_copy = general_store.create_scoped_copy() + record_store_copy = record_store.create_scoped_copy() # Create an entry for this matched structure element that contains the path: - generalStore_copy[converter.name] = ( + general_store_copy[converter.name] = ( os.path.join(*(structure_elements_path + [element.get_name()]))) # extracts values from structure element and stores them in the # variable store - converter.create_values(generalStore_copy, element) + converter.create_values(general_store_copy, element) keys_modified = converter.create_records( - generalStore_copy, recordStore_copy, element) + general_store_copy, record_store_copy, element) - children = converter.create_children(generalStore_copy, element) + children = converter.create_children(general_store_copy, element) if self.debug: # add provenance information for each variable self.debug_tree[str(element)] = ( - generalStore_copy.get_storage(), recordStore_copy.get_storage()) + general_store_copy.get_storage(), record_store_copy.get_storage()) self.debug_metadata["copied"][str(element)] = ( - generalStore_copy.get_dict_copied(), - recordStore_copy.get_dict_copied()) + general_store_copy.get_dict_copied(), + record_store_copy.get_dict_copied()) self.debug_metadata["usage"][str(element)].add( "/".join(converters_path + [converter.name])) mod_info = self.debug_metadata["provenance"] for record_name, prop_name in keys_modified: # TODO: check - internal_id = recordStore_copy.get_internal_id( + internal_id = record_store_copy.get_internal_id( record_name) record_identifier = record_name + \ "_" + str(internal_id) @@ -305,7 +317,7 @@ def scanner(self, converters_path + [converter.name]) self.scanner(children, converter.converters, - generalStore_copy, recordStore_copy, + general_store_copy, record_store_copy, structure_elements_path + [element.get_name()], converters_path + [converter.name], restricted_path[1:] if restricted_path is not None else None) @@ -314,9 +326,9 @@ def scanner(self, raise RuntimeError("A 'restricted_path' argument was given that is not contained in " "the data tree") # if the crawler is running out of scope, copy all records in - # the recordStore, that were created in this scope + # the record_store, that were created in this scope # to the general update container. - scoped_records = recordStore.get_records_current_scope() + scoped_records = record_store.get_records_current_scope() for record in scoped_records: self.crawled_data.append(record) @@ -326,10 +338,10 @@ def scanner(self, # some debug info is also deleted. This implementation can be used as soon # as the remaining problems with the debug_tree are fixed. # Delete the variables that are no longer needed: - # scoped_names = recordStore.get_names_current_scope() + # scoped_names = record_store.get_names_current_scope() # for name in scoped_names: - # del recordStore[name] - # del generalStore[name] + # del record_store[name] + # del general_store[name] return self.crawled_data @@ -416,8 +428,4 @@ def scan_structure_elements(items: Union[list[StructureElement], StructureElemen return scanner( items=items, converters=converters, - generalStore=self.generalStore, - recordStore=self.recordStore, - structure_elements_path=[], - converters_path=[], restricted_path=restricted_path)