diff --git a/src/caoscrawler/crawl.py b/src/caoscrawler/crawl.py
index 0b8f921da4a2c7596ed17da56d27cb25bd938539..8d14dc2a2d617a685a75a33e2ad726b2c4b44666 100644
--- a/src/caoscrawler/crawl.py
+++ b/src/caoscrawler/crawl.py
@@ -903,113 +903,6 @@ ____________________\n""".format(i + 1, len(pending_changes)) + str(el[3]))
         with open(filename, "w") as f:
             f.write(yaml.dump(paths, sort_keys=False))

-    def _crawl(self,
-               items: list[StructureElement],
-               local_converters: list[Converter],
-               generalStore: GeneralStore,
-               recordStore: RecordStore,
-               structure_elements_path: list[str],
-               converters_path: list[str],
-               restricted_path: Optional[list[str]] = None):
-        """
-        Crawl a list of StructureElements and apply any matching converters.
-
-        items: structure_elements (e.g. files and folders on one level on the hierarchy)
-        local_converters: locally defined converters for
-            treating structure elements. A locally defined converter could be
-            one that is only valid for a specific subtree of the originally
-            cralwed StructureElement structure.
-        generalStore and recordStore: This recursion of the crawl function should only operate on
-            copies of the global stores of the Crawler object.
-        restricted_path: optional, list of strings, traverse the data tree only along the given
-            path. For example, when a directory contains files a, b and c and b is
-            given in restricted_path, a and c will be ignroed by the crawler.
-            When the end of the given path is reached, traverse the full tree as
-            normal. The first element of the list provided by restricted_path should
-            be the name of the StructureElement at this level, i.e. denoting the
-            respective element in the items argument.
-        """
-        # This path_found variable stores wether the path given by restricted_path was found in the
-        # data tree
-        path_found = False
-        if restricted_path is not None and len(restricted_path) == 0:
-            restricted_path = None
-
-        for element in items:
-            for converter in local_converters:
-
-                # type is something like "matches files", replace isinstance with "type_matches"
-                # match function tests regexp for example
-                if (converter.typecheck(element) and (
-                        restricted_path is None or element.name == restricted_path[0])
-                        and converter.match(element) is not None):
-                    path_found = True
-                    generalStore_copy = generalStore.create_scoped_copy()
-                    recordStore_copy = recordStore.create_scoped_copy()
-
-                    # Create an entry for this matched structure element that contains the path:
-                    generalStore_copy[converter.name] = (
-                        os.path.join(*(structure_elements_path + [element.get_name()])))
-
-                    # extracts values from structure element and stores them in the
-                    # variable store
-                    converter.create_values(generalStore_copy, element)
-
-                    keys_modified = converter.create_records(
-                        generalStore_copy, recordStore_copy, element)
-
-                    children = converter.create_children(generalStore_copy, element)
-
-                    if self.debug:
-                        # add provenance information for each variable
-                        self.debug_tree[str(element)] = (
-                            generalStore_copy.get_storage(), recordStore_copy.get_storage())
-                        self.debug_metadata["copied"][str(element)] = (
-                            generalStore_copy.get_dict_copied(),
-                            recordStore_copy.get_dict_copied())
-                        self.debug_metadata["usage"][str(element)].add(
-                            "/".join(converters_path + [converter.name]))
-                        mod_info = self.debug_metadata["provenance"]
-                        for record_name, prop_name in keys_modified:
-                            # TODO: check
-                            internal_id = recordStore_copy.get_internal_id(
-                                record_name)
-                            record_identifier = record_name + \
-                                "_" + str(internal_id)
-                            converter.metadata["usage"].add(record_identifier)
-                            mod_info[record_identifier][prop_name] = (
-                                structure_elements_path + [element.get_name()],
-                                converters_path + [converter.name])
-
-                    self._crawl(children, converter.converters,
-                                generalStore_copy, recordStore_copy,
-                                structure_elements_path + [element.get_name()],
-                                converters_path + [converter.name],
-                                restricted_path[1:] if restricted_path is not None else None)
-
-        if restricted_path and not path_found:
-            raise RuntimeError("A 'restricted_path' argument was given that is not contained in "
-                               "the data tree")
-        # if the crawler is running out of scope, copy all records in
-        # the recordStore, that were created in this scope
-        # to the general update container.
-        scoped_records = recordStore.get_records_current_scope()
-        for record in scoped_records:
-            self.crawled_data.append(record)
-
-        # TODO: the scoped variables should be cleaned up as soon if the variables
-        # are no longer in the current scope. This can be implemented as follows,
-        # but this breaks the test "test_record_structure_generation", because
-        # some debug info is also deleted. This implementation can be used as soon
-        # as the remaining problems with the debug_tree are fixed.
-        # Delete the variables that are no longer needed:
-        # scoped_names = recordStore.get_names_current_scope()
-        # for name in scoped_names:
-        #     del recordStore[name]
-        #     del generalStore[name]
-
-        return self.crawled_data
-

 def crawler_main(crawled_directory_path: str,
                  cfood_file_name: str,
diff --git a/src/caoscrawler/scanner.py b/src/caoscrawler/scanner.py
index f2c05ab13211c9434efa5b8f4bdf9b6983cca0e0..328c8933233cb0f4d3876706c8fc643b0b81c4be 100644
--- a/src/caoscrawler/scanner.py
+++ b/src/caoscrawler/scanner.py
@@ -218,13 +218,126 @@ def initialize_converters(crawler_definition: dict, converter_registry: dict):
     return converters


+# --------------------------------------------------------------------------------
+# Main scanner function:
+# --------------------------------------------------------------------------------
+
+def _crawl(self,
+           items: list[StructureElement],
+           local_converters: list[Converter],
+           generalStore: GeneralStore,
+           recordStore: RecordStore,
+           structure_elements_path: list[str],
+           converters_path: list[str],
+           restricted_path: Optional[list[str]] = None):
+    """
+    Crawl a list of StructureElements and apply any matching converters.
+
+    items: structure_elements (e.g. files and folders on one level of the hierarchy)
+    local_converters: locally defined converters for
+        treating structure elements. A locally defined converter could be
+        one that is only valid for a specific subtree of the originally
+        crawled StructureElement structure.
+    generalStore and recordStore: This recursion of the crawl function should only operate on
+        copies of the global stores of the Crawler object.
+    restricted_path: optional, list of strings, traverse the data tree only along the given
+        path. For example, when a directory contains files a, b and c and b is
+        given in restricted_path, a and c will be ignored by the crawler.
+        When the end of the given path is reached, traverse the full tree as
+        normal. The first element of the list provided by restricted_path should
+        be the name of the StructureElement at this level, i.e. denoting the
+        respective element in the items argument.
+    """
+    # This path_found variable stores whether the path given by restricted_path was found in the
+    # data tree
+    path_found = False
+    if restricted_path is not None and len(restricted_path) == 0:
+        restricted_path = None
+
+    for element in items:
+        for converter in local_converters:
+
+            # type is something like "matches files", replace isinstance with "type_matches"
+            # match function tests regexp for example
+            if (converter.typecheck(element) and (
+                    restricted_path is None or element.name == restricted_path[0])
+                    and converter.match(element) is not None):
+                path_found = True
+                generalStore_copy = generalStore.create_scoped_copy()
+                recordStore_copy = recordStore.create_scoped_copy()
+
+                # Create an entry for this matched structure element that contains the path:
+                generalStore_copy[converter.name] = (
+                    os.path.join(*(structure_elements_path + [element.get_name()])))
+
+                # extracts values from structure element and stores them in the
+                # variable store
+                converter.create_values(generalStore_copy, element)
+
+                keys_modified = converter.create_records(
+                    generalStore_copy, recordStore_copy, element)
+
+                children = converter.create_children(generalStore_copy, element)
+
+                if self.debug:
+                    # add provenance information for each variable
+                    self.debug_tree[str(element)] = (
+                        generalStore_copy.get_storage(), recordStore_copy.get_storage())
+                    self.debug_metadata["copied"][str(element)] = (
+                        generalStore_copy.get_dict_copied(),
+                        recordStore_copy.get_dict_copied())
+                    self.debug_metadata["usage"][str(element)].add(
+                        "/".join(converters_path + [converter.name]))
+                    mod_info = self.debug_metadata["provenance"]
+                    for record_name, prop_name in keys_modified:
+                        # TODO: check
+                        internal_id = recordStore_copy.get_internal_id(
+                            record_name)
+                        record_identifier = record_name + \
+                            "_" + str(internal_id)
+                        converter.metadata["usage"].add(record_identifier)
+                        mod_info[record_identifier][prop_name] = (
+                            structure_elements_path + [element.get_name()],
+                            converters_path + [converter.name])
+
+                self._crawl(children, converter.converters,
+                            generalStore_copy, recordStore_copy,
+                            structure_elements_path + [element.get_name()],
+                            converters_path + [converter.name],
+                            restricted_path[1:] if restricted_path is not None else None)
+
+    if restricted_path and not path_found:
+        raise RuntimeError("A 'restricted_path' argument was given that is not contained in "
+                           "the data tree")
+    # if the crawler is running out of scope, copy all records in
+    # the recordStore, that were created in this scope
+    # to the general update container.
+    scoped_records = recordStore.get_records_current_scope()
+    for record in scoped_records:
+        self.crawled_data.append(record)
+
+    # TODO: the scoped variables should be cleaned up as soon as the variables
+    # are no longer in the current scope. This can be implemented as follows,
+    # but this breaks the test "test_record_structure_generation", because
+    # some debug info is also deleted. This implementation can be used as soon
+    # as the remaining problems with the debug_tree are fixed.
+    # Delete the variables that are no longer needed:
+    # scoped_names = recordStore.get_names_current_scope()
+    # for name in scoped_names:
+    #     del recordStore[name]
+    #     del generalStore[name]
+
+    return self.crawled_data
+
+
+
 # --------------------------------------------------------------------------------
-# Main scanning functions:
+# Main scanning interface functions:
 # --------------------------------------------------------------------------------


-def crawl_directory(self, dirname: str, crawler_definition_path: str,
+def crawl_directory(dirname: str, crawler_definition_path: str,
                     restricted_path: Optional[list[str]] = None):
     """
     Crawl a single directory.
@@ -236,15 +349,15 @@ def crawl_directory(self, dirname: str, crawler_definition_path: str,
         is reached, traverse the full tree as normal.
     """

-    crawler_definition = self.load_definition(crawler_definition_path)
+    crawler_definition = load_definition(crawler_definition_path)
     # Load and register converter packages:
-    converter_registry = self.load_converters(crawler_definition)
+    converter_registry = load_converters(crawler_definition)

     if not dirname:
         raise ValueError(
             "You have to provide a non-empty path for crawling.")
     dir_structure_name = os.path.basename(dirname)
-    self.crawled_directory = dirname
+    crawled_directory = dirname
     if not dir_structure_name and dirname.endswith('/'):
         if dirname == '/':
             # Crawling the entire file system
@@ -253,7 +366,7 @@ def crawl_directory(self, dirname: str, crawler_definition_path: str,
             # dirname had a trailing '/'
             dir_structure_name = os.path.basename(dirname[:-1])

-    self.start_crawling(Directory(dir_structure_name,
+    start_crawling(Directory(dir_structure_name,
                                   dirname),
                         crawler_definition,
                         converter_registry,
@@ -261,7 +374,7 @@ def crawl_directory(self, dirname: str, crawler_definition_path: str,
                         )


-def start_crawling(self, items: Union[list[StructureElement], StructureElement],
+def start_crawling(items: Union[list[StructureElement], StructureElement],
                    crawler_definition: dict,
                    converter_registry: dict,
                    restricted_path: Optional[list[str]] = None):
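
For orientation, a minimal usage sketch of the relocated interface follows. It assumes that crawl_directory is importable as a module-level function from caoscrawler.scanner once this move is complete; the directory path, cfood file name and restricted_path value below are placeholders, and the return behaviour of crawl_directory is not shown in the hunks above.

    # Hypothetical call after the refactor: no Crawler instance is needed,
    # crawl_directory() loads the definition and converter registry itself
    # via load_definition() and load_converters(), as shown in the diff.
    from caoscrawler.scanner import crawl_directory

    crawl_directory("/data/project",        # placeholder: directory to scan
                    "cfood.yml",            # placeholder: crawler definition
                    restricted_path=None)   # or a list of element names to
                                            # restrict traversal (see _crawl)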