CaosDB Crawler
Commit 31a6b372, authored 2 years ago by Alexander Schlemmer
MAINT: moved main scanner function to scanner module
parent f547fa39
Part of 2 merge requests: !108 "Release 0.5.0", !104 "Create a new scanner module and move functions from crawl module there"
Showing 2 changed files with 120 additions and 114 deletions:

  src/caoscrawler/crawl.py    +0 −107
  src/caoscrawler/scanner.py  +120 −7
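In effect, this commit turns the scanner entry points into plain module-level functions: the diffs below drop the self parameter from crawl_directory and start_crawling, and call load_definition and load_converters as scanner functions instead of Crawler methods. A rough before/after sketch of a call site, assuming the functions are importable from caoscrawler.scanner (the import path, the example directory and the cfood file name are placeholders, not taken from this commit):

# Before this refactoring: crawling is driven through Crawler methods
# (constructor arguments omitted; these lines are only for contrast).
# crawler = Crawler(...)
# crawler.crawl_directory("/data/project", "cfood.yml")

# After: the same operation as a module-level scanner function; only the
# signature is taken from the diff below, everything else is illustrative.
from caoscrawler.scanner import crawl_directory  # import path assumed

crawl_directory("/data/project", "cfood.yml", restricted_path=None)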
src/caoscrawler/crawl.py  +0 −107
@@ -903,113 +903,6 @@ ____________________\n""".format(i + 1, len(pending_changes)) + str(el[3]))
         with open(filename, "w") as f:
             f.write(yaml.dump(paths, sort_keys=False))
 
-    def _crawl(self,
-               items: list[StructureElement],
-               local_converters: list[Converter],
-               generalStore: GeneralStore,
-               recordStore: RecordStore,
-               structure_elements_path: list[str],
-               converters_path: list[str],
-               restricted_path: Optional[list[str]] = None):
-        """
-        Crawl a list of StructureElements and apply any matching converters.
-
-        items: structure_elements (e.g. files and folders on one level of the hierarchy)
-        local_converters: locally defined converters for
-            treating structure elements. A locally defined converter could be
-            one that is only valid for a specific subtree of the originally
-            crawled StructureElement structure.
-        generalStore and recordStore: This recursion of the crawl function should only operate on
-            copies of the global stores of the Crawler object.
-        restricted_path: optional, list of strings, traverse the data tree only along the given
-            path. For example, when a directory contains files a, b and c, and b is
-            given in restricted_path, a and c will be ignored by the crawler.
-            When the end of the given path is reached, traverse the full tree as
-            normal. The first element of the list provided by restricted_path should
-            be the name of the StructureElement at this level, i.e. denoting the
-            respective element in the items argument.
-        """
-        # This path_found variable stores whether the path given by restricted_path was found in the
-        # data tree
-        path_found = False
-        if restricted_path is not None and len(restricted_path) == 0:
-            restricted_path = None
-
-        for element in items:
-            for converter in local_converters:
-
-                # type is something like "matches files", replace isinstance with "type_matches"
-                # match function tests regexp for example
-                if (converter.typecheck(element)
-                        and (restricted_path is None or element.name == restricted_path[0])
-                        and converter.match(element) is not None):
-                    path_found = True
-                    generalStore_copy = generalStore.create_scoped_copy()
-                    recordStore_copy = recordStore.create_scoped_copy()
-
-                    # Create an entry for this matched structure element that contains the path:
-                    generalStore_copy[converter.name] = (
-                        os.path.join(*(structure_elements_path + [element.get_name()])))
-
-                    # extracts values from structure element and stores them in the
-                    # variable store
-                    converter.create_values(generalStore_copy, element)
-
-                    keys_modified = converter.create_records(
-                        generalStore_copy, recordStore_copy, element)
-
-                    children = converter.create_children(generalStore_copy, element)
-
-                    if self.debug:
-                        # add provenance information for each variable
-                        self.debug_tree[str(element)] = (
-                            generalStore_copy.get_storage(), recordStore_copy.get_storage())
-                        self.debug_metadata["copied"][str(element)] = (
-                            generalStore_copy.get_dict_copied(), recordStore_copy.get_dict_copied())
-                        self.debug_metadata["usage"][str(element)].add(
-                            "/".join(converters_path + [converter.name]))
-                        mod_info = self.debug_metadata["provenance"]
-                        for record_name, prop_name in keys_modified:
-                            # TODO: check
-                            internal_id = recordStore_copy.get_internal_id(record_name)
-                            record_identifier = record_name + \
-                                "_" + str(internal_id)
-                            converter.metadata["usage"].add(record_identifier)
-                            mod_info[record_identifier][prop_name] = (
-                                structure_elements_path + [element.get_name()],
-                                converters_path + [converter.name])
-
-                    self._crawl(children, converter.converters, generalStore_copy, recordStore_copy,
-                                structure_elements_path + [element.get_name()],
-                                converters_path + [converter.name],
-                                restricted_path[1:] if restricted_path is not None else None)
-
-        if restricted_path and not path_found:
-            raise RuntimeError("A 'restricted_path' argument was given that is not contained in "
-                               "the data tree")
-        # if the crawler is running out of scope, copy all records in
-        # the recordStore, that were created in this scope
-        # to the general update container.
-        scoped_records = recordStore.get_records_current_scope()
-        for record in scoped_records:
-            self.crawled_data.append(record)
-
-        # TODO: the scoped variables should be cleaned up as soon as the variables
-        # are no longer in the current scope. This can be implemented as follows,
-        # but this breaks the test "test_record_structure_generation", because
-        # some debug info is also deleted. This implementation can be used as soon
-        # as the remaining problems with the debug_tree are fixed.
-        # Delete the variables that are no longer needed:
-        # scoped_names = recordStore.get_names_current_scope()
-        # for name in scoped_names:
-        #     del recordStore[name]
-        #     del generalStore[name]
-
-        return self.crawled_data
-
 
 def crawler_main(crawled_directory_path: str,
                  cfood_file_name: str,
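The restricted_path behaviour described in the docstring of _crawl above can be illustrated with a small, self-contained sketch. The visit helper and the element names "a", "b" and "c" are made up for illustration; only the filter condition mirrors the check used in _crawl:

# Toy model of the restricted_path check in _crawl: an element is visited
# when no restriction is active or its name matches the first entry of
# restricted_path; an exhausted (empty) list lifts the restriction so the
# rest of the tree is traversed as normal.
def visit(names, restricted_path=None):
    if restricted_path is not None and len(restricted_path) == 0:
        restricted_path = None
    return [n for n in names if restricted_path is None or n == restricted_path[0]]

print(visit(["a", "b", "c"]))         # ['a', 'b', 'c']  -- no restriction
print(visit(["a", "b", "c"], ["b"]))  # ['b']            -- only the named element
print(visit(["a", "b", "c"], []))     # ['a', 'b', 'c']  -- end of path reached, traverse as normal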
src/caoscrawler/scanner.py  +120 −7
@@ -218,13 +218,126 @@ def initialize_converters(crawler_definition: dict, converter_registry: dict):
 
     return converters
 
+
+# --------------------------------------------------------------------------------
+# Main scanner function:
+# --------------------------------------------------------------------------------
+
+def _crawl(self,
+           items: list[StructureElement],
+           local_converters: list[Converter],
+           generalStore: GeneralStore,
+           recordStore: RecordStore,
+           structure_elements_path: list[str],
+           converters_path: list[str],
+           restricted_path: Optional[list[str]] = None):
+    """
+    Crawl a list of StructureElements and apply any matching converters.
+
+    items: structure_elements (e.g. files and folders on one level of the hierarchy)
+    local_converters: locally defined converters for
+        treating structure elements. A locally defined converter could be
+        one that is only valid for a specific subtree of the originally
+        crawled StructureElement structure.
+    generalStore and recordStore: This recursion of the crawl function should only operate on
+        copies of the global stores of the Crawler object.
+    restricted_path: optional, list of strings, traverse the data tree only along the given
+        path. For example, when a directory contains files a, b and c, and b is
+        given in restricted_path, a and c will be ignored by the crawler.
+        When the end of the given path is reached, traverse the full tree as
+        normal. The first element of the list provided by restricted_path should
+        be the name of the StructureElement at this level, i.e. denoting the
+        respective element in the items argument.
+    """
+    # This path_found variable stores whether the path given by restricted_path was found in the
+    # data tree
+    path_found = False
+    if restricted_path is not None and len(restricted_path) == 0:
+        restricted_path = None
+
+    for element in items:
+        for converter in local_converters:
+
+            # type is something like "matches files", replace isinstance with "type_matches"
+            # match function tests regexp for example
+            if (converter.typecheck(element)
+                    and (restricted_path is None or element.name == restricted_path[0])
+                    and converter.match(element) is not None):
+                path_found = True
+                generalStore_copy = generalStore.create_scoped_copy()
+                recordStore_copy = recordStore.create_scoped_copy()
+
+                # Create an entry for this matched structure element that contains the path:
+                generalStore_copy[converter.name] = (
+                    os.path.join(*(structure_elements_path + [element.get_name()])))
+
+                # extracts values from structure element and stores them in the
+                # variable store
+                converter.create_values(generalStore_copy, element)
+
+                keys_modified = converter.create_records(
+                    generalStore_copy, recordStore_copy, element)
+
+                children = converter.create_children(generalStore_copy, element)
+
+                if self.debug:
+                    # add provenance information for each variable
+                    self.debug_tree[str(element)] = (
+                        generalStore_copy.get_storage(), recordStore_copy.get_storage())
+                    self.debug_metadata["copied"][str(element)] = (
+                        generalStore_copy.get_dict_copied(), recordStore_copy.get_dict_copied())
+                    self.debug_metadata["usage"][str(element)].add(
+                        "/".join(converters_path + [converter.name]))
+                    mod_info = self.debug_metadata["provenance"]
+                    for record_name, prop_name in keys_modified:
+                        # TODO: check
+                        internal_id = recordStore_copy.get_internal_id(record_name)
+                        record_identifier = record_name + \
+                            "_" + str(internal_id)
+                        converter.metadata["usage"].add(record_identifier)
+                        mod_info[record_identifier][prop_name] = (
+                            structure_elements_path + [element.get_name()],
+                            converters_path + [converter.name])
+
+                self._crawl(children, converter.converters, generalStore_copy, recordStore_copy,
+                            structure_elements_path + [element.get_name()],
+                            converters_path + [converter.name],
+                            restricted_path[1:] if restricted_path is not None else None)
+
+    if restricted_path and not path_found:
+        raise RuntimeError("A 'restricted_path' argument was given that is not contained in "
+                           "the data tree")
+    # if the crawler is running out of scope, copy all records in
+    # the recordStore, that were created in this scope
+    # to the general update container.
+    scoped_records = recordStore.get_records_current_scope()
+    for record in scoped_records:
+        self.crawled_data.append(record)
+
+    # TODO: the scoped variables should be cleaned up as soon as the variables
+    # are no longer in the current scope. This can be implemented as follows,
+    # but this breaks the test "test_record_structure_generation", because
+    # some debug info is also deleted. This implementation can be used as soon
+    # as the remaining problems with the debug_tree are fixed.
+    # Delete the variables that are no longer needed:
+    # scoped_names = recordStore.get_names_current_scope()
+    # for name in scoped_names:
+    #     del recordStore[name]
+    #     del generalStore[name]
+
+    return self.crawled_data
+
 # --------------------------------------------------------------------------------
-# Main scanning functions:
+# Main scanning interface functions:
 # --------------------------------------------------------------------------------
 
-def crawl_directory(self, dirname: str, crawler_definition_path: str,
+def crawl_directory(dirname: str, crawler_definition_path: str,
                     restricted_path: Optional[list[str]] = None):
     """
     Crawl a single directory.

@@ -236,15 +349,15 @@ def crawl_directory(self, dirname: str, crawler_definition_path: str,
                      is reached, traverse the full tree as normal.
     """
-    crawler_definition = self.load_definition(crawler_definition_path)
+    crawler_definition = load_definition(crawler_definition_path)
     # Load and register converter packages:
-    converter_registry = self.load_converters(crawler_definition)
+    converter_registry = load_converters(crawler_definition)
 
     if not dirname:
         raise ValueError(
             "You have to provide a non-empty path for crawling.")
     dir_structure_name = os.path.basename(dirname)
-    self.crawled_directory = dirname
+    crawled_directory = dirname
     if not dir_structure_name and dirname.endswith('/'):
         if dirname == '/':
             # Crawling the entire file system

@@ -253,7 +366,7 @@ def crawl_directory(self, dirname: str, crawler_definition_path: str,
             # dirname had a trailing '/'
             dir_structure_name = os.path.basename(dirname[:-1])
 
-    self.start_crawling(Directory(dir_structure_name,
+    start_crawling(Directory(dir_structure_name,
                                   dirname),
                         crawler_definition,
                         converter_registry,

@@ -261,7 +374,7 @@ def crawl_directory(self, dirname: str, crawler_definition_path: str,
                         )
 
 
-def start_crawling(self, items: Union[list[StructureElement], StructureElement],
+def start_crawling(items: Union[list[StructureElement], StructureElement],
                    crawler_definition: dict,
                    converter_registry: dict,
                    restricted_path: Optional[list[str]] = None):
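As the body of crawl_directory above shows, the directory entry point simply wires together the module-level helpers: load the cfood definition, build the converter registry, then hand a Directory structure element to start_crawling. A hedged sketch of doing the same by hand; the cfood path, the data directory and the import locations are assumptions, only the call signatures come from the diff above:

# Sketch following the body of crawl_directory shown above.
from caoscrawler.scanner import load_definition, load_converters, start_crawling  # import locations assumed
from caoscrawler.structure_elements import Directory  # module path assumed

crawler_definition = load_definition("cfood.yml")          # placeholder cfood file
converter_registry = load_converters(crawler_definition)

start_crawling(Directory("project", "/data/project"),      # placeholder directory
               crawler_definition,
               converter_registry,
               restricted_path=None)

Per the Union type hint in its signature, start_crawling accepts either a single StructureElement or a list of them.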