From e7e872d2eecb32c85f0243b6bf0400da4a464059 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Henrik=20tom=20W=C3=B6rden?= <h.tomwoerden@indiscale.com>
Date: Wed, 1 Feb 2023 22:06:55 +0100
Subject: [PATCH] ENH: allow running the crawler on a subtree
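
Add an optional restricted_path argument to start_crawling, crawl_directory
and crawler_main, together with a -r/--restrict command line option. When
given, the crawler traverses the data tree only along that path and crawls
the remaining subtree as usual once the end of the path is reached.

A minimal sketch of the intended Python usage (the directory name and the
cfood file name are made up for illustration):

    from caoscrawler.crawl import Crawler

    crawler = Crawler(debug=True)
    # Only descend along DataAnalysis/2020_project; everything below that
    # point is crawled as usual.
    crawler.crawl_directory("extroot", "cfood.yml",
                            restricted_path=["DataAnalysis", "2020_project"])

On the command line, the same restriction can be requested with
-r/--restrict followed by the path elements.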

---
 CHANGELOG.md             |  5 +--
 src/caoscrawler/crawl.py | 65 ++++++++++++++++++++++------
 unittests/test_tool.py   | 92 +++++++++++++++++++++++++++++++++++++++-
 3 files changed, 146 insertions(+), 16 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index c0186b47..5f3c0cf5 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -9,6 +9,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ### Added ###
 - DateElementConverter: allows to interpret text as a date object
+- the new restricted_path argument allows crawling only a subtree of the data tree
 
 ### Changed ###
 
@@ -19,9 +20,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 ### Fixed ###
 - an empty string as name is treated as no name (as does the server). This, fixes
   queries for identifiables since it would contain "WITH name=''" otherwise
-  which is an impossible condition. If your cfoods contained this case, it is
-  possible that Records are now matched that were not before. You need to adjust
-  your identifiable definition if this is not wanted.
+  which is an impossible condition. If your cfoods contained this case, they are ill-defined.
 
 ### Security ###
 
diff --git a/src/caoscrawler/crawl.py b/src/caoscrawler/crawl.py
index 18ecda75..49fd779b 100644
--- a/src/caoscrawler/crawl.py
+++ b/src/caoscrawler/crawl.py
@@ -420,11 +420,14 @@ class Crawler(object):
             value["class"] = getattr(module, value["converter"])
         return converter_registry
 
-    def crawl_directory(self, dirname: str, crawler_definition_path: str):
+    def crawl_directory(self, dirname: str, crawler_definition_path: str,
+                        restricted_path: Optional[list[str]] = None):
         """ Crawl a single directory.
 
         Convenience function that starts the crawler (calls start_crawling)
         with a single directory as the StructureElement.
+
+        restricted_path: optional, list of strings; see start_crawling
         """
 
         crawler_definition = self.load_definition(crawler_definition_path)
@@ -475,7 +478,8 @@ class Crawler(object):
 
     def start_crawling(self, items: Union[list[StructureElement], StructureElement],
                        crawler_definition: dict,
-                       converter_registry: dict):
+                       converter_registry: dict,
+                       restricted_path: Optional[list[str]] = None):
         """
         Start point of the crawler recursion.
 
@@ -487,6 +491,9 @@ class Crawler(object):
         crawler_definition : dict
              A dictionary representing the crawler definition, possibly from a yaml
              file.
+        restricted_path: optional, list of strings
+             Traverse the data tree only along the given path. When the end of the given path
+             is reached, traverse the full tree as normal.
 
         Returns
         -------
@@ -507,8 +514,14 @@ class Crawler(object):
 
         # This recursive crawling procedure generates the update list:
         self.crawled_data: list[db.Record] = []
-        self._crawl(items, local_converters, self.generalStore, self.recordStore, [], [])
-
+        self._crawl(
+            items=items,
+            local_converters=local_converters,
+            generalStore=self.generalStore,
+            recordStore=self.recordStore,
+            structure_elements_path=[],
+            converters_path=[],
+            restricted_path=restricted_path)
         if self.debug:
             self.debug_converters = local_converters
 
@@ -1159,11 +1172,14 @@ ____________________\n""".format(i + 1, len(pending_changes)) + str(el[3]))
         with open(filename, "w") as f:
             f.write(yaml.dump(paths, sort_keys=False))
 
-    def _crawl(self, items: list[StructureElement],
+    def _crawl(self,
+               items: list[StructureElement],
                local_converters: list[Converter],
                generalStore: GeneralStore,
                recordStore: RecordStore,
-               structure_elements_path: list[str], converters_path: list[str]):
+               structure_elements_path: list[str],
+               converters_path: list[str],
+               restricted_path: Optional[list[str]] = None):
         """
         Crawl a list of StructureElements and apply any matching converters.
 
@@ -1172,16 +1188,28 @@ ____________________\n""".format(i + 1, len(pending_changes)) + str(el[3]))
                             treating structure elements. A locally defined converter could be
                             one that is only valid for a specific subtree of the originally
                             cralwed StructureElement structure.
-        generalStore and recordStore: This recursion of the crawl function should only operate on copies of the
-                            global stores of the Crawler object.
+        generalStore and recordStore: This recursion of the crawl function should only operate on
+                                      copies of the global stores of the Crawler object.
+        restricted_path: optional, list of strings; traverse the data tree only along the given
+                         path. When the end of the given path is reached, traverse the full tree as
+                         normal. The given path contains only the untreated levels, i.e. the first
+                         element is considered at this level.
         """
+        # This path_found variable stores whether the path given by restricted_path was found in
+        # the data tree
+        path_found = False
+        if restricted_path is not None and len(restricted_path) == 0:
+            restricted_path = None
+
         for element in items:
             for converter in local_converters:
 
                 # type is something like "matches files", replace isinstance with "type_matches"
                 # match function tests regexp for example
-                if (converter.typecheck(element) and
-                        converter.match(element) is not None):
+                if (converter.typecheck(element) and (
+                        restricted_path is None or element.name == restricted_path[0])
+                        and converter.match(element) is not None):
+                    path_found = True
                     generalStore_copy = generalStore.create_scoped_copy()
                     recordStore_copy = recordStore.create_scoped_copy()
 
@@ -1222,7 +1250,12 @@ ____________________\n""".format(i + 1, len(pending_changes)) + str(el[3]))
                     self._crawl(children, converter.converters,
                                 generalStore_copy, recordStore_copy,
                                 structure_elements_path + [element.get_name()],
-                                converters_path + [converter.name])
+                                converters_path + [converter.name],
+                                restricted_path[1:] if restricted_path is not None else None)
+
+        if restricted_path and not path_found:
+            raise RuntimeError("A 'restricted_path' argument was given, but the corresponding path "
+                               "does not exist in the data tree.")
         # if the crawler is running out of scope, copy all records in
         # the recordStore, that were created in this scope
         # to the general update container.
@@ -1253,6 +1286,7 @@ def crawler_main(crawled_directory_path: str,
                  prefix: str = "",
                  securityMode: SecurityMode = SecurityMode.UPDATE,
                  unique_names=True,
+                 restricted_path: Optional[list[str]] = None
                  ):
     """
 
@@ -1276,6 +1310,8 @@ def crawler_main(crawled_directory_path: str,
         securityMode of Crawler
     unique_names : bool
         whether or not to update or insert entities inspite of name conflicts
+    restricted_path : optional, list of str
+        see start_crawling
 
     Returns
     -------
@@ -1284,7 +1320,7 @@ def crawler_main(crawled_directory_path: str,
     """
     crawler = Crawler(debug=debug, securityMode=securityMode)
     try:
-        crawler.crawl_directory(crawled_directory_path, cfood_file_name)
+        crawler.crawl_directory(crawled_directory_path, cfood_file_name, restricted_path)
     except ConverterValidationError as err:
         print(err)
         return 1
@@ -1376,6 +1412,10 @@ def parse_args():
     parser.add_argument("-p", "--prefix",
                         help="Remove the given prefix from the paths "
                         "of all file objects.")
+    parser.add_argument("-r", "--restrict",
+                        help="Restrict the crawling to the subtree at the end of the given path."
+                        "I.e. for each level that is given the crawler only treats the element "
+                        "with the given name.")
 
     return parser.parse_args()
 
@@ -1407,6 +1447,7 @@ def main():
                       "insert": SecurityMode.INSERT,
                       "update": SecurityMode.UPDATE}[args.security_mode],
         unique_names=args.unique_names,
+        restricted_path=args.restrict
     ))
 
 
diff --git a/unittests/test_tool.py b/unittests/test_tool.py
index 6a828532..8ea8b93b 100755
--- a/unittests/test_tool.py
+++ b/unittests/test_tool.py
@@ -26,10 +26,11 @@ Tests for the tool using pytest
 Adapted from check-sfs
 """
 
+from caoscrawler.stores import GeneralStore, RecordStore
 import os
 from caoscrawler.crawl import Crawler, SecurityMode
 from caoscrawler.identifiable import Identifiable
-from caoscrawler.structure_elements import File, DictTextElement, DictListElement
+from caoscrawler.structure_elements import File, DictTextElement, DictListElement, DictElement
 from caoscrawler.identifiable_adapters import IdentifiableAdapter, LocalStorageIdentifiableAdapter
 from simulated_server_data import full_data
 from functools import partial
@@ -867,3 +868,92 @@ def test_split_into_inserts_and_updates_diff_backref(crawler_mocked_for_backref_
     insert, update = crawler.split_into_inserts_and_updates(deepcopy(entlist))
     assert len(update) == 2
     assert len(insert) == 1
+
+
+def mock_create_values(values, element):
+    pass
+
+
+@patch("caoscrawler.converters.IntegerElementConverter.create_values")
+def test_restricted_path(create_mock):
+    """
+    The restricted_path argument allows ignoring parts of the crawled data structure. Here, we make
+    sure that, if that argument is provided, indeed only the given path of the tree is traversed.
+
+    The check is done using the mock of the create_values function of the IntegerElementConverter.
+    This function is only called if elements are being treated.
+    """
+    crawler_definition = {
+        "DictTest": {
+            "type": "DictElement",
+            "match": "(.*)",
+            "subtree": {
+                "nextdict": {
+                    "type": "DictElement",
+                    "match": "(.*)",
+                    "subtree": {
+                        "int_element": {
+                            "type": "IntegerElement",
+                            "match_name": ".*",
+                            "match_value": "(?P<int_value>.*)",
+                            "records": {
+                                "Dataset": {
+                                    "Subject": "$int_value"
+                                }
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    }
+
+    crawler = Crawler(debug=True)
+    converter_registry = crawler.load_converters(crawler_definition)
+
+    # This structure is crawled
+    test_dict = {
+        "v1": {
+            "a": 1,
+            "b": 2,
+        },
+        "v2": {
+            "c": 3,
+            "d": 4,
+        }
+    }
+    # first test without a restricted_path
+    restricted_path = None
+    records = crawler.start_crawling(
+        DictElement("TestDict", test_dict), crawler_definition, converter_registry,
+        restricted_path
+    )
+    assert create_mock.call_count == 4
+    create_mock.reset_mock()
+
+    # test with a restricted_path but one that has no effect (single root element)
+    # this also tests that the remainder of the tree is fully traversed
+    restricted_path = ["TestDict"]
+    records = crawler.start_crawling(
+        DictElement("TestDict", test_dict), crawler_definition, converter_registry,
+        restricted_path
+    )
+    assert create_mock.call_count == 4
+    create_mock.reset_mock()
+
+    # test with a restricted_path that restricts the tree (single root element)
+    restricted_path = ["TestDict", "v2"]
+    records = crawler.start_crawling(
+        DictElement("TestDict", test_dict), crawler_definition, converter_registry,
+        restricted_path
+    )
+    assert create_mock.call_count == 2
+    create_mock.reset_mock()
+
+    # test with a restricted_path that contains a bad element
+    restricted_path = ["TestDict", "v3"]
+    with raises(RuntimeError):
+        records = crawler.start_crawling(
+            DictElement("TestDict", test_dict), crawler_definition, converter_registry,
+            restricted_path
+        )
-- 
GitLab