From e7e872d2eecb32c85f0243b6bf0400da4a464059 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Henrik=20tom=20W=C3=B6rden?= <h.tomwoerden@indiscale.com>
Date: Wed, 1 Feb 2023 22:06:55 +0100
Subject: [PATCH] ENH: allow to run crawler on a subtree

---
 CHANGELOG.md             |  5 +--
 src/caoscrawler/crawl.py | 65 ++++++++++++++++++++++------
 unittests/test_tool.py   | 92 +++++++++++++++++++++++++++++++++++++++-
 3 files changed, 146 insertions(+), 16 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index c0186b47..5f3c0cf5 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -9,6 +9,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 ### Added ###

 - DateElementConverter: allows to interpret text as a date object
+- the restricted_path argument allows crawling only a subtree

 ### Changed ###

@@ -19,9 +20,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 ### Fixed ###

 - an empty string as name is treated as no name (as does the server). This
   fixes queries for identifiables since it would contain "WITH name=''" otherwise
-  which is an impossible condition. If your cfoods contained this case, it is
-  possible that Records are now matched that were not before. You need to adjust
-  your identifiable definition if this is not wanted.
+  which is an impossible condition. If your cfoods contained this case, they are ill-defined.

 ### Security ###

diff --git a/src/caoscrawler/crawl.py b/src/caoscrawler/crawl.py
index 18ecda75..49fd779b 100644
--- a/src/caoscrawler/crawl.py
+++ b/src/caoscrawler/crawl.py
@@ -420,11 +420,14 @@ class Crawler(object):
                 value["class"] = getattr(module, value["converter"])
         return converter_registry

-    def crawl_directory(self, dirname: str, crawler_definition_path: str):
+    def crawl_directory(self, dirname: str, crawler_definition_path: str,
+                        restricted_path: Optional[list[str]] = None):
         """
         Crawl a single directory.

         Convenience function that starts the crawler (calls start_crawling)
         with a single directory as the StructureElement.
+
+        restricted_path: see start_crawling
         """
         crawler_definition = self.load_definition(crawler_definition_path)
@@ -475,7 +478,8 @@
     def start_crawling(self, items: Union[list[StructureElement], StructureElement],
                        crawler_definition: dict,
-                       converter_registry: dict):
+                       converter_registry: dict,
+                       restricted_path: Optional[list[str]] = None):
         """
         Start point of the crawler recursion.

@@ -487,6 +491,9 @@
         crawler_definition : dict
             A dictionary representing the crawler definition, possibly from a yaml
             file.
+        restricted_path: optional, list of strings
+            Traverse the data tree only along the given path. When the end of the given path
+            is reached, traverse the full tree as normal.

         Returns
         -------
@@ -507,8 +514,14 @@
         # This recursive crawling procedure generates the update list:
         self.crawled_data: list[db.Record] = []
-        self._crawl(items, local_converters, self.generalStore, self.recordStore, [], [])
-
+        self._crawl(
+            items=items,
+            local_converters=local_converters,
+            generalStore=self.generalStore,
+            recordStore=self.recordStore,
+            structure_elements_path=[],
+            converters_path=[],
+            restricted_path=restricted_path)
         if self.debug:
             self.debug_converters = local_converters

@@ -1159,11 +1172,14 @@ ____________________\n""".format(i + 1, len(pending_changes)) + str(el[3]))
         with open(filename, "w") as f:
             f.write(yaml.dump(paths, sort_keys=False))

-    def _crawl(self, items: list[StructureElement],
+    def _crawl(self,
+               items: list[StructureElement],
                local_converters: list[Converter],
                generalStore: GeneralStore,
                recordStore: RecordStore,
-               structure_elements_path: list[str], converters_path: list[str]):
+               structure_elements_path: list[str],
+               converters_path: list[str],
+               restricted_path: Optional[list[str]] = None):
         """
         Crawl a list of StructureElements and apply any matching converters.

@@ -1172,16 +1188,28 @@ ____________________\n""".format(i + 1, len(pending_changes)) + str(el[3]))
                           treating structure elements. A locally defined converter could be
                           one that is only valid for a specific subtree of the originally
                           crawled StructureElement structure.
-        generalStore and recordStore: This recursion of the crawl function should only operate on copies of the
-        global stores of the Crawler object.
+        generalStore and recordStore: This recursion of the crawl function should only operate on
+                                      copies of the global stores of the Crawler object.
+        restricted_path: optional, list of strings, traverse the data tree only along the given
+                         path. When the end of the given path is reached, traverse the full tree as
+                         normal. The given path contains only the untreated levels, i.e. the first
+                         element is considered at this level.
         """
+        # This path_found variable stores whether the path given by restricted_path was found in
+        # the data tree
+        path_found = False
+        if restricted_path is not None and len(restricted_path) == 0:
+            restricted_path = None
+
         for element in items:
             for converter in local_converters:

                 # type is something like "matches files", replace isinstance with "type_matches"
                 # match function tests regexp for example
-                if (converter.typecheck(element) and
-                        converter.match(element) is not None):
+                if (converter.typecheck(element) and (
+                        restricted_path is None or element.name == restricted_path[0])
+                        and converter.match(element) is not None):
+                    path_found = True
                     generalStore_copy = generalStore.create_scoped_copy()
                     recordStore_copy = recordStore.create_scoped_copy()
@@ -1222,7 +1250,12 @@ ____________________\n""".format(i + 1, len(pending_changes)) + str(el[3]))
                     self._crawl(children, converter.converters,
                                 generalStore_copy, recordStore_copy,
                                 structure_elements_path + [element.get_name()],
-                                converters_path + [converter.name])
+                                converters_path + [converter.name],
+                                restricted_path[1:] if restricted_path is not None else None)
+
+        if restricted_path and not path_found:
+            raise RuntimeError("A 'restricted_path' argument was given that is not contained in "
+                               "the data tree")
         # if the crawler is running out of scope, copy all records in
         # the recordStore, that were created in this scope
         # to the general update container.
@@ -1253,6 +1286,7 @@ def crawler_main(crawled_directory_path: str,
                  prefix: str = "",
                  securityMode: SecurityMode = SecurityMode.UPDATE,
                  unique_names=True,
+                 restricted_path: Optional[list[str]] = None
                  ):
     """

@@ -1276,6 +1310,8 @@ def crawler_main(crawled_directory_path: str,
         securityMode of Crawler
     unique_names : bool
         whether or not to update or insert entities in spite of name conflicts
+    restricted_path : optional, list of str
+        see start_crawling

     Returns
     -------
@@ -1284,7 +1320,7 @@ def crawler_main(crawled_directory_path: str,
     """
     crawler = Crawler(debug=debug, securityMode=securityMode)
     try:
-        crawler.crawl_directory(crawled_directory_path, cfood_file_name)
+        crawler.crawl_directory(crawled_directory_path, cfood_file_name, restricted_path)
     except ConverterValidationError as err:
         print(err)
         return 1
@@ -1376,6 +1412,10 @@ def parse_args():
     parser.add_argument("-p", "--prefix",
                         help="Remove the given prefix from the paths "
                         "of all file objects.")
+    parser.add_argument("-r", "--restrict", nargs="*",
+                        help="Restrict the crawling to the subtree at the end of the given path. "
+                        "I.e. for each level that is given the crawler only treats the element "
+                        "with the given name.")

     return parser.parse_args()

@@ -1407,6 +1447,7 @@ def main():
             "insert": SecurityMode.INSERT,
             "update": SecurityMode.UPDATE}[args.security_mode],
         unique_names=args.unique_names,
+        restricted_path=args.restrict
     ))


diff --git a/unittests/test_tool.py b/unittests/test_tool.py
index 6a828532..8ea8b93b 100755
--- a/unittests/test_tool.py
+++ b/unittests/test_tool.py
@@ -26,10 +26,11 @@ Tests for the tool using pytest
 Adapted from check-sfs
 """

+from caoscrawler.stores import GeneralStore, RecordStore
 import os
 from caoscrawler.crawl import Crawler, SecurityMode
 from caoscrawler.identifiable import Identifiable
-from caoscrawler.structure_elements import File, DictTextElement, DictListElement
+from caoscrawler.structure_elements import File, DictTextElement, DictListElement, DictElement
 from caoscrawler.identifiable_adapters import IdentifiableAdapter, LocalStorageIdentifiableAdapter
 from simulated_server_data import full_data
 from functools import partial
@@ -867,3 +868,92 @@ def test_split_into_inserts_and_updates_diff_backref(crawler_mocked_for_backref_
     insert, update = crawler.split_into_inserts_and_updates(deepcopy(entlist))
     assert len(update) == 2
     assert len(insert) == 1
+
+
+def mock_create_values(values, element):
+    pass
+
+
+@patch("caoscrawler.converters.IntegerElementConverter.create_values")
+def test_restricted_path(create_mock):
+    """
+    The restricted_path argument allows to ignore parts of the crawled data structure. Here, we
+    make sure that if that argument is provided, indeed only the given path of the tree is
+    traversed.
+
+    The check is done using the mock of the create_values function of the IntegerElementConverter.
+    This function is only called if elements are being treated.
+    """
+    crawler_definition = {
+        "DictTest": {
+            "type": "DictElement",
+            "match": "(.*)",
+            "subtree": {
+                "nextdict": {
+                    "type": "DictElement",
+                    "match": "(.*)",
+                    "subtree": {
+                        "int_element": {
+                            "type": "IntegerElement",
+                            "match_name": ".*",
+                            "match_value": "(?P<int_value>.*)",
+                            "records": {
+                                "Dataset": {
+                                    "Subject": "$int_value"
+                                }
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    }
+
+    crawler = Crawler(debug=True)
+    converter_registry = crawler.load_converters(crawler_definition)
+
+    # This structure is crawled
+    test_dict = {
+        "v1": {
+            "a": 1,
+            "b": 2,
+        },
+        "v2": {
+            "c": 3,
+            "d": 4,
+        }
+    }
+    # first test without a restricted_path
+    restricted_path = None
+    records = crawler.start_crawling(
+        DictElement("TestDict", test_dict), crawler_definition, converter_registry,
+        restricted_path
+    )
+    assert create_mock.call_count == 4
+    create_mock.reset_mock()
+
+    # test with a restricted_path but one that has no effect (single root element)
+    # this also tests that the remainder of the tree is fully traversed
+    restricted_path = ["TestDict"]
+    records = crawler.start_crawling(
+        DictElement("TestDict", test_dict), crawler_definition, converter_registry,
+        restricted_path
+    )
+    assert create_mock.call_count == 4
+    create_mock.reset_mock()
+
+    # test with a restricted_path that restricts the tree (single root element)
+    restricted_path = ["TestDict", "v2"]
+    records = crawler.start_crawling(
+        DictElement("TestDict", test_dict), crawler_definition, converter_registry,
+        restricted_path
+    )
+    assert create_mock.call_count == 2
+    create_mock.reset_mock()
+
+    # test with a restricted_path that contains a bad element
+    restricted_path = ["TestDict", "v3"]
+    with raises(RuntimeError):
+        records = crawler.start_crawling(
+            DictElement("TestDict", test_dict), crawler_definition, converter_registry,
+            restricted_path
+        )
--
GitLab
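
For reference, a minimal sketch of how the new restricted_path argument is meant to be used
from Python. The cfood file name and the example data below are placeholders; only Crawler,
DictElement, load_definition, load_converters and start_crawling come from the existing API
and from this patch.

    from caoscrawler.crawl import Crawler
    from caoscrawler.structure_elements import DictElement

    crawler = Crawler(debug=True)
    crawler_definition = crawler.load_definition("cfood.yml")   # placeholder cfood file
    converter_registry = crawler.load_converters(crawler_definition)

    data = {"2022": {"a": 1}, "2023": {"b": 2}}

    # Only the element named "2023" is treated below the root; everything underneath it is
    # crawled as usual. The first entry of restricted_path names the root element itself.
    crawler.start_crawling(
        DictElement("ExampleData", data),
        crawler_definition,
        converter_registry,
        restricted_path=["ExampleData", "2023"],
    )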