diff --git a/CHANGELOG.md b/CHANGELOG.md
index c0186b47835c8e61ab3b6876b2b420795e43fb49..5f3c0cf58115f760cf248d4e87b2c1b27fed5d5c 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -9,6 +9,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ### Added ###
 - DateElementConverter: allows to interpret text as a date object
+- the restricted_path argument allows crawling only a subtree
 
 ### Changed ###
 
@@ -19,9 +20,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 ### Fixed ###
 - an empty string as name is treated as no name (as does the server). This,
   fixes queries for identifiables since it would contain "WITH name=''" otherwise
-  which is an impossible condition. If your cfoods contained this case, it is
-  possible that Records are now matched that were not before. You need to adjust
-  your identifiable definition if this is not wanted.
+  which is an impossible condition. If your cfoods contained this case, they are ill-defined.
 
 ### Security ###
 
diff --git a/src/caoscrawler/crawl.py b/src/caoscrawler/crawl.py
index 18ecda75e995ebc6f6cdff9e4047c1453b3c2f2d..caf98e75b81b743d67af51d89e643d08cf52947b 100644
--- a/src/caoscrawler/crawl.py
+++ b/src/caoscrawler/crawl.py
@@ -420,11 +420,16 @@ class Crawler(object):
                     value["class"] = getattr(module, value["converter"])
         return converter_registry
 
-    def crawl_directory(self, dirname: str, crawler_definition_path: str):
+    def crawl_directory(self, dirname: str, crawler_definition_path: str,
+                        restricted_path: Optional[list[str]] = None):
         """ Crawl a single directory.
 
         Convenience function that starts the crawler (calls start_crawling)
         with a single directory as the StructureElement.
+
+        restricted_path: optional, list of strings
+            Traverse the data tree only along the given path. When the end of the given path
+            is reached, traverse the full tree as normal.
         """
 
         crawler_definition = self.load_definition(crawler_definition_path)
@@ -447,7 +452,9 @@ class Crawler(object):
 
         self.start_crawling(Directory(dir_structure_name, dirname),
                             crawler_definition,
-                            converter_registry)
+                            converter_registry,
+                            restricted_path=restricted_path
+                            )
 
     @staticmethod
     def initialize_converters(crawler_definition: dict, converter_registry: dict):
@@ -475,7 +482,8 @@ class Crawler(object):
 
     def start_crawling(self, items: Union[list[StructureElement], StructureElement],
                        crawler_definition: dict,
-                       converter_registry: dict):
+                       converter_registry: dict,
+                       restricted_path: Optional[list[str]] = None):
         """
         Start point of the crawler recursion.
 
@@ -487,6 +495,9 @@ class Crawler(object):
         crawler_definition : dict
             A dictionary representing the crawler definition,
             possibly from a yaml file.
+        restricted_path: optional, list of strings
+            Traverse the data tree only along the given path. When the end of the given path
+            is reached, traverse the full tree as normal.
 
         Returns
         -------
@@ -507,8 +518,14 @@ class Crawler(object):
 
         # This recursive crawling procedure generates the update list:
         self.crawled_data: list[db.Record] = []
-        self._crawl(items, local_converters, self.generalStore, self.recordStore, [], [])
-
+        self._crawl(
+            items=items,
+            local_converters=local_converters,
+            generalStore=self.generalStore,
+            recordStore=self.recordStore,
+            structure_elements_path=[],
+            converters_path=[],
+            restricted_path=restricted_path)
         if self.debug:
             self.debug_converters = local_converters
 
@@ -1159,11 +1176,14 @@ ____________________\n""".format(i + 1, len(pending_changes)) + str(el[3]))
         with open(filename, "w") as f:
             f.write(yaml.dump(paths, sort_keys=False))
 
-    def _crawl(self, items: list[StructureElement],
+    def _crawl(self,
+               items: list[StructureElement],
                local_converters: list[Converter],
                generalStore: GeneralStore,
                recordStore: RecordStore,
-               structure_elements_path: list[str], converters_path: list[str]):
+               structure_elements_path: list[str],
+               converters_path: list[str],
+               restricted_path: Optional[list[str]] = None):
         """
         Crawl a list of StructureElements and apply any matching converters.
 
@@ -1172,16 +1192,31 @@ ____________________\n""".format(i + 1, len(pending_changes)) + str(el[3]))
            treating structure elements. A locally defined converter could be
            one that is only valid for a specific subtree of the originally
            cralwed StructureElement structure.
-        generalStore and recordStore: This recursion of the crawl function should only operate on copies of the
-            global stores of the Crawler object.
+        generalStore and recordStore: This recursion of the crawl function should only operate on
+            copies of the global stores of the Crawler object.
+        restricted_path: optional, list of strings, traverse the data tree only along the given
+                         path. For example, when a directory contains files a, b and c and b is
+                         given in restricted_path, a and c will be ignored by the crawler.
+                         When the end of the given path is reached, traverse the full tree as
+                         normal. The first element of the list provided by restricted_path should
+                         be the name of the StructureElement at this level, i.e. denoting the
+                         respective element in the items argument.
""" + # This path_found variable stores wether the path given by restricted_path was found in the + # data tree + path_found = False + if restricted_path is not None and len(restricted_path) == 0: + restricted_path = None + for element in items: for converter in local_converters: # type is something like "matches files", replace isinstance with "type_matches" # match function tests regexp for example - if (converter.typecheck(element) and - converter.match(element) is not None): + if (converter.typecheck(element) and ( + restricted_path is None or element.name == restricted_path[0]) + and converter.match(element) is not None): + path_found = True generalStore_copy = generalStore.create_scoped_copy() recordStore_copy = recordStore.create_scoped_copy() @@ -1222,7 +1257,12 @@ ____________________\n""".format(i + 1, len(pending_changes)) + str(el[3])) self._crawl(children, converter.converters, generalStore_copy, recordStore_copy, structure_elements_path + [element.get_name()], - converters_path + [converter.name]) + converters_path + [converter.name], + restricted_path[1:] if restricted_path is not None else None) + + if restricted_path and not path_found: + raise RuntimeError("A 'restricted_path' argument was given that is not contained in " + "the data tree") # if the crawler is running out of scope, copy all records in # the recordStore, that were created in this scope # to the general update container. @@ -1253,6 +1293,7 @@ def crawler_main(crawled_directory_path: str, prefix: str = "", securityMode: SecurityMode = SecurityMode.UPDATE, unique_names=True, + restricted_path: Optional[list[str]] = None ): """ @@ -1276,6 +1317,9 @@ def crawler_main(crawled_directory_path: str, securityMode of Crawler unique_names : bool whether or not to update or insert entities inspite of name conflicts + restricted_path: optional, list of strings + Traverse the data tree only along the given path. When the end of the given path + is reached, traverse the full tree as normal. Returns ------- @@ -1284,7 +1328,7 @@ def crawler_main(crawled_directory_path: str, """ crawler = Crawler(debug=debug, securityMode=securityMode) try: - crawler.crawl_directory(crawled_directory_path, cfood_file_name) + crawler.crawl_directory(crawled_directory_path, cfood_file_name, restricted_path) except ConverterValidationError as err: print(err) return 1 @@ -1349,6 +1393,15 @@ def parse_args(): formatter_class=RawTextHelpFormatter) parser.add_argument("cfood_file_name", help="Path name of the cfood yaml file to be used.") + mg = parser.add_mutually_exclusive_group() + mg.add_argument("-r", "--restrict", nargs="*", + help="Restrict the crawling to the subtree at the end of the given path." + "I.e. for each level that is given the crawler only treats the element " + "with the given name.") + mg.add_argument("--restrict-path", help="same as restrict; instead of a list, this takes a " + "single string that is interpreded as file system path. Note that a trailing" + "separator (e.g. '/') will be ignored. Use --restrict if you need to have " + "empty strings.") parser.add_argument("--provenance", required=False, help="Path name of the provenance yaml file. 
" "This file will only be generated if this option is set.") @@ -1380,6 +1433,15 @@ def parse_args(): return parser.parse_args() +def split_restricted_path(path): + elements = [] + while path != "/": + path, el = os.path.split(path) + if el != "": + elements.insert(0, el) + return elements + + def main(): args = parse_args() @@ -1395,6 +1457,11 @@ def main(): if args.add_cwd_to_path: sys.path.append(os.path.abspath(".")) + if args.restrict_path: + restricted_path = split_restricted_path(args.restrict_path) + if args.restrict: + restricted_path = args.restrict + sys.exit(crawler_main( crawled_directory_path=args.crawled_directory_path, cfood_file_name=args.cfood_file_name, @@ -1407,6 +1474,7 @@ def main(): "insert": SecurityMode.INSERT, "update": SecurityMode.UPDATE}[args.security_mode], unique_names=args.unique_names, + restricted_path=restricted_path )) diff --git a/unittests/test_tool.py b/unittests/test_tool.py index 6a828532c1de9796008a6e51c21811f83b85657a..187ec06e097a3aba1053c865eac1190654a267c0 100755 --- a/unittests/test_tool.py +++ b/unittests/test_tool.py @@ -26,10 +26,11 @@ Tests for the tool using pytest Adapted from check-sfs """ +from caoscrawler.stores import GeneralStore, RecordStore import os -from caoscrawler.crawl import Crawler, SecurityMode +from caoscrawler.crawl import Crawler, SecurityMode, split_restricted_path from caoscrawler.identifiable import Identifiable -from caoscrawler.structure_elements import File, DictTextElement, DictListElement +from caoscrawler.structure_elements import File, DictTextElement, DictListElement, DictElement from caoscrawler.identifiable_adapters import IdentifiableAdapter, LocalStorageIdentifiableAdapter from simulated_server_data import full_data from functools import partial @@ -867,3 +868,98 @@ def test_split_into_inserts_and_updates_diff_backref(crawler_mocked_for_backref_ insert, update = crawler.split_into_inserts_and_updates(deepcopy(entlist)) assert len(update) == 2 assert len(insert) == 1 + + +def mock_create_values(values, element): + pass + + +@patch("caoscrawler.converters.IntegerElementConverter.create_values") +def test_restricted_path(create_mock): + """ + The restricted_path argument allows to ignroe part of the crawled data structure. Here, we make + sure, that is that argument is provided, ideed only the given path of the tree is traversed. + + The check is done using the mock of the create_values function of the IntegerElementConverter. + This function is only called if elements are being treated. 
+ """ + crawler_definition = { + "DictTest": { + "type": "DictElement", + "match": "(.*)", + "subtree": { + "nextdict": { + "type": "DictElement", + "match": "(.*)", + "subtree": { + "int_element": { + "type": "IntegerElement", + "match_name": ".*", + "match_value": "(?P<int_value>.*)", + "records": { + "Dataset": { + "Subject": "$int_value" + } + } + } + } + } + } + } + } + + crawler = Crawler(debug=True) + converter_registry = crawler.load_converters(crawler_definition) + + # This structure is crawled + test_dict = { + "v1": { + "a": 1, + "b": 2, + }, + "v2": { + "c": 3, + "d": 4, + } + } + # first test without a restricted_path + restricted_path = None + records = crawler.start_crawling( + DictElement("TestDict", test_dict), crawler_definition, converter_registry, + restricted_path + ) + assert create_mock.call_count == 4 + create_mock.reset_mock() + + # test with a restricted_path but one that has no effect (single root element) + # this also tests that the remainder of the tree is fully traversed + restricted_path = ["TestDict"] + records = crawler.start_crawling( + DictElement("TestDict", test_dict), crawler_definition, converter_registry, + restricted_path + ) + assert create_mock.call_count == 4 + create_mock.reset_mock() + + # test with a restricted_path that restricts the tree (single root element) + restricted_path = ["TestDict", "v2"] + records = crawler.start_crawling( + DictElement("TestDict", test_dict), crawler_definition, converter_registry, + restricted_path + ) + assert create_mock.call_count == 2 + create_mock.reset_mock() + + # test with a restricted_path that contains a bad element + restricted_path = ["TestDict", "v3"] + with raises(RuntimeError): + records = crawler.start_crawling( + DictElement("TestDict", test_dict), crawler_definition, converter_registry, + restricted_path + ) + + +def test_split_restricted_path(): + assert ["el"] == split_restricted_path("/el") + assert ["el"] == split_restricted_path("/el/") + assert ["el", "el"] == split_restricted_path("/el/el")