diff --git a/src/caoscrawler/crawl.py b/src/caoscrawler/crawl.py index 0704cee918f771bb01ec624b01aa529821a29edc..dd84adb11a6e98fc36d0c6ace1b93b5ac861db24 100644 --- a/src/caoscrawler/crawl.py +++ b/src/caoscrawler/crawl.py @@ -160,7 +160,6 @@ class Crawler(object): """ def __init__(self, - converters: List[Converter] = [], generalStore: Optional[GeneralStore] = None, debug: bool = False, identifiableAdapter: IdentifiableAdapter = None, @@ -171,8 +170,6 @@ class Crawler(object): Parameters ---------- - converters : List[Converter] - The set of converters used for this crawler. recordStore : GeneralStore An initial GeneralStore which might store e.g. environment variables. debug : bool @@ -191,7 +188,6 @@ class Crawler(object): """ # TODO: check if this feature is really needed - self.global_converters = converters self.identified_cache = IdentifiedCache() self.recordStore = RecordStore() @@ -376,9 +372,8 @@ class Crawler(object): converter_registry) @staticmethod - def create_local_converters(crawler_definition: dict, - converter_registry: dict): - local_converters = [] + def initialize_converters(crawler_definition: dict, converter_registry: dict): + converters = [] for key, value in crawler_definition.items(): # Definitions and Converters are reserved keywords @@ -390,10 +385,9 @@ class Crawler(object): continue elif key == "Converters": continue - local_converters.append(Converter.converter_factory( - value, key, converter_registry)) + converters.append(Converter.converter_factory(value, key, converter_registry)) - return local_converters + return converters def start_crawling(self, items: Union[List[StructureElement], StructureElement], crawler_definition: dict, @@ -425,16 +419,14 @@ class Crawler(object): items = [items] self.run_id = uuid.uuid1() - local_converters = Crawler.create_local_converters(crawler_definition, - converter_registry) + local_converters = Crawler.initialize_converters(crawler_definition, converter_registry) # This recursive crawling procedure generates the update list: self.target_data: List[db.Record] = [] - self._crawl(items, - self.global_converters, local_converters, self.generalStore, self.recordStore, - [], []) + self._crawl(items, local_converters, self.generalStore, + self.recordStore, [], []) if self.debug: - self.debug_converters = self.global_converters + local_converters + self.debug_converters = local_converters return self.target_data @@ -928,7 +920,6 @@ ____________________\n""".format(i + 1, len(pending_changes)) + str(el[3])) f.write(yaml.dump(paths, sort_keys=False)) def _crawl(self, items: List[StructureElement], - global_converters: List[Converter], local_converters: List[Converter], generalStore: GeneralStore, recordStore: RecordStore, @@ -937,7 +928,7 @@ ____________________\n""".format(i + 1, len(pending_changes)) + str(el[3])) Crawl a list of StructureElements and apply any matching converters. items: structure_elements (e.g. files and folders on one level on the hierarchy) - global_converters and local_converters: globally or locally defined converters for + local_converters: locally defined converters for treating structure elements. A locally defined converter could be one that is only valid for a specific subtree of the originally cralwed StructureElement structure. @@ -945,7 +936,8 @@ ____________________\n""".format(i + 1, len(pending_changes)) + str(el[3])) global stores of the Crawler object. """ for element in items: - for converter in global_converters + local_converters: + for converter in local_converters: + # type is something like "matches files", replace isinstance with "type_matches" # match function tests regexp for example if (converter.typecheck(element) and @@ -985,7 +977,7 @@ ____________________\n""".format(i + 1, len(pending_changes)) + str(el[3])) mod_info[record_identifier][prop_name] = (structure_elements_path + [element.get_name()], converters_path + [converter.name]) - self._crawl(children, global_converters, converter.converters, + self._crawl(children, converter.converters, generalStore_copy, recordStore_copy, structure_elements_path + [element.get_name()], converters_path + [converter.name])