diff --git a/CHANGELOG.md b/CHANGELOG.md index d2aae77aa7fe3c1807f9e3d998879c6316b9f3ce..fd7b354914dd87493b7b4530f8c2b5dcedc5930b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -20,7 +20,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Changed -* Renamed module from `newcrawler` to `caoscrawler` +* MAINT: Renamed module from `newcrawler` to `caoscrawler` +* MAINT: Removed global converters from `crawl.py` ### Deprecated diff --git a/src/caoscrawler/crawl.py b/src/caoscrawler/crawl.py index 1694ffa1d97ea06d702c93bc1642a130a5840dd2..06059ed1eb532948419bfadd98473333a108203e 100644 --- a/src/caoscrawler/crawl.py +++ b/src/caoscrawler/crawl.py @@ -165,7 +165,6 @@ class Crawler(object): """ def __init__(self, - converters: List[Converter] = [], generalStore: Optional[GeneralStore] = None, debug: bool = False, identifiableAdapter: IdentifiableAdapter = None, @@ -176,8 +175,6 @@ class Crawler(object): Parameters ---------- - converters : List[Converter] - The set of converters used for this crawler. recordStore : GeneralStore An initial GeneralStore which might store e.g. environment variables. debug : bool @@ -197,7 +194,6 @@ class Crawler(object): """ # TODO: check if this feature is really needed - self.global_converters = converters self.identified_cache = IdentifiedCache() self.recordStore = RecordStore() @@ -409,8 +405,7 @@ class Crawler(object): continue elif key == "Converters": continue - converters.append(Converter.converter_factory( - value, key, converter_registry)) + converters.append(Converter.converter_factory(value, key, converter_registry)) return converters @@ -448,12 +443,11 @@ class Crawler(object): crawler_definition, converter_registry) # This recursive crawling procedure generates the update list: self.target_data: List[db.Record] = [] - self._crawl(items, - self.global_converters, local_converters, self.generalStore, self.recordStore, - [], []) + self._crawl(items, local_converters, self.generalStore, + self.recordStore, [], []) if self.debug: - self.debug_converters = self.global_converters + local_converters + self.debug_converters = local_converters return self.target_data @@ -947,7 +941,6 @@ ____________________\n""".format(i + 1, len(pending_changes)) + str(el[3])) f.write(yaml.dump(paths, sort_keys=False)) def _crawl(self, items: List[StructureElement], - global_converters: List[Converter], local_converters: List[Converter], generalStore: GeneralStore, recordStore: RecordStore, @@ -956,7 +949,7 @@ ____________________\n""".format(i + 1, len(pending_changes)) + str(el[3])) Crawl a list of StructureElements and apply any matching converters. items: structure_elements (e.g. files and folders on one level on the hierarchy) - global_converters and local_converters: globally or locally defined converters for + local_converters: locally defined converters for treating structure elements. A locally defined converter could be one that is only valid for a specific subtree of the originally cralwed StructureElement structure. @@ -964,7 +957,8 @@ ____________________\n""".format(i + 1, len(pending_changes)) + str(el[3])) global stores of the Crawler object. """ for element in items: - for converter in global_converters + local_converters: + for converter in local_converters: + # type is something like "matches files", replace isinstance with "type_matches" # match function tests regexp for example if (converter.typecheck(element) and @@ -1006,7 +1000,7 @@ ____________________\n""".format(i + 1, len(pending_changes)) + str(el[3])) structure_elements_path + [element.get_name()], converters_path + [converter.name]) - self._crawl(children, global_converters, converter.converters, + self._crawl(children, converter.converters, generalStore_copy, recordStore_copy, structure_elements_path + [element.get_name()], converters_path + [converter.name]) @@ -1094,6 +1088,8 @@ def crawler_main(crawled_directory_path: str, if isinstance(elem, db.File): # correct the file path: # elem.file = os.path.join(args.path, elem.file) + if prefix is None: + raise RuntimeError("No prefix set. Prefix must be set if files are used.") if elem.path.startswith(prefix): elem.path = elem.path[len(prefix):] elem.file = None