diff --git a/src/caoscrawler/converters.py b/src/caoscrawler/converters.py
index e3f72b10ce2694853d6bc0644c736f0d621ed881..2ff45bb9b102fa824483f6509072ffc69e86bbe6 100644
--- a/src/caoscrawler/converters.py
+++ b/src/caoscrawler/converters.py
@@ -35,7 +35,7 @@ from .structure_elements import (StructureElement, Directory, File, Dict, JSONFi
                                  DictIntegerElement, DictBooleanElement, DictFloatElement,
                                  DictDictElement, TextElement, DictTextElement, DictElement,
                                  DictListElement)
-from typing import Optional, Union
+from typing import Dict as Dict_t, List, Optional, Union
 from abc import abstractmethod
 from string import Template
 import yaml_header_tools
@@ -236,7 +236,7 @@ class Converter(object):
         self.name = name
 
         # Used to store usage information for debugging:
-        self.metadata: dict[str, set[str]] = {
+        self.metadata: Dict_t[str, set[str]] = {
             "usage": set()
         }
 
@@ -346,7 +346,7 @@ class DirectoryConverter(Converter):
 
         element: A directory (of type Directory) which will be traversed.
         """
-        children: list[StructureElement] = []
+        children: List[StructureElement] = []
 
         for name in sorted(os.listdir(element.path)):
             path = os.path.join(element.path, name)
@@ -395,7 +395,7 @@ class MarkdownFileConverter(Converter):
 
         header = yaml_header_tools.get_header_from_file(
             element.path, clean=False)
-        children: list[StructureElement] = []
+        children: List[StructureElement] = []
 
         for name, entry in header.items():
             if type(entry) == list:
@@ -675,7 +675,8 @@ class TableConverter(Converter):
             # The option can often either be a single value or a list of values.
             # In the latter case each element of the list will be converted to the defined type.
             if isinstance(el, list):
-                option_dict[opt_name] = [opt_conversion(el_el) for el_el in el]
+                option_dict[opt_name] = [
+                    opt_conversion(el_el) for el_el in el]
             else:
                 option_dict[opt_name] = opt_conversion(el)
         return option_dict
diff --git a/src/caoscrawler/crawl.py b/src/caoscrawler/crawl.py
index 8886c5f87f1556517acafc7bfa673e8a0d29c6e2..6d0d683c92a3e1a2e59afe9d555cc4373fa1322b 100644
--- a/src/caoscrawler/crawl.py
+++ b/src/caoscrawler/crawl.py
@@ -50,7 +50,7 @@ from .identifiable_adapters import (IdentifiableAdapter,
                                     LocalStorageIdentifiableAdapter,
                                     CaosDBIdentifiableAdapter)
 from collections import defaultdict
-from typing import Union, Any, Optional, Type
+from typing import Any, Dict, List, Optional, Type, Union
 from caosdb.apiutils import compare_entities, merge_entities
 from copy import deepcopy
 from jsonschema import validate
@@ -160,7 +160,7 @@ class Crawler(object):
     """
 
     def __init__(self,
-                 converters: list[Converter] = [],
+                 converters: List[Converter] = [],
                  generalStore: Optional[GeneralStore] = None,
                  debug: bool = False,
                  identifiableAdapter: IdentifiableAdapter = None,
@@ -171,7 +171,7 @@ class Crawler(object):
 
         Parameters
         ----------
-        converters : list[Converter]
+        converters : List[Converter]
              The set of converters used for this crawler.
         recordStore : GeneralStore
              An initial GeneralStore which might store e.g. environment variables.
@@ -211,8 +211,8 @@ class Crawler(object):
             # order in the tuple:
             # 0: generalStore
             # 1: recordStore
-            self.debug_tree: dict[str, tuple] = dict()
-            self.debug_metadata: dict[str, dict] = dict()
+            self.debug_tree: Dict[str, tuple] = dict()
+            self.debug_metadata: Dict[str, dict] = dict()
             self.debug_metadata["copied"] = dict()
             self.debug_metadata["provenance"] = defaultdict(lambda: dict())
             self.debug_metadata["usage"] = defaultdict(lambda: set())
@@ -284,7 +284,7 @@ class Crawler(object):
         """
 
         # Defaults for the converter registry:
-        converter_registry: dict[str, dict[str, str]] = {
+        converter_registry: Dict[str, Dict[str, str]] = {
             "Directory": {
                 "converter": "DirectoryConverter",
                 "package": "caoscrawler.converters"},
@@ -395,7 +395,7 @@ class Crawler(object):
 
         return local_converters
 
-    def start_crawling(self, items: Union[list[StructureElement], StructureElement],
+    def start_crawling(self, items: Union[List[StructureElement], StructureElement],
                        crawler_definition: dict,
                        converter_registry: dict):
         """
@@ -428,7 +428,7 @@ class Crawler(object):
         local_converters = Crawler.create_local_converters(crawler_definition,
                                                            converter_registry)
         # This recursive crawling procedure generates the update list:
-        self.target_data: list[db.Record] = []
+        self.target_data: List[db.Record] = []
         self._crawl(items, self.global_converters, local_converters, self.generalStore,
                     self.recordStore, [], [])
 
@@ -473,7 +473,7 @@ class Crawler(object):
                 return False
         return True
 
-    def create_flat_list(self, ent_list: list[db.Entity], flat: list[db.Entity]):
+    def create_flat_list(self, ent_list: List[db.Entity], flat: List[db.Entity]):
         """
         Recursively adds all properties contained in entities from ent_list to
         the output list flat. Each element will only be added once to the list.
@@ -606,11 +606,11 @@ class Crawler(object):
 
         merge_entities(to, fro)
 
-    def split_into_inserts_and_updates(self, ent_list: list[db.Entity]):
+    def split_into_inserts_and_updates(self, ent_list: List[db.Entity]):
         if self.identifiableAdapter is None:
             raise RuntimeError("Should not happen.")
-        to_be_inserted: list[db.Entity] = []
-        to_be_updated: list[db.Entity] = []
+        to_be_inserted: List[db.Entity] = []
+        to_be_updated: List[db.Entity] = []
         flat = list(ent_list)
         # assure all entities are direct members TODO Can this be removed at some point?Check only?
         self.create_flat_list(ent_list, flat)
@@ -738,8 +738,8 @@ class Crawler(object):
                     el.value[index] = val.id
 
     @staticmethod
-    def remove_unnecessary_updates(target_data: list[db.Record],
-                                   identified_records: list[db.Record]):
+    def remove_unnecessary_updates(target_data: List[db.Record],
+                                   identified_records: List[db.Record]):
         """
         checks whether all relevant attributes (especially Property values) are equal
 
@@ -804,7 +804,7 @@ class Crawler(object):
             update_cache = UpdateCache()
             update_cache.insert(to_be_updated, run_id)
 
-    def _synchronize(self, target_data: list[db.Record], commit_changes: bool = True):
+    def _synchronize(self, target_data: List[db.Record], commit_changes: bool = True):
         """
         This function applies several stages:
         1) Retrieve identifiables for all records in target_data.
@@ -823,7 +823,8 @@ class Crawler(object):
 
         if self.identifiableAdapter is None:
             raise RuntimeError("Should not happen.")
-        to_be_inserted, to_be_updated = self.split_into_inserts_and_updates(target_data)
+        to_be_inserted, to_be_updated = self.split_into_inserts_and_updates(
+            target_data)
 
         # TODO: refactoring of typo
         for el in to_be_updated:
@@ -831,14 +832,17 @@ class Crawler(object):
             self.replace_entities_with_ids(el)
 
         identified_records = [
-            self.identifiableAdapter.retrieve_identified_record_for_record(record)
+            self.identifiableAdapter.retrieve_identified_record_for_record(
+                record)
             for record in to_be_updated]
         # remove unnecessary updates from list by comparing the target records to the existing ones
         self.remove_unnecessary_updates(to_be_updated, identified_records)
 
         if commit_changes:
-            self.execute_inserts_in_list(to_be_inserted, self.securityMode, self.run_id)
-            self.execute_updates_in_list(to_be_updated, self.securityMode, self.run_id)
+            self.execute_inserts_in_list(
+                to_be_inserted, self.securityMode, self.run_id)
+            self.execute_updates_in_list(
+                to_be_updated, self.securityMode, self.run_id)
 
         update_cache = UpdateCache()
         pending_inserts = update_cache.get_inserts(self.run_id)
@@ -859,7 +863,8 @@ class Crawler(object):
 
             # only done in SSS mode
             if "SHARED_DIR" in os.environ:
-                filename = OldCrawler.save_form([el[3] for el in pending_changes], path, run_id)
+                filename = OldCrawler.save_form(
+                    [el[3] for el in pending_changes], path, run_id)
                 OldCrawler.send_mail([el[3] for el in pending_changes], filename)
 
             for i, el in enumerate(pending_changes):
@@ -870,13 +875,14 @@ UNAUTHORIZED UPDATE ({} of {}):
 ____________________\n""".format(i + 1, len(pending_changes)) + str(el[3]))
             logger.info("There were unauthorized changes (see above). An "
                         "email was sent to the curator.\n"
-                        "You can authorize the " + ("inserts" if inserts else "updates")
+                        "You can authorize the " +
+                        ("inserts" if inserts else "updates")
                         + " by invoking the crawler"
                         " with the run id: {rid}\n".format(rid=run_id))
 
     @staticmethod
     def debug_build_usage_tree(converter: Converter):
-        res: dict[str, dict[str, Any]] = {
+        res: Dict[str, Dict[str, Any]] = {
             converter.name: {
                 "usage": ", ".join(converter.metadata["usage"]),
                 "subtree": {}
@@ -893,7 +899,7 @@ ____________________\n""".format(i + 1, len(pending_changes)) + str(el[3]))
         return res
 
     def save_debug_data(self, filename: str):
-        paths: dict[str, Union[dict, list]] = dict()
+        paths: Dict[str, Union[dict, list]] = dict()
 
         def flatten_debug_info(key):
             mod_info = self.debug_metadata[key]
@@ -918,12 +924,12 @@ ____________________\n""".format(i + 1, len(pending_changes)) + str(el[3]))
         with open(filename, "w") as f:
             f.write(yaml.dump(paths, sort_keys=False))
 
-    def _crawl(self, items: list[StructureElement],
-               global_converters: list[Converter],
-               local_converters: list[Converter],
+    def _crawl(self, items: List[StructureElement],
+               global_converters: List[Converter],
+               local_converters: List[Converter],
                generalStore: GeneralStore,
                recordStore: RecordStore,
-               structure_elements_path: list[str], converters_path: list[str]):
+               structure_elements_path: List[str], converters_path: List[str]):
         """
         Crawl a list of StructureElements and apply any matching converters.
 
diff --git a/src/caoscrawler/structure_elements.py b/src/caoscrawler/structure_elements.py
index 6be653a4758e8c3fb789b22ea655836a3d976c34..01996b4ff3e14a9739857e6e03ceca161300b37e 100644
--- a/src/caoscrawler/structure_elements.py
+++ b/src/caoscrawler/structure_elements.py
@@ -23,12 +23,15 @@
 # ** end header
 #
 
+from typing import Dict
+
+
 class StructureElement(object):
     """ base class for elements in the hierarchical data structure """
 
     def __init__(self, name):
         # Used to store usage information for debugging:
-        self.metadata: dict[str, set[str]] = {
+        self.metadata: Dict[str, set[str]] = {
             "usage": set()
         }
 
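
Note (not part of the patch): the hunks above consistently replace PEP 585 built-in generics (dict[...], list[...]) in annotations with typing.Dict and typing.List, presumably so the annotations still evaluate on interpreters older than Python 3.9, where built-in types cannot be subscripted. In converters.py the typing name is imported as Dict_t because that module already imports a Dict structure element from .structure_elements, which the typing name would otherwise shadow. The sketch below only illustrates that failure mode; the class and attribute names are stand-ins, not caoscrawler API.

    from typing import Dict, List, Optional, Set


    class ExampleElement:  # stand-in name, not a caoscrawler class
        """Illustrates why typing.Dict/List are used instead of dict[...]/list[...]."""

        def __init__(self, name: str, children: Optional[List[str]] = None):
            # Annotations on attribute assignments are evaluated each time
            # __init__ runs; on Python <= 3.8, spelling the annotation below as
            # dict[str, Set[str]] raises
            # "TypeError: 'type' object is not subscriptable".
            # The typing generics behave the same on old and new interpreters.
            self.name = name
            self.metadata: Dict[str, Set[str]] = {"usage": set()}
            self.children: List[str] = [] if children is None else list(children)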