diff --git a/src/caoscrawler/crawl.py b/src/caoscrawler/crawl.py
index 4dd7674501c913e87662e1e38d9176e4004b5d3c..9abbd80cb378cac38f014f5ea065358bd3c1cac3 100644
--- a/src/caoscrawler/crawl.py
+++ b/src/caoscrawler/crawl.py
@@ -28,6 +28,7 @@
 Crawl a file structure using a yaml cfood definition and synchronize
 the acuired data with CaosDB.
 """
+from __future__ import annotations
 import importlib
 from caosadvancedtools.cache import UpdateCache, Cache
 import uuid
@@ -50,7 +51,7 @@ from .identifiable_adapters import (IdentifiableAdapter,
                                     LocalStorageIdentifiableAdapter,
                                     CaosDBIdentifiableAdapter)
 from collections import defaultdict
-from typing import Any, Dict, List, Optional, Type, Union
+from typing import Any, Optional, Type, Union
 from caosdb.apiutils import compare_entities, merge_entities
 from copy import deepcopy
 from jsonschema import validate
@@ -168,7 +169,7 @@ class Crawler(object):
                  generalStore: Optional[GeneralStore] = None,
                  debug: bool = False,
                  identifiableAdapter: IdentifiableAdapter = None,
-                 securityMode: int = SecurityMode.UPDATE
+                 securityMode: SecurityMode = SecurityMode.UPDATE
                  ):
         """
         Create a new crawler and initialize an empty RecordStore and GeneralStore.
@@ -209,14 +210,14 @@ class Crawler(object):
         if identifiableAdapter is None:
             self.identifiableAdapter = LocalStorageIdentifiableAdapter()
         # If a directory is crawled this may hold the path to that directory
-        self.crawled_directory = None
+        self.crawled_directory: Optional[str] = None
         self.debug = debug
         if self.debug:
             # order in the tuple:
             # 0: generalStore
             # 1: recordStore
-            self.debug_tree: Dict[str, tuple] = dict()
-            self.debug_metadata: Dict[str, dict] = dict()
+            self.debug_tree: dict[str, tuple] = dict()
+            self.debug_metadata: dict[str, dict] = dict()
             self.debug_metadata["copied"] = dict()
             self.debug_metadata["provenance"] = defaultdict(lambda: dict())
             self.debug_metadata["usage"] = defaultdict(lambda: set())
@@ -236,7 +237,7 @@ class Crawler(object):
 
         return self._resolve_validator_paths(crawler_definition, crawler_definition_path)
 
-    def _load_definition_from_yaml_dict(self, crawler_definitions: List[Dict]):
+    def _load_definition_from_yaml_dict(self, crawler_definitions: list[dict]):
         """Load crawler definitions from a list of (yaml) dicts `crawler_definitions` which
         contains either one or two documents.
 
@@ -258,7 +259,7 @@ class Crawler(object):
         # tested in the next lines of code:
 
         # Load the cfood schema:
-        with open(files('caoscrawler').joinpath('cfood-schema.yml'), "r") as f:
+        with open(str(files('caoscrawler').joinpath('cfood-schema.yml')), "r") as f:
             schema = yaml.safe_load(f)
 
         # Add custom converters to converter enum in schema:
@@ -315,7 +316,7 @@ class Crawler(object):
         """
 
         # Defaults for the converter registry:
-        converter_registry: Dict[str, Dict[str, str]] = {
+        converter_registry: dict[str, dict[str, str]] = {
             "Directory": {
                 "converter": "DirectoryConverter",
                 "package": "caoscrawler.converters"},
@@ -430,7 +431,7 @@ class Crawler(object):
 
         return converters
 
-    def start_crawling(self, items: Union[List[StructureElement], StructureElement],
+    def start_crawling(self, items: Union[list[StructureElement], StructureElement],
                        crawler_definition: dict,
                        converter_registry: dict):
         """
@@ -462,8 +463,9 @@ class Crawler(object):
         self.run_id = uuid.uuid1()
         local_converters = Crawler.initialize_converters(
             crawler_definition, converter_registry)
+
         # This recursive crawling procedure generates the update list:
-        self.crawled_data: List[db.Record] = []
+        self.crawled_data: list[db.Record] = []
         self._crawl(items, local_converters, self.generalStore, self.recordStore,
                     [], [])
 
@@ -501,7 +503,7 @@ class Crawler(object):
         return False
 
     @staticmethod
-    def create_flat_list(ent_list: List[db.Entity], flat: List[db.Entity]):
+    def create_flat_list(ent_list: list[db.Entity], flat: list[db.Entity]):
         """
         Recursively adds all properties contained in entities from ent_list to the output list flat.
         Each element will only be added once to the list.
@@ -688,11 +690,11 @@ class Crawler(object):
                     if p.value is old:
                         p.value = new
 
-    def split_into_inserts_and_updates(self, ent_list: List[db.Entity]):
+    def split_into_inserts_and_updates(self, ent_list: list[db.Entity]):
         if self.identifiableAdapter is None:
             raise RuntimeError("Should not happen.")
-        to_be_inserted: List[db.Entity] = []
-        to_be_updated: List[db.Entity] = []
+        to_be_inserted: list[db.Entity] = []
+        to_be_updated: list[db.Entity] = []
         flat = list(ent_list)
         # assure all entities are direct members TODO Can this be removed at some point?Check only?
         Crawler.create_flat_list(ent_list, flat)
@@ -720,7 +722,7 @@ class Crawler(object):
                 newrecord = self.get_from_any_cache(record)
                 merge_entities(newrecord, record)
                 Crawler.bend_references_to_new_object(
-                    old=record, new=newrecord, entities=flat+to_be_updated+to_be_inserted)
+                    old=record, new=newrecord, entities=flat + to_be_updated + to_be_inserted)
 
                 del flat[i]
                 resolved_references = True
@@ -781,8 +783,8 @@ class Crawler(object):
 
     @staticmethod
     def _merge_properties_from_remote(
-            crawled_data: List[db.Record],
-            identified_records: List[db.Record]
+            crawled_data: list[db.Record],
+            identified_records: list[db.Record]
     ):
         """Merge entity representation that was created by crawling the data with remotely found
         identified records s.th. new properties and property values are updated correctly but
@@ -823,8 +825,8 @@ class Crawler(object):
 
     @staticmethod
     def remove_unnecessary_updates(
-            crawled_data: List[db.Record],
-            identified_records: List[db.Record]
+            crawled_data: list[db.Record],
+            identified_records: list[db.Record]
     ):
         """Compare the Records to be updated with their remote correspondant.
         Only update if there are actual differences.
@@ -896,7 +898,7 @@ class Crawler(object):
         return db.Entity(id=id).retrieve()
 
     @staticmethod
-    def execute_inserts_in_list(to_be_inserted, securityMode, run_id: int = None,
+    def execute_inserts_in_list(to_be_inserted, securityMode, run_id: uuid.UUID = None,
                                 unique_names=True):
         for record in to_be_inserted:
             for prop in record.properties:
@@ -924,7 +926,7 @@ class Crawler(object):
                 _resolve_datatype(prop, entity)
 
     @staticmethod
-    def execute_updates_in_list(to_be_updated, securityMode, run_id: int = None,
+    def execute_updates_in_list(to_be_updated, securityMode, run_id: uuid.UUID = None,
                                 unique_names=True):
         Crawler.set_ids_and_datatype_of_parents_and_properties(to_be_updated)
         logger.debug("UPDATE")
@@ -936,7 +938,7 @@ class Crawler(object):
             update_cache = UpdateCache()
             update_cache.insert(to_be_updated, run_id)
 
-    def _synchronize(self, crawled_data: List[db.Record], commit_changes: bool = True,
+    def _synchronize(self, crawled_data: list[db.Record], commit_changes: bool = True,
                      unique_names=True):
         """
         This function applies several stages:
@@ -1021,7 +1023,7 @@ ____________________\n""".format(i + 1, len(pending_changes)) + str(el[3]))
 
     @staticmethod
     def debug_build_usage_tree(converter: Converter):
-        res: Dict[str, Dict[str, Any]] = {
+        res: dict[str, dict[str, Any]] = {
             converter.name: {
                 "usage": ", ".join(converter.metadata["usage"]),
                 "subtree": {}
@@ -1038,7 +1040,7 @@ ____________________\n""".format(i + 1, len(pending_changes)) + str(el[3]))
         return res
 
     def save_debug_data(self, filename: str):
-        paths: Dict[str, Union[dict, list]] = dict()
+        paths: dict[str, Union[dict, list]] = dict()
 
         def flatten_debug_info(key):
             mod_info = self.debug_metadata[key]
@@ -1063,11 +1065,11 @@ ____________________\n""".format(i + 1, len(pending_changes)) + str(el[3]))
         with open(filename, "w") as f:
             f.write(yaml.dump(paths, sort_keys=False))
 
-    def _crawl(self, items: List[StructureElement],
-               local_converters: List[Converter],
+    def _crawl(self, items: list[StructureElement],
+               local_converters: list[Converter],
                generalStore: GeneralStore,
                recordStore: RecordStore,
-               structure_elements_path: List[str], converters_path: List[str]):
+               structure_elements_path: list[str], converters_path: list[str]):
         """
         Crawl a list of StructureElements and apply any matching converters.
 
@@ -1155,7 +1157,7 @@ def crawler_main(crawled_directory_path: str,
                  provenance_file: str = None,
                  dry_run: bool = False,
                  prefix: str = "",
-                 securityMode: int = SecurityMode.UPDATE,
+                 securityMode: SecurityMode = SecurityMode.UPDATE,
                  unique_names=True,
                  ):
     """
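
Note on the typing changes in this patch: the added `from __future__ import annotations` (PEP 563) makes all annotations lazily evaluated, which is what allows the built-in generics `list[...]` and `dict[...]` to replace `typing.List`/`typing.Dict` even on Python versions older than 3.9. A minimal sketch of the pattern; the module and function names below are illustrative and not part of caoscrawler:

    # Illustrative sketch only -- not part of crawl.py.
    from __future__ import annotations  # PEP 563: annotations are no longer evaluated at definition time

    from typing import Optional


    def collect_keys(entries: list[dict[str, int]], limit: Optional[int] = None) -> list[str]:
        # The built-in generics in the signature above work on Python 3.7+ because of the future import.
        picked = entries if limit is None else entries[:limit]
        return [key for entry in picked for key in entry]


    print(collect_keys([{"a": 1}, {"b": 2}], limit=1))  # -> ['a']

This only covers annotations: subscripting built-ins at runtime (e.g. assigning `list[int]` to a variable) still requires Python 3.9 or the `typing` aliases, consistent with the patch rewriting annotations only.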