diff --git a/CHANGELOG.md b/CHANGELOG.md index 8ff353d0319752a99e7de96da3d33575bbb17c49..314a38b50f7e4ecb0883feb70247a2cf6bb05d3a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,9 +8,17 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] ## ### Added ### +- The -c/--add-cwd-to-path option allows placing, for example, custom converter + modules in the current working directory (cwd), since the cwd is added to + the Python path. ### Changed ### +- Converters often used in dicts (DictFloatElementConverter, + DictIntegerElementConverter, ...) now accept other StructureElements by + default. For example, a DictIntegerElement is now also accepted where a + DictFloatElement is expected. This behavior can be changed (see the converter documentation). + **Note**: This might lead to additional matches compared to previous versions. - `_AbstractDictElementConverter` uses `re.DOTALL` for `match_value` - The "fallback" parent, the name of the element in the cfood, is only used when the object is created and only if there are no parents given. diff --git a/src/caoscrawler/crawl.py b/src/caoscrawler/crawl.py index 0bfe075b6847e52374cc7730f799558648446b47..2538d1189c933b19f8cb0ba3fd2c660b23a31e33 100644 --- a/src/caoscrawler/crawl.py +++ b/src/caoscrawler/crawl.py @@ -28,6 +28,7 @@ Crawl a file structure using a yaml cfood definition and synchronize the acuired data with CaosDB. """ +from __future__ import annotations import importlib from caosadvancedtools.cache import UpdateCache, Cache import uuid @@ -50,7 +51,7 @@ from .identifiable_adapters import (IdentifiableAdapter, LocalStorageIdentifiableAdapter, CaosDBIdentifiableAdapter) from collections import defaultdict -from typing import Any, Dict, List, Optional, Type, Union +from typing import Any, Optional, Type, Union from caosdb.apiutils import compare_entities, merge_entities from copy import deepcopy from jsonschema import validate @@ -168,7 +169,7 @@ class Crawler(object): generalStore: Optional[GeneralStore] = None, debug: bool = False, identifiableAdapter: IdentifiableAdapter = None, - securityMode: int = SecurityMode.UPDATE + securityMode: SecurityMode = SecurityMode.UPDATE ): """ Create a new crawler and initialize an empty RecordStore and GeneralStore. @@ -209,14 +210,14 @@ class Crawler(object): if identifiableAdapter is None: self.identifiableAdapter = LocalStorageIdentifiableAdapter() # If a directory is crawled this may hold the path to that directory - self.crawled_directory = None + self.crawled_directory: Optional[str] = None self.debug = debug if self.debug: # order in the tuple: # 0: generalStore # 1: recordStore - self.debug_tree: Dict[str, tuple] = dict() - self.debug_metadata: Dict[str, dict] = dict() + self.debug_tree: dict[str, tuple] = dict() + self.debug_metadata: dict[str, dict] = dict() self.debug_metadata["copied"] = dict() self.debug_metadata["provenance"] = defaultdict(lambda: dict()) self.debug_metadata["usage"] = defaultdict(lambda: set()) @@ -236,7 +237,7 @@ class Crawler(object): return self._resolve_validator_paths(crawler_definition, crawler_definition_path) - def _load_definition_from_yaml_dict(self, crawler_definitions: List[Dict]): + def _load_definition_from_yaml_dict(self, crawler_definitions: list[dict]): """Load crawler definitions from a list of (yaml) dicts `crawler_definitions` which contains either one or two documents.
@@ -258,7 +259,7 @@ class Crawler(object): # tested in the next lines of code: # Load the cfood schema: - with open(files('caoscrawler').joinpath('cfood-schema.yml'), "r") as f: + with open(str(files('caoscrawler').joinpath('cfood-schema.yml')), "r") as f: schema = yaml.safe_load(f) # Add custom converters to converter enum in schema: @@ -315,7 +316,7 @@ class Crawler(object): """ # Defaults for the converter registry: - converter_registry: Dict[str, Dict[str, str]] = { + converter_registry: dict[str, dict[str, str]] = { "Directory": { "converter": "DirectoryConverter", "package": "caoscrawler.converters"}, @@ -445,7 +446,7 @@ class Crawler(object): return converters - def start_crawling(self, items: Union[List[StructureElement], StructureElement], + def start_crawling(self, items: Union[list[StructureElement], StructureElement], crawler_definition: dict, converter_registry: dict): """ @@ -477,8 +478,9 @@ class Crawler(object): self.run_id = uuid.uuid1() local_converters = Crawler.initialize_converters( crawler_definition, converter_registry) + # This recursive crawling procedure generates the update list: - self.crawled_data: List[db.Record] = [] + self.crawled_data: list[db.Record] = [] self._crawl(items, local_converters, self.generalStore, self.recordStore, [], []) @@ -516,7 +518,7 @@ class Crawler(object): return False @staticmethod - def create_flat_list(ent_list: List[db.Entity], flat: List[db.Entity]): + def create_flat_list(ent_list: list[db.Entity], flat: list[db.Entity]): """ Recursively adds all properties contained in entities from ent_list to the output list flat. Each element will only be added once to the list. @@ -703,11 +705,11 @@ class Crawler(object): if p.value is old: p.value = new - def split_into_inserts_and_updates(self, ent_list: List[db.Entity]): + def split_into_inserts_and_updates(self, ent_list: list[db.Entity]): if self.identifiableAdapter is None: raise RuntimeError("Should not happen.") - to_be_inserted: List[db.Entity] = [] - to_be_updated: List[db.Entity] = [] + to_be_inserted: list[db.Entity] = [] + to_be_updated: list[db.Entity] = [] flat = list(ent_list) # assure all entities are direct members TODO Can this be removed at some point?Check only? Crawler.create_flat_list(ent_list, flat) @@ -796,8 +798,8 @@ class Crawler(object): @staticmethod def _merge_properties_from_remote( - crawled_data: List[db.Record], - identified_records: List[db.Record] + crawled_data: list[db.Record], + identified_records: list[db.Record] ): """Merge entity representation that was created by crawling the data with remotely found identified records s.th. new properties and property values are updated correctly but @@ -838,8 +840,8 @@ class Crawler(object): @staticmethod def remove_unnecessary_updates( - crawled_data: List[db.Record], - identified_records: List[db.Record] + crawled_data: list[db.Record], + identified_records: list[db.Record] ): """Compare the Records to be updated with their remote correspondant. Only update if there are actual differences. 
@@ -911,7 +913,7 @@ class Crawler(object): return db.Entity(id=id).retrieve() @staticmethod - def execute_inserts_in_list(to_be_inserted, securityMode, run_id: int = None, + def execute_inserts_in_list(to_be_inserted, securityMode, run_id: uuid.UUID = None, unique_names=True): for record in to_be_inserted: for prop in record.properties: @@ -939,7 +941,7 @@ class Crawler(object): _resolve_datatype(prop, entity) @staticmethod - def execute_updates_in_list(to_be_updated, securityMode, run_id: int = None, + def execute_updates_in_list(to_be_updated, securityMode, run_id: uuid.UUID = None, unique_names=True): Crawler.set_ids_and_datatype_of_parents_and_properties(to_be_updated) logger.debug("UPDATE") @@ -951,7 +953,7 @@ class Crawler(object): update_cache = UpdateCache() update_cache.insert(to_be_updated, run_id) - def _synchronize(self, crawled_data: List[db.Record], commit_changes: bool = True, + def _synchronize(self, crawled_data: list[db.Record], commit_changes: bool = True, unique_names=True): """ This function applies several stages: @@ -1036,7 +1038,7 @@ ____________________\n""".format(i + 1, len(pending_changes)) + str(el[3])) @staticmethod def debug_build_usage_tree(converter: Converter): - res: Dict[str, Dict[str, Any]] = { + res: dict[str, dict[str, Any]] = { converter.name: { "usage": ", ".join(converter.metadata["usage"]), "subtree": {} @@ -1053,7 +1055,7 @@ ____________________\n""".format(i + 1, len(pending_changes)) + str(el[3])) return res def save_debug_data(self, filename: str): - paths: Dict[str, Union[dict, list]] = dict() + paths: dict[str, Union[dict, list]] = dict() def flatten_debug_info(key): mod_info = self.debug_metadata[key] @@ -1078,11 +1080,11 @@ ____________________\n""".format(i + 1, len(pending_changes)) + str(el[3])) with open(filename, "w") as f: f.write(yaml.dump(paths, sort_keys=False)) - def _crawl(self, items: List[StructureElement], - local_converters: List[Converter], + def _crawl(self, items: list[StructureElement], + local_converters: list[Converter], generalStore: GeneralStore, recordStore: RecordStore, - structure_elements_path: List[str], converters_path: List[str]): + structure_elements_path: list[str], converters_path: list[str]): """ Crawl a list of StructureElements and apply any matching converters. @@ -1170,7 +1172,7 @@ def crawler_main(crawled_directory_path: str, provenance_file: str = None, dry_run: bool = False, prefix: str = "", - securityMode: int = SecurityMode.UPDATE, + securityMode: SecurityMode = SecurityMode.UPDATE, unique_names=True, ): """ @@ -1272,6 +1274,9 @@ def parse_args(): parser.add_argument("crawled_directory_path", help="The subtree of files below the given path will " "be considered. 
Use '/' for everything.") + parser.add_argument("-c", "--add-cwd-to-path", action="store_true", + help="If given, the current working directory (cwd) is added to the Python " + "path.") parser.add_argument("-s", "--security-mode", choices=["retrieve", "insert", "update"], default="retrieve", help="Determines whether entities may only be read from the server, or " @@ -1305,6 +1310,8 @@ def main(): else: logger.setLevel(logging.INFO) + if args.add_cwd_to_path: + sys.path.append(os.path.abspath(".")) sys.exit(crawler_main( crawled_directory_path=args.crawled_directory_path, cfood_file_name=args.cfood_file_name, diff --git a/src/doc/converters.rst b/src/doc/converters.rst index 1839d59dc6cffb46740d90296c439e3986882ca6..ae84644072ebbd53f1325d1f9d1d0ef8e5dc6de6 100644 --- a/src/doc/converters.rst +++ b/src/doc/converters.rst @@ -104,6 +104,16 @@ FloatElementConverter also accepts IntegerElements. The default behavior can be adjusted with the fields `accept_text`, `accept_int`, `accept_float`, and `accept_bool`. +The following list denotes which kinds of StructureElements are accepted by default +(they are defined in `src/caoscrawler/converters.py`): + +- DictBooleanElementConverter: bool, int +- DictFloatElementConverter: int, float +- DictTextElementConverter: text, bool, int, float +- DictIntegerElementConverter: int +- DictListElementConverter: list +- DictDictElementConverter: dict + YAMLFileConverter ================= diff --git a/unittests/test_macros.py b/unittests/test_macros.py index 2934d7902a2f7be6925491f73412b0350265145d..4e27e42f8d1e633cf97fa142e2c0ec8aa013af05 100644 --- a/unittests/test_macros.py +++ b/unittests/test_macros.py @@ -327,7 +327,6 @@ extroot: !macro assert cfood["extroot"]["once"]["something"]["a"] == "4" assert cfood["extroot"]["twice"]["something"]["a"] == "5" assert cfood["extroot"]["default_name"]["something"]["a"] == "4" - # Code sample to generate the expanded macro: # with open("expanded_test_macro.yaml", "w") as f: # f.write(yaml.dump(cfood)) @@ -409,3 +408,51 @@ SimulationData: # Code sample to generate the expanded macro: # with open("expanded_test_macro.yaml", "w") as f: # f.write(yaml.dump(cfood)) + + +@pytest.mark.xfail( + reason="Wait until this feature is implemented: " + "https://gitlab.com/caosdb/caosdb-crawler/-/issues/21." +) +def test_def_replacements(): + """Test that parameters in macro definitions can be used + for defining subsequent parameters. + """ + + cfood = _temp_file_load(""" +--- +metadata: + macros: + - !defmacro + name: test_def_replacements + params: + macro_name: default_name + z: $macro_name + a: $macro_name + v: $z + definition: + $macro_name: + macro_name: $macro_name + z: $z + a: $a + v: $v +--- +extroot: !macro + test_def_replacements: + - macro_name: once + - macro_name: twice + z: 5 + - {} + """) + assert cfood["extroot"]["once"]["z"] == "once" + assert cfood["extroot"]["once"]["a"] == "once" + assert cfood["extroot"]["once"]["v"] == "once" + assert cfood["extroot"]["once"]["macro_name"] == "once" + assert cfood["extroot"]["twice"]["z"] == "5" + assert cfood["extroot"]["twice"]["a"] == "5" + assert cfood["extroot"]["twice"]["v"] == "5" + assert cfood["extroot"]["twice"]["macro_name"] == "twice" + assert cfood["extroot"]["default_name"]["z"] == "default_name" + assert cfood["extroot"]["default_name"]["a"] == "default_name" + assert cfood["extroot"]["default_name"]["v"] == "default_name" + assert cfood["extroot"]["default_name"]["macro_name"] == "default_name"
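
For illustration, here is a minimal sketch of what the new `-c`/`--add-cwd-to-path` option enables. Only the `sys.path.append(os.path.abspath("."))` behaviour is taken from `main()` in the patch above; the module name `my_converters` and its content are hypothetical and created on the fly so the sketch is self-contained.

```python
import importlib
import os
import sys

# A hypothetical custom converter module in the current working directory.
# In a real setup you would write this file yourself; it is created here
# only so that the sketch can be run as-is.
with open("my_converters.py", "w", encoding="utf-8") as f:
    f.write("GREETING = 'hello from a module in the cwd'\n")

# This mirrors what `caosdb-crawler -c ...` does before crawling starts:
# the current working directory is appended to the Python path.
sys.path.append(os.path.abspath("."))

# The module placed in the cwd is now importable like an installed package,
# e.g. when a cfood refers to it via the "package" entry of a custom
# converter definition (see the converter_registry defaults in crawl.py).
my_converters = importlib.import_module("my_converters")
print(my_converters.GREETING)
```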