diff --git a/.gitignore b/.gitignore index 9af5ee22fdd68c1c25e98614ab516bf4d384d577..5599d7d263c8927025e128c37eabb185025bf96b 100644 --- a/.gitignore +++ b/.gitignore @@ -14,3 +14,5 @@ provenance.yml *.tar.gz *.sql /integrationtests/test-profile/custom/other/cert/ +src/doc/_apidoc/ +start_caosdb_docker.sh diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index b5bc53fed1b069f3a6f665a188aa8bdcd7252570..30a8cd8fe4c08fd3fe0f3f98aaa56b83cb623086 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -152,11 +152,11 @@ inttest: - CAOSDB_TAG=$CAOSDB_TAG docker-compose up -d # Store versions of CaosDB parts - - docker exec -u 0 -t docker_caosdb-server_1 cat /opt/caosdb/git/caosdb_pylib_commit > hash_pylib - - docker exec -u 0 -t docker_caosdb-server_1 cat /opt/caosdb/git/caosdb_webui_commit > hash_webui - - docker exec -u 0 -t docker_caosdb-server_1 cat /opt/caosdb/git/caosdb_server_commit > hash_server - - docker exec -u 0 -t docker_caosdb-server_1 cat /opt/caosdb/git/caosdb_mysqlbackend_commit > hash_mysql - - docker exec -u 0 -t docker_caosdb-server_1 cat /opt/caosdb/git/caosdb_proto_commit > hash_proto + - docker exec -u 0 -t docker-caosdb-server-1 cat /opt/caosdb/git/caosdb_pylib_commit > hash_pylib + - docker exec -u 0 -t docker-caosdb-server-1 cat /opt/caosdb/git/caosdb_webui_commit > hash_webui + - docker exec -u 0 -t docker-caosdb-server-1 cat /opt/caosdb/git/caosdb_server_commit > hash_server + - docker exec -u 0 -t docker-caosdb-server-1 cat /opt/caosdb/git/caosdb_mysqlbackend_commit > hash_mysql + - docker exec -u 0 -t docker-caosdb-server-1 cat /opt/caosdb/git/caosdb_proto_commit > hash_proto - cat hash_server - cat hash_proto - cat hash_mysql @@ -167,8 +167,8 @@ inttest: - /bin/sh ./run.sh # Save logs - - docker logs docker_caosdb-server_1 &> ../caosdb_log.txt - - docker logs docker_sqldb_1 &> ../mariadb_log.txt + - docker logs docker-caosdb-server-1 &> ../caosdb_log.txt + - docker logs docker-sqldb-1 &> ../mariadb_log.txt - cd .. 
# Stop the server diff --git a/CHANGELOG.md b/CHANGELOG.md index 81b6cd332416d35a8ef4c436e391890afb3a43f5..118eebe00ffae7941fdb9b03efbcbc641639ed1a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -13,11 +13,15 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 * Added new converters for tables: CSVTableConverter and XLSXTableConverter * Possibility to authorize updates as in the old crawler * Allow authorization of inserts +* Allow splitting cfoods into multiple yaml documents +* Implemented macros * Converters can now filter the list of children +* You can now crawl data with name conflicts: `synchronize(unique_names=False)` ### Changed -* Renamed module from `newcrawler` to `caoscrawler` +* MAINT: Renamed module from `newcrawler` to `caoscrawler` +* MAINT: Removed global converters from `crawl.py` ### Deprecated @@ -30,6 +34,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 * FIX: #35 Parent cannot be set from value * [#6](https://gitlab.com/caosdb/caosdb-crawler/-/issues/6): Fixed many type hints to be compatible to python 3.8 +* [#9](https://gitlab.com/caosdb/caosdb-crawler/-/issues/9): Scalars of types + other than string can now be given in cfood definitions ### Security diff --git a/src/caoscrawler/converters.py b/src/caoscrawler/converters.py index 3fbf9939664af20f35150e5fff95854634ea3040..f316eba6096356511192005d5628ae4657a07454 100644 --- a/src/caoscrawler/converters.py +++ b/src/caoscrawler/converters.py @@ -36,7 +36,7 @@ from .structure_elements import (StructureElement, Directory, File, Dict, JSONFi DictFloatElement, DictDictElement, TextElement, DictTextElement, DictElement, DictListElement) from typing import Dict as Dict_t, List, Optional, Tuple, Union -from abc import abstractmethod +from abc import ABCMeta, abstractmethod from string import Template import yaml_header_tools @@ -156,15 +156,21 @@ def handle_value(value: Union[dict, str, list], values: GeneralStore): propvalue = value # variables replacement: - propvalue = [replace_variables(i, values) for i in propvalue] + propvalue = list() + for element in value: + # Do the element-wise replacement only when its type is string: + if type(element) == str: + propvalue.append(replace_variables(element, values)) + else: + propvalue.append(element) return (propvalue, collection_mode) else: # value is another simple type - # collection_mode = "single" - # propvalue = value["value"] - # return (propvalue, collection_mode) - raise RuntimeError() + collection_mode = "single" + propvalue = value + # Return it immediately, otherwise variable substitution would be done and fail: + return (propvalue, collection_mode) propvalue = replace_variables(propvalue, values) return (propvalue, collection_mode) @@ -255,7 +261,7 @@ def create_records(values: GeneralStore, return keys_modified -class Converter(object): +class Converter(object, metaclass=ABCMeta): """ Converters treat StructureElements contained in the hierarchical structure. """ @@ -283,6 +289,10 @@ class Converter(object): def converter_factory(definition: dict, name: str, converter_registry: dict): + """Create a Converter instance of the appropriate class. + + The `type` key in the `definition` determines which Converter class is used. + """ if "type" not in definition: raise RuntimeError( @@ -535,6 +545,7 @@ class DictConverter(Converter): return {} +# TODO: difference to SimpleFileConverter? Do we need both? 
class FileConverter(Converter): def typecheck(self, element: StructureElement): return isinstance(element, File) @@ -566,6 +577,8 @@ class JSONFileConverter(DictConverter): def create_children(self, generalStore: GeneralStore, element: StructureElement): if not self.typecheck(element): raise RuntimeError("A JSON file is needed to create children") + # TODO: either add explicit type check for File structure element here, + # or add a comment to suppress mypy type warning. with open(element.path, 'r') as json_file: json_data = json.load(json_file) if not isinstance(json_data, dict): diff --git a/src/caoscrawler/crawl.py b/src/caoscrawler/crawl.py index b57090bd64a3ae40723ef71be701ba063866426e..0f0664062af498716eb4edb52d911ea185d05b08 100644 --- a/src/caoscrawler/crawl.py +++ b/src/caoscrawler/crawl.py @@ -55,12 +55,17 @@ from caosdb.apiutils import compare_entities, merge_entities from copy import deepcopy from jsonschema import validate -logger = logging.getLogger(__name__) +from .macros import defmacro_constructor, macro_constructor logger = logging.getLogger(__name__) SPECIAL_PROPERTIES_STRICT = ("description", "name", "id", "path") SPECIAL_PROPERTIES_NOT_STRICT = ("file", "checksum", "size") +# Register the macro functions from the submodule: +yaml.SafeLoader.add_constructor("!defmacro", defmacro_constructor) +yaml.SafeLoader.add_constructor("!macro", macro_constructor) + def check_identical(record1: db.Entity, record2: db.Entity, ignore_id=False): """ @@ -160,7 +165,6 @@ class Crawler(object): """ def __init__(self, - converters: List[Converter] = [], generalStore: Optional[GeneralStore] = None, debug: bool = False, identifiableAdapter: IdentifiableAdapter = None, @@ -171,15 +175,14 @@ class Crawler(object): Parameters ---------- - converters : List[Converter] - The set of converters used for this crawler. recordStore : GeneralStore An initial GeneralStore which might store e.g. environment variables. debug : bool Create a debugging information tree when set to True. The debugging information tree is a variable stored in self.debug_tree. It is a dictionary mapping directory entries - to a tuple of general stores and record stores which are valid for the directory scope. + to a tuple of general stores and record stores which are valid for + the directory scope. Furthermore, it is stored in a second tree named self.debug_copied whether the objects in debug_tree had been copied from a higher level in the hierarchy of the structureelements. 
@@ -191,7 +194,6 @@ class Crawler(object): """ # TODO: check if this feature is really needed - self.global_converters = converters self.identified_cache = IdentifiedCache() self.recordStore = RecordStore() @@ -225,7 +227,16 @@ class Crawler(object): # Load the cfood from a yaml file: with open(crawler_definition_path, "r") as f: - crawler_definition = yaml.safe_load(f) + crawler_definitions = list(yaml.safe_load_all(f)) + + if len(crawler_definitions) == 1: + # Simple case, just one document: + crawler_definition = crawler_definitions[0] + elif len(crawler_definitions) == 2: + crawler_definition = crawler_definitions[1] + else: + raise RuntimeError( + "Crawler definition must not contain more than two documents.") # TODO: at this point this function can already load the cfood schema extensions # from the crawler definition and add them to the yaml schema that will be @@ -376,9 +387,13 @@ class Crawler(object): converter_registry) @staticmethod - def create_local_converters(crawler_definition: dict, - converter_registry: dict): - local_converters = [] + def initialize_converters(crawler_definition: dict, converter_registry: dict): + """ + takes the cfood as dict (`crawler_definition`) and creates the converter objects that + are defined on the highest level. Child Converters will in turn be created during the + initialization of the Converters. + """ + converters = [] for key, value in crawler_definition.items(): # Definitions and Converters are reserved keywords @@ -390,10 +405,9 @@ class Crawler(object): continue elif key == "Converters": continue - local_converters.append(Converter.converter_factory( - value, key, converter_registry)) + converters.append(Converter.converter_factory(value, key, converter_registry)) - return local_converters + return converters def start_crawling(self, items: Union[List[StructureElement], StructureElement], crawler_definition: dict, @@ -425,20 +439,19 @@ class Crawler(object): items = [items] self.run_id = uuid.uuid1() - local_converters = Crawler.create_local_converters(crawler_definition, - converter_registry) + local_converters = Crawler.initialize_converters( + crawler_definition, converter_registry) # This recursive crawling procedure generates the update list: self.target_data: List[db.Record] = [] - self._crawl(items, - self.global_converters, local_converters, self.generalStore, self.recordStore, - [], []) + self._crawl(items, local_converters, self.generalStore, + self.recordStore, [], []) if self.debug: - self.debug_converters = self.global_converters + local_converters + self.debug_converters = local_converters return self.target_data - def synchronize(self, commit_changes: bool = True): + def synchronize(self, commit_changes: bool = True, unique_names=True): """ Carry out the actual synchronization. 
""" @@ -446,7 +459,7 @@ class Crawler(object): # After the crawling, the actual synchronization with the database, based on the # update list is carried out: - return self._synchronize(self.target_data, commit_changes) + return self._synchronize(self.target_data, commit_changes, unique_names=unique_names) def can_be_checked_externally(self, record: db.Record): """ @@ -807,7 +820,8 @@ class Crawler(object): return db.Entity(name=name).retrieve() @staticmethod - def execute_inserts_in_list(to_be_inserted, securityMode, run_id: int = None): + def execute_inserts_in_list(to_be_inserted, securityMode, run_id: int = None, + unique_names=True): for record in to_be_inserted: for prop in record.properties: entity = Crawler._get_entity_by_name(prop.name) @@ -816,7 +830,7 @@ class Crawler(object): logger.debug(to_be_inserted) if len(to_be_inserted) > 0: if securityMode.value > SecurityMode.RETRIEVE.value: - db.Container().extend(to_be_inserted).insert() + db.Container().extend(to_be_inserted).insert(unique=unique_names) elif run_id is not None: update_cache = UpdateCache() update_cache.insert(to_be_inserted, run_id, insert=True) @@ -834,18 +848,20 @@ class Crawler(object): _resolve_datatype(prop, entity) @staticmethod - def execute_updates_in_list(to_be_updated, securityMode, run_id: int = None): + def execute_updates_in_list(to_be_updated, securityMode, run_id: int = None, + unique_names=True): Crawler.set_ids_and_datatype_of_parents_and_properties(to_be_updated) logger.debug("UPDATE") logger.debug(to_be_updated) if len(to_be_updated) > 0: if securityMode.value > SecurityMode.INSERT.value: - db.Container().extend(to_be_updated).update() + db.Container().extend(to_be_updated).update(unique=unique_names) elif run_id is not None: update_cache = UpdateCache() update_cache.insert(to_be_updated, run_id) - def _synchronize(self, target_data: List[db.Record], commit_changes: bool = True): + def _synchronize(self, target_data: List[db.Record], commit_changes: bool = True, + unique_names=True): """ This function applies several stages: 1) Retrieve identifiables for all records in target_data. @@ -884,9 +900,9 @@ class Crawler(object): self.execute_parent_updates_in_list(to_be_updated) self.execute_inserts_in_list( - to_be_inserted, self.securityMode, self.run_id) + to_be_inserted, self.securityMode, self.run_id, unique_names=unique_names) self.execute_updates_in_list( - to_be_updated, self.securityMode, self.run_id) + to_be_updated, self.securityMode, self.run_id, unique_names=unique_names) update_cache = UpdateCache() pending_inserts = update_cache.get_inserts(self.run_id) @@ -969,7 +985,6 @@ ____________________\n""".format(i + 1, len(pending_changes)) + str(el[3])) f.write(yaml.dump(paths, sort_keys=False)) def _crawl(self, items: List[StructureElement], - global_converters: List[Converter], local_converters: List[Converter], generalStore: GeneralStore, recordStore: RecordStore, @@ -978,7 +993,7 @@ ____________________\n""".format(i + 1, len(pending_changes)) + str(el[3])) Crawl a list of StructureElements and apply any matching converters. items: structure_elements (e.g. files and folders on one level on the hierarchy) - global_converters and local_converters: globally or locally defined converters for + local_converters: locally defined converters for treating structure elements. A locally defined converter could be one that is only valid for a specific subtree of the originally cralwed StructureElement structure. 
@@ -986,7 +1001,8 @@ ____________________\n""".format(i + 1, len(pending_changes)) + str(el[3])) global stores of the Crawler object. """ for element in items: - for converter in global_converters + local_converters: + for converter in local_converters: + # type is something like "matches files", replace isinstance with "type_matches" # match function tests regexp for example if (converter.typecheck(element) and @@ -1012,7 +1028,8 @@ ____________________\n""".format(i + 1, len(pending_changes)) + str(el[3])) self.debug_tree[str(element)] = ( generalStore_copy.get_storage(), recordStore_copy.get_storage()) self.debug_metadata["copied"][str(element)] = ( - generalStore_copy.get_dict_copied(), recordStore_copy.get_dict_copied()) + generalStore_copy.get_dict_copied(), + recordStore_copy.get_dict_copied()) self.debug_metadata["usage"][str(element)].add( "/".join(converters_path + [converter.name])) mod_info = self.debug_metadata["provenance"] @@ -1023,10 +1040,11 @@ ____________________\n""".format(i + 1, len(pending_changes)) + str(el[3])) record_identifier = record_name + \ "_" + str(internal_id) converter.metadata["usage"].add(record_identifier) - mod_info[record_identifier][prop_name] = (structure_elements_path + [element.get_name()], - converters_path + [converter.name]) + mod_info[record_identifier][prop_name] = ( + structure_elements_path + [element.get_name()], + converters_path + [converter.name]) - self._crawl(children, global_converters, converter.converters, + self._crawl(children, converter.converters, generalStore_copy, recordStore_copy, structure_elements_path + [element.get_name()], converters_path + [converter.name]) @@ -1058,7 +1076,9 @@ def crawler_main(crawled_directory_path: str, provenance_file: str = None, dry_run: bool = False, prefix: str = "", - securityMode: int = SecurityMode.UPDATE): + securityMode: int = SecurityMode.UPDATE, + unique_names=True, + ): """ Parameters @@ -1079,6 +1099,8 @@ def crawler_main(crawled_directory_path: str, remove the given prefix from file paths securityMode : int securityMode of Crawler + unique_names : bool + whether or not to update or insert entities in spite of name conflicts Returns ------- @@ -1110,6 +1132,8 @@ def crawler_main(crawled_directory_path: str, if isinstance(elem, db.File): # correct the file path: # elem.file = os.path.join(args.path, elem.file) + if prefix is None: + raise RuntimeError("No prefix set. Prefix must be set if files are used.") if elem.path.startswith(prefix): elem.path = elem.path[len(prefix):] elem.file = None @@ -1136,7 +1160,7 @@ def crawler_main(crawled_directory_path: str, raise RuntimeError("Missing RecordTypes: {}". format(", ".join(notfound))) - crawler.synchronize(commit_changes=True) + crawler.synchronize(commit_changes=True, unique_names=unique_names) return 0 @@ -1154,6 +1178,7 @@ def parse_args(): help="The subtree of files below the given path will " "be considered. 
Use '/' for everything.") parser.add_argument("-s", "--security-mode", choices=["retrieve", "insert", "update"], + default="retrieve", help="Determines whether entities may only be read from the server, or " "whether inserts or even updates may be done.") parser.add_argument("-n", "--dry-run", action="store_true", @@ -1162,9 +1187,9 @@ def parse_args(): # TODO: load identifiables is a dirty implementation currently parser.add_argument("-i", "--load-identifiables", - help="Load identifiables from " - "the given yaml file.") - + help="Load identifiables from the given yaml file.") + parser.add_argument("-u", "--unique-names", action="store_true", + help="Insert or update entities even if name conflicts exist.") parser.add_argument("-p", "--prefix", help="Remove the given prefix from the paths " "of all file objects.") @@ -1186,16 +1211,17 @@ def main(): logger.setLevel(logging.INFO) sys.exit(crawler_main( - args.crawled_directory_path, - args.cfood_file_name, - args.load_identifiables, - args.debug, - args.provenance, - args.dry_run, - args.prefix, - {"retrieve": SecurityMode.RETRIEVE, - "insert": SecurityMode.INSERT, - "update": SecurityMode.UPDATE}[args.security_mode] + crawled_directory_path=args.crawled_directory_path, + cfood_file_name=args.cfood_file_name, + identifiables_definition_file=args.load_identifiables, + debug=args.debug, + provenance_file=args.provenance, + dry_run=args.dry_run, + prefix=args.prefix, + securityMode={"retrieve": SecurityMode.RETRIEVE, + "insert": SecurityMode.INSERT, + "update": SecurityMode.UPDATE}[args.security_mode], + unique_names=args.unique_names, )) diff --git a/src/caoscrawler/macros/__init__.py b/src/caoscrawler/macros/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..0acfb1763039a3bb800bbf0e26d6940b49d045cf --- /dev/null +++ b/src/caoscrawler/macros/__init__.py @@ -0,0 +1 @@ +from .macro_yaml_object import defmacro_constructor, macro_constructor diff --git a/src/caoscrawler/macros/macro_yaml_object.py b/src/caoscrawler/macros/macro_yaml_object.py new file mode 100644 index 0000000000000000000000000000000000000000..2849986e6deb5cb2cba9e45516e6ce8e1a93dfa0 --- /dev/null +++ b/src/caoscrawler/macros/macro_yaml_object.py @@ -0,0 +1,155 @@ +#!/usr/bin/env python3 +# encoding: utf-8 +# +# ** header v3.0 +# This file is a part of the CaosDB Project. +# +# Copyright (C) 2022 Alexander Schlemmer +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <https://www.gnu.org/licenses/>. +# +# ** end header +# + +# Function to expand a macro in yaml +# A. Schlemmer, 05/2022 + +from dataclasses import dataclass +from typing import Any, Dict +from copy import deepcopy +from string import Template + + +@dataclass +class MacroDefinition: + """ + Stores a macro definition. 
+ name: Name of the macro + params: variables and default values to be substituted in keys or values + definition: A dictionary that will be substituted including parameters + """ + name: str + params: Dict[str, Any] + definition: Any + + +# This dictionary stores the macro definitions +macro_store: Dict[str, MacroDefinition] = dict() + + +def substitute(propvalue, values: dict): + """ + Substitution of variables in strings using the Template class + from Python's standard library. + """ + propvalue_template = Template(propvalue) + return propvalue_template.safe_substitute(**values) + + +def substitute_dict(sourced: Dict[str, Any], values: Dict[str, Any]): + """ + Create a copy of sourced. + Afterwards recursively do variable substitution on all keys and values. + """ + d = deepcopy(sourced) + # Changes in keys: + replace: Dict[str, str] = dict() + for k in d: + replacement = substitute(k, values) + if replacement != k: + replace[k] = replacement + for k, v in replace.items(): + d[v] = d[k] + del d[k] + # Changes in values: + for k, v in d.items(): + if isinstance(v, str): + d[k] = substitute(v, values) + elif isinstance(v, list): + subst_list = list() + for i in d[k]: + if isinstance(i, str): + subst_list.append(substitute(i, values)) + elif isinstance(i, dict): + subst_list.append(substitute_dict(i, values)) + else: + subst_list.append(i) + d[k] = subst_list + elif isinstance(v, dict): + d[k] = substitute_dict(v, values) + else: + pass + return d + + +def defmacro_constructor(loader, node): + """ + Function for registering macros in yaml files. + + It can be registered in PyYAML using: + yaml.SafeLoader.add_constructor("!defmacro", defmacro_constructor) + """ + + value = loader.construct_mapping(node, deep=True) + params = {} + if "params" in value: + params = value["params"] + macro = MacroDefinition( + value["name"], params, + value["definition"]) + macro_store[macro.name] = macro + return {} + + +def macro_constructor(loader, node): + """ + Function for substituting macros in yaml files. 
+ + It can be registered in PyYAML using: + yaml.SafeLoader.add_constructor("!macro", macro_constructor) + """ + res = dict() + value = loader.construct_mapping(node, deep=True) + for name, params_setter in value.items(): + if name in macro_store: + # If params_setter is a list, run this for every element: + if params_setter is not None and isinstance(params_setter, list): + for el in params_setter: + macro = macro_store[name] + params = deepcopy(macro.params) + if el is not None: + if isinstance(el, dict): + params.update(el) + else: + raise RuntimeError("params type not supported") + else: + raise RuntimeError("params type must not be None") + definition = substitute_dict(macro.definition, params) + res.update(definition) + else: + # This is just a single macro: + macro = macro_store[name] + params = deepcopy(macro.params) + if params_setter is not None: + if isinstance(params_setter, dict): + params.update(params_setter) + else: + raise RuntimeError("params type not supported") + definition = substitute_dict(macro.definition, params) + res.update(definition) + else: + # If there is no macro with that name, just keep that node: + res[name] = params_setter + + return res diff --git a/src/doc/README_SETUP.md b/src/doc/README_SETUP.md new file mode 100644 index 0000000000000000000000000000000000000000..b6995c9a2d950ecd1e832d5b49dac9ed88a7e455 --- /dev/null +++ b/src/doc/README_SETUP.md @@ -0,0 +1,82 @@ +# Getting started with the CaosDB Crawler # + +## Installation ## + +### Requirements ### + + +### How to install ### + +#### Linux #### + +Make sure that Python (at least version 3.8) and pip are installed, using your system tools and +documentation. + +Then open a terminal and continue in the [Generic installation](#generic-installation) section. + +#### Windows #### + +If a Python distribution is not yet installed, we recommend Anaconda Python, which you can download +for free from [https://www.anaconda.com](https://www.anaconda.com). The "Anaconda Individual Edition" provides most +packages you will ever need out of the box. If you prefer, you may also install the leaner +"Miniconda" installer, which allows you to install packages as you need them. + +After installation, open an Anaconda prompt from the Windows menu and continue in the [Generic +installation](#generic-installation) section. + +#### MacOS #### + +If there is no Python 3 installed yet, there are two main ways to +obtain it: Either get the binary package from +[python.org](https://www.python.org/downloads/) or, for advanced +users, install via [Homebrew](https://brew.sh/). After installation +from python.org, it is recommended to also update the TLS certificates +for Python (this requires administrator rights for your user): + +```sh +# Replace this with your Python version number: +cd /Applications/Python\ 3.9/ + +# This needs administrator rights: +sudo ./Install\ Certificates.command +``` + +After these steps, you may continue with the [Generic +installation](#generic-installation). + +#### Generic installation #### + +--- + +Obtain the sources from GitLab and install from there (`git` must be installed for +this option): + +```sh +git clone https://gitlab.com/caosdb/caosdb-crawler +cd caosdb-crawler +pip3 install --user . +``` + +**Note**: In the near future, this package will also be made available on PyPI. + +## Configuration ## + + + +## Try it out ## + + + +## Run Unit Tests ## + +## Documentation ## + +Build documentation in `src/doc` with `make html`. 
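+A minimal build run could look like this (a sketch; ``_build/html`` is the usual Sphinx output directory, but the exact path depends on the Makefile): + +```sh +cd src/doc +make html +# View the result in a browser, e.g.: +# firefox _build/html/index.html +```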
+ +### Requirements ### + +- `sphinx` +- `sphinx-autoapi` +- `recommonmark` + +### Troubleshooting ### diff --git a/src/doc/_apidoc/modules.rst b/src/doc/_apidoc/modules.rst deleted file mode 100644 index 17f187982981ffbf7bcc857056d10644c2bd422b..0000000000000000000000000000000000000000 --- a/src/doc/_apidoc/modules.rst +++ /dev/null @@ -1,7 +0,0 @@ -newcrawler -========== - -.. toctree:: - :maxdepth: 4 - - newcrawler diff --git a/src/doc/_apidoc/newcrawler.converters.rst b/src/doc/_apidoc/newcrawler.converters.rst deleted file mode 100644 index 893391c229b94baeed9a44c57877ed33f37b2f5e..0000000000000000000000000000000000000000 --- a/src/doc/_apidoc/newcrawler.converters.rst +++ /dev/null @@ -1,7 +0,0 @@ -newcrawler.converters module -============================ - -.. automodule:: newcrawler.converters - :members: - :undoc-members: - :show-inheritance: diff --git a/src/doc/_apidoc/newcrawler.crawl.rst b/src/doc/_apidoc/newcrawler.crawl.rst deleted file mode 100644 index b00a6ab6498a0482cea3e9faa54d66d66991dc2d..0000000000000000000000000000000000000000 --- a/src/doc/_apidoc/newcrawler.crawl.rst +++ /dev/null @@ -1,7 +0,0 @@ -newcrawler.crawl module -======================= - -.. automodule:: newcrawler.crawl - :members: - :undoc-members: - :show-inheritance: diff --git a/src/doc/_apidoc/newcrawler.identifiable_adapters.rst b/src/doc/_apidoc/newcrawler.identifiable_adapters.rst deleted file mode 100644 index d8926f41b72d2c54931f045d75f9fe59b21e6076..0000000000000000000000000000000000000000 --- a/src/doc/_apidoc/newcrawler.identifiable_adapters.rst +++ /dev/null @@ -1,7 +0,0 @@ -newcrawler.identifiable\_adapters module -======================================== - -.. automodule:: newcrawler.identifiable_adapters - :members: - :undoc-members: - :show-inheritance: diff --git a/src/doc/_apidoc/newcrawler.identified_cache.rst b/src/doc/_apidoc/newcrawler.identified_cache.rst deleted file mode 100644 index 6f697362ad44d1fec01f328550dc8667cc889019..0000000000000000000000000000000000000000 --- a/src/doc/_apidoc/newcrawler.identified_cache.rst +++ /dev/null @@ -1,7 +0,0 @@ -newcrawler.identified\_cache module -=================================== - -.. automodule:: newcrawler.identified_cache - :members: - :undoc-members: - :show-inheritance: diff --git a/src/doc/_apidoc/newcrawler.rst b/src/doc/_apidoc/newcrawler.rst deleted file mode 100644 index 202444a5efbde248e52d712575ade49f6dd50601..0000000000000000000000000000000000000000 --- a/src/doc/_apidoc/newcrawler.rst +++ /dev/null @@ -1,24 +0,0 @@ -newcrawler package -================== - -Submodules ----------- - -.. toctree:: - :maxdepth: 4 - - newcrawler.converters - newcrawler.crawl - newcrawler.identifiable_adapters - newcrawler.identified_cache - newcrawler.stores - newcrawler.structure_elements - newcrawler.utils - -Module contents ---------------- - -.. automodule:: newcrawler - :members: - :undoc-members: - :show-inheritance: diff --git a/src/doc/_apidoc/newcrawler.stores.rst b/src/doc/_apidoc/newcrawler.stores.rst deleted file mode 100644 index 7d446c1cd45a6bf1c4b6cf1b1d33e9a2a5ad9751..0000000000000000000000000000000000000000 --- a/src/doc/_apidoc/newcrawler.stores.rst +++ /dev/null @@ -1,7 +0,0 @@ -newcrawler.stores module -======================== - -.. 
automodule:: newcrawler.stores - :members: - :undoc-members: - :show-inheritance: diff --git a/src/doc/_apidoc/newcrawler.structure_elements.rst b/src/doc/_apidoc/newcrawler.structure_elements.rst deleted file mode 100644 index 4613e1d58b0ef9c7cc38096aa25270f469836ce5..0000000000000000000000000000000000000000 --- a/src/doc/_apidoc/newcrawler.structure_elements.rst +++ /dev/null @@ -1,7 +0,0 @@ -newcrawler.structure\_elements module -===================================== - -.. automodule:: newcrawler.structure_elements - :members: - :undoc-members: - :show-inheritance: diff --git a/src/doc/_apidoc/newcrawler.utils.rst b/src/doc/_apidoc/newcrawler.utils.rst deleted file mode 100644 index 4df55a234fd85072068e41d1ce7bb3b17fd1a698..0000000000000000000000000000000000000000 --- a/src/doc/_apidoc/newcrawler.utils.rst +++ /dev/null @@ -1,7 +0,0 @@ -newcrawler.utils module -======================= - -.. automodule:: newcrawler.utils - :members: - :undoc-members: - :show-inheritance: diff --git a/src/doc/cfood.rst b/src/doc/cfood.rst new file mode 100644 index 0000000000000000000000000000000000000000..677cadc55709c6c25d16ff547b311102ee78699a --- /dev/null +++ b/src/doc/cfood.rst @@ -0,0 +1,149 @@ +CFood-Definition +================ + +The crawler specification is called CFood-definition. It is stored inside a yaml file - more precisely, inside one or two yaml documents within that file. + +The specification consists of three separate parts: +#. Metadata and macro definitions +#. Custom converter registrations +#. The converter tree specification + +In the simplest case, there is just one yaml file with just a single document including at least +the converter tree specification (see :ref:`example 1<example_1>`). Additionally, the custom converter part may also be included in +this single document (for historical reasons, see :ref:`example 2<example_2>`), but it is recommended to include it in a separate +document together with the metadata and :doc:`macro<macros>` definitions (see :ref:`below<example_4>`). + +If metadata and macro definitions are provided, there **must** be a second document preceding the +converter tree specification, including these definitions. + +Examples +++++++++ + +A single document with a converter tree specification: + +.. _example_1: +.. code-block:: yaml + + extroot: + type: Directory + match: ^extroot$ + subtree: + DataAnalysis: + type: Directory + match: DataAnalysis + # (...) + + +A single document with a converter tree specification, but also including a custom converters section: + +.. _example_2: +.. code-block:: yaml + + Converters: + CustomConverter_1: + package: mypackage.converters + converter: CustomConverter1 + CustomConverter_2: + package: mypackage.converters + converter: CustomConverter2 + + extroot: + type: Directory + match: ^extroot$ + subtree: + DataAnalysis: + type: Directory + match: DataAnalysis + # (...) + + + +A yaml multi-document, defining metadata and some macros in the first document and declaring +two custom converters in the second document (**not recommended**, see the recommended version :ref:`below<example_4>`). Please note that two separate yaml documents can be defined using the ``---`` syntax: + + +.. _example_3: +.. code-block:: yaml + + --- + metadata: + name: Datascience CFood + description: CFood for data from the local data science work group + macros: + - !defmacro + name: SimulationDatasetFile + params: + match: null + recordtype: null + nodename: null + definition: + # (...) 
+ --- + Converters: + CustomConverter_1: + package: mypackage.converters + converter: CustomConverter1 + CustomConverter_2: + package: mypackage.converters + converter: CustomConverter2 + + extroot: + type: Directory + match: ^extroot$ + subtree: + DataAnalysis: + type: Directory + match: DataAnalysis + # (...) + + + +The **recommended way** of defining metadata, custom converters, macros and the main cfood specification is shown in the following code example: + + +.. _example_4: +.. code-block:: yaml + + --- + metadata: + name: Datascience CFood + description: CFood for data from the local data science work group + macros: + - !defmacro + name: SimulationDatasetFile + params: + match: null + recordtype: null + nodename: null + definition: + # (...) + Converters: + CustomConverter_1: + package: mypackage.converters + converter: CustomConverter1 + CustomConverter_2: + package: mypackage.converters + converter: CustomConverter2 + --- + extroot: + type: Directory + match: ^extroot$ + subtree: + DataAnalysis: + type: Directory + match: DataAnalysis + # (...) + + +List Mode +--------- + +When specifying the values of properties, two special characters can be used to automatically +create lists or multi properties instead of single values: + +.. code-block:: yaml + + Experiment1: + Measurement: +Measurement <- Element in List (list is cleared before run) + *Measurement <- Multi Property (properties are removed before run) + Measurement <- Overwrite diff --git a/src/doc/concepts.rst b/src/doc/concepts.rst new file mode 100644 index 0000000000000000000000000000000000000000..c0f21cbaa322caddabed8e045f7b6fc4253d2959 --- /dev/null +++ b/src/doc/concepts.rst @@ -0,0 +1,119 @@ +Concepts +)))))))) + +Structure Elements +++++++++++++++++++ + +This hierarchical structure is assumed to be constituted of a tree of +StructureElements. The tree is created on the fly by so called Converters which +are defined in a yaml file. The tree of StructureElements is a model +of the existing data (for example, a tree of Python file objects +(StructureElements) could represent a file tree that exists on some file server). + +Relevant sources in: +src/structure_elements.py + +Converters +++++++++++ + +Converters treat StructureElements and thereby create the StructureElements that +are the children of the treated StructureElement. Converters therefore create +the above named tree. The definition of a Converter also contains what +Converters shall be used to treat the generated child-StructureElements. The +definition is therefore a tree itself. + +See :doc:`converters<converters>` for details. + + + +Relevant sources in: +src/converters.py + + + +Identifiables ++++++++++++++ + +Relevant sources in: +src/identifiable_adapters.py + +The Crawler ++++++++++++ + +The crawler can be considered the main program doing the synchronization in basically two steps: +#. Based on a yaml specification, scan the file system (or other sources) and create a set of CaosDB Entities that are supposed to be inserted or updated in a CaosDB instance. +#. Compare the current state of the CaosDB instance with the set of CaosDB Entities created in step 1, taking into account the :ref:`registered identifiables<Identifiables>`. Insert or update entities accordingly. + +Relevant sources in: +src/crawl.py + + + +Special Cases +============= + +Variable Precedence ++++++++++++++++++++ + +Let's assume the following situation + +.. 
code-block:: yaml + + description: + type: DictTextElement + match_value: (?P<description>.*) + match_name: description + + +Making use of the $description variable could refer to two different variables created here: +1. The structure element path. +2. The value of the matched expression. + +The matched expression does take precedence over the structure element path and shadows it. + +If you want to be able to use the structure element path, make sure to give unique names +to the variables, like: + +.. code-block:: yaml + + description_text_block: + type: DictTextElement + match_value: (?P<description>.*) + match_name: description + + +Scopes +======== + +Example: + +.. code-block:: yaml + + DicomFile: + type: SimpleDicomFile + match: (?P<filename>.*)\.dicom + records: + DicomRecord: + name: $filename + subtree: # header of dicom file + PatientID: + type: DicomHeaderElement + match_name: PatientName + match_value: (?P<patient>.*) + records: + Patient: + name: $patient + dicom_name: $filename # $filename is in same scope! + ExperimentFile: + type: MarkdownFile + match: ^readme.md$ + records: + Experiment: + dicom_name: $filename # does NOT work, because $filename is out of scope! + + +.. TODO: can variables be used within regexp? + + +File Objects +============ diff --git a/src/doc/conf.py b/src/doc/conf.py index fb37cdd96c440300741aeb49e90caffe4370f5d7..30ce670eb8685e9701eeeb59bf22451a21fb16b9 100644 --- a/src/doc/conf.py +++ b/src/doc/conf.py @@ -53,6 +53,7 @@ extensions = [ 'sphinx.ext.autosectionlabel', 'sphinx.ext.intersphinx', 'sphinx.ext.napoleon', # For Google style docstrings + "recommonmark", # For markdown files. "sphinx_rtd_theme", ] @@ -61,7 +62,7 @@ templates_path = ['_templates'] # The suffix(es) of source filenames. # You can specify multiple suffix as a list of string: -source_suffix = ['.rst'] +source_suffix = ['.rst', '.md'] # The master toctree document. master_doc = 'index' @@ -71,7 +72,7 @@ master_doc = 'index' # # This is also used if you do content translation via gettext catalogs. # Usually you set "language" from the command line for these cases. -language = None +language = "en" # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. @@ -99,7 +100,7 @@ html_theme = "sphinx_rtd_theme" # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. They are copied after the builtin static files, # so a file named "default.css" will overwrite the builtin "default.css". -html_static_path = ['_static'] +html_static_path = [] # ['_static'] # Custom sidebar templates, must be a dictionary that maps document names # to template names. diff --git a/src/doc/converters.rst b/src/doc/converters.rst new file mode 100644 index 0000000000000000000000000000000000000000..7ec93535ec41dc211e2fa7ee194b2ecbe1a659fb --- /dev/null +++ b/src/doc/converters.rst @@ -0,0 +1,309 @@ +Converters +)))))))))) + +Converters treat StructureElements and thereby create the StructureElements that +are the children of the treated StructureElement. Converters therefore create +the tree of structure elements. The definition of a Converter also contains what +Converters shall be used to treat the generated child-StructureElements. The +definition is therefore a tree itself. + +Each StructureElement in the tree has a set of data values, i.e. a dictionary of +key-value pairs. +Some of those values are set due to the kind of StructureElement. 
For example, +a file could have the file name as such a key-value pair: 'filename': <sth>. +Converters may define additional functions that create further values. For +example, a regular expression could be used to get a date from a file name. + + + + +A converter is defined via a yaml file or a part of one. The definition states +what kind of StructureElement it treats (typically one). +Also, it defines how children of the current StructureElement are +created and what Converters shall be used to treat those. + +The yaml definition looks like the following: + +TODO: outdated, see cfood-schema.yml + +.. code-block:: yaml + + <NodeName>: + type: <ConverterName> + match: ".*" + records: + Experiment1: + parents: + - Experiment + - Blablabla + date: $DATUM + (...) + Experiment2: + parents: + - Experiment + subtree: + (...) + +The **<NodeName>** is a description of what it represents (e.g. +'experiment-folder') and is used as an identifier. + +**<type>** selects the converter that is going to be matched against the current structure +element. If the structure element matches (this is a combination of a typecheck and a detailed +match, see :py:class:`~caoscrawler.converters.Converter` for details) the converter is used +to generate records (see :py:meth:`~caoscrawler.converters.Converter.create_records`) and to possibly process a subtree, as defined by the function :func:`caoscrawler.converters.create_children`. + +**records** is a dict of definitions that define the semantic structure +(see details below). + +**subtree** contains a list of Converter definitions that look like the one +described here. + + +Standard Converters ++++++++++++++++++++ + +Directory Converter +=================== + +Simple File Converter +===================== + +Markdown File Converter +======================= + +Dict Converter +============== + +Typical Subtree converters +-------------------------- + +DictBooleanElementConverter +DictFloatElementConverter +DictTextElementConverter +DictIntegerElementConverter +DictListElementConverter +DictDictElementConverter + +YAMLFileConverter +================= + +A specialized Dict Converter for yaml files: Yaml files are opened and the contents are +converted into dictionaries that can be further converted using the typical subtree converters +of the dict converter. + +**WARNING**: Currently unfinished implementation. + +JSONFileConverter +================= + + + +TextElementConverter +==================== + +TableConverter +============== + +A generic converter (abstract) for files containing tables. +Currently, there are two specialized implementations for xlsx-files and csv-files. + +All table converters generate a subtree that can be converted with DictDictElementConverters: +For each row in the table a DictDictElement (structure element) is generated. The key of the +element is the row number. The value of the element is a dict containing the mapping of +column names to values of the respective cell. + +Example: + +.. code-block:: yaml + + subtree: + TABLE: + type: CSVTableConverter + match: ^test_table.csv$ + records: + (...) # Records edited for the whole table file + subtree: + ROW: + type: DictDictElement + match_name: .* + match_value: .* + records: + (...) # Records edited for each row + subtree: + COLUMN: + type: DictFloatElement + match_name: measurement # Name of the column in the table file + match_value: (?P<column_value>.*) + records: + (...) 
# Records edited for each cell + + +XLSXTableConverter +================== + +CSVTableConverter +================= + +Custom Converters ++++++++++++++++++ + +It was previously mentioned that it is possible to create custom converters. +These custom converters can be used to integrate arbitrary data extraction and ETL capabilities +into the caosdb-crawler and make these extensions available to any yaml specification. + +The basic syntax for adding a custom converter to a yaml cfood definition file is: + +.. code-block:: yaml + + Converters: + <NameOfTheConverterInYamlFile>: + package: <python>.<module>.<name> + converter: <PythonClassName> + +The Converters section can be put into either the first or the second document of the cfood yaml file. +It can also be part of a single-document yaml cfood file. Please refer to :doc:`the cfood documentation<cfood>` for more details. + +Details: + +- **<NameOfTheConverterInYamlFile>**: This is the name of the converter as it is going to be used in the present yaml file. +- **<python>.<module>.<name>**: The name of the module where the converter class resides. +- **<PythonClassName>**: Within this specified module there must be a class inheriting from base class :py:class:`caoscrawler.converters.Converter`. + +The following methods are abstract and need to be overwritten by your custom converter to make it work: + +- :py:meth:`~caoscrawler.converters.Converter.create_children` +- :py:meth:`~caoscrawler.converters.Converter.match` +- :py:meth:`~caoscrawler.converters.Converter.typecheck` + + +Example +======= + +In the following, we will explain the process of adding a custom converter to a yaml file using +a SourceResolver that is able to attach a source element to another entity. + +**Note**: This example might become a standard crawler soon, as part of the scifolder specification. See https://doi.org/10.3390/data5020043 for details. In this documentation example we will, therefore, add it to a package called "scifolder". + +First we will create our package and module structure, which might be: + +.. code-block:: + + scifolder_package/ + README.md + setup.cfg + setup.py + Makefile + tox.ini + src/ + scifolder/ + __init__.py + converters/ + __init__.py + sources.py # <- the actual file containing + # the converter class + doc/ + unittests/ + +Now we need to create a class called "SourceResolver" in the file "sources.py". In this - more advanced - example, we will not inherit our converter directly from :py:class:`~caoscrawler.converters.Converter`, but use :py:class:`~caoscrawler.converters.TextElementConverter`. The latter already implements :py:meth:`~caoscrawler.converters.Converter.match` and :py:meth:`~caoscrawler.converters.Converter.typecheck`, so only an implementation for :py:meth:`~caoscrawler.converters.Converter.create_children` has to be provided by us. +Furthermore, we will customize the method :py:meth:`~caoscrawler.converters.Converter.create_records` that allows us to specify a more complex record generation procedure than provided in the standard implementation. One specific limitation of the standard implementation is that only a fixed +number of records can be generated by the yaml definition. So for any applications - like here - that require an arbitrary number of records to be created, a customized implementation of :py:meth:`~caoscrawler.converters.Converter.create_records` is recommended. 
+In this context it is recommended to make use of the function :func:`caoscrawler.converters.create_records` that implements the creation of record objects from Python dictionaries with the same structure +as would be given using a yaml definition. + +.. code-block:: python + + import re + from caoscrawler.stores import GeneralStore, RecordStore + from caoscrawler.converters import TextElementConverter, create_records + from caoscrawler.structure_elements import StructureElement, TextElement + + + class SourceResolver(TextElementConverter): + """ + This resolver uses a source list element (e.g. from the markdown readme file) + to link sources correctly. + """ + + def __init__(self, definition: dict, name: str, + converter_registry: dict): + """ + Initialize a new source resolver. + """ + super().__init__(definition, name, converter_registry) + + def create_children(self, generalStore: GeneralStore, + element: StructureElement): + + # The source resolver does not create children: + + return [] + + def create_records(self, values: GeneralStore, + records: RecordStore, + element: StructureElement, + file_path_prefix): + if not isinstance(element, TextElement): + raise RuntimeError() + + # This function must return a list containing tuples, each one for a modified + # property: (name_of_entity, name_of_property) + keys_modified = [] + + # This is the name of the entity where the source is going to be attached: + attach_to_scientific_activity = self.definition["scientific_activity"] + rec = records[attach_to_scientific_activity] + + # The "source" is a path to a source project, so it should have the form: + # /<Category>/<project>/<scientific_activity>/ + # obtain this information from the structure element: + val = element.value + regexp = (r'/(?P<category>(SimulationData)|(ExperimentalData)|(DataAnalysis))' + '/(?P<project_date>.*?)_(?P<project_identifier>.*)' + '/(?P<date>[0-9]{4,4}-[0-9]{2,2}-[0-9]{2,2})(_(?P<identifier>.*))?/') + + res = re.match(regexp, val) + if res is None: + raise RuntimeError("Source cannot be parsed correctly.") + + # Mapping of categories on the file system to corresponding record types in CaosDB: + cat_map = { + "SimulationData": "Simulation", + "ExperimentalData": "Experiment", + "DataAnalysis": "DataAnalysis"} + linkrt = cat_map[res.group("category")] + + keys_modified.extend(create_records(values, records, { + "Project": { + "date": res.group("project_date"), + "identifier": res.group("project_identifier"), + }, + linkrt: { + "date": res.group("date"), + "identifier": res.group("identifier"), + "project": "$Project" + }, + attach_to_scientific_activity: { + "sources": "+$" + linkrt + }}, file_path_prefix)) + + # Process the records section of the yaml definition: + keys_modified.extend( + super().create_records(values, records, element, file_path_prefix)) + + # The create_records function must return the modified keys to make it compatible + # to the crawler functions: + return keys_modified + + +If the recommended (Python) package structure is used, the package containing the converter +definition can just be installed using `pip install .` or `pip install -e .` from the +`scifolder_package` directory. + +The following yaml block will register the converter in a yaml file: + +.. 
code-block:: yaml + + Converters: + SourceResolver: + package: scifolder.converters.sources + converter: SourceResolver diff --git a/src/doc/index.rst b/src/doc/index.rst index f11d73b58a3216b1d735d6565650148c150ebb68..724bcc543dd1cf0b9af451c487b1b3aab7fa95ca 100644 --- a/src/doc/index.rst +++ b/src/doc/index.rst @@ -1,8 +1,23 @@ Crawler 2.0 Documentation ========================= -Introduction ------------- + +.. toctree:: + :maxdepth: 2 + :caption: Contents: + :hidden: + + Getting started<README_SETUP> + Concepts<concepts> + Converters<converters> + CFoods (Crawler Definitions)<cfood> + Macros<macros> + Tutorials<tutorials/index> + API documentation<_apidoc/modules> + + + +This is the documentation for the crawler (previously known as crawler 2.0) for CaosDB, ``caosdb-crawler``. The crawler is the main data integration tool for CaosDB. Its task is to automatically synchronize data found on file systems or in other @@ -15,172 +30,15 @@ The hierarchical structure can be for example a file tree. However, it can also be something different, like the contents of a json file or a file tree with json files. -Concepts --------- - -Structure Elements -++++++++++++++++++ - -This hierarchical structure is assumed to be consituted of a tree of -StructureElements. The tree is created on the fly by so called Converters which -are defined in a yaml file. The tree of StructureElements is a model -of the existing data (For example could a tree of Python file objects -(StructureElements) represent a file tree that exists on some file server). - -Relevant sources in: -src/structure_elements.py - -Converters -++++++++++ - -Converters treat StructureElements and thereby create the StructureElement that -are the children of the treated StructureElement. Converters therefore create -the above named tree. The definition of a Converter also contains what -Converters shall be used to treat the generated child-StructureElements. The -definition is there a tree itself. (Question: Should there be global Converters -that are always checked when treating a StructureElement? Should Converters be -associated with generated child-StructureElements? Currently, all children are -created and checked against all Converters. It could be that one would like to -check file-StructureElements against one set of Converters and -directory-StructureElements against another) - -Each StructureElement in the tree has a set of data values, i.e a dictionary of -key value pairs. -Some of those values are set due to the kind of StructureElement. For example, -a file could have the file name as such a key value pair: 'filename': <sth>. -Converters may define additional functions that create further values. For -example, a regular expresion could be used to get a date from a file name. - - - - -A converter is defined via a yml file or part of it. The definition states -what kind of StructureElement it treats (typically one). -Also, it defines how children of the current StructureElement are -created and what Converters shall be used to treat those. 
- -The yaml definition looks like the following: - -TODO: outdated, see cfood-schema.yml - -converter-name: - type: <StructureElement Type> - match: ".*" - records: - Experiment1: - parents: - - Experiment - - Blablabla - date: $DATUM - <...> - Experiment2: - parents: - - Experiment - valuegenerators: - datepattern: - <...> - childrengenerators: - create_children_from_directory: - sort-by-date: true - subtree: - - -records: - Measurement: <- wird automatisch ein value im valueStore - run_number: 25 - Experiment1: - Measurement: +Measurement <- Element in List (list is cleared before run) - *Measurement <- Multi Property (properties are removed before run) - Measurement <- Overwrite - -UPDATE-Stage prüft ob es z.B. Gleichheit zwischen Listen gibt (die dadurch definiert sein -kann, dass alle Elemente vorhanden, aber nicht zwingend in der richtigen Reihenfolge sind) -evtl. brauchen wir das nicht, weil crawler eh schon deterministisch ist. - -The converter-name is a description of what it represents (e.g. -'experiment-folder') and is used as identifier. - -The type restricts what kind of StructureElements are treated. -The match is by default a regular expression, that is matche against the -name of StructureElements. Discussion: StructureElements might not have a -name (e.g. a dict) or should a name be created artificially if necessary -(e.g. "root-dict")? It might make sense to allow keywords like "always" and -other kinds of checks. For example a dictionary could be checked against a -json-schema definition. - -recordtypes is a list of definitions that define the semantic structure -(see details below). - -valuegenerators allow to provide additional functionality that creates -data values in addition to the ones given by default via the -StructureElement. This can be for example a match group of a regular -expression applied to the filename. -It should be possible to access the values of parent nodes. For example, -the name of a parent node could be accessed with $converter-name.name. -Discussion: This can introduce conflicts, if the key <converver-name> -already exists. An alternative would be to identify those lookups. E.g. -$$converter-name.name (2x$). - -childrengenerators denotes how StructureElements shall be created that are -children of the current one. - -subtree contains a list of Converter defnitions that look like the one -described here. - -those keywords should be allowed but not required. I.e. if no -valuegenerators shall be defined, the keyword may be omitted. - - -Relevant sources in: -src/converters.py - -Identifiables -+++++++++++++ - -Relevant sources in: -src/identifiable_adapters.py - -The Crawler -+++++++++++ - -The crawler can be considered the main program doing the synchronization in basically two steps: -1. Based on a yaml-specification scan the file system (or other sources) and create a set - of CaosDB Entities that are supposed to be inserted or updated in a CaosDB instance. -2. Compare the current state of the CaosDB instance with the set of CaosDB Entities created in - step 1, taking into account the :ref:`registered identifiables<Identifiables>`. Insert or - update entites accordingly. - -Relevant sources in: -src/crawl.py - - - -Special Cases -============= - -Variable Precedence -++++++++++++ - -Let's assume the following situation - -.. 
code-block:: yaml - description: - type: DictTextElement - match_value: (?P<description>.*) - match_name: description +This documentation helps you to :doc:`get started<README_SETUP>`, explains the most important +:doc:`concepts<concepts>` and offers a range of :doc:`tutorials<tutorials/index>`. -Making use of the $description variable could refer to two different variables created here: -1. The structure element path. -2. The value of the matched expression. +Indices and tables +================== -The matched expression does take precedence over the structure element path and shadows it. +* :ref:`genindex` +* :ref:`modindex` +* :ref:`search` -Make sure, that if you want to be able to use the structure element path, to give unique names -to the variables like: -.. code-block:: yaml - description_text_block: - type: DictTextElement - match_value: (?P<description>.*) - match_name: description diff --git a/src/doc/macros.rst b/src/doc/macros.rst new file mode 100644 index 0000000000000000000000000000000000000000..3d995c1fbc67b155a6df606ac2f84a0cec26d1a5 --- /dev/null +++ b/src/doc/macros.rst @@ -0,0 +1,88 @@ +Macros +------ + +Macros greatly simplify writing complex :doc:`CFoods<cfood>`. Consider the following common example: + +.. _example_files: +.. code-block:: yaml + + ExperimentalData: + type: Directory + match: ExperimentalData + subtree: + README: + type: SimpleFile + match: ^README.md$ + records: + ReadmeFile: + parents: + - MarkdownFile + role: File + path: $README + file: $README + +This example just inserts a file called ``README.md`` contained in the folder ``ExperimentalData/`` into CaosDB, assigns the parent (RecordType) ``MarkdownFile``, and allows this entity to be referenced later within the cfood. As file objects are created in the cfood specification using the ``records`` section with the special role ``File``, defining and using many files can become very cumbersome and make the cfood file difficult to read. + +The same version using cfood macros could be defined as follows: + +.. _example_files_2: +.. code-block:: yaml + + --- + metadata: + macros: + - !defmacro + name: MarkdownFile + params: + name: null + filename: null + definition: + ${name}_filename: + type: SimpleFile + match: $filename + records: + $name: + parents: + - MarkdownFile + role: File + path: ${name}_filename + file: ${name}_filename + --- + ExperimentalData: + type: Directory + match: ExperimentalData + subtree: !macro + MarkdownFile: + - name: README + filename: ^README.md$ + + + + + +Complex Example +=============== + +.. _example_complex: +.. 
+
+
+Complex Example
+===============
+
+.. _example_1:
+.. code-block:: yaml
+
+   macros:
+   - !defmacro
+     name: SimulationDatasetFile
+     params:
+       match: null
+       recordtype: null
+       nodename: null
+     definition:
+       $nodename:
+         match: $match
+         type: SimpleFile
+         records:
+           File:
+             parents:
+             - $recordtype
+             role: File
+             path: $$$nodename
+             file: $$$nodename
+           Simulation:
+             $recordtype: +$File
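+The macro defined in this example could then be invoked like this (a hypothetical call;
+the parameter values are made up for illustration and are not part of this changeset):
+
+.. code-block:: yaml
+
+   SimulationData:
+     type: Directory
+     match: SimulationData
+     subtree: !macro
+       SimulationDatasetFile:
+       - match: .*\.hdf5$
+         recordtype: HDF5File
+         nodename: dataset_file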
+ """ + definition = None + with NamedTemporaryFile() as f: + f.write(txt.encode()) + f.flush() + c = Crawler() + definition = c.load_definition(f.name) + return definition + + +def test_macros(register_macros, macro_store_reset): + dat = yaml.load(""" +defs: +- !defmacro + name: test + params: + a: 2 + b: bla + c: $variable + definition: + expanded_$b: + blubb: ok$a + $b: $c + +testnode: + obl: !macro + test: + a: 4 + b: yea +""", Loader=yaml.SafeLoader) + assert dat["testnode"]["obl"]["expanded_yea"]["blubb"] == "ok4" + assert dat["testnode"]["obl"]["expanded_yea"]["yea"] == "$variable" + assert "expanded_bla" not in dat["testnode"]["obl"] + assert "bla" not in dat["testnode"]["obl"]["expanded_yea"] + + +def test_macro_list_replacment(register_macros, macro_store_reset): + dat = yaml.load(""" +defs: +- !defmacro + name: test + params: + a: 2 + b: bla + c: $variable + definition: + expanded_$b: + blubb: + - ok$a + - $b: $c + +testnode: + obl: !macro + test: + a: 4 + b: yea +""", Loader=yaml.SafeLoader) + assert isinstance(dat["testnode"]["obl"]["expanded_yea"]["blubb"], list) + assert len(dat["testnode"]["obl"]["expanded_yea"]["blubb"]) == 2 + assert dat["testnode"]["obl"]["expanded_yea"]["blubb"][0] == "ok4" + assert dat["testnode"]["obl"]["expanded_yea"]["blubb"][1]["yea"] == "$variable" + + +def test_multi_macros(register_macros, macro_store_reset): + dat = yaml.load(""" +defs: +- !defmacro + name: test_one + params: {} + definition: + replaced1: ok +- !defmacro + name: test_two + params: {} + definition: + replaced2: ok + replaced3: ok + +testnode: + obl: !macro + test_one: + test_two: +""", Loader=yaml.SafeLoader) + assert dat["testnode"]["obl"]["replaced1"] == "ok" + assert dat["testnode"]["obl"]["replaced2"] == "ok" + assert dat["testnode"]["obl"]["replaced3"] == "ok" + + +def test_multi_macros_toplevel(register_macros, macro_store_reset): + """ + See: https://gitlab.indiscale.com/caosdb/src/caosdb-crawler/-/issues/23 + """ + dat_loader = list(yaml.safe_load_all(""" +--- +metadata: + macros: + - !defmacro + name: test_one + params: {} + definition: + replaced1: ok + - !defmacro + name: test_two + params: {} + definition: + replaced2: ok + replaced3: ok +--- +testnode: !macro + test_one: + test_two: +""")) + assert len(dat_loader) == 2 + dat = dat_loader[1] + assert dat["testnode"]["replaced1"] == "ok" + assert dat["testnode"]["replaced2"] == "ok" + assert dat["testnode"]["replaced3"] == "ok" + + +def test_load_definition(register_macros, macro_store_reset): + txt = """ +extroot: + type: Directory + match: extroot + subtree: + SimulationData: + type: Directory + match: SimulationData + """ + # Check whether simple cfoods can be loaded: + cfood = _temp_file_load(txt) + assert cfood["extroot"]["subtree"]["SimulationData"]["match"] == "SimulationData" + + cfood = _temp_file_load(""" +--- +metadata: + macros: + - !defmacro + name: test_one + params: {} + definition: + replaced1: ok + - !defmacro + name: test_two + params: + match_name: null + definition: + type: Directory + match: $match_name +--- +extroot: + type: Directory + match: extroot + subtree: + SimulationData: + type: Directory + match: SimulationData +extroot2: !macro # test top level macro + test_one: +extroot3: + subtree: + SimulationData: !macro + test_two: + match_name: SimulationData + """) + assert cfood["extroot"]["subtree"]["SimulationData"]["match"] == "SimulationData" + assert cfood["extroot2"]["replaced1"] == "ok" + assert cfood["extroot3"]["subtree"]["SimulationData"]["match"] == "SimulationData" + + 
+@pytest.mark.xfail
+def test_replace_arbitrary_objects(register_macros, macro_store_reset):
+    """
+    See: https://gitlab.indiscale.com/caosdb/src/caosdb-crawler/-/issues/24
+    """
+    dat = yaml.load("""
+defs:
+- !defmacro
+  name: test
+  params:
+    b: 25
+    testvar_list:
+    - a
+    - $b
+    testvar_dict:
+      t1: a
+      t2: $b
+  definition:
+    replaced1:
+      $b: ok
+      c: $testvar_dict
+      d: $testvar_list
+
+testnode:
+  obl: !macro
+    test:
+""", Loader=yaml.SafeLoader)
+    print(yaml.dump(dat))
+    assert dat["testnode"]["obl"]["replaced1"]["c"]["t1"] == "a"
+    assert dat["testnode"]["obl"]["replaced1"]["c"]["t2"] == "25"
+    assert dat["testnode"]["obl"]["replaced1"]["d"][0] == "a"
+    assert dat["testnode"]["obl"]["replaced1"]["d"][1] == "25"
+
+
+def test_circular_macro_definition(register_macros, macro_store_reset):
+    """Test the (ab-)use of macros to create an infinite loop."""
+    cfood = _temp_file_load("""
+---
+metadata:
+  macros:
+  - !defmacro
+    name: test_one
+    params: {}
+    definition: !macro
+      test_two:
+  - !defmacro
+    name: test_two
+    params: {}
+    definition: !macro
+      test_one:
+  - !defmacro
+    name: test_three
+    params: {}
+    definition: !macro
+      test_two:
+  - !defmacro
+    name: test_four
+    params: {}
+    definition: !macro
+      test_four:
+---
+extroot: !macro
+  test_one:
+extroot2: !macro
+  test_three:
+extroot3: !macro
+  test_four:
+""")
+    # Macros can be used inside macro definitions, but there are no circles;
+    # expansion stops at the first recursive reference.
+    assert "test_one" not in cfood["extroot"]
+    assert cfood["extroot"]["test_two"] is None
+    assert "test_three" not in cfood["extroot2"]
+    assert "test_one" not in cfood["extroot2"]
+    assert cfood["extroot2"]["test_two"] is None
+    # No recursion:
+    assert cfood["extroot3"]["test_four"] is None
diff --git a/unittests/test_scalars_cfood.py b/unittests/test_scalars_cfood.py
new file mode 100644
index 0000000000000000000000000000000000000000..1bf8f0b7d67f00f2018b5b68424d6b9cc17602eb
--- /dev/null
+++ b/unittests/test_scalars_cfood.py
@@ -0,0 +1,57 @@
+#!/usr/bin/env python3
+# Tests for:
+# https://gitlab.com/caosdb/caosdb-crawler/-/issues/9
+# A. Schlemmer, 06/2021
+
+import pytest
+
+# The main function that is affected by this issue:
+from caoscrawler.converters import handle_value
+from caoscrawler.crawl import Crawler
+# We need the store for the above function:
+from caoscrawler.stores import GeneralStore
+
+from test_tool import dircheckstr, rfp
+
+
+@pytest.fixture
+def crawler():
+    crawler = Crawler(debug=True)
+    crawler.crawl_directory(rfp("test_directories", "examples_article"),
+                            rfp("cfoods_scalar.yml"))
+    return crawler
+
+
+def test_handle_value():
+    # Note that we need this store only if we also want to test variable substitution:
+    store = GeneralStore()
+
+    # This one should work:
+    assert handle_value("bla", store) == ("bla", "single")
+
+    # These used to fail:
+    assert handle_value(4, store) == (4, "single")
+    assert handle_value(4.2, store) == (4.2, "single")
+    assert handle_value(True, store) == (True, "single")
+
+    # List test:
+    assert handle_value([4, 3, 2], store) == ([4, 3, 2], "single")
+
+
+def test_record_structure_generation(crawler):
+    subd = crawler.debug_tree[dircheckstr("DataAnalysis")]
+    assert len(subd) == 2
+    # Variables store on the DataAnalysis node of the debug tree:
+    assert len(subd[0]) == 3
+    assert "Data" in subd[0]
+    assert "DataAnalysis" in subd[0]
+    assert "RecordThatGetsParentsLater" in subd[0]
+
+    prop = subd[0]["RecordThatGetsParentsLater"].get_property("someId")
+    assert type(prop.value) == int
+    assert prop.value == 23
+
+    # Record store on the DataAnalysis node of the debug tree:
+    assert len(subd[1]) == 1
+    prop2 = subd[1]["RecordThatGetsParentsLater"].get_property("someId")
+    assert prop == prop2
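+
+
+def test_handle_value_variable_substitution():
+    # Editorial sketch, not part of the original changeset: string values that
+    # reference a variable should be substituted from the store. The variable
+    # name used here is made up for illustration.
+    store = GeneralStore()
+    store["somevalue"] = "23"
+    assert handle_value("$somevalue", store) == ("23", "single")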
diff --git a/unittests/test_tool.py b/unittests/test_tool.py
index 59573756fe61ef697976e480dd1550cb0ead0998..a0b8e675f79028b3b45aa248202658be22cfbf6f 100755
--- a/unittests/test_tool.py
+++ b/unittests/test_tool.py
@@ -33,7 +33,9 @@ def dircheckstr(*pathcomponents):
     """
     Return the debug tree identifier for a given path.
     """
-    return "caoscrawler.structure_elements.Directory: " + basename(join(*pathcomponents)) + ", " + rfp("test_directories", "examples_article", *pathcomponents)
+    return ("caoscrawler.structure_elements.Directory: " + basename(
+        join(*pathcomponents)) + ", " + rfp(
+            "test_directories", "examples_article", *pathcomponents))
 
 
 @pytest.fixture
@@ -87,7 +89,7 @@ def test_record_structure_generation(crawler):
 
     # The data analysis node creates one variable for the node itself:
     assert subd[0]["DataAnalysis"] == "examples_article/DataAnalysis"
-    assert subc[0]["DataAnalysis"] == False
+    assert subc[0]["DataAnalysis"] is False
 
     subd = crawler.debug_tree[dircheckstr(
         "DataAnalysis", "2020_climate-model-predict")]
@@ -107,9 +109,9 @@ def test_record_structure_generation(crawler):
     assert subd[0]["Project"].__class__ == db.Record
 
     assert subd[0]["DataAnalysis"] == "examples_article/DataAnalysis"
-    assert subc[0]["DataAnalysis"] == True
+    assert subc[0]["DataAnalysis"] is True
     assert subd[0]["project_dir"] == "examples_article/DataAnalysis/2020_climate-model-predict"
-    assert subc[0]["project_dir"] == False
+    assert subc[0]["project_dir"] is False
 
     # Check the copy flags for the first level in the hierarchy:
     assert len(subc[0]) == 6
@@ -201,7 +203,8 @@ def test_crawler_update_list(crawler, ident):
     assert len(ident.get_records()) != len(crawler.target_data)
 
     # Check consistency:
-    # Check whether identifiables retrieved from current identifiable store return the same results.
+    # Check whether identifiables retrieved from the current identifiable store return
+    # the same results.
 
     # take the first person in the list of records:
     for r in ident.get_records():