diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index b5bc53fed1b069f3a6f665a188aa8bdcd7252570..30a8cd8fe4c08fd3fe0f3f98aaa56b83cb623086 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -152,11 +152,11 @@ inttest: - CAOSDB_TAG=$CAOSDB_TAG docker-compose up -d # Store versions of CaosDB parts - - docker exec -u 0 -t docker_caosdb-server_1 cat /opt/caosdb/git/caosdb_pylib_commit > hash_pylib - - docker exec -u 0 -t docker_caosdb-server_1 cat /opt/caosdb/git/caosdb_webui_commit > hash_webui - - docker exec -u 0 -t docker_caosdb-server_1 cat /opt/caosdb/git/caosdb_server_commit > hash_server - - docker exec -u 0 -t docker_caosdb-server_1 cat /opt/caosdb/git/caosdb_mysqlbackend_commit > hash_mysql - - docker exec -u 0 -t docker_caosdb-server_1 cat /opt/caosdb/git/caosdb_proto_commit > hash_proto + - docker exec -u 0 -t docker-caosdb-server-1 cat /opt/caosdb/git/caosdb_pylib_commit > hash_pylib + - docker exec -u 0 -t docker-caosdb-server-1 cat /opt/caosdb/git/caosdb_webui_commit > hash_webui + - docker exec -u 0 -t docker-caosdb-server-1 cat /opt/caosdb/git/caosdb_server_commit > hash_server + - docker exec -u 0 -t docker-caosdb-server-1 cat /opt/caosdb/git/caosdb_mysqlbackend_commit > hash_mysql + - docker exec -u 0 -t docker-caosdb-server-1 cat /opt/caosdb/git/caosdb_proto_commit > hash_proto - cat hash_server - cat hash_proto - cat hash_mysql @@ -167,8 +167,8 @@ inttest: - /bin/sh ./run.sh # Save logs - - docker logs docker_caosdb-server_1 &> ../caosdb_log.txt - - docker logs docker_sqldb_1 &> ../mariadb_log.txt + - docker logs docker-caosdb-server-1 &> ../caosdb_log.txt + - docker logs docker-sqldb-1 &> ../mariadb_log.txt - cd .. 
# Stop the server diff --git a/CHANGELOG.md b/CHANGELOG.md index 2b0ec65ea3a90e1dac7fe538128113d3857a829c..e401aec7c87b06a6d656b7c26de6ca432a568668 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -13,6 +13,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 * Added new converters for tables: CSVTableConverter and XLSXTableConverter * Possibility to authorize updates as in the old crawler * Allow authorization of inserts +* Allow splitting cfoods into multiple yaml documents +* Implemented macros * Converters can now filter the list of children * You can now crawl data with name conflicts: `synchronize(unique_names=False)` diff --git a/src/caoscrawler/crawl.py b/src/caoscrawler/crawl.py index 30a34c2438b467a97a8548c355e9c87dbb175aea..ccad944776fb51fef6ed566b999340f06f838705 100644 --- a/src/caoscrawler/crawl.py +++ b/src/caoscrawler/crawl.py @@ -55,12 +55,17 @@ from caosdb.apiutils import compare_entities, merge_entities from copy import deepcopy from jsonschema import validate -logger = logging.getLogger(__name__) +from .macros import defmacro_constructor, macro_constructor +logger = logging.getLogger(__name__) SPECIAL_PROPERTIES_STRICT = ("description", "name", "id", "path") SPECIAL_PROPERTIES_NOT_STRICT = ("file", "checksum", "size") +# Register the macro functions from the submodule: +yaml.SafeLoader.add_constructor("!defmacro", defmacro_constructor) +yaml.SafeLoader.add_constructor("!macro", macro_constructor) + def check_identical(record1: db.Entity, record2: db.Entity, ignore_id=False): """ @@ -176,7 +181,8 @@ class Crawler(object): Create a debugging information tree when set to True. The debugging information tree is a variable stored in self.debug_tree. It is a dictionary mapping directory entries - to a tuple of general stores and record stores which are valid for the directory scope. + to a tuple of general stores and record stores which are valid for + the directory scope. 
Furthermore, it is stored in a second tree named self.debug_copied whether the objects in debug_tree had been copied from a higher level in the hierarchy of the structureelements. @@ -221,7 +227,16 @@ class Crawler(object): # Load the cfood from a yaml file: with open(crawler_definition_path, "r") as f: - crawler_definition = yaml.safe_load(f) + crawler_definitions = list(yaml.safe_load_all(f)) + + if len(crawler_definitions) == 1: + # Simple case, just one document: + crawler_definition = crawler_definitions[0] + elif len(crawler_definitions) == 2: + crawler_definition = crawler_definitions[1] + else: + raise RuntimeError( + "Crawler definition must not contain more than two documents.") # TODO: at this point this function can already load the cfood schema extensions # from the crawler definition and add them to the yaml schema that will be @@ -424,7 +439,8 @@ class Crawler(object): items = [items] self.run_id = uuid.uuid1() - local_converters = Crawler.initialize_converters(crawler_definition, converter_registry) + local_converters = Crawler.initialize_converters( + crawler_definition, converter_registry) # This recursive crawling procedure generates the update list: self.target_data: List[db.Record] = [] self._crawl(items, local_converters, self.generalStore, @@ -968,7 +984,8 @@ ____________________\n""".format(i + 1, len(pending_changes)) + str(el[3])) self.debug_tree[str(element)] = ( generalStore_copy.get_storage(), recordStore_copy.get_storage()) self.debug_metadata["copied"][str(element)] = ( - generalStore_copy.get_dict_copied(), recordStore_copy.get_dict_copied()) + generalStore_copy.get_dict_copied(), + recordStore_copy.get_dict_copied()) self.debug_metadata["usage"][str(element)].add( "/".join(converters_path + [converter.name])) mod_info = self.debug_metadata["provenance"] @@ -979,8 +996,9 @@ ____________________\n""".format(i + 1, len(pending_changes)) + str(el[3])) record_identifier = record_name + \ "_" + str(internal_id) 
converter.metadata["usage"].add(record_identifier) - mod_info[record_identifier][prop_name] = (structure_elements_path + [element.get_name()], - converters_path + [converter.name]) + mod_info[record_identifier][prop_name] = ( + structure_elements_path + [element.get_name()], + converters_path + [converter.name]) self._crawl(children, converter.converters, generalStore_copy, recordStore_copy, diff --git a/src/caoscrawler/macros/__init__.py b/src/caoscrawler/macros/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..0acfb1763039a3bb800bbf0e26d6940b49d045cf --- /dev/null +++ b/src/caoscrawler/macros/__init__.py @@ -0,0 +1 @@ +from .macro_yaml_object import defmacro_constructor, macro_constructor diff --git a/src/caoscrawler/macros/macro_yaml_object.py b/src/caoscrawler/macros/macro_yaml_object.py new file mode 100644 index 0000000000000000000000000000000000000000..2849986e6deb5cb2cba9e45516e6ce8e1a93dfa0 --- /dev/null +++ b/src/caoscrawler/macros/macro_yaml_object.py @@ -0,0 +1,155 @@ +#!/usr/bin/env python3 +# encoding: utf-8 +# +# ** header v3.0 +# This file is a part of the CaosDB Project. +# +# Copyright (C) 2022 Alexander Schlemmer +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <https://www.gnu.org/licenses/>. +# +# ** end header +# + +# Function to expand a macro in yaml +# A. 
# Schlemmer, 05/2022

from copy import deepcopy
from dataclasses import dataclass
from string import Template
from typing import Any, Dict


@dataclass
class MacroDefinition:
    """
    Stores a macro definition.

    name: Name of the macro
    params: variables and default values to be substituted in keys or values
    definition: A dictionary that will be substituted including parameters
    """
    name: str
    params: Dict[str, Any]
    definition: Any


# This dictionary stores the macro definitions, keyed by macro name.
# It is module-level state: defmacro_constructor adds (or overwrites) entries
# and macro_constructor looks names up here, so definitions stay registered
# until the dictionary is cleared.
macro_store: Dict[str, MacroDefinition] = dict()


def substitute(propvalue, values: dict):
    """
    Substitute ``$name`` / ``${name}`` placeholders in the string propvalue.

    The substitution is done with string.Template.safe_substitute from
    python's standard library: placeholders without an entry in values are
    left untouched and substituted values are converted with str().

    values is handed over as a mapping instead of keyword arguments, so that
    parameter names which are not strings are ignored instead of raising.
    """
    return Template(propvalue).safe_substitute(values)


def _substitute_value(value, values: Dict[str, Any]):
    """
    Return a substituted copy of a single value.

    Strings are substituted, dictionaries and lists are processed recursively
    (including lists nested in lists) and everything else is deep-copied.
    """
    if isinstance(value, str):
        return substitute(value, values)
    if isinstance(value, dict):
        return substitute_dict(value, values)
    if isinstance(value, list):
        return [_substitute_value(item, values) for item in value]
    return deepcopy(value)


def substitute_dict(sourced: Dict[str, Any], values: Dict[str, Any]):
    """
    Create a substituted copy of sourced; sourced itself is not modified.

    Variable substitution is done recursively on all string keys and on all
    values. Keys which are not strings are kept as they are. The order of the
    entries of sourced is preserved.

    If a substituted key collides with a key that was not changed by the
    substitution, the entry with the substituted key wins.
    """
    # Compute all new keys first. Renaming keys one after the other in place
    # would lose entries when the new name of one key is the old name of
    # another key.
    renamed_keys = [
        (substitute(key, values) if isinstance(key, str) else key, key)
        for key in sourced
    ]
    substituted_targets = {new for new, old in renamed_keys if new != old}

    result: Dict[str, Any] = dict()
    for new_key, old_key in renamed_keys:
        if new_key == old_key and new_key in substituted_targets:
            # A substituted key takes precedence over a literal one.
            continue
        result[new_key] = _substitute_value(sourced[old_key], values)
    return result


def defmacro_constructor(loader, node):
    """
    Function for registering macros in yaml files.

    The node must be a mapping with the keys "name" and "definition" and
    optionally "params" (default values for the macro parameters). A missing
    or empty "params" entry is stored as an empty dictionary. The macro is
    stored in macro_store under its name; an existing macro with the same
    name is replaced.

    Returns an empty dictionary, which is the value the "!defmacro" node has
    in the loaded document.

    It can be registered in PyYAML using:
    yaml.SafeLoader.add_constructor("!defmacro", defmacro_constructor)
    """
    # deep=True: nested nodes must be fully constructed before they are stored.
    value = loader.construct_mapping(node, deep=True)
    # "params:" without a value is loaded as None; treat it like no params.
    params = value.get("params") or {}
    macro = MacroDefinition(value["name"], params, value["definition"])
    macro_store[macro.name] = macro
    return {}


def _expand_macro(macro: MacroDefinition, params_setter):
    """
    Expand macro once and return the substituted copy of its definition.

    params_setter overrides the default parameters of the macro; it must be
    a dictionary or None (use the defaults only).
    """
    # Work on a copy, the defaults in macro_store must stay untouched.
    params = deepcopy(macro.params) if macro.params else {}
    if params_setter is not None:
        if not isinstance(params_setter, dict):
            raise RuntimeError("params type not supported")
        params.update(params_setter)
    return substitute_dict(macro.definition, params)


def macro_constructor(loader, node):
    """
    Function for substituting macros in yaml files.

    The node must be a mapping. For every key that is the name of a macro in
    macro_store, the value sets the macro parameters:
    - None: the macro is expanded once with its default parameters.
    - a dictionary: the macro is expanded once, the dictionary overrides the
      default parameters.
    - a list of dictionaries: the macro is expanded once per element.
    All expansions are merged into one dictionary, later entries overwrite
    earlier ones with the same key. Keys that are not the name of a macro are
    copied to the result unchanged.

    Raises RuntimeError if parameters are neither None nor a dictionary, or
    if an element of a parameter list is None.

    It can be registered in PyYAML using:
    yaml.SafeLoader.add_constructor("!macro", macro_constructor)
    """
    res = dict()
    value = loader.construct_mapping(node, deep=True)
    for name, params_setter in value.items():
        if name not in macro_store:
            # If there is no macro with that name, just keep that node:
            res[name] = params_setter
            continue
        macro = macro_store[name]
        if isinstance(params_setter, list):
            # Run the expansion for every element of the list:
            for el in params_setter:
                if el is None:
                    raise RuntimeError("params type must not be None")
                res.update(_expand_macro(macro, el))
        else:
            # This is just a single macro:
            res.update(_expand_macro(macro, params_setter))

    return res

# Patch text kept verbatim (header of the next file in this diff):
# diff --git a/src/doc/converters.rst b/src/doc/converters.rst index a30a7d92850f90be14f82a4e563fb56df5fcde88..7ec93535ec41dc211e2fa7ee194b2ecbe1a659fb 100644 --- a/src/doc/converters.rst +++ b/src/doc/converters.rst @@ -185,6 +185,7 @@ a SourceResolver that is able to attach a source element to another entity.
**Note**: This example might become a standard crawler soon, as part of the scifolder specification. See https://doi.org/10.3390/data5020043 for details. In this documentation example we will, therefore, add it to a package called "scifolder". First we will create our package and module structure, which might be: + .. code-block:: scifolder_package/ diff --git a/unittests/test_macros.py b/unittests/test_macros.py new file mode 100644 index 0000000000000000000000000000000000000000..98fdf6423182d665e61b08aba03ef5e3346928f2 --- /dev/null +++ b/unittests/test_macros.py @@ -0,0 +1,295 @@ +#!/usr/bin/env python3 +# encoding: utf-8 +# +# ** header v3.0 +# This file is a part of the CaosDB Project. +# +# Copyright (C) 2022 Alexander Schlemmer +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <https://www.gnu.org/licenses/>. 
+# +# ** end header +# + +from caoscrawler.macros import defmacro_constructor, macro_constructor +from caoscrawler.macros.macro_yaml_object import macro_store +from caoscrawler.crawl import Crawler + +from tempfile import NamedTemporaryFile + +import yaml +import pytest + + +@pytest.fixture +def register_macros(): + yaml.SafeLoader.add_constructor("!defmacro", defmacro_constructor) + yaml.SafeLoader.add_constructor("!macro", macro_constructor) + + +@pytest.fixture +def macro_store_reset(): + macro_store.clear() + + +def _temp_file_load(txt: str): + """ + Create a temporary file with txt and load the crawler + definition using load_definition from Crawler. + """ + definition = None + with NamedTemporaryFile() as f: + f.write(txt.encode()) + f.flush() + c = Crawler() + definition = c.load_definition(f.name) + return definition + + +def test_macros(register_macros, macro_store_reset): + dat = yaml.load(""" +defs: +- !defmacro + name: test + params: + a: 2 + b: bla + c: $variable + definition: + expanded_$b: + blubb: ok$a + $b: $c + +testnode: + obl: !macro + test: + a: 4 + b: yea +""", Loader=yaml.SafeLoader) + assert dat["testnode"]["obl"]["expanded_yea"]["blubb"] == "ok4" + assert dat["testnode"]["obl"]["expanded_yea"]["yea"] == "$variable" + assert "expanded_bla" not in dat["testnode"]["obl"] + assert "bla" not in dat["testnode"]["obl"]["expanded_yea"] + + +def test_macro_list_replacment(register_macros, macro_store_reset): + dat = yaml.load(""" +defs: +- !defmacro + name: test + params: + a: 2 + b: bla + c: $variable + definition: + expanded_$b: + blubb: + - ok$a + - $b: $c + +testnode: + obl: !macro + test: + a: 4 + b: yea +""", Loader=yaml.SafeLoader) + assert isinstance(dat["testnode"]["obl"]["expanded_yea"]["blubb"], list) + assert len(dat["testnode"]["obl"]["expanded_yea"]["blubb"]) == 2 + assert dat["testnode"]["obl"]["expanded_yea"]["blubb"][0] == "ok4" + assert dat["testnode"]["obl"]["expanded_yea"]["blubb"][1]["yea"] == "$variable" + + +def 
test_multi_macros(register_macros, macro_store_reset): + dat = yaml.load(""" +defs: +- !defmacro + name: test_one + params: {} + definition: + replaced1: ok +- !defmacro + name: test_two + params: {} + definition: + replaced2: ok + replaced3: ok + +testnode: + obl: !macro + test_one: + test_two: +""", Loader=yaml.SafeLoader) + assert dat["testnode"]["obl"]["replaced1"] == "ok" + assert dat["testnode"]["obl"]["replaced2"] == "ok" + assert dat["testnode"]["obl"]["replaced3"] == "ok" + + +def test_multi_macros_toplevel(register_macros, macro_store_reset): + """ + See: https://gitlab.indiscale.com/caosdb/src/caosdb-crawler/-/issues/23 + """ + dat_loader = list(yaml.safe_load_all(""" +--- +metadata: + macros: + - !defmacro + name: test_one + params: {} + definition: + replaced1: ok + - !defmacro + name: test_two + params: {} + definition: + replaced2: ok + replaced3: ok +--- +testnode: !macro + test_one: + test_two: +""")) + assert len(dat_loader) == 2 + dat = dat_loader[1] + assert dat["testnode"]["replaced1"] == "ok" + assert dat["testnode"]["replaced2"] == "ok" + assert dat["testnode"]["replaced3"] == "ok" + + +def test_load_definition(register_macros, macro_store_reset): + txt = """ +extroot: + type: Directory + match: extroot + subtree: + SimulationData: + type: Directory + match: SimulationData + """ + # Check whether simple cfoods can be loaded: + cfood = _temp_file_load(txt) + assert cfood["extroot"]["subtree"]["SimulationData"]["match"] == "SimulationData" + + cfood = _temp_file_load(""" +--- +metadata: + macros: + - !defmacro + name: test_one + params: {} + definition: + replaced1: ok + - !defmacro + name: test_two + params: + match_name: null + definition: + type: Directory + match: $match_name +--- +extroot: + type: Directory + match: extroot + subtree: + SimulationData: + type: Directory + match: SimulationData +extroot2: !macro # test top level macro + test_one: +extroot3: + subtree: + SimulationData: !macro + test_two: + match_name: SimulationData + """) 
+ assert cfood["extroot"]["subtree"]["SimulationData"]["match"] == "SimulationData" + assert cfood["extroot2"]["replaced1"] == "ok" + assert cfood["extroot3"]["subtree"]["SimulationData"]["match"] == "SimulationData" + + +@pytest.mark.xfail +def test_replace_arbitrary_objects(register_macros, macro_store_reset): + """ + See: https://gitlab.indiscale.com/caosdb/src/caosdb-crawler/-/issues/24 + """ + dat = yaml.load(""" +defs: +- !defmacro + name: test + params: + b: 25 + testvar_list: + - a + - $b + testvar_dict: + t1: a + t2: $b + definition: + replaced1: + $b: ok + c: $testvar_dict + d: $testvar_list + +testnode: + obl: !macro + test: +""", Loader=yaml.SafeLoader) + print(yaml.dump(dat)) + assert dat["testnode"]["obl"]["replaced1"]["c"]["t1"] == "a" + assert dat["testnode"]["obl"]["replaced1"]["c"]["t2"] == "25" + assert dat["testnode"]["obl"]["replaced1"]["d"][0] == "a" + assert dat["testnode"]["obl"]["replaced1"]["d"][1] == "25" + + +def test_circular_macro_definition(register_macros, macro_store_reset): + """Test the (ab-)use of macros to create an infinite loop.""" + cfood = _temp_file_load(""" +--- +metadata: + macros: + - !defmacro + name: test_one + params: {} + definition: !macro + test_two: + - !defmacro + name: test_two + params: {} + definition: !macro + test_one: + - !defmacro + name: test_three + params: {} + definition: !macro + test_two: + - !defmacro + name: test_four + params: {} + definition: !macro + test_four: +--- +extroot: !macro + test_one: +extroot2: !macro + test_three: +extroot3: !macro + test_four: + """) + # macros in macros can be used, but there are no circles; they stop at the first one. 
+ assert "test_one" not in cfood["extroot"] + assert cfood["extroot"]["test_two"] is None + assert "test_three" not in cfood["extroot2"] + assert "test_one" not in cfood["extroot2"] + assert cfood["extroot2"]["test_two"] is None + # No recursion + assert cfood["extroot3"]["test_four"] is None diff --git a/unittests/test_tool.py b/unittests/test_tool.py index 59573756fe61ef697976e480dd1550cb0ead0998..a0b8e675f79028b3b45aa248202658be22cfbf6f 100755 --- a/unittests/test_tool.py +++ b/unittests/test_tool.py @@ -33,7 +33,9 @@ def dircheckstr(*pathcomponents): """ Return the debug tree identifier for a given path. """ - return "caoscrawler.structure_elements.Directory: " + basename(join(*pathcomponents)) + ", " + rfp("test_directories", "examples_article", *pathcomponents) + return ("caoscrawler.structure_elements.Directory: " + basename( + join(*pathcomponents)) + ", " + rfp( + "test_directories", "examples_article", *pathcomponents)) @pytest.fixture @@ -87,7 +89,7 @@ def test_record_structure_generation(crawler): # The data analysis node creates one variable for the node itself: assert subd[0]["DataAnalysis"] == "examples_article/DataAnalysis" - assert subc[0]["DataAnalysis"] == False + assert subc[0]["DataAnalysis"] is False subd = crawler.debug_tree[dircheckstr( "DataAnalysis", "2020_climate-model-predict")] @@ -107,9 +109,9 @@ def test_record_structure_generation(crawler): assert subd[0]["Project"].__class__ == db.Record assert subd[0]["DataAnalysis"] == "examples_article/DataAnalysis" - assert subc[0]["DataAnalysis"] == True + assert subc[0]["DataAnalysis"] is True assert subd[0]["project_dir"] == "examples_article/DataAnalysis/2020_climate-model-predict" - assert subc[0]["project_dir"] == False + assert subc[0]["project_dir"] is False # Check the copy flags for the first level in the hierarchy: assert len(subc[0]) == 6 @@ -201,7 +203,8 @@ def test_crawler_update_list(crawler, ident): assert len(ident.get_records()) != len(crawler.target_data) # Check consistency: 
- # Check whether identifiables retrieved from current identifiable store return the same results. + # Check whether identifiables retrieved from current identifiable store return + # the same results. # take the first person in the list of records: for r in ident.get_records():