diff --git a/.docker/Dockerfile b/.docker/Dockerfile index 1651fa08f7fb157e007cf5c4a992f548b7d411ba..e7bc28acad38aaf299d7427117510e10f57a903f 100644 --- a/.docker/Dockerfile +++ b/.docker/Dockerfile @@ -2,6 +2,8 @@ FROM debian:10 RUN apt-get update && \ apt-get install \ curl \ + libhdf5-dev \ + pkgconf \ python3 \ python3-pip \ python3-requests \ diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 2a80211839ae3db85765c99629247f06e2c6778b..c9cd5b631cea84f44c5296edf4b789d83982d074 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -58,8 +58,8 @@ test: - cd .docker - /bin/sh ./run.sh - cd .. - - docker logs docker_caosdb-server_1 &> ../caosdb_log.txt - - docker logs docker_sqldb_1 &> ../mariadb_log.txt + - docker logs docker_caosdb-server_1 &> caosdb_log.txt + - docker logs docker_sqldb_1 &> mariadb_log.txt - docker-compose -f .docker/docker-compose.yml down - rc=`cat .docker/result` - exit $rc diff --git a/CHANGELOG.md b/CHANGELOG.md index d07084cc5df0e360a869cdb946a830761831d743..eda225cc2e280c7ca326fe563ef94c0122684eda 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -27,6 +27,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Automated documentation builds: `make doc` - Crawler documentation - Proof-of-concept integration with Bloxberg. +- Introduce a cfood that can create a Record structure based on the contents of a hdf5 file ### Changed ### diff --git a/Makefile b/Makefile index cbac0ea0a77e5523529ef181d83ffb9738d72faf..7609444bd4fd3a8ce980eca0bc3993b3cf2e168f 100644 --- a/Makefile +++ b/Makefile @@ -21,7 +21,7 @@ # This Makefile is a wrapper for several other scripts. -.PHONY: help doc install +.PHONY: help doc install unittest help: @echo 'Type `make doc` for documentation, or `make install` for (local) installation.' @@ -30,4 +30,7 @@ doc: $(MAKE) -C src/doc html install: - @echo "Not implemented yet, use pip for installation." + pip3 install . 
+ +unittest: + pytest-3 unittests diff --git a/integrationtests/crawl.py b/integrationtests/crawl.py index 65600016ed5dff97d3794b61cf540b9d0505698d..79ed3b5ffe52d276677e2a7914f70923e5c9e70c 100755 --- a/integrationtests/crawl.py +++ b/integrationtests/crawl.py @@ -36,6 +36,8 @@ from caosadvancedtools.scifolder import (AnalysisCFood, ExperimentCFood, PublicationCFood, SimulationCFood, SoftwareCFood) +from example_hdf5cfood import ExampleH5CFood + try: from sss_helper import get_argument_parser, print_success except ModuleNotFoundError: @@ -89,6 +91,7 @@ if __name__ == "__main__": interactive=False, hideKnown=False, cfood_types=[ExperimentCFood, AnalysisCFood, SoftwareCFood, PublicationCFood, SimulationCFood, + ExampleH5CFood ]) if args.authorize_run: diff --git a/integrationtests/example_hdf5cfood.py b/integrationtests/example_hdf5cfood.py new file mode 100644 index 0000000000000000000000000000000000000000..f369f852a294d8819720e74ad4f849082b108653 --- /dev/null +++ b/integrationtests/example_hdf5cfood.py @@ -0,0 +1,54 @@ +#!/usr/bin/env python3 +# encoding: utf-8 +# +# ** header v3.0 +# This file is a part of the CaosDB Project. +# +# Copyright (C) 2021 IndiScale GmbH <www.indiscale.com> +# Copyright (C) 2021 Henrik tom Wörden <h.tomwoerden@indiscale.com> +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <https://www.gnu.org/licenses/>. 
+# +# ** end header +# + +""" +An exemplary definition of a HDF5 CFood for integration testing +""" + +import caosdb as db +from caosadvancedtools.cfoods.h5 import H5CFood +from caosadvancedtools.scifolder import ExperimentCFood +from caosadvancedtools.scifolder.generic_pattern import readme_pattern + + +class ExampleH5CFood(H5CFood): + root_name = "ExampleH5" + + @staticmethod + def get_re(): + return ExperimentCFood.get_re()[:-len(readme_pattern)] + r".*\.hdf5" + + def create_identifiables(self): + super().create_identifiables() + self.identifiable_root = db.Record() + self.identifiable_root.add_property("hdf5File", self.crawled_file) + self.identifiable_root.add_parent("ExampleH5") + self.identifiables.append(self.identifiable_root) + + def special_treatment(self, key, value, dtype): + if key == "attr_data_root": + return "single_attribute", value, dtype + + return key, value, dtype diff --git a/integrationtests/extroot/ExperimentalData/2010_TestProject/2019-02-03/hdf5_dummy_file.hdf5 b/integrationtests/extroot/ExperimentalData/2010_TestProject/2019-02-03/hdf5_dummy_file.hdf5 new file mode 100644 index 0000000000000000000000000000000000000000..41bfb7ab3bcac19d90fd4f018cdd8118ae806eaf Binary files /dev/null and b/integrationtests/extroot/ExperimentalData/2010_TestProject/2019-02-03/hdf5_dummy_file.hdf5 differ diff --git a/integrationtests/insert_model.py b/integrationtests/insert_model.py index 270a08a36d7512a8642c2ca08a9ec6ea93b81bd9..f57ea440d4a7343a5a33c8deeaa8fa79b62d1e8e 100755 --- a/integrationtests/insert_model.py +++ b/integrationtests/insert_model.py @@ -1,5 +1,8 @@ #!/usr/bin/env python3 import caosdb as db +import h5py +from caosadvancedtools.cfoods.h5 import H5CFood +from caosadvancedtools.models.data_model import DataModel from caosadvancedtools.models.parser import parse_model_from_yaml model = parse_model_from_yaml("model.yml") @@ -9,3 +12,11 @@ if len(db.execute_query("FIND Property alias")) == 0: al = db.Property(name="alias") 
al.add_parent(name="name") al.insert() + +h5model = db.Container() +h5file = h5py.File('extroot/ExperimentalData/2010_TestProject/2019-02-03/hdf5_dummy_file.hdf5', 'r') +H5CFood.create_structure(h5file, create_recordTypes=True, collection=h5model) +h5model[0].name = "ExampleH5" +print(h5model) +h5model = DataModel(h5model) +h5model.sync_data_model(noquestion=True) diff --git a/integrationtests/model.yml b/integrationtests/model.yml index 0a4ad381bfc119dd65d2c192f8de823deda525ae..efcfd80e1acddfc99469a778086618dbe3add770 100644 --- a/integrationtests/model.yml +++ b/integrationtests/model.yml @@ -40,6 +40,8 @@ sources: datatype: REFERENCE scripts: datatype: REFERENCE +single_attribute: + datatype: LIST(INTEGER) Simulation: obligatory_properties: date: @@ -66,3 +68,5 @@ Presentation: Report: inherit_from_suggested: - Publication +hdf5File: + datatype: REFERENCE diff --git a/integrationtests/test_crawler_with_cfoods.py b/integrationtests/test_crawler_with_cfoods.py index 7dc19240cbd27f0e6e7cd6cc145c7e12704df0d2..05bb581058a964d76ab78583cc290c348e8c4566 100755 --- a/integrationtests/test_crawler_with_cfoods.py +++ b/integrationtests/test_crawler_with_cfoods.py @@ -26,6 +26,7 @@ import os import unittest import caosdb as db +from caosdb.apiutils import retrieve_entity_with_id def get_entity_with_id(eid): @@ -486,3 +487,17 @@ class CrawlerTest(unittest.TestCase): # Should have a description self.assertIsNotNone(ana.description) + + def test_exampleh5(self): + examp = db.execute_query("FIND Record ExampleH5", unique=True) + + for prop in examp.properties: + if prop.name == 'group_level1_a': + self.assertTrue(retrieve_entity_with_id(prop.value).get_property("group_level2_aa") is not None) + self.assertTrue(retrieve_entity_with_id(prop.value).get_property("group_level1_a") is None) + elif prop.name == 'group_level1_b': + self.assertTrue(retrieve_entity_with_id(prop.value).get_property("level1_b_floats") is not None) + elif prop.name == 'group_level1_c': + 
self.assertTrue(retrieve_entity_with_id(prop.value).get_property("level1_c_floats") is not None) + elif prop.name == 'root_integers': + self.assertTrue(retrieve_entity_with_id(prop.value).get_property("single_attribute") is not None) diff --git a/setup.py b/setup.py index f26b126c2a589554ace736661aa3a685b3f671d3..89b7f10674c4871c8eaedaad0355782d92a09125 100755 --- a/setup.py +++ b/setup.py @@ -157,12 +157,14 @@ def setup_package(): install_requires=["caosdb>=0.4.0", "openpyxl>=3.0.0", "pandas>=1.2.0", + "numpy>=1.17.3", "xlrd>=2.0", + "h5py", ], packages=find_packages('src'), package_dir={'': 'src'}, setup_requires=["pytest-runner>=2.0,<3dev"], - tests_require=["pytest", "pytest-cov", "coverage>=4.4.2"], + tests_require=["pytest", "pytest-pythonpath", "pytest-cov", "coverage>=4.4.2"], ) try: setup(**metadata) diff --git a/src/caosadvancedtools/cache.py b/src/caosadvancedtools/cache.py index 3dac86ec328944303c629c8de721fb1a2f6a7bef..ff807f2aba6210d643e675e7e3dd91d7c3b30906 100644 --- a/src/caosadvancedtools/cache.py +++ b/src/caosadvancedtools/cache.py @@ -32,6 +32,8 @@ from hashlib import sha256 import caosdb as db from lxml import etree +import tempfile + def put_in_container(stuff): if isinstance(stuff, list): @@ -154,7 +156,9 @@ class UpdateCache(Cache): def __init__(self, db_file=None): if db_file is None: - db_file = "/tmp/crawler_update_cache.db" + tmppath = tempfile.gettempdir() + tmpf = os.path.join(tmppath, "crawler_update_cache.db") + db_file = tmpf super().__init__(db_file=db_file) @staticmethod diff --git a/src/caosadvancedtools/cfood.py b/src/caosadvancedtools/cfood.py index 8ce1dced48ba12e62717fe5bd788178e1e5a9488..fcdd2b83769e847bd2f00066ce1442a03b74e0fc 100644 --- a/src/caosadvancedtools/cfood.py +++ b/src/caosadvancedtools/cfood.py @@ -298,7 +298,7 @@ class AbstractFileCFood(AbstractCFood): super().__init__(*args, item=crawled_path, **kwargs) self._crawled_file = None self.crawled_path = crawled_path - self.match = re.match(type(self).get_re(), 
crawled_path) + self.match = re.match(self.get_re(), crawled_path) self.attached_filenames = [] @property @@ -309,7 +309,31 @@ class AbstractFileCFood(AbstractCFood): return self._crawled_file @staticmethod - def get_re(): + def re_from_extensions(extensions): + """Return a regular expression which matches the given file extensions. + + Useful for inheriting classes. + + Parameters + ---------- + extensions : iterable<str> + An iterable with the allowed extensions. + + Returns + ------- + out : str + The regular expression, starting with ``.*\\.`` and ending with the EOL dollar + character. The actual extension will be accessible in the + :py:attribute:`pattern group name<python:re.Pattern.groupindexe>` ``ext``. + """ + + if not extensions: + return None + + return r".*\.(?P<ext>" + "|".join(extensions) + ")$" + + @classmethod + def get_re(cls): """ Returns the regular expression used to identify files that shall be processed @@ -377,6 +401,7 @@ def assure_object_is_in_list(obj, containing_object, property_name, if containing_object.get_property(property_name) is None: containing_object.add_property(property_name, value=[], datatype=datatype) + # TODO: case where multiple times the same property exists is not treated if not isinstance(containing_object.get_property(property_name).value, list): containing_object.get_property(property_name).value = [ @@ -628,7 +653,12 @@ def assure_has_property(entity, name, value, to_be_updated=None, value = value.id for el in possible_properties: - if el.value == value: + tmp_value = el.value + + if isinstance(tmp_value, db.Entity): + tmp_value = el.value.id + + if tmp_value == value: contained = True break diff --git a/src/caosadvancedtools/cfoods/__init__.py b/src/caosadvancedtools/cfoods/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..30ce05add09a223c2f65dbe187a6cfb1768d7a22 --- /dev/null +++ b/src/caosadvancedtools/cfoods/__init__.py @@ -0,0 +1,21 @@ +#!/usr/bin/env python3 + +# This file is a 
part of the CaosDB Project. +# +# Copyright (C) 2020 IndiScale GmbH <www.indiscale.com> +# Copyright (C) 2020 Daniel Hornung <d.hornung@indiscale.com> +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <https://www.gnu.org/licenses/>. + +"""Specialized CFoods.""" diff --git a/src/caosadvancedtools/cfoods/h5.py b/src/caosadvancedtools/cfoods/h5.py new file mode 100644 index 0000000000000000000000000000000000000000..0e56da71d14d0ce643caab16a1846a36e5917c06 --- /dev/null +++ b/src/caosadvancedtools/cfoods/h5.py @@ -0,0 +1,285 @@ +#!/usr/bin/env python3 + +# This file is a part of the CaosDB Project. +# +# Copyright (C) 2020,2021 IndiScale GmbH <www.indiscale.com> +# Copyright (C) 2020 Daniel Hornung <d.hornung@indiscale.com> +# Copyright (C) 2021 Henrik tom Wörden <h.tomwoerden@indiscale.com> +# Copyright (C) 2021 Alexander Kreft +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. 
+# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <https://www.gnu.org/licenses/>. + +"""A CFood for hdf5 files + + +This module allows to parse hdf5 files and reproduce their structure in form +of Records that reference each other. + +hdf5 files are composed of groups and datasets. Both of which can have +attributes. Groups and datasets are mapped to Records and attributes to +Properties. +""" + +import re +from copy import deepcopy + +import caosdb as db +import h5py +import numpy as np +from caosadvancedtools.cfood import fileguide +from caosdb.common.datatype import is_reference +from caosdb.common.utils import uuid + +from ..cfood import (AbstractFileCFood, assure_has_description, + assure_has_parent, assure_has_property, + assure_property_is) +from ..structure_mapping import (EntityMapping, collect_existing_structure, + update_structure) + + +def h5_attr_to_property(val): + """ returns the value and datatype of a CaosDB Property for the given value + + + 1d arrays are converted to lists + If no suitable Property can be created (None, None) is returned. + + 2d and higher dimensionality arrays are being ignored. 
+ """ + + if isinstance(val, str): + return val, db.TEXT + elif isinstance(val, complex): + return val, db.TEXT + else: + if not hasattr(val, 'dtype'): + raise NotImplementedError("Code assumes only str are missing the" + "dtype attribute") + + if issubclass(val.dtype.type, np.floating): + dtype = db.DOUBLE + elif issubclass(val.dtype.type, np.integer): + dtype = db.INTEGER + elif val.dtype.kind in ['S', 'U']: + dtype = db.TEXT + val = val.astype(str) + elif val.dtype.kind == 'O': + if not np.all([isinstance(el, str) for el in val]): + raise NotImplementedError("Cannot convert arbitrary objects") + dtype = db.TEXT + val = val.astype(str) + else: + raise NotImplementedError("Unknown dtype used") + + if isinstance(val, np.ndarray): + if val.ndim > 1: + return None, None + + if val.ndim == 0: + raise NotImplementedError( + "Code assumes that scalar values " + "will not be given as np.ndarray objects") + val = list(val) + dtype = db.LIST(dtype) + + # TODO this can eventually be removed + + if(hasattr(val, 'ndim')): + if not isinstance(val, np.ndarray) and val.ndim != 0: + print(val, val.ndim) + raise Exception( + "Implementation assumes that only np.arrays have ndim.") + + return val, dtype + + +class H5CFood(AbstractFileCFood): + """ H5CFood which consumes a HDF5 file. + + The structure is mapped onto an equivalent structure of interconnected + Records. 
+ + Attributes + ---------- + root_name : str, default "root" + Type of the root Record (the Record corresponding to the root node in + the HDF5 file) + h5file : h5py.File, default None + Name of the hdf5-file to read + """ + + # to be overwritten by subclasses + root_name = "root" + + def __init__(self, *args, **kwargs): + """CFood which consumes HDF5 files.""" + super().__init__(*args, **kwargs) + self.h5file = None + self.hdf5Container = db.Container() + self.em = EntityMapping() + + def collect_information(self): + self.h5file = h5py.File(fileguide.access(self.crawled_path), 'r') + + @staticmethod + def get_re(): + """Return a regular expression string to match *.h5, *.nc, *.hdf, *.hdf5.""" + extensions = [ + "h5", + "nc", + "hdf", + "hdf5", + ] + + return AbstractFileCFood.re_from_extensions(extensions) + + def create_identifiables(self): + """Create identifiables out of groups in the HDF5 file. + + This method will call is_identifiable(h5path, h5object) and create_identifiable(h5path, + h5object) on each HDF5 object to decide and actually create the identifiables. + """ + # manually create the identifiable root element: self.identifiable_root + self.structure = self.create_structure(self.h5file, + special_treatment=self.special_treatment) + + def update_identifiables(self): + """Check if the identifiables need to be updated. + + In that case also add the updated entities to the list of updateables. + + This method will iterate over the groups and datasets governed by this CFood's identifiables + and call ``update_object(path, h5object)`` on each object. + + """ + + self.structure._cuid = "root element" + self.em.add(self.structure, self.identifiable_root) + collect_existing_structure(self.structure, self.identifiable_root, + self.em) + self.to_be_inserted = db.Container() + self.insert_missing_structure(self.structure) + + # TODO this is a workaround due to the fact that the caosdb library + # changes the objects in the Container if it is inserted. 
The graph + # structure is flattened. I.e. references to other entity objects are + # replaced with their IDs. However this code depends on this graph. + tmp_copy = deepcopy(self.to_be_inserted) + tmp_copy.insert() + + for e1, e2 in zip(tmp_copy, self.to_be_inserted): + e2.id = e1.id + # End workaround + + # self.update_structure(self.structure) + update_structure(self.em, self.to_be_updated, self.structure) + + def special_treatment(self, key, value, dtype): + """define special treatment of attributes + + to be overwritten by child classes. + + key: attribute name + value: attribute value + """ + + return key, value, dtype + + @classmethod + def create_structure(cls, h5obj, create_recordTypes=False, collection=None, + special_treatment=None): + """Create Records and Record types from a given hdf5-object for all + items in the tree. Attributes are added as properties, the + values only if the dimension < 2. + + Parameters + ---------- + h5obj : h5py.File + a hdf5-file object + + Returns + ------- + rec : db.Container + Contains the Record Types, Records and Properties for the + input-tree + + """ + + if collection is None: + collection = [] + + if special_treatment is None: + def special_treatment(x, y, z): return x, y, z + + if h5obj.name == "/": + name_without_path = cls.root_name + else: + name_without_path = h5obj.name.split("/")[-1] + + if create_recordTypes: + rec = db.RecordType(name=name_without_path) + else: + rec = db.Record().add_parent(name=name_without_path) + collection.append(rec) + + if isinstance(h5obj, h5py.Group): + for subgroup in h5obj.keys(): + subgroup_name = h5obj[subgroup].name.split("/")[-1] + + sub = H5CFood.create_structure(h5obj[subgroup], + create_recordTypes=create_recordTypes, + collection=collection, + special_treatment=special_treatment) + + if create_recordTypes: + rec.add_property(subgroup_name) + else: + rec.add_property(subgroup_name, value=sub) + + for key, val in h5obj.attrs.items(): + # ignored + + if key in 
["REFERENCE_LIST", "DIMENSION_LIST", "NAME", "CLASS"]: + continue + + val, dtype = h5_attr_to_property(val) + + if val is None and dtype is None: + continue + + if create_recordTypes and key.lower() not in ['description']: + treated_k, _, treated_dtype = special_treatment( + key, val, dtype) + + if treated_k is not None: + prop = db.Property(name=treated_k, datatype=treated_dtype) + collection.append(prop) + rec.add_property(name=treated_k) + else: + treated_k, treated_v, treated_dtype = special_treatment( + key, val, dtype) + + if treated_k is not None: + rec.add_property(name=treated_k, value=treated_v, + datatype=treated_dtype) + + return rec + + def insert_missing_structure(self, target_structure: db.Record): + if target_structure._cuid not in self.em.to_existing: + self.to_be_inserted.append(target_structure) + + for prop in target_structure.get_properties(): + if prop.is_reference(server_retrieval=True): + self.insert_missing_structure(prop.value) diff --git a/src/caosadvancedtools/example_cfood.py b/src/caosadvancedtools/example_cfood.py index 6111d95defc37bbb6d836feec3fa3d2e4e3d91ab..2e395d5c3030508087e25a7156d35c8954d223d7 100644 --- a/src/caosadvancedtools/example_cfood.py +++ b/src/caosadvancedtools/example_cfood.py @@ -26,8 +26,8 @@ from .cfood import AbstractFileCFood, assure_has_property class ExampleCFood(AbstractFileCFood): - @staticmethod - def get_re(): + @classmethod + def get_re(cls): return (r".*/(?P<species>[^/]+)/" r"(?P<date>\d{4}-\d{2}-\d{2})/README.md") diff --git a/src/caosadvancedtools/structure_mapping.py b/src/caosadvancedtools/structure_mapping.py new file mode 100644 index 0000000000000000000000000000000000000000..50e57ac4d84f2034fbdb6da6c7159f450a993c3a --- /dev/null +++ b/src/caosadvancedtools/structure_mapping.py @@ -0,0 +1,144 @@ +#!/usr/bin/env python3 + +# This file is a part of the CaosDB Project. 
+# +# Copyright (C) 2021 IndiScale GmbH <www.indiscale.com> +# Copyright (C) 2021 Henrik tom Wörden <h.tomwoerden@indiscale.com> +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <https://www.gnu.org/licenses/>. + +import caosdb as db +from caosdb.apiutils import resolve_reference +from caosdb.common.utils import uuid + +from .cfood import (assure_has_description, assure_has_parent, + assure_property_is) + + +class EntityMapping(object): + """ + map local entities to entities on the server + + the dict to_existing maps _cuid property to entity objects + the dict to_target maps id property to entity objects + """ + + def __init__(self): + self.to_existing = {} + self.to_target = {} + + def add(self, target, existing): + if target._cuid is None: + target._cuid = str(uuid()) + self.to_existing[str(target._cuid)] = existing + self.to_target[existing.id] = target + + +def collect_existing_structure(target_structure, existing_root, em): + """ recursively collects existing entities + + The collected entities are those that correspond to the ones in + target_structure. 
+ + + em: EntityMapping + """ + + for prop in target_structure.properties: + if prop.value is None: + continue + + if not prop.is_reference(server_retrieval=True): + continue + + if (len([p for p in target_structure.properties if p.name == prop.name]) + != 1): + raise ValueError("Current implementation allows only one property " + "for each property name") + + if (existing_root.get_property(prop.name) is not None and + existing_root.get_property(prop.name).value is not None): + resolve_reference(prop) + + resolve_reference(existing_root.get_property(prop.name)) + referenced = existing_root.get_property(prop.name).value + + if not isinstance(referenced, list): + referenced = [referenced] + target_value = prop.value + + if not isinstance(target_value, list): + target_value = [target_value] + + if len(target_value) != len(referenced): + raise ValueError() + + for tent, eent in zip(target_value, referenced): + em.add(tent, eent) + collect_existing_structure(tent, eent, em) + + +def update_structure(em, updating: db.Container, target_structure: db.Record): + """compare the existing records with the target record tree created + from the h5 object + + Parameters + ---------- + + existing_structure + retrieved entity; e.g. the top level identifiable + + target_structure : db.Record + A record which may have references to other records. Must be a DAG. 
+ """ + + if target_structure._cuid in em.to_existing: + update_matched_entity(em, + updating, + target_structure, + em.to_existing[target_structure._cuid]) + + for prop in target_structure.get_properties(): + if prop.is_reference(server_retrieval=True): + update_structure(em, updating, prop.value) + + +def update_matched_entity(em, updating, target_record, existing_record): + """ + update the Record existing in the server according to the Record + supplied as target_record + """ + + for parent in target_record.get_parents(): + if parent.name == "": + raise ValueError("Parent name must not be empty.") + assure_has_parent(existing_record, parent.name, force=True) + + if target_record.description is not None: + # check whether description is equal + assure_has_description(existing_record, target_record.description, + to_be_updated=updating) + + for prop in target_record.get_properties(): + # check for remaining property types + + if isinstance(prop.value, db.Entity): + if prop.value._cuid in em.to_existing: + value = em.to_existing[prop.value._cuid].id + else: + value = prop.value.id + else: + value = prop.value + assure_property_is(existing_record, prop.name, value, + to_be_updated=updating) diff --git a/src/caosadvancedtools/suppressKnown.py b/src/caosadvancedtools/suppressKnown.py index c15f0e06fa7d126937497aeb877dd5d2991b6ff7..c4b57039c5184f2443e4dbb91cf11f5e59ae6790 100644 --- a/src/caosadvancedtools/suppressKnown.py +++ b/src/caosadvancedtools/suppressKnown.py @@ -5,6 +5,8 @@ import os import sqlite3 from hashlib import sha256 +import tempfile + class SuppressKnown(logging.Filter): """ @@ -26,8 +28,9 @@ class SuppressKnown(logging.Filter): if db_file: self.db_file = db_file else: - self.db_file = "/tmp/caosadvanced_suppressed_cache.db" - + tmppath = tempfile.gettempdir() + tmpf = os.path.join(tmppath, "caosadvanced_suppressed_cache.db") + self.db_file = tmpf if not os.path.exists(self.db_file): self.create_cache() diff --git a/tox.ini b/tox.ini index 
3d7f652203ed0caf9cdfaebbb159784e6f9b2835..d41e9930870390ea52f447bc91fbcff3c4e32a0f 100644 --- a/tox.ini +++ b/tox.ini @@ -4,7 +4,7 @@ skip_missing_interpreters = true [testenv] deps=nose pandas - caosdb + git+https://gitlab.indiscale.com/caosdb/src/caosdb-pylib.git@dev pytest pytest-cov openpyxl diff --git a/unittests/create_dummy_hdf5file.py b/unittests/create_dummy_hdf5file.py new file mode 100644 index 0000000000000000000000000000000000000000..ce04030154c70e1d533f67aeec12321b86ddf305 --- /dev/null +++ b/unittests/create_dummy_hdf5file.py @@ -0,0 +1,70 @@ +import h5py +import numpy as np + + +def create_hdf5_file(filename="hdf5_dummy_file.hdf5"): + ''' + Create a dummy hdf5-file for testing. + Structure: + + root:-->root + group_level1_a:-->group + group_level2_aa:-->group + group_level3_aaa:-->group + level3_aaa_floats_2d = float64(100x100) + group_level3_aab:-->group + group_level2_ab:-->group + group_level3_aba:-->group + level3_aba_floats_2d = float64(100x100) + group_level2_ac:-->group + level2_ac_integers_2d = int32(100x100) + group_level1_b:-->group + group_level2_ba:-->group + level2_ba_integers_2d = int32(100x100) + level1_b_floats = float64(10000) + group_level1_c:-->group + level1_c_floats = float64(10000) + root_integers = int32(10000) + ''' + + with h5py.File(filename, mode="w") as hdf5: + '''Create toplevel groups''' + group_lvl1_a = hdf5.create_group("group_level1_a") + group_lvl1_b = hdf5.create_group("group_level1_b") + group_lvl1_c = hdf5.create_group("group_level1_c") + + '''Create level 2 groups''' + group_lvl2_aa = group_lvl1_a.create_group("group_level2_aa") + group_lvl2_ab = group_lvl1_a.create_group("group_level2_ab") + group_lvl2_ac = group_lvl1_a.create_group("group_level2_ac") + group_lvl2_ba = group_lvl1_b.create_group("group_level2_ba") + + '''Create level 3 groups''' + group_lvl3_aaa = group_lvl2_aa.create_group("group_level3_aaa") + group_lvl3_aab = group_lvl2_aa.create_group("group_level3_aab") + group_lvl3_aba = 
group_lvl2_ab.create_group("group_level3_aba") + + '''Create datasets''' + integers = np.arange(10000) + floats = np.arange(0, 1000, 0.1) + integers_2d = np.diag(np.arange(100)) + floats_2d = np.eye(100) + data_root = hdf5.create_dataset("root_integers", data=integers) + data_lvl1_b = group_lvl1_b.create_dataset("level1_b_floats", data=floats) + data_lvl2_c = group_lvl1_c.create_dataset("level1_c_floats", data=floats) + data_lvl2_ac = group_lvl2_ac.create_dataset("level2_ac_integers_2d", data=integers_2d) + data_lvl2_ba = group_lvl2_ba.create_dataset("level2_ba_integers_2d", data=integers_2d) + data_lvl3_aaa = group_lvl3_aaa.create_dataset("level3_aaa_floats_2d", data=floats_2d) + data_lvl3_aba = group_lvl3_aba.create_dataset("level3_aba_floats_2d", data=floats_2d) + + '''Create attributes''' + attr_group_lvl1_a = group_lvl1_a.attrs.create("attr_group_lvl1_a", 1) + attr_group_lvl2_aa = group_lvl2_aa.attrs.create("attr_group_lvl2_aa", -2) + attr_group_lvl3_aaa = group_lvl3_aaa.attrs.create("attr_group_lvl3_aaa", 1.0) + attr_data_root = data_root.attrs.create("attr_data_root", -2.0) + attr_data_lvl2_ac = data_lvl2_ac.attrs.create("attr_data_lvl2_ac", np.diag(np.arange(10))) + attr_data_lvl3_aaa = data_lvl3_aaa.attrs.create("attr_data_lvl3_aaa", np.eye(10)) + + +if __name__ == "__main__": + create_hdf5_file() diff --git a/unittests/hdf5_dummy_file.hdf5 b/unittests/hdf5_dummy_file.hdf5 new file mode 100644 index 0000000000000000000000000000000000000000..41bfb7ab3bcac19d90fd4f018cdd8118ae806eaf Binary files /dev/null and b/unittests/hdf5_dummy_file.hdf5 differ diff --git a/unittests/test_cfood.py b/unittests/test_cfood.py index 1bad508a2c22cf1ee1e29be11c3342d2115dd5a2..ab5cb11e9dc89faf26527d72e64459cae73b1d88 100644 --- a/unittests/test_cfood.py +++ b/unittests/test_cfood.py @@ -112,6 +112,36 @@ class CFoodReTest(unittest.TestCase): self.assertTrue(SimpleCFood.match_item("hallo")) self.assertFalse(SimpleCFood.match_item("allo")) + def test_extensions(self): + """Test 
the RE generation.""" + empty_extensions = [] + extensions = ["foo", "bar"] + + self.assertIsNone(AbstractFileCFood.re_from_extensions(empty_extensions)) + self.assertIsNotNone(SimpleCFood.re_from_extensions(extensions)) + + class ExtCFood(AbstractFileCFood): + + @staticmethod + def get_re(): + return AbstractFileCFood.re_from_extensions(extensions) + create_identifiables = None + update_identifiables = None + + # test which paths are matched + print(ExtCFood.re_from_extensions(extensions)) + self.assertTrue(ExtCFood.match_item("hello/world.foo")) + self.assertTrue(ExtCFood.match_item("hello/world.bar")) + self.assertFalse(ExtCFood.match_item("hello/world.baz")) + self.assertFalse(ExtCFood.match_item("hello/world.foo ")) # Mind the space. + self.assertFalse(ExtCFood.match_item("hello/world.foobar")) + self.assertFalse(ExtCFood.match_item("hello/world.foo|bar")) + self.assertFalse(ExtCFood.match_item("hello/world.fobar")) + self.assertFalse(ExtCFood.match_item("hello/world.fooar")) + + # Test stored extension + self.assertEqual(ExtCFood("hello/world.foo").match["ext"], "foo") + class InsertionTest(unittest.TestCase): def test_contained_in_list(self): diff --git a/unittests/test_h5.py b/unittests/test_h5.py new file mode 100644 index 0000000000000000000000000000000000000000..12b04844e173ac2f778b34daafcd876fdf527a49 --- /dev/null +++ b/unittests/test_h5.py @@ -0,0 +1,184 @@ +import unittest +from tempfile import NamedTemporaryFile + +import caosdb as db +import caosdb.apiutils +import h5py +import numpy as np +from caosadvancedtools.cfoods import h5 +from caosadvancedtools.cfoods.h5 import h5_attr_to_property + +from create_dummy_hdf5file import create_hdf5_file + +ENTS = { + 101: db.Record(id=101), + 102: db.Record(id=102), + 103: db.Record(id=103).add_property("test", value=101, + datatype=db.REFERENCE), +} + + +def dummy_get(eid): + return ENTS[eid] + + +class H5CFoodTest(unittest.TestCase): + def setUp(self): + self.h5file = NamedTemporaryFile(delete=False, 
suffix=".h5") + self.h5file.close() + create_hdf5_file(self.h5file.name) + self.h5obj = h5py.File(self.h5file.name, mode="a") + + def test_create_record_records(self): + result = h5.H5CFood.create_structure(self.h5obj) + + record_list = [] + parents = ['group_level1_a', 'group_level1_b', 'group_level1_c', 'root_integers'] + + for i in parents: + record_list.append(db.Record().add_parent(name=i)) + + found_parents = [] + + for ent in [p.value for p in result.properties]: + if ent.parents[0].name == 'group_level1_a': + found_parents.append('group_level1_a') + self.assertTrue(ent.get_property("group_level2_aa") is not None) + self.assertTrue(ent.get_property("group_level1_a") is None) + elif ent.parents[0].name == 'group_level1_b': + found_parents.append('group_level1_b') + pass + elif ent.parents[0].name == 'group_level1_c': + found_parents.append('group_level1_c') + pass + elif ent.parents[0].name == 'root_integers': + found_parents.append('root_integers') + pass + + for p in parents: + self.assertTrue(p in found_parents) + + for i in range(len(result.properties)): + for j in result.properties[i].value.get_parents(): + for k in record_list[i].get_parents(): + self.assertEqual(j.name, k.name) + + result1 = h5.H5CFood.create_structure(self.h5obj["group_level1_a"]) + + for i in result1.get_parents(): + self.assertEqual(i.name, "group_level1_a") + + result2 = h5.H5CFood.create_structure(self.h5obj["group_level1_a/group_level2_aa"]) + + for i in result2.get_parents(): + self.assertEqual(i.name, "group_level2_aa") + + def test_collect_existing_structure(self): + real_retrieve = caosdb.apiutils.retrieve_entity_with_id + caosdb.apiutils.retrieve_entity_with_id = dummy_get + + # should run without problem + h5.collect_existing_structure(db.Record(), db.Record(id=234), h5.EntityMapping()) + + # test with retrieval: both Records have one test Property with one + # value -> The referenced Entities are matched + r_exist = db.Record(id=234) + r_exist.add_property("test", 
value=101, datatype=db.REFERENCE) + r_target = db.Record() + r_child = db.Record() + r_target.add_property("test", value=r_child, datatype=db.REFERENCE) + em = h5.EntityMapping() + h5.collect_existing_structure(r_target, r_exist, em) + self.assertTrue(em.to_existing[r_child._cuid] is ENTS[101]) + self.assertTrue(em.to_target[101] is r_child) + + # test with retrieval: the existing Record has another Property + # -> The referenced Entities are matched + r_exist = db.Record(id=234) + r_exist.add_property("test_other", value=101, datatype=db.REFERENCE) + r_target = db.Record() + r_child = db.Record() + r_target.add_property("test", value=r_child, datatype=db.REFERENCE) + em = h5.EntityMapping() + h5.collect_existing_structure(r_target, r_exist, em) + self.assertEqual(em.to_existing, {}) + self.assertEqual(em.to_target, {}) + + # test with retrieval: both Records have one test Property; the + # existing is missing the value -> The referenced Entities are matched + r_exist = db.Record(id=234) + r_exist.add_property("test", value=None, datatype=db.REFERENCE) + r_target = db.Record() + r_child = db.Record() + r_target.add_property("test", value=r_child, datatype=db.REFERENCE) + em = h5.EntityMapping() + h5.collect_existing_structure(r_target, r_exist, em) + self.assertEqual(em.to_existing, {}) + self.assertEqual(em.to_target, {}) + + # test with retrieval: both Records have one test Property with + # multiple values -> The referenced Entities are matched + r_exist = db.Record(id=234) + r_exist.add_property("test", value=[101, 102], datatype=db.LIST(db.REFERENCE)) + r_target = db.Record() + r_child = db.Record() + r_child2 = db.Record() + r_target.add_property("test", value=[r_child, r_child2], + datatype=db.LIST(db.REFERENCE)) + em = h5.EntityMapping() + h5.collect_existing_structure(r_target, r_exist, em) + self.assertEqual(em.to_existing[r_child._cuid], ENTS[101]) + self.assertEqual(em.to_existing[r_child2._cuid], ENTS[102]) + self.assertEqual(em.to_target[101], 
r_child) + self.assertEqual(em.to_target[102], r_child2) + + # test with retrieval: both Records have one test Property with one + # value; Add another recursion level -> The referenced Entities are matched + r_exist = db.Record(id=234) + r_exist.add_property("test", value=103, datatype=db.REFERENCE) + r_target = db.Record() + r_child = db.Record() + r_child2 = db.Record() + r_target.add_property("test", value=r_child, datatype=db.REFERENCE) + r_child.add_property("test", value=r_child2, datatype=db.REFERENCE) + em = h5.EntityMapping() + h5.collect_existing_structure(r_target, r_exist, em) + self.assertEqual(em.to_existing[r_child._cuid], ENTS[103]) + self.assertEqual(em.to_target[103], r_child) + self.assertEqual(em.to_existing[r_child2._cuid], ENTS[101]) + self.assertEqual(em.to_target[101], r_child2) + + caosdb.apiutils.retrieve_entity_with_id = real_retrieve + + def test_h5_attr_to_property(self): + + test_int: int = 1 + test_integer = np.int_(1) + test_float = np.float_(1.0) + test_str = "Test" + test_complex: complex = 2+3j + self.assertRaises(NotImplementedError, h5_attr_to_property, test_int) # only numpy-integers processed? 
+ self.assertTupleEqual((1, db.INTEGER), h5_attr_to_property(test_integer)) + self.assertTupleEqual((1.0, db.DOUBLE), h5_attr_to_property(test_float)) + self.assertTupleEqual(("Test", db.TEXT), h5_attr_to_property(test_str)) + self.assertTupleEqual((2+3j, db.TEXT), h5_attr_to_property(test_complex)) + # strings are often represented using a binary format + self.assertTupleEqual(("yeti", db.TEXT), h5_attr_to_property( + np.array(["yeti"], dtype=h5py.string_dtype(r'utf-8', 8))[0])) + + test_integer_1d = np.arange(10) + test_float_1d = np.arange(0, 1, 0.1) + test_str_1d = np.array(["a", "b", "c"]) + self.assertTrue((np.arange(10) == h5_attr_to_property(test_integer_1d)[0]).all()) + self.assertTrue(db.LIST(db.INTEGER) == h5_attr_to_property(test_integer_1d)[1]) + self.assertTrue((np.arange(0, 1, 0.1) == h5_attr_to_property(test_float_1d)[0]).all()) + self.assertTrue(db.LIST(db.DOUBLE) == h5_attr_to_property(test_float_1d)[1]) + self.assertTrue((np.array(["a", "b", "c"]) == h5_attr_to_property(test_str_1d)[0]).all()) + self.assertTrue(db.LIST(db.TEXT) == h5_attr_to_property(test_str_1d)[1]) + + test_integers_2d = np.diag(np.arange(100)) + test_floats_2d = np.eye(100) + self.assertTupleEqual((None, None), h5_attr_to_property(test_integers_2d)) + self.assertTupleEqual((None, None), h5_attr_to_property(test_floats_2d)) + + self.assertRaises(NotImplementedError, h5_attr_to_property, np.array(1)) diff --git a/unittests/test_structure_mapping.py b/unittests/test_structure_mapping.py new file mode 100644 index 0000000000000000000000000000000000000000..5cc4114fc7f92c580f53dd8855bda659082e2b46 --- /dev/null +++ b/unittests/test_structure_mapping.py @@ -0,0 +1,135 @@ +#!/usr/bin/env python3 + +# This file is a part of the CaosDB Project. 
class structureMappingTest(unittest.TestCase):
    def test_Entitymapping(self):
        """A target/existing pair added to an EntityMapping is retrievable
        through both lookup directions (by cuid and by id)."""
        ex = db.Record(id=100)  # existing Record
        tar = db.Record()  # target Record
        em = EntityMapping()
        em.add(tar, ex)

        for key, val in em.to_existing.items():
            self.assertEqual(key, tar._cuid)
            self.assertEqual(val, ex)

        for key, val in em.to_target.items():
            self.assertEqual(key, ex.id)
            self.assertEqual(val, tar)

    def test_collect_existing_structure(self):
        """collect_existing_structure pairs referenced Records of a target
        structure with their counterparts in an existing structure."""
        emap = EntityMapping()
        reca1 = db.Record(name="Animals", id=100)
        reca2 = db.Record(name="Dogs", id=200)
        reca3 = db.Record(name="Husky", id=300)
        reca1.add_property(id=101, name="Cute Animals", datatype=db.REFERENCE, value=reca2)
        reca2.add_property(id=201, name="Cute Dogs", datatype=db.REFERENCE, value=reca3)

        recb1 = db.Record(name="Animals")
        recb2 = db.Record(name="Dogs")
        recb3 = db.Record(name="Husky")
        recb1.add_property(name="Cute Animals", datatype=db.REFERENCE, value=recb2)
        recb2.add_property(name="Cute Dogs", datatype=db.REFERENCE, value=recb3)

        collect_existing_structure(recb1, reca1, emap)

        # The two dicts of the entity mapping must be inverses of each
        # other.
        for i in emap.to_existing.keys():
            self.assertEqual(i, emap.to_target[emap.to_existing[i].id]._cuid)

        for j in emap.to_target.keys():
            self.assertEqual(j, emap.to_existing[emap.to_target[j]._cuid].id)

        # Only the referenced Records end up in the dicts; the root
        # Records of both structures do not.
        self.assertTrue((reca2 in emap.to_existing.values()) and
                        (reca3 in emap.to_existing.values()) and
                        (reca1 not in emap.to_existing.values()))
        self.assertTrue((recb2 in emap.to_target.values()) and
                        (recb3 in emap.to_target.values()) and
                        (recb1 not in emap.to_target.values()))

        # Correct pairing of target and existing Records.
        self.assertTrue(reca2 is emap.to_existing[recb2._cuid])
        self.assertTrue(reca3 is emap.to_existing[recb3._cuid])

        self.assertTrue(recb2 is emap.to_target[reca2.id])
        self.assertTrue(recb3 is emap.to_target[reca3.id])

        # Second scenario: the target carries one additional Property and
        # both structures carry Properties that are not Records.
        emap2 = EntityMapping()
        recc1 = db.Record(name="Transportation", id=100)
        recc2 = db.Record(name="Cars", id=200)
        recc3 = db.Record(name="Volvo", id=300)
        recc1.add_property(id=101, name="Type", datatype=db.REFERENCE, value=recc2)
        recc2.add_property(id=201, name="Brand", datatype=db.REFERENCE, value=recc3)
        # other datatypes
        recc3.add_property(id=301, name="max_speed", value=200.2, datatype=db.DOUBLE)
        recc3.add_property(id=302, name="doors", value=3, datatype=db.INTEGER)

        recd1 = db.Record(name="Transportation")
        recd2 = db.Record(name="Cars")
        recd3 = db.Record(name="Volvo")
        recd4 = db.Record(name="VW")
        recd1.add_property(name="Type", datatype=db.REFERENCE, value=recd2)
        recd2.add_property(name="Brand", datatype=db.REFERENCE, value=recd3)
        # additional Property
        recd2.add_property(name="Another Brand", datatype=db.REFERENCE, value=recd4)
        # other datatypes
        recd3.add_property(name="max_speed", value=200.2, datatype=db.DOUBLE)
        recd3.add_property(name="doors", value=3, datatype=db.INTEGER)
        recd4.add_property(name="max_speed", value=210.4, datatype=db.DOUBLE)
        recd4.add_property(name="doors", value=5, datatype=db.INTEGER)
        recd4.add_property(name="Warp engine", value=None)

        collect_existing_structure(recd1, recc1, emap2)

        # Correct pairing of target and existing Records.
        self.assertTrue(recc2 is emap2.to_existing[recd2._cuid])
        self.assertTrue(recc3 is emap2.to_existing[recd3._cuid])

        self.assertTrue(recd2 is emap2.to_target[recc2.id])
        self.assertTrue(recd3 is emap2.to_target[recc3.id])

        # The target Record "Cars" (recd2) was built with one additional
        # Property; collect_existing_structure must neither add nor remove
        # Properties on either side.
        self.assertEqual(len(recc2.get_properties()), 1)  # existing side keeps its single property
        self.assertEqual(len(recd2.get_properties()), 2)  # target side keeps both properties

        # zip stops at the shorter list, so only the shared "Brand"
        # property pair is compared here.
        for prop_record, prop_em in zip(recc2.get_properties(), recd2.get_properties()):
            self.assertTrue(prop_record.value is emap2.to_existing[prop_em.value._cuid])

        # "Volvo" carries only non-reference Properties; both sides keep
        # their property count.
        self.assertEqual(len(recc3.get_properties()), 2)
        self.assertEqual(len(recd3.get_properties()), 2)

        # Properties that are not References (max_speed, doors) must not
        # show up in the entity map: every mapped value is a Record.
        for rec_existing, rec_target in zip(emap2.to_existing.values(), emap2.to_target.values()):
            self.assertTrue(isinstance(rec_existing, db.Record))
            self.assertTrue(isinstance(rec_target, db.Record))