Merge branch 'dev' into resulttable

db13ac4e · Henrik tom Wörden · 9e2c6b31 · d5b3e46b · db13ac4e · db13ac4e
Commit db13ac4e authored Oct 11, 2021 by Henrik tom Wörden
--- a/.docker/Dockerfile
+++ b/.docker/Dockerfile
@@ -2,6 +2,8 @@ FROM debian:10
 RUN apt-get update && \
    apt-get install \
    curl \
+    libhdf5-dev \
+    pkgconf \
    python3 \
    python3-pip \
    python3-requests \
@@ -27,6 +29,6 @@ RUN pip3 install recommonmark sphinx-rtd-theme
 COPY . /git
 RUN rm -r /git/.git \
    && mv /git/.docker/pycaosdb.ini /git/integrationtests
-RUN cd /git && pip3 install .
+RUN cd /git && pip3 install .[h5-crawler]
 WORKDIR /git/integrationtests
-CMD /wait-for-it.sh caosdb-server:10443 -t 500 -- ./test.sh
+CMD /wait-for-it.sh caosdb-server:10443 -t 500 -- ./test.sh --force
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -58,8 +58,8 @@ test:
      - cd .docker 
      - /bin/sh ./run.sh
      - cd .. 
-      - docker logs docker_caosdb-server_1 &> ../caosdb_log.txt
+      - docker logs docker_caosdb-server_1 &> caosdb_log.txt
-      - docker logs docker_sqldb_1 &> ../mariadb_log.txt
+      - docker logs docker_sqldb_1 &> mariadb_log.txt
      - docker-compose -f .docker/docker-compose.yml down
      - rc=`cat .docker/result`  
      - exit $rc

--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -28,9 +28,14 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - Automated documentation builds: `make doc`
 - Crawler documentation
 - Proof-of-concept integration with Bloxberg.
+- Introduce a cfood that can create a Record structure based on the contents of an HDF5 file.
+  h5py is now an optional dependency.
 ### Changed ###
+- identifiables of single CFoods are now treated one after the other. This 
+  allows them to have dependencies among each other if they are ordered 
+  correctly
 - identifiables must have at least one property or a name
 * `caosadvancedtools.serverside.helper.init_data_model` also checks the role
  and data type of entities.
@@ -61,6 +66,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
  cause an `sqlite3.IntegrityError` if more than one change was cached
  for the same entity.
 * #40 Insertion of identifiables with missing obligatory properties
+- Before, a Property with the datatype "LIST(TEXT)" would lead to the creation 
+  of a RecordType. This is fixed now.
+* #52 `XLSimporter.read_xls` raised the wrong error when reading from a file with a wrong file extension.
+  Now, a `DataInconsistencyError` is raised instead of a `ValueError`.
 ### Security ###

--- a/Makefile
+++ b/Makefile
@@ -21,7 +21,7 @@
 # This Makefile is a wrapper for several other scripts.
-.PHONY: help doc install
+.PHONY: help doc install unittest
 help:
 	@echo 'Type `make doc` for documentation, or `make install` for (local) installation.'
@@ -30,4 +30,7 @@ doc:
 	$(MAKE) -C src/doc html
 install:
-	@echo "Not implemented yet, use pip for installation."
+	pip3 install .
+unittest:
+	pytest-3 unittests
--- a/README_SETUP.md
+++ b/README_SETUP.md
@@ -12,6 +12,11 @@ Dependencies will be installed automatically if you use the below described proc
 - `caosdb>=0.4.0`                                      
 - `openpyxl>=3.0.0`
 - `xlrd>=1.2.0`
+- `pandas>=1.2.0`
+- `numpy>=1.17.3`
+If you want to use the optional h5-crawler the following dependencies will be installed additionally:
+- `h5py>=3.3.0`
 For testing:
 - `tox`
@@ -21,6 +26,9 @@ For testing:
 - `pip install . --user`
 - `pip install tox --user`
+Optional h5-crawler:
+- `pip install .[h5-crawler] --user`
 ## Run Unit Tests
 `tox`
@@ -31,9 +39,11 @@ For testing:
   extroot. E.g. `sudo mount -o bind extroot
   ../../caosdb-deploy/profiles/empty/paths/extroot` (or whatever path
   the extroot of the empty profile to be used is located at).
-3. Start an empty (!) CaosDB instance (with the mounted extroot). The
+3. Start (or restart) an empty (!) CaosDB instance (with the mounted extroot).
-   database will be cleared during testing, so it's important to use
+   The database will be cleared during testing, so it's important to use
   an empty instance.
+   Make sure your configuration for the python caosdb module is correct and
+   allows you to connect to the server.
 4. Run `test.sh`.  Note that this may modify content of the `integrationtest/extroot/` directory.
 ## Code Formatting

--- a/integrationtests/crawl.py
+++ b/integrationtests/crawl.py
@@ -36,6 +36,8 @@ from caosadvancedtools.scifolder import (AnalysisCFood, ExperimentCFood,
                                         PublicationCFood, SimulationCFood,
                                         SoftwareCFood, ResultTableCFood)
+from example_hdf5cfood import ExampleH5CFood
 try:
    from sss_helper import get_argument_parser, print_success
 except ModuleNotFoundError:
@@ -89,7 +91,7 @@ if __name__ == "__main__":
                    interactive=False, hideKnown=False,
                    cfood_types=[ExperimentCFood, AnalysisCFood, SoftwareCFood,
                                 PublicationCFood, SimulationCFood, 
-                                 ResultTableCFood])
+                                 ExampleH5CFood, ResultTableCFood])
    if args.authorize_run:
        for run_id in args.authorize_run:

--- a/integrationtests/example_hdf5cfood.py
+++ b/integrationtests/example_hdf5cfood.py
+#!/usr/bin/env python3
+# encoding: utf-8
+#
+# ** header v3.0
+# This file is a part of the CaosDB Project.
+#
+# Copyright (C) 2021 IndiScale GmbH <www.indiscale.com>
+# Copyright (C) 2021 Henrik tom Wörden <h.tomwoerden@indiscale.com>
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <https://www.gnu.org/licenses/>.
+#
+# ** end header
+#
+"""
+An exemplary definition of a HDF5 CFood for integration testing
+"""
+import caosdb as db
+from caosadvancedtools.cfoods.h5 import H5CFood
+from caosadvancedtools.scifolder import ExperimentCFood
+from caosadvancedtools.scifolder.generic_pattern import readme_pattern
+class ExampleH5CFood(H5CFood):
+    """Exemplary HDF5 CFood whose root Record is an ``ExampleH5`` Record."""
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.root_name = "ExampleH5"
+    @staticmethod
+    def get_re():
+        """Return the ExperimentCFood pattern with the README suffix swapped for ``.*\\.hdf5``."""
+        experiment_re = ExperimentCFood.get_re()
+        # Cut off the trailing README pattern and match hdf5 files instead.
+        prefix = experiment_re[:-len(readme_pattern)]
+        return prefix + r".*\.hdf5"
+    def create_identifiables(self):
+        """Append the root identifiable: an ``ExampleH5`` Record pointing to the crawled file."""
+        super().create_identifiables()
+        root = self.identifiable_root = db.Record()
+        root.add_property("hdf5File", self.crawled_file)
+        root.add_parent("ExampleH5")
+        self.identifiables.append(root)
+    def special_treatment(self, key, value, dtype):
+        """Rename the key ``attr_data_root`` to ``single_attribute``; pass everything else through."""
+        renamed = "single_attribute" if key == "attr_data_root" else key
+        return renamed, value, dtype
--- a/integrationtests/extroot/ExperimentalData/2010_TestProject/2019-02-03/hdf5_dummy_file.hdf5
+++ b/integrationtests/extroot/ExperimentalData/2010_TestProject/2019-02-03/hdf5_dummy_file.hdf5
--- a/integrationtests/extroot/ExperimentalData/2010_TestProject/2019-02-04/README.md
+++ b/integrationtests/extroot/ExperimentalData/2010_TestProject/2019-02-04/README.md
+---
+responsible:
+- Tom Wood
+description: Something.
+...
--- a/integrationtests/filldb.sh
+++ b/integrationtests/filldb.sh
@@ -7,4 +7,5 @@ python3 -m caosadvancedtools.loadFiles /opt/caosdb/mnt/extroot/SimulationData
 python3 -m caosadvancedtools.loadFiles /opt/caosdb/mnt/extroot/Publications
 python3 -m caosadvancedtools.loadFiles /opt/caosdb/mnt/extroot/Software
 python3 insert_model.py 
+python3 insert_some.py
 python3 crawl.py /
--- a/integrationtests/insert_model.py
+++ b/integrationtests/insert_model.py
 #!/usr/bin/env python3
 import caosdb as db
+import h5py
+from caosadvancedtools.cfoods.h5 import H5CFood
+from caosadvancedtools.models.data_model import DataModel
 from caosadvancedtools.models.parser import parse_model_from_yaml
 model = parse_model_from_yaml("model.yml")
@@ -9,3 +12,11 @@ if len(db.execute_query("FIND Property alias")) == 0:
    al = db.Property(name="alias")
    al.add_parent(name="name")
    al.insert()
+# Derive the data model for the HDF5 example from the dummy file and
+# synchronize it with the server.
+h5model = db.Container()
+# Open the file in a ``with`` block so that the handle is closed again as soon
+# as the structure has been read.
+with h5py.File('extroot/ExperimentalData/2010_TestProject/2019-02-03/hdf5_dummy_file.hdf5', 'r') as h5file:
+    H5CFood.create_structure(h5file, create_recordTypes=True, collection=h5model,
+                             root_name="ExampleH5")
+print(h5model)
+h5model = DataModel(h5model)
+h5model.sync_data_model(noquestion=True)
--- a/integrationtests/insert_some.py
+++ b/integrationtests/insert_some.py
+#!/usr/bin/env python3
+import caosdb as db
+from caosadvancedtools.scifolder.experiment_cfood import dm
+# Insert a project, a person and an experiment Record up front.  They stand in
+# for identifiables: when no dependencies are possible among identifiables, it
+# should not be possible to find both, because the experiment identifiable
+# would, for example, not reference the correct project Record.
+project = db.Record(name='2010_TestProject')
+project.add_parent(name=dm.Project)
+project.insert()
+person = db.Record()
+person.add_parent("Person")
+for prop_name, prop_value in (("lastname", "Wood"), ("firstname", "Tom")):
+    person.add_property(prop_name, prop_value)
+person.insert()
+experiment = db.Record()
+experiment.add_parent(name=dm.Experiment)
+experiment.description = "Something."
+experiment.add_property(name=dm.date, value='2019-02-04')
+experiment.add_property(name=dm.Project, value=project)
+experiment.add_property(name="identifier", value="empty_identifier")
+experiment.add_property(name="responsible", value=person)
+# The flag is passed exactly as before.
+experiment.insert(flags={"force-missing-obligatory": "ignore"})
--- a/integrationtests/model.yml
+++ b/integrationtests/model.yml
@@ -9,6 +9,7 @@ Experiment:
  # TODO empty  recommended_properties is a problem
  #recommended_properties:
    responsible:
+      datatype: LIST<Person>
 Project:
 SoftwareVersion:
  recommended_properties:
@@ -38,16 +39,16 @@ Person:
    email:
      datatype: TEXT 
      description: 'Email of a Person.'
-responsible:
-  datatype: REFERENCE
 revisionOf:
  datatype: REFERENCE
 results:
-  datatype: REFERENCE
+  datatype: LIST<REFERENCE>
 sources:
-  datatype: REFERENCE
+  datatype: LIST<REFERENCE>
 scripts:
-  datatype: REFERENCE
+  datatype: LIST<REFERENCE>
+single_attribute:
+  datatype: LIST<INTEGER>
 Simulation:
  obligatory_properties:
    date:
@@ -74,3 +75,5 @@ Presentation:
 Report:
  inherit_from_suggested:
  - Publication
+hdf5File:
+  datatype: REFERENCE
--- a/integrationtests/test.sh
+++ b/integrationtests/test.sh
 #!/bin/bash
+# Unless --force is given, ask for explicit confirmation before the database
+# is wiped.  Anything other than a literal "yes" (including empty input)
+# aborts the run.
+if [ "$1" != "--force" ]
+then
+    echo "Warning: For these tests, the whole database will be deleted. Do you want to proceed? (yes/Exit)"
+    read safety
+    # The variable must be quoted: unquoted, an answer containing spaces (e.g.
+    # "no thanks") makes `[` fail with a syntax error, no branch is taken and
+    # the script would continue and delete the database.
+    if [ "$safety" != "yes" ]
+    then
+        echo "Exiting..."
+        exit 0
+    fi
+fi
 OUT=/tmp/crawler.output
 ls 
 cat pycaosdb.ini
 rm -rf cache.db
+set -e
 echo "Clearing database"
 python3 clear_database.py
 echo "Testing crawler without cfoods"
@@ -19,17 +34,16 @@ echo "Filling the database"
 echo "Testing the crawler database"
 python3 -m pytest test_crawler_with_cfoods.py
 echo "make a change"
-pushd extroot
+cd extroot
 egrep -liRZ 'A description of another example' . | xargs -0 -l sed -i -e 's/A description of another example/A description of this example/g'
 # remove a file to check that this does not lead to a crawler crash
 mv DataAnalysis/2010_TestProject/2019-02-03_something/README.xlsx DataAnalysis/2010_TestProject/2019-02-03_something/README.xlsx_back
-popd
+cd ..
 echo "run crawler"
 ./crawl.py  / | tee $OUT
 # rename the moved file
 mv extroot/DataAnalysis/2010_TestProject/2019-02-03_something/README.xlsx_back extroot/DataAnalysis/2010_TestProject/2019-02-03_something/README.xlsx
 # check whether there was something UNAUTHORIZED
-set -e
 grep "There where unauthorized changes" $OUT
 # get the id of the run which is the last field of the output string
 RUN_ID=$(grep "run id:" $OUT | awk '{ print $NF }')
@@ -44,9 +58,9 @@ then
 fi
 set -e
 echo "undo changes"
-pushd extroot
+cd extroot
 egrep -liRZ 'A description of this example' . | xargs -0 -l sed -i -e 's/A description of this example/A description of another example/g'
-popd
+cd ..
 python3 test_table.py
 # TODO the following test deletes lots of the data inserted by the crawler
 echo "Testing im and export"

--- a/integrationtests/test_crawler_with_cfoods.py
+++ b/integrationtests/test_crawler_with_cfoods.py
@@ -26,6 +26,7 @@ import os
 import unittest
 import caosdb as db
+from caosdb.apiutils import retrieve_entity_with_id
 def get_entity_with_id(eid):
@@ -34,6 +35,14 @@ def get_entity_with_id(eid):
 class CrawlerTest(unittest.TestCase):
    def test_experiment(self):
+        ########################
+        # # dummy for dependency test experiment # #
+        ########################
+        exp = db.execute_query(
+            "FIND Experiment with date=2019-02-04 and identifier=empty_identifier",
+            unique=True)
        ########################
        # # first experiment # #
        ########################
@@ -489,3 +498,17 @@ class CrawlerTest(unittest.TestCase):
        # Should have a description
        self.assertIsNotNone(ana.description)
+    def test_exampleh5(self):
+        """Check the properties of the Records referenced by the ExampleH5 Record."""
+        examp = db.execute_query("FIND Record ExampleH5", unique=True)
+        # Name of a property of the ExampleH5 Record -> name of a property
+        # which the entity referenced by it must have.
+        expected = {
+            'group_level1_a': "group_level2_aa",
+            'group_level1_b': "level1_b_floats",
+            'group_level1_c': "level1_c_floats",
+            'root_integers': "single_attribute",
+        }
+        for prop in examp.properties:
+            if prop.name not in expected:
+                continue
+            # Retrieve the referenced entity only once per property.
+            referenced = retrieve_entity_with_id(prop.value)
+            self.assertIsNotNone(referenced.get_property(expected[prop.name]))
+            if prop.name == 'group_level1_a':
+                # The referenced entity must not have a property with the
+                # name of the referencing property.
+                self.assertIsNone(referenced.get_property("group_level1_a"))
--- a/integrationtests/test_data_model.py
+++ b/integrationtests/test_data_model.py
@@ -33,13 +33,6 @@ class DataModelTest(unittest.TestCase):
        rt = db.execute_query("FIND RECORDTYPE TestRecord", unique=True)
        assert rt.get_property("test") is not None
-    def tearDown(self):
-        try:
-            tests = db.execute_query("FIND test*")
-            tests.delete()
-        except Exception:
-            pass
    def test_missing(self):
        # Test sync with missing prop
        # insert propt
@@ -52,3 +45,19 @@ class DataModelTest(unittest.TestCase):
        dm.sync_data_model(noquestion=True)
        rt = db.execute_query("FIND RECORDTYPE TestRecord", unique=True)
        assert rt.get_property("testproperty") is not None
+    def test_get_existing_entities(self):
+        """Only the entity inserted beforehand is reported as existing."""
+        db.RecordType(name="TestRecord").insert()
+        # One entity that was not inserted here ("test") and one that was.
+        candidates = db.Container().extend([
+            db.Property(name="test"),
+            db.RecordType(name="TestRecord")])
+        existing = DataModel.get_existing_entities(candidates)
+        assert len(existing) == 1
+        assert existing[0].name == "TestRecord"
+    def tearDown(self):
+        """Best-effort cleanup: delete whatever ``FIND test*`` returns, ignoring any error."""
+        try:
+            db.execute_query("FIND test*").delete()
+        except Exception:
+            pass
--- a/integrationtests/test_im_und_export.py
+++ b/integrationtests/test_im_und_export.py
@@ -3,15 +3,14 @@ import os
 from tempfile import TemporaryDirectory
 import caosdb as db
+from caosadvancedtools.export_related import export_related_to
-from caosadvancedtools.export_related import export
 from caosadvancedtools.import_from_xml import import_xml
 if __name__ == "__main__":
    print("Conducting im- and export tests")
    rec = db.execute_query("FIND 2019-02-03_really_cool_finding", unique=True)
    directory = TemporaryDirectory()
-    export(rec.id, directory=directory.name)
+    export_related_to(rec.id, directory=directory.name)
    # delete everything
    recs = db.execute_query("FIND entity with id>99")
    recs.delete()

--- a/setup.py
+++ b/setup.py
@@ -157,12 +157,15 @@ def setup_package():
        install_requires=["caosdb>=0.4.0",
                          "openpyxl>=3.0.0",
                          "pandas>=1.2.0",
+                          "numpy>=1.17.3",
                          "xlrd>=2.0",
                          ],
+        extras_require={"h5-crawler": ["h5py>=3.3.0", ],
+                        },
        packages=find_packages('src'),
        package_dir={'': 'src'},
        setup_requires=["pytest-runner>=2.0,<3dev"],
-        tests_require=["pytest", "pytest-cov", "coverage>=4.4.2"],
+        tests_require=["pytest", "pytest-pythonpath", "pytest-cov", "coverage>=4.4.2"],
    )
    try:
        setup(**metadata)

--- a/src/caosadvancedtools/cache.py
+++ b/src/caosadvancedtools/cache.py
@@ -32,6 +32,8 @@ from hashlib import sha256
 import caosdb as db
 from lxml import etree
+import tempfile
 def put_in_container(stuff):
    if isinstance(stuff, list):
@@ -154,7 +156,9 @@ class UpdateCache(Cache):
    def __init__(self, db_file=None):
        if db_file is None:
-            db_file = "/tmp/crawler_update_cache.db"
+            tmppath = tempfile.gettempdir()
+            tmpf = os.path.join(tmppath, "crawler_update_cache.db")
+            db_file = tmpf
        super().__init__(db_file=db_file)
    @staticmethod

--- a/src/caosadvancedtools/cfood.py
+++ b/src/caosadvancedtools/cfood.py
@@ -152,9 +152,19 @@ fileguide = FileGuide()
 class AbstractCFood(object, metaclass=ABCMeta):
+    """ Abstract base class for Crawler food (CFood)."""
    def __init__(self, item):
-        """ Abstract base class for Crawler food (CFood)."""
+        """A CFood has two main methods which must be customized:
+    1. `create_identifiables`
+        This method defines (and inserts if necessary) the identifiables which may be updated at a
+        later stage.  After calling this method, the `identifiables` Container contains those
+        Records which will be updated at a later time.
+    2. `update_identifiables`
+        This method updates the stored identifiables as necessary.
+        """
        self.to_be_updated = db.Container()
        self.identifiables = db.Container()
        self.item = item
@@ -298,7 +308,7 @@ class AbstractFileCFood(AbstractCFood):
        super().__init__(*args, item=crawled_path, **kwargs)
        self._crawled_file = None
        self.crawled_path = crawled_path
-        self.match = re.match(type(self).get_re(), crawled_path)
+        self.match = re.match(self.get_re(), crawled_path)
        self.attached_filenames = []
    @property
@@ -309,7 +319,31 @@ class AbstractFileCFood(AbstractCFood):
        return self._crawled_file
    @staticmethod
-    def get_re():
+    def re_from_extensions(extensions):
+        """Build a regular expression string which matches the given file extensions.
+        Intended as a helper for inheriting classes.
+        Parameters
+        ----------
+        extensions : iterable<str>
+            The allowed extensions.  They are inserted into the pattern verbatim.
+        Returns
+        -------
+        out : str or None
+            ``None`` if ``extensions`` is falsy (e.g. empty).  Otherwise the regular expression,
+            starting with ``.*\\.`` and ending with the EOL dollar character.  The actual
+            extension will be accessible in the
+            :py:attr:`pattern group name<python:re.Pattern.groupindex>` ``ext``.
+        """
+        if extensions:
+            alternatives = "|".join(extensions)
+            return r".*\.(?P<ext>" + alternatives + ")$"
+        return None
+    @classmethod
+    def get_re(cls):
        """ Returns the regular expression used to identify files that shall be
        processed
@@ -377,6 +411,7 @@ def assure_object_is_in_list(obj, containing_object, property_name,
    if containing_object.get_property(property_name) is None:
        containing_object.add_property(property_name, value=[],
                                       datatype=datatype)
+    # TODO: case where multiple times the same property exists is not treated
    if not isinstance(containing_object.get_property(property_name).value, list):
        containing_object.get_property(property_name).value = [
@@ -628,7 +663,12 @@ def assure_has_property(entity, name, value, to_be_updated=None,
        value = value.id
    for el in possible_properties:
-        if el.value == value:
+        tmp_value = el.value
+        if isinstance(tmp_value, db.Entity):
+            tmp_value = el.value.id
+        if tmp_value == value:
            contained = True
            break