diff --git a/.docker/Dockerfile b/.docker/Dockerfile index 1651fa08f7fb157e007cf5c4a992f548b7d411ba..43e5eff1171da8d69eb8897bea678bf90572570a 100644 --- a/.docker/Dockerfile +++ b/.docker/Dockerfile @@ -2,6 +2,8 @@ FROM debian:10 RUN apt-get update && \ apt-get install \ curl \ + libhdf5-dev \ + pkgconf \ python3 \ python3-pip \ python3-requests \ @@ -27,6 +29,6 @@ RUN pip3 install recommonmark sphinx-rtd-theme COPY . /git RUN rm -r /git/.git \ && mv /git/.docker/pycaosdb.ini /git/integrationtests -RUN cd /git && pip3 install . +RUN cd /git && pip3 install .[h5-crawler] WORKDIR /git/integrationtests -CMD /wait-for-it.sh caosdb-server:10443 -t 500 -- ./test.sh +CMD /wait-for-it.sh caosdb-server:10443 -t 500 -- ./test.sh --force diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 2a80211839ae3db85765c99629247f06e2c6778b..c9cd5b631cea84f44c5296edf4b789d83982d074 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -58,8 +58,8 @@ test: - cd .docker - /bin/sh ./run.sh - cd .. - - docker logs docker_caosdb-server_1 &> ../caosdb_log.txt - - docker logs docker_sqldb_1 &> ../mariadb_log.txt + - docker logs docker_caosdb-server_1 &> caosdb_log.txt + - docker logs docker_sqldb_1 &> mariadb_log.txt - docker-compose -f .docker/docker-compose.yml down - rc=`cat .docker/result` - exit $rc diff --git a/CHANGELOG.md b/CHANGELOG.md index e885b7c5d389d05af2f48f3b184355e4e53ad157..3e4f6f8eeddbf62f599bdd4f3fd230cfc3beb9d0 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -28,9 +28,16 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Automated documentation builds: `make doc` - Crawler documentation - Proof-of-concept integration with Bloxberg. +- Introduce a cfood that can create a Record structure based on the contents of an HDF5 file; + h5py is now an optional dependency +- table importer implementations for CSV and TSV +- string-in-list check for table imports ### Changed ### +- identifiables of single CFoods are now treated one after the other. This + allows them to have dependencies among each other if they are ordered + correctly. - identifiables must have at least one property or a name * `caosadvancedtools.serverside.helper.init_data_model` also checks the role and data type of entities. @@ -61,6 +68,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 cause an `sqlite3.IntegrityError` if more than one change was cached for the same entity. * #40 Insertion of identifiables with missing obligatory properties +- Before, a Property with the datatype "LIST(TEXT)" would lead to the creation + of a RecordType. This is fixed now. +* #52 `XLSimporter.read_xls` threw the wrong error when reading from a file with a wrong file extension. + Now, a `DataInconsistencyError` is raised instead of a `ValueError`. +* List properties are no longer updated unnecessarily by the crawler. ### Security ### diff --git a/Makefile b/Makefile index cbac0ea0a77e5523529ef181d83ffb9738d72faf..7609444bd4fd3a8ce980eca0bc3993b3cf2e168f 100644 --- a/Makefile +++ b/Makefile @@ -21,7 +21,7 @@ # This Makefile is a wrapper for several other scripts. -.PHONY: help doc install +.PHONY: help doc install unittest help: @echo 'Type `make doc` for documentation, or `make install` for (local) installation.' @@ -30,4 +30,7 @@ doc: $(MAKE) -C src/doc html install: - @echo "Not implemented yet, use pip for installation." + pip3 install .
+ +unittest: + pytest-3 unittests diff --git a/README_SETUP.md b/README_SETUP.md index 19f051636952945fe76b2ab752264031ac43378d..e5ebd969462f7d2c28a329e2c6b6e1bab1252775 100644 --- a/README_SETUP.md +++ b/README_SETUP.md @@ -12,6 +12,11 @@ Dependencies will be installed automatically if you use the below described proc - `caosdb>=0.4.0` - `openpyxl>=3.0.0` - `xlrd>=1.2.0` +- `pandas>=1.2.0` +- `numpy>=1.17.3` + +If you want to use the optional h5-crawler, the following dependencies will be installed additionally: +- `h5py>=3.3.0` For testing: - `tox` @@ -21,6 +26,9 @@ For testing: - `pip install . --user` - `pip install tox --user` +Optional h5-crawler: +- `pip install .[h5-crawler] --user` + ## Run Unit Tests `tox` @@ -31,9 +39,11 @@ For testing: extroot. E.g. `sudo mount -o bind extroot ../../caosdb-deploy/profiles/empty/paths/extroot` (or whatever path the extroot of the empty profile to be used is located at). -3. Start an empty (!) CaosDB instance (with the mounted extroot). The - database will be cleared during testing, so it's important to use +3. Start (or restart) an empty (!) CaosDB instance (with the mounted extroot). + The database will be cleared during testing, so it's important to use an empty instance. + Make sure your configuration for the Python caosdb module is correct and + allows connecting to the server. 4. Run `test.sh`. Note that this may modify content of the `integrationtest/extroot/` directory. ## Code Formatting diff --git a/integrationtests/crawl.py b/integrationtests/crawl.py index 61f51c297bc1fafa686a334031f772e095ab3896..defed2cb4f5fb0a0f349898e555c5d25924e2f9b 100755 --- a/integrationtests/crawl.py +++ b/integrationtests/crawl.py @@ -36,6 +36,8 @@ from caosadvancedtools.scifolder import (AnalysisCFood, ExperimentCFood, PublicationCFood, SimulationCFood, SoftwareCFood, ResultTableCFood) +from example_hdf5cfood import ExampleH5CFood + try: from sss_helper import get_argument_parser, print_success except ModuleNotFoundError: @@ -89,7 +91,9 @@ if __name__ == "__main__": interactive=False, hideKnown=False, cfood_types=[ExperimentCFood, AnalysisCFood, SoftwareCFood, PublicationCFood, SimulationCFood, - ResultTableCFood]) + ResultTableCFood, + ExampleH5CFood + ]) if args.authorize_run: for run_id in args.authorize_run: diff --git a/integrationtests/example_hdf5cfood.py b/integrationtests/example_hdf5cfood.py new file mode 100644 index 0000000000000000000000000000000000000000..5485402d2042b2055a087b99abcba409095a7c70 --- /dev/null +++ b/integrationtests/example_hdf5cfood.py @@ -0,0 +1,56 @@ +#!/usr/bin/env python3 +# encoding: utf-8 +# +# ** header v3.0 +# This file is a part of the CaosDB Project. +# +# Copyright (C) 2021 IndiScale GmbH <www.indiscale.com> +# Copyright (C) 2021 Henrik tom Wörden <h.tomwoerden@indiscale.com> +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <https://www.gnu.org/licenses/>.
+# +# ** end header +# + +""" +An exemplary definition of an HDF5 CFood for integration testing +""" + +import caosdb as db +from caosadvancedtools.cfoods.h5 import H5CFood +from caosadvancedtools.scifolder import ExperimentCFood +from caosadvancedtools.scifolder.generic_pattern import readme_pattern + + +class ExampleH5CFood(H5CFood): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.root_name = "ExampleH5" + + @staticmethod + def get_re(): + return ExperimentCFood.get_re()[:-len(readme_pattern)] + r".*\.hdf5" + + def create_identifiables(self): + super().create_identifiables() + self.identifiable_root = db.Record() + self.identifiable_root.add_property("hdf5File", self.crawled_file) + self.identifiable_root.add_parent("ExampleH5") + self.identifiables.append(self.identifiable_root) + + def special_treatment(self, key, value, dtype): + if key == "attr_data_root": + return "single_attribute", value, dtype + + return key, value, dtype diff --git a/integrationtests/extroot/ExperimentalData/2010_TestProject/2019-02-03/hdf5_dummy_file.hdf5 b/integrationtests/extroot/ExperimentalData/2010_TestProject/2019-02-03/hdf5_dummy_file.hdf5 new file mode 100644 index 0000000000000000000000000000000000000000..41bfb7ab3bcac19d90fd4f018cdd8118ae806eaf Binary files /dev/null and b/integrationtests/extroot/ExperimentalData/2010_TestProject/2019-02-03/hdf5_dummy_file.hdf5 differ diff --git a/integrationtests/extroot/ExperimentalData/2010_TestProject/2019-02-04/README.md b/integrationtests/extroot/ExperimentalData/2010_TestProject/2019-02-04/README.md new file mode 100644 index 0000000000000000000000000000000000000000..7de3bd15d29b93085322250a06adb9b8f389f8e4 --- /dev/null +++ b/integrationtests/extroot/ExperimentalData/2010_TestProject/2019-02-04/README.md @@ -0,0 +1,5 @@ +--- +responsible: +- Tom Wood +description: Something. +...
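A quick way to sanity-check the pattern above (an illustrative sketch, not part of the test suite; it assumes crawled paths start at the extroot, as in these integration tests, and uses the dummy file added in this commit):

```python
import re

from example_hdf5cfood import ExampleH5CFood

# The dummy HDF5 file added above should be matched by the CFood's regex.
path = "/ExperimentalData/2010_TestProject/2019-02-03/hdf5_dummy_file.hdf5"
assert re.match(ExampleH5CFood.get_re(), path) is not None
```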
diff --git a/integrationtests/filldb.sh b/integrationtests/filldb.sh index 98d22347bd2d40e8384a2a217452fd3ba5bc445f..9f55365eb595537b43caa9b197c8bc31ea1e69cb 100755 --- a/integrationtests/filldb.sh +++ b/integrationtests/filldb.sh @@ -7,4 +7,5 @@ python3 -m caosadvancedtools.loadFiles /opt/caosdb/mnt/extroot/SimulationData python3 -m caosadvancedtools.loadFiles /opt/caosdb/mnt/extroot/Publications python3 -m caosadvancedtools.loadFiles /opt/caosdb/mnt/extroot/Software python3 insert_model.py +python3 insert_some.py python3 crawl.py / diff --git a/integrationtests/insert_model.py b/integrationtests/insert_model.py index 270a08a36d7512a8642c2ca08a9ec6ea93b81bd9..ae3dd7701b44f5008bd976d81f8ecc8d9a02bf89 100755 --- a/integrationtests/insert_model.py +++ b/integrationtests/insert_model.py @@ -1,5 +1,8 @@ #!/usr/bin/env python3 import caosdb as db +import h5py +from caosadvancedtools.cfoods.h5 import H5CFood +from caosadvancedtools.models.data_model import DataModel from caosadvancedtools.models.parser import parse_model_from_yaml model = parse_model_from_yaml("model.yml") @@ -9,3 +12,11 @@ if len(db.execute_query("FIND Property alias")) == 0: al = db.Property(name="alias") al.add_parent(name="name") al.insert() + +h5model = db.Container() +h5file = h5py.File('extroot/ExperimentalData/2010_TestProject/2019-02-03/hdf5_dummy_file.hdf5', 'r') +H5CFood.create_structure(h5file, create_recordTypes=True, collection=h5model, + root_name="ExampleH5") +print(h5model) +h5model = DataModel(h5model) +h5model.sync_data_model(noquestion=True) diff --git a/integrationtests/insert_some.py b/integrationtests/insert_some.py new file mode 100644 index 0000000000000000000000000000000000000000..cf16a45ddf1f95ed261af1d9f18edfa1cbf4b450 --- /dev/null +++ b/integrationtests/insert_some.py @@ -0,0 +1,28 @@ +#!/usr/bin/env python3 +import caosdb as db +from caosadvancedtools.scifolder.experiment_cfood import dm + +# This inserts two identifiables. If no dependencies among identifiables were +# possible, it would not be possible to find both: the experiment identifiable +# would, for example, not reference the correct Project Record. +project = db.Record(name='2010_TestProject') +project.add_parent(name=dm.Project) +project.insert() + +pers = db.Record() +pers.add_parent("Person") +pers.add_property("lastname", "Wood") +pers.add_property("firstname", "Tom") +pers.insert() + +experiment = db.Record() +experiment.add_parent(name=dm.Experiment) +experiment.description = "Something." +experiment.add_property( + name=dm.date, value='2019-02-04') +experiment.add_property(name=dm.Project, value=project) +experiment.add_property( + name="identifier", value="empty_identifier") +experiment.add_property( + name="responsible", value=pers) +experiment.insert(flags={"force-missing-obligatory": "ignore"}) diff --git a/integrationtests/model.yml b/integrationtests/model.yml index 357adfc7b6618f427297105f722b2d333c34c792..eaf1c084787fb8ed181db9abbdc05ae74d6a212f 100644 --- a/integrationtests/model.yml +++ b/integrationtests/model.yml @@ -9,6 +9,7 @@ Experiment: # TODO empty recommended_properties is a problem #recommended_properties: responsible: + datatype: LIST<Person> Project: SoftwareVersion: recommended_properties: @@ -38,16 +39,16 @@ Person: email: datatype: TEXT description: 'Email of a Person.'
-responsible: - datatype: REFERENCE revisionOf: datatype: REFERENCE results: - datatype: REFERENCE + datatype: LIST<REFERENCE> sources: - datatype: REFERENCE + datatype: LIST<REFERENCE> scripts: - datatype: REFERENCE + datatype: LIST<REFERENCE> +single_attribute: + datatype: LIST<INTEGER> Simulation: obligatory_properties: date: @@ -74,3 +75,5 @@ Presentation: Report: inherit_from_suggested: - Publication +hdf5File: + datatype: REFERENCE diff --git a/integrationtests/test.sh b/integrationtests/test.sh index a56b758421a059a0cc3461c08600c13ffd93705c..71af543643a35cb082f10a24440c5ea87df946c9 100755 --- a/integrationtests/test.sh +++ b/integrationtests/test.sh @@ -1,8 +1,23 @@ #!/bin/bash +if [ "$1" != "--force" ] +then + echo "Warning: For these tests, the whole database will be deleted. Do you want to proceed? (yes/Exit)" + read safety + if [ -z $safety ] + then + echo "Exiting..." + exit 0 + elif [ $safety != "yes" ] + then + echo "Exiting..." + exit 0 + fi +fi OUT=/tmp/crawler.output ls cat pycaosdb.ini rm -rf cache.db +set -e echo "Clearing database" python3 clear_database.py echo "Testing crawler without cfoods" @@ -19,17 +34,16 @@ echo "Filling the database" echo "Testing the crawler database" python3 -m pytest test_crawler_with_cfoods.py echo "make a change" -pushd extroot +cd extroot egrep -liRZ 'A description of another example' . | xargs -0 -l sed -i -e 's/A description of another example/A description of this example/g' # remove a file to check that this does not lead to a crawler crash mv DataAnalysis/2010_TestProject/2019-02-03_something/README.xlsx DataAnalysis/2010_TestProject/2019-02-03_something/README.xlsx_back -popd +cd .. echo "run crawler" ./crawl.py / | tee $OUT # rename the moved file mv extroot/DataAnalysis/2010_TestProject/2019-02-03_something/README.xlsx_back extroot/DataAnalysis/2010_TestProject/2019-02-03_something/README.xlsx # check whether there was something UNAUTHORIZED -set -e grep "There where unauthorized changes" $OUT # get the id of the run which is the last field of the output string RUN_ID=$(grep "run id:" $OUT | awk '{ print $NF }') @@ -44,9 +58,9 @@ then fi set -e echo "undo changes" -pushd extroot +cd extroot egrep -liRZ 'A description of this example' . | xargs -0 -l sed -i -e 's/A description of this example/A description of another example/g' -popd +cd .. 
python3 test_table.py # TODO the following test deletes lots of the data inserted by the crawler echo "Testing im and export" diff --git a/integrationtests/test_crawler_with_cfoods.py b/integrationtests/test_crawler_with_cfoods.py index 18aa4847845ca2353d82a0439102211f1072e77e..fc07b6bde0ec5f0462cc6f51c27b875ff3a22b5c 100755 --- a/integrationtests/test_crawler_with_cfoods.py +++ b/integrationtests/test_crawler_with_cfoods.py @@ -26,6 +26,7 @@ import os import unittest import caosdb as db +from caosdb.apiutils import retrieve_entity_with_id def get_entity_with_id(eid): @@ -34,6 +35,14 @@ def get_entity_with_id(eid): class CrawlerTest(unittest.TestCase): def test_experiment(self): + + ######################## + # # dummy for dependency test experiment # # + ######################## + exp = db.execute_query( + "FIND Experiment with date=2019-02-04 and identifier=empty_identifier", + unique=True) + ######################## # # first experiment # # ######################## @@ -489,3 +498,17 @@ class CrawlerTest(unittest.TestCase): # Should have a description self.assertIsNotNone(ana.description) + + def test_exampleh5(self): + examp = db.execute_query("FIND Record ExampleH5", unique=True) + + for prop in examp.properties: + if prop.name == 'group_level1_a': + self.assertTrue(retrieve_entity_with_id(prop.value).get_property("group_level2_aa") is not None) + self.assertTrue(retrieve_entity_with_id(prop.value).get_property("group_level1_a") is None) + elif prop.name == 'group_level1_b': + self.assertTrue(retrieve_entity_with_id(prop.value).get_property("level1_b_floats") is not None) + elif prop.name == 'group_level1_c': + self.assertTrue(retrieve_entity_with_id(prop.value).get_property("level1_c_floats") is not None) + elif prop.name == 'root_integers': + self.assertTrue(retrieve_entity_with_id(prop.value).get_property("single_attribute") is not None) diff --git a/integrationtests/test_data_model.py b/integrationtests/test_data_model.py index 6f530719a810d76e5cc5a2c59fcd2d0325ff5268..2949fa81727a6c61a8646a48c249204fa87542d8 100644 --- a/integrationtests/test_data_model.py +++ b/integrationtests/test_data_model.py @@ -33,13 +33,6 @@ class DataModelTest(unittest.TestCase): rt = db.execute_query("FIND RECORDTYPE TestRecord", unique=True) assert rt.get_property("test") is not None - def tearDown(self): - try: - tests = db.execute_query("FIND test*") - tests.delete() - except Exception: - pass - def test_missing(self): # Test sync with missing prop # insert propt @@ -52,3 +45,19 @@ class DataModelTest(unittest.TestCase): dm.sync_data_model(noquestion=True) rt = db.execute_query("FIND RECORDTYPE TestRecord", unique=True) assert rt.get_property("testproperty") is not None + + def test_get_existing_entities(self): + db.RecordType(name="TestRecord").insert() + c = db.Container().extend([ + db.Property(name="test"), + db.RecordType(name="TestRecord")]) + exist = DataModel.get_existing_entities(c) + assert len(exist) == 1 + assert exist[0].name == "TestRecord" + + def tearDown(self): + try: + tests = db.execute_query("FIND test*") + tests.delete() + except Exception: + pass diff --git a/integrationtests/test_im_und_export.py b/integrationtests/test_im_und_export.py index db26249b14d3d547db8dcea4e49de2aa07479e5b..27995080aa5cbeeb6f562226d4f0c0ca19c64d83 100644 --- a/integrationtests/test_im_und_export.py +++ b/integrationtests/test_im_und_export.py @@ -3,15 +3,14 @@ import os from tempfile import TemporaryDirectory import caosdb as db - -from caosadvancedtools.export_related import export +from 
caosadvancedtools.export_related import export_related_to from caosadvancedtools.import_from_xml import import_xml if __name__ == "__main__": print("Conducting im- and export tests") rec = db.execute_query("FIND 2019-02-03_really_cool_finding", unique=True) directory = TemporaryDirectory() - export(rec.id, directory=directory.name) + export_related_to(rec.id, directory=directory.name) # delete everything recs = db.execute_query("FIND entity with id>99") recs.delete() diff --git a/setup.py b/setup.py index f26b126c2a589554ace736661aa3a685b3f671d3..772866537d02b71adddfab2a351a3e3372b05ab2 100755 --- a/setup.py +++ b/setup.py @@ -157,12 +157,15 @@ def setup_package(): install_requires=["caosdb>=0.4.0", "openpyxl>=3.0.0", "pandas>=1.2.0", + "numpy>=1.17.3", "xlrd>=2.0", ], + extras_require={"h5-crawler": ["h5py>=3.3.0", ], + }, packages=find_packages('src'), package_dir={'': 'src'}, setup_requires=["pytest-runner>=2.0,<3dev"], - tests_require=["pytest", "pytest-cov", "coverage>=4.4.2"], + tests_require=["pytest", "pytest-pythonpath", "pytest-cov", "coverage>=4.4.2"], ) try: setup(**metadata) diff --git a/src/caosadvancedtools/cache.py b/src/caosadvancedtools/cache.py index 3dac86ec328944303c629c8de721fb1a2f6a7bef..ff807f2aba6210d643e675e7e3dd91d7c3b30906 100644 --- a/src/caosadvancedtools/cache.py +++ b/src/caosadvancedtools/cache.py @@ -32,6 +32,8 @@ from hashlib import sha256 import caosdb as db from lxml import etree +import tempfile + def put_in_container(stuff): if isinstance(stuff, list): @@ -154,7 +156,9 @@ class UpdateCache(Cache): def __init__(self, db_file=None): if db_file is None: - db_file = "/tmp/crawler_update_cache.db" + tmppath = tempfile.gettempdir() + tmpf = os.path.join(tmppath, "crawler_update_cache.db") + db_file = tmpf super().__init__(db_file=db_file) @staticmethod diff --git a/src/caosadvancedtools/cfood.py b/src/caosadvancedtools/cfood.py index 8ce1dced48ba12e62717fe5bd788178e1e5a9488..c818792c79440dc1fcc78f3c0b1ed1b9bd215cb8 100644 --- a/src/caosadvancedtools/cfood.py +++ b/src/caosadvancedtools/cfood.py @@ -47,6 +47,7 @@ from abc import ABCMeta, abstractmethod from datetime import datetime import caosdb as db +from caosdb.common.models import Entity from caosdb.exceptions import (BadQueryError, EmptyUniqueQueryError, QueryNotUniqueError, TransactionError) @@ -152,9 +153,19 @@ fileguide = FileGuide() class AbstractCFood(object, metaclass=ABCMeta): + """ Abstract base class for Crawler food (CFood).""" def __init__(self, item): - """ Abstract base class for Crawler food (CFood).""" + """A CFood has two main methods which must be customized: + + 1. `create_identifiables` + This method defines (and inserts if necessary) the identifiables which may be updated at a + later stage. After calling this method, the `identifiables` Container contains those + Records which will be updated at a later time. + + 2. `update_identifiables` + This method updates the stored identifiables as necessary. 
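+
+        A minimal file-based subclass usually also overrides `get_re`; see
+        `caosadvancedtools.example_cfood.ExampleCFood` for a compact example.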
+ """ self.to_be_updated = db.Container() self.identifiables = db.Container() self.item = item @@ -298,7 +309,7 @@ class AbstractFileCFood(AbstractCFood): super().__init__(*args, item=crawled_path, **kwargs) self._crawled_file = None self.crawled_path = crawled_path - self.match = re.match(type(self).get_re(), crawled_path) + self.match = re.match(self.get_re(), crawled_path) self.attached_filenames = [] @property @@ -309,7 +320,31 @@ class AbstractFileCFood(AbstractCFood): return self._crawled_file @staticmethod - def get_re(): + def re_from_extensions(extensions): + """Return a regular expression which matches the given file extensions. + + Useful for inheriting classes. + + Parameters + ---------- + extensions : iterable<str> + An iterable with the allowed extensions. + + Returns + ------- + out : str + The regular expression, starting with ``.*\\.`` and ending with the EOL dollar + character. The actual extension will be accessible in the + :py:attr:`pattern group name<python:re.Pattern.groupindex>` ``ext``. + """ + + if not extensions: + return None + + return r".*\.(?P<ext>" + "|".join(extensions) + ")$" + + @classmethod + def get_re(cls): """ Returns the regular expression used to identify files that shall be processed @@ -377,6 +412,7 @@ def assure_object_is_in_list(obj, containing_object, property_name, if containing_object.get_property(property_name) is None: containing_object.add_property(property_name, value=[], datatype=datatype) + # TODO: the case where the same property exists multiple times is not treated if not isinstance(containing_object.get_property(property_name).value, list): containing_object.get_property(property_name).value = [ @@ -627,8 +663,19 @@ def assure_has_property(entity, name, value, to_be_updated=None, if isinstance(value, db.Entity): value = value.id + if isinstance(value, list): + value = [i.id if isinstance(i, db.Entity) else i for i in value] + for el in possible_properties: - if el.value == value: + tmp_value = el.value + + if isinstance(tmp_value, db.Entity): + tmp_value = el.value.id + + if isinstance(tmp_value, list): + tmp_value = [i.id if isinstance(i, db.Entity) else i for i in tmp_value] + + if tmp_value == value: contained = True break diff --git a/src/caosadvancedtools/cfoods/__init__.py b/src/caosadvancedtools/cfoods/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..30ce05add09a223c2f65dbe187a6cfb1768d7a22 --- /dev/null +++ b/src/caosadvancedtools/cfoods/__init__.py @@ -0,0 +1,21 @@ +#!/usr/bin/env python3 + +# This file is a part of the CaosDB Project. +# +# Copyright (C) 2020 IndiScale GmbH <www.indiscale.com> +# Copyright (C) 2020 Daniel Hornung <d.hornung@indiscale.com> +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <https://www.gnu.org/licenses/>.
+ +"""Specialized CFoods.""" diff --git a/src/caosadvancedtools/cfoods/h5.py b/src/caosadvancedtools/cfoods/h5.py new file mode 100644 index 0000000000000000000000000000000000000000..6c68edd3668fec957126aa3234a830aab98fcd25 --- /dev/null +++ b/src/caosadvancedtools/cfoods/h5.py @@ -0,0 +1,289 @@ +#!/usr/bin/env python3 + +# This file is a part of the CaosDB Project. +# +# Copyright (C) 2020,2021 IndiScale GmbH <www.indiscale.com> +# Copyright (C) 2020 Daniel Hornung <d.hornung@indiscale.com> +# Copyright (C) 2021 Henrik tom Wörden <h.tomwoerden@indiscale.com> +# Copyright (C) 2021 Alexander Kreft +# Copyright (C) 2021 Laboratory for Fluid Physics and Biocomplexity, +# Max-Planck-Institut für Dynamik und Selbstorganisation <www.lfpn.ds.mpg.de> +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <https://www.gnu.org/licenses/>. + +"""A CFood for HDF5 files + + +This module allows parsing HDF5 files and reproducing their structure in the form +of Records that reference each other. + +HDF5 files are composed of groups and datasets, both of which can have +attributes. Groups and datasets are mapped to Records, and attributes to +Properties. +""" + +import re +from copy import deepcopy + +import caosdb as db +import h5py +import numpy as np +from caosadvancedtools.cfood import fileguide +from caosdb.common.datatype import is_reference +from caosdb.common.utils import uuid + +from ..cfood import (AbstractFileCFood, assure_has_description, + assure_has_parent, assure_has_property, + assure_property_is) +from ..structure_mapping import (EntityMapping, collect_existing_structure, + update_structure) + + +def h5_attr_to_property(val): + """ returns the value and datatype of a CaosDB Property for the given value. + + + 1D arrays are converted to lists. + If no suitable Property can be created, (None, None) is returned. + + Arrays of dimension 2 or higher are ignored. + """ + + if isinstance(val, str): + return val, db.TEXT + elif isinstance(val, complex): + return val, db.TEXT + else: + if not hasattr(val, 'dtype'): + raise NotImplementedError("Code assumes only str are missing the " + "dtype attribute") + + if issubclass(val.dtype.type, np.floating): + dtype = db.DOUBLE + elif issubclass(val.dtype.type, np.integer): + dtype = db.INTEGER + elif val.dtype.kind in ['S', 'U']: + dtype = db.TEXT + val = val.astype(str) + elif val.dtype.kind == 'O': + if not np.all([isinstance(el, str) for el in val]): + raise NotImplementedError("Cannot convert arbitrary objects") + dtype = db.TEXT + val = val.astype(str) + else: + raise NotImplementedError("Unknown dtype used") + + if isinstance(val, np.ndarray): + if val.ndim > 1: + return None, None + # The tolist method is on both numpy.ndarray and numpy.generic + # and properly converts scalars (including 0-dimensional + # numpy.ndarray) to Python scalars and 1D arrays to lists of + # Python scalars.
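+            # For example, np.array([1.0, 2.0]) yields ([1.0, 2.0], LIST(DOUBLE)),
+            # while a 0-dimensional array such as np.array(1.0) yields (1.0, DOUBLE).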
+ if val.ndim != 0: + dtype = db.LIST(dtype) + val = val.tolist() + + # TODO this can eventually be removed + + if hasattr(val, 'ndim'): + if not isinstance(val, np.ndarray) and val.ndim != 0: + print(val, val.ndim) + raise Exception( + "Implementation assumes that only np.arrays have ndim.") + + return val, dtype + + +class H5CFood(AbstractFileCFood): + """ H5CFood which consumes an HDF5 file. + + The structure is mapped onto an equivalent structure of interconnected + Records. + + Attributes + ---------- + h5file : h5py.File, default None + The HDF5 file object to be read + """ + + # to be overwritten by subclasses + + def __init__(self, *args, **kwargs): + """CFood which consumes HDF5 files.""" + super().__init__(*args, **kwargs) + self.h5file = None + self.root_name = "root" + self.hdf5Container = db.Container() + self.em = EntityMapping() + + def collect_information(self): + self.h5file = h5py.File(fileguide.access(self.crawled_path), 'r') + + @staticmethod + def get_re(): + """Return a regular expression string to match *.h5, *.nc, *.hdf, *.hdf5.""" + extensions = [ + "h5", + "nc", + "hdf", + "hdf5", + ] + + return AbstractFileCFood.re_from_extensions(extensions) + + def create_identifiables(self): + """Create identifiables out of groups in the HDF5 file. + + This method will call is_identifiable(h5path, h5object) and create_identifiable(h5path, + h5object) on each HDF5 object to decide and actually create the identifiables. + """ + # manually create the identifiable root element: self.identifiable_root + self.structure = self.create_structure(self.h5file, + special_treatment=self.special_treatment, + root_name=self.root_name) + + def update_identifiables(self): + """Check if the identifiables need to be updated. + + In that case also add the updated entities to the list of updatables. + + This method will iterate over the groups and datasets governed by this CFood's identifiables + and call ``update_object(path, h5object)`` on each object. + + """ + + self.structure._cuid = "root element" + self.em.add(self.structure, self.identifiable_root) + collect_existing_structure(self.structure, self.identifiable_root, + self.em) + self.to_be_inserted = db.Container() + self.insert_missing_structure(self.structure) + + # TODO this is a workaround due to the fact that the caosdb library + # changes the objects in the Container if it is inserted. The graph + # structure is flattened. I.e. references to other entity objects are + # replaced with their IDs. However this code depends on this graph. + tmp_copy = deepcopy(self.to_be_inserted) + tmp_copy.insert() + + for e1, e2 in zip(tmp_copy, self.to_be_inserted): + e2.id = e1.id + # End workaround + + # self.update_structure(self.structure) + update_structure(self.em, self.to_be_updated, self.structure) + + def special_treatment(self, key, value, dtype): + """define special treatment of attributes + + to be overwritten by child classes. + + key: attribute name + value: attribute value + """ + + return key, value, dtype + + @classmethod + def create_structure(cls, h5obj, create_recordTypes=False, collection=None, + special_treatment=None, root_name="root"): + """Create Records and Record types from a given HDF5 object for all + items in the tree. Attributes are added as properties, the + values only if the dimension < 2.
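+
+        With ``create_recordTypes=True``, bare RecordTypes and Properties are
+        created (as needed for data model synchronization); otherwise Records
+        which reference each other are returned.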
+ + Parameters + ---------- + h5obj : h5py.File + an HDF5 file object + + root_name : str + Name that is used instead of '/'; it is the type of the root + Record (the Record corresponding to the root node in the + HDF5 file) + + Returns + ------- + rec : db.Record or db.RecordType + The entity corresponding to the root of the given HDF5 object; + all Record Types, Records and Properties created for the input + tree are appended to ``collection`` + + """ + + if collection is None: + collection = [] + + if special_treatment is None: + def special_treatment(x, y, z): return x, y, z + + if h5obj.name == "/": + name_without_path = root_name + else: + name_without_path = h5obj.name.split("/")[-1] + + if create_recordTypes: + rec = db.RecordType(name=name_without_path) + else: + rec = db.Record().add_parent(name=name_without_path) + collection.append(rec) + + if isinstance(h5obj, h5py.Group): + for subgroup in h5obj.keys(): + subgroup_name = h5obj[subgroup].name.split("/")[-1] + + sub = H5CFood.create_structure(h5obj[subgroup], + create_recordTypes=create_recordTypes, + collection=collection, + special_treatment=special_treatment) + + if create_recordTypes: + rec.add_property(subgroup_name) + else: + rec.add_property(subgroup_name, value=sub) + + for key, val in h5obj.attrs.items(): + # ignored + + if key in ["REFERENCE_LIST", "DIMENSION_LIST", "NAME", "CLASS"]: + continue + + val, dtype = h5_attr_to_property(val) + + if val is None and dtype is None: + continue + + if create_recordTypes and key.lower() not in ['description']: + treated_k, _, treated_dtype = special_treatment( + key, val, dtype) + + if treated_k is not None: + prop = db.Property(name=treated_k, datatype=treated_dtype) + collection.append(prop) + rec.add_property(name=treated_k) + else: + treated_k, treated_v, treated_dtype = special_treatment( + key, val, dtype) + + if treated_k is not None: + rec.add_property(name=treated_k, value=treated_v, + datatype=treated_dtype) + + return rec + + def insert_missing_structure(self, target_structure: db.Record): + if target_structure._cuid not in self.em.to_existing: + self.to_be_inserted.append(target_structure) + + for prop in target_structure.get_properties(): + if prop.is_reference(server_retrieval=True): + self.insert_missing_structure(prop.value) diff --git a/src/caosadvancedtools/collect_datamodel.py b/src/caosadvancedtools/collect_datamodel.py index 1ca68068e713dd34ebc3368ad760461578dee4ef..806d15333cac7f745ce2fb82a02e0214ad2b6616 100644 --- a/src/caosadvancedtools/collect_datamodel.py +++ b/src/caosadvancedtools/collect_datamodel.py @@ -26,14 +26,19 @@ import argparse import os import caosdb as db +from caosdb.apiutils import retrieve_entities_with_ids + +from export_related import export def get_dm(): - rts = set([r.name for r in db.execute_query("SELECT name FROM RECORDTYPE")]) + rts = set([(r.id, r.name) for r + in db.execute_query("SELECT name FROM RECORDTYPE")]) if None in rts: rts.remove(None) - ps = set([r.name for r in db.execute_query("SELECT name FROM PROPERTY")]) + ps = set([(r.id, r.name) for r + in db.execute_query("SELECT name FROM PROPERTY")]) if None in ps: ps.remove(None) @@ -47,18 +52,26 @@ def get_parser(): "be stored") p.add_argument("-c", "--compare", help="directory where the datamodel that" " shall be compared is stored") + p.add_argument("-x", "--xml", action="store_true", + help="store xml as well") return p -def store(directory): +def store(directory, xml=False): rts, ps = get_dm() os.makedirs(directory, exist_ok=True) with open(os.path.join(directory, "recordtypes.txt"), "w") as fi: - fi.write(",".join(rts)) + fi.write(",".join([el[1] for el in rts])) with
open(os.path.join(directory, "properties.txt"), "w") as fi: - fi.write(",".join(ps)) + fi.write(",".join([el[1] for el in ps])) + + if xml: + cont = retrieve_entities_with_ids( + [el[0] for el in rts]+[el[0] for el in ps]) + + export(cont, directory) def load_dm(directory): @@ -104,7 +117,7 @@ if __name__ == "__main__": args = p.parse_args() if args.store: - store(args.store) + store(args.store, xml=args.xml) if args.compare: compare(args.compare) diff --git a/src/caosadvancedtools/crawler.py b/src/caosadvancedtools/crawler.py index 747c533d7a4652434f967147d8a53d1847cfbb4e..5a8d428655791169557f5c292d30698f6ad69798 100644 --- a/src/caosadvancedtools/crawler.py +++ b/src/caosadvancedtools/crawler.py @@ -56,6 +56,7 @@ from .datainconsistency import DataInconsistencyError from .datamodel_problems import DataModelProblems from .guard import RETRIEVE, ProhibitedException from .guard import global_guard as guard +from .serverside.helper import send_mail as main_send_mail from .suppressKnown import SuppressKnown logger = logging.getLogger(__name__) @@ -500,7 +501,6 @@ carefully and if the changes are ok, click on the following link: """.format(url=caosdb_config["Connection"]["url"], filename=filename, changes="\n".join(changes)) - sendmail = caosdb_config["Misc"]["sendmail"] try: fro = caosdb_config["advancedtools"]["crawler.from_mail"] to = caosdb_config["advancedtools"]["crawler.to_mail"] @@ -510,8 +510,11 @@ carefully and if the changes are ok, click on the following link: "'from_mail' and 'to_mail'.") return - p = subprocess.Popen([sendmail, "-f", fro, to], stdin=subprocess.PIPE) - p.communicate(input=text.encode()) + main_send_mail( + from_addr=fro, + to=to, + subject="Crawler Update", + body=text) def push_identifiables_to_CaosDB(self, cfood): """ @@ -576,44 +579,51 @@ carefully and if the changes are ok, click on the following link: # looking for matching entities in CaosDB when there is no valid id # i.e. there was none set from a cache + existing = [] + inserted = [] + for ent in identifiables: if ent.id is None or ent.id < 0: logger.debug("Looking for: {}".format( ent.id if ent.id is not None else ent.name)) - existing = Crawler.find_existing(ent) + found = Crawler.find_existing(ent) - if existing is not None: - ent.id = existing.id + if found is not None: + ent.id = found.id else: logger.debug("Id is known of: {}".format(ent)) - # insert missing, i.e. those which are not valid - missing_identifiables = db.Container() - missing_identifiables.extend([ent for ent in identifiables - if ent.id is None or ent.id < 0]) - # TODO the following should not be necessary. Fix it - - for ent in missing_identifiables: - ent.id = None + # insert missing, i.e. 
those which are not valid + if ent.id is None or ent.id < 0: + missing = ent + ent.id = None + else: + missing = None + existing.append(ent) - if len(missing_identifiables) > 0: - info = "Going to insert the following entities:\n" + if missing: + try: + guard.safe_insert(missing, unique=False, + flags={"force-missing-obligatory": "ignore"}) + inserted.append(ent) + except Exception as e: + DataModelProblems.evaluate_exception(e) + if len(existing) > 0: + info = "Identified the following existing entities:\n" - for ent in missing_identifiables: + for ent in existing: info += str(ent)+"\n" logger.debug(info) + else: + logger.debug("Did not identify any existing entities") + if len(inserted) > 0: + info = "Inserted the following entities:\n" - if len(missing_identifiables) == 0: - logger.debug("No new entities to be inserted.") + for ent in inserted: + info += str(ent)+"\n" + logger.debug(info) else: - try: - logger.info( - "Inserting {} Records...".format( - len(missing_identifiables))) - guard.safe_insert(missing_identifiables, unique=False, - flags={"force-missing-obligatory": "ignore"}) - except Exception as e: - DataModelProblems.evaluate_exception(e) + logger.debug("Did not insert any new entities") logger.debug("Retrieving entities from CaosDB...") identifiables.retrieve(unique=True, raise_exception_on_error=False) @@ -693,8 +703,8 @@ class FileCrawler(Crawler): @staticmethod def query_files(path): - query_str = "FIND FILE WHICH IS STORED AT " + ( - path if path.endswith("/") else path + "/") + "**" + query_str = "FIND FILE WHICH IS STORED AT '" + ( + path if path.endswith("/") else path + "/") + "**'" q_info = "Sending the following query: '" + query_str + "'\n" files = db.execute_query(query_str) logger.info( diff --git a/src/caosadvancedtools/example_cfood.py b/src/caosadvancedtools/example_cfood.py index 6111d95defc37bbb6d836feec3fa3d2e4e3d91ab..2e395d5c3030508087e25a7156d35c8954d223d7 100644 --- a/src/caosadvancedtools/example_cfood.py +++ b/src/caosadvancedtools/example_cfood.py @@ -26,8 +26,8 @@ from .cfood import AbstractFileCFood, assure_has_property class ExampleCFood(AbstractFileCFood): - @staticmethod - def get_re(): + @classmethod + def get_re(cls): return (r".*/(?P<species>[^/]+)/" r"(?P<date>\d{4}-\d{2}-\d{2})/README.md") diff --git a/src/caosadvancedtools/export_related.py b/src/caosadvancedtools/export_related.py index 00f440d28a2ae1da14132083e4b8d3c5003d1b65..69b588c34cc7c8123ab4291f6d8f76f06e7400be 100755 --- a/src/caosadvancedtools/export_related.py +++ b/src/caosadvancedtools/export_related.py @@ -96,12 +96,15 @@ def invert_ids(entities): apply_to_ids(entities, lambda x: x*-1) -def export(rec_id, directory="."): +def export_related_to(rec_id, directory="."): if not isinstance(rec_id, int): raise ValueError("rec_id needs to be an integer") ent = db.execute_query("FIND {}".format(rec_id), unique=True) cont = recursively_collect_related(ent) + export(cont, directory=directory) + +def export(cont, directory="."): directory = os.path.abspath(directory) dl_dir = os.path.join(directory, "downloads") @@ -119,6 +122,9 @@ def export(rec_id, directory="."): print("Failed download of:", target) invert_ids(cont) + + for el in cont: + el.version = None xml = etree.tounicode(cont.to_xml( local_serialization=True), pretty_print=True) @@ -147,4 +153,4 @@ if __name__ == "__main__": parser = defineParser() args = parser.parse_args() - export(args.id, directory=args.directory) + export_related_to(args.id, directory=args.directory) diff --git a/src/caosadvancedtools/models/parser.py 
b/src/caosadvancedtools/models/parser.py index 5e1532e03690e753b8926b87b01db4e3a89f2c4c..e56a492fa3e9199a312d374a622770e7836f42cb 100644 --- a/src/caosadvancedtools/models/parser.py +++ b/src/caosadvancedtools/models/parser.py @@ -43,8 +43,27 @@ KEYWORDS_IGNORED = [ ] +def _get_listdatatype(dtype): + """matches a string to check whether the type definition is a list + + returns the type within the list, or None if it cannot be matched with a + list definition + """ + # TODO: string representation should be the same as used by the server: + # e.g. LIST<TEXT> + # this should be changed in the module and the old behaviour should be + # marked as deprecated + match = re.match(r"^LIST[(<](?P<dt>.*)[)>]$", dtype) + + if match is None: + return None + else: + return match.group("dt") + # Taken from https://stackoverflow.com/a/53647080, CC-BY-SA, 2018 by # https://stackoverflow.com/users/2572431/augurar + + class SafeLineLoader(yaml.SafeLoader): """Load a line and keep meta-information. @@ -56,6 +75,7 @@ class SafeLineLoader(yaml.SafeLoader): mapping = super().construct_mapping(node, deep=deep) # Add 1 so line numbering starts at 1 mapping['__line__'] = node.start_mark.line + 1 + return mapping # End of https://stackoverflow.com/a/53647080 @@ -76,12 +96,14 @@ class YamlDefinitionError(RuntimeError): def parse_model_from_yaml(filename): """Shortcut if the Parser object is not needed.""" parser = Parser() + return parser.parse_model_from_yaml(filename) def parse_model_from_string(string): """Shortcut if the Parser object is not needed.""" parser = Parser() + return parser.parse_model_from_string(string) @@ -105,6 +127,7 @@ class Parser(object): """ with open(filename, 'r') as outfile: ymlmodel = yaml.load(outfile, Loader=SafeLineLoader) + return self._create_model_from_dict(ymlmodel) def parse_model_from_string(self, string): @@ -121,6 +144,7 @@ class Parser(object): The created DataModel """ ymlmodel = yaml.load(string, Loader=SafeLineLoader) + return self._create_model_from_dict(ymlmodel) def _create_model_from_dict(self, ymlmodel): @@ -148,6 +172,7 @@ class Parser(object): # a record type with the name of the element. # The retrieved entity will be added to the model. # If no entity with that name is found an exception is raised. + if "extern" not in ymlmodel: ymlmodel["extern"] = [] @@ -170,7 +195,7 @@ class Parser(object): self._add_entity_to_model(name, entity) # initialize recordtypes self._set_recordtypes() - self._check_datatypes() + self._check_and_convert_datatypes() for name, entity in ymlmodel.items(): self._treat_entity(name, entity, line=ymlmodel["__line__"]) @@ -196,11 +221,14 @@ class Parser(object): out : str If `name` was a string, return it. Else return str(`name`). """ + if name is None: print("WARNING: Name of this context is None: {}".format(context), file=sys.stderr) + if not isinstance(name, str): name = str(name) + return name def _add_entity_to_model(self, name, definition): """ Adds the entity to the model; Properties are also initialized.
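+
+        A new Property is only created if the definition carries a datatype
+        and that datatype is not merely a list of the entity itself
+        (i.e. not simply an RT of the model).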
""" + if name == "__line__": return name = self._stringify(name) + if name not in self.model: self.model[name] = None @@ -221,8 +251,9 @@ class Parser(object): and isinstance(definition, dict) # is it a property and "datatype" in definition - # but not a list - and not definition["datatype"].startswith("LIST")): + # but not simply an RT of the model + and not (_get_listdatatype(definition["datatype"]) == name and + _get_listdatatype(definition["datatype"]) in self.model)): # and create the new property self.model[name] = db.Property(name=name, @@ -235,6 +266,7 @@ class Parser(object): if prop_type in definition: # Empty property mapping should be allowed. + if definition[prop_type] is None: definition[prop_type] = {} try: @@ -245,6 +277,7 @@ class Parser(object): except AttributeError as ate: if ate.args[0].endswith("'items'"): line = definition["__line__"] + if isinstance(definition[prop_type], list): line = definition[prop_type][0]["__line__"] raise YamlDefinitionError(line) from None @@ -252,26 +285,24 @@ class Parser(object): def _add_to_recordtype(self, ent_name, props, importance): """Add properties to a RecordType.""" + for n, e in props.items(): if n in KEYWORDS: if n in KEYWORDS_IGNORED: continue raise YamlDefinitionError("Unexpected keyword in line {}: {}".format( props["__line__"], n)) + if n == "__line__": continue n = self._stringify(n) - if isinstance(e, dict) and "datatype" in e and e["datatype"].startswith("LIST"): - match = re.match(r"LIST[(](.*)[)]", e["datatype"]) - - if match is None: - raise ValueError("List datatype definition is wrong") - dt = db.LIST(match.group(1)) - self.model[ent_name].add_property(name=n, - importance=importance, - datatype=dt - ) + if (isinstance(e, dict) and "datatype" in e + and (_get_listdatatype(e["datatype"]) is not None)): + self.model[ent_name].add_property( + name=n, + importance=importance, + datatype=db.LIST(_get_listdatatype(e["datatype"]))) else: self.model[ent_name].add_property(name=n, importance=importance) @@ -288,6 +319,7 @@ class Parser(object): def _treat_entity(self, name, definition, line=None): """Parse the definition and the information to the entity.""" + if name == "__line__": return name = self._stringify(name) @@ -316,19 +348,22 @@ class Parser(object): self.model[name].description = prop elif prop_name == "recommended_properties": - self._add_to_recordtype(name, prop, importance=db.RECOMMENDED) + self._add_to_recordtype( + name, prop, importance=db.RECOMMENDED) for n, e in prop.items(): self._treat_entity(n, e) elif prop_name == "obligatory_properties": - self._add_to_recordtype(name, prop, importance=db.OBLIGATORY) + self._add_to_recordtype( + name, prop, importance=db.OBLIGATORY) for n, e in prop.items(): self._treat_entity(n, e) elif prop_name == "suggested_properties": - self._add_to_recordtype(name, prop, importance=db.SUGGESTED) + self._add_to_recordtype( + name, prop, importance=db.SUGGESTED) for n, e in prop.items(): self._treat_entity(n, e) @@ -354,21 +389,50 @@ class Parser(object): raise e self.treated.append(name) - def _check_datatypes(self): + def _check_and_convert_datatypes(self): """ checks if datatype is valid. - datatype of properties is simply initialized with string. Here over - properties is iterated and datatype is corrected. """ + datatype of properties is simply initialized with string. 
Here, we + iterate over properties and check whether it is a base datatype or a + name that was defined in the model (or extern part). + + The string representations are replaced with caosdb objects. + + """ for key, value in self.model.items(): + if isinstance(value, db.Property): - if value.datatype in self.model: - value.datatype = self.model[value.datatype] - else: - # get the datatype - try: - value.datatype = db.__getattribute__(value.datatype) - except AttributeError: - raise ValueError("Unknown Datatype.") + dtype = value.datatype + is_list = False + + if _get_listdatatype(value.datatype) is not None: + dtype = _get_listdatatype(value.datatype) + is_list = True + + if dtype in self.model: + if is_list: + value.datatype = db.LIST(self.model[dtype]) + else: + value.datatype = self.model[dtype] + + continue + + if dtype in [db.DOUBLE, + db.REFERENCE, + db.TEXT, + db.DATETIME, + db.INTEGER, + db.FILE, + db.BOOLEAN]: + + if is_list: + value.datatype = db.LIST(db.__getattribute__(dtype)) + else: + value.datatype = db.__getattribute__(dtype) + + continue + + raise ValueError("Property {} has an unknown datatype: {}".format(value.name, value.datatype)) def _set_recordtypes(self): """ properties are defined in first iteration; set remaining as RTs """ diff --git a/src/caosadvancedtools/structure_mapping.py b/src/caosadvancedtools/structure_mapping.py new file mode 100644 index 0000000000000000000000000000000000000000..50e57ac4d84f2034fbdb6da6c7159f450a993c3a --- /dev/null +++ b/src/caosadvancedtools/structure_mapping.py @@ -0,0 +1,144 @@ +#!/usr/bin/env python3 + +# This file is a part of the CaosDB Project. +# +# Copyright (C) 2021 IndiScale GmbH <www.indiscale.com> +# Copyright (C) 2021 Henrik tom Wörden <h.tomwoerden@indiscale.com> +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <https://www.gnu.org/licenses/>. + +import caosdb as db +from caosdb.apiutils import resolve_reference +from caosdb.common.utils import uuid + +from .cfood import (assure_has_description, assure_has_parent, + assure_property_is) + + +class EntityMapping(object): + """ + Map local entities to entities on the server. + + The dict to_existing maps the _cuid property to entity objects; + the dict to_target maps the id property to entity objects. + """ + + def __init__(self): + self.to_existing = {} + self.to_target = {} + + def add(self, target, existing): + if target._cuid is None: + target._cuid = str(uuid()) + self.to_existing[str(target._cuid)] = existing + self.to_target[existing.id] = target + + +def collect_existing_structure(target_structure, existing_root, em): + """ recursively collects existing entities + + The collected entities are those that correspond to the ones in + target_structure.
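+
+    Correspondence is established by following the reference properties of
+    target_structure and existing_root in parallel; each matched pair is
+    registered in the EntityMapping.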
+ + + em: EntityMapping + """ + + for prop in target_structure.properties: + if prop.value is None: + continue + + if not prop.is_reference(server_retrieval=True): + continue + + if (len([p for p in target_structure.properties if p.name == prop.name]) + != 1): + raise ValueError("Current implementation allows only one property " + "for each property name") + + if (existing_root.get_property(prop.name) is not None and + existing_root.get_property(prop.name).value is not None): + resolve_reference(prop) + + resolve_reference(existing_root.get_property(prop.name)) + referenced = existing_root.get_property(prop.name).value + + if not isinstance(referenced, list): + referenced = [referenced] + target_value = prop.value + + if not isinstance(target_value, list): + target_value = [target_value] + + if len(target_value) != len(referenced): + raise ValueError() + + for tent, eent in zip(target_value, referenced): + em.add(tent, eent) + collect_existing_structure(tent, eent, em) + + +def update_structure(em, updating: db.Container, target_structure: db.Record): + """Compare the existing records with the target record tree created + from the HDF5 object. + + Parameters + ---------- + + em : EntityMapping + The mapping between existing entities and the target structure. + + updating : db.Container + Container to which entities that shall be updated are added. + + target_structure : db.Record + A record which may have references to other records. Must be a DAG. + """ + + if target_structure._cuid in em.to_existing: + update_matched_entity(em, + updating, + target_structure, + em.to_existing[target_structure._cuid]) + + for prop in target_structure.get_properties(): + if prop.is_reference(server_retrieval=True): + update_structure(em, updating, prop.value) + + +def update_matched_entity(em, updating, target_record, existing_record): + """ + update the Record existing in the server according to the Record + supplied as target_record + """ + + for parent in target_record.get_parents(): + if parent.name == "": + raise ValueError("Parent name must not be empty.") + assure_has_parent(existing_record, parent.name, force=True) + + if target_record.description is not None: + # check whether description is equal + assure_has_description(existing_record, target_record.description, + to_be_updated=updating) + + for prop in target_record.get_properties(): + # check for remaining property types + + if isinstance(prop.value, db.Entity): + if prop.value._cuid in em.to_existing: + value = em.to_existing[prop.value._cuid].id + else: + value = prop.value.id + else: + value = prop.value + assure_property_is(existing_record, prop.name, value, + to_be_updated=updating) diff --git a/src/caosadvancedtools/suppressKnown.py b/src/caosadvancedtools/suppressKnown.py index c15f0e06fa7d126937497aeb877dd5d2991b6ff7..c4b57039c5184f2443e4dbb91cf11f5e59ae6790 100644 --- a/src/caosadvancedtools/suppressKnown.py +++ b/src/caosadvancedtools/suppressKnown.py @@ -5,6 +5,8 @@ import os import sqlite3 from hashlib import sha256 +import tempfile + class SuppressKnown(logging.Filter): """ @@ -26,8 +28,9 @@ class SuppressKnown(logging.Filter): if db_file: self.db_file = db_file else: - self.db_file = "/tmp/caosadvanced_suppressed_cache.db" - + tmppath = tempfile.gettempdir() + tmpf = os.path.join(tmppath, "caosadvanced_suppressed_cache.db") + self.db_file = tmpf if not os.path.exists(self.db_file): self.create_cache() diff --git a/src/caosadvancedtools/table_importer.py b/src/caosadvancedtools/table_importer.py index 04c8ea23b19ee0cc055dc58b69f1b3d6fecd1b55..cb61e8389de69a2d0d0527ad01cb8b9991b19ece 100755 --- a/src/caosadvancedtools/table_importer.py
+++ b/src/caosadvancedtools/table_importer.py @@ -88,7 +88,10 @@ def date_converter(val, fmt="%Y-%m-%d"): converts it using format string """ - return datetime_converter(val, fmt=fmt).date() + if val is None: + return None + else: + return datetime_converter(val, fmt=fmt).date() def incomplete_date_converter(val, fmts={"%Y-%m-%d": "%Y-%m-%d", @@ -145,12 +148,44 @@ def win_path_converter(val): return path.as_posix() -class TSVImporter(object): - def __init__(self, converters, obligatory_columns=[], unique_columns=[]): - raise NotImplementedError() +def string_in_list(val, options, ignore_case=True): + """Return the given value if it is contained in options, raise an + error otherwise. + + Parameters + ---------- + val : str + String value to be checked. + options : list<str> + List of possible values that val may take + ignore_case : bool, optional + Specify whether the comparison of val and the possible options + should ignore capitalization. Default is True. + + Returns + ------- + val : str + The original value if it is contained in options. -class XLSImporter(object): + Raises + ------ + ValueError + If val is not contained in options. + """ + if ignore_case: + val = val.lower() + options = [o.lower() for o in options] + + if val not in options: + raise ValueError( + "Field value is '{}', but it should be one of the following " + "values: {}.".format(val, ", ".join( + ["'{}'".format(o) for o in options]))) + + return val + + +class TableImporter(object): def __init__(self, converters, obligatory_columns=None, unique_keys=None): """ converters: dict with column names as keys and converter functions as @@ -168,50 +203,14 @@ class XLSImporter(object): """ self.sup = SuppressKnown() self.required_columns = list(converters.keys()) - self.obligatory_columns = [] if obligatory_columns is None else obligatory_columns + self.obligatory_columns = ([] + if obligatory_columns is None + else obligatory_columns) self.unique_keys = [] if unique_keys is None else unique_keys self.converters = converters - def read_xls(self, filename, **kwargs): - """ - converts an xls file into a Pandas DataFrame. - - The converters of the XLSImporter object are used. - - Raises: DataInconsistencyError - """ - try: - xls_file = pd.io.excel.ExcelFile(filename) - except XLRDError as e: - logger.warning( - "Cannot read \n{}.\nError:{}".format(filename, - str(e)), - extra={'identifier': str(filename), - 'category': "inconsistency"}) - raise DataInconsistencyError(*e.args) - - if len(xls_file.sheet_names) > 1: - # Multiple sheets is the default now. Only show in debug - logger.debug( - "Excel file {} contains multiple sheets. 
" - "All but the first are being ignored.".format(filename)) - - try: - df = xls_file.parse(converters=self.converters, **kwargs) - except Exception as e: - logger.warning( - "Cannot parse {}.".format(filename), - extra={'identifier': str(filename), - 'category': "inconsistency"}) - raise DataInconsistencyError(*e.args) - - self.check_columns(df, filename=filename) - df = self.check_missing(df, filename=filename) - - if len(self.unique_keys) > 0: - df = self.check_unique(df, filename=filename) - - return df + def read_file(self, filename, **kwargs): + raise NotImplementedError() def check_columns(self, df, filename=None): """ @@ -306,3 +305,70 @@ class XLSImporter(object): okay = False return df + + def check_dataframe(self, df, filename): + self.check_columns(df, filename=filename) + df = self.check_missing(df, filename=filename) + + if len(self.unique_keys) > 0: + df = self.check_unique(df, filename=filename) + + +class XLSImporter(TableImporter): + def read_file(self, filename, **kwargs): + return self.read_xls(filename=filename, **kwargs) + + def read_xls(self, filename, **kwargs): + """ + converts an xls file into a Pandas DataFrame. + + The converters of the XLSImporter object are used. + + Raises: DataInconsistencyError + """ + try: + xls_file = pd.io.excel.ExcelFile(filename) + except (XLRDError, ValueError) as e: + logger.warning( + "Cannot read \n{}.\nError:{}".format(filename, + str(e)), + extra={'identifier': str(filename), + 'category': "inconsistency"}) + raise DataInconsistencyError(*e.args) + + if len(xls_file.sheet_names) > 1: + # Multiple sheets is the default now. Only show in debug + logger.debug( + "Excel file {} contains multiple sheets. " + "All but the first are being ignored.".format(filename)) + + try: + df = xls_file.parse(converters=self.converters, **kwargs) + except Exception as e: + logger.warning( + "Cannot parse {}.".format(filename), + extra={'identifier': str(filename), + 'category': "inconsistency"}) + raise DataInconsistencyError(*e.args) + + self.check_dataframe(df, filename) + + return df + + +class CSVImporter(TableImporter): + def read_file(self, filename, sep=",", **kwargs): + df = pd.read_csv(filename, sep=sep, converters=self.converters, + **kwargs) + self.check_dataframe(df, filename) + + return df + + +class TSVImporter(TableImporter): + def read_file(self, filename, **kwargs): + df = pd.read_csv(filename, sep="\t", converters=self.converters, + **kwargs) + self.check_dataframe(df, filename) + + return df diff --git a/src/doc/README_SETUP.md b/src/doc/README_SETUP.md new file mode 120000 index 0000000000000000000000000000000000000000..88332e357f5e06f3de522768ccdcd9e513c15f62 --- /dev/null +++ b/src/doc/README_SETUP.md @@ -0,0 +1 @@ +../../README_SETUP.md \ No newline at end of file diff --git a/src/doc/crawler.rst b/src/doc/crawler.rst index 0a710fa1e3ed2c8115f7209be30de758c0c23ec3..c52bbf2fe9b9f5fd77805e45ec85d195f5aa95f3 100644 --- a/src/doc/crawler.rst +++ b/src/doc/crawler.rst @@ -71,7 +71,7 @@ indicated in the messages). Invocation as Python Script --------------------------- -The crawler can be executed directly via a python script (usually called +The crawler can be executed directly via a Python script (usually called ``crawl.py``). The script prints the progress and reports potential problems. The exact behavior depends on your setup. However, you can have a look at the example in the @@ -84,7 +84,7 @@ have a look at the example in the Call ``python3 crawl.py --help`` to see what parameters can be provided. 
 
 Typically, an invocation looks like:
 
-.. code:: python
+.. code:: sh
 
    python3 crawl.py /someplace/
 
@@ -392,7 +392,7 @@
 shows how a set of CFoods can be defined to deal with a complex file
 structure. You can find detailed information on how files need to be
 structured `here
 <https://gitlab.com/salexan/check-sfs/-/blob/f-software/filesystem_structure.md>`__
 and the source
-code of the CFoods `here <https://gitlab.com/henrik_indiscale/scifolder>`__.
+code of the CFoods `here <https://gitlab.com/caosdb/caosdb-advanced-user-tools>`__.
 
 Sources
 =======
diff --git a/src/doc/index.rst b/src/doc/index.rst
index ee266598cd6cfbcfaa6f54b8e39aa32e4c2b6915..9aa045349ab05d3f5130a7f33b38c7eca0c4f32e 100644
--- a/src/doc/index.rst
+++ b/src/doc/index.rst
@@ -15,6 +15,7 @@ This documentation helps you to :doc:`get started<getting_started>`, explains th
    Concepts <concepts>
    tutorials
    Caosdb-Crawler <crawler>
+   YAML Interface <yaml_interface>
    _apidoc/modules
 
diff --git a/src/doc/yaml_interface.rst b/src/doc/yaml_interface.rst
new file mode 100644
index 0000000000000000000000000000000000000000..06248f2b5c17f40b6f15f5f55664c5a4a5530a86
--- /dev/null
+++ b/src/doc/yaml_interface.rst
@@ -0,0 +1,117 @@
+YAML Interface
+--------------
+
+The YAML interface is a module in caosdb-advanced-user-tools that can be used to create and update
+CaosDB models using a simplified definition in YAML format.
+
+Let's start with an example taken from https://gitlab.indiscale.com/caosdb/src/caosdb-advanced-user-tools/-/blob/dev/unittests/model.yml.
+
+.. code-block:: yaml
+
+  Project:
+    obligatory_properties:
+      projectId:
+        datatype: INTEGER
+        description: 'UID of this project'
+  Person:
+    recommended_properties:
+      firstName:
+        datatype: TEXT
+        description: 'first name'
+      lastName:
+        datatype: TEXT
+        description: 'last name'
+  LabbookEntry:
+    recommended_properties:
+      Project:
+      entryId:
+        datatype: INTEGER
+        description: 'UID of this entry'
+      responsible:
+        datatype: Person
+        description: 'the person responsible for these notes'
+      textElement:
+        datatype: TEXT
+        description: 'a text element of a labbook recording'
+      associatedFile:
+        datatype: FILE
+        description: 'A file associated with this recording'
+      table:
+        datatype: FILE
+        description: 'A table document associated with this recording'
+
+
+This example defines three ``RecordType``s:
+
+- A ``Project`` with one obligatory property ``projectId``
+- A ``Person`` with a ``firstName`` and a ``lastName`` (as recommended properties)
+- A ``LabbookEntry`` with multiple recommended properties of different data types
+
+One major advantage of using this interface (in contrast to the standard Python interface) is that properties can be defined and added to record types "on-the-fly". E.g. the three lines for ``firstName`` as sub-entries of ``Person`` have two effects on CaosDB:
+
+- A new property with name ``firstName``, datatype ``TEXT`` and description ``first name`` is inserted (or updated, if already present) into CaosDB.
+- The new property is added as a recommended property to record type ``Person``.
+
+Any further occurrences of ``firstName`` in the YAML file will reuse the definition provided for ``Person``.
+
+Note the difference between these three kinds of property declarations:
+
+- ``Project``: This record type is added directly as a property of ``LabbookEntry``. Therefore it does not specify any further attributes. Compare to the original declaration of record type ``Project``.
+- ``responsible``: This defines and adds a property with name "responsible" to ``LabbookEntry``, which has the datatype ``Person``. ``Person`` is defined above.
+- ``firstName``: This defines and adds a property with the standard data type ``TEXT`` to record type ``Person``.
+
+Datatypes
+---------
+
+You can use any data type understood by CaosDB as the ``datatype`` attribute in the YAML model.
+
+List attributes are a bit special:
+
+.. code-block:: yaml
+
+  datatype: LIST<DOUBLE>
+
+would declare a list datatype of DOUBLE elements.
+
+.. code-block:: yaml
+
+  datatype: LIST<Project>
+
+would declare a list of elements with datatype Project.
+
+
+Keywords
+--------
+
+- **parent**: Parent of this entity.
+- **importance**: Importance of this entity. Possible values: "recommended", "obligatory", "suggested".
+- **datatype**: The datatype of this property, e.g. TEXT, INTEGER or Project.
+- **unit**: The unit of the property, e.g. "m/s".
+- **description**: A description for this entity.
+- **recommended_properties**: Add properties to this entity with importance "recommended".
+- **obligatory_properties**: Add properties to this entity with importance "obligatory".
+- **suggested_properties**: Add properties to this entity with importance "suggested".
+- **inherit_from_recommended**: Inherit from another entity. This adds the entity as a parent and adds all its obligatory and recommended properties.
+- **inherit_from_suggested**: Inherit from another entity. This adds the entity as a parent and adds all its obligatory, recommended and suggested properties.
+- **inherit_from_obligatory**: Inherit from another entity. This adds the entity as a parent and adds only its obligatory properties.
+
+Usage
+-----
+
+You can use the YAML parser directly in Python as follows:
+
+
+.. code-block:: python
+
+  from caosadvancedtools.models import parser
+  model = parser.parse_model_from_yaml("model.yml")
+
+
+This creates a DataModel object containing all entities defined in the YAML file.
+
+You can then use the functions from ``caosadvancedtools.models.data_model.DataModel`` to synchronize
+the model with a CaosDB instance, e.g.:
+
+.. code-block:: python
+
+  model.sync_data_model()
diff --git a/tox.ini b/tox.ini
index 3d7f652203ed0caf9cdfaebbb159784e6f9b2835..1b3cd4ef0d39955197448ace9fdf5d26ea6749b4 100644
--- a/tox.ini
+++ b/tox.ini
@@ -4,9 +4,10 @@ skip_missing_interpreters = true
 [testenv]
 deps=nose
     pandas
-    caosdb
+    git+https://gitlab.indiscale.com/caosdb/src/caosdb-pylib.git@dev
    pytest
    pytest-cov
    openpyxl
    xlrd == 1.2
+    h5py
 commands=py.test --cov=caosadvancedtools -vv {posargs}
diff --git a/unittests/create_dummy_hdf5file.py b/unittests/create_dummy_hdf5file.py
new file mode 100644
index 0000000000000000000000000000000000000000..ce04030154c70e1d533f67aeec12321b86ddf305
--- /dev/null
+++ b/unittests/create_dummy_hdf5file.py
@@ -0,0 +1,70 @@
+import h5py
+import numpy as np
+
+
+def create_hdf5_file(filename="hdf5_dummy_file.hdf5"):
+    '''
+    Create a dummy HDF5 file for testing.
+ Structure: + + root:-->root + group_level1_a:-->group + group_level2_aa:-->group + group_level3_aaa:-->group + level3_aaa_floats_2d = float64(100x100) + group_level3_aab:-->group + group_level2_ab:-->group + group_level3_aba:-->group + level3_aba_floats_2d = float64(100x100) + group_level2_ac:-->group + level2_ac_integers_2d = int32(100x100) + group_level1_b:-->group + group_level2_ba:-->group + level2_ba_integers_2d = int32(100x100) + level1_b_floats = float64(10000) + group_level1_c:-->group + level1_c_floats = float64(10000) + root_integers = int32(10000) + ''' + + with h5py.File(filename, mode="w") as hdf5: + '''Create toplevel groups''' + group_lvl1_a = hdf5.create_group("group_level1_a") + group_lvl1_b = hdf5.create_group("group_level1_b") + group_lvl1_c = hdf5.create_group("group_level1_c") + + '''Create level 2 groups''' + group_lvl2_aa = group_lvl1_a.create_group("group_level2_aa") + group_lvl2_ab = group_lvl1_a.create_group("group_level2_ab") + group_lvl2_ac = group_lvl1_a.create_group("group_level2_ac") + group_lvl2_ba = group_lvl1_b.create_group("group_level2_ba") + + '''Create level 3 groups''' + group_lvl3_aaa = group_lvl2_aa.create_group("group_level3_aaa") + group_lvl3_aab = group_lvl2_aa.create_group("group_level3_aab") + group_lvl3_aba = group_lvl2_ab.create_group("group_level3_aba") + + '''Create datasets''' + integers = np.arange(10000) + floats = np.arange(0, 1000, 0.1) + integers_2d = np.diag(np.arange(100)) + floats_2d = np.eye(100) + data_root = hdf5.create_dataset("root_integers", data=integers) + data_lvl1_b = group_lvl1_b.create_dataset("level1_b_floats", data=floats) + data_lvl2_c = group_lvl1_c.create_dataset("level1_c_floats", data=floats) + data_lvl2_ac = group_lvl2_ac.create_dataset("level2_ac_integers_2d", data=integers_2d) + data_lvl2_ba = group_lvl2_ba.create_dataset("level2_ba_integers_2d", data=integers_2d) + data_lvl3_aaa = group_lvl3_aaa.create_dataset("level3_aaa_floats_2d", data=floats_2d) + data_lvl3_aba = group_lvl3_aba.create_dataset("level3_aba_floats_2d", data=floats_2d) + + '''Create attributes''' + attr_group_lvl1_a = group_lvl1_a.attrs.create("attr_group_lvl1_a", 1) + attr_group_lvl2_aa = group_lvl2_aa.attrs.create("attr_group_lvl2_aa", -2) + attr_group_lvl3_aaa = group_lvl3_aaa.attrs.create("attr_group_lvl3_aaa", 1.0) + attr_data_root = data_root.attrs.create("attr_data_root", -2.0) + attr_data_lvl2_ac = data_lvl2_ac.attrs.create("attr_data_lvl2_ac", np.diag(np.arange(10))) + attr_data_lvl3_aaa = data_lvl3_aaa.attrs.create("attr_data_lvl3_aaa", np.eye(10)) + + +if __name__ == "__main__": + create_hdf5_file() diff --git a/unittests/hdf5_dummy_file.hdf5 b/unittests/hdf5_dummy_file.hdf5 new file mode 100644 index 0000000000000000000000000000000000000000..41bfb7ab3bcac19d90fd4f018cdd8118ae806eaf Binary files /dev/null and b/unittests/hdf5_dummy_file.hdf5 differ diff --git a/unittests/test_cfood.py b/unittests/test_cfood.py index 1bad508a2c22cf1ee1e29be11c3342d2115dd5a2..f5125166106c4bace21121d58a025886f9b132b9 100644 --- a/unittests/test_cfood.py +++ b/unittests/test_cfood.py @@ -112,6 +112,36 @@ class CFoodReTest(unittest.TestCase): self.assertTrue(SimpleCFood.match_item("hallo")) self.assertFalse(SimpleCFood.match_item("allo")) + def test_extensions(self): + """Test the RE generation.""" + empty_extensions = [] + extensions = ["foo", "bar"] + + self.assertIsNone(AbstractFileCFood.re_from_extensions(empty_extensions)) + self.assertIsNotNone(SimpleCFood.re_from_extensions(extensions)) + + class ExtCFood(AbstractFileCFood): + + @staticmethod + 
def get_re(): + return AbstractFileCFood.re_from_extensions(extensions) + create_identifiables = None + update_identifiables = None + + # test which paths are matched + print(ExtCFood.re_from_extensions(extensions)) + self.assertTrue(ExtCFood.match_item("hello/world.foo")) + self.assertTrue(ExtCFood.match_item("hello/world.bar")) + self.assertFalse(ExtCFood.match_item("hello/world.baz")) + self.assertFalse(ExtCFood.match_item("hello/world.foo ")) # Mind the space. + self.assertFalse(ExtCFood.match_item("hello/world.foobar")) + self.assertFalse(ExtCFood.match_item("hello/world.foo|bar")) + self.assertFalse(ExtCFood.match_item("hello/world.fobar")) + self.assertFalse(ExtCFood.match_item("hello/world.fooar")) + + # Test stored extension + self.assertEqual(ExtCFood("hello/world.foo").match["ext"], "foo") + class InsertionTest(unittest.TestCase): def test_contained_in_list(self): @@ -160,6 +190,35 @@ class InsertionTest(unittest.TestCase): value=new_int, to_be_updated=to_be_updated) assert to_be_updated[0] is entity + """Test properties with lists""" + rec1 = db.Record(id=12345) + rec1.add_property("Exp", value=[98765], datatype=db.LIST("Exp")) + rec2 = db.Record(id=98765) + update = [] + # compare Entity with id + assure_has_property(rec1, "Exp", [rec2], to_be_updated=update) + assert len(update) == 0 + update = [] + # compare id with id + assure_has_property(rec1, "Exp", [98765], to_be_updated=update) + assert len(update) == 0 + update = [] + # compare id with different list of ids + assure_has_property(rec1, "Exp2", [98765, 444, 555], + to_be_updated=update) + assert len(update) == 1 + + rec = db.Record(id=666666) + rec3 = db.Record(id=777777) + rec.add_property("Exp", value=[888888, rec3], datatype=db.LIST("Exp")) + rec2 = db.Record(id=888888) + update = [] + # compare id and Entity with id and Entity + # i.e. check that conversion from Entity to id works in both + # directions. 
+ assure_has_property(rec, "Exp", [rec2, 777777], to_be_updated=update) + assert len(update) == 0 + def test_property_is(self): """Test properties with string, int, float, and Boolean values""" entity = db.Record() diff --git a/unittests/test_data_model.py b/unittests/test_data_model.py index 074239399002833e8500af6369f1b2c7bcc8a3ac..159adfca1d589bb092b6f59110828b5868401e25 100644 --- a/unittests/test_data_model.py +++ b/unittests/test_data_model.py @@ -1,19 +1,11 @@ import unittest import caosdb as db -import pytest from caosadvancedtools.models.data_model import DataModel class DataModelTest(unittest.TestCase): - def tearDown(self): - try: - tests = db.execute_query("FIND test*") - tests.delete() - except Exception: - pass - def test_collecting(self): maintained = {"one": db.RecordType(name="TestRecord").add_property( name="testproperty"), @@ -24,17 +16,6 @@ class DataModelTest(unittest.TestCase): assert "TestRecord" in names assert "testproperty" in names - # TODO this seems to require integration test - @pytest.mark.xfail - def test_get_existing_entities(self): - db.RecordType(name="TestRecord").insert() - c = db.Container().extend([ - db.Property(name="testproperty"), - db.RecordType(name="TestRecord")]) - exist = DataModel.get_existing_entities(c) - assert len(exist) == 1 - assert exist[0].name == "TestRecord" - def test_sync_ids_by_name(self): container = db.Container().extend([db.RecordType(name="TestRecord"), db.RecordType(name="TestRecord2"), diff --git a/unittests/test_h5.py b/unittests/test_h5.py new file mode 100644 index 0000000000000000000000000000000000000000..e5ae94686fe4542f6833e21e9a80f01e4257538d --- /dev/null +++ b/unittests/test_h5.py @@ -0,0 +1,187 @@ +import unittest +from tempfile import NamedTemporaryFile + +import caosdb as db +import caosdb.apiutils +import h5py +import numpy as np +from caosadvancedtools.cfoods import h5 +from caosadvancedtools.cfoods.h5 import h5_attr_to_property + +from create_dummy_hdf5file import create_hdf5_file + +ENTS = { + 101: db.Record(id=101), + 102: db.Record(id=102), + 103: db.Record(id=103).add_property("test", value=101, + datatype=db.REFERENCE), +} + + +def dummy_get(eid): + return ENTS[eid] + + +class H5CFoodTest(unittest.TestCase): + def setUp(self): + self.h5file = NamedTemporaryFile(delete=False, suffix=".h5") + self.h5file.close() + create_hdf5_file(self.h5file.name) + self.h5obj = h5py.File(self.h5file.name, mode="a") + + def test_create_record_records(self): + result = h5.H5CFood.create_structure(self.h5obj) + + record_list = [] + parents = ['group_level1_a', 'group_level1_b', 'group_level1_c', 'root_integers'] + + for i in parents: + record_list.append(db.Record().add_parent(name=i)) + + found_parents = [] + + for ent in [p.value for p in result.properties]: + if ent.parents[0].name == 'group_level1_a': + found_parents.append('group_level1_a') + self.assertTrue(ent.get_property("group_level2_aa") is not None) + self.assertTrue(ent.get_property("group_level1_a") is None) + elif ent.parents[0].name == 'group_level1_b': + found_parents.append('group_level1_b') + pass + elif ent.parents[0].name == 'group_level1_c': + found_parents.append('group_level1_c') + pass + elif ent.parents[0].name == 'root_integers': + found_parents.append('root_integers') + pass + + for p in parents: + self.assertTrue(p in found_parents) + + for i in range(len(result.properties)): + for j in result.properties[i].value.get_parents(): + for k in record_list[i].get_parents(): + self.assertEqual(j.name, k.name) + + result1 = 
h5.H5CFood.create_structure(self.h5obj["group_level1_a"]) + + for i in result1.get_parents(): + self.assertEqual(i.name, "group_level1_a") + + result2 = h5.H5CFood.create_structure(self.h5obj["group_level1_a/group_level2_aa"]) + + for i in result2.get_parents(): + self.assertEqual(i.name, "group_level2_aa") + + def test_collect_existing_structure(self): + real_retrieve = caosdb.apiutils.retrieve_entity_with_id + caosdb.apiutils.retrieve_entity_with_id = dummy_get + + # should run without problem + h5.collect_existing_structure(db.Record(), db.Record(id=234), h5.EntityMapping()) + + # test with retrieval: both Records have one test Property with one + # value -> The referenced Entities are matched + r_exist = db.Record(id=234) + r_exist.add_property("test", value=101, datatype=db.REFERENCE) + r_target = db.Record() + r_child = db.Record() + r_target.add_property("test", value=r_child, datatype=db.REFERENCE) + em = h5.EntityMapping() + h5.collect_existing_structure(r_target, r_exist, em) + self.assertTrue(em.to_existing[r_child._cuid] is ENTS[101]) + self.assertTrue(em.to_target[101] is r_child) + + # test with retrieval: the existing Record has another Property + # -> The referenced Entities are matched + r_exist = db.Record(id=234) + r_exist.add_property("test_other", value=101, datatype=db.REFERENCE) + r_target = db.Record() + r_child = db.Record() + r_target.add_property("test", value=r_child, datatype=db.REFERENCE) + em = h5.EntityMapping() + h5.collect_existing_structure(r_target, r_exist, em) + self.assertEqual(em.to_existing, {}) + self.assertEqual(em.to_target, {}) + + # test with retrieval: both Records have one test Property; the + # existing is missing the value -> The referenced Entities are matched + r_exist = db.Record(id=234) + r_exist.add_property("test", value=None, datatype=db.REFERENCE) + r_target = db.Record() + r_child = db.Record() + r_target.add_property("test", value=r_child, datatype=db.REFERENCE) + em = h5.EntityMapping() + h5.collect_existing_structure(r_target, r_exist, em) + self.assertEqual(em.to_existing, {}) + self.assertEqual(em.to_target, {}) + + # test with retrieval: both Records have one test Property with + # multiple values -> The referenced Entities are matched + r_exist = db.Record(id=234) + r_exist.add_property("test", value=[101, 102], datatype=db.LIST(db.REFERENCE)) + r_target = db.Record() + r_child = db.Record() + r_child2 = db.Record() + r_target.add_property("test", value=[r_child, r_child2], + datatype=db.LIST(db.REFERENCE)) + em = h5.EntityMapping() + h5.collect_existing_structure(r_target, r_exist, em) + self.assertEqual(em.to_existing[r_child._cuid], ENTS[101]) + self.assertEqual(em.to_existing[r_child2._cuid], ENTS[102]) + self.assertEqual(em.to_target[101], r_child) + self.assertEqual(em.to_target[102], r_child2) + + # test with retrieval: both Records have one test Property with one + # value; Add another recursion level -> The referenced Entities are matched + r_exist = db.Record(id=234) + r_exist.add_property("test", value=103, datatype=db.REFERENCE) + r_target = db.Record() + r_child = db.Record() + r_child2 = db.Record() + r_target.add_property("test", value=r_child, datatype=db.REFERENCE) + r_child.add_property("test", value=r_child2, datatype=db.REFERENCE) + em = h5.EntityMapping() + h5.collect_existing_structure(r_target, r_exist, em) + self.assertEqual(em.to_existing[r_child._cuid], ENTS[103]) + self.assertEqual(em.to_target[103], r_child) + self.assertEqual(em.to_existing[r_child2._cuid], ENTS[101]) + 
self.assertEqual(em.to_target[101], r_child2) + + caosdb.apiutils.retrieve_entity_with_id = real_retrieve + + def test_h5_attr_to_property(self): + + test_int: int = 1 + test_integer = np.int_(1) + test_float = np.float_(1.0) + test_str = "Test" + test_complex: complex = 2+3j + self.assertRaises(NotImplementedError, h5_attr_to_property, test_int) # only numpy-integers processed? + self.assertTupleEqual((1, db.INTEGER), h5_attr_to_property(test_integer)) + self.assertTupleEqual((1.0, db.DOUBLE), h5_attr_to_property(test_float)) + self.assertTupleEqual(("Test", db.TEXT), h5_attr_to_property(test_str)) + self.assertTupleEqual((2+3j, db.TEXT), h5_attr_to_property(test_complex)) + # strings are often represented using a binary format + self.assertTupleEqual(("yeti", db.TEXT), h5_attr_to_property( + np.array(["yeti"], dtype=h5py.string_dtype(r'utf-8', 8))[0])) + + test_integer_1d = np.arange(10) + test_float_1d = np.arange(0, 1, 0.1) + test_str_1d = np.array(["a", "b", "c"]) + self.assertTrue((np.arange(10) == h5_attr_to_property(test_integer_1d)[0]).all()) + self.assertTrue(db.LIST(db.INTEGER) == h5_attr_to_property(test_integer_1d)[1]) + self.assertTrue((np.arange(0, 1, 0.1) == h5_attr_to_property(test_float_1d)[0]).all()) + self.assertTrue(db.LIST(db.DOUBLE) == h5_attr_to_property(test_float_1d)[1]) + self.assertTrue((np.array(["a", "b", "c"]) == h5_attr_to_property(test_str_1d)[0]).all()) + self.assertTrue(db.LIST(db.TEXT) == h5_attr_to_property(test_str_1d)[1]) + + test_integers_2d = np.diag(np.arange(100)) + test_floats_2d = np.eye(100) + self.assertTupleEqual((None, None), h5_attr_to_property(test_integers_2d)) + self.assertTupleEqual((None, None), h5_attr_to_property(test_floats_2d)) + + # Test scalar values given as np.array + self.assertTupleEqual((1, db.INTEGER), h5_attr_to_property(np.array(1))) + self.assertTupleEqual((1.123, db.DOUBLE), h5_attr_to_property(np.array(1.123))) + self.assertTupleEqual(('Hello World', db.TEXT), h5_attr_to_property(np.array("Hello World"))) diff --git a/unittests/test_parser.py b/unittests/test_parser.py index 852577a471ba15e3afc163bd8e1e6fd97abd0c0a..161e2873a9c01f9ce415818116b9e4cf9aeadb5c 100644 --- a/unittests/test_parser.py +++ b/unittests/test_parser.py @@ -168,7 +168,6 @@ RT1: RT5: """ model = parse_model_from_yaml(to_file(string)) - print(model["RT1"]) assert has_property(model["RT1"], "RT2") assert model["RT1"].get_importance("RT2") == db.RECOMMENDED assert has_property(model["RT1"], "RT3") @@ -190,7 +189,7 @@ p1: p2: datatype: TXT """ - self.assertRaises(ValueError, lambda: parse_model_from_yaml(to_file(string))) + self.assertRaises(ValueError, parse_model_from_yaml, to_file(string)) class ListTest(unittest.TestCase): @@ -200,10 +199,19 @@ RT1: recommended_properties: a: datatype: LIST(RT2) + b: + datatype: LIST(TEXT) + c: + datatype: LIST<TEXT> RT2: """ model = parse_model_from_yaml(to_file(string)) + self.assertTrue(isinstance(model['b'], db.Property)) + self.assertEqual(model['b'].datatype, db.LIST(db.TEXT)) + self.assertTrue(isinstance(model['c'], db.Property)) + self.assertEqual(model['c'].datatype, db.LIST(db.TEXT)) + # This failed for an older version of caosdb-models string_list = """ A: @@ -216,16 +224,8 @@ B: datatype: INTEGER """ model = parse_model_from_yaml(to_file(string_list)) - - def test_dmgd_list(self): - string = """ -RT1: - recommended_properties: - a: - datatype: LIST(T2 -RT2: -""" - self.assertRaises(ValueError, lambda: parse_model_from_yaml(to_file(string))) + self.assertTrue(isinstance(model['A'], db.RecordType)) + 
self.assertEqual(model['A'].properties[0].datatype, db.LIST("B")) class ParserTest(unittest.TestCase): @@ -274,6 +274,22 @@ A: parse_model_from_string(yaml) self.assertIn("line 3", yde.exception.args[0]) + def test_reference_property(self): + """Test correct creation of reference property using an RT.""" + modeldef = """A: + recommended_properties: + ref: + datatype: LIST<A> +""" + model = parse_model_from_string(modeldef) + self.assertEqual(len(model), 2) + for key in model.keys(): + if key == "A": + self.assertTrue(isinstance(model[key], db.RecordType)) + elif key == "ref": + self.assertTrue(isinstance(model[key], db.Property)) + self.assertEqual(model[key].datatype, "LIST<A>") + class ExternTest(unittest.TestCase): """TODO Testing the "extern" keyword in the YAML.""" diff --git a/unittests/test_structure_mapping.py b/unittests/test_structure_mapping.py new file mode 100644 index 0000000000000000000000000000000000000000..5cc4114fc7f92c580f53dd8855bda659082e2b46 --- /dev/null +++ b/unittests/test_structure_mapping.py @@ -0,0 +1,135 @@ +#!/usr/bin/env python3 + +# This file is a part of the CaosDB Project. +# +# Copyright (C) 2021 IndiScale GmbH <www.indiscale.com> +# Copyright (C) 2021 Henrik tom Wörden <h.tomwoerden@indiscale.com> +# Copyright (C) 2021 Alexander Kreft <akreft@trineo.org> +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <https://www.gnu.org/licenses/>. 
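+"""Unit tests for caosadvancedtools.structure_mapping, i.e. for the
+EntityMapping helper and collect_existing_structure()."""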
+
+import unittest
+
+import caosdb as db
+from caosadvancedtools.structure_mapping import (EntityMapping,
+                                                 collect_existing_structure)
+
+
+class StructureMappingTest(unittest.TestCase):
+    def test_Entitymapping(self):
+        ex = db.Record(id=100)  # existing Record
+        tar = db.Record()  # target Record
+        em = EntityMapping()
+        em.add(tar, ex)
+
+        for key, val in em.to_existing.items():
+            self.assertEqual(key, tar._cuid)
+            self.assertEqual(val, ex)
+
+        for key, val in em.to_target.items():
+            self.assertEqual(key, ex.id)
+            self.assertEqual(val, tar)
+
+    def test_collect_existing_structure(self):
+        emap = EntityMapping()
+        reca1 = db.Record(name="Animals", id=100)
+        reca2 = db.Record(name="Dogs", id=200)
+        reca3 = db.Record(name="Husky", id=300)
+        reca1.add_property(id=101, name="Cute Animals", datatype=db.REFERENCE, value=reca2)
+        reca2.add_property(id=201, name="Cute Dogs", datatype=db.REFERENCE, value=reca3)
+
+        recb1 = db.Record(name="Animals")
+        recb2 = db.Record(name="Dogs")
+        recb3 = db.Record(name="Husky")
+        recb1.add_property(name="Cute Animals", datatype=db.REFERENCE, value=recb2)
+        recb2.add_property(name="Cute Dogs", datatype=db.REFERENCE, value=recb3)
+
+        collect_existing_structure(recb1, reca1, emap)
+
+        # Test if the two dicts of the entity mapping correctly depend on each other
+
+        for i in emap.to_existing.keys():
+            self.assertEqual(i, emap.to_target[emap.to_existing[i].id]._cuid)
+
+        for j in emap.to_target.keys():
+            self.assertEqual(j, emap.to_existing[emap.to_target[j]._cuid].id)
+
+        # Test that only the referenced Records end up in the dicts
+        self.assertTrue((reca2 in emap.to_existing.values()) and
+                        (reca3 in emap.to_existing.values()) and
+                        (reca1 not in emap.to_existing.values()))
+        self.assertTrue((recb2 in emap.to_target.values()) and
+                        (recb3 in emap.to_target.values()) and
+                        (recb1 not in emap.to_target.values()))
+
+        # Test the correct assignment of the properties
+        self.assertTrue(reca2 is emap.to_existing[recb2._cuid])
+        self.assertTrue(reca3 is emap.to_existing[recb3._cuid])
+
+        self.assertTrue(recb2 is emap.to_target[reca2.id])
+        self.assertTrue(recb3 is emap.to_target[reca3.id])
+
+        # Test with one additional Property, and with Properties that are not Records
+        emap2 = EntityMapping()
+        recc1 = db.Record(name="Transportation", id=100)
+        recc2 = db.Record(name="Cars", id=200)
+        recc3 = db.Record(name="Volvo", id=300)
+        recc1.add_property(id=101, name="Type", datatype=db.REFERENCE, value=recc2)
+        recc2.add_property(id=201, name="Brand", datatype=db.REFERENCE, value=recc3)
+        # other datatypes
+        recc3.add_property(id=301, name="max_speed", value=200.2, datatype=db.DOUBLE)
+        recc3.add_property(id=302, name="doors", value=3, datatype=db.INTEGER)
+
+        recd1 = db.Record(name="Transportation")
+        recd2 = db.Record(name="Cars")
+        recd3 = db.Record(name="Volvo")
+        recd4 = db.Record(name="VW")
+        recd1.add_property(name="Type", datatype=db.REFERENCE, value=recd2)
+        recd2.add_property(name="Brand", datatype=db.REFERENCE, value=recd3)
+        # additional Property
+        recd2.add_property(name="Another Brand", datatype=db.REFERENCE, value=recd4)
+        # other datatypes
+        recd3.add_property(name="max_speed", value=200.2, datatype=db.DOUBLE)
+        recd3.add_property(name="doors", value=3, datatype=db.INTEGER)
+        recd4.add_property(name="max_speed", value=210.4, datatype=db.DOUBLE)
+        recd4.add_property(name="doors", value=5, datatype=db.INTEGER)
+        recd4.add_property(name="Warp engine", value=None)
+
+        collect_existing_structure(recd1, recc1, emap2)
+
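+        # recd4 ("VW") has no counterpart in the existing structure (recc2
+        # has no "Another Brand" Property), so it must not be mapped.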
+        # Test the correct assignment of the properties
+        self.assertTrue(recc2 is emap2.to_existing[recd2._cuid])
+        self.assertTrue(recc3 is emap2.to_existing[recd3._cuid])
+
+        self.assertTrue(recd2 is emap2.to_target[recc2.id])
+        self.assertTrue(recd3 is emap2.to_target[recc3.id])
+
+        # Test that the Record "Cars" in the target structure has one
+        # additional Property and that the numbers of properties stay
+        # unchanged on both sides.
+        # existing structure
+        self.assertEqual(len(recc2.get_properties()), 1)
+        self.assertEqual(len(recd2.get_properties()), 2)
+
+        for prop_record, prop_em in zip(recc2.get_properties(), recd2.get_properties()):
+            self.assertTrue(prop_record.value is emap2.to_existing[prop_em.value._cuid])
+
+        # target structure
+        self.assertEqual(len(recc3.get_properties()), 2)
+        self.assertEqual(len(recd3.get_properties()), 2)
+
+        # Test that only Records (and not the values of non-reference
+        # Properties) show up in the entity map
+        for rec_existing, rec_target in zip(emap2.to_existing.values(), emap2.to_target.values()):
+            self.assertTrue(isinstance(rec_existing, db.Record))
+            self.assertTrue(isinstance(rec_target, db.Record))
diff --git a/unittests/test_table_importer.py b/unittests/test_table_importer.py
index 51b4803d4db00f1b04fdfc4b78792e6a9de61bb8..b574c867881141928ac59c2b002fb7f185dac7bb 100644
--- a/unittests/test_table_importer.py
+++ b/unittests/test_table_importer.py
@@ -30,9 +30,13 @@
 from caosadvancedtools.datainconsistency import DataInconsistencyError
 from caosadvancedtools.table_importer import (XLSImporter, assure_name_format,
                                               date_converter,
                                               datetime_converter,
+                                              TableImporter,
+                                              TSVImporter,
+                                              CSVImporter,
                                               incomplete_date_converter,
                                               win_path_converter,
                                               win_path_list_converter,
+                                              string_in_list,
                                               yes_no_converter)
 
@@ -49,6 +53,16 @@
         self.assertRaises(ValueError, yes_no_converter, "True")
         self.assertRaises(ValueError, yes_no_converter, "true")
 
+    def test_string_in_list(self):
+        self.assertEqual("false", string_in_list("false",
+                                                 ["FALSE", "TRUE"]))
+        self.assertEqual("FALSE", string_in_list("FALSE",
+                                                 ["FALSE", "TRUE"], False))
+        self.assertRaises(ValueError, string_in_list, "FALSE", [])
+        self.assertRaises(ValueError, string_in_list, "FALSE", ["fals"])
+        self.assertRaises(ValueError, string_in_list,
+                          "FALSE", ["false"], False)
+
     def test_assure_name_format(self):
         self.assertEqual(assure_name_format("Müstermann, Max"),
                          "Müstermann, Max")
@@ -62,17 +76,17 @@
                          ["/this/computer"])
         self.assertEqual(win_path_list_converter(
             r"\this\computer,\this\computer"),
-                         ["/this/computer", "/this/computer"])
+            ["/this/computer", "/this/computer"])
 
-    @pytest.mark.xfail
+    @pytest.mark.xfail(reason="To be fixed, see Issue #34")
     def test_datetime(self):
         test_file = os.path.join(os.path.dirname(__file__), "date.xlsx")
-        self.importer = XLSImporter(converters={'d': datetime_converter,
-                                                }, obligatory_columns=['d'])
+        importer = XLSImporter(converters={'d': datetime_converter,
+                                           }, obligatory_columns=['d'])
 
         xls_file = pd.io.excel.ExcelFile(test_file)
         df = xls_file.parse()
-        df = self.importer.read_xls(test_file)
+        df = importer.read_xls(test_file)
         assert df.shape[0] == 2
         # TODO datatypes are different; fix it
         assert df.d.iloc[0] == datetime.datetime(1980, 12, 31, 13, 24, 23)
@@ -80,30 +94,30 @@
     def test_date_xlsx(self):
         """Test with .xlsx in order to check openpyxl engine."""
         test_file = 
os.path.join(os.path.dirname(__file__), "date.xlsx") - self.importer = XLSImporter(converters={'a': date_converter, - 'b': date_converter, - 'c': partial(date_converter, - fmt="%d.%m.%y") - }, obligatory_columns=['a']) + importer = XLSImporter(converters={'a': date_converter, + 'b': date_converter, + 'c': partial(date_converter, + fmt="%d.%m.%y") + }, obligatory_columns=['a']) xls_file = pd.io.excel.ExcelFile(test_file) df = xls_file.parse() - df = self.importer.read_xls(test_file) + df = importer.read_xls(test_file) assert df.shape[0] == 2 assert df.a.iloc[0] == df.b.iloc[0] == df.c.iloc[0] def test_date_xls(self): """Test with .xls in order to check xlrd engine.""" test_file = os.path.join(os.path.dirname(__file__), "date.xls") - self.importer = XLSImporter(converters={'a': date_converter, - 'b': date_converter, - 'c': partial(date_converter, - fmt="%d.%m.%y") - }, obligatory_columns=['a']) + importer = XLSImporter(converters={'a': date_converter, + 'b': date_converter, + 'c': partial(date_converter, + fmt="%d.%m.%y") + }, obligatory_columns=['a']) xls_file = pd.io.excel.ExcelFile(test_file) df = xls_file.parse() - df = self.importer.read_xls(test_file) + df = importer.read_xls(test_file) assert df.shape[0] == 2 assert df.a.iloc[0] == df.b.iloc[0] == df.c.iloc[0] @@ -126,9 +140,9 @@ class ConverterTest(unittest.TestCase): fmts={"%Y": "%Y"}) -class XLSImporterTest(unittest.TestCase): +class TableImporterTest(unittest.TestCase): def setUp(self): - self.importer = XLSImporter( + self.importer_kwargs = dict( converters={'a': str, 'b': int, 'c': float, 'd': yes_no_converter}, obligatory_columns=['a', 'b'], unique_keys=[('a', 'b')]) self.valid_df = pd.DataFrame( @@ -136,39 +150,64 @@ class XLSImporterTest(unittest.TestCase): def test_missing_col(self): df = pd.DataFrame(columns=['a', 'b']) - self.assertRaises(ValueError, self.importer.check_columns, df) - self.importer.check_columns(self.valid_df) + importer = TableImporter(**self.importer_kwargs) + self.assertRaises(ValueError, importer.check_columns, df) + importer.check_columns(self.valid_df) def test_missing_val(self): - self.importer.check_missing(self.valid_df) + importer = TableImporter(**self.importer_kwargs) + importer.check_missing(self.valid_df) df = pd.DataFrame([[None, np.nan, 2.0, 'yes'], [None, 1, 2.0, 'yes'], ['a', np.nan, 2.0, 'yes'], ['b', 5, 3.0, 'no']], columns=['a', 'b', 'c', 'd']) - df_new = self.importer.check_missing(df) + df_new = importer.check_missing(df) self.assertEqual(df_new.shape[0], 1) self.assertEqual(df_new.shape[1], 4) self.assertEqual(df_new.iloc[0].b, 5) + def test_unique(self): + importer = TableImporter(**self.importer_kwargs) + importer.check_missing(self.valid_df) + df = pd.DataFrame([['b', 5, 3.0, 'no'], ['b', 5, 3.0, 'no']], + columns=['a', 'b', 'c', 'd']) + df_new = importer.check_unique(df) + self.assertEqual(df_new.shape[0], 1) + + +class XLSImporterTest(TableImporterTest): def test_full(self): """ test full run with example data """ tmp = NamedTemporaryFile(delete=False, suffix=".xlsx") tmp.close() self.valid_df.to_excel(tmp.name) - self.importer.read_xls(tmp.name) - - def test_unique(self): - self.importer.check_missing(self.valid_df) - df = pd.DataFrame([['b', 5, 3.0, 'no'], ['b', 5, 3.0, 'no']], - columns=['a', 'b', 'c', 'd']) - df_new = self.importer.check_unique(df) - self.assertEqual(df_new.shape[0], 1) + importer = XLSImporter(**self.importer_kwargs) + importer.read_file(tmp.name) - @pytest.mark.xfail def test_raise(self): + importer = XLSImporter(**self.importer_kwargs) tmp = 
NamedTemporaryFile(delete=False, suffix=".lol") tmp.close() - # TODO ValueError is raised instead - self.assertRaises(DataInconsistencyError, self.importer.read_xls, + self.assertRaises(DataInconsistencyError, importer.read_xls, tmp.name) + + +class CSVImporterTest(TableImporterTest): + def test_full(self): + """ test full run with example data """ + tmp = NamedTemporaryFile(delete=False, suffix=".csv") + tmp.close() + self.valid_df.to_csv(tmp.name) + importer = CSVImporter(**self.importer_kwargs) + importer.read_file(tmp.name) + + +class TSVImporterTest(TableImporterTest): + def test_full(self): + """ test full run with example data """ + tmp = NamedTemporaryFile(delete=False, suffix=".tsv") + tmp.close() + self.valid_df.to_csv(tmp.name, sep="\t") + importer = TSVImporter(**self.importer_kwargs) + importer.read_file(tmp.name)
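
Taken together, the importer pieces introduced above are meant to be used
roughly as in the following minimal sketch (illustrative only; the file name
"samples.csv" and the column names are hypothetical and not part of the
patch):

    from functools import partial

    from caosadvancedtools.table_importer import CSVImporter, string_in_list

    # Converters are applied per column while pandas parses the file;
    # string_in_list doubles as a vocabulary check for free-text columns.
    importer = CSVImporter(
        converters={"sample": str,
                    "status": partial(string_in_list,
                                      options=["ok", "broken"])},
        obligatory_columns=["sample"],
        unique_keys=[("sample",)],
    )
    # read_file parses the table and runs the checks from
    # TableImporter.check_dataframe (required columns present, obligatory
    # values set, unique keys not duplicated).
    df = importer.read_file("samples.csv")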