Commit cb9b86f6 authored by Florian Spreckelsen

Merge branch 'f-convert-int-float' into 'dev'

TableConverter now converts int to float and vice versa to match the desired dtype.

See merge request !34
parents cc64104c 61a31fff
2 merge requests: !39 Release 0.4.0, !34 TableConverter now converts int to float and vice versa to match the desired dtype.
Pipeline #19892 failed
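
In essence (a minimal standalone sketch of the new behavior, with a made-up `length` column; see the `check_datatype` diff below for the real implementation): a column that arrives from a spreadsheet as integer is now cast to the desired float dtype instead of failing the datatype check.

```python
import numpy as np
import pandas as pd

# A column read from a sheet as int64, while the data model wants float:
df = pd.DataFrame({"length": [1, 2, 3]})
desired = float

# The cast that check_datatype now performs when strict=False:
if np.issubdtype(df["length"].dtype, np.integer) and np.issubdtype(desired, np.floating):
    df["length"] = df["length"].astype(desired)

assert np.issubdtype(df["length"].dtype, np.floating)
```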
@@ -107,7 +107,7 @@ style:
   stage: style
   image: $CI_REGISTRY_IMAGE
   script:
-    - autopep8 -ar --diff --exit-code --exclude swagger_client .
+    - make style
   allow_failure: true

 unittest:
...
@@ -14,6 +14,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 ### Changed ###

+- `TableConverter` now converts int to float and vice versa to match the desired dtype.
+
 ### Deprecated ###
 ### Removed ###
...
@@ -34,3 +34,7 @@ install:
 unittest:
 	pytest-3 unittests
+
+style:
+	autopep8 -ar --diff --exit-code --exclude swagger_client .
+
+.PHONY: style
 [pytest]
 testpaths = unittests
 addopts = -vv
+python_paths = src
@@ -279,6 +279,8 @@ class Crawler(object):
             except DataInconsistencyError as e:
                 logger.debug(traceback.format_exc())
                 logger.debug(e)
+                # TODO: Generally: in which cases should exceptions be raised? When is
+                # errors_occured set to True? The expected behavior must be documented.
             except Exception as e:
                 try:
                     DataModelProblems.evaluate_exception(e)
...
@@ -205,27 +205,33 @@ def string_in_list(val, options, ignore_case=True):
     return val


-class TableImporter(object):
+class TableImporter():
+    """Abstract base class for importing data from tables.
+    """
+
     def __init__(self, converters, obligatory_columns=None, unique_keys=None,
                  datatypes=None):
         """
-        converters: dict with column names as keys and converter functions as
-                    values
-                    This dict also defines what columns are required to exist
-                    through the existing keys. The converter functions are
-                    applied to the cell values. They should also check for
-                    ValueErrors, such that a separate value check is not
-                    necessary.
-        obligatory_columns: list of column names, optional
-                            each listed column must not have missing values
-        unique_columns : list of column names that in
-                         combination must be unique; i.e. each row has a
-                         unique combination of values in those columns.
-        datatypes: dict with column names as keys and datatypes as values
-                   All non-null values will be checked whether they have the
-                   provided datatype.
-                   This dict also defines what columns are required to exist
-                   through the existing keys.
+        Parameters
+        ----------
+        converters : dict
+            Dict with column names as keys and converter functions as values. This dict also
+            defines what columns are required to exist through the existing keys. The converter
+            functions are applied to the cell values. They should also check for ValueErrors,
+            such that a separate value check is not necessary.
+
+        obligatory_columns : list, optional
+            List of column names; each listed column must not have missing values.
+
+        unique_keys : list, optional
+            List of column names that in combination must be unique: each row has a unique
+            combination of values in those columns.
+
+        datatypes : dict, optional
+            Dict with column names as keys and datatypes as values. All non-null values will be
+            checked whether they have the provided datatype. This dict also defines what columns
+            are required to exist through the existing keys.
         """
         if converters is None:
@@ -247,11 +253,14 @@ class TableImporter(object):
         raise NotImplementedError()

     def check_columns(self, df, filename=None):
-        """
-        checks whether all required columns, i.e. columns for which converters
-        were defined exist.
+        """Check whether all required columns exist.

-        Raises: DataInconsistencyError
+        Required columns are columns for which converters are defined.
+
+        Raises
+        ------
+        DataInconsistencyError
         """
         for col in self.required_columns:
@@ -267,12 +276,11 @@ class TableImporter(object):
             raise DataInconsistencyError(errmsg)

     def check_unique(self, df, filename=None):
-        """
-        Check whether value combinations that shall be unique for each row are
-        unique.
+        """Check whether value combinations that shall be unique for each row are unique.

         If a second row is found, that uses the same combination of values as a
         previous one, the second one is removed.
         """
         df = df.copy()
         uniques = []
@@ -299,13 +307,32 @@ class TableImporter(object):
         return df

-    def check_datatype(self, df, filename=None):
-        """
-        Check for each column whether non-null fields are have the correct
-        datatype.
-        """
+    def check_datatype(self, df, filename=None, strict=False):
+        """Check for each column whether non-null fields have the correct datatype.
+
+        .. note::
+
+          If columns are integer, but should be float, this method converts the respective
+          columns in place.
+
+        Parameters
+        ----------
+        strict: boolean, optional
+          If False (the default), try to convert columns, otherwise raise an error.
+        """
         for key, datatype in self.datatypes.items():
+            # Check for castable numeric types first: We unconditionally cast int to the default
+            # float, because CaosDB does not have different sizes anyway.
+            col_dtype = df.dtypes[key]
+            if not strict and not np.issubdtype(col_dtype, datatype):
+                issub = np.issubdtype
+                # These special cases should be fine.
+                if issub(col_dtype, np.integer) and issub(datatype, np.floating):
+                    df[key] = df[key].astype(datatype)
+
+            # Now check each element
             for idx, val in df.loc[
                     pd.notnull(df.loc[:, key]), key].iteritems():
@@ -326,6 +353,11 @@ class TableImporter(object):
         Check in each row whether obligatory fields are empty or null.

         Rows that have missing values are removed.
+
+        Returns
+        -------
+        out : pandas.DataFrame
+          The input DataFrame with incomplete rows removed.
         """
         df = df.copy()
@@ -362,10 +394,26 @@ class TableImporter(object):
         return df

-    def check_dataframe(self, df, filename):
+    def check_dataframe(self, df, filename=None, strict=False):
+        """Check if the dataframe conforms to the restrictions.
+
+        Checked restrictions are: Columns, data types, uniqueness requirements.
+
+        Parameters
+        ----------
+        df: pandas.DataFrame
+          The dataframe to be checked.
+        filename: string, optional
+          The file name, only used for output in case of problems.
+        strict: boolean, optional
+          If False (the default), try to convert columns, otherwise raise an error.
+        """
         self.check_columns(df, filename=filename)
         df = self.check_missing(df, filename=filename)
-        self.check_datatype(df, filename=filename)
+        self.check_datatype(df, filename=filename, strict=strict)
         if len(self.unique_keys) > 0:
             df = self.check_unique(df, filename=filename)
@@ -378,8 +426,7 @@ class XLSImporter(TableImporter):
         return self.read_xls(filename=filename, **kwargs)

     def read_xls(self, filename, **kwargs):
-        """
-        converts an xls file into a Pandas DataFrame.
+        """Convert an xls file into a Pandas DataFrame.

         The converters of the XLSImporter object are used.
...
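
The casting rule added to `check_datatype` above, extracted into a self-contained sketch (the function name `cast_int_columns` and the sample data are illustrative, not part of the module):

```python
import numpy as np
import pandas as pd

def cast_int_columns(df, datatypes, strict=False):
    """Cast integer columns to float where a float dtype is desired.

    Mirrors the new check in TableImporter.check_datatype: unless strict
    is set, an int column whose desired dtype is float is converted in
    place; CaosDB does not distinguish integer sizes anyway.
    """
    for key, datatype in datatypes.items():
        col_dtype = df.dtypes[key]
        if not strict and not np.issubdtype(col_dtype, datatype):
            if np.issubdtype(col_dtype, np.integer) and np.issubdtype(datatype, np.floating):
                df[key] = df[key].astype(datatype)
    return df

df = cast_int_columns(pd.DataFrame({"a": [1, 2], "b": [0.5, 1.5]}),
                      {"a": float, "b": float})
print(df.dtypes)  # both columns are float64 now
```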
File added: `unittests/data/datatypes.xlsx` (binary)
@@ -23,7 +23,6 @@ import unittest
 from functools import partial
 from tempfile import NamedTemporaryFile

-import caosdb as db
 import numpy as np
 import pandas as pd
 import pytest
@@ -211,6 +210,19 @@ class XLSImporterTest(TableImporterTest):
         self.assertRaises(DataInconsistencyError, importer.read_xls,
                           tmp.name)

+    def test_datatypes(self):
+        """Test datatypes in columns."""
+        importer = XLSImporter(converters={},
+                               obligatory_columns=["float_as_float"],
+                               datatypes={
+                                   "float_as_float": float,
+                                   "int_as_float": float,
+                                   "int_as_int": int,
+                               }
+                               )
+        df = importer.read_xls(os.path.join(os.path.dirname(__file__), "data", "datatypes.xlsx"))
+        assert np.issubdtype(df.loc[0, "int_as_float"], float)
+

 class CSVImporterTest(TableImporterTest):
     def test_full(self):
...
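
For reference, the new `datatypes.xlsx` fixture could be generated along these lines (hypothetical; only the column names are given by the test above, the cell values and sheet layout are assumptions):

```python
import pandas as pd

df = pd.DataFrame({
    "float_as_float": [1.5],  # stored as float, desired dtype float
    "int_as_float": [2],      # stored as int, desired dtype float -> gets cast
    "int_as_int": [3],        # stored as int, desired dtype int
})
# Writing .xlsx requires an engine such as openpyxl.
df.to_excel("unittests/data/datatypes.xlsx", index=False)
```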