Skip to content
Snippets Groups Projects
Verified Commit 985cb826 authored by Daniel Hornung's avatar Daniel Hornung
Browse files

FIX: Be less strict about numeric datatypes from xlsx files.

parent cc64104c
No related branches found
No related tags found
2 merge requests!39Release 0.4.0,!34TableConverter now converts int to float and vice versa to match the desired dtype.
......@@ -14,6 +14,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
### Changed ###
- `TableConverter` now converts int to float and vice versa to match the desired dtype.
### Deprecated ###
### Removed ###
......
......@@ -279,6 +279,8 @@ class Crawler(object):
except DataInconsistencyError as e:
logger.debug(traceback.format_exc())
logger.debug(e)
# TODO: Generally: in which cases should exceptions be raised? When is
# errors_occured set to True? The expected behavior must be documented.
except Exception as e:
try:
DataModelProblems.evaluate_exception(e)
......
......@@ -205,27 +205,32 @@ def string_in_list(val, options, ignore_case=True):
return val
class TableImporter(object):
class TableImporter():
"""Abstract base class for importing data from tables.
"""
def __init__(self, converters, obligatory_columns=None, unique_keys=None,
datatypes=None):
"""
converters: dict with column names as keys and converter functions as
values
This dict also defines what columns are required to exist
through the existing keys. The converter functions are
applied to the cell values. They should also check for
ValueErrors, such that a separate value check is not
necessary.
obligatory_columns: list of column names, optional
each listed column must not have missing values
unique_columns : list of column names that in
combination must be unique; i.e. each row has a
unique combination of values in those columns.
datatypes: dict with column names as keys and datatypes as values
All non-null values will be checked whether they have the
provided datatype.
This dict also defines what columns are required to exist
through the existing keys.
Parameters
----------
converters : dict
Dict with column names as keys and converter functions as values. This dict also defines
what columns are required to exist through the existing keys. The converter functions are
applied to the cell values. They should also check for ValueErrors, such that a separate
value check is not necessary.
obligatory_columns : list, optional
List of column names, each listed column must not have missing values.
unique_keys : list, optional
List of column names that in combination must be unique: each row has a unique
combination of values in those columns.
datatypes : dict, optional
Dict with column names as keys and datatypes as values. All non-null values will be
checked whether they have the provided datatype. This dict also defines what columns are
required to exist through the existing keys.
"""
if converters is None:
......@@ -247,11 +252,14 @@ class TableImporter(object):
raise NotImplementedError()
def check_columns(self, df, filename=None):
"""
checks whether all required columns, i.e. columns for which converters
were defined exist.
"""Check whether all required columns exist.
Required columns are columns for which converters are defined.
Raises
------
DataInconsistencyError
Raises: DataInconsistencyError
"""
for col in self.required_columns:
......@@ -267,12 +275,11 @@ class TableImporter(object):
raise DataInconsistencyError(errmsg)
def check_unique(self, df, filename=None):
"""
Check whether value combinations that shall be unique for each row are
unique.
"""Check whether value combinations that shall be unique for each row are unique.
If a second row is found, that uses the same combination of values as a
previous one, the second one is removed.
"""
df = df.copy()
uniques = []
......@@ -299,13 +306,33 @@ class TableImporter(object):
return df
def check_datatype(self, df, filename=None):
"""
Check for each column whether non-null fields have the correct
datatype.
"""
def check_datatype(self, df, filename=None, strict=False):
"""Check for each column whether non-null fields have the correct datatype.
.. note::
If columns are float, but should be integer or vice versa, this method converts the
respective columns in place.
Parameters
----------
strict: boolean, optional
If False (the default), try to convert columns, otherwise raise an error.
"""
for key, datatype in self.datatypes.items():
# Check for castable numeric types first: We unconditionally cast float to int and vice
# versa, because CaosDB does not have different sizes anyway.
col_dtype = df.dtypes[key]
if not strict and not np.issubdtype(col_dtype, datatype):
issub = np.issubdtype
# These special cases should be fine.
if ((issub(col_dtype, np.integer) and issub(datatype, np.floating))
or (issub(col_dtype, np.floating) and issub(datatype, np.integer))):
df[key] = df[key].astype(datatype)
# Now check each element
for idx, val in df.loc[
pd.notnull(df.loc[:, key]), key].iteritems():
......@@ -326,6 +353,11 @@ class TableImporter(object):
Check in each row whether obligatory fields are empty or null.
Rows that have missing values are removed.
Returns
-------
out : pandas.DataFrame
The input DataFrame with incomplete rows removed.
"""
df = df.copy()
......@@ -362,10 +394,26 @@ class TableImporter(object):
return df
def check_dataframe(self, df, filename):
def check_dataframe(self, df, filename=None, strict=False):
"""Check if the dataframe conforms to the restrictions.
Checked restrictions are: Columns, data types, uniqueness requirements.
Parameters
----------
df: pandas.DataFrame
The dataframe to be checked.
filename: string, optional
The file name, only used for output in case of problems.
strict: boolean, optional
If False (the default), try to convert columns, otherwise raise an error.
"""
self.check_columns(df, filename=filename)
df = self.check_missing(df, filename=filename)
self.check_datatype(df, filename=filename)
self.check_datatype(df, filename=filename, strict=strict)
if len(self.unique_keys) > 0:
df = self.check_unique(df, filename=filename)
......@@ -378,8 +426,7 @@ class XLSImporter(TableImporter):
return self.read_xls(filename=filename, **kwargs)
def read_xls(self, filename, **kwargs):
"""
converts an xls file into a Pandas DataFrame.
"""Convert an xls file into a Pandas DataFrame.
The converters of the XLSImporter object are used.
......
File added
......@@ -23,7 +23,6 @@ import unittest
from functools import partial
from tempfile import NamedTemporaryFile
import caosdb as db
import numpy as np
import pandas as pd
import pytest
......@@ -211,6 +210,21 @@ class XLSImporterTest(TableImporterTest):
self.assertRaises(DataInconsistencyError, importer.read_xls,
tmp.name)
def test_datatypes(self):
    """Check datatype handling of columns read from an xlsx file.

    Ints declared as floats (and vice versa) must be converted to the
    declared dtype instead of being rejected.
    """
    # Columns in the test sheet, mapped to the dtype they shall have.
    expected_types = {
        "float_as_float": float,
        "int_as_float": float,
        "int_as_int": int,
        "float_as_int": int,
    }
    imp = XLSImporter(converters={},
                      obligatory_columns=["float_as_float"],
                      datatypes=expected_types)
    xlsx_path = os.path.join(os.path.dirname(__file__), "data", "datatypes.xlsx")
    df = imp.read_xls(xlsx_path)
    # An int column declared as float must come back as a float subtype.
    assert np.issubdtype(df.loc[0, "int_as_float"], float)
    # A float column declared as int is cast in place; rounding to 6 is
    # acceptable here.
    assert df.loc[1, "float_as_int"] == 6
class CSVImporterTest(TableImporterTest):
def test_full(self):
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment