Skip to content
Snippets Groups Projects
Commit 2f9d7ed2 authored by Florian Spreckelsen's avatar Florian Spreckelsen
Browse files

FIX: Add treatment for empty fields in integer columns

parent da5dcaba
No related branches found
No related tags found
2 merge requests!107Release v0.11.0,!106F gaps in int columns
...@@ -205,12 +205,41 @@ def string_in_list(val, options, ignore_case=True): ...@@ -205,12 +205,41 @@ def string_in_list(val, options, ignore_case=True):
return val return val
def _pandas_typecheck(candidate, dtype):
if pd.api.types.is_integer_dtype(dtype):
return pd.api.types.is_integer_dtype(candidate)
if pd.api.types.is_float_dtype(dtype):
return pd.api.types.is_float_dtype(candidate)
if pd.api.types.is_bool_dtype(dtype):
return pd.api.types.is_bool_dtype(candidate)
return None
def _is_subtype_of(candidate, supertype):
    """Check whether `candidate` has a subtype of `supertype`, also respecting
    pandas types that np.issubdtype is not aware of.

    Delegates to ``_pandas_typecheck`` first; only when that returns None
    (i.e., ``supertype`` is not a pandas-recognized int/float/bool dtype)
    does it fall back to ``np.issubdtype``.
    """
    result = _pandas_typecheck(candidate, supertype)
    if result is None:
        return np.issubdtype(candidate, supertype)
    return result
def _is_instance_of_type(candidate, dtype):
    """Wrap `isinstance` so that pandas datatypes can be handled.

    The pandas-aware check is applied to ``type(candidate)``; when it does
    not apply (returns None), fall back to plain ``isinstance``.
    """
    pandas_typecheck = _pandas_typecheck(type(candidate), dtype)
    if pandas_typecheck is not None:
        return pandas_typecheck
    return isinstance(candidate, dtype)
class TableImporter(): class TableImporter():
"""Abstract base class for importing data from tables. """Abstract base class for importing data from tables.
""" """
def __init__(self, converters, obligatory_columns=None, unique_keys=None, def __init__(self, converters, obligatory_columns=None, unique_keys=None,
datatypes=None, existing_columns=None): datatypes=None, existing_columns=None, convert_int_to_nullable_int=True):
""" """
Parameters Parameters
---------- ----------
...@@ -234,6 +263,12 @@ class TableImporter(): ...@@ -234,6 +263,12 @@ class TableImporter():
existing_columns : list, optional existing_columns : list, optional
List of column names that must exist but may have missing (NULL) values List of column names that must exist but may have missing (NULL) values
convert_int_to_nullable_int : bool, optional
Whether to convert all integer datatypes to ``pandas.Int64Dtype()``
which is nullable, to allow for integer columns with empty fields. If
set to False, a ``DataInConsistencyError`` will be raised in case of
empty fields in integer columns. Default is True.
""" """
if converters is None: if converters is None:
...@@ -250,7 +285,14 @@ class TableImporter(): ...@@ -250,7 +285,14 @@ class TableImporter():
if datatypes is None: if datatypes is None:
datatypes = {} datatypes = {}
self.datatypes = datatypes self.datatypes = datatypes.copy()
self.convert_int_to_nullable_int = convert_int_to_nullable_int
if convert_int_to_nullable_int is True:
for key, dtype in self.datatypes.items():
if pd.api.types.is_integer_dtype(dtype):
self.datatypes[key] = pd.Int64Dtype()
if existing_columns is None: if existing_columns is None:
existing_columns = [] existing_columns = []
...@@ -333,22 +375,25 @@ class TableImporter(): ...@@ -333,22 +375,25 @@ class TableImporter():
""" """
for key, datatype in self.datatypes.items(): for key, datatype in self.datatypes.items():
if key not in df.columns: if key not in df.columns:
# We ignore all datatype definitions that are not present in the
# dataframe.
continue continue
col_dtype = df.dtypes[key]
# Check for castable numeric types first: We unconditionally cast int to the default # Check for castable numeric types first: We unconditionally cast int to the default
# float, because CaosDB does not have different sizes anyway. # float, because CaosDB does not have different sizes anyway.
col_dtype = df.dtypes[key] if not strict and not _is_subtype_of(col_dtype, datatype):
if not strict and not np.issubdtype(col_dtype, datatype):
# These special cases should be fine. # These special cases should be fine.
if ((datatype == str) if ((datatype == str)
or (np.issubdtype(col_dtype, np.integer) or (pd.api.types.is_integer_dtype(col_dtype)
and np.issubdtype(datatype, np.floating)) and pd.api.types.is_float_dtype(datatype))
): # NOQA ): # NOQA
df[key] = df[key].astype(datatype) df[key] = df[key].astype(datatype)
# Now check each element # Now check each element
for idx, val in df.loc[pd.notnull(df.loc[:, key]), key].items(): for idx, val in df.loc[pd.notnull(df.loc[:, key]), key].items():
if not isinstance(val, datatype): if not _is_instance_of_type(val, datatype):
msg = ( msg = (
"In row no. {rn} and column '{c}' of file '{fi}' the " "In row no. {rn} and column '{c}' of file '{fi}' the "
"datatype was {was} but it should be " "datatype was {was} but it should be "
...@@ -483,7 +528,8 @@ class CSVImporter(TableImporter): ...@@ -483,7 +528,8 @@ class CSVImporter(TableImporter):
**kwargs) **kwargs)
applicable_converters = {k: v for k, v in self.converters.items() applicable_converters = {k: v for k, v in self.converters.items()
if k in tmpdf.columns} if k in tmpdf.columns}
df = pd.read_csv(filename, sep=sep, converters=applicable_converters, df = pd.read_csv(filename, sep=sep,
converters=applicable_converters, dtype=self.datatypes,
**kwargs) **kwargs)
except ValueError as ve: except ValueError as ve:
logger.warning( logger.warning(
...@@ -497,22 +543,6 @@ class CSVImporter(TableImporter): ...@@ -497,22 +543,6 @@ class CSVImporter(TableImporter):
return df return df
class TSVImporter(TableImporter): class TSVImporter(CSVImporter):
def read_file(self, filename, **kwargs): def read_file(self, filename, **kwargs):
try: return super().read_file(filename, sep="\t", **kwargs)
tmpdf = pd.read_csv(filename, sep="\t", converters=self.converters,
**kwargs)
applicable_converters = {k: v for k, v in self.converters.items()
if k in tmpdf.columns}
df = pd.read_csv(filename, sep="\t", converters=self.converters,
**kwargs)
except ValueError as ve:
logger.warning(
"Cannot parse {}.\n{}".format(filename, ve),
extra={'identifier': str(filename),
'category': "inconsistency"})
raise DataInconsistencyError(*ve.args)
df = self.check_dataframe(df, filename)
return df
...@@ -200,6 +200,7 @@ class TableImporterTest(unittest.TestCase): ...@@ -200,6 +200,7 @@ class TableImporterTest(unittest.TestCase):
assert df["float"].dtype == int assert df["float"].dtype == int
# strict = False by default, so this shouldn't raise an error # strict = False by default, so this shouldn't raise an error
importer.check_datatype(df) importer.check_datatype(df)
print(importer.datatypes)
# The types should be correct now. # The types should be correct now.
assert df["a"].dtype == pd.StringDtype assert df["a"].dtype == pd.StringDtype
assert df["float"].dtype == float assert df["float"].dtype == float
...@@ -325,6 +326,62 @@ class CSVImporterTest(TableImporterTest): ...@@ -325,6 +326,62 @@ class CSVImporterTest(TableImporterTest):
importer = CSVImporter(**kwargs) importer = CSVImporter(**kwargs)
importer.read_file(tmp.name) importer.read_file(tmp.name)
def test_gaps_in_int_column(self):
    """Test for
    https://gitlab.com/linkahead/linkahead-advanced-user-tools/-/issues/62:
    Datatype confusion when encountering empty values in integer columns.
    """
    tmpfile = NamedTemporaryFile(delete=False, suffix=".csv")
    # Close the OS-level handle right away; we only need the path below.
    # Keeping it open leaks the handle and breaks re-opening on Windows.
    tmpfile.close()
    with open(tmpfile.name, 'w') as tmp:
        tmp.write(
            "int,int_with_gaps,float\n"
            "1,1,1.1\n"
            "2,,1.2\n"
            "3,3,1.3\n"
        )
    kwargs = {
        "datatypes": {
            "int": int,
            "int_with_gaps": int,
            "float": float
        },
        "obligatory_columns": ["int"],
        "converters": {}
    }
    importer = CSVImporter(**kwargs)
    # By default, plain int datatypes are promoted to nullable Int64.
    assert importer.datatypes["int"] == "Int64"
    assert importer.datatypes["int_with_gaps"] == "Int64"
    assert importer.datatypes["float"] == float
    df = importer.read_file(tmpfile.name)
    # Default is to convert nullable ints
    assert df["int"].dtype == "Int64"
    assert df["int_with_gaps"].dtype == "Int64"
    assert df["float"].dtype == float
    assert pd.isna(df["int_with_gaps"][1])

    # When not converting, empty fields raise errors ...
    importer_strict = CSVImporter(convert_int_to_nullable_int=False, **kwargs)
    assert importer_strict.datatypes["int"] == int
    assert importer_strict.datatypes["int_with_gaps"] == int
    assert importer_strict.datatypes["float"] == float
    with pytest.raises(DataInconsistencyError) as die:
        importer_strict.read_file(tmpfile.name)
    assert "Integer column has NA values in column 1" in str(die.value)

    # ... except when a nullable datatype is set explicitly
    kwargs["datatypes"]["int_with_gaps"] = "Int64"
    importer_strict = CSVImporter(convert_int_to_nullable_int=False, **kwargs)
    df = importer_strict.read_file(tmpfile.name)
    # Now only the one that has been specifically set to Int64 is nullable.
    assert df["int"].dtype == int
    assert df["int_with_gaps"].dtype == "Int64"
    assert df["float"].dtype == float
class TSVImporterTest(TableImporterTest): class TSVImporterTest(TableImporterTest):
def test_full(self): def test_full(self):
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment