Skip to content
Snippets Groups Projects
Commit 2f9d7ed2 authored by Florian Spreckelsen's avatar Florian Spreckelsen
Browse files

FIX: Add treatment for empty fields in integer columns

parent da5dcaba
No related branches found
No related tags found
2 merge requests!107Release v0.11.0,!106F gaps in int columns
...@@ -205,12 +205,41 @@ def string_in_list(val, options, ignore_case=True): ...@@ -205,12 +205,41 @@ def string_in_list(val, options, ignore_case=True):
return val return val
def _pandas_typecheck(candidate, dtype):
if pd.api.types.is_integer_dtype(dtype):
return pd.api.types.is_integer_dtype(candidate)
if pd.api.types.is_float_dtype(dtype):
return pd.api.types.is_float_dtype(candidate)
if pd.api.types.is_bool_dtype(dtype):
return pd.api.types.is_bool_dtype(candidate)
return None
def _is_subtype_of(candidate, supertype):
    """Check whether `candidate` has a subtype of `supertype`, also respecting
    pandas types that np.issubdtype is not aware of.

    Delegates to ``_pandas_typecheck`` first; only when that returns None
    (i.e., ``supertype`` is not a pandas-recognized int/float/bool dtype)
    does it fall back to ``np.issubdtype``.
    """
    result = _pandas_typecheck(candidate, supertype)
    if result is None:
        return np.issubdtype(candidate, supertype)
    return result
def _is_instance_of_type(candidate, dtype):
    """Wrap `isinstance` so that pandas datatypes can be handled.

    The pandas-aware check is applied to ``type(candidate)``; when it does
    not apply (returns None), fall back to plain ``isinstance``.
    """
    pandas_typecheck = _pandas_typecheck(type(candidate), dtype)
    if pandas_typecheck is not None:
        return pandas_typecheck
    return isinstance(candidate, dtype)
class TableImporter(): class TableImporter():
"""Abstract base class for importing data from tables. """Abstract base class for importing data from tables.
""" """
def __init__(self, converters, obligatory_columns=None, unique_keys=None, def __init__(self, converters, obligatory_columns=None, unique_keys=None,
datatypes=None, existing_columns=None): datatypes=None, existing_columns=None, convert_int_to_nullable_int=True):
""" """
Parameters Parameters
---------- ----------
...@@ -234,6 +263,12 @@ class TableImporter(): ...@@ -234,6 +263,12 @@ class TableImporter():
existing_columns : list, optional existing_columns : list, optional
List of column names that must exist but may have missing (NULL) values List of column names that must exist but may have missing (NULL) values
convert_int_to_nullable_int : bool, optional
Whether to convert all integer datatypes to ``pandas.Int64Dtype()``
which is nullable, to allow for integer columns with empty fields. If
set to False, a ``DataInConsistencyError`` will be raised in case of
empty fields in integer columns. Default is True.
""" """
if converters is None: if converters is None:
...@@ -250,7 +285,14 @@ class TableImporter(): ...@@ -250,7 +285,14 @@ class TableImporter():
if datatypes is None: if datatypes is None:
datatypes = {} datatypes = {}
self.datatypes = datatypes self.datatypes = datatypes.copy()
self.convert_int_to_nullable_int = convert_int_to_nullable_int
if convert_int_to_nullable_int is True:
for key, dtype in self.datatypes.items():
if pd.api.types.is_integer_dtype(dtype):
self.datatypes[key] = pd.Int64Dtype()
if existing_columns is None: if existing_columns is None:
existing_columns = [] existing_columns = []
...@@ -333,22 +375,25 @@ class TableImporter(): ...@@ -333,22 +375,25 @@ class TableImporter():
""" """
for key, datatype in self.datatypes.items(): for key, datatype in self.datatypes.items():
if key not in df.columns: if key not in df.columns:
# We ignore all datatype definitions that are not present in the
# dataframe.
continue continue
col_dtype = df.dtypes[key]
# Check for castable numeric types first: We unconditionally cast int to the default # Check for castable numeric types first: We unconditionally cast int to the default
# float, because CaosDB does not have different sizes anyway. # float, because CaosDB does not have different sizes anyway.
col_dtype = df.dtypes[key] if not strict and not _is_subtype_of(col_dtype, datatype):
if not strict and not np.issubdtype(col_dtype, datatype):
# These special cases should be fine. # These special cases should be fine.
if ((datatype == str) if ((datatype == str)
or (np.issubdtype(col_dtype, np.integer) or (pd.api.types.is_integer_dtype(col_dtype)
and np.issubdtype(datatype, np.floating)) and pd.api.types.is_float_dtype(datatype))
): # NOQA ): # NOQA
df[key] = df[key].astype(datatype) df[key] = df[key].astype(datatype)
# Now check each element # Now check each element
for idx, val in df.loc[pd.notnull(df.loc[:, key]), key].items(): for idx, val in df.loc[pd.notnull(df.loc[:, key]), key].items():
if not isinstance(val, datatype): if not _is_instance_of_type(val, datatype):
msg = ( msg = (
"In row no. {rn} and column '{c}' of file '{fi}' the " "In row no. {rn} and column '{c}' of file '{fi}' the "
"datatype was {was} but it should be " "datatype was {was} but it should be "
...@@ -483,7 +528,8 @@ class CSVImporter(TableImporter): ...@@ -483,7 +528,8 @@ class CSVImporter(TableImporter):
**kwargs) **kwargs)
applicable_converters = {k: v for k, v in self.converters.items() applicable_converters = {k: v for k, v in self.converters.items()
if k in tmpdf.columns} if k in tmpdf.columns}
df = pd.read_csv(filename, sep=sep, converters=applicable_converters, df = pd.read_csv(filename, sep=sep,
converters=applicable_converters, dtype=self.datatypes,
**kwargs) **kwargs)
except ValueError as ve: except ValueError as ve:
logger.warning( logger.warning(
...@@ -497,22 +543,6 @@ class CSVImporter(TableImporter): ...@@ -497,22 +543,6 @@ class CSVImporter(TableImporter):
return df return df
class TSVImporter(TableImporter): class TSVImporter(CSVImporter):
def read_file(self, filename, **kwargs): def read_file(self, filename, **kwargs):
try: return super().read_file(filename, sep="\t", **kwargs)
tmpdf = pd.read_csv(filename, sep="\t", converters=self.converters,
**kwargs)
applicable_converters = {k: v for k, v in self.converters.items()
if k in tmpdf.columns}
df = pd.read_csv(filename, sep="\t", converters=self.converters,
**kwargs)
except ValueError as ve:
logger.warning(
"Cannot parse {}.\n{}".format(filename, ve),
extra={'identifier': str(filename),
'category': "inconsistency"})
raise DataInconsistencyError(*ve.args)
df = self.check_dataframe(df, filename)
return df
...@@ -200,6 +200,7 @@ class TableImporterTest(unittest.TestCase): ...@@ -200,6 +200,7 @@ class TableImporterTest(unittest.TestCase):
assert df["float"].dtype == int assert df["float"].dtype == int
# strict = False by default, so this shouldn't raise an error # strict = False by default, so this shouldn't raise an error
importer.check_datatype(df) importer.check_datatype(df)
print(importer.datatypes)
# The types should be correct now. # The types should be correct now.
assert df["a"].dtype == pd.StringDtype assert df["a"].dtype == pd.StringDtype
assert df["float"].dtype == float assert df["float"].dtype == float
...@@ -325,6 +326,62 @@ class CSVImporterTest(TableImporterTest): ...@@ -325,6 +326,62 @@ class CSVImporterTest(TableImporterTest):
importer = CSVImporter(**kwargs) importer = CSVImporter(**kwargs)
importer.read_file(tmp.name) importer.read_file(tmp.name)
def test_gaps_in_int_column(self):
    """Test for
    https://gitlab.com/linkahead/linkahead-advanced-user-tools/-/issues/62:
    Datatype confusion when encountering empty values in integer columns.
    """
    tmpfile = NamedTemporaryFile(delete=False, suffix=".csv")
    # Close the OS-level handle right away; we only need the path below.
    # Keeping it open leaks the handle and breaks re-opening on Windows.
    tmpfile.close()
    with open(tmpfile.name, 'w') as tmp:
        tmp.write(
            "int,int_with_gaps,float\n"
            "1,1,1.1\n"
            "2,,1.2\n"
            "3,3,1.3\n"
        )
    kwargs = {
        "datatypes": {
            "int": int,
            "int_with_gaps": int,
            "float": float
        },
        "obligatory_columns": ["int"],
        "converters": {}
    }
    importer = CSVImporter(**kwargs)
    # By default, plain int datatypes are promoted to nullable Int64.
    assert importer.datatypes["int"] == "Int64"
    assert importer.datatypes["int_with_gaps"] == "Int64"
    assert importer.datatypes["float"] == float
    df = importer.read_file(tmpfile.name)
    # Default is to convert nullable ints
    assert df["int"].dtype == "Int64"
    assert df["int_with_gaps"].dtype == "Int64"
    assert df["float"].dtype == float
    assert pd.isna(df["int_with_gaps"][1])

    # When not converting, empty fields raise errors ...
    importer_strict = CSVImporter(convert_int_to_nullable_int=False, **kwargs)
    assert importer_strict.datatypes["int"] == int
    assert importer_strict.datatypes["int_with_gaps"] == int
    assert importer_strict.datatypes["float"] == float
    with pytest.raises(DataInconsistencyError) as die:
        importer_strict.read_file(tmpfile.name)
    assert "Integer column has NA values in column 1" in str(die.value)

    # ... except when a nullable datatype is set explicitly
    kwargs["datatypes"]["int_with_gaps"] = "Int64"
    importer_strict = CSVImporter(convert_int_to_nullable_int=False, **kwargs)
    df = importer_strict.read_file(tmpfile.name)
    # Now only the one that has been specifically set to Int64 is nullable.
    assert df["int"].dtype == int
    assert df["int_with_gaps"].dtype == "Int64"
    assert df["float"].dtype == float
class TSVImporterTest(TableImporterTest): class TSVImporterTest(TableImporterTest):
def test_full(self): def test_full(self):
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment