Skip to content
Snippets Groups Projects

F gaps in int columns

Merged Florian Spreckelsen requested to merge f-gaps-in-int-columns into dev

Files

@@ -205,18 +205,47 @@ def string_in_list(val, options, ignore_case=True):
@@ -205,18 +205,47 @@ def string_in_list(val, options, ignore_case=True):
return val
return val
 
def _pandas_typecheck(candidate, dtype):
 
if pd.api.types.is_integer_dtype(dtype):
 
return pd.api.types.is_integer_dtype(candidate)
 
if pd.api.types.is_float_dtype(dtype):
 
return pd.api.types.is_float_dtype(candidate)
 
if pd.api.types.is_bool_dtype(dtype):
 
return pd.api.types.is_bool_dtype(candidate)
 
return None
 
 
 
def _is_subtype_of(candidate, supertype):
    """Check whether `candidate` has a subtype of `supertype`, also respecting
    pandas types that np.issubdtype is not aware of.
    """
    result = _pandas_typecheck(candidate, supertype)
    if result is None:
        # `supertype` is not one of the pandas-handled dtype families;
        # defer to numpy's subtype check.
        return np.issubdtype(candidate, supertype)
    return result
 
 
 
def _is_instance_of_type(candidate, dtype):
    """Wrap `isinstance` so that pandas datatypes can be handled."""
    result = _pandas_typecheck(type(candidate), dtype)
    if result is None:
        # `dtype` is not a pandas-handled dtype family; plain isinstance
        # is sufficient.
        return isinstance(candidate, dtype)
    return result
 
 
class TableImporter():
class TableImporter():
"""Abstract base class for importing data from tables.
"""Abstract base class for importing data from tables.
"""
"""
def __init__(self, converters, obligatory_columns=None, unique_keys=None,
def __init__(self, converters, obligatory_columns=None, unique_keys=None,
datatypes=None, existing_columns=None):
datatypes=None, existing_columns=None, convert_int_to_nullable_int=True):
"""
"""
Parameters
Parameters
----------
----------
converters : dict
converters : dict
Dict with column names as keys and converter functions as values. This dict also defines
Dict with column names as keys and converter functions as values. This dict's keys also
what columns are required to exist throught the existing keys. The converter functions are
define what columns must exist. The converter functions are
applied to the cell values. They should also check for ValueErrors, such that a separate
applied to the cell values. They should also check for ValueErrors, such that a separate
value check is not necessary.
value check is not necessary.
@@ -234,6 +263,12 @@ class TableImporter():
@@ -234,6 +263,12 @@ class TableImporter():
existing_columns : list, optional
existing_columns : list, optional
List of column names that must exist but may have missing (NULL) values
List of column names that must exist but may have missing (NULL) values
 
 
convert_int_to_nullable_int : bool, optional
 
Whether to convert all integer datatypes to ``pandas.Int64Dtype()``
 
which is nullable, to allow for integer columns with empty fields. If
 
set to False, a ``DataInconsistencyError`` will be raised in case of
 
empty fields in integer columns. Default is True.
"""
"""
if converters is None:
if converters is None:
@@ -250,7 +285,14 @@ class TableImporter():
@@ -250,7 +285,14 @@ class TableImporter():
if datatypes is None:
if datatypes is None:
datatypes = {}
datatypes = {}
self.datatypes = datatypes
self.datatypes = datatypes.copy()
 
 
self.convert_int_to_nullable_int = convert_int_to_nullable_int
 
 
if convert_int_to_nullable_int is True:
 
for key, dtype in self.datatypes.items():
 
if pd.api.types.is_integer_dtype(dtype):
 
self.datatypes[key] = pd.Int64Dtype()
if existing_columns is None:
if existing_columns is None:
existing_columns = []
existing_columns = []
@@ -333,22 +375,25 @@ class TableImporter():
@@ -333,22 +375,25 @@ class TableImporter():
"""
"""
for key, datatype in self.datatypes.items():
for key, datatype in self.datatypes.items():
if key not in df.columns:
if key not in df.columns:
 
# We ignore all datatype definitions that are not present in the
 
# dataframe.
continue
continue
 
col_dtype = df.dtypes[key]
 
# Check for castable numeric types first: We unconditionally cast int to the default
# Check for castable numeric types first: We unconditionally cast int to the default
# float, because CaosDB does not have different sizes anyway.
# float, because CaosDB does not have different sizes anyway.
col_dtype = df.dtypes[key]
if not strict and not _is_subtype_of(col_dtype, datatype):
if not strict and not np.issubdtype(col_dtype, datatype):
# These special cases should be fine.
# These special cases should be fine.
if ((datatype == str)
if ((datatype == str)
or (np.issubdtype(col_dtype, np.integer)
or (pd.api.types.is_integer_dtype(col_dtype)
and np.issubdtype(datatype, np.floating))
and pd.api.types.is_float_dtype(datatype))
): # NOQA
): # NOQA
df[key] = df[key].astype(datatype)
df[key] = df[key].astype(datatype)
# Now check each element
# Now check each element
for idx, val in df.loc[pd.notnull(df.loc[:, key]), key].items():
for idx, val in df.loc[pd.notnull(df.loc[:, key]), key].items():
if not isinstance(val, datatype):
if not _is_instance_of_type(val, datatype):
msg = (
msg = (
"In row no. {rn} and column '{c}' of file '{fi}' the "
"In row no. {rn} and column '{c}' of file '{fi}' the "
"datatype was {was} but it should be "
"datatype was {was} but it should be "
@@ -483,7 +528,8 @@ class CSVImporter(TableImporter):
@@ -483,7 +528,8 @@ class CSVImporter(TableImporter):
**kwargs)
**kwargs)
applicable_converters = {k: v for k, v in self.converters.items()
applicable_converters = {k: v for k, v in self.converters.items()
if k in tmpdf.columns}
if k in tmpdf.columns}
df = pd.read_csv(filename, sep=sep, converters=applicable_converters,
df = pd.read_csv(filename, sep=sep,
 
converters=applicable_converters, dtype=self.datatypes,
**kwargs)
**kwargs)
except ValueError as ve:
except ValueError as ve:
logger.warning(
logger.warning(
@@ -497,22 +543,6 @@ class CSVImporter(TableImporter):
@@ -497,22 +543,6 @@ class CSVImporter(TableImporter):
return df
return df
class TSVImporter(CSVImporter):
    """Import data from a tab-separated (TSV) file.

    Thin wrapper around :class:`CSVImporter` that fixes the column
    separator to a tab character; all parsing, converter application and
    dataframe checking are inherited from the CSV implementation.
    """

    def read_file(self, filename, **kwargs):
        """Read `filename` as a tab-separated table.

        Parameters
        ----------
        filename : str or path-like
            Path of the TSV file to read.
        **kwargs :
            Additional keyword arguments forwarded to
            ``CSVImporter.read_file``.

        Returns
        -------
        pandas.DataFrame
            The checked dataframe returned by the parent implementation.
        """
        return super().read_file(filename, sep="\t", **kwargs)
Loading