Skip to content
Snippets Groups Projects

TableConverter now converts int to float and vice versa to match the desired dtype.

Merged Daniel Hornung requested to merge f-convert-int-float into dev
1 unresolved thread
2 files
+ 5
8
Compare changes
  • Side-by-side
  • Inline
Files
2
@@ -205,27 +205,33 @@ def string_in_list(val, options, ignore_case=True):
return val
class TableImporter(object):
class TableImporter():
"""Abstract base class for importing data from tables.
"""
def __init__(self, converters, obligatory_columns=None, unique_keys=None,
datatypes=None):
"""
converters: dict with column names as keys and converter functions as
values
This dict also defines what columns are required to exist
through the existing keys. The converter functions are
applied to the cell values. They should also check for
ValueErrors, such that a separate value check is not
necessary.
obligatory_columns: list of column names, optional
each listed column must not have missing values
unique_columns : list of column names that in
combination must be unique; i.e. each row has a
unique combination of values in those columns.
datatypes: dict with column names as keys and datatypes as values
All non-null values will be checked whether they have the
provided datatype.
This dict also defines what columns are required to exist
through the existing keys.
Parameters
----------
converters : dict
Dict with column names as keys and converter functions as values. This dict also defines
what columns are required to exist through the existing keys. The converter functions are
applied to the cell values. They should also check for ValueErrors, such that a separate
value check is not necessary.
obligatory_columns : list, optional
List of column names, each listed column must not have missing values.
unique_keys : list, optional
List of column names that in combination must be unique: each row has a unique
combination of values in those columns.
datatypes : dict, optional
Dict with column names as keys and datatypes as values. All non-null values will be
checked whether they have the provided datatype. This dict also defines what columns are
required to exist through the existing keys.
"""
if converters is None:
@@ -247,11 +253,14 @@ class TableImporter(object):
raise NotImplementedError()
def check_columns(self, df, filename=None):
"""
checks whether all required columns, i.e. columns for which converters
were defined exist.
"""Check whether all required columns exist.
Required columns are columns for which converters are defined.
Raises
------
DataInconsistencyError
Raises: DataInconsistencyError
"""
for col in self.required_columns:
@@ -267,12 +276,11 @@ class TableImporter(object):
raise DataInconsistencyError(errmsg)
def check_unique(self, df, filename=None):
"""
Check whether value combinations that shall be unique for each row are
unique.
"""Check whether value combinations that shall be unique for each row are unique.
If a second row is found, that uses the same combination of values as a
previous one, the second one is removed.
"""
df = df.copy()
uniques = []
@@ -299,13 +307,32 @@ class TableImporter(object):
return df
def check_datatype(self, df, filename=None):
"""
Check for each column whether non-null fields have the correct
datatype.
"""
def check_datatype(self, df, filename=None, strict=False):
"""Check for each column whether non-null fields have the correct datatype.
.. note::
If columns are integer, but should be float, this method converts the respective columns
in place.
Parameters
----------
strict: boolean, optional
If False (the default), try to convert columns, otherwise raise an error.
"""
for key, datatype in self.datatypes.items():
# Check for castable numeric types first: We unconditionally cast int to the default
# float, because CaosDB does not have different sizes anyway.
col_dtype = df.dtypes[key]
if not strict and not np.issubdtype(col_dtype, datatype):
issub = np.issubdtype
# These special cases should be fine.
if issub(col_dtype, np.integer) and issub(datatype, np.floating):
df[key] = df[key].astype(datatype)
# Now check each element
for idx, val in df.loc[
pd.notnull(df.loc[:, key]), key].iteritems():
@@ -326,6 +353,11 @@ class TableImporter(object):
Check in each row whether obligatory fields are empty or null.
Rows that have missing values are removed.
Returns
-------
out : pandas.DataFrame
The input DataFrame with incomplete rows removed.
"""
df = df.copy()
@@ -362,10 +394,26 @@ class TableImporter(object):
return df
def check_dataframe(self, df, filename):
def check_dataframe(self, df, filename=None, strict=False):
"""Check if the dataframe conforms to the restrictions.
Checked restrictions are: Columns, data types, uniqueness requirements.
Parameters
----------
df: pandas.DataFrame
The dataframe to be checked.
filename: string, optional
The file name, only used for output in case of problems.
strict: boolean, optional
If False (the default), try to convert columns, otherwise raise an error.
"""
self.check_columns(df, filename=filename)
df = self.check_missing(df, filename=filename)
self.check_datatype(df, filename=filename)
self.check_datatype(df, filename=filename, strict=strict)
if len(self.unique_keys) > 0:
df = self.check_unique(df, filename=filename)
@@ -378,8 +426,7 @@ class XLSImporter(TableImporter):
return self.read_xls(filename=filename, **kwargs)
def read_xls(self, filename, **kwargs):
"""
converts an xls file into a Pandas DataFrame.
"""Convert an xls file into a Pandas DataFrame.
The converters of the XLSImporter object are used.
Loading