Skip to content
Snippets Groups Projects

ENH: allow to define datatypes instead of converters

Merged Henrik tom Wörden requested to merge f-datatypes into dev
3 files
+ 53
4
Compare changes
  • Side-by-side
  • Inline
Files
3
@@ -202,7 +202,8 @@ def string_in_list(val, options, ignore_case=True):
@@ -202,7 +202,8 @@ def string_in_list(val, options, ignore_case=True):
class TableImporter(object):
class TableImporter(object):
def __init__(self, converters, obligatory_columns=None, unique_keys=None,
             datatypes=None):
    """Initialize the importer with per-column validation rules.

    Parameters
    ----------
    converters : dict
        Column names as keys and converter functions as values.  The
        keys also define columns that are required to exist.
    obligatory_columns : list, optional
        Names of columns whose fields must not be empty or null.
    unique_keys : list, optional
        Column names that in combination must be unique, i.e. each row
        has a unique combination of values in those columns.
    datatypes : dict, optional
        Column names as keys and datatypes as values.  All non-null
        values will be checked for whether they have the provided
        datatype.  This dict also defines what columns are required to
        exist through the existing keys.
    """
    if converters is None:
        converters = {}

    if datatypes is None:
        datatypes = {}

    self.sup = SuppressKnown()
    # Columns named in either dict must be present in the table.
    self.required_columns = list(converters.keys()) + list(datatypes.keys())
    self.obligatory_columns = ([]
                               if obligatory_columns is None
                               else obligatory_columns)
    self.unique_keys = [] if unique_keys is None else unique_keys
    self.converters = converters
    self.datatypes = datatypes
def read_file(self, filename, **kwargs):
    """Read ``filename`` into a DataFrame.

    Abstract hook: concrete importer subclasses must override this with
    a format-specific implementation.
    """
    raise NotImplementedError()
@@ -281,6 +295,22 @@ class TableImporter(object):
@@ -281,6 +295,22 @@ class TableImporter(object):
return df
return df
 
def check_datatype(self, df, filename=None):
    """Check for each configured column that all non-null fields have
    the correct datatype.

    Parameters
    ----------
    df : pandas.DataFrame
        The table to validate.
    filename : str, optional
        Name of the originating file; used only in error messages.

    Raises
    ------
    DataInconsistencyError
        If a non-null value's type does not match the datatype
        configured for its column in ``self.datatypes``.
    """
    for key, datatype in self.datatypes.items():
        # Only validate cells that actually hold a value; null fields
        # are the concern of check_missing, not of this check.
        # NOTE: Series.iteritems() was removed in pandas 2.0; .items()
        # is the drop-in replacement.
        for idx, val in df.loc[pd.notnull(df.loc[:, key]), key].items():
            if not isinstance(val, datatype):
                raise DataInconsistencyError(
                    "In row no. {rn} and column {c} of file '{fi}' the "
                    "datatype was {was} but it should be "
                    "{expected}".format(rn=idx, c=key, fi=filename,
                                        was=type(val), expected=datatype)
                )
 
def check_missing(self, df, filename=None):
def check_missing(self, df, filename=None):
"""
"""
Check in each row whether obligatory fields are empty or null.
Check in each row whether obligatory fields are empty or null.
@@ -325,6 +355,7 @@ class TableImporter(object):
@@ -325,6 +355,7 @@ class TableImporter(object):
def check_dataframe(self, df, filename):
def check_dataframe(self, df, filename):
self.check_columns(df, filename=filename)
self.check_columns(df, filename=filename)
df = self.check_missing(df, filename=filename)
df = self.check_missing(df, filename=filename)
 
self.check_datatype(df, filename=filename)
if len(self.unique_keys) > 0:
if len(self.unique_keys) > 0:
df = self.check_unique(df, filename=filename)
df = self.check_unique(df, filename=filename)
Loading