Commit 45c3dbc2 authored by Florian Spreckelsen

Merge branch 'f-datatypes' into 'dev'

ENH: allow to define datatypes instead of converters

See merge request !24
parents 7706438c ff13aecb
Pipeline #15813 passed
@@ -9,6 +9,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 ### Added ###
 - `check_reference_field` function to check whether entities with provided ids
   exist (for example when importing data from a table)
+- added the `datatypes` argument to `TableImporter` for columns that do not
+  need a special conversion function

 ### Changed ###
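For orientation, a minimal usage sketch of the new keyword follows, mirroring the test setup further below; the import path and the lambda standing in for `yes_no_converter` are assumptions, not part of this commit.

from caosadvancedtools.table_importer import TableImporter  # import path is an assumption

importer = TableImporter(
    # 'c' and 'd' still need real conversion functions ...
    converters={'c': float, 'd': lambda val: val == 'yes'},
    # ... while 'a' and 'b' are only type-checked, no conversion required
    datatypes={'a': str, 'b': int},
    obligatory_columns=['a', 'b'],
    unique_keys=[('a', 'b')],
)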
@@ -202,7 +202,8 @@ def string_in_list(val, options, ignore_case=True):
 class TableImporter(object):
-    def __init__(self, converters, obligatory_columns=None, unique_keys=None):
+    def __init__(self, converters, obligatory_columns=None, unique_keys=None,
+                 datatypes=None):
         """
         converters: dict with column names as keys and converter functions as
           values
@@ -216,14 +217,27 @@ class TableImporter(object):
         unique_columns : list of column names that in
           combination must be unique; i.e. each row has a
           unique combination of values in those columns.
+        datatypes: dict with column names as keys and datatypes as values
+          All non-null values will be checked whether they have the
+          provided datatype.
+          This dict also defines which columns are required to exist
+          through the existing keys.
         """
+        if converters is None:
+            converters = {}
+        if datatypes is None:
+            datatypes = {}
+
         self.sup = SuppressKnown()
-        self.required_columns = list(converters.keys())
+        self.required_columns = list(converters.keys())+list(datatypes.keys())
         self.obligatory_columns = ([]
                                    if obligatory_columns is None
                                    else obligatory_columns)
         self.unique_keys = [] if unique_keys is None else unique_keys
         self.converters = converters
+        self.datatypes = datatypes

     def read_file(self, filename, **kwargs):
         raise NotImplementedError()
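Because `required_columns` is now built from the keys of both `converters` and `datatypes`, declaring a column in `datatypes` alone is enough for `check_columns` to enforce its presence. A short sketch under the same import-path assumption:

import pandas as pd
from caosadvancedtools.table_importer import TableImporter  # import path is an assumption

importer = TableImporter(converters={}, datatypes={'a': str, 'b': int})
df = pd.DataFrame(columns=['a', 'c'])  # required column 'b' is missing
importer.check_columns(df)             # raises ValueError (cf. test_missing_col below)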
@@ -281,6 +295,22 @@ class TableImporter(object):
         return df

+    def check_datatype(self, df, filename=None):
+        """
+        Check for each column whether non-null fields have the correct
+        datatype.
+        """
+
+        for key, datatype in self.datatypes.items():
+            for idx, val in df.loc[pd.notnull(df.loc[:, key]), key].iteritems():
+                if not isinstance(val, datatype):
+                    raise DataInconsistencyError(
+                        "In row no. {rn} and column {c} of file '{fi}' the "
+                        "datatype was {was} but it should be "
+                        "{expected}".format(rn=idx, c=key, fi=filename,
+                                            was=type(val), expected=datatype)
+                    )
+
     def check_missing(self, df, filename=None):
         """
         Check in each row whether obligatory fields are empty or null.
@@ -325,6 +355,7 @@ class TableImporter(object):
     def check_dataframe(self, df, filename):
         self.check_columns(df, filename=filename)
         df = self.check_missing(df, filename=filename)
+        self.check_datatype(df, filename=filename)

         if len(self.unique_keys) > 0:
             df = self.check_unique(df, filename=filename)
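The check performed by `check_datatype` can be reproduced with pandas alone; this standalone sketch runs the same non-null, `isinstance`-based test (newer pandas versions spell `Series.iteritems` as `Series.items`).

import pandas as pd

datatypes = {'a': str}  # column 'a' must contain str values
df = pd.DataFrame([['x', 1], [5, 2]], columns=['a', 'b'])

for key, datatype in datatypes.items():
    # inspect only the non-null cells of each declared column
    for idx, val in df.loc[pd.notnull(df.loc[:, key]), key].items():
        if not isinstance(val, datatype):
            print(f"row {idx}, column '{key}': got {type(val)}, expected {datatype}")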
@@ -146,20 +146,29 @@ class ConverterTest(unittest.TestCase):
 class TableImporterTest(unittest.TestCase):
     def setUp(self):
         self.importer_kwargs = dict(
-            converters={'a': str, 'b': int, 'c': float, 'd': yes_no_converter},
+            converters={'c': float, 'd': yes_no_converter},
+            datatypes={'a': str, 'b': int},
             obligatory_columns=['a', 'b'], unique_keys=[('a', 'b')])
         self.valid_df = pd.DataFrame(
             [['a', 1, 2.0, 'yes']], columns=['a', 'b', 'c', 'd'])

     def test_missing_col(self):
-        df = pd.DataFrame(columns=['a', 'b'])
+        # check missing from converters
+        df = pd.DataFrame(columns=['a', 'b', 'c'])
         importer = TableImporter(**self.importer_kwargs)
         self.assertRaises(ValueError, importer.check_columns, df)
+        # check missing from datatypes
+        df = pd.DataFrame(columns=['a', 'd', 'c'])
+        importer = TableImporter(**self.importer_kwargs)
+        self.assertRaises(ValueError, importer.check_columns, df)
+        # check valid
         importer.check_columns(self.valid_df)

     def test_missing_val(self):
         importer = TableImporter(**self.importer_kwargs)
+        # check valid
         importer.check_missing(self.valid_df)
+        # check invalid
         df = pd.DataFrame([[None, np.nan, 2.0, 'yes'],
                            [None, 1, 2.0, 'yes'],
                            ['a', np.nan, 2.0, 'yes'],
@@ -170,6 +179,13 @@ class TableImporterTest(unittest.TestCase):
         self.assertEqual(df_new.shape[1], 4)
         self.assertEqual(df_new.iloc[0].b, 5)

+    def test_wrong_datatype(self):
+        importer = TableImporter(**self.importer_kwargs)
+        df = pd.DataFrame([[None, np.nan, 2.0, 'yes'],
+                           [5, 1, 2.0, 'yes']],
+                          columns=['a', 'b', 'c', 'd'])
+        self.assertRaises(DataInconsistencyError, importer.check_datatype, df)
+
     def test_unique(self):
         importer = TableImporter(**self.importer_kwargs)
         importer.check_missing(self.valid_df)
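A sketch of what the new `test_wrong_datatype` exercises: a value whose Python type does not match the declared datatype makes `check_datatype` raise a `DataInconsistencyError`. The import path and filename are assumptions.

import numpy as np
import pandas as pd
from caosadvancedtools.table_importer import TableImporter  # import path is an assumption

importer = TableImporter(
    converters={'c': float, 'd': lambda val: val == 'yes'},
    datatypes={'a': str, 'b': int},
    obligatory_columns=['a', 'b'],
    unique_keys=[('a', 'b')],
)
df = pd.DataFrame([[None, np.nan, 2.0, 'yes'],
                   [5, 1, 2.0, 'yes']],   # 'a' holds an int where a str is declared
                  columns=['a', 'b', 'c', 'd'])
try:
    importer.check_datatype(df, filename='example.csv')
except Exception as exc:  # the library raises DataInconsistencyError here
    print(type(exc).__name__, exc)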