Skip to content
Snippets Groups Projects
Commit b392b075 authored by Henrik tom Wörden's avatar Henrik tom Wörden
Browse files

ENH: allow to define datatypes instead of converters

parent 4f6c5669
No related branches found
No related tags found
1 merge request!24ENH: allow to define datatypes instead of converters
Pipeline #15806 canceled
...@@ -186,7 +186,8 @@ def string_in_list(val, options, ignore_case=True): ...@@ -186,7 +186,8 @@ def string_in_list(val, options, ignore_case=True):
class TableImporter(object): class TableImporter(object):
def __init__(self, converters, obligatory_columns=None, unique_keys=None): def __init__(self, converters, obligatory_columns=None, unique_keys=None,
datatypes=None):
""" """
converters: dict with column names as keys and converter functions as converters: dict with column names as keys and converter functions as
values values
...@@ -200,14 +201,27 @@ class TableImporter(object): ...@@ -200,14 +201,27 @@ class TableImporter(object):
unique_columns : list of column names that in unique_columns : list of column names that in
combination must be unique; i.e. each row has a combination must be unique; i.e. each row has a
unique combination of values in those columns. unique combination of values in those columns.
datatypes: dict with column names as keys and datatypes as values
All non-null values will be checked whether they have the
provided datatype.
The keys of this dict also define which columns are required to
exist.
""" """
if converters is None:
converters = {}
if datatypes is None:
datatypes = {}
self.sup = SuppressKnown() self.sup = SuppressKnown()
self.required_columns = list(converters.keys()) self.required_columns = list(converters.keys())+list(datatypes.keys())
self.obligatory_columns = ([] self.obligatory_columns = ([]
if obligatory_columns is None if obligatory_columns is None
else obligatory_columns) else obligatory_columns)
self.unique_keys = [] if unique_keys is None else unique_keys self.unique_keys = [] if unique_keys is None else unique_keys
self.converters = converters self.converters = converters
self.datatypes = datatypes
def read_file(self, filename, **kwargs): def read_file(self, filename, **kwargs):
raise NotImplementedError() raise NotImplementedError()
...@@ -265,6 +279,22 @@ class TableImporter(object): ...@@ -265,6 +279,22 @@ class TableImporter(object):
return df return df
def check_datatype(self, df, filename=None):
    """
    Check for each column whether non-null fields have the correct
    datatype.

    df: the pandas DataFrame to check; it is indexed with every key of
        self.datatypes, so those columns must exist.
    filename: only used to build the error message.

    Raises a DataInconsistencyError for the first non-null value that is
    not an instance of the datatype given for its column in
    self.datatypes. Returns None if every checked value is fine.
    """
    for key, datatype in self.datatypes.items():
        # Null entries are excluded from the check; only present values
        # are compared against the expected datatype.
        not_null = pd.notnull(df.loc[:, key])
        # Series.items() is used instead of Series.iteritems(): the latter
        # was deprecated in pandas 1.5 and removed in pandas 2.0.
        for idx, val in df.loc[not_null, key].items():
            if not isinstance(val, datatype):
                raise DataInconsistencyError(
                    "In row no. {rn} and column {c} of file '{fi}' the "
                    "datatype was {was} but it should be "
                    "{expected}".format(rn=idx, c=key, fi=filename,
                                        was=type(val), expected=datatype)
                )
def check_missing(self, df, filename=None): def check_missing(self, df, filename=None):
""" """
Check in each row whether obligatory fields are empty or null. Check in each row whether obligatory fields are empty or null.
...@@ -309,6 +339,7 @@ class TableImporter(object): ...@@ -309,6 +339,7 @@ class TableImporter(object):
def check_dataframe(self, df, filename): def check_dataframe(self, df, filename):
self.check_columns(df, filename=filename) self.check_columns(df, filename=filename)
df = self.check_missing(df, filename=filename) df = self.check_missing(df, filename=filename)
self.check_datatype(df, filename=filename)
if len(self.unique_keys) > 0: if len(self.unique_keys) > 0:
df = self.check_unique(df, filename=filename) df = self.check_unique(df, filename=filename)
......
...@@ -27,16 +27,15 @@ import numpy as np ...@@ -27,16 +27,15 @@ import numpy as np
import pandas as pd import pandas as pd
import pytest import pytest
from caosadvancedtools.datainconsistency import DataInconsistencyError from caosadvancedtools.datainconsistency import DataInconsistencyError
from caosadvancedtools.table_importer import (XLSImporter, assure_name_format, from caosadvancedtools.table_importer import (CSVImporter, TableImporter,
TSVImporter, XLSImporter,
assure_name_format,
date_converter, date_converter,
datetime_converter, datetime_converter,
TableImporter,
TSVImporter,
CSVImporter,
incomplete_date_converter, incomplete_date_converter,
string_in_list,
win_path_converter, win_path_converter,
win_path_list_converter, win_path_list_converter,
string_in_list,
yes_no_converter) yes_no_converter)
...@@ -143,20 +142,29 @@ class ConverterTest(unittest.TestCase): ...@@ -143,20 +142,29 @@ class ConverterTest(unittest.TestCase):
class TableImporterTest(unittest.TestCase): class TableImporterTest(unittest.TestCase):
def setUp(self): def setUp(self):
self.importer_kwargs = dict( self.importer_kwargs = dict(
converters={'a': str, 'b': int, 'c': float, 'd': yes_no_converter}, converters={'c': float, 'd': yes_no_converter},
datatypes={'a': str, 'b': int},
obligatory_columns=['a', 'b'], unique_keys=[('a', 'b')]) obligatory_columns=['a', 'b'], unique_keys=[('a', 'b')])
self.valid_df = pd.DataFrame( self.valid_df = pd.DataFrame(
[['a', 1, 2.0, 'yes']], columns=['a', 'b', 'c', 'd']) [['a', 1, 2.0, 'yes']], columns=['a', 'b', 'c', 'd'])
def test_missing_col(self): def test_missing_col(self):
df = pd.DataFrame(columns=['a', 'b']) # check missing from converters
df = pd.DataFrame(columns=['a', 'b', 'c'])
importer = TableImporter(**self.importer_kwargs)
self.assertRaises(ValueError, importer.check_columns, df)
# check missing from datatypes
df = pd.DataFrame(columns=['a', 'd', 'c'])
importer = TableImporter(**self.importer_kwargs) importer = TableImporter(**self.importer_kwargs)
self.assertRaises(ValueError, importer.check_columns, df) self.assertRaises(ValueError, importer.check_columns, df)
# check valid
importer.check_columns(self.valid_df) importer.check_columns(self.valid_df)
def test_missing_val(self): def test_missing_val(self):
importer = TableImporter(**self.importer_kwargs) importer = TableImporter(**self.importer_kwargs)
# check valid
importer.check_missing(self.valid_df) importer.check_missing(self.valid_df)
# check invalid
df = pd.DataFrame([[None, np.nan, 2.0, 'yes'], df = pd.DataFrame([[None, np.nan, 2.0, 'yes'],
[None, 1, 2.0, 'yes'], [None, 1, 2.0, 'yes'],
['a', np.nan, 2.0, 'yes'], ['a', np.nan, 2.0, 'yes'],
...@@ -167,6 +175,13 @@ class TableImporterTest(unittest.TestCase): ...@@ -167,6 +175,13 @@ class TableImporterTest(unittest.TestCase):
self.assertEqual(df_new.shape[1], 4) self.assertEqual(df_new.shape[1], 4)
self.assertEqual(df_new.iloc[0].b, 5) self.assertEqual(df_new.iloc[0].b, 5)
def test_wrong_datatype(self):
    # Row 0 holds only nulls in columns 'a' and 'b'; row 1 puts the
    # integer 5 into column 'a'. check_datatype is expected to raise.
    rows = [[None, np.nan, 2.0, 'yes'],
            [5, 1, 2.0, 'yes']]
    df = pd.DataFrame(rows, columns=['a', 'b', 'c', 'd'])
    importer = TableImporter(**self.importer_kwargs)
    with self.assertRaises(DataInconsistencyError):
        importer.check_datatype(df)
def test_unique(self): def test_unique(self):
importer = TableImporter(**self.importer_kwargs) importer = TableImporter(**self.importer_kwargs)
importer.check_missing(self.valid_df) importer.check_missing(self.valid_df)
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment