diff --git a/CHANGELOG.md b/CHANGELOG.md index 1d2e913ccc5443aa000f453fa697f99725d54596..76f8350ea556d710d6b9f17cb76f64631412c268 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Added ### - `check_reference_field` function to check whether entities with provided ids exits (for example when importing data from a table) +- added the `datatypes` argument to `TableImporter` for columns that do not + need a special conversion function ### Changed ### diff --git a/src/caosadvancedtools/table_importer.py b/src/caosadvancedtools/table_importer.py index c4f2df36edbaa20c9e5e2a3689e0638137f4ac68..26bb86b829c4444caf792697520d4733cae3e0bc 100755 --- a/src/caosadvancedtools/table_importer.py +++ b/src/caosadvancedtools/table_importer.py @@ -202,7 +202,8 @@ def string_in_list(val, options, ignore_case=True): class TableImporter(object): - def __init__(self, converters, obligatory_columns=None, unique_keys=None): + def __init__(self, converters, obligatory_columns=None, unique_keys=None, + datatypes=None): """ converters: dict with column names as keys and converter functions as values @@ -216,14 +217,27 @@ class TableImporter(object): unique_columns : list of column names that in combination must be unique; i.e. each row has a unique combination of values in those columns. + datatypes: dict with column names as keys and datatypes as values + All non-null values will be checked whether they have the + provided datatype. + This dict also defines what columns are required to exist + through the existing keys. 
""" + + if converters is None: + converters = {} + + if datatypes is None: + datatypes = {} + self.sup = SuppressKnown() - self.required_columns = list(converters.keys()) + self.required_columns = list(converters.keys())+list(datatypes.keys()) self.obligatory_columns = ([] if obligatory_columns is None else obligatory_columns) self.unique_keys = [] if unique_keys is None else unique_keys self.converters = converters + self.datatypes = datatypes def read_file(self, filename, **kwargs): raise NotImplementedError() @@ -281,6 +295,22 @@ class TableImporter(object): return df + def check_datatype(self, df, filename=None): + """ + Check for each column whether non-null fields are have the correct + datatype. + """ + + for key, datatype in self.datatypes.items(): + for idx, val in df.loc[pd.notnull(df.loc[:, key]), key].iteritems(): + if not isinstance(val, datatype): + raise DataInconsistencyError( + "In row no. {rn} and column {c} of file '{fi}' the " + "datatype was {was} but it should be " + "{expected}".format(rn=idx, c=key, fi=filename, + was=type(val), expected=datatype) + ) + def check_missing(self, df, filename=None): """ Check in each row whether obligatory fields are empty or null. 
@@ -325,6 +355,7 @@ class TableImporter(object): def check_dataframe(self, df, filename): self.check_columns(df, filename=filename) df = self.check_missing(df, filename=filename) + self.check_datatype(df, filename=filename) if len(self.unique_keys) > 0: df = self.check_unique(df, filename=filename) diff --git a/unittests/test_table_importer.py b/unittests/test_table_importer.py index eb841af800de86fdb1cf2d3af818e95ee6a9271c..9c8a379d8c12def32c04cf82c5e09c0f5f6f175c 100644 --- a/unittests/test_table_importer.py +++ b/unittests/test_table_importer.py @@ -146,20 +146,29 @@ class ConverterTest(unittest.TestCase): class TableImporterTest(unittest.TestCase): def setUp(self): self.importer_kwargs = dict( - converters={'a': str, 'b': int, 'c': float, 'd': yes_no_converter}, + converters={'c': float, 'd': yes_no_converter}, + datatypes={'a': str, 'b': int}, obligatory_columns=['a', 'b'], unique_keys=[('a', 'b')]) self.valid_df = pd.DataFrame( [['a', 1, 2.0, 'yes']], columns=['a', 'b', 'c', 'd']) def test_missing_col(self): - df = pd.DataFrame(columns=['a', 'b']) + # check missing from converters + df = pd.DataFrame(columns=['a', 'b', 'c']) importer = TableImporter(**self.importer_kwargs) self.assertRaises(ValueError, importer.check_columns, df) + # check missing from datatypes + df = pd.DataFrame(columns=['a', 'd', 'c']) + importer = TableImporter(**self.importer_kwargs) + self.assertRaises(ValueError, importer.check_columns, df) + # check valid importer.check_columns(self.valid_df) def test_missing_val(self): importer = TableImporter(**self.importer_kwargs) + # check valid importer.check_missing(self.valid_df) + # check invalid df = pd.DataFrame([[None, np.nan, 2.0, 'yes'], [None, 1, 2.0, 'yes'], ['a', np.nan, 2.0, 'yes'], @@ -170,6 +179,13 @@ class TableImporterTest(unittest.TestCase): self.assertEqual(df_new.shape[1], 4) self.assertEqual(df_new.iloc[0].b, 5) + def test_wrong_datatype(self): + importer = TableImporter(**self.importer_kwargs) + df = pd.DataFrame([[None, 
np.nan, 2.0, 'yes'], + [5, 1, 2.0, 'yes']], + columns=['a', 'b', 'c', 'd']) + self.assertRaises(DataInconsistencyError, importer.check_datatype, df) + def test_unique(self): importer = TableImporter(**self.importer_kwargs) importer.check_missing(self.valid_df)