From b392b0751229414af814e4fc923e4fac86b9a9b1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Henrik=20tom=20W=C3=B6rden?= <henrik@trineo.org> Date: Mon, 8 Nov 2021 16:41:13 +0100 Subject: [PATCH] ENH: allow to define datatypes instead of converters --- src/caosadvancedtools/table_importer.py | 35 +++++++++++++++++++++++-- unittests/test_table_importer.py | 29 +++++++++++++++----- 2 files changed, 55 insertions(+), 9 deletions(-) diff --git a/src/caosadvancedtools/table_importer.py b/src/caosadvancedtools/table_importer.py index cb61e838..830919a1 100755 --- a/src/caosadvancedtools/table_importer.py +++ b/src/caosadvancedtools/table_importer.py @@ -186,7 +186,8 @@ def string_in_list(val, options, ignore_case=True): class TableImporter(object): - def __init__(self, converters, obligatory_columns=None, unique_keys=None): + def __init__(self, converters, obligatory_columns=None, unique_keys=None, + datatypes=None): """ converters: dict with column names as keys and converter functions as values @@ -200,14 +201,27 @@ class TableImporter(object): unique_columns : list of column names that in combination must be unique; i.e. each row has a unique combination of values in those columns. + datatypes: dict with column names as keys and datatypes as values + All non-null values will be checked whether they have the + provided datatype. + This dict also defines what columns are required to exist + through the existing keys. 
""" + + if converters is None: + converters = {} + + if datatypes is None: + datatypes = {} + self.sup = SuppressKnown() - self.required_columns = list(converters.keys()) + self.required_columns = list(converters.keys())+list(datatypes.keys()) self.obligatory_columns = ([] if obligatory_columns is None else obligatory_columns) self.unique_keys = [] if unique_keys is None else unique_keys self.converters = converters + self.datatypes = datatypes def read_file(self, filename, **kwargs): raise NotImplementedError() @@ -265,6 +279,22 @@ class TableImporter(object): return df + def check_datatype(self, df, filename=None): + """ + Check for each column whether non-null fields have the correct + datatype. + """ + + for key, datatype in self.datatypes.items(): + for idx, val in df.loc[pd.notnull(df.loc[:, key]), key].iteritems(): + if not isinstance(val, datatype): + raise DataInconsistencyError( + "In row no. {rn} and column {c} of file '{fi}' the " + "datatype was {was} but it should be " + "{expected}".format(rn=idx, c=key, fi=filename, + was=type(val), expected=datatype) + ) + def check_missing(self, df, filename=None): """ Check in each row whether obligatory fields are empty or null. 
@@ -309,6 +339,7 @@ class TableImporter(object): def check_dataframe(self, df, filename): self.check_columns(df, filename=filename) df = self.check_missing(df, filename=filename) + self.check_datatype(df, filename=filename) if len(self.unique_keys) > 0: df = self.check_unique(df, filename=filename) diff --git a/unittests/test_table_importer.py b/unittests/test_table_importer.py index b574c867..fbfb13d5 100644 --- a/unittests/test_table_importer.py +++ b/unittests/test_table_importer.py @@ -27,16 +27,15 @@ import numpy as np import pandas as pd import pytest from caosadvancedtools.datainconsistency import DataInconsistencyError -from caosadvancedtools.table_importer import (XLSImporter, assure_name_format, +from caosadvancedtools.table_importer import (CSVImporter, TableImporter, + TSVImporter, XLSImporter, + assure_name_format, date_converter, datetime_converter, - TableImporter, - TSVImporter, - CSVImporter, incomplete_date_converter, + string_in_list, win_path_converter, win_path_list_converter, - string_in_list, yes_no_converter) @@ -143,20 +142,29 @@ class ConverterTest(unittest.TestCase): class TableImporterTest(unittest.TestCase): def setUp(self): self.importer_kwargs = dict( - converters={'a': str, 'b': int, 'c': float, 'd': yes_no_converter}, + converters={'c': float, 'd': yes_no_converter}, + datatypes={'a': str, 'b': int}, obligatory_columns=['a', 'b'], unique_keys=[('a', 'b')]) self.valid_df = pd.DataFrame( [['a', 1, 2.0, 'yes']], columns=['a', 'b', 'c', 'd']) def test_missing_col(self): - df = pd.DataFrame(columns=['a', 'b']) + # check missing from converters + df = pd.DataFrame(columns=['a', 'b', 'c']) + importer = TableImporter(**self.importer_kwargs) + self.assertRaises(ValueError, importer.check_columns, df) + # check missing from datatypes + df = pd.DataFrame(columns=['a', 'd', 'c']) importer = TableImporter(**self.importer_kwargs) self.assertRaises(ValueError, importer.check_columns, df) + # check valid importer.check_columns(self.valid_df) def 
test_missing_val(self): importer = TableImporter(**self.importer_kwargs) + # check valid importer.check_missing(self.valid_df) + # check invalid df = pd.DataFrame([[None, np.nan, 2.0, 'yes'], [None, 1, 2.0, 'yes'], ['a', np.nan, 2.0, 'yes'], @@ -167,6 +175,13 @@ class TableImporterTest(unittest.TestCase): self.assertEqual(df_new.shape[1], 4) self.assertEqual(df_new.iloc[0].b, 5) + def test_wrong_datatype(self): + importer = TableImporter(**self.importer_kwargs) + df = pd.DataFrame([[None, np.nan, 2.0, 'yes'], + [5, 1, 2.0, 'yes']], + columns=['a', 'b', 'c', 'd']) + self.assertRaises(DataInconsistencyError, importer.check_datatype, df) + def test_unique(self): importer = TableImporter(**self.importer_kwargs) importer.check_missing(self.valid_df) -- GitLab