diff --git a/CHANGELOG.md b/CHANGELOG.md index b3c3fa0553a00490a884c289a5ac02603d9c3772..6e381333581953a15a4fe533ced131121dc0569d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] ## ### Added ### +- TableImporter now accepts an `allow_missing_values_in` argument which allows obligatory + columns to have missing values ### Changed ### diff --git a/src/caosadvancedtools/table_importer.py b/src/caosadvancedtools/table_importer.py index 1f515e78e3ddbd198fa0336589a359ba9154f038..e5f6579fe2826c589f648d1d2812c04d620576f2 100755 --- a/src/caosadvancedtools/table_importer.py +++ b/src/caosadvancedtools/table_importer.py @@ -210,7 +210,7 @@ class TableImporter(): """ def __init__(self, converters, obligatory_columns=None, unique_keys=None, - datatypes=None): + datatypes=None, allow_missing_values_in=None): """ Parameters ---------- @@ -221,7 +221,7 @@ class TableImporter(): value check is not necessary. obligatory_columns : list, optional - List of column names, each listed column must not have missing values. + List of column names, each listed column must exist and must not have missing values. unique_keys : list, optional List of column names that in combination must be unique: each row has a unique @@ -232,16 +232,22 @@ class TableImporter(): checked whether they have the provided datatype. This dict also defines what columns are required to exist throught the existing keys. 
+ allow_missing_values_in : list, optional + List of (obligatory) column names which may have missing (NULL) values + """ if converters is None: converters = {} + if allow_missing_values_in is None: + allow_missing_values_in = [] + if datatypes is None: datatypes = {} self.sup = SuppressKnown() - self.required_columns = list(converters.keys())+list(datatypes.keys()) + self.allow_missing_values_in = allow_missing_values_in self.obligatory_columns = ([] if obligatory_columns is None else obligatory_columns) @@ -263,7 +269,7 @@ class TableImporter(): """ - for col in self.required_columns: + for col in self.obligatory_columns+list(self.converters.keys())+list(self.datatypes.keys()): if col not in df.columns: errmsg = "Column '{}' missing in ".format(col) errmsg += ("\n{}.\n".format(filename) if filename @@ -333,8 +339,7 @@ class TableImporter(): df[key] = df[key].astype(datatype) # Now check each element - for idx, val in df.loc[ - pd.notnull(df.loc[:, key]), key].iteritems(): + for idx, val in df.loc[pd.notnull(df.loc[:, key]), key].iteritems(): if not isinstance(val, datatype): msg = ( @@ -364,8 +369,8 @@ class TableImporter(): for index, row in df.iterrows(): # if none of the relevant information is given, skip - if np.array([pd.isnull(row.loc[key]) for key in - self.obligatory_columns]).all(): + if np.array([pd.isnull(row.loc[key]) for key in self.obligatory_columns + if key not in self.allow_missing_values_in]).all(): df = df.drop(index) diff --git a/unittests/test_table_importer.py b/unittests/test_table_importer.py index 70f0f87f8706d72c386b18f54b7a9a10908eb477..404043588262780f945d2b0caf3ecf20b1b7a338 100644 --- a/unittests/test_table_importer.py +++ b/unittests/test_table_importer.py @@ -146,9 +146,11 @@ class TableImporterTest(unittest.TestCase): self.importer_kwargs = dict( converters={'c': float, 'd': yes_no_converter}, datatypes={'a': str, 'b': int}, - obligatory_columns=['a', 'b'], unique_keys=[('a', 'b')]) + obligatory_columns=['a', 'b'], 
unique_keys=[('a', 'b')], + allow_missing_values_in=['e'], + ) self.valid_df = pd.DataFrame( - [['a', 1, 2.0, 'yes']], columns=['a', 'b', 'c', 'd']) + [['a', 1, 2.0, 'yes', np.nan]], columns=['a', 'b', 'c', 'd', 'e']) def test_missing_col(self): # check missing from converters @@ -184,6 +186,14 @@ class TableImporterTest(unittest.TestCase): columns=['a', 'b', 'c', 'd']) self.assertRaises(DataInconsistencyError, importer.check_datatype, df) + def test_allow_missing(self): + importer = TableImporter(**self.importer_kwargs) + importer.check_missing(self.valid_df) + df = pd.DataFrame([['b', np.nan, 3.0, 'no'], ['b', 5, 3.0, 'no']], + columns=['a', 'b', 'c', 'd']) + df_new = importer.check_unique(df) + self.assertEqual(df_new.shape[0], 2) + def test_unique(self): importer = TableImporter(**self.importer_kwargs) importer.check_missing(self.valid_df)