Skip to content
Snippets Groups Projects
Commit cd5d44eb authored by Henrik tom Wörden's avatar Henrik tom Wörden
Browse files

ENH: allow to provide required columns explicitly

parent 9ff30718
No related branches found
No related tags found
Loading
Pipeline #35286 failed
......@@ -7,6 +7,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
## [Unreleased] ##
### Added ###
- TableImporter now accepts an `allow_missing_values_in` argument which allows obligatory
columns to have missing values
### Changed ###
......
......@@ -210,7 +210,7 @@ class TableImporter():
"""
def __init__(self, converters, obligatory_columns=None, unique_keys=None,
datatypes=None):
datatypes=None, allow_missing_values_in=None):
"""
Parameters
----------
......@@ -221,7 +221,7 @@ class TableImporter():
value check is not necessary.
obligatory_columns : list, optional
List of column names, each listed column must not have missing values.
List of column names, each listed column must exist and must not have missing values.
unique_keys : list, optional
List of column names that in combination must be unique: each row has a unique
......@@ -232,16 +232,22 @@ class TableImporter():
checked whether they have the provided datatype. This dict also defines what columns are
required to exist through the existing keys.
allow_missing_values_in : list, optional
List of (obligatory) column names which may have missing (NULL) values
"""
if converters is None:
converters = {}
if allow_missing_values_in is None:
allow_missing_values_in = []
if datatypes is None:
datatypes = {}
self.sup = SuppressKnown()
self.required_columns = list(converters.keys())+list(datatypes.keys())
self.allow_missing_values_in = allow_missing_values_in
self.obligatory_columns = ([]
if obligatory_columns is None
else obligatory_columns)
......@@ -263,7 +269,7 @@ class TableImporter():
"""
for col in self.required_columns:
for col in self.obligatory_columns+list(self.converters.keys())+list(self.datatypes.keys()):
if col not in df.columns:
errmsg = "Column '{}' missing in ".format(col)
errmsg += ("\n{}.\n".format(filename) if filename
......@@ -333,8 +339,7 @@ class TableImporter():
df[key] = df[key].astype(datatype)
# Now check each element
for idx, val in df.loc[
pd.notnull(df.loc[:, key]), key].iteritems():
for idx, val in df.loc[pd.notnull(df.loc[:, key]), key].iteritems():
if not isinstance(val, datatype):
msg = (
......@@ -364,8 +369,8 @@ class TableImporter():
for index, row in df.iterrows():
# if none of the relevant information is given, skip
if np.array([pd.isnull(row.loc[key]) for key in
self.obligatory_columns]).all():
if np.array([pd.isnull(row.loc[key]) for key in self.obligatory_columns
if key not in self.allow_missing_values_in]).all():
df = df.drop(index)
......
......@@ -146,9 +146,11 @@ class TableImporterTest(unittest.TestCase):
self.importer_kwargs = dict(
converters={'c': float, 'd': yes_no_converter},
datatypes={'a': str, 'b': int},
obligatory_columns=['a', 'b'], unique_keys=[('a', 'b')])
obligatory_columns=['a', 'b'], unique_keys=[('a', 'b')],
allow_missing_values_in=['e'],
)
self.valid_df = pd.DataFrame(
[['a', 1, 2.0, 'yes']], columns=['a', 'b', 'c', 'd'])
[['a', 1, 2.0, 'yes', np.nan]], columns=['a', 'b', 'c', 'd', 'e'])
def test_missing_col(self):
# check missing from converters
......@@ -184,6 +186,14 @@ class TableImporterTest(unittest.TestCase):
columns=['a', 'b', 'c', 'd'])
self.assertRaises(DataInconsistencyError, importer.check_datatype, df)
def test_allow_missing(self):
importer = TableImporter(**self.importer_kwargs)
importer.check_missing(self.valid_df)
df = pd.DataFrame([['b', np.nan, 3.0, 'no'], ['b', 5, 3.0, 'no']],
columns=['a', 'b', 'c', 'd'])
df_new = importer.check_unique(df)
self.assertEqual(df_new.shape[0], 2)
def test_unique(self):
importer = TableImporter(**self.importer_kwargs)
importer.check_missing(self.valid_df)
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment