Skip to content
Snippets Groups Projects
Commit cd5d44eb authored by Henrik tom Wörden's avatar Henrik tom Wörden
Browse files

ENH: allow to provide required columns explicitly

parent 9ff30718
No related branches found
No related tags found
2 merge requests: !73 "MAINT: change wording of TableImporter argument and allow converters and...", !70 "ENH: allow to provide required columns explicitly"
Pipeline #35286 failed
...@@ -7,6 +7,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ...@@ -7,6 +7,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
## [Unreleased] ## ## [Unreleased] ##
### Added ### ### Added ###
- TableImporter now accepts an `allow_missing_values_in` argument, which allows obligatory
columns to contain missing values
### Changed ### ### Changed ###
......
...@@ -210,7 +210,7 @@ class TableImporter(): ...@@ -210,7 +210,7 @@ class TableImporter():
""" """
def __init__(self, converters, obligatory_columns=None, unique_keys=None, def __init__(self, converters, obligatory_columns=None, unique_keys=None,
datatypes=None): datatypes=None, allow_missing_values_in=None):
""" """
Parameters Parameters
---------- ----------
...@@ -221,7 +221,7 @@ class TableImporter(): ...@@ -221,7 +221,7 @@ class TableImporter():
value check is not necessary. value check is not necessary.
obligatory_columns : list, optional obligatory_columns : list, optional
List of column names, each listed column must not have missing values. List of column names, each listed column must exist and must not have missing values.
unique_keys : list, optional unique_keys : list, optional
List of column names that in combination must be unique: each row has a unique List of column names that in combination must be unique: each row has a unique
...@@ -232,16 +232,22 @@ class TableImporter(): ...@@ -232,16 +232,22 @@ class TableImporter():
checked whether they have the provided datatype. This dict also defines what columns are checked whether they have the provided datatype. This dict also defines what columns are
required to exist through the existing keys. required to exist through the existing keys.
allow_missing_values_in : list, optional
List of (obligatory) column names which may have missing (NULL) values
""" """
if converters is None: if converters is None:
converters = {} converters = {}
if allow_missing_values_in is None:
allow_missing_values_in = []
if datatypes is None: if datatypes is None:
datatypes = {} datatypes = {}
self.sup = SuppressKnown() self.sup = SuppressKnown()
self.required_columns = list(converters.keys())+list(datatypes.keys()) self.allow_missing_values_in = allow_missing_values_in
self.obligatory_columns = ([] self.obligatory_columns = ([]
if obligatory_columns is None if obligatory_columns is None
else obligatory_columns) else obligatory_columns)
...@@ -263,7 +269,7 @@ class TableImporter(): ...@@ -263,7 +269,7 @@ class TableImporter():
""" """
for col in self.required_columns: for col in self.obligatory_columns+list(self.converters.keys())+list(self.datatypes.keys()):
if col not in df.columns: if col not in df.columns:
errmsg = "Column '{}' missing in ".format(col) errmsg = "Column '{}' missing in ".format(col)
errmsg += ("\n{}.\n".format(filename) if filename errmsg += ("\n{}.\n".format(filename) if filename
...@@ -333,8 +339,7 @@ class TableImporter(): ...@@ -333,8 +339,7 @@ class TableImporter():
df[key] = df[key].astype(datatype) df[key] = df[key].astype(datatype)
# Now check each element # Now check each element
for idx, val in df.loc[ for idx, val in df.loc[pd.notnull(df.loc[:, key]), key].iteritems():
pd.notnull(df.loc[:, key]), key].iteritems():
if not isinstance(val, datatype): if not isinstance(val, datatype):
msg = ( msg = (
...@@ -364,8 +369,8 @@ class TableImporter(): ...@@ -364,8 +369,8 @@ class TableImporter():
for index, row in df.iterrows(): for index, row in df.iterrows():
# if none of the relevant information is given, skip # if none of the relevant information is given, skip
if np.array([pd.isnull(row.loc[key]) for key in if np.array([pd.isnull(row.loc[key]) for key in self.obligatory_columns
self.obligatory_columns]).all(): if key not in self.allow_missing_values_in]).all():
df = df.drop(index) df = df.drop(index)
......
...@@ -146,9 +146,11 @@ class TableImporterTest(unittest.TestCase): ...@@ -146,9 +146,11 @@ class TableImporterTest(unittest.TestCase):
self.importer_kwargs = dict( self.importer_kwargs = dict(
converters={'c': float, 'd': yes_no_converter}, converters={'c': float, 'd': yes_no_converter},
datatypes={'a': str, 'b': int}, datatypes={'a': str, 'b': int},
obligatory_columns=['a', 'b'], unique_keys=[('a', 'b')]) obligatory_columns=['a', 'b'], unique_keys=[('a', 'b')],
allow_missing_values_in=['e'],
)
self.valid_df = pd.DataFrame( self.valid_df = pd.DataFrame(
[['a', 1, 2.0, 'yes']], columns=['a', 'b', 'c', 'd']) [['a', 1, 2.0, 'yes', np.nan]], columns=['a', 'b', 'c', 'd', 'e'])
def test_missing_col(self): def test_missing_col(self):
# check missing from converters # check missing from converters
...@@ -184,6 +186,14 @@ class TableImporterTest(unittest.TestCase): ...@@ -184,6 +186,14 @@ class TableImporterTest(unittest.TestCase):
columns=['a', 'b', 'c', 'd']) columns=['a', 'b', 'c', 'd'])
self.assertRaises(DataInconsistencyError, importer.check_datatype, df) self.assertRaises(DataInconsistencyError, importer.check_datatype, df)
def test_allow_missing(self):
importer = TableImporter(**self.importer_kwargs)
importer.check_missing(self.valid_df)
df = pd.DataFrame([['b', np.nan, 3.0, 'no'], ['b', 5, 3.0, 'no']],
columns=['a', 'b', 'c', 'd'])
df_new = importer.check_unique(df)
self.assertEqual(df_new.shape[0], 2)
def test_unique(self): def test_unique(self):
importer = TableImporter(**self.importer_kwargs) importer = TableImporter(**self.importer_kwargs)
importer.check_missing(self.valid_df) importer.check_missing(self.valid_df)
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment