ENH: allow required columns to be provided explicitly

cd5d44eb · Henrik tom Wörden · 9ff30718 · cd5d44eb · cd5d44eb · cd5d44eb
Commit cd5d44eb authored 2 years ago by Henrik tom Wörden
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 ## [Unreleased] ##

 ### Added ###
+- TableImporter now accepts an `allow_missing_values_in` argument, which allows obligatory
+  columns to contain missing values

 ### Changed ###


--- a/src/caosadvancedtools/table_importer.py
+++ b/src/caosadvancedtools/table_importer.py
@@ -210,7 +210,7 @@ class TableImporter():
    """

    def __init__(self, converters, obligatory_columns=None, unique_keys=None,
-                 datatypes=None):
+                 datatypes=None, allow_missing_values_in=None):
        """
        Parameters
        ----------
@@ -221,7 +221,7 @@ class TableImporter():
          value check is not necessary.

        obligatory_columns : list, optional
-          List of column names, each listed column must not have missing values.
+          List of column names, each listed column must exist and must not have missing values.

        unique_keys : list, optional
          List of column names that in combination must be unique: each row has a unique
@@ -232,16 +232,22 @@ class TableImporter():
          checked whether they have the provided datatype.  This dict also defines what columns are
          required to exist through the existing keys.

+        allow_missing_values_in : list, optional
+          List of (obligatory) column names which may have missing (NULL) values
+
        """

        if converters is None:
            converters = {}

+        if allow_missing_values_in is None:
+            allow_missing_values_in = []
+
        if datatypes is None:
            datatypes = {}

        self.sup = SuppressKnown()
-        self.required_columns = list(converters.keys())+list(datatypes.keys())
+        self.allow_missing_values_in = allow_missing_values_in
        self.obligatory_columns = ([]
                                   if obligatory_columns is None
                                   else obligatory_columns)
@@ -263,7 +269,7 @@ class TableImporter():

        """

-        for col in self.required_columns:
+        for col in self.obligatory_columns+list(self.converters.keys())+list(self.datatypes.keys()):
            if col not in df.columns:
                errmsg = "Column '{}' missing in ".format(col)
                errmsg += ("\n{}.\n".format(filename) if filename
@@ -333,8 +339,7 @@ class TableImporter():
                    df[key] = df[key].astype(datatype)

            # Now check each element
-            for idx, val in df.loc[
-                    pd.notnull(df.loc[:, key]), key].iteritems():
+            for idx, val in df.loc[pd.notnull(df.loc[:, key]), key].iteritems():

                if not isinstance(val, datatype):
                    msg = (
@@ -364,8 +369,8 @@ class TableImporter():
        for index, row in df.iterrows():
            # if none of the relevant information is given, skip

-            if np.array([pd.isnull(row.loc[key]) for key in
-                         self.obligatory_columns]).all():
+            if np.array([pd.isnull(row.loc[key]) for key in self.obligatory_columns
+                         if key not in self.allow_missing_values_in]).all():

                df = df.drop(index)


--- a/unittests/test_table_importer.py
+++ b/unittests/test_table_importer.py
@@ -146,9 +146,11 @@ class TableImporterTest(unittest.TestCase):
        self.importer_kwargs = dict(
            converters={'c': float, 'd': yes_no_converter},
            datatypes={'a': str, 'b': int},
-            obligatory_columns=['a', 'b'], unique_keys=[('a', 'b')])
+            obligatory_columns=['a', 'b'], unique_keys=[('a', 'b')],
+            allow_missing_values_in=['e'],
+        )
        self.valid_df = pd.DataFrame(
-            [['a', 1, 2.0, 'yes']], columns=['a', 'b', 'c', 'd'])
+            [['a', 1, 2.0, 'yes', np.nan]], columns=['a', 'b', 'c', 'd', 'e'])

    def test_missing_col(self):
        # check missing from converters
@@ -184,6 +186,14 @@ class TableImporterTest(unittest.TestCase):
                          columns=['a', 'b', 'c', 'd'])
        self.assertRaises(DataInconsistencyError, importer.check_datatype, df)

+    def test_allow_missing(self):
+        importer = TableImporter(**self.importer_kwargs)
+        importer.check_missing(self.valid_df)
+        df = pd.DataFrame([['b', np.nan, 3.0, 'no'], ['b', 5, 3.0, 'no']],
+                          columns=['a', 'b', 'c', 'd'])
+        df_new = importer.check_unique(df)
+        self.assertEqual(df_new.shape[0], 2)
+
    def test_unique(self):
        importer = TableImporter(**self.importer_kwargs)
        importer.check_missing(self.valid_df)