Commit 81ccbd95 authored by Henrik tom Wörden


MAINT: change wording of TableImporter argument and allow converters and datatypes for nonexisting columns
parent bd8b9ed3
2 merge requests: !73 MAINT: change wording of TableImporter argument and allow converters and datatypes for nonexisting columns, !70 ENH: allow to provide required columns explicitly
Pipeline #35526 passed
@@ -7,10 +7,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 ## [Unreleased] ##
 ### Added ###
-- TableImporter now accepts a `allow_missing_values_in` argument which allows to have obligatory
-  columns with missing values
+- TableImporter now accepts an `existing_columns` argument which demands that certain columns exist,
+  although they may have missing values
 ### Changed ###
+- The `converters` and `datatypes` arguments of TableImporter may now contain keys for non-existing columns
 ### Deprecated ###
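For illustration, a minimal constructor call using the new keyword. This is a sketch only: the import path `caosadvancedtools.table_importer` and the column names are assumptions for the example, not part of this commit.

```python
# Sketch only: module path and column names are assumed for illustration.
from caosadvancedtools.table_importer import TableImporter

importer = TableImporter(
    converters={"price": float},           # may now name columns that are absent from the file
    datatypes={"name": str, "count": int},
    obligatory_columns=["name", "count"],   # must exist and must not be empty
    existing_columns=["comment"],           # must exist, but may contain missing (NULL) values
)
```

Compared to the removed `allow_missing_values_in`, the new keyword names the columns that merely have to be present, rather than marking obligatory columns that may be empty.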
@@ -210,7 +210,7 @@ class TableImporter():
     """

     def __init__(self, converters, obligatory_columns=None, unique_keys=None,
-                 datatypes=None, allow_missing_values_in=None):
+                 datatypes=None, existing_columns=None):
         """
         Parameters
         ----------
@@ -232,28 +232,31 @@ class TableImporter():
             checked whether they have the provided datatype. This dict also defines what columns are
             required to exist through the existing keys.
-        allow_missing_values_in : list, optional
-            List of (obligatory) column names which may have missing (NULL) values
+        existing_columns : list, optional
+            List of column names that must exist but may have missing (NULL) values
         """
         if converters is None:
             converters = {}
+        self.converters = converters
-        if allow_missing_values_in is None:
-            allow_missing_values_in = []
+        if obligatory_columns is None:
+            obligatory_columns = []
+        self.obligatory_columns = obligatory_columns
+        if unique_keys is None:
+            unique_keys = []
+        self.unique_keys = unique_keys
+        if datatypes is None:
+            datatypes = {}
+        self.datatypes = datatypes
+        if existing_columns is None:
+            existing_columns = []
+        self.existing_columns = existing_columns
         self.sup = SuppressKnown()
-        self.allow_missing_values_in = allow_missing_values_in
-        self.obligatory_columns = ([]
-                                   if obligatory_columns is None
-                                   else obligatory_columns)
-        self.unique_keys = [] if unique_keys is None else unique_keys
-        self.converters = converters
-        self.datatypes = datatypes

     def read_file(self, filename, **kwargs):
         raise NotImplementedError()
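The reorganized assignments above all follow the same `None`-to-empty-container idiom, which avoids Python's mutable-default-argument pitfall. A condensed stand-in for the pattern (not the verbatim class) looks like this:

```python
# Condensed stand-in for the default handling in __init__ above; names mirror the diff.
class InitDefaultsSketch:
    def __init__(self, converters=None, obligatory_columns=None, unique_keys=None,
                 datatypes=None, existing_columns=None):
        self.converters = {} if converters is None else converters
        self.obligatory_columns = [] if obligatory_columns is None else obligatory_columns
        self.unique_keys = [] if unique_keys is None else unique_keys
        self.datatypes = {} if datatypes is None else datatypes
        self.existing_columns = [] if existing_columns is None else existing_columns
```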
@@ -269,7 +272,7 @@ class TableImporter():
         """
-        for col in self.obligatory_columns+list(self.converters.keys())+list(self.datatypes.keys()):
+        for col in self.obligatory_columns+self.existing_columns:
             if col not in df.columns:
                 errmsg = "Column '{}' missing in ".format(col)
                 errmsg += ("\n{}.\n".format(filename) if filename
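With the new loop, `check_columns` only demands the union of `obligatory_columns` and `existing_columns`; a key that appears solely in `converters` or `datatypes` no longer triggers the missing-column error. A standalone sketch of that check (a hypothetical helper, not the class method):

```python
import pandas as pd

def check_columns_sketch(df, obligatory_columns, existing_columns):
    # Mirror of the loop above: every required column must be present in df.
    for col in obligatory_columns + existing_columns:
        if col not in df.columns:
            raise ValueError("Column '{}' missing in the DataFrame.".format(col))

df = pd.DataFrame(columns=["a", "b", "e"])
check_columns_sketch(df, obligatory_columns=["a", "b"], existing_columns=["e"])  # passes
# A converters/datatypes key such as 'x' is no longer checked here.
```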
@@ -329,6 +332,8 @@ class TableImporter():
         """
         for key, datatype in self.datatypes.items():
+            if key not in df.columns:
+                continue
             # Check for castable numeric types first: We unconditionally cast int to the default
             # float, because CaosDB does not have different sizes anyway.
             col_dtype = df.dtypes[key]
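The added guard makes `check_datatype` silently skip `datatypes` entries whose column is not in the table. A small standalone illustration; the int-to-float cast shown here is a simplification of the behaviour described in the comment above:

```python
import numpy as np
import pandas as pd

def check_datatype_sketch(df, datatypes):
    for key, expected in datatypes.items():
        if key not in df.columns:
            continue  # new behaviour: tolerate entries for non-existing columns
        if expected is float and np.issubdtype(df.dtypes[key], np.integer):
            df[key] = df[key].astype(float)  # ints are acceptable where float is requested

df = pd.DataFrame({"a": ["foo"], "b": [1]})
check_datatype_sketch(df, {"b": float, "x": int})  # 'x' is absent and simply skipped
assert df["b"].dtype == float
```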
@@ -369,8 +374,7 @@ class TableImporter():
         for index, row in df.iterrows():
             # if none of the relevant information is given, skip
-            if np.array([pd.isnull(row.loc[key]) for key in self.obligatory_columns
-                         if key not in self.allow_missing_values_in]).all():
+            if np.array([pd.isnull(row.loc[key]) for key in self.obligatory_columns]).all():
                 df = df.drop(index)
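After this change a row is only skipped when every obligatory column is empty; the former `allow_missing_values_in` filter is gone. A standalone illustration of the dropping logic:

```python
import numpy as np
import pandas as pd

def drop_empty_rows_sketch(df, obligatory_columns):
    # Mirror of the loop above: drop rows whose obligatory columns are all NULL.
    for index, row in df.iterrows():
        if np.array([pd.isnull(row.loc[key]) for key in obligatory_columns]).all():
            df = df.drop(index)
    return df

df = pd.DataFrame([["a", 1.0], [np.nan, np.nan], [np.nan, 2.0]], columns=["a", "b"])
df = drop_empty_rows_sketch(df, obligatory_columns=["a", "b"])
assert len(df) == 2  # only the completely empty middle row was dropped
```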
@@ -454,7 +458,10 @@ class XLSImporter(TableImporter):
                 "All but the first are being ignored.".format(filename))
         try:
-            df = xls_file.parse(converters=self.converters, **kwargs)
+            tmpdf = xls_file.parse(**kwargs)
+            applicable_converters = {k: v for k, v in self.converters.items()
+                                     if k in tmpdf.columns}
+            df = xls_file.parse(converters=applicable_converters, **kwargs)
         except Exception as e:
             logger.warning(
                 "Cannot parse {}.\n{}".format(filename, e),
@@ -470,7 +477,11 @@
 class CSVImporter(TableImporter):
     def read_file(self, filename, sep=",", **kwargs):
         try:
-            df = pd.read_csv(filename, sep=sep, converters=self.converters,
+            tmpdf = pd.read_csv(filename, sep=sep, converters=self.converters,
+                                **kwargs)
+            applicable_converters = {k: v for k, v in self.converters.items()
+                                     if k in tmpdf.columns}
+            df = pd.read_csv(filename, sep=sep, converters=applicable_converters,
                              **kwargs)
         except ValueError as ve:
             logger.warning(
@@ -487,6 +498,10 @@
 class TSVImporter(TableImporter):
     def read_file(self, filename, **kwargs):
         try:
+            tmpdf = pd.read_csv(filename, sep="\t", converters=self.converters,
+                                **kwargs)
+            applicable_converters = {k: v for k, v in self.converters.items()
+                                     if k in tmpdf.columns}
             df = pd.read_csv(filename, sep="\t", converters=self.converters,
                              **kwargs)
         except ValueError as ve:
@@ -144,21 +144,21 @@ class ConverterTest(unittest.TestCase):
 class TableImporterTest(unittest.TestCase):
     def setUp(self):
         self.importer_kwargs = dict(
-            converters={'c': float, 'd': yes_no_converter},
-            datatypes={'a': str, 'b': int},
+            converters={'c': float, 'd': yes_no_converter, 'x': float},  # x does not exist
+            datatypes={'a': str, 'b': int, 'x': int},  # x does not exist
             obligatory_columns=['a', 'b'], unique_keys=[('a', 'b')],
-            allow_missing_values_in=['e'],
+            existing_columns=['e'],
         )
         self.valid_df = pd.DataFrame(
             [['a', 1, 2.0, 'yes', np.nan]], columns=['a', 'b', 'c', 'd', 'e'])

     def test_missing_col(self):
-        # check missing from converters
-        df = pd.DataFrame(columns=['a', 'b', 'c'])
+        # check missing from obligatory
+        df = pd.DataFrame(columns=['a', 'e'])
         importer = TableImporter(**self.importer_kwargs)
         self.assertRaises(ValueError, importer.check_columns, df)
-        # check missing from datatypes
-        df = pd.DataFrame(columns=['a', 'd', 'c'])
+        # check missing from existing
+        df = pd.DataFrame(columns=['a', 'b'])
         importer = TableImporter(**self.importer_kwargs)
         self.assertRaises(ValueError, importer.check_columns, df)
         # check valid
@@ -186,14 +186,6 @@ class TableImporterTest(unittest.TestCase):
                           columns=['a', 'b', 'c', 'd'])
         self.assertRaises(DataInconsistencyError, importer.check_datatype, df)

-    def test_allow_missing(self):
-        importer = TableImporter(**self.importer_kwargs)
-        importer.check_missing(self.valid_df)
-        df = pd.DataFrame([['b', np.nan, 3.0, 'no'], ['b', 5, 3.0, 'no']],
-                          columns=['a', 'b', 'c', 'd'])
-        df_new = importer.check_unique(df)
-        self.assertEqual(df_new.shape[0], 2)
-
     def test_unique(self):
         importer = TableImporter(**self.importer_kwargs)
         importer.check_missing(self.valid_df)