Commit 45c3dbc2 authored by Florian Spreckelsen

Merge branch 'f-datatypes' into 'dev'

ENH: allow to define datatypes instead of converters

See merge request !24
parents 7706438c ff13aecb
Pipeline #15813 passed
@@ -9,6 +9,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 ### Added ###
 - `check_reference_field` function to check whether entities with provided ids
   exist (for example when importing data from a table)
+- added the `datatypes` argument to `TableImporter` for columns that do not
+  need a special conversion function

 ### Changed ###
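For orientation, a minimal usage sketch of the new keyword follows, mirroring the test setup further below; the import path and the lambda standing in for `yes_no_converter` are assumptions, not part of this commit.

from caosadvancedtools.table_importer import TableImporter  # import path is an assumption

importer = TableImporter(
    # 'c' and 'd' still need real conversion functions ...
    converters={'c': float, 'd': lambda val: val == 'yes'},
    # ... while 'a' and 'b' are only type-checked, no conversion required
    datatypes={'a': str, 'b': int},
    obligatory_columns=['a', 'b'],
    unique_keys=[('a', 'b')],
)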
@@ -202,7 +202,8 @@ def string_in_list(val, options, ignore_case=True):
 class TableImporter(object):
-    def __init__(self, converters, obligatory_columns=None, unique_keys=None):
+    def __init__(self, converters, obligatory_columns=None, unique_keys=None,
+                 datatypes=None):
         """
         converters: dict with column names as keys and converter functions as
           values
@@ -216,14 +217,27 @@ class TableImporter(object):
         unique_columns : list of column names that in
           combination must be unique; i.e. each row has a
           unique combination of values in those columns.
+        datatypes: dict with column names as keys and datatypes as values
+          All non-null values will be checked whether they have the
+          provided datatype.
+          This dict also defines which columns are required to exist
+          through the existing keys.
         """
+        if converters is None:
+            converters = {}
+        if datatypes is None:
+            datatypes = {}
+
         self.sup = SuppressKnown()
-        self.required_columns = list(converters.keys())
+        self.required_columns = list(converters.keys())+list(datatypes.keys())
         self.obligatory_columns = ([]
                                    if obligatory_columns is None
                                    else obligatory_columns)
         self.unique_keys = [] if unique_keys is None else unique_keys
         self.converters = converters
+        self.datatypes = datatypes

     def read_file(self, filename, **kwargs):
         raise NotImplementedError()
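Because `required_columns` is now built from the keys of both `converters` and `datatypes`, declaring a column in `datatypes` alone is enough for `check_columns` to enforce its presence. A short sketch under the same import-path assumption:

import pandas as pd
from caosadvancedtools.table_importer import TableImporter  # import path is an assumption

importer = TableImporter(converters={}, datatypes={'a': str, 'b': int})
df = pd.DataFrame(columns=['a', 'c'])  # required column 'b' is missing
importer.check_columns(df)             # raises ValueError (cf. test_missing_col below)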
@@ -281,6 +295,22 @@ class TableImporter(object):
         return df

+    def check_datatype(self, df, filename=None):
+        """
+        Check for each column whether non-null fields have the correct
+        datatype.
+        """
+
+        for key, datatype in self.datatypes.items():
+            for idx, val in df.loc[pd.notnull(df.loc[:, key]), key].iteritems():
+                if not isinstance(val, datatype):
+                    raise DataInconsistencyError(
+                        "In row no. {rn} and column {c} of file '{fi}' the "
+                        "datatype was {was} but it should be "
+                        "{expected}".format(rn=idx, c=key, fi=filename,
+                                            was=type(val), expected=datatype)
+                    )
+
     def check_missing(self, df, filename=None):
         """
         Check in each row whether obligatory fields are empty or null.
@@ -325,6 +355,7 @@ class TableImporter(object):
     def check_dataframe(self, df, filename):
         self.check_columns(df, filename=filename)
         df = self.check_missing(df, filename=filename)
+        self.check_datatype(df, filename=filename)

         if len(self.unique_keys) > 0:
             df = self.check_unique(df, filename=filename)
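The check performed by `check_datatype` can be reproduced with pandas alone; this standalone sketch runs the same non-null, `isinstance`-based test (newer pandas versions spell `Series.iteritems` as `Series.items`).

import pandas as pd

datatypes = {'a': str}  # column 'a' must contain str values
df = pd.DataFrame([['x', 1], [5, 2]], columns=['a', 'b'])

for key, datatype in datatypes.items():
    # inspect only the non-null cells of each declared column
    for idx, val in df.loc[pd.notnull(df.loc[:, key]), key].items():
        if not isinstance(val, datatype):
            print(f"row {idx}, column '{key}': got {type(val)}, expected {datatype}")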
@@ -146,20 +146,29 @@ class ConverterTest(unittest.TestCase):
 class TableImporterTest(unittest.TestCase):
     def setUp(self):
         self.importer_kwargs = dict(
-            converters={'a': str, 'b': int, 'c': float, 'd': yes_no_converter},
+            converters={'c': float, 'd': yes_no_converter},
+            datatypes={'a': str, 'b': int},
             obligatory_columns=['a', 'b'], unique_keys=[('a', 'b')])
         self.valid_df = pd.DataFrame(
             [['a', 1, 2.0, 'yes']], columns=['a', 'b', 'c', 'd'])

     def test_missing_col(self):
-        df = pd.DataFrame(columns=['a', 'b'])
+        # check missing from converters
+        df = pd.DataFrame(columns=['a', 'b', 'c'])
         importer = TableImporter(**self.importer_kwargs)
         self.assertRaises(ValueError, importer.check_columns, df)
+        # check missing from datatypes
+        df = pd.DataFrame(columns=['a', 'd', 'c'])
+        importer = TableImporter(**self.importer_kwargs)
+        self.assertRaises(ValueError, importer.check_columns, df)
+        # check valid
         importer.check_columns(self.valid_df)

     def test_missing_val(self):
         importer = TableImporter(**self.importer_kwargs)
+        # check valid
         importer.check_missing(self.valid_df)
+        # check invalid
         df = pd.DataFrame([[None, np.nan, 2.0, 'yes'],
                            [None, 1, 2.0, 'yes'],
                            ['a', np.nan, 2.0, 'yes'],
@@ -170,6 +179,13 @@ class TableImporterTest(unittest.TestCase):
         self.assertEqual(df_new.shape[1], 4)
         self.assertEqual(df_new.iloc[0].b, 5)

+    def test_wrong_datatype(self):
+        importer = TableImporter(**self.importer_kwargs)
+        df = pd.DataFrame([[None, np.nan, 2.0, 'yes'],
+                           [5, 1, 2.0, 'yes']],
+                          columns=['a', 'b', 'c', 'd'])
+        self.assertRaises(DataInconsistencyError, importer.check_datatype, df)
+
     def test_unique(self):
         importer = TableImporter(**self.importer_kwargs)
         importer.check_missing(self.valid_df)
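A sketch of what the new `test_wrong_datatype` exercises: a value whose Python type does not match the declared datatype makes `check_datatype` raise a `DataInconsistencyError`. The import path and filename are assumptions.

import numpy as np
import pandas as pd
from caosadvancedtools.table_importer import TableImporter  # import path is an assumption

importer = TableImporter(
    converters={'c': float, 'd': lambda val: val == 'yes'},
    datatypes={'a': str, 'b': int},
    obligatory_columns=['a', 'b'],
    unique_keys=[('a', 'b')],
)
df = pd.DataFrame([[None, np.nan, 2.0, 'yes'],
                   [5, 1, 2.0, 'yes']],   # 'a' holds an int where a str is declared
                  columns=['a', 'b', 'c', 'd'])
try:
    importer.check_datatype(df, filename='example.csv')
except Exception as exc:  # the library raises DataInconsistencyError here
    print(type(exc).__name__, exc)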