From b392b0751229414af814e4fc923e4fac86b9a9b1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Henrik=20tom=20W=C3=B6rden?= <henrik@trineo.org>
Date: Mon, 8 Nov 2021 16:41:13 +0100
Subject: [PATCH] ENH: allow to define datatypes instead of converters

---
 src/caosadvancedtools/table_importer.py | 35 +++++++++++++++++++++++--
 unittests/test_table_importer.py        | 29 +++++++++++++++-----
 2 files changed, 55 insertions(+), 9 deletions(-)

diff --git a/src/caosadvancedtools/table_importer.py b/src/caosadvancedtools/table_importer.py
index cb61e838..830919a1 100755
--- a/src/caosadvancedtools/table_importer.py
+++ b/src/caosadvancedtools/table_importer.py
@@ -186,7 +186,8 @@ def string_in_list(val, options, ignore_case=True):
 
 
 class TableImporter(object):
-    def __init__(self, converters, obligatory_columns=None, unique_keys=None):
+    def __init__(self, converters, obligatory_columns=None, unique_keys=None,
+                 datatypes=None):
         """
         converters: dict with column names as keys and converter functions as
                     values
@@ -200,14 +201,27 @@ class TableImporter(object):
         unique_columns : list of column names that in
                             combination must be unique; i.e. each row has a
                             unique combination of values in those columns.
+        datatypes: dict with column names as keys and datatypes as values
+                   All non-null values will be checked whether they have the
+                   provided datatype.
+                   This dict also defines what columns are required to exist
+                   through the existing keys.
         """
+
+        if converters is None:
+            converters = {}
+
+        if datatypes is None:
+            datatypes = {}
+
         self.sup = SuppressKnown()
-        self.required_columns = list(converters.keys())
+        self.required_columns = list(converters.keys())+list(datatypes.keys())
         self.obligatory_columns = ([]
                                    if obligatory_columns is None
                                    else obligatory_columns)
         self.unique_keys = [] if unique_keys is None else unique_keys
         self.converters = converters
+        self.datatypes = datatypes
 
     def read_file(self, filename, **kwargs):
         raise NotImplementedError()
@@ -265,6 +279,22 @@ class TableImporter(object):
 
         return df
 
+    def check_datatype(self, df, filename=None):
+        """
+        Check for each column whether non-null fields have the correct
+        datatype. Raises DataInconsistencyError on the first mismatch.
+        """
+        # Use Series.items(): iteritems() was removed in pandas 2.0.
+        for key, datatype in self.datatypes.items():
+            for idx, val in df.loc[pd.notnull(df.loc[:, key]), key].items():
+                if not isinstance(val, datatype):
+                    raise DataInconsistencyError(
+                        "In row no. {rn} and column {c} of file '{fi}' the "
+                        "datatype was {was} but it should be "
+                        "{expected}".format(rn=idx, c=key, fi=filename,
+                                            was=type(val), expected=datatype)
+                    )
+
     def check_missing(self, df, filename=None):
         """
         Check in each row whether obligatory fields are empty or null.
@@ -309,6 +339,7 @@ class TableImporter(object):
     def check_dataframe(self, df, filename):
         self.check_columns(df, filename=filename)
         df = self.check_missing(df, filename=filename)
+        self.check_datatype(df, filename=filename)
 
         if len(self.unique_keys) > 0:
             df = self.check_unique(df, filename=filename)
diff --git a/unittests/test_table_importer.py b/unittests/test_table_importer.py
index b574c867..fbfb13d5 100644
--- a/unittests/test_table_importer.py
+++ b/unittests/test_table_importer.py
@@ -27,16 +27,15 @@ import numpy as np
 import pandas as pd
 import pytest
 from caosadvancedtools.datainconsistency import DataInconsistencyError
-from caosadvancedtools.table_importer import (XLSImporter, assure_name_format,
+from caosadvancedtools.table_importer import (CSVImporter, TableImporter,
+                                              TSVImporter, XLSImporter,
+                                              assure_name_format,
                                               date_converter,
                                               datetime_converter,
-                                              TableImporter,
-                                              TSVImporter,
-                                              CSVImporter,
                                               incomplete_date_converter,
+                                              string_in_list,
                                               win_path_converter,
                                               win_path_list_converter,
-                                              string_in_list,
                                               yes_no_converter)
 
 
@@ -143,20 +142,29 @@ class ConverterTest(unittest.TestCase):
 class TableImporterTest(unittest.TestCase):
     def setUp(self):
         self.importer_kwargs = dict(
-            converters={'a': str, 'b': int, 'c': float, 'd': yes_no_converter},
+            converters={'c': float, 'd': yes_no_converter},
+            datatypes={'a': str, 'b': int},
             obligatory_columns=['a', 'b'], unique_keys=[('a', 'b')])
         self.valid_df = pd.DataFrame(
             [['a', 1, 2.0, 'yes']], columns=['a', 'b', 'c', 'd'])
 
     def test_missing_col(self):
-        df = pd.DataFrame(columns=['a', 'b'])
+        # check missing from converters
+        df = pd.DataFrame(columns=['a', 'b', 'c'])
+        importer = TableImporter(**self.importer_kwargs)
+        self.assertRaises(ValueError, importer.check_columns, df)
+        # check missing from datatypes
+        df = pd.DataFrame(columns=['a', 'd', 'c'])
         importer = TableImporter(**self.importer_kwargs)
         self.assertRaises(ValueError, importer.check_columns, df)
+        # check valid
         importer.check_columns(self.valid_df)
 
     def test_missing_val(self):
         importer = TableImporter(**self.importer_kwargs)
+        # check valid
         importer.check_missing(self.valid_df)
+        # check invalid
         df = pd.DataFrame([[None, np.nan, 2.0, 'yes'],
                            [None, 1, 2.0, 'yes'],
                            ['a', np.nan, 2.0, 'yes'],
@@ -167,6 +175,13 @@ class TableImporterTest(unittest.TestCase):
         self.assertEqual(df_new.shape[1], 4)
         self.assertEqual(df_new.iloc[0].b, 5)
 
+    def test_wrong_datatype(self):
+        importer = TableImporter(**self.importer_kwargs)
+        df = pd.DataFrame([[None, np.nan, 2.0, 'yes'],
+                           [5, 1, 2.0, 'yes']],
+                          columns=['a', 'b', 'c', 'd'])
+        self.assertRaises(DataInconsistencyError, importer.check_datatype, df)
+
     def test_unique(self):
         importer = TableImporter(**self.importer_kwargs)
         importer.check_missing(self.valid_df)
-- 
GitLab