From 5d6e8fb7ecbde51a1d7f21f504ed894724a60f2e Mon Sep 17 00:00:00 2001
From: fspreck <f.spreckelsen@indiscale.com>
Date: Mon, 4 Sep 2023 13:06:01 +0200
Subject: [PATCH] ENH: Allow string columns to contain numeric values

---
 src/caosadvancedtools/table_importer.py |  4 ++-
 unittests/test_table_importer.py        | 42 +++++++++++++++++++++++--
 2 files changed, 43 insertions(+), 3 deletions(-)

diff --git a/src/caosadvancedtools/table_importer.py b/src/caosadvancedtools/table_importer.py
index 3d77e36d..1ccfad55 100755
--- a/src/caosadvancedtools/table_importer.py
+++ b/src/caosadvancedtools/table_importer.py
@@ -322,7 +322,7 @@ class TableImporter():
         .. note::
 
           If columns are integer, but should be float, this method converts the respective columns
-          in place.
+          in place. The same applies to columns that should be strings but hold numeric values.
 
         Parameters
         ----------
@@ -342,6 +342,8 @@ class TableImporter():
                 #  These special cases should be fine.
                 if issub(col_dtype, np.integer) and issub(datatype, np.floating):
                     df[key] = df[key].astype(datatype)
+                elif datatype==str:
+                    df[key] = df[key].astype(datatype)
 
             # Now check each element
             for idx, val in df.loc[pd.notnull(df.loc[:, key]), key].items():
diff --git a/unittests/test_table_importer.py b/unittests/test_table_importer.py
index dd5b7af7..72650d61 100644
--- a/unittests/test_table_importer.py
+++ b/unittests/test_table_importer.py
@@ -192,10 +192,24 @@ class TableImporterTest(unittest.TestCase):
 
     def test_wrong_datatype(self):
         importer = TableImporter(**self.importer_kwargs)
-        df = pd.DataFrame([[None, np.nan, 2.0, 'yes'],
+        df = pd.DataFrame([[None, 0, 2.0, 'yes'],
+                           [5, 1, 2.0, 'yes']],
+                          columns=['a', 'b', 'c', 'd'])
+        # strict = False by default, so this shouldn't raise an error
+        importer.check_datatype(df)
+
+        # Reset since check_datatype changes datatypes
+        df = pd.DataFrame([[None, 0, 2.0, 'yes'],
                            [5, 1, 2.0, 'yes']],
                           columns=['a', 'b', 'c', 'd'])
-        self.assertRaises(DataInconsistencyError, importer.check_datatype, df)
+        # strict=True, so int in str column raises an error
+        self.assertRaises(DataInconsistencyError, importer.check_datatype, df, None, True)
+
+        # This is always wrong (float in int column)
+        df = pd.DataFrame([[None, np.nan, 2.0, 'yes'],
+                           [5, 1.7, 2.0, 'yes']],
+                          columns=['a', 'b', 'c', 'd'])
+        self.assertRaises(DataInconsistencyError, importer.check_datatype, df, None, False)
 
     def test_unique(self):
         importer = TableImporter(**self.importer_kwargs)
@@ -275,6 +289,30 @@ class CSVImporterTest(TableImporterTest):
         importer = CSVImporter(**self.importer_kwargs)
         importer.read_file(tmp.name)
 
+    def test_with_generous_datatypes(self):
+        """Same as above, but datatypes require conversions; only checks that reading succeeds."""
+        tmp = NamedTemporaryFile(delete=False, suffix=".csv")
+        tmp.close()
+        self.valid_df.to_csv(tmp.name)
+        # Copy and use float for columns with integer values, string for columns
+        # with numeric values
+        kwargs = self.importer_kwargs.copy()
+        kwargs["datatypes"] = {
+            'a': str,
+            'b': float,
+            'c': str
+        }
+        importer = CSVImporter(**kwargs)
+        importer.read_file(tmp.name)
+
+        kwargs["datatypes"] = {
+            'a': str,
+            'b': str,
+            'c': str
+        }
+        importer = CSVImporter(**kwargs)
+        importer.read_file(tmp.name)
+
 
 class TSVImporterTest(TableImporterTest):
     def test_full(self):
-- 
GitLab