From 5d6e8fb7ecbde51a1d7f21f504ed894724a60f2e Mon Sep 17 00:00:00 2001
From: fspreck <f.spreckelsen@indiscale.com>
Date: Mon, 4 Sep 2023 13:06:01 +0200
Subject: [PATCH] ENH: Allow string columns to contain numeric values

---
Review notes (this section is ignored by `git am`):
- The line structure of this patch was restored from a whitespace-collapsed
  copy. Hunk line counts match the `@@` headers, but the indentation of the
  hunk lines is reconstructed; check it against the target files (or apply
  with `--ignore-whitespace`).
- NOTE(review): depending on the pandas version, `astype(str)` may also turn
  missing values (None/NaN) into the strings 'None'/'nan', which would then
  no longer be skipped by the `pd.notnull` filter that follows -- confirm.
- NOTE(review): `test_with_generous_datatypes` only checks that reading does
  not raise; it does not assert the converted values its docstring mentions.

 src/caosadvancedtools/table_importer.py |  4 ++-
 unittests/test_table_importer.py        | 42 +++++++++++++++++++++++--
 2 files changed, 43 insertions(+), 3 deletions(-)

diff --git a/src/caosadvancedtools/table_importer.py b/src/caosadvancedtools/table_importer.py
index 3d77e36d..1ccfad55 100755
--- a/src/caosadvancedtools/table_importer.py
+++ b/src/caosadvancedtools/table_importer.py
@@ -322,7 +322,7 @@ class TableImporter():
         .. note::
 
           If columns are integer, but should be float, this method converts the respective columns
-          in place.
+          in place. The same holds for numeric columns that should be strings.
 
         Parameters
         ----------
@@ -342,6 +342,8 @@ class TableImporter():
                 # These special cases should be fine.
                 if issub(col_dtype, np.integer) and issub(datatype, np.floating):
                     df[key] = df[key].astype(datatype)
+                elif datatype == str:
+                    df[key] = df[key].astype(datatype)
 
             # Now check each element
             for idx, val in df.loc[pd.notnull(df.loc[:, key]), key].items():
diff --git a/unittests/test_table_importer.py b/unittests/test_table_importer.py
index dd5b7af7..72650d61 100644
--- a/unittests/test_table_importer.py
+++ b/unittests/test_table_importer.py
@@ -192,10 +192,24 @@ class TableImporterTest(unittest.TestCase):
 
     def test_wrong_datatype(self):
         importer = TableImporter(**self.importer_kwargs)
-        df = pd.DataFrame([[None, np.nan, 2.0, 'yes'],
+        df = pd.DataFrame([[None, 0, 2.0, 'yes'],
+                           [5, 1, 2.0, 'yes']],
+                          columns=['a', 'b', 'c', 'd'])
+        # strict = False by default, so this shouldn't raise an error
+        importer.check_datatype(df)
+
+        # Reset since check_datatype changes datatypes
+        df = pd.DataFrame([[None, 0, 2.0, 'yes'],
                            [5, 1, 2.0, 'yes']],
                           columns=['a', 'b', 'c', 'd'])
-        self.assertRaises(DataInconsistencyError, importer.check_datatype, df)
+        # strict=True, so int in str column raises an error
+        self.assertRaises(DataInconsistencyError, importer.check_datatype, df, None, True)
+
+        # This is always wrong (float in int column)
+        df = pd.DataFrame([[None, np.nan, 2.0, 'yes'],
+                           [5, 1.7, 2.0, 'yes']],
+                          columns=['a', 'b', 'c', 'd'])
+        self.assertRaises(DataInconsistencyError, importer.check_datatype, df, None, False)
 
     def test_unique(self):
         importer = TableImporter(**self.importer_kwargs)
@@ -275,6 +289,30 @@ class CSVImporterTest(TableImporterTest):
         importer = CSVImporter(**self.importer_kwargs)
         importer.read_file(tmp.name)
 
+    def test_with_generous_datatypes(self):
+        """Same as above but check that values are converted as expected."""
+        tmp = NamedTemporaryFile(delete=False, suffix=".csv")
+        tmp.close()
+        self.valid_df.to_csv(tmp.name)
+        # Copy and use float for columns with integer values, string for columns
+        # with numeric values
+        kwargs = self.importer_kwargs.copy()
+        kwargs["datatypes"] = {
+            'a': str,
+            'b': float,
+            'c': str
+        }
+        importer = CSVImporter(**kwargs)
+        importer.read_file(tmp.name)
+
+        kwargs["datatypes"] = {
+            'a': str,
+            'b': str,
+            'c': str
+        }
+        importer = CSVImporter(**kwargs)
+        importer.read_file(tmp.name)
+
 
 class TSVImporterTest(TableImporterTest):
     def test_full(self):
-- 
GitLab