diff --git a/CHANGELOG.md b/CHANGELOG.md index c4a81f1cef3b2dfac5a36f1fa566369340047745..88bdb1960cccb68c78b531924304bf86d4418af3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -14,6 +14,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Changed ### * A bit better error handling in the yaml model parser. +* `TableImporter.check_datatype` allows numeric values in string columns if + `strict=False` (default). ### Deprecated ### diff --git a/src/caosadvancedtools/table_importer.py b/src/caosadvancedtools/table_importer.py index 3d77e36db4d031657e43589e540edec28fd09633..bae813b23195c93ccfd369a626424dd069164fb0 100755 --- a/src/caosadvancedtools/table_importer.py +++ b/src/caosadvancedtools/table_importer.py @@ -322,7 +322,7 @@ class TableImporter(): .. note:: If columns are integer, but should be float, this method converts the respective columns - in place. + in place. The same for columns that should have string value but have numeric value. Parameters ---------- @@ -338,9 +338,11 @@ class TableImporter(): # float, because CaosDB does not have different sizes anyway. col_dtype = df.dtypes[key] if not strict and not np.issubdtype(col_dtype, datatype): - issub = np.issubdtype # These special cases should be fine. 
- if issub(col_dtype, np.integer) and issub(datatype, np.floating): + if ((datatype == str) + or (np.issubdtype(col_dtype, np.integer) + and np.issubdtype(datatype, np.floating)) + ): # NOQA df[key] = df[key].astype(datatype) # Now check each element diff --git a/unittests/test_table_importer.py b/unittests/test_table_importer.py index dd5b7af712e341683f6bcbd36b67653beebe1673..599ea535d95d0b6c1216a935813d71c8e90c1d3b 100644 --- a/unittests/test_table_importer.py +++ b/unittests/test_table_importer.py @@ -44,7 +44,7 @@ from test_utils import BaseMockUpTest # For testing the table importer IMPORTER_KWARGS = dict( converters={'c': float, 'd': yes_no_converter, 'x': float}, # x does not exist - datatypes={'a': str, 'b': int, 'x': int}, # x does not exist + datatypes={'a': str, 'b': int, 'float': float, 'x': int}, # x does not exist obligatory_columns=['a', 'b'], unique_keys=[('a', 'b')], existing_columns=['e'], ) @@ -192,10 +192,36 @@ class TableImporterTest(unittest.TestCase): def test_wrong_datatype(self): importer = TableImporter(**self.importer_kwargs) - df = pd.DataFrame([[None, np.nan, 2.0, 'yes'], + df = pd.DataFrame([[1234, 0, 2.0, 3, 'yes'], + [5678, 1, 2.0, 3, 'yes']], + columns=['a', 'b', 'c', 'float', 'd']) + # wrong datatypes before + assert df["a"].dtype == int + assert df["float"].dtype == int + # strict = False by default, so this shouldn't raise an error + importer.check_datatype(df) + # The types should be correct now. 
+ assert df["a"].dtype == pd.StringDtype + assert df["float"].dtype == float + + # Resetting `df` since check_datatype may change datatypes + df = pd.DataFrame([[None, 0, 2.0, 'yes'], [5, 1, 2.0, 'yes']], columns=['a', 'b', 'c', 'd']) - self.assertRaises(DataInconsistencyError, importer.check_datatype, df) + # strict=True, so number in str column raises an error + self.assertRaises(DataInconsistencyError, importer.check_datatype, df, None, True) + + df = pd.DataFrame([[0], + [1]], + columns=['float']) + # strict=True, so int in float column raises an error + self.assertRaises(DataInconsistencyError, importer.check_datatype, df, None, True) + + # This is always wrong (float in int column) + df = pd.DataFrame([[None, np.nan, 2.0, 'yes'], + [5, 1.7, 2.0, 'yes']], + columns=['a', 'b', 'c', 'd']) + self.assertRaises(DataInconsistencyError, importer.check_datatype, df, None, False) def test_unique(self): importer = TableImporter(**self.importer_kwargs) @@ -275,6 +301,30 @@ class CSVImporterTest(TableImporterTest): importer = CSVImporter(**self.importer_kwargs) importer.read_file(tmp.name) + def test_with_generous_datatypes(self): + """Same as above but check that values are converted as expected.""" + tmp = NamedTemporaryFile(delete=False, suffix=".csv") + tmp.close() + self.valid_df.to_csv(tmp.name) + # Copy and use float for columns with integer values, string for columns + # with numeric values + kwargs = self.importer_kwargs.copy() + kwargs["datatypes"] = { + 'a': str, + 'b': float, + 'c': str + } + importer = CSVImporter(**kwargs) + importer.read_file(tmp.name) + + kwargs["datatypes"] = { + 'a': str, + 'b': str, + 'c': str + } + importer = CSVImporter(**kwargs) + importer.read_file(tmp.name) + class TSVImporterTest(TableImporterTest): def test_full(self):