diff --git a/src/caosadvancedtools/table_importer.py b/src/caosadvancedtools/table_importer.py
index bae813b23195c93ccfd369a626424dd069164fb0..72c7ddd45db6a20e8ec46ae57320c42b1765a150 100755
--- a/src/caosadvancedtools/table_importer.py
+++ b/src/caosadvancedtools/table_importer.py
@@ -205,12 +205,41 @@ def string_in_list(val, options, ignore_case=True):
     return val
 
 
+def _pandas_typecheck(candidate, dtype):
+    if pd.api.types.is_integer_dtype(dtype):
+        return pd.api.types.is_integer_dtype(candidate)
+    if pd.api.types.is_float_dtype(dtype):
+        return pd.api.types.is_float_dtype(candidate)
+    if pd.api.types.is_bool_dtype(dtype):
+        return pd.api.types.is_bool_dtype(candidate)
+    return None
+
+
+def _is_subtype_of(candidate, supertype):
+    """Check whether `candidate` is a subtype of `supertype`, also respecting
+    pandas types that np.issubdtype is not aware of.
+
+    """
+    pandas_typecheck = _pandas_typecheck(candidate, supertype)
+    if pandas_typecheck is not None:
+        return pandas_typecheck
+    return np.issubdtype(candidate, supertype)
+
+
+def _is_instance_of_type(candidate, dtype):
+    """Wrap `isinstance` so that pandas datatypes can be handled."""
+    pandas_typecheck = _pandas_typecheck(type(candidate), dtype)
+    if pandas_typecheck is not None:
+        return pandas_typecheck
+    return isinstance(candidate, dtype)
+
+
 class TableImporter():
     """Abstract base class for importing data from tables.
 
     """
     def __init__(self, converters, obligatory_columns=None, unique_keys=None,
-                 datatypes=None, existing_columns=None):
+                 datatypes=None, existing_columns=None, convert_int_to_nullable_int=True):
         """
         Parameters
         ----------
@@ -234,6 +263,12 @@ class TableImporter():
         existing_columns : list, optional
           List of column names that must exist but may have missing (NULL)
          values
+
+        convert_int_to_nullable_int : bool, optional
+          Whether to convert all integer datatypes to the nullable
+          ``pandas.Int64Dtype()`` to allow for integer columns with empty
+          fields. If set to False, a ``DataInconsistencyError`` will be
+          raised in case of empty fields in integer columns. Default is True.
         """
 
         if converters is None:
@@ -250,7 +285,14 @@ class TableImporter():
 
         if datatypes is None:
             datatypes = {}
-        self.datatypes = datatypes
+        self.datatypes = datatypes.copy()
+
+        self.convert_int_to_nullable_int = convert_int_to_nullable_int
+
+        if convert_int_to_nullable_int is True:
+            for key, dtype in self.datatypes.items():
+                if pd.api.types.is_integer_dtype(dtype):
+                    self.datatypes[key] = pd.Int64Dtype()
 
         if existing_columns is None:
             existing_columns = []
@@ -333,22 +375,25 @@ class TableImporter():
 
         """
         for key, datatype in self.datatypes.items():
            if key not in df.columns:
+                # We ignore all datatype definitions that are not present in the
+                # dataframe.
                continue
+            col_dtype = df.dtypes[key]
+
            # Check for castable numeric types first: We unconditionally cast int to the default
            # float, because CaosDB does not have different sizes anyway.
-            col_dtype = df.dtypes[key]
-            if not strict and not _is_subtype_of(col_dtype, datatype):
+            if not strict and not _is_subtype_of(col_dtype, datatype):
                # These special cases should be fine.
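+                # Anything can be cast to str, and an integer column can be
+                # cast to the requested float datatype.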
                if ((datatype == str)
-                        or (np.issubdtype(col_dtype, np.integer)
-                            and np.issubdtype(datatype, np.floating))
+                        or (pd.api.types.is_integer_dtype(col_dtype)
+                            and pd.api.types.is_float_dtype(datatype))
                    ):  # NOQA
                    df[key] = df[key].astype(datatype)
 
            # Now check each element
            for idx, val in df.loc[pd.notnull(df.loc[:, key]), key].items():
-                if not isinstance(val, datatype):
+                if not _is_instance_of_type(val, datatype):
                    msg = (
                        "In row no. {rn} and column '{c}' of file '{fi}' the "
                        "datatype was {was} but it should be "
@@ -483,7 +528,8 @@ class CSVImporter(TableImporter):
                                **kwargs)
            applicable_converters = {k: v for k, v in self.converters.items()
                                     if k in tmpdf.columns}
-            df = pd.read_csv(filename, sep=sep, converters=applicable_converters,
+            df = pd.read_csv(filename, sep=sep,
+                             converters=applicable_converters,
+                             dtype=self.datatypes,
                             **kwargs)
        except ValueError as ve:
            logger.warning(
@@ -497,22 +543,6 @@ class CSVImporter(TableImporter):
         return df
 
 
-class TSVImporter(TableImporter):
+class TSVImporter(CSVImporter):
     def read_file(self, filename, **kwargs):
-        try:
-            tmpdf = pd.read_csv(filename, sep="\t", converters=self.converters,
-                                **kwargs)
-            applicable_converters = {k: v for k, v in self.converters.items()
-                                     if k in tmpdf.columns}
-            df = pd.read_csv(filename, sep="\t", converters=self.converters,
-                             **kwargs)
-        except ValueError as ve:
-            logger.warning(
-                "Cannot parse {}.\n{}".format(filename, ve),
-                extra={'identifier': str(filename),
-                       'category': "inconsistency"})
-            raise DataInconsistencyError(*ve.args)
-
-        df = self.check_dataframe(df, filename)
-
-        return df
+        return super().read_file(filename, sep="\t", **kwargs)
diff --git a/unittests/test_table_importer.py b/unittests/test_table_importer.py
index 599ea535d95d0b6c1216a935813d71c8e90c1d3b..906118e34c5807285e43e5a8b5430f74129ab8bc 100644
--- a/unittests/test_table_importer.py
+++ b/unittests/test_table_importer.py
@@ -200,6 +200,7 @@ class TableImporterTest(unittest.TestCase):
         assert df["float"].dtype == int
         # strict = False by default, so this shouldn't raise an error
         importer.check_datatype(df)
+        print(importer.datatypes)
         # The types should be correct now.
         assert df["a"].dtype == pd.StringDtype
         assert df["float"].dtype == float
@@ -325,6 +326,62 @@ class CSVImporterTest(TableImporterTest):
             importer = CSVImporter(**kwargs)
             importer.read_file(tmp.name)
 
+    def test_gaps_in_int_column(self):
+        """Test for
+        https://gitlab.com/linkahead/linkahead-advanced-user-tools/-/issues/62:
+        Datatype confusion when encountering empty values in integer columns.
+
+        """
+        tmpfile = NamedTemporaryFile(delete=False, suffix=".csv")
+        with open(tmpfile.name, 'w') as tmp:
+            tmp.write(
+                "int,int_with_gaps,float\n"
+                "1,1,1.1\n"
+                "2,,1.2\n"
+                "3,3,1.3\n"
+            )
+
+        kwargs = {
+            "datatypes": {
+                "int": int,
+                "int_with_gaps": int,
+                "float": float
+            },
+            "obligatory_columns": ["int"],
+            "converters": {}
+        }
+        importer = CSVImporter(**kwargs)
+        assert importer.datatypes["int"] == "Int64"
+        assert importer.datatypes["int_with_gaps"] == "Int64"
+        assert importer.datatypes["float"] == float
+        df = importer.read_file(tmpfile.name)
+        # Default is to convert to nullable ints
+        assert df["int"].dtype == "Int64"
+        assert df["int_with_gaps"].dtype == "Int64"
+        assert df["float"].dtype == float
+
+        assert pd.isna(df["int_with_gaps"][1])
+
+        # When not converting, empty fields raise errors ...
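+        # (pandas cannot store missing values in a plain numpy integer column,
+        # so read_csv fails with a ValueError that the importer re-raises as a
+        # DataInconsistencyError.)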
+        importer_strict = CSVImporter(convert_int_to_nullable_int=False, **kwargs)
+        assert importer_strict.datatypes["int"] == int
+        assert importer_strict.datatypes["int_with_gaps"] == int
+        assert importer_strict.datatypes["float"] == float
+        with pytest.raises(DataInconsistencyError) as die:
+            df = importer_strict.read_file(tmpfile.name)
+            print(df)
+        assert "Integer column has NA values in column 1" in str(die.value)
+
+        # ... except when a nullable datatype is set explicitly.
+        kwargs["datatypes"]["int_with_gaps"] = "Int64"
+        importer_strict = CSVImporter(convert_int_to_nullable_int=False, **kwargs)
+        df = importer_strict.read_file(tmpfile.name)
+        # Now only the one that has been specifically set to Int64 is nullable.
+        assert df["int"].dtype == int
+        assert df["int_with_gaps"].dtype == "Int64"
+        assert df["float"].dtype == float
+
+
 class TSVImporterTest(TableImporterTest):
 
     def test_full(self):