diff --git a/CHANGELOG.md b/CHANGELOG.md
index f0a3fbd0f77c0e147dabdcbe596eacad960d6118..2577214760ead85cce48ef9083964917037b119d 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -13,6 +13,12 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 ### Changed ###
 
+- The `TableImporter` and its subclasses now change all integer datatypes to the
+  nullable `pandas.Int64Dtype` so that integer columns with empty fields can be
+  treated properly. If you do not want the datatypes to be changed
+  automatically, initialize the `TableImporter` with
+  `convert_int_to_nullable_int=False`.
+
 ### Deprecated ###
 
 ### Removed ###
 
@@ -20,12 +26,17 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 ### Fixed ###
 
 - Blacklisted buggy openpyxl version
+- [#62](https://gitlab.com/linkahead/linkahead-advanced-user-tools/-/issues/62)
+  The `TableImporter` now handles empty fields in integer columns by supporting
+  the corresponding [nullable integer
+  types](https://pandas.pydata.org/docs/user_guide/integer_na.html) in Pandas.
 
 ### Security ###
 
 ### Documentation ###
 
 - loadFiles has better `-h` documentation now
+- Rudimentary documentation for the `table_importer` module
 
 ## [0.10.0] - 2024-04-24 ##
diff --git a/src/caosadvancedtools/table_importer.py b/src/caosadvancedtools/table_importer.py
index bae813b23195c93ccfd369a626424dd069164fb0..5efd0500a4c5a797a27a92caf0cd2a49165fddd2 100755
--- a/src/caosadvancedtools/table_importer.py
+++ b/src/caosadvancedtools/table_importer.py
@@ -205,18 +205,49 @@ def string_in_list(val, options, ignore_case=True):
     return val
 
 
+def _pandas_typecheck(candidate, dtype):
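+    """Return whether `candidate` is of the same dtype kind (integer, float,
+    or bool) as `dtype`; return None if `dtype` is none of these kinds."""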
+    if pd.api.types.is_integer_dtype(dtype):
+        return pd.api.types.is_integer_dtype(candidate)
+    if pd.api.types.is_float_dtype(dtype):
+        return pd.api.types.is_float_dtype(candidate)
+    if pd.api.types.is_bool_dtype(dtype):
+        return pd.api.types.is_bool_dtype(candidate)
+    return None
+
+
+def _is_subtype_of(candidate, supertype):
+    """Check whether `candidate` is a subtype of `supertype`, also respecting
+    pandas types that np.issubdtype is not aware of.
+
+    """
+    pandas_typecheck = _pandas_typecheck(candidate, supertype)
+    if pandas_typecheck is not None:
+        return pandas_typecheck
+    return np.issubdtype(candidate, supertype)
+
+
+def _is_instance_of_type(candidate, dtype):
+    """Wrap `isinstance` so that pandas datatypes can be handled."""
+    pandas_typecheck = _pandas_typecheck(type(candidate), dtype)
+    if pandas_typecheck is not None:
+        return pandas_typecheck
+    return isinstance(candidate, dtype)
+
+
 class TableImporter():
     """Abstract base class for importing data from tables.
 
     """
 
     def __init__(self, converters, obligatory_columns=None, unique_keys=None,
-                 datatypes=None, existing_columns=None):
+                 datatypes=None, existing_columns=None, convert_int_to_nullable_int=True):
         """
         Parameters
         ----------
         converters : dict
-            Dict with column names as keys and converter functions as values. This dict also defines
-            what columns are required to exist throught the existing keys. The converter functions are
+            Dict with column names as keys and converter functions as values. This dict's keys also
+            define what columns must exist. The converter functions are
             applied to the cell values. They should also check for
             ValueErrors, such that a separate value check is not necessary.
@@ -234,6 +263,12 @@ class TableImporter():
         existing_columns : list, optional
             List of column names that must exist but may have missing (NULL)
             values
+
+        convert_int_to_nullable_int : bool, optional
+            Whether to convert all integer datatypes to ``pandas.Int64Dtype()``,
+            which is nullable, to allow for integer columns with empty fields.
+            If set to False, a ``DataInconsistencyError`` will be raised in
+            case of empty fields in integer columns. Default is True.
         """
 
         if converters is None:
@@ -250,7 +285,16 @@
         if datatypes is None:
             datatypes = {}
-        self.datatypes = datatypes
+        self.datatypes = datatypes.copy()
+
+        self.convert_int_to_nullable_int = convert_int_to_nullable_int
+
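+        # If requested, replace all integer datatypes by the nullable
+        # pd.Int64Dtype so that empty fields can be represented as pd.NA.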
+        if convert_int_to_nullable_int is True:
+            for key, dtype in self.datatypes.items():
+                if pd.api.types.is_integer_dtype(dtype):
+                    self.datatypes[key] = pd.Int64Dtype()
 
         if existing_columns is None:
             existing_columns = []
@@ -333,22 +375,25 @@ class TableImporter():
 
         """
         for key, datatype in self.datatypes.items():
             if key not in df.columns:
+                # We ignore all datatype definitions that are not present in the
+                # dataframe.
                 continue
+            col_dtype = df.dtypes[key]
+
             # Check for castable numeric types first: We unconditionally cast int to the default
             # float, because CaosDB does not have different sizes anyway.
-            col_dtype = df.dtypes[key]
-            if not strict and not np.issubdtype(col_dtype, datatype):
+            if not strict and not _is_subtype_of(col_dtype, datatype):
                 # These special cases should be fine.
                 if ((datatype == str)
-                        or (np.issubdtype(col_dtype, np.integer)
-                            and np.issubdtype(datatype, np.floating))
+                        or (pd.api.types.is_integer_dtype(col_dtype)
+                            and pd.api.types.is_float_dtype(datatype))
                         ):  # NOQA
                     df[key] = df[key].astype(datatype)
 
             # Now check each element
             for idx, val in df.loc[pd.notnull(df.loc[:, key]), key].items():
-                if not isinstance(val, datatype):
+                if not _is_instance_of_type(val, datatype):
                     msg = (
                         "In row no. {rn} and column '{c}' of file '{fi}' the "
                         "datatype was {was} but it should be "
@@ -483,7 +528,8 @@ class CSVImporter(TableImporter):
                              **kwargs)
             applicable_converters = {k: v for k, v in self.converters.items()
                                      if k in tmpdf.columns}
-            df = pd.read_csv(filename, sep=sep, converters=applicable_converters,
+            df = pd.read_csv(filename, sep=sep,
+                             converters=applicable_converters, dtype=self.datatypes,
                              **kwargs)
         except ValueError as ve:
             logger.warning(
@@ -497,22 +543,6 @@ class CSVImporter(TableImporter):
 
         return df
 
 
-class TSVImporter(TableImporter):
+class TSVImporter(CSVImporter):
     def read_file(self, filename, **kwargs):
-        try:
-            tmpdf = pd.read_csv(filename, sep="\t", converters=self.converters,
-                                **kwargs)
-            applicable_converters = {k: v for k, v in self.converters.items()
-                                     if k in tmpdf.columns}
-            df = pd.read_csv(filename, sep="\t", converters=self.converters,
-                             **kwargs)
-        except ValueError as ve:
-            logger.warning(
-                "Cannot parse {}.\n{}".format(filename, ve),
-                extra={'identifier': str(filename),
-                       'category': "inconsistency"})
-            raise DataInconsistencyError(*ve.args)
-
-        df = self.check_dataframe(df, filename)
-
-        return df
+        return super().read_file(filename, sep="\t", **kwargs)
diff --git a/src/doc/index.rst b/src/doc/index.rst
index 7fa017ec4202f25fe9f94a154ed8762c4581eebc..7032e2c24ea32b0f1efad2bd2e5b7930259daf61 100644
--- a/src/doc/index.rst
+++ b/src/doc/index.rst
@@ -18,6 +18,7 @@ This documentation helps you to :doc:`get started<README_SETUP>`, explains the m
    Specifying a datamodel with JSON schema <json_schema_interface>
    Convert a data model into a json schema <json_schema_exporter>
    Conversion between XLSX, JSON and LinkAhead Entities <table-json-conversion/specs>
+   Other utilities <utilities>
    _apidoc/modules
    Related Projects <related_projects/index>
    Back to overview <https://docs.indiscale.com/>
diff --git a/src/doc/utilities.rst b/src/doc/utilities.rst
new file mode 100644
index 0000000000000000000000000000000000000000..4d520ae2d4b7a9bbd81171ba002c4f736223713a
--- /dev/null
+++ b/src/doc/utilities.rst
@@ -0,0 +1,68 @@
+Other utilities in LinkAhead Advanced User Tools
+================================================
+
+The table file importer
+%%%%%%%%%%%%%%%%%%%%%%%
+
+The LinkAhead Advanced User Tools provide a generic
+:py:class:`~caosadvancedtools.table_importer.TableImporter` class which reads
+different table file formats (at the time of writing, .xls(x), .csv, and
+.tsv) and converts them into :py:class:`pandas.DataFrame` objects. It
+provides helper functions for converting column values (e.g., converting the
+string values "yes" or "no" to ``True`` or ``False``), for checking the
+presence of obligatory columns in a table and whether those have missing
+values, and for performing datatype checks.
+
+The base class :py:class:`~caosadvancedtools.table_importer.TableImporter`
+provides the general verification methods, while each subclass like
+:py:class:`~caosadvancedtools.table_importer.XLSXImporter` or
+:py:class:`~caosadvancedtools.table_importer.CSVImporter` implements its own
+``read_file`` function that converts a given table file into a
+:py:class:`pandas.DataFrame`.
+
+Empty fields in integer columns
+-------------------------------
+
+Reading table files that have integer-valued columns with missing data can
+result in datatype contradictions (see the Pandas documentation on `nullable
+integers <https://pandas.pydata.org/docs/user_guide/integer_na.html>`_) since
+the default value for missing fields, ``numpy.nan``, is a float. This is why,
+from version 0.11 onward, the ``TableImporter`` uses
+:py:class:`pandas.Int64Dtype` as the default datatype for all integer
+columns, which allows for empty fields while keeping all actual data
+integer-valued. This behavior can be changed by initializing the
+``TableImporter`` with ``convert_int_to_nullable_int=False``, in which case a
+:py:class:`~caosadvancedtools.datainconsistency.DataInconsistencyError` is
+raised when an empty field is encountered in a column with a non-nullable
+integer datatype.
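+
+The following is a minimal sketch of both variants, assuming a hypothetical
+file ``counts.csv`` whose integer column ``count`` contains an empty field:
+
+.. code-block:: python
+
+   from caosadvancedtools.datainconsistency import DataInconsistencyError
+   from caosadvancedtools.table_importer import CSVImporter
+
+   # Assumed content of counts.csv:
+   #   sample,count
+   #   A,1
+   #   B,
+   #   C,3
+
+   # By default, integer datatypes are converted to the nullable Int64, so
+   # the empty field simply becomes pd.NA.
+   importer = CSVImporter(converters={}, obligatory_columns=["sample"],
+                          datatypes={"sample": str, "count": int})
+   df = importer.read_file("counts.csv")
+   print(df["count"].dtype)  # Int64
+
+   # With convert_int_to_nullable_int=False, the empty field in the
+   # non-nullable integer column raises a DataInconsistencyError.
+   importer = CSVImporter(converters={}, obligatory_columns=["sample"],
+                          datatypes={"sample": str, "count": int},
+                          convert_int_to_nullable_int=False)
+   try:
+       df = importer.read_file("counts.csv")
+   except DataInconsistencyError as err:
+       print(err)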
diff --git a/unittests/test_table_importer.py b/unittests/test_table_importer.py
index 599ea535d95d0b6c1216a935813d71c8e90c1d3b..6d445056b240e5ede6c52cb055cdde86cfb6d3d7 100644
--- a/unittests/test_table_importer.py
+++ b/unittests/test_table_importer.py
@@ -325,6 +325,60 @@ class CSVImporterTest(TableImporterTest):
             importer = CSVImporter(**kwargs)
             importer.read_file(tmp.name)
 
+    def test_gaps_in_int_column(self):
+        """Test for
+        https://gitlab.com/linkahead/linkahead-advanced-user-tools/-/issues/62:
+        Datatype confusion when encountering empty values in integer columns.
+
+        """
+        tmpfile = NamedTemporaryFile(delete=False, suffix=".csv")
+        with open(tmpfile.name, 'w') as tmp:
+            tmp.write(
+                "int,int_with_gaps,float\n"
+                "1,1,1.1\n"
+                "2,,1.2\n"
+                "3,3,1.3\n"
+            )
+
+        kwargs = {
+            "datatypes": {
+                "int": int,
+                "int_with_gaps": int,
+                "float": float
+            },
+            "obligatory_columns": ["int"],
+            "converters": {}
+        }
+        importer = CSVImporter(**kwargs)
+        assert importer.datatypes["int"] == "Int64"
+        assert importer.datatypes["int_with_gaps"] == "Int64"
+        assert importer.datatypes["float"] == float
+        df = importer.read_file(tmpfile.name)
+        # Default is to convert to nullable ints
+        assert df["int"].dtype == "Int64"
+        assert df["int_with_gaps"].dtype == "Int64"
+        assert df["float"].dtype == float
+
+        assert pd.isna(df["int_with_gaps"][1])
+
+        # When not converting, empty fields raise errors ...
+        importer_strict = CSVImporter(convert_int_to_nullable_int=False, **kwargs)
+        assert importer_strict.datatypes["int"] == int
+        assert importer_strict.datatypes["int_with_gaps"] == int
+        assert importer_strict.datatypes["float"] == float
+        with pytest.raises(DataInconsistencyError) as die:
+            df = importer_strict.read_file(tmpfile.name)
+        assert "Integer column has NA values in column 1" in str(die.value)
+
+        # ... except when a nullable datatype is set manually beforehand
+        kwargs["datatypes"]["int_with_gaps"] = "Int64"
+        importer_strict = CSVImporter(convert_int_to_nullable_int=False, **kwargs)
+        df = importer_strict.read_file(tmpfile.name)
+        # Now only the one that has been specifically set to Int64 is nullable.
+        assert df["int"].dtype == int
+        assert df["int_with_gaps"].dtype == "Int64"
+        assert df["float"].dtype == float
+
 
 class TSVImporterTest(TableImporterTest):
     def test_full(self):