Skip to content
Snippets Groups Projects

F gaps in int columns

Merged Florian Spreckelsen requested to merge f-gaps-in-int-columns into dev

Files

@@ -205,18 +205,47 @@ def string_in_list(val, options, ignore_case=True):
@@ -205,18 +205,47 @@ def string_in_list(val, options, ignore_case=True):
return val
return val
 
def _pandas_typecheck(candidate, dtype):
 
if pd.api.types.is_integer_dtype(dtype):
 
return pd.api.types.is_integer_dtype(candidate)
 
if pd.api.types.is_float_dtype(dtype):
 
return pd.api.types.is_float_dtype(candidate)
 
if pd.api.types.is_bool_dtype(dtype):
 
return pd.api.types.is_bool_dtype(candidate)
 
return None
 
 
 
def _is_subtype_of(candidate, supertype):
    """Check whether `candidate` has a subtype of `supertype`, also respecting
    pandas types that np.issubdtype is not aware of.
    """
    result = _pandas_typecheck(candidate, supertype)
    if result is None:
        # `supertype` is not one of the pandas-handled dtype families;
        # defer to numpy's subtype check.
        return np.issubdtype(candidate, supertype)
    return result
 
 
 
def _is_instance_of_type(candidate, dtype):
    """Wrap `isinstance` so that pandas datatypes can be handled."""
    result = _pandas_typecheck(type(candidate), dtype)
    if result is None:
        # `dtype` is not a pandas-handled dtype family; plain isinstance
        # is sufficient.
        return isinstance(candidate, dtype)
    return result
 
 
class TableImporter():
class TableImporter():
"""Abstract base class for importing data from tables.
"""Abstract base class for importing data from tables.
"""
"""
def __init__(self, converters, obligatory_columns=None, unique_keys=None,
def __init__(self, converters, obligatory_columns=None, unique_keys=None,
datatypes=None, existing_columns=None):
datatypes=None, existing_columns=None, convert_int_to_nullable_int=True):
"""
"""
Parameters
Parameters
----------
----------
converters : dict
converters : dict
Dict with column names as keys and converter functions as values. This dict also defines
Dict with column names as keys and converter functions as values. This dict's keys also
what columns are required to exist throught the existing keys. The converter functions are
define what columns must exist. The converter functions are
applied to the cell values. They should also check for ValueErrors, such that a separate
applied to the cell values. They should also check for ValueErrors, such that a separate
value check is not necessary.
value check is not necessary.
@@ -234,6 +263,12 @@ class TableImporter():
@@ -234,6 +263,12 @@ class TableImporter():
existing_columns : list, optional
existing_columns : list, optional
List of column names that must exist but may have missing (NULL) values
List of column names that must exist but may have missing (NULL) values
 
 
convert_int_to_nullable_int : bool, optional
 
Whether to convert all integer datatypes to ``pandas.Int64Dtype()``
 
which is nullable, to allow for integer columns with empty fields. If
 
set to False, a ``DataInconsistencyError`` will be raised in case of
 
empty fields in integer columns. Default is True.
"""
"""
if converters is None:
if converters is None:
@@ -250,7 +285,14 @@ class TableImporter():
@@ -250,7 +285,14 @@ class TableImporter():
if datatypes is None:
if datatypes is None:
datatypes = {}
datatypes = {}
self.datatypes = datatypes
self.datatypes = datatypes.copy()
 
 
self.convert_int_to_nullable_int = convert_int_to_nullable_int
 
 
if convert_int_to_nullable_int is True:
 
for key, dtype in self.datatypes.items():
 
if pd.api.types.is_integer_dtype(dtype):
 
self.datatypes[key] = pd.Int64Dtype()
if existing_columns is None:
if existing_columns is None:
existing_columns = []
existing_columns = []
@@ -333,22 +375,25 @@ class TableImporter():
@@ -333,22 +375,25 @@ class TableImporter():
"""
"""
for key, datatype in self.datatypes.items():
for key, datatype in self.datatypes.items():
if key not in df.columns:
if key not in df.columns:
 
# We ignore all datatype definitions that are not present in the
 
# dataframe.
continue
continue
 
col_dtype = df.dtypes[key]
 
# Check for castable numeric types first: We unconditionally cast int to the default
# Check for castable numeric types first: We unconditionally cast int to the default
# float, because CaosDB does not have different sizes anyway.
# float, because CaosDB does not have different sizes anyway.
col_dtype = df.dtypes[key]
if not strict and not _is_subtype_of(col_dtype, datatype):
if not strict and not np.issubdtype(col_dtype, datatype):
# These special cases should be fine.
# These special cases should be fine.
if ((datatype == str)
if ((datatype == str)
or (np.issubdtype(col_dtype, np.integer)
or (pd.api.types.is_integer_dtype(col_dtype)
and np.issubdtype(datatype, np.floating))
and pd.api.types.is_float_dtype(datatype))
): # NOQA
): # NOQA
df[key] = df[key].astype(datatype)
df[key] = df[key].astype(datatype)
# Now check each element
# Now check each element
for idx, val in df.loc[pd.notnull(df.loc[:, key]), key].items():
for idx, val in df.loc[pd.notnull(df.loc[:, key]), key].items():
if not isinstance(val, datatype):
if not _is_instance_of_type(val, datatype):
msg = (
msg = (
"In row no. {rn} and column '{c}' of file '{fi}' the "
"In row no. {rn} and column '{c}' of file '{fi}' the "
"datatype was {was} but it should be "
"datatype was {was} but it should be "
@@ -483,7 +528,8 @@ class CSVImporter(TableImporter):
@@ -483,7 +528,8 @@ class CSVImporter(TableImporter):
**kwargs)
**kwargs)
applicable_converters = {k: v for k, v in self.converters.items()
applicable_converters = {k: v for k, v in self.converters.items()
if k in tmpdf.columns}
if k in tmpdf.columns}
df = pd.read_csv(filename, sep=sep, converters=applicable_converters,
df = pd.read_csv(filename, sep=sep,
 
converters=applicable_converters, dtype=self.datatypes,
**kwargs)
**kwargs)
except ValueError as ve:
except ValueError as ve:
logger.warning(
logger.warning(
@@ -497,22 +543,6 @@ class CSVImporter(TableImporter):
@@ -497,22 +543,6 @@ class CSVImporter(TableImporter):
return df
return df
class TSVImporter(CSVImporter):
    """Import data from a tab-separated (TSV) file.

    Thin wrapper around :class:`CSVImporter` that fixes the column
    separator to a tab character; all parsing, converter application and
    dataframe checking are inherited from the CSV implementation.
    """

    def read_file(self, filename, **kwargs):
        """Read `filename` as a tab-separated table.

        Parameters
        ----------
        filename : str or path-like
            Path of the TSV file to read.
        **kwargs :
            Additional keyword arguments forwarded to
            ``CSVImporter.read_file``.

        Returns
        -------
        pandas.DataFrame
            The checked dataframe returned by the parent implementation.
        """
        return super().read_file(filename, sep="\t", **kwargs)
Loading