Skip to content
Snippets Groups Projects

TableConverter now converts int to float and vice versa to match the desired dtype.

Merged Daniel Hornung requested to merge f-convert-int-float into dev
1 unresolved thread
2 files
+ 5
8
Compare changes
  • Side-by-side
  • Inline
Files
2
@@ -205,27 +205,33 @@ def string_in_list(val, options, ignore_case=True):
return val
class TableImporter(object):
class TableImporter():
"""Abstract base class for importing data from tables.
"""
def __init__(self, converters, obligatory_columns=None, unique_keys=None,
datatypes=None):
"""
converters: dict with column names as keys and converter functions as
values
This dict also defines what columns are required to exist
through the existing keys. The converter functions are
applied to the cell values. They should also check for
ValueErrors, such that a separate value check is not
necessary.
obligatory_columns: list of column names, optional
each listed column must not have missing values
unique_columns : list of column names that in
combination must be unique; i.e. each row has a
unique combination of values in those columns.
datatypes: dict with column names as keys and datatypes as values
All non-null values will be checked whether they have the
provided datatype.
This dict also defines what columns are required to exist
through the existing keys.
Parameters
----------
converters : dict
Dict with column names as keys and converter functions as values. This dict also defines
what columns are required to exist through the existing keys. The converter functions are
applied to the cell values. They should also check for ValueErrors, such that a separate
value check is not necessary.
obligatory_columns : list, optional
List of column names, each listed column must not have missing values.
unique_keys : list, optional
List of column names that in combination must be unique: each row has a unique
combination of values in those columns.
datatypes : dict, optional
Dict with column names as keys and datatypes as values. All non-null values will be
checked whether they have the provided datatype. This dict also defines what columns are
required to exist through the existing keys.
"""
if converters is None:
@@ -247,11 +253,14 @@ class TableImporter(object):
raise NotImplementedError()
def check_columns(self, df, filename=None):
"""
checks whether all required columns, i.e. columns for which converters
were defined exist.
"""Check whether all required columns exist.
Required columns are columns for which converters are defined.
Raises
------
DataInconsistencyError
Raises: DataInconsistencyError
"""
for col in self.required_columns:
@@ -267,12 +276,11 @@ class TableImporter(object):
raise DataInconsistencyError(errmsg)
def check_unique(self, df, filename=None):
"""
Check whether value combinations that shall be unique for each row are
unique.
"""Check whether value combinations that shall be unique for each row are unique.
If a second row is found, that uses the same combination of values as a
previous one, the second one is removed.
"""
df = df.copy()
uniques = []
@@ -299,13 +307,32 @@ class TableImporter(object):
return df
def check_datatype(self, df, filename=None):
"""
Check for each column whether non-null fields have the correct
datatype.
"""
def check_datatype(self, df, filename=None, strict=False):
"""Check for each column whether non-null fields have the correct datatype.
.. note::
If columns are integer, but should be float, this method converts the respective columns
in place.
Parameters
----------
strict: boolean, optional
If False (the default), try to convert columns, otherwise raise an error.
"""
for key, datatype in self.datatypes.items():
# Check for castable numeric types first: We unconditionally cast int to the default
# float, because CaosDB does not have different sizes anyway.
col_dtype = df.dtypes[key]
if not strict and not np.issubdtype(col_dtype, datatype):
issub = np.issubdtype
# These special cases should be fine.
if issub(col_dtype, np.integer) and issub(datatype, np.floating):
df[key] = df[key].astype(datatype)
# Now check each element
for idx, val in df.loc[
pd.notnull(df.loc[:, key]), key].iteritems():
@@ -326,6 +353,11 @@ class TableImporter(object):
Check in each row whether obligatory fields are empty or null.
Rows that have missing values are removed.
Returns
-------
out : pandas.DataFrame
The input DataFrame with incomplete rows removed.
"""
df = df.copy()
@@ -362,10 +394,26 @@ class TableImporter(object):
return df
def check_dataframe(self, df, filename):
def check_dataframe(self, df, filename=None, strict=False):
"""Check if the dataframe conforms to the restrictions.
Checked restrictions are: Columns, data types, uniqueness requirements.
Parameters
----------
df: pandas.DataFrame
The dataframe to be checked.
filename: string, optional
The file name, only used for output in case of problems.
strict: boolean, optional
If False (the default), try to convert columns, otherwise raise an error.
"""
self.check_columns(df, filename=filename)
df = self.check_missing(df, filename=filename)
self.check_datatype(df, filename=filename)
self.check_datatype(df, filename=filename, strict=strict)
if len(self.unique_keys) > 0:
df = self.check_unique(df, filename=filename)
@@ -378,8 +426,7 @@ class XLSImporter(TableImporter):
return self.read_xls(filename=filename, **kwargs)
def read_xls(self, filename, **kwargs):
"""
converts an xls file into a Pandas DataFrame.
"""Convert an xls file into a Pandas DataFrame.
The converters of the XLSImporter object are used.
Loading