Skip to content
Snippets Groups Projects

ENH: allow to provide required columns explicitly

Merged Henrik tom Wörden requested to merge f-required into dev
3 files
+ 43
35
Compare changes
  • Side-by-side
  • Inline
Files
3
@@ -210,7 +210,7 @@ class TableImporter():
"""
def __init__(self, converters, obligatory_columns=None, unique_keys=None,
datatypes=None, allow_missing_values_in=None):
datatypes=None, existing_columns=None):
"""
Parameters
----------
@@ -232,28 +232,31 @@ class TableImporter():
checked whether they have the provided datatype. This dict also defines what columns are
required to exist through the existing keys.
allow_missing_values_in : list, optional
List of (obligatory) column names which may have missing (NULL) values
existing_columns : list, optional
List of column names that must exist but may have missing (NULL) values
"""
if converters is None:
converters = {}
self.converters = converters
if allow_missing_values_in is None:
allow_missing_values_in = []
if obligatory_columns is None:
obligatory_columns = []
self.obligatory_columns = obligatory_columns
if unique_keys is None:
unique_keys = []
self.unique_keys = unique_keys
if datatypes is None:
datatypes = {}
self.datatypes = datatypes
if existing_columns is None:
existing_columns = []
self.existing_columns = existing_columns
self.sup = SuppressKnown()
self.allow_missing_values_in = allow_missing_values_in
self.obligatory_columns = ([]
if obligatory_columns is None
else obligatory_columns)
self.unique_keys = [] if unique_keys is None else unique_keys
self.converters = converters
self.datatypes = datatypes
def read_file(self, filename, **kwargs):
raise NotImplementedError()
@@ -269,7 +272,7 @@ class TableImporter():
"""
for col in self.obligatory_columns+list(self.converters.keys())+list(self.datatypes.keys()):
for col in self.obligatory_columns+self.existing_columns:
if col not in df.columns:
errmsg = "Column '{}' missing in ".format(col)
errmsg += ("\n{}.\n".format(filename) if filename
@@ -329,6 +332,8 @@ class TableImporter():
"""
for key, datatype in self.datatypes.items():
if key not in df.columns:
continue
# Check for castable numeric types first: We unconditionally cast int to the default
# float, because CaosDB does not have different sizes anyway.
col_dtype = df.dtypes[key]
@@ -369,8 +374,7 @@ class TableImporter():
for index, row in df.iterrows():
# if none of the relevant information is given, skip
if np.array([pd.isnull(row.loc[key]) for key in self.obligatory_columns
if key not in self.allow_missing_values_in]).all():
if np.array([pd.isnull(row.loc[key]) for key in self.obligatory_columns]).all():
df = df.drop(index)
@@ -454,7 +458,10 @@ class XLSImporter(TableImporter):
"All but the first are being ignored.".format(filename))
try:
df = xls_file.parse(converters=self.converters, **kwargs)
tmpdf = xls_file.parse(**kwargs)
applicable_converters = {k: v for k, v in self.converters.items()
if k in tmpdf.columns}
df = xls_file.parse(converters=applicable_converters, **kwargs)
except Exception as e:
logger.warning(
"Cannot parse {}.\n{}".format(filename, e),
@@ -470,7 +477,11 @@ class XLSImporter(TableImporter):
class CSVImporter(TableImporter):
def read_file(self, filename, sep=",", **kwargs):
try:
df = pd.read_csv(filename, sep=sep, converters=self.converters,
tmpdf = pd.read_csv(filename, sep=sep, converters=self.converters,
**kwargs)
applicable_converters = {k: v for k, v in self.converters.items()
if k in tmpdf.columns}
df = pd.read_csv(filename, sep=sep, converters=applicable_converters,
**kwargs)
except ValueError as ve:
logger.warning(
@@ -487,6 +498,10 @@ class CSVImporter(TableImporter):
class TSVImporter(TableImporter):
def read_file(self, filename, **kwargs):
try:
tmpdf = pd.read_csv(filename, sep="\t", converters=self.converters,
**kwargs)
applicable_converters = {k: v for k, v in self.converters.items()
if k in tmpdf.columns}
df = pd.read_csv(filename, sep="\t", converters=applicable_converters,
**kwargs)
except ValueError as ve:
Loading