Skip to content
Snippets Groups Projects

ENH: allow to provide required columns explicitly

Merged Henrik tom Wörden requested to merge f-required into dev
3 files
+ 43
35
Compare changes
  • Side-by-side
  • Inline
Files
3
@@ -210,7 +210,7 @@ class TableImporter():
"""
def __init__(self, converters, obligatory_columns=None, unique_keys=None,
datatypes=None, allow_missing_values_in=None):
datatypes=None, existing_columns=None):
"""
Parameters
----------
@@ -232,28 +232,31 @@ class TableImporter():
checked whether they have the provided datatype. This dict also defines what columns are
required to exist through the existing keys.
allow_missing_values_in : list, optional
List of (obligatory) column names which may have missing (NULL) values
existing_columns : list, optional
List of column names that must exist but may have missing (NULL) values
"""
if converters is None:
converters = {}
self.converters = converters
if allow_missing_values_in is None:
allow_missing_values_in = []
if obligatory_columns is None:
obligatory_columns = []
self.obligatory_columns = obligatory_columns
if unique_keys is None:
unique_keys = []
self.unique_keys = unique_keys
if datatypes is None:
datatypes = {}
self.datatypes = datatypes
if existing_columns is None:
existing_columns = []
self.existing_columns = existing_columns
self.sup = SuppressKnown()
self.allow_missing_values_in = allow_missing_values_in
self.obligatory_columns = ([]
if obligatory_columns is None
else obligatory_columns)
self.unique_keys = [] if unique_keys is None else unique_keys
self.converters = converters
self.datatypes = datatypes
def read_file(self, filename, **kwargs):
raise NotImplementedError()
@@ -269,7 +272,7 @@ class TableImporter():
"""
for col in self.obligatory_columns+list(self.converters.keys())+list(self.datatypes.keys()):
for col in self.obligatory_columns+self.existing_columns:
if col not in df.columns:
errmsg = "Column '{}' missing in ".format(col)
errmsg += ("\n{}.\n".format(filename) if filename
@@ -329,6 +332,8 @@ class TableImporter():
"""
for key, datatype in self.datatypes.items():
if key not in df.columns:
continue
# Check for castable numeric types first: We unconditionally cast int to the default
# float, because CaosDB does not have different sizes anyway.
col_dtype = df.dtypes[key]
@@ -369,8 +374,7 @@ class TableImporter():
for index, row in df.iterrows():
# if none of the relevant information is given, skip
if np.array([pd.isnull(row.loc[key]) for key in self.obligatory_columns
if key not in self.allow_missing_values_in]).all():
if np.array([pd.isnull(row.loc[key]) for key in self.obligatory_columns]).all():
df = df.drop(index)
@@ -454,7 +458,10 @@ class XLSImporter(TableImporter):
"All but the first are being ignored.".format(filename))
try:
df = xls_file.parse(converters=self.converters, **kwargs)
tmpdf = xls_file.parse(**kwargs)
applicable_converters = {k: v for k, v in self.converters.items()
if k in tmpdf.columns}
df = xls_file.parse(converters=applicable_converters, **kwargs)
except Exception as e:
logger.warning(
"Cannot parse {}.\n{}".format(filename, e),
@@ -470,7 +477,11 @@ class XLSImporter(TableImporter):
class CSVImporter(TableImporter):
def read_file(self, filename, sep=",", **kwargs):
try:
df = pd.read_csv(filename, sep=sep, converters=self.converters,
tmpdf = pd.read_csv(filename, sep=sep, converters=self.converters,
**kwargs)
applicable_converters = {k: v for k, v in self.converters.items()
if k in tmpdf.columns}
df = pd.read_csv(filename, sep=sep, converters=applicable_converters,
**kwargs)
except ValueError as ve:
logger.warning(
@@ -487,6 +498,10 @@ class CSVImporter(TableImporter):
class TSVImporter(TableImporter):
def read_file(self, filename, **kwargs):
try:
tmpdf = pd.read_csv(filename, sep="\t", converters=self.converters,
**kwargs)
applicable_converters = {k: v for k, v in self.converters.items()
if k in tmpdf.columns}
df = pd.read_csv(filename, sep="\t", converters=applicable_converters,
**kwargs)
except ValueError as ve:
Loading