Skip to content
Snippets Groups Projects

ENH: allow to provide required columns explicitly

Merged Henrik tom Wörden requested to merge f-required into dev
Files
3
@@ -210,7 +210,7 @@ class TableImporter():
"""
def __init__(self, converters, obligatory_columns=None, unique_keys=None,
datatypes=None):
datatypes=None, existing_columns=None):
"""
Parameters
----------
@@ -221,7 +221,7 @@ class TableImporter():
value check is not necessary.
obligatory_columns : list, optional
List of column names, each listed column must not have missing values.
List of column names that (if they exist) must not have missing values.
unique_keys : list, optional
List of column names that in combination must be unique: each row has a unique
@@ -232,22 +232,31 @@ class TableImporter():
checked whether they have the provided datatype. This dict also defines what columns are
required to exist through the existing keys.
existing_columns : list, optional
List of column names that must exist but may have missing (NULL) values
"""
if converters is None:
converters = {}
self.converters = converters
if obligatory_columns is None:
obligatory_columns = []
self.obligatory_columns = obligatory_columns
if unique_keys is None:
unique_keys = []
self.unique_keys = unique_keys
if datatypes is None:
datatypes = {}
self.datatypes = datatypes
if existing_columns is None:
existing_columns = []
self.existing_columns = existing_columns
self.sup = SuppressKnown()
self.required_columns = list(converters.keys())+list(datatypes.keys())
self.obligatory_columns = ([]
if obligatory_columns is None
else obligatory_columns)
self.unique_keys = [] if unique_keys is None else unique_keys
self.converters = converters
self.datatypes = datatypes
def read_file(self, filename, **kwargs):
raise NotImplementedError()
@@ -263,7 +272,7 @@ class TableImporter():
"""
for col in self.required_columns:
for col in self.existing_columns:
if col not in df.columns:
errmsg = "Column '{}' missing in ".format(col)
errmsg += ("\n{}.\n".format(filename) if filename
@@ -323,6 +332,8 @@ class TableImporter():
"""
for key, datatype in self.datatypes.items():
if key not in df.columns:
continue
# Check for castable numeric types first: We unconditionally cast int to the default
# float, because CaosDB does not have different sizes anyway.
col_dtype = df.dtypes[key]
@@ -333,8 +344,7 @@ class TableImporter():
df[key] = df[key].astype(datatype)
# Now check each element
for idx, val in df.loc[
pd.notnull(df.loc[:, key]), key].items():
for idx, val in df.loc[pd.notnull(df.loc[:, key]), key].items():
if not isinstance(val, datatype):
msg = (
@@ -363,22 +373,20 @@ class TableImporter():
for index, row in df.iterrows():
# if none of the relevant information is given, skip
if np.array([pd.isnull(row.loc[key]) for key in
self.obligatory_columns]).all():
if pd.isnull(row.loc[[key for key in self.obligatory_columns if key in df.columns]]).all():
df = df.drop(index)
continue
# if any of the relevant information is missing, report it
i = 0
okay = True
while okay and i < len(self.obligatory_columns):
key = self.obligatory_columns[i]
i += 1
if key not in df.columns:
continue
if pd.isnull(row.loc[key]):
errmsg = (
@@ -449,7 +457,10 @@ class XLSImporter(TableImporter):
"All but the first are being ignored.".format(filename))
try:
df = xls_file.parse(converters=self.converters, **kwargs)
tmpdf = xls_file.parse(**kwargs)
applicable_converters = {k: v for k, v in self.converters.items()
if k in tmpdf.columns}
df = xls_file.parse(converters=applicable_converters, **kwargs)
except Exception as e:
logger.warning(
"Cannot parse {}.\n{}".format(filename, e),
@@ -465,7 +476,11 @@ class XLSImporter(TableImporter):
class CSVImporter(TableImporter):
def read_file(self, filename, sep=",", **kwargs):
try:
df = pd.read_csv(filename, sep=sep, converters=self.converters,
tmpdf = pd.read_csv(filename, sep=sep, converters=self.converters,
**kwargs)
applicable_converters = {k: v for k, v in self.converters.items()
if k in tmpdf.columns}
df = pd.read_csv(filename, sep=sep, converters=applicable_converters,
**kwargs)
except ValueError as ve:
logger.warning(
@@ -482,6 +497,10 @@ class CSVImporter(TableImporter):
class TSVImporter(TableImporter):
def read_file(self, filename, **kwargs):
try:
tmpdf = pd.read_csv(filename, sep="\t", converters=self.converters,
**kwargs)
applicable_converters = {k: v for k, v in self.converters.items()
if k in tmpdf.columns}
                df = pd.read_csv(filename, sep="\t", converters=applicable_converters,
**kwargs)
except ValueError as ve:
Loading