Commit 81ccbd95 authored by Henrik tom Wörden


MAINT: change wording of TableImporter argument and allow converters and datatypes for nonexisting columns
parent bd8b9ed3
2 merge requests: !73 MAINT: change wording of TableImporter argument and allow converters and datatypes for nonexisting columns, !70 ENH: allow to provide required columns explicitly
Pipeline #35526 passed
@@ -7,10 +7,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 ## [Unreleased] ##
 ### Added ###
-- TableImporter now accepts a `allow_missing_values_in` argument which allows to have obligatory
-  columns with missing values
+- TableImporter now accepts an `existing_columns` argument which demands that certain columns exist,
+  although they may have missing values
 ### Changed ###
+- The `converters` and `datatypes` arguments of TableImporter may now contain keys for non-existing columns
 ### Deprecated ###
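For illustration, a minimal constructor call using the new keyword. This is a sketch only: the import path `caosadvancedtools.table_importer` and the column names are assumptions for the example, not part of this commit.

```python
# Sketch only: module path and column names are assumed for illustration.
from caosadvancedtools.table_importer import TableImporter

importer = TableImporter(
    converters={"price": float},           # may now name columns that are absent from the file
    datatypes={"name": str, "count": int},
    obligatory_columns=["name", "count"],   # must exist and must not be empty
    existing_columns=["comment"],           # must exist, but may contain missing (NULL) values
)
```

Compared to the removed `allow_missing_values_in`, the new keyword names the columns that merely have to be present, rather than marking obligatory columns that may be empty.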
@@ -210,7 +210,7 @@ class TableImporter():
     """

     def __init__(self, converters, obligatory_columns=None, unique_keys=None,
-                 datatypes=None, allow_missing_values_in=None):
+                 datatypes=None, existing_columns=None):
         """
         Parameters
         ----------
@@ -232,28 +232,31 @@ class TableImporter():
             checked whether they have the provided datatype. This dict also defines what columns are
             required to exist through the existing keys.
-        allow_missing_values_in : list, optional
-            List of (obligatory) column names which may have missing (NULL) values
+        existing_columns : list, optional
+            List of column names that must exist but may have missing (NULL) values
         """
         if converters is None:
             converters = {}
+        self.converters = converters
-        if allow_missing_values_in is None:
-            allow_missing_values_in = []
+        if obligatory_columns is None:
+            obligatory_columns = []
+        self.obligatory_columns = obligatory_columns
+        if unique_keys is None:
+            unique_keys = []
+        self.unique_keys = unique_keys
+        if datatypes is None:
+            datatypes = {}
+        self.datatypes = datatypes
+        if existing_columns is None:
+            existing_columns = []
+        self.existing_columns = existing_columns
         self.sup = SuppressKnown()
-        self.allow_missing_values_in = allow_missing_values_in
-        self.obligatory_columns = ([]
-                                   if obligatory_columns is None
-                                   else obligatory_columns)
-        self.unique_keys = [] if unique_keys is None else unique_keys
-        self.converters = converters
-        self.datatypes = datatypes

     def read_file(self, filename, **kwargs):
         raise NotImplementedError()
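The reorganized assignments above all follow the same `None`-to-empty-container idiom, which avoids Python's mutable-default-argument pitfall. A condensed stand-in for the pattern (not the verbatim class) looks like this:

```python
# Condensed stand-in for the default handling in __init__ above; names mirror the diff.
class InitDefaultsSketch:
    def __init__(self, converters=None, obligatory_columns=None, unique_keys=None,
                 datatypes=None, existing_columns=None):
        self.converters = {} if converters is None else converters
        self.obligatory_columns = [] if obligatory_columns is None else obligatory_columns
        self.unique_keys = [] if unique_keys is None else unique_keys
        self.datatypes = {} if datatypes is None else datatypes
        self.existing_columns = [] if existing_columns is None else existing_columns
```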
@@ -269,7 +272,7 @@ class TableImporter():
         """
-        for col in self.obligatory_columns+list(self.converters.keys())+list(self.datatypes.keys()):
+        for col in self.obligatory_columns+self.existing_columns:
             if col not in df.columns:
                 errmsg = "Column '{}' missing in ".format(col)
                 errmsg += ("\n{}.\n".format(filename) if filename
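With the new loop, `check_columns` only demands the union of `obligatory_columns` and `existing_columns`; a key that appears solely in `converters` or `datatypes` no longer triggers the missing-column error. A standalone sketch of that check (a hypothetical helper, not the class method):

```python
import pandas as pd

def check_columns_sketch(df, obligatory_columns, existing_columns):
    # Mirror of the loop above: every required column must be present in df.
    for col in obligatory_columns + existing_columns:
        if col not in df.columns:
            raise ValueError("Column '{}' missing in the DataFrame.".format(col))

df = pd.DataFrame(columns=["a", "b", "e"])
check_columns_sketch(df, obligatory_columns=["a", "b"], existing_columns=["e"])  # passes
# A converters/datatypes key such as 'x' is no longer checked here.
```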
@@ -329,6 +332,8 @@ class TableImporter():
         """
         for key, datatype in self.datatypes.items():
+            if key not in df.columns:
+                continue
             # Check for castable numeric types first: We unconditionally cast int to the default
             # float, because CaosDB does not have different sizes anyway.
             col_dtype = df.dtypes[key]
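The added guard makes `check_datatype` silently skip `datatypes` entries whose column is not in the table. A small standalone illustration; the int-to-float cast shown here is a simplification of the behaviour described in the comment above:

```python
import numpy as np
import pandas as pd

def check_datatype_sketch(df, datatypes):
    for key, expected in datatypes.items():
        if key not in df.columns:
            continue  # new behaviour: tolerate entries for non-existing columns
        if expected is float and np.issubdtype(df.dtypes[key], np.integer):
            df[key] = df[key].astype(float)  # ints are acceptable where float is requested

df = pd.DataFrame({"a": ["foo"], "b": [1]})
check_datatype_sketch(df, {"b": float, "x": int})  # 'x' is absent and simply skipped
assert df["b"].dtype == float
```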
@@ -369,8 +374,7 @@ class TableImporter():
         for index, row in df.iterrows():
             # if none of the relevant information is given, skip
-            if np.array([pd.isnull(row.loc[key]) for key in self.obligatory_columns
-                         if key not in self.allow_missing_values_in]).all():
+            if np.array([pd.isnull(row.loc[key]) for key in self.obligatory_columns]).all():
                 df = df.drop(index)
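After this change a row is only skipped when every obligatory column is empty; the former `allow_missing_values_in` filter is gone. A standalone illustration of the dropping logic:

```python
import numpy as np
import pandas as pd

def drop_empty_rows_sketch(df, obligatory_columns):
    # Mirror of the loop above: drop rows whose obligatory columns are all NULL.
    for index, row in df.iterrows():
        if np.array([pd.isnull(row.loc[key]) for key in obligatory_columns]).all():
            df = df.drop(index)
    return df

df = pd.DataFrame([["a", 1.0], [np.nan, np.nan], [np.nan, 2.0]], columns=["a", "b"])
df = drop_empty_rows_sketch(df, obligatory_columns=["a", "b"])
assert len(df) == 2  # only the completely empty middle row was dropped
```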
@@ -454,7 +458,10 @@ class XLSImporter(TableImporter):
                 "All but the first are being ignored.".format(filename))
         try:
-            df = xls_file.parse(converters=self.converters, **kwargs)
+            tmpdf = xls_file.parse(**kwargs)
+            applicable_converters = {k: v for k, v in self.converters.items()
+                                     if k in tmpdf.columns}
+            df = xls_file.parse(converters=applicable_converters, **kwargs)
         except Exception as e:
             logger.warning(
                 "Cannot parse {}.\n{}".format(filename, e),
@@ -470,7 +477,11 @@
 class CSVImporter(TableImporter):
     def read_file(self, filename, sep=",", **kwargs):
         try:
-            df = pd.read_csv(filename, sep=sep, converters=self.converters,
+            tmpdf = pd.read_csv(filename, sep=sep, converters=self.converters,
+                                **kwargs)
+            applicable_converters = {k: v for k, v in self.converters.items()
+                                     if k in tmpdf.columns}
+            df = pd.read_csv(filename, sep=sep, converters=applicable_converters,
                              **kwargs)
         except ValueError as ve:
             logger.warning(
@@ -487,6 +498,10 @@
 class TSVImporter(TableImporter):
     def read_file(self, filename, **kwargs):
         try:
+            tmpdf = pd.read_csv(filename, sep="\t", converters=self.converters,
+                                **kwargs)
+            applicable_converters = {k: v for k, v in self.converters.items()
+                                     if k in tmpdf.columns}
             df = pd.read_csv(filename, sep="\t", converters=self.converters,
                              **kwargs)
         except ValueError as ve:
@@ -144,21 +144,21 @@ class ConverterTest(unittest.TestCase):
 class TableImporterTest(unittest.TestCase):
     def setUp(self):
         self.importer_kwargs = dict(
-            converters={'c': float, 'd': yes_no_converter},
-            datatypes={'a': str, 'b': int},
+            converters={'c': float, 'd': yes_no_converter, 'x': float},  # x does not exist
+            datatypes={'a': str, 'b': int, 'x': int},  # x does not exist
             obligatory_columns=['a', 'b'], unique_keys=[('a', 'b')],
-            allow_missing_values_in=['e'],
+            existing_columns=['e'],
         )
         self.valid_df = pd.DataFrame(
             [['a', 1, 2.0, 'yes', np.nan]], columns=['a', 'b', 'c', 'd', 'e'])

     def test_missing_col(self):
-        # check missing from converters
-        df = pd.DataFrame(columns=['a', 'b', 'c'])
+        # check missing from obligatory
+        df = pd.DataFrame(columns=['a', 'e'])
         importer = TableImporter(**self.importer_kwargs)
         self.assertRaises(ValueError, importer.check_columns, df)
-        # check missing from datatypes
-        df = pd.DataFrame(columns=['a', 'd', 'c'])
+        # check missing from existing
+        df = pd.DataFrame(columns=['a', 'b'])
         importer = TableImporter(**self.importer_kwargs)
         self.assertRaises(ValueError, importer.check_columns, df)
         # check valid
@@ -186,14 +186,6 @@ class TableImporterTest(unittest.TestCase):
                           columns=['a', 'b', 'c', 'd'])
         self.assertRaises(DataInconsistencyError, importer.check_datatype, df)

-    def test_allow_missing(self):
-        importer = TableImporter(**self.importer_kwargs)
-        importer.check_missing(self.valid_df)
-        df = pd.DataFrame([['b', np.nan, 3.0, 'no'], ['b', 5, 3.0, 'no']],
-                          columns=['a', 'b', 'c', 'd'])
-        df_new = importer.check_unique(df)
-        self.assertEqual(df_new.shape[0], 2)
-
     def test_unique(self):
         importer = TableImporter(**self.importer_kwargs)
         importer.check_missing(self.valid_df)