Commit db1ccf4b authored by Florian Spreckelsen

Merge branch 'f-required' into 'dev'

ENH: allow to provide required columns explicitly

See merge request !70
parents f986cea4 a9bab585
Related merge requests: !73 (MAINT: change wording of TableImporter argument and allow converters and...), !70 (ENH: allow to provide required columns explicitly)
Pipeline #37085 passed
@@ -7,8 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 ## [Unreleased] ##
 ### Added ###
+- TableImporter now accepts an `existing_columns` argument which demands that certain columns exist
 ### Changed ###
+- The `converters` and `datatypes` arguments of TableImporter may now have keys for non-existing columns
 ### Deprecated ###
......
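For orientation, here is a minimal usage sketch of the new keyword. The argument values mirror the test fixture added at the end of this diff; the import path comes from the test file's imports.

```python
from caosadvancedtools.table_importer import TableImporter

# Keys in `converters`/`datatypes` may now name columns that are absent from
# the table; only `existing_columns` entries must be present (their values
# may still be NULL).
importer = TableImporter(
    converters={'c': float, 'x': float},  # 'x' need not exist in the file
    datatypes={'a': str, 'b': int},
    obligatory_columns=['a', 'b'],        # if present, must not be empty
    existing_columns=['e'],               # must exist, NULL values allowed
)
```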
@@ -210,7 +210,7 @@ class TableImporter():
     """
     def __init__(self, converters, obligatory_columns=None, unique_keys=None,
-                 datatypes=None):
+                 datatypes=None, existing_columns=None):
         """
         Parameters
         ----------
@@ -221,7 +221,7 @@
            value check is not necessary.
        obligatory_columns : list, optional
-           List of column names, each listed column must not have missing values.
+           List of column names that (if they exist) must not have missing values.
        unique_keys : list, optional
            List of column names that in combination must be unique: each row has a unique
@@ -232,22 +232,31 @@
            checked whether they have the provided datatype. This dict also defines what columns are
            required to exist through the existing keys.
+       existing_columns : list, optional
+           List of column names that must exist but may have missing (NULL) values
        """
+        if converters is None:
+            converters = {}
+        self.converters = converters
+        if obligatory_columns is None:
+            obligatory_columns = []
+        self.obligatory_columns = obligatory_columns
+        if unique_keys is None:
+            unique_keys = []
+        self.unique_keys = unique_keys
+        if datatypes is None:
+            datatypes = {}
+        self.datatypes = datatypes
+        if existing_columns is None:
+            existing_columns = []
+        self.existing_columns = existing_columns
         self.sup = SuppressKnown()
-        self.required_columns = list(converters.keys())+list(datatypes.keys())
-        self.obligatory_columns = ([]
-                                   if obligatory_columns is None
-                                   else obligatory_columns)
-        self.unique_keys = [] if unique_keys is None else unique_keys
-        self.converters = converters
-        self.datatypes = datatypes

     def read_file(self, filename, **kwargs):
         raise NotImplementedError()
@@ -263,7 +272,7 @@ class TableImporter():
        """
-       for col in self.required_columns:
+       for col in self.existing_columns:
            if col not in df.columns:
                errmsg = "Column '{}' missing in ".format(col)
                errmsg += ("\n{}.\n".format(filename) if filename
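A self-contained sketch of the resulting behavior, modeled on the updated unit test below (the `DataInconsistencyError` import path is an assumption):

```python
import pandas as pd
from caosadvancedtools.table_importer import TableImporter
from caosadvancedtools.datainconsistency import DataInconsistencyError  # assumed path

importer = TableImporter(converters={}, existing_columns=['e'])
df = pd.DataFrame(columns=['a', 'b'])  # 'e' is missing entirely
try:
    importer.check_columns(df)
except DataInconsistencyError as err:
    print(err)  # "Column 'e' missing ..."
```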
@@ -323,6 +332,8 @@ class TableImporter():
        """
        for key, datatype in self.datatypes.items():
+           if key not in df.columns:
+               continue
            # Check for castable numeric types first: We unconditionally cast int to the default
            # float, because CaosDB does not have different sizes anyway.
            col_dtype = df.dtypes[key]
@@ -333,8 +344,7 @@ class TableImporter():
                df[key] = df[key].astype(datatype)
            # Now check each element
-           for idx, val in df.loc[
-                   pd.notnull(df.loc[:, key]), key].items():
+           for idx, val in df.loc[pd.notnull(df.loc[:, key]), key].items():
                if not isinstance(val, datatype):
                    msg = (
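To illustrate the casting comment above (a standalone pandas sketch, not library code): integer columns can be widened to float before the per-element `isinstance()` check, since CaosDB only has one float size.

```python
import pandas as pd

df = pd.DataFrame({'b': [1, 2]})  # int64 column
df['b'] = df['b'].astype(float)   # lossless widening for these values
# After the cast, every non-null element passes an isinstance(val, float) check.
assert all(isinstance(val, float)
           for val in df.loc[pd.notnull(df.loc[:, 'b']), 'b'])
```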
@@ -363,22 +373,20 @@ class TableImporter():
        for index, row in df.iterrows():
            # if none of the relevant information is given, skip
-           if np.array([pd.isnull(row.loc[key]) for key in
-                        self.obligatory_columns]).all():
+           if pd.isnull(row.loc[[key for key in self.obligatory_columns
+                                 if key in df.columns]]).all():
                df = df.drop(index)
                continue
            # if any of the relevant information is missing, report it
            i = 0
            okay = True
            while okay and i < len(self.obligatory_columns):
                key = self.obligatory_columns[i]
                i += 1
+               if key not in df.columns:
+                   continue
                if pd.isnull(row.loc[key]):
                    errmsg = (
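The net effect, mirrored from the assertions of the new test at the bottom of this diff: a row that lacks an obligatory value is dropped with a logged warning instead of aborting the import.

```python
import numpy as np
import pandas as pd
from caosadvancedtools.table_importer import TableImporter

importer = TableImporter(converters={}, obligatory_columns=['a', 'b'])
df = pd.DataFrame([[np.nan, 1]], columns=['a', 'b'])
new_df = importer.check_dataframe(df)  # 'a' is obligatory but has no value
assert new_df.shape[0] == 0            # the offending row was dropped
```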
@@ -449,7 +457,10 @@ class XLSImporter(TableImporter):
                "All but the first are being ignored.".format(filename))
        try:
-           df = xls_file.parse(converters=self.converters, **kwargs)
+           tmpdf = xls_file.parse(**kwargs)
+           applicable_converters = {k: v for k, v in self.converters.items()
+                                    if k in tmpdf.columns}
+           df = xls_file.parse(converters=applicable_converters, **kwargs)
        except Exception as e:
            logger.warning(
                "Cannot parse {}.\n{}".format(filename, e),
@@ -465,7 +476,11 @@
class CSVImporter(TableImporter):
    def read_file(self, filename, sep=",", **kwargs):
        try:
-           df = pd.read_csv(filename, sep=sep, converters=self.converters,
-                            **kwargs)
+           tmpdf = pd.read_csv(filename, sep=sep, converters=self.converters,
+                               **kwargs)
+           applicable_converters = {k: v for k, v in self.converters.items()
+                                    if k in tmpdf.columns}
+           df = pd.read_csv(filename, sep=sep, converters=applicable_converters,
+                            **kwargs)
        except ValueError as ve:
            logger.warning(
@@ -482,6 +497,10 @@ class CSVImporter(TableImporter):
class TSVImporter(TableImporter):
    def read_file(self, filename, **kwargs):
        try:
-           df = pd.read_csv(filename, sep="\t", converters=self.converters,
-                            **kwargs)
+           tmpdf = pd.read_csv(filename, sep="\t", converters=self.converters,
+                               **kwargs)
+           applicable_converters = {k: v for k, v in self.converters.items()
+                                    if k in tmpdf.columns}
+           df = pd.read_csv(filename, sep="\t", converters=applicable_converters,
+                            **kwargs)
        except ValueError as ve:
......
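All three importers now share the same two-pass pattern: parse once to discover which columns actually exist, then parse again with only the applicable converters. A generic sketch of that pattern (the helper name is illustrative, not part of the library); the second read is the price paid for never handing pandas a converter for an absent column.

```python
import pandas as pd

def read_with_applicable_converters(filename, converters, sep=","):
    """Illustrative helper: apply only those converters whose column exists."""
    tmpdf = pd.read_csv(filename, sep=sep)  # first pass: discover columns
    applicable = {k: v for k, v in converters.items()
                  if k in tmpdf.columns}
    return pd.read_csv(filename, sep=sep, converters=applicable)
```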
@@ -41,6 +41,16 @@ from caosadvancedtools.table_importer import (CSVImporter, TableImporter,
 from test_utils import BaseMockUpTest

+# For testing the table importer
+IMPORTER_KWARGS = dict(
+    converters={'c': float, 'd': yes_no_converter, 'x': float},  # x does not exist
+    datatypes={'a': str, 'b': int, 'x': int},  # x does not exist
+    obligatory_columns=['a', 'b'], unique_keys=[('a', 'b')],
+    existing_columns=['e'],
+)
+VALID_DF = pd.DataFrame(
+    [['a', 1, 2.0, 'yes', np.nan]], columns=['a', 'b', 'c', 'd', 'e'])

 class ConverterTest(unittest.TestCase):
     def test_yes_no(self):
@@ -143,22 +153,16 @@ class ConverterTest(unittest.TestCase):
 class TableImporterTest(unittest.TestCase):
     def setUp(self):
-        self.importer_kwargs = dict(
-            converters={'c': float, 'd': yes_no_converter},
-            datatypes={'a': str, 'b': int},
-            obligatory_columns=['a', 'b'], unique_keys=[('a', 'b')])
-        self.valid_df = pd.DataFrame(
-            [['a', 1, 2.0, 'yes']], columns=['a', 'b', 'c', 'd'])
+        self.importer_kwargs = IMPORTER_KWARGS
+        self.valid_df = VALID_DF

     def test_missing_col(self):
-        # check missing from converters
-        df = pd.DataFrame(columns=['a', 'b', 'c'])
-        importer = TableImporter(**self.importer_kwargs)
-        self.assertRaises(ValueError, importer.check_columns, df)
-        # check missing from datatypes
-        df = pd.DataFrame(columns=['a', 'd', 'c'])
+        # check missing from existing
+        df = pd.DataFrame(columns=['a', 'b'])
         importer = TableImporter(**self.importer_kwargs)
-        self.assertRaises(ValueError, importer.check_columns, df)
+        with pytest.raises(DataInconsistencyError) as die:
+            importer.check_columns(df)
+        assert "Column 'e' missing" in str(die.value)
         # check valid
         importer.check_columns(self.valid_df)
@@ -193,6 +197,35 @@ class TableImporterTest(unittest.TestCase):
         self.assertEqual(df_new.shape[0], 1)

+def test_check_dataframe_existing_obligatory_columns(caplog):
+    """Needs caplog so remove from above class."""
+    # stricter test case; column 'a' must exist and have a value
+    strict_kwargs = IMPORTER_KWARGS.copy()
+    strict_kwargs["existing_columns"].append('a')
+    importer = TableImporter(**strict_kwargs)
+    # the valid df is still valid, since 'a' has a value
+    importer.check_dataframe(VALID_DF)
+    # Now 'a' doesn't
+    df_missing_a = pd.DataFrame(
+        [[np.nan, 1, 2.0, 'yes', 'e']], columns=['a', 'b', 'c', 'd', 'e'])
+    new_df = importer.check_dataframe(df_missing_a)
+    # The row is removed and a warning appears in the log:
+    assert new_df.shape[0] == 0
+    assert "Required information is missing (a) in 1. row" in caplog.text
+    df_missing_c = pd.DataFrame(
+        [['a', 1, 'yes', np.nan]], columns=['a', 'b', 'd', 'e'])
+    new_df = importer.check_dataframe(df_missing_c)
+    assert new_df.shape[0] == 1
+    assert new_df.shape[1] == 4
+    caplog.clear()
 class XLSImporterTest(TableImporterTest):
     def test_full(self):
         """ test full run with example data """
......