Skip to content
Snippets Groups Projects
Commit db1ccf4b authored by Florian Spreckelsen's avatar Florian Spreckelsen
Browse files

Merge branch 'f-required' into 'dev'

ENH: allow to provide required columns explicitly

See merge request !70
parents f986cea4 a9bab585
No related branches found
No related tags found
2 merge requests!73MAINT: change wording of TableImporter argument and allow converters and...,!70ENH: allow to provide required columns explicitly
Pipeline #37085 passed
...@@ -7,8 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ...@@ -7,8 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
## [Unreleased] ## ## [Unreleased] ##
### Added ### ### Added ###
- TableImporter now accepts an `existing_columns` argument which demands that certain columns exist
### Changed ### ### Changed ###
- The converters and datatype arguments of TableImporter now may have keys for nonexisting columns
### Deprecated ### ### Deprecated ###
......
...@@ -210,7 +210,7 @@ class TableImporter(): ...@@ -210,7 +210,7 @@ class TableImporter():
""" """
def __init__(self, converters, obligatory_columns=None, unique_keys=None, def __init__(self, converters, obligatory_columns=None, unique_keys=None,
datatypes=None): datatypes=None, existing_columns=None):
""" """
Parameters Parameters
---------- ----------
...@@ -221,7 +221,7 @@ class TableImporter(): ...@@ -221,7 +221,7 @@ class TableImporter():
value check is not necessary. value check is not necessary.
obligatory_columns : list, optional obligatory_columns : list, optional
List of column names, each listed column must not have missing values. List of column names that (if they exist) must not have missing values.
unique_keys : list, optional unique_keys : list, optional
List of column names that in combination must be unique: each row has a unique List of column names that in combination must be unique: each row has a unique
...@@ -232,22 +232,31 @@ class TableImporter(): ...@@ -232,22 +232,31 @@ class TableImporter():
checked whether they have the provided datatype. This dict also defines what columns are checked whether they have the provided datatype. This dict also defines what columns are
required to exist through the existing keys. required to exist through the existing keys.
existing_columns : list, optional
List of column names that must exist but may have missing (NULL) values
""" """
if converters is None: if converters is None:
converters = {} converters = {}
self.converters = converters
if obligatory_columns is None:
obligatory_columns = []
self.obligatory_columns = obligatory_columns
if unique_keys is None:
unique_keys = []
self.unique_keys = unique_keys
if datatypes is None: if datatypes is None:
datatypes = {} datatypes = {}
self.datatypes = datatypes
if existing_columns is None:
existing_columns = []
self.existing_columns = existing_columns
self.sup = SuppressKnown() self.sup = SuppressKnown()
self.required_columns = list(converters.keys())+list(datatypes.keys())
self.obligatory_columns = ([]
if obligatory_columns is None
else obligatory_columns)
self.unique_keys = [] if unique_keys is None else unique_keys
self.converters = converters
self.datatypes = datatypes
def read_file(self, filename, **kwargs): def read_file(self, filename, **kwargs):
raise NotImplementedError() raise NotImplementedError()
...@@ -263,7 +272,7 @@ class TableImporter(): ...@@ -263,7 +272,7 @@ class TableImporter():
""" """
for col in self.required_columns: for col in self.existing_columns:
if col not in df.columns: if col not in df.columns:
errmsg = "Column '{}' missing in ".format(col) errmsg = "Column '{}' missing in ".format(col)
errmsg += ("\n{}.\n".format(filename) if filename errmsg += ("\n{}.\n".format(filename) if filename
...@@ -323,6 +332,8 @@ class TableImporter(): ...@@ -323,6 +332,8 @@ class TableImporter():
""" """
for key, datatype in self.datatypes.items(): for key, datatype in self.datatypes.items():
if key not in df.columns:
continue
# Check for castable numeric types first: We unconditionally cast int to the default # Check for castable numeric types first: We unconditionally cast int to the default
# float, because CaosDB does not have different sizes anyway. # float, because CaosDB does not have different sizes anyway.
col_dtype = df.dtypes[key] col_dtype = df.dtypes[key]
...@@ -333,8 +344,7 @@ class TableImporter(): ...@@ -333,8 +344,7 @@ class TableImporter():
df[key] = df[key].astype(datatype) df[key] = df[key].astype(datatype)
# Now check each element # Now check each element
for idx, val in df.loc[ for idx, val in df.loc[pd.notnull(df.loc[:, key]), key].items():
pd.notnull(df.loc[:, key]), key].items():
if not isinstance(val, datatype): if not isinstance(val, datatype):
msg = ( msg = (
...@@ -363,22 +373,20 @@ class TableImporter(): ...@@ -363,22 +373,20 @@ class TableImporter():
for index, row in df.iterrows(): for index, row in df.iterrows():
# if none of the relevant information is given, skip # if none of the relevant information is given, skip
if pd.isnull(row.loc[[key for key in self.obligatory_columns if key in df.columns]]).all():
if np.array([pd.isnull(row.loc[key]) for key in
self.obligatory_columns]).all():
df = df.drop(index) df = df.drop(index)
continue continue
# if any of the relevant information is missing, report it # if any of the relevant information is missing, report it
i = 0 i = 0
okay = True okay = True
while okay and i < len(self.obligatory_columns): while okay and i < len(self.obligatory_columns):
key = self.obligatory_columns[i] key = self.obligatory_columns[i]
i += 1 i += 1
if key not in df.columns:
continue
if pd.isnull(row.loc[key]): if pd.isnull(row.loc[key]):
errmsg = ( errmsg = (
...@@ -449,7 +457,10 @@ class XLSImporter(TableImporter): ...@@ -449,7 +457,10 @@ class XLSImporter(TableImporter):
"All but the first are being ignored.".format(filename)) "All but the first are being ignored.".format(filename))
try: try:
df = xls_file.parse(converters=self.converters, **kwargs) tmpdf = xls_file.parse(**kwargs)
applicable_converters = {k: v for k, v in self.converters.items()
if k in tmpdf.columns}
df = xls_file.parse(converters=applicable_converters, **kwargs)
except Exception as e: except Exception as e:
logger.warning( logger.warning(
"Cannot parse {}.\n{}".format(filename, e), "Cannot parse {}.\n{}".format(filename, e),
...@@ -465,7 +476,11 @@ class XLSImporter(TableImporter): ...@@ -465,7 +476,11 @@ class XLSImporter(TableImporter):
class CSVImporter(TableImporter): class CSVImporter(TableImporter):
def read_file(self, filename, sep=",", **kwargs): def read_file(self, filename, sep=",", **kwargs):
try: try:
df = pd.read_csv(filename, sep=sep, converters=self.converters, tmpdf = pd.read_csv(filename, sep=sep, converters=self.converters,
**kwargs)
applicable_converters = {k: v for k, v in self.converters.items()
if k in tmpdf.columns}
df = pd.read_csv(filename, sep=sep, converters=applicable_converters,
**kwargs) **kwargs)
except ValueError as ve: except ValueError as ve:
logger.warning( logger.warning(
...@@ -482,6 +497,10 @@ class CSVImporter(TableImporter): ...@@ -482,6 +497,10 @@ class CSVImporter(TableImporter):
class TSVImporter(TableImporter): class TSVImporter(TableImporter):
def read_file(self, filename, **kwargs): def read_file(self, filename, **kwargs):
try: try:
tmpdf = pd.read_csv(filename, sep="\t", converters=self.converters,
**kwargs)
applicable_converters = {k: v for k, v in self.converters.items()
if k in tmpdf.columns}
df = pd.read_csv(filename, sep="\t", converters=self.converters, df = pd.read_csv(filename, sep="\t", converters=self.converters,
**kwargs) **kwargs)
except ValueError as ve: except ValueError as ve:
......
...@@ -41,6 +41,16 @@ from caosadvancedtools.table_importer import (CSVImporter, TableImporter, ...@@ -41,6 +41,16 @@ from caosadvancedtools.table_importer import (CSVImporter, TableImporter,
from test_utils import BaseMockUpTest from test_utils import BaseMockUpTest
# Shared fixtures for the TableImporter tests below.
# NOTE: converters/datatypes deliberately contain a key 'x' for a column that
# never exists in the data -- per the changelog, the importer must tolerate
# such extra keys instead of treating them as required columns.
IMPORTER_KWARGS = dict(
    converters={'c': float, 'd': yes_no_converter, 'x': float},  # x does not exist
    datatypes={'a': str, 'b': int, 'x': int},  # x does not exist
    obligatory_columns=['a', 'b'], unique_keys=[('a', 'b')],
    existing_columns=['e'],  # must exist, but may hold missing (NULL) values
)
# One-row frame that satisfies every constraint in IMPORTER_KWARGS.
VALID_DF = pd.DataFrame(
    [['a', 1, 2.0, 'yes', np.nan]], columns=['a', 'b', 'c', 'd', 'e'])
class ConverterTest(unittest.TestCase): class ConverterTest(unittest.TestCase):
def test_yes_no(self): def test_yes_no(self):
...@@ -143,22 +153,16 @@ class ConverterTest(unittest.TestCase): ...@@ -143,22 +153,16 @@ class ConverterTest(unittest.TestCase):
class TableImporterTest(unittest.TestCase): class TableImporterTest(unittest.TestCase):
def setUp(self): def setUp(self):
self.importer_kwargs = dict( self.importer_kwargs = IMPORTER_KWARGS
converters={'c': float, 'd': yes_no_converter}, self.valid_df = VALID_DF
datatypes={'a': str, 'b': int},
obligatory_columns=['a', 'b'], unique_keys=[('a', 'b')])
self.valid_df = pd.DataFrame(
[['a', 1, 2.0, 'yes']], columns=['a', 'b', 'c', 'd'])
def test_missing_col(self): def test_missing_col(self):
# check missing from converters # check missing from existing
df = pd.DataFrame(columns=['a', 'b', 'c']) df = pd.DataFrame(columns=['a', 'b'])
importer = TableImporter(**self.importer_kwargs)
self.assertRaises(ValueError, importer.check_columns, df)
# check missing from datatypes
df = pd.DataFrame(columns=['a', 'd', 'c'])
importer = TableImporter(**self.importer_kwargs) importer = TableImporter(**self.importer_kwargs)
self.assertRaises(ValueError, importer.check_columns, df) with pytest.raises(DataInconsistencyError) as die:
importer.check_columns(df)
assert "Column 'e' missing" in str(die.value)
# check valid # check valid
importer.check_columns(self.valid_df) importer.check_columns(self.valid_df)
...@@ -193,6 +197,35 @@ class TableImporterTest(unittest.TestCase): ...@@ -193,6 +197,35 @@ class TableImporterTest(unittest.TestCase):
self.assertEqual(df_new.shape[0], 1) self.assertEqual(df_new.shape[0], 1)
def test_check_dataframe_existing_obligatory_columns(caplog):
    """Check interaction of ``existing_columns`` with ``obligatory_columns``.

    A column listed in both must exist *and* have a value in every row; rows
    where the value is missing are dropped with a warning.  Needs the pytest
    ``caplog`` fixture, hence a plain function instead of a method on the
    unittest class above.
    """
    # Stricter test case: column 'a' must exist and have a value.
    # BUGFIX: dict.copy() is shallow, so appending to the copied dict's
    # 'existing_columns' list would mutate the module-level IMPORTER_KWARGS
    # shared with TableImporterTest.setUp (which aliases the same dict).
    # Rebind the key to a fresh list instead of mutating in place.
    strict_kwargs = IMPORTER_KWARGS.copy()
    strict_kwargs["existing_columns"] = strict_kwargs["existing_columns"] + ['a']
    importer = TableImporter(**strict_kwargs)

    # The valid df is still valid, since 'a' has a value.
    importer.check_dataframe(VALID_DF)

    # Now 'a' doesn't.
    df_missing_a = pd.DataFrame(
        [[np.nan, 1, 2.0, 'yes', 'e']], columns=['a', 'b', 'c', 'd', 'e'])
    new_df = importer.check_dataframe(df_missing_a)
    # The offending row is removed and a warning is in the logger:
    assert new_df.shape[0] == 0
    assert "Required information is missing (a) in 1. row" in caplog.text

    # A frame without the optional column 'c' is tolerated: nothing dropped.
    df_missing_c = pd.DataFrame(
        [['a', 1, 'yes', np.nan]], columns=['a', 'b', 'd', 'e'])
    new_df = importer.check_dataframe(df_missing_c)
    assert new_df.shape[0] == 1
    assert new_df.shape[1] == 4

    caplog.clear()
class XLSImporterTest(TableImporterTest): class XLSImporterTest(TableImporterTest):
def test_full(self): def test_full(self):
""" test full run with example data """ """ test full run with example data """
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment