diff --git a/CHANGELOG.md b/CHANGELOG.md
index 7603e6a2a3c0188ef830908b45f606daa98e4c00..2317fa8c76086e9c5a9079daa5cbf31d74f4d088 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,8 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 ## [Unreleased] ##
 
 ### Added ###
+- `TableImporter` now accepts an `existing_columns` argument which requires that certain columns exist
 
 ### Changed ###
+- The `converters` and `datatypes` arguments of `TableImporter` may now contain keys for columns that do not exist in the table
 
 ### Deprecated ###
 
diff --git a/src/caosadvancedtools/table_importer.py b/src/caosadvancedtools/table_importer.py
index 48cfb663a4d6749d2bdd00c5e156072b6b295bd4..8f793584051386796bce18bdbaded6c7e34c06ca 100755
--- a/src/caosadvancedtools/table_importer.py
+++ b/src/caosadvancedtools/table_importer.py
@@ -210,7 +210,7 @@ class TableImporter():
     """
 
     def __init__(self, converters, obligatory_columns=None, unique_keys=None,
-                 datatypes=None):
+                 datatypes=None, existing_columns=None):
         """
         Parameters
         ----------
@@ -221,7 +221,7 @@ class TableImporter():
          value check is not necessary.
 
        obligatory_columns : list, optional
-          List of column names, each listed column must not have missing values.
+          List of column names that (if they exist) must not have missing values.
 
        unique_keys : list, optional
          List of column names that in combination must be unique: each row has a unique
@@ -232,22 +232,31 @@ class TableImporter():
          checked whether they have the provided datatype. This dict also defines what columns are
          required to exist throught the existing keys.
 
+       existing_columns : list, optional
+          List of column names that must exist but may have missing (NULL) values.
        """
 
        if converters is None:
            converters = {}
+        self.converters = converters
+
+        if obligatory_columns is None:
+            obligatory_columns = []
+        self.obligatory_columns = obligatory_columns
+
+        if unique_keys is None:
+            unique_keys = []
+        self.unique_keys = unique_keys
 
        if datatypes is None:
            datatypes = {}
+        self.datatypes = datatypes
+
+        if existing_columns is None:
+            existing_columns = []
+        self.existing_columns = existing_columns
 
        self.sup = SuppressKnown()
-        self.required_columns = list(converters.keys())+list(datatypes.keys())
-        self.obligatory_columns = ([]
-                                   if obligatory_columns is None
-                                   else obligatory_columns)
-        self.unique_keys = [] if unique_keys is None else unique_keys
-        self.converters = converters
-        self.datatypes = datatypes
 
     def read_file(self, filename, **kwargs):
        raise NotImplementedError()
@@ -263,7 +272,7 @@
 
        """
 
-        for col in self.required_columns:
+        for col in self.existing_columns:
            if col not in df.columns:
                errmsg = "Column '{}' missing in ".format(col)
                errmsg += ("\n{}.\n".format(filename) if filename
@@ -323,6 +332,8 @@ class TableImporter():
        """
 
        for key, datatype in self.datatypes.items():
+            if key not in df.columns:
+                continue
            # Check for castable numeric types first: We unconditionally cast int to the default
            # float, because CaosDB does not have different sizes anyway.
            col_dtype = df.dtypes[key]
@@ -333,8 +344,7 @@
                df[key] = df[key].astype(datatype)
 
            # Now check each element
-            for idx, val in df.loc[
-                    pd.notnull(df.loc[:, key]), key].items():
+            for idx, val in df.loc[pd.notnull(df.loc[:, key]), key].items():
 
                if not isinstance(val, datatype):
                    msg = (
@@ -363,22 +373,20 @@
 
        for index, row in df.iterrows():
            # if none of the relevant information is given, skip
-
-            if np.array([pd.isnull(row.loc[key]) for key in
-                         self.obligatory_columns]).all():
-
+            if pd.isnull(row.loc[[key for key in self.obligatory_columns if key in df.columns]]).all():
                df = df.drop(index)
 
                continue
 
            # if any of the relevant information is missing, report it
-
            i = 0
            okay = True
 
            while okay and i < len(self.obligatory_columns):
                key = self.obligatory_columns[i]
                i += 1
+                if key not in df.columns:
+                    continue
 
                if pd.isnull(row.loc[key]):
                    errmsg = (
@@ -449,7 +457,10 @@
                "All but the first are being ignored.".format(filename))
 
        try:
-            df = xls_file.parse(converters=self.converters, **kwargs)
+            tmpdf = xls_file.parse(**kwargs)
+            applicable_converters = {k: v for k, v in self.converters.items()
+                                     if k in tmpdf.columns}
+            df = xls_file.parse(converters=applicable_converters, **kwargs)
        except Exception as e:
            logger.warning(
                "Cannot parse {}.\n{}".format(filename, e),
@@ -465,7 +476,11 @@
 class CSVImporter(TableImporter):
     def read_file(self, filename, sep=",", **kwargs):
        try:
-            df = pd.read_csv(filename, sep=sep, converters=self.converters,
+            tmpdf = pd.read_csv(filename, sep=sep, converters=self.converters,
+                                **kwargs)
+            applicable_converters = {k: v for k, v in self.converters.items()
+                                     if k in tmpdf.columns}
+            df = pd.read_csv(filename, sep=sep, converters=applicable_converters,
                             **kwargs)
        except ValueError as ve:
            logger.warning(
@@ -482,6 +497,10 @@ class CSVImporter(TableImporter):
 class TSVImporter(TableImporter):
     def read_file(self, filename, **kwargs):
        try:
-            df = pd.read_csv(filename, sep="\t", converters=self.converters,
+            tmpdf = pd.read_csv(filename, sep="\t", converters=self.converters,
+                                **kwargs)
+            applicable_converters = {k: v for k, v in self.converters.items()
+                                     if k in tmpdf.columns}
+            df = pd.read_csv(filename, sep="\t", converters=applicable_converters,
                             **kwargs)
        except ValueError as ve:
diff --git a/unittests/test_table_importer.py b/unittests/test_table_importer.py
index 70f0f87f8706d72c386b18f54b7a9a10908eb477..0b3f0d7c7fc81b2a9d64e24fb2262c686ea669da 100644
--- a/unittests/test_table_importer.py
+++ b/unittests/test_table_importer.py
@@ -41,6 +41,16 @@ from caosadvancedtools.table_importer import (CSVImporter, TableImporter,
 from test_utils import BaseMockUpTest
 
+# Keyword arguments and a valid DataFrame for testing the TableImporter
+IMPORTER_KWARGS = dict(
+    converters={'c': float, 'd': yes_no_converter, 'x': float},  # column 'x' does not exist
+    datatypes={'a': str, 'b': int, 'x': int},  # column 'x' does not exist
+    obligatory_columns=['a', 'b'], unique_keys=[('a', 'b')],
+    existing_columns=['e'],
+)
+VALID_DF = pd.DataFrame(
+    [['a', 1, 2.0, 'yes', np.nan]], columns=['a', 'b', 'c', 'd', 'e'])
+
 
 class ConverterTest(unittest.TestCase):
     def test_yes_no(self):
@@ -143,22 +153,16 @@ class ConverterTest(unittest.TestCase):
 
 
 class TableImporterTest(unittest.TestCase):
     def setUp(self):
-        self.importer_kwargs = dict(
-            converters={'c': float, 'd': yes_no_converter},
-            datatypes={'a': str, 'b': int},
-            obligatory_columns=['a', 'b'], unique_keys=[('a', 'b')])
-        self.valid_df = pd.DataFrame(
-            [['a', 1, 2.0, 'yes']], columns=['a', 'b', 'c', 'd'])
+        self.importer_kwargs = IMPORTER_KWARGS
+        self.valid_df = VALID_DF
 
     def test_missing_col(self):
-        # check missing from converters
-        df = pd.DataFrame(columns=['a', 'b', 'c'])
-        importer = TableImporter(**self.importer_kwargs)
-        self.assertRaises(ValueError, importer.check_columns, df)
-        # check missing from datatypes
-        df = pd.DataFrame(columns=['a', 'd', 'c'])
+        # check missing from existing_columns
+        df = pd.DataFrame(columns=['a', 'b'])
        importer = TableImporter(**self.importer_kwargs)
-        self.assertRaises(ValueError, importer.check_columns, df)
+        with pytest.raises(DataInconsistencyError) as die:
+            importer.check_columns(df)
+        assert "Column 'e' missing" in str(die.value)
        # check valid
        importer.check_columns(self.valid_df)
@@ -193,6 +197,35 @@ class TableImporterTest(unittest.TestCase):
        self.assertEqual(df_new.shape[0], 1)
+
+def test_check_dataframe_existing_obligatory_columns(caplog):
+    """Needs the caplog fixture, so it lives outside the TestCase class above."""
+    # stricter test case: column 'a' must also exist and have a value
+    strict_kwargs = IMPORTER_KWARGS.copy()
+    strict_kwargs["existing_columns"] = strict_kwargs["existing_columns"] + ['a']
+
+    importer = TableImporter(**strict_kwargs)
+
+    # the valid df is still valid, since 'a' has a value
+    importer.check_dataframe(VALID_DF)
+
+    # Now 'a' does not have a value
+    df_missing_a = pd.DataFrame(
+        [[np.nan, 1, 2.0, 'yes', 'e']], columns=['a', 'b', 'c', 'd', 'e'])
+
+    new_df = importer.check_dataframe(df_missing_a)
+    # The row is dropped and a warning appears in the log:
+    assert new_df.shape[0] == 0
+    assert "Required information is missing (a) in 1. row" in caplog.text
+
+    df_missing_c = pd.DataFrame(
+        [['a', 1, 'yes', np.nan]], columns=['a', 'b', 'd', 'e'])
+    new_df = importer.check_dataframe(df_missing_c)
+    assert new_df.shape[0] == 1
+    assert new_df.shape[1] == 4
+
+    caplog.clear()
+
 
 class XLSImporterTest(TableImporterTest):
     def test_full(self):
        """ test full run with example data """
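
Usage note (not part of the patch): the following is a minimal sketch of the new behaviour, assuming only the `TableImporter` API shown in the diff above (`check_columns`, `check_dataframe`). The column names and values are invented for illustration.

import numpy as np
import pandas as pd

from caosadvancedtools.table_importer import TableImporter

importer = TableImporter(
    converters={"length": float, "width": float},  # "width" may be absent from the table
    datatypes={"name": str, "width": int},         # keys for absent columns are now skipped
    obligatory_columns=["name"],                   # must not contain NULLs if present
    existing_columns=["name", "comment"],          # must exist, but NULLs are allowed
)

df = pd.DataFrame([["sample1", 1.2, np.nan]],
                  columns=["name", "length", "comment"])

importer.check_columns(df)              # passes: "name" and "comment" both exist
checked = importer.check_dataframe(df)  # rows lacking obligatory values would be dropped
assert checked.shape == (1, 3)          # the single complete row is kept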