diff --git a/CHANGELOG.md b/CHANGELOG.md
index ebd856357bfb25d4e03ee3550458e590f4a38610..52b2ce4ed00dc97ff7ae9ed9258f317505bed8e4 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,10 +7,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

 ## [Unreleased] ##

 ### Added ###
-- TableImporter now accepts a `allow_missing_values_in` argument which allows to have obligatory
-  columns with missing values
+- TableImporter now accepts an `existing_columns` argument which requires certain columns to
+  exist even if they contain missing values

 ### Changed ###
+- The `converters` and `datatypes` arguments of TableImporter may now contain keys for nonexistent columns

 ### Deprecated ###
diff --git a/src/caosadvancedtools/table_importer.py b/src/caosadvancedtools/table_importer.py
index 1931c0477fc89584aff8f24fb873d0bf9e98553d..4324bc8b1246a7d4aeaf270b6b4ba09a49602338 100755
--- a/src/caosadvancedtools/table_importer.py
+++ b/src/caosadvancedtools/table_importer.py
@@ -210,7 +210,7 @@ class TableImporter():
     """

     def __init__(self, converters, obligatory_columns=None, unique_keys=None,
-                 datatypes=None, allow_missing_values_in=None):
+                 datatypes=None, existing_columns=None):
         """
         Parameters
         ----------
@@ -232,28 +232,31 @@ class TableImporter():
           checked whether they have the provided datatype. This dict also
           defines what columns are required to exist throught the existing
           keys.

-        allow_missing_values_in : list, optional
-          List of (obligatory) column names which may have missing (NULL) values
-
+        existing_columns : list, optional
+          List of column names that must exist but may have missing (NULL) values
         """
         if converters is None:
             converters = {}
+        self.converters = converters

-        if allow_missing_values_in is None:
-            allow_missing_values_in = []
+        if obligatory_columns is None:
+            obligatory_columns = []
+        self.obligatory_columns = obligatory_columns
+
+        if unique_keys is None:
+            unique_keys = []
+        self.unique_keys = unique_keys

         if datatypes is None:
             datatypes = {}
+        self.datatypes = datatypes
+
+        if existing_columns is None:
+            existing_columns = []
+        self.existing_columns = existing_columns

         self.sup = SuppressKnown()
-        self.allow_missing_values_in = allow_missing_values_in
-        self.obligatory_columns = ([]
-                                   if obligatory_columns is None
-                                   else obligatory_columns)
-        self.unique_keys = [] if unique_keys is None else unique_keys
-        self.converters = converters
-        self.datatypes = datatypes

     def read_file(self, filename, **kwargs):
         raise NotImplementedError()
@@ -269,7 +272,7 @@ class TableImporter():

         """

-        for col in self.obligatory_columns+list(self.converters.keys())+list(self.datatypes.keys()):
+        for col in self.obligatory_columns+self.existing_columns:
             if col not in df.columns:
                 errmsg = "Column '{}' missing in ".format(col)
                 errmsg += ("\n{}.\n".format(filename) if filename
@@ -329,6 +332,8 @@ class TableImporter():
         """

         for key, datatype in self.datatypes.items():
+            if key not in df.columns:
+                continue
             # Check for castable numeric types first: We unconditionally cast int to the default
             # float, because CaosDB does not have different sizes anyway.
             col_dtype = df.dtypes[key]
@@ -369,8 +374,7 @@ class TableImporter():

         for index, row in df.iterrows():
             # if none of the relevant information is given, skip
-            if np.array([pd.isnull(row.loc[key]) for key in self.obligatory_columns
-                         if key not in self.allow_missing_values_in]).all():
+            if np.array([pd.isnull(row.loc[key]) for key in self.obligatory_columns]).all():
                 df = df.drop(index)


@@ -454,7 +458,10 @@ class XLSImporter(TableImporter):
                 "All but the first are being ignored.".format(filename))

         try:
-            df = xls_file.parse(converters=self.converters, **kwargs)
+            tmpdf = xls_file.parse(**kwargs)
+            applicable_converters = {k: v for k, v in self.converters.items()
+                                     if k in tmpdf.columns}
+            df = xls_file.parse(converters=applicable_converters, **kwargs)
         except Exception as e:
             logger.warning(
                 "Cannot parse {}.\n{}".format(filename, e),
@@ -470,7 +477,11 @@ class XLSImporter(TableImporter):
 class CSVImporter(TableImporter):
     def read_file(self, filename, sep=",", **kwargs):
         try:
-            df = pd.read_csv(filename, sep=sep, converters=self.converters,
+            tmpdf = pd.read_csv(filename, sep=sep, converters=self.converters,
+                                **kwargs)
+            applicable_converters = {k: v for k, v in self.converters.items()
+                                     if k in tmpdf.columns}
+            df = pd.read_csv(filename, sep=sep, converters=applicable_converters,
                             **kwargs)
         except ValueError as ve:
             logger.warning(
@@ -487,6 +498,10 @@ class CSVImporter(TableImporter):
 class TSVImporter(TableImporter):
     def read_file(self, filename, **kwargs):
         try:
-            df = pd.read_csv(filename, sep="\t", converters=self.converters,
+            tmpdf = pd.read_csv(filename, sep="\t", converters=self.converters,
+                                **kwargs)
+            applicable_converters = {k: v for k, v in self.converters.items()
+                                     if k in tmpdf.columns}
+            df = pd.read_csv(filename, sep="\t", converters=applicable_converters,
                              **kwargs)
         except ValueError as ve:
diff --git a/unittests/test_table_importer.py b/unittests/test_table_importer.py
index 404043588262780f945d2b0caf3ecf20b1b7a338..a2d4b7c2b62a4820cb45bbf086e600df241d944e 100644
--- a/unittests/test_table_importer.py
+++ b/unittests/test_table_importer.py
@@ -144,21 +144,21 @@ class ConverterTest(unittest.TestCase):
 class TableImporterTest(unittest.TestCase):
     def setUp(self):
         self.importer_kwargs = dict(
-            converters={'c': float, 'd': yes_no_converter},
-            datatypes={'a': str, 'b': int},
+            converters={'c': float, 'd': yes_no_converter, 'x': float},  # x does not exist
+            datatypes={'a': str, 'b': int, 'x': int},  # x does not exist
             obligatory_columns=['a', 'b'],
             unique_keys=[('a', 'b')],
-            allow_missing_values_in=['e'],
+            existing_columns=['e'],
         )
         self.valid_df = pd.DataFrame(
             [['a', 1, 2.0, 'yes', np.nan]], columns=['a', 'b', 'c', 'd', 'e'])

     def test_missing_col(self):
-        # check missing from converters
-        df = pd.DataFrame(columns=['a', 'b', 'c'])
+        # check missing obligatory column
+        df = pd.DataFrame(columns=['a', 'e'])
         importer = TableImporter(**self.importer_kwargs)
         self.assertRaises(ValueError, importer.check_columns, df)
-        # check missing from datatypes
-        df = pd.DataFrame(columns=['a', 'd', 'c'])
+        # check missing column from existing_columns
+        df = pd.DataFrame(columns=['a', 'b'])
         importer = TableImporter(**self.importer_kwargs)
         self.assertRaises(ValueError, importer.check_columns, df)
         # check valid
@@ -186,14 +186,6 @@ class TableImporterTest(unittest.TestCase):
                           columns=['a', 'b', 'c', 'd'])
         self.assertRaises(DataInconsistencyError, importer.check_datatype, df)

-    def test_allow_missing(self):
-        importer = TableImporter(**self.importer_kwargs)
-        importer.check_missing(self.valid_df)
-        df = pd.DataFrame([['b', np.nan, 3.0, 'no'], ['b', 5, 3.0, 'no']],
-                          columns=['a', 'b', 'c', 'd'])
-        df_new = importer.check_unique(df)
-        self.assertEqual(df_new.shape[0], 2)
-
     def test_unique(self):
         importer = TableImporter(**self.importer_kwargs)
         importer.check_missing(self.valid_df)
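
For reviewers, a minimal sketch of how the reworked keyword arguments behave after this patch. The column names and the DataFrame below are invented for illustration; the calls mirror those in the updated unit tests and are not part of the patch itself.

```python
import numpy as np
import pandas as pd

from caosadvancedtools.table_importer import TableImporter

# "x" does not exist in the table below; with this change its entries in
# `converters` and `datatypes` no longer make check_columns fail, and
# check_datatype simply skips them.
importer = TableImporter(
    converters={"c": float, "x": float},
    datatypes={"b": int, "x": int},
    obligatory_columns=["a", "b"],   # must exist and carry values
    existing_columns=["e"],          # must exist, but cells may be NULL
)

df = pd.DataFrame([["a", 1, 2.0, np.nan]], columns=["a", "b", "c", "e"])
importer.check_columns(df)   # passes: "a", "b" (obligatory) and "e" (existing) are present

# A table without column "e" now fails the column check, even though cells of
# "e" may be empty when the column is present:
# importer.check_columns(pd.DataFrame(columns=["a", "b"]))   # raises ValueError
```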