diff --git a/CHANGELOG.md b/CHANGELOG.md
index 04a5171184bb35417a6eb66aa0739f474adf8b7c..0ca5f692e8c9ca1a89d2b48a23ba4ad017711234 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -29,6 +29,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - Proof-of-concept integration with Bloxberg.
 - Introduce a cfood that can create a Record structure based on the contents of a hdf5 file
   h5py is now an optional dependency
+- table importer implementations for csv and tsv
 - string-in-list check for table imports
 
 ### Changed ###
diff --git a/src/caosadvancedtools/table_importer.py b/src/caosadvancedtools/table_importer.py
index 2f65a4249ce50394eff574abb6bf716602ad4aea..cb61e8389de69a2d0d0527ad01cb8b9991b19ece 100755
--- a/src/caosadvancedtools/table_importer.py
+++ b/src/caosadvancedtools/table_importer.py
@@ -185,12 +185,7 @@ def string_in_list(val, options, ignore_case=True):
     return val
 
 
-class TSVImporter(object):
-    def __init__(self, converters, obligatory_columns=[], unique_columns=[]):
-        raise NotImplementedError()
-
-
-class XLSImporter(object):
+class TableImporter(object):
     def __init__(self, converters, obligatory_columns=None, unique_keys=None):
         """
         converters: dict with column names as keys and converter functions as
@@ -208,50 +203,14 @@
         """
         self.sup = SuppressKnown()
         self.required_columns = list(converters.keys())
-        self.obligatory_columns = [] if obligatory_columns is None else obligatory_columns
+        self.obligatory_columns = ([]
+                                   if obligatory_columns is None
+                                   else obligatory_columns)
         self.unique_keys = [] if unique_keys is None else unique_keys
         self.converters = converters
 
-    def read_xls(self, filename, **kwargs):
-        """
-        converts an xls file into a Pandas DataFrame.
-
-        The converters of the XLSImporter object are used.
-
-        Raises: DataInconsistencyError
-        """
-        try:
-            xls_file = pd.io.excel.ExcelFile(filename)
-        except (XLRDError, ValueError) as e:
-            logger.warning(
-                "Cannot read \n{}.\nError:{}".format(filename,
-                                                     str(e)),
-                extra={'identifier': str(filename),
-                       'category': "inconsistency"})
-            raise DataInconsistencyError(*e.args)
-
-        if len(xls_file.sheet_names) > 1:
-            # Multiple sheets is the default now. Only show in debug
-            logger.debug(
-                "Excel file {} contains multiple sheets. "
-                "All but the first are being ignored.".format(filename))
-
-        try:
-            df = xls_file.parse(converters=self.converters, **kwargs)
-        except Exception as e:
-            logger.warning(
-                "Cannot parse {}.".format(filename),
-                extra={'identifier': str(filename),
-                       'category': "inconsistency"})
-            raise DataInconsistencyError(*e.args)
-
-        self.check_columns(df, filename=filename)
-        df = self.check_missing(df, filename=filename)
-
-        if len(self.unique_keys) > 0:
-            df = self.check_unique(df, filename=filename)
-
-        return df
+    def read_file(self, filename, **kwargs):
+        raise NotImplementedError()
 
     def check_columns(self, df, filename=None):
         """
@@ -346,3 +305,70 @@
                 okay = False
 
         return df
+
+    def check_dataframe(self, df, filename):
+        self.check_columns(df, filename=filename)
+        df = self.check_missing(df, filename=filename)
+
+        if len(self.unique_keys) > 0:
+            df = self.check_unique(df, filename=filename)
+
+
+class XLSImporter(TableImporter):
+    def read_file(self, filename, **kwargs):
+        return self.read_xls(filename=filename, **kwargs)
+
+    def read_xls(self, filename, **kwargs):
+        """
+        converts an xls file into a Pandas DataFrame.
+
+        The converters of the XLSImporter object are used.
+
+        Raises: DataInconsistencyError
+        """
+        try:
+            xls_file = pd.io.excel.ExcelFile(filename)
+        except (XLRDError, ValueError) as e:
+            logger.warning(
+                "Cannot read \n{}.\nError:{}".format(filename,
+                                                     str(e)),
+                extra={'identifier': str(filename),
+                       'category': "inconsistency"})
+            raise DataInconsistencyError(*e.args)
+
+        if len(xls_file.sheet_names) > 1:
+            # Multiple sheets is the default now. Only show in debug
+            logger.debug(
+                "Excel file {} contains multiple sheets. "
+                "All but the first are being ignored.".format(filename))
+
+        try:
+            df = xls_file.parse(converters=self.converters, **kwargs)
+        except Exception as e:
+            logger.warning(
+                "Cannot parse {}.".format(filename),
+                extra={'identifier': str(filename),
+                       'category': "inconsistency"})
+            raise DataInconsistencyError(*e.args)
+
+        self.check_dataframe(df, filename)
+
+        return df
+
+
+class CSVImporter(TableImporter):
+    def read_file(self, filename, sep=",", **kwargs):
+        df = pd.read_csv(filename, sep=sep, converters=self.converters,
+                         **kwargs)
+        self.check_dataframe(df, filename)
+
+        return df
+
+
+class TSVImporter(TableImporter):
+    def read_file(self, filename, **kwargs):
+        df = pd.read_csv(filename, sep="\t", converters=self.converters,
+                         **kwargs)
+        self.check_dataframe(df, filename)
+
+        return df
diff --git a/unittests/test_table_importer.py b/unittests/test_table_importer.py
index b71d2f5a8af863c8840f6d930dc4c58b2cbccb5d..b574c867881141928ac59c2b002fb7f185dac7bb 100644
--- a/unittests/test_table_importer.py
+++ b/unittests/test_table_importer.py
@@ -30,6 +30,9 @@ from caosadvancedtools.datainconsistency import DataInconsistencyError
 from caosadvancedtools.table_importer import (XLSImporter, assure_name_format,
                                               date_converter,
                                               datetime_converter,
+                                              TableImporter,
+                                              TSVImporter,
+                                              CSVImporter,
                                               incomplete_date_converter,
                                               win_path_converter,
                                               win_path_list_converter,
@@ -78,12 +81,12 @@ class ConverterTest(unittest.TestCase):
     @pytest.mark.xfail(reason="To be fixed, see Issue #34")
     def test_datetime(self):
         test_file = os.path.join(os.path.dirname(__file__), "date.xlsx")
-        self.importer = XLSImporter(converters={'d': datetime_converter,
-                                                }, obligatory_columns=['d'])
+        importer = XLSImporter(converters={'d': datetime_converter,
+                                           }, obligatory_columns=['d'])
         xls_file = pd.io.excel.ExcelFile(test_file)
         df = xls_file.parse()
 
-        df = self.importer.read_xls(test_file)
+        df = importer.read_xls(test_file)
         assert df.shape[0] == 2
         # TODO datatypes are different; fix it
         assert df.d.iloc[0] == datetime.datetime(1980, 12, 31, 13, 24, 23)
@@ -91,30 +94,30 @@
     def test_date_xlsx(self):
         """Test with .xlsx in order to check openpyxl engine."""
         test_file = os.path.join(os.path.dirname(__file__), "date.xlsx")
-        self.importer = XLSImporter(converters={'a': date_converter,
-                                                'b': date_converter,
-                                                'c': partial(date_converter,
-                                                             fmt="%d.%m.%y")
-                                                }, obligatory_columns=['a'])
+        importer = XLSImporter(converters={'a': date_converter,
+                                           'b': date_converter,
+                                           'c': partial(date_converter,
+                                                        fmt="%d.%m.%y")
+                                           }, obligatory_columns=['a'])
         xls_file = pd.io.excel.ExcelFile(test_file)
         df = xls_file.parse()
 
-        df = self.importer.read_xls(test_file)
+        df = importer.read_xls(test_file)
         assert df.shape[0] == 2
         assert df.a.iloc[0] == df.b.iloc[0] == df.c.iloc[0]
 
     def test_date_xls(self):
         """Test with .xls in order to check xlrd engine."""
         test_file = os.path.join(os.path.dirname(__file__), "date.xls")
-        self.importer = XLSImporter(converters={'a': date_converter,
-                                                'b': date_converter,
-                                                'c': partial(date_converter,
-                                                             fmt="%d.%m.%y")
-                                                }, obligatory_columns=['a'])
+        importer = XLSImporter(converters={'a': date_converter,
+                                           'b': date_converter,
+                                           'c': partial(date_converter,
+                                                        fmt="%d.%m.%y")
+                                           }, obligatory_columns=['a'])
         xls_file = pd.io.excel.ExcelFile(test_file)
         df = xls_file.parse()
 
-        df = self.importer.read_xls(test_file)
+        df = importer.read_xls(test_file)
         assert df.shape[0] == 2
         assert df.a.iloc[0] == df.b.iloc[0] == df.c.iloc[0]
 
@@ -137,9 +140,9 @@
                                             fmts={"%Y": "%Y"})
 
 
-class XLSImporterTest(unittest.TestCase):
+class TableImporterTest(unittest.TestCase):
     def setUp(self):
-        self.importer = XLSImporter(
+        self.importer_kwargs = dict(
             converters={'a': str, 'b': int, 'c': float, 'd': yes_no_converter},
             obligatory_columns=['a', 'b'], unique_keys=[('a', 'b')])
         self.valid_df = pd.DataFrame(
@@ -147,37 +150,64 @@
 
     def test_missing_col(self):
         df = pd.DataFrame(columns=['a', 'b'])
-        self.assertRaises(ValueError, self.importer.check_columns, df)
-        self.importer.check_columns(self.valid_df)
+        importer = TableImporter(**self.importer_kwargs)
+        self.assertRaises(ValueError, importer.check_columns, df)
+        importer.check_columns(self.valid_df)
 
     def test_missing_val(self):
-        self.importer.check_missing(self.valid_df)
+        importer = TableImporter(**self.importer_kwargs)
+        importer.check_missing(self.valid_df)
         df = pd.DataFrame([[None, np.nan, 2.0, 'yes'],
                            [None, 1, 2.0, 'yes'],
                            ['a', np.nan, 2.0, 'yes'],
                            ['b', 5, 3.0, 'no']],
                           columns=['a', 'b', 'c', 'd'])
-        df_new = self.importer.check_missing(df)
+        df_new = importer.check_missing(df)
         self.assertEqual(df_new.shape[0], 1)
         self.assertEqual(df_new.shape[1], 4)
         self.assertEqual(df_new.iloc[0].b, 5)
 
+    def test_unique(self):
+        importer = TableImporter(**self.importer_kwargs)
+        importer.check_missing(self.valid_df)
+        df = pd.DataFrame([['b', 5, 3.0, 'no'], ['b', 5, 3.0, 'no']],
+                          columns=['a', 'b', 'c', 'd'])
+        df_new = importer.check_unique(df)
+        self.assertEqual(df_new.shape[0], 1)
+
+
+class XLSImporterTest(TableImporterTest):
     def test_full(self):
         """ test full run with example data """
         tmp = NamedTemporaryFile(delete=False, suffix=".xlsx")
         tmp.close()
         self.valid_df.to_excel(tmp.name)
-        self.importer.read_xls(tmp.name)
-
-    def test_unique(self):
-        self.importer.check_missing(self.valid_df)
-        df = pd.DataFrame([['b', 5, 3.0, 'no'], ['b', 5, 3.0, 'no']],
-                          columns=['a', 'b', 'c', 'd'])
-        df_new = self.importer.check_unique(df)
-        self.assertEqual(df_new.shape[0], 1)
+        importer = XLSImporter(**self.importer_kwargs)
+        importer.read_file(tmp.name)
 
     def test_raise(self):
+        importer = XLSImporter(**self.importer_kwargs)
         tmp = NamedTemporaryFile(delete=False, suffix=".lol")
         tmp.close()
-        self.assertRaises(DataInconsistencyError, self.importer.read_xls,
+        self.assertRaises(DataInconsistencyError, importer.read_xls,
                           tmp.name)
+
+
+class CSVImporterTest(TableImporterTest):
+    def test_full(self):
+        """ test full run with example data """
+        tmp = NamedTemporaryFile(delete=False, suffix=".csv")
+        tmp.close()
+        self.valid_df.to_csv(tmp.name)
+        importer = CSVImporter(**self.importer_kwargs)
+        importer.read_file(tmp.name)
+
+
+class TSVImporterTest(TableImporterTest):
+    def test_full(self):
+        """ test full run with example data """
+        tmp = NamedTemporaryFile(delete=False, suffix=".tsv")
+        tmp.close()
+        self.valid_df.to_csv(tmp.name, sep="\t")
+        importer = TSVImporter(**self.importer_kwargs)
+        importer.read_file(tmp.name)
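
For orientation, a minimal usage sketch of the importer classes introduced by this patch follows. The column names, converter choices and file names are hypothetical; only the constructor arguments (converters, obligatory_columns, unique_keys), the read_file() method and the date_converter helper are taken from the code above.

    from caosadvancedtools.table_importer import CSVImporter, TSVImporter, date_converter

    # Hypothetical table layout: every row needs a 'date' and a 'count',
    # and each (date, count) combination may occur only once.
    importer = CSVImporter(converters={'date': date_converter,
                                       'count': int},
                           obligatory_columns=['date', 'count'],
                           unique_keys=[('date', 'count')])

    # read_file() parses the table with the configured converters and runs
    # the column / missing-value / uniqueness checks via check_dataframe().
    df = importer.read_file("measurements.csv")   # hypothetical file name

    # TSVImporter and XLSImporter expose the same read_file() interface:
    df_tsv = TSVImporter(converters={'date': date_converter, 'count': int},
                         obligatory_columns=['date']).read_file("measurements.tsv")

Keeping the format-specific parsing in each read_file() implementation while TableImporter.check_dataframe() performs the shared validation is what lets the XLS, CSV and TSV importers reuse the same checks and the same test fixtures.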