Commit 3ddc7619 authored by Florian Spreckelsen

Merge branch 'f-tsv-importer' into 'dev'

ENH: add table importer implementations for csv

See merge request caosdb/caosdb-advanced-user-tools!87
parents 2658d443 ab1703a1
1 merge request: !22 Release 0.3
Pipeline #14743 passed
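
For orientation, a minimal usage sketch of the importers introduced here, assuming the constructor signature and read_file() method from the diff below; the column names, converter choices and file name are hypothetical:

    from caosadvancedtools.table_importer import CSVImporter

    # Converters map column names to functions that are applied to every
    # cell of that column while the table is parsed.
    importer = CSVImporter(
        converters={'name': str, 'count': int},   # hypothetical columns
        obligatory_columns=['name'],     # rows missing these values are dropped
        unique_keys=[('name', 'count')]) # duplicated key tuples are dropped

    # read_file() parses the table with pandas, applies the converters and
    # runs the shared column / missing-value / uniqueness checks.
    df = importer.read_file("example.csv")         # hypothetical file name

TSVImporter works the same way for tab-separated files, with the separator fixed to "\t".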
@@ -29,6 +29,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - Proof-of-concept integration with Bloxberg.
 - Introduce a cfood that can create a Record structure based on the contents of a hdf5 file;
   h5py is now an optional dependency
+- table importer implementations for csv and tsv
 - string-in-list check for table imports

 ### Changed ###
@@ -185,12 +185,7 @@ def string_in_list(val, options, ignore_case=True):
     return val


-class TSVImporter(object):
-    def __init__(self, converters, obligatory_columns=[], unique_columns=[]):
-        raise NotImplementedError()
-
-
-class XLSImporter(object):
+class TableImporter(object):
     def __init__(self, converters, obligatory_columns=None, unique_keys=None):
         """
         converters: dict with column names as keys and converter functions as
@@ -208,50 +203,14 @@ class XLSImporter(object):
         """
         self.sup = SuppressKnown()
         self.required_columns = list(converters.keys())
-        self.obligatory_columns = [] if obligatory_columns is None else obligatory_columns
+        self.obligatory_columns = ([]
+                                   if obligatory_columns is None
+                                   else obligatory_columns)
         self.unique_keys = [] if unique_keys is None else unique_keys
         self.converters = converters

-    def read_xls(self, filename, **kwargs):
-        """
-        converts an xls file into a Pandas DataFrame.
-
-        The converters of the XLSImporter object are used.
-
-        Raises: DataInconsistencyError
-        """
-        try:
-            xls_file = pd.io.excel.ExcelFile(filename)
-        except (XLRDError, ValueError) as e:
-            logger.warning(
-                "Cannot read \n{}.\nError:{}".format(filename,
-                                                     str(e)),
-                extra={'identifier': str(filename),
-                       'category': "inconsistency"})
-            raise DataInconsistencyError(*e.args)
-
-        if len(xls_file.sheet_names) > 1:
-            # Multiple sheets is the default now. Only show in debug
-            logger.debug(
-                "Excel file {} contains multiple sheets. "
-                "All but the first are being ignored.".format(filename))
-
-        try:
-            df = xls_file.parse(converters=self.converters, **kwargs)
-        except Exception as e:
-            logger.warning(
-                "Cannot parse {}.".format(filename),
-                extra={'identifier': str(filename),
-                       'category': "inconsistency"})
-            raise DataInconsistencyError(*e.args)
-
-        self.check_columns(df, filename=filename)
-        df = self.check_missing(df, filename=filename)
-
-        if len(self.unique_keys) > 0:
-            df = self.check_unique(df, filename=filename)
-
-        return df
+    def read_file(self, filename, **kwargs):
+        raise NotImplementedError()

     def check_columns(self, df, filename=None):
         """
@@ -346,3 +305,70 @@ class XLSImporter(object):
                 okay = False

         return df
+
+    def check_dataframe(self, df, filename):
+        self.check_columns(df, filename=filename)
+        df = self.check_missing(df, filename=filename)
+
+        if len(self.unique_keys) > 0:
+            df = self.check_unique(df, filename=filename)
+
+
+class XLSImporter(TableImporter):
+    def read_file(self, filename, **kwargs):
+        return self.read_xls(filename=filename, **kwargs)
+
+    def read_xls(self, filename, **kwargs):
+        """
+        converts an xls file into a Pandas DataFrame.
+
+        The converters of the XLSImporter object are used.
+
+        Raises: DataInconsistencyError
+        """
+        try:
+            xls_file = pd.io.excel.ExcelFile(filename)
+        except (XLRDError, ValueError) as e:
+            logger.warning(
+                "Cannot read \n{}.\nError:{}".format(filename,
+                                                     str(e)),
+                extra={'identifier': str(filename),
+                       'category': "inconsistency"})
+            raise DataInconsistencyError(*e.args)
+
+        if len(xls_file.sheet_names) > 1:
+            # Multiple sheets is the default now. Only show in debug
+            logger.debug(
+                "Excel file {} contains multiple sheets. "
+                "All but the first are being ignored.".format(filename))
+
+        try:
+            df = xls_file.parse(converters=self.converters, **kwargs)
+        except Exception as e:
+            logger.warning(
+                "Cannot parse {}.".format(filename),
+                extra={'identifier': str(filename),
+                       'category': "inconsistency"})
+            raise DataInconsistencyError(*e.args)
+
+        self.check_dataframe(df, filename)
+
+        return df
+
+
+class CSVImporter(TableImporter):
+    def read_file(self, filename, sep=",", **kwargs):
+        df = pd.read_csv(filename, sep=sep, converters=self.converters,
+                         **kwargs)
+        self.check_dataframe(df, filename)
+
+        return df
+
+
+class TSVImporter(TableImporter):
+    def read_file(self, filename, **kwargs):
+        df = pd.read_csv(filename, sep="\t", converters=self.converters,
+                         **kwargs)
+        self.check_dataframe(df, filename)
+
+        return df
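
The new importers follow a small template pattern: TableImporter holds the converters and the consistency checks, and each subclass only implements read_file(). As a sketch (not part of this merge request), a further format could be supported with one more subclass; SemicolonImporter and its separator below are hypothetical:

    import pandas as pd

    class SemicolonImporter(TableImporter):
        # Hypothetical importer for semicolon-separated files.
        def read_file(self, filename, **kwargs):
            # Parse the table with pandas, applying the per-column converters.
            df = pd.read_csv(filename, sep=";", converters=self.converters,
                             **kwargs)
            # Reuse the shared column / missing-value / uniqueness checks.
            self.check_dataframe(df, filename)

            return df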
@@ -30,6 +30,9 @@ from caosadvancedtools.datainconsistency import DataInconsistencyError
 from caosadvancedtools.table_importer import (XLSImporter, assure_name_format,
                                               date_converter,
                                               datetime_converter,
+                                              TableImporter,
+                                              TSVImporter,
+                                              CSVImporter,
                                               incomplete_date_converter,
                                               win_path_converter,
                                               win_path_list_converter,
@@ -78,12 +81,12 @@ class ConverterTest(unittest.TestCase):
     @pytest.mark.xfail(reason="To be fixed, see Issue #34")
     def test_datetime(self):
         test_file = os.path.join(os.path.dirname(__file__), "date.xlsx")
-        self.importer = XLSImporter(converters={'d': datetime_converter,
+        importer = XLSImporter(converters={'d': datetime_converter,
                                            }, obligatory_columns=['d'])

         xls_file = pd.io.excel.ExcelFile(test_file)
         df = xls_file.parse()
-        df = self.importer.read_xls(test_file)
+        df = importer.read_xls(test_file)
         assert df.shape[0] == 2
         # TODO datatypes are different; fix it
         assert df.d.iloc[0] == datetime.datetime(1980, 12, 31, 13, 24, 23)
@@ -91,7 +94,7 @@ class ConverterTest(unittest.TestCase):
     def test_date_xlsx(self):
         """Test with .xlsx in order to check openpyxl engine."""
         test_file = os.path.join(os.path.dirname(__file__), "date.xlsx")
-        self.importer = XLSImporter(converters={'a': date_converter,
+        importer = XLSImporter(converters={'a': date_converter,
                                            'b': date_converter,
                                            'c': partial(date_converter,
                                                         fmt="%d.%m.%y")
@@ -99,14 +102,14 @@ class ConverterTest(unittest.TestCase):
         xls_file = pd.io.excel.ExcelFile(test_file)
         df = xls_file.parse()
-        df = self.importer.read_xls(test_file)
+        df = importer.read_xls(test_file)
         assert df.shape[0] == 2
         assert df.a.iloc[0] == df.b.iloc[0] == df.c.iloc[0]

     def test_date_xls(self):
         """Test with .xls in order to check xlrd engine."""
         test_file = os.path.join(os.path.dirname(__file__), "date.xls")
-        self.importer = XLSImporter(converters={'a': date_converter,
+        importer = XLSImporter(converters={'a': date_converter,
                                            'b': date_converter,
                                            'c': partial(date_converter,
                                                         fmt="%d.%m.%y")
@@ -114,7 +117,7 @@ class ConverterTest(unittest.TestCase):
         xls_file = pd.io.excel.ExcelFile(test_file)
         df = xls_file.parse()
-        df = self.importer.read_xls(test_file)
+        df = importer.read_xls(test_file)
         assert df.shape[0] == 2
         assert df.a.iloc[0] == df.b.iloc[0] == df.c.iloc[0]
@@ -137,9 +140,9 @@ class ConverterTest(unittest.TestCase):
                                            fmts={"%Y": "%Y"})


-class XLSImporterTest(unittest.TestCase):
+class TableImporterTest(unittest.TestCase):
     def setUp(self):
-        self.importer = XLSImporter(
+        self.importer_kwargs = dict(
             converters={'a': str, 'b': int, 'c': float, 'd': yes_no_converter},
             obligatory_columns=['a', 'b'], unique_keys=[('a', 'b')])
         self.valid_df = pd.DataFrame(
@@ -147,37 +150,64 @@ class XLSImporterTest(unittest.TestCase):
     def test_missing_col(self):
         df = pd.DataFrame(columns=['a', 'b'])
-        self.assertRaises(ValueError, self.importer.check_columns, df)
-        self.importer.check_columns(self.valid_df)
+        importer = TableImporter(**self.importer_kwargs)
+        self.assertRaises(ValueError, importer.check_columns, df)
+        importer.check_columns(self.valid_df)

     def test_missing_val(self):
-        self.importer.check_missing(self.valid_df)
+        importer = TableImporter(**self.importer_kwargs)
+        importer.check_missing(self.valid_df)
         df = pd.DataFrame([[None, np.nan, 2.0, 'yes'],
                            [None, 1, 2.0, 'yes'],
                            ['a', np.nan, 2.0, 'yes'],
                            ['b', 5, 3.0, 'no']],
                           columns=['a', 'b', 'c', 'd'])
-        df_new = self.importer.check_missing(df)
+        df_new = importer.check_missing(df)
         self.assertEqual(df_new.shape[0], 1)
         self.assertEqual(df_new.shape[1], 4)
         self.assertEqual(df_new.iloc[0].b, 5)

+    def test_unique(self):
+        importer = TableImporter(**self.importer_kwargs)
+        importer.check_missing(self.valid_df)
+        df = pd.DataFrame([['b', 5, 3.0, 'no'], ['b', 5, 3.0, 'no']],
+                          columns=['a', 'b', 'c', 'd'])
+        df_new = importer.check_unique(df)
+        self.assertEqual(df_new.shape[0], 1)
+
+
+class XLSImporterTest(TableImporterTest):
     def test_full(self):
         """ test full run with example data """
         tmp = NamedTemporaryFile(delete=False, suffix=".xlsx")
         tmp.close()
         self.valid_df.to_excel(tmp.name)
-        self.importer.read_xls(tmp.name)
-
-    def test_unique(self):
-        self.importer.check_missing(self.valid_df)
-        df = pd.DataFrame([['b', 5, 3.0, 'no'], ['b', 5, 3.0, 'no']],
-                          columns=['a', 'b', 'c', 'd'])
-        df_new = self.importer.check_unique(df)
-        self.assertEqual(df_new.shape[0], 1)
+        importer = XLSImporter(**self.importer_kwargs)
+        importer.read_file(tmp.name)

     def test_raise(self):
+        importer = XLSImporter(**self.importer_kwargs)
         tmp = NamedTemporaryFile(delete=False, suffix=".lol")
         tmp.close()
-        self.assertRaises(DataInconsistencyError, self.importer.read_xls,
+        self.assertRaises(DataInconsistencyError, importer.read_xls,
                           tmp.name)
+
+
+class CSVImporterTest(TableImporterTest):
+    def test_full(self):
+        """ test full run with example data """
+        tmp = NamedTemporaryFile(delete=False, suffix=".csv")
+        tmp.close()
+        self.valid_df.to_csv(tmp.name)
+        importer = CSVImporter(**self.importer_kwargs)
+        importer.read_file(tmp.name)
+
+
+class TSVImporterTest(TableImporterTest):
+    def test_full(self):
+        """ test full run with example data """
+        tmp = NamedTemporaryFile(delete=False, suffix=".tsv")
+        tmp.close()
+        self.valid_df.to_csv(tmp.name, sep="\t")
+        importer = TSVImporter(**self.importer_kwargs)
+        importer.read_file(tmp.name)