Commit 3ddc7619 authored by Florian Spreckelsen

Merge branch 'f-tsv-importer' into 'dev'

ENH: add table importer implementations for csv

See merge request caosdb/caosdb-advanced-user-tools!87
parents 2658d443 ab1703a1
1 merge request: !22 Release 0.3
Pipeline #14743 passed
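
For orientation, a minimal usage sketch of the importers introduced here, assuming the constructor signature and read_file() method from the diff below; the column names, converter choices and file name are hypothetical:

    from caosadvancedtools.table_importer import CSVImporter

    # Converters map column names to functions that are applied to every
    # cell of that column while the table is parsed.
    importer = CSVImporter(
        converters={'name': str, 'count': int},   # hypothetical columns
        obligatory_columns=['name'],     # rows missing these values are dropped
        unique_keys=[('name', 'count')]) # duplicated key tuples are dropped

    # read_file() parses the table with pandas, applies the converters and
    # runs the shared column / missing-value / uniqueness checks.
    df = importer.read_file("example.csv")         # hypothetical file name

TSVImporter works the same way for tab-separated files, with the separator fixed to "\t".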
@@ -29,6 +29,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - Proof-of-concept integration with Bloxberg.
 - Introduce a cfood that can create a Record structure based on the contents of a hdf5 file;
   h5py is now an optional dependency
+- table importer implementations for csv and tsv
 - string-in-list check for table imports

 ### Changed ###
@@ -185,12 +185,7 @@ def string_in_list(val, options, ignore_case=True):
     return val


-class TSVImporter(object):
-    def __init__(self, converters, obligatory_columns=[], unique_columns=[]):
-        raise NotImplementedError()
-
-
-class XLSImporter(object):
+class TableImporter(object):
     def __init__(self, converters, obligatory_columns=None, unique_keys=None):
         """
         converters: dict with column names as keys and converter functions as
@@ -208,50 +203,14 @@ class XLSImporter(object):
         """
         self.sup = SuppressKnown()
         self.required_columns = list(converters.keys())
-        self.obligatory_columns = [] if obligatory_columns is None else obligatory_columns
+        self.obligatory_columns = ([]
+                                   if obligatory_columns is None
+                                   else obligatory_columns)
         self.unique_keys = [] if unique_keys is None else unique_keys
         self.converters = converters

-    def read_xls(self, filename, **kwargs):
-        """
-        converts an xls file into a Pandas DataFrame.
-
-        The converters of the XLSImporter object are used.
-
-        Raises: DataInconsistencyError
-        """
-        try:
-            xls_file = pd.io.excel.ExcelFile(filename)
-        except (XLRDError, ValueError) as e:
-            logger.warning(
-                "Cannot read \n{}.\nError:{}".format(filename,
-                                                     str(e)),
-                extra={'identifier': str(filename),
-                       'category': "inconsistency"})
-            raise DataInconsistencyError(*e.args)
-
-        if len(xls_file.sheet_names) > 1:
-            # Multiple sheets is the default now. Only show in debug
-            logger.debug(
-                "Excel file {} contains multiple sheets. "
-                "All but the first are being ignored.".format(filename))
-
-        try:
-            df = xls_file.parse(converters=self.converters, **kwargs)
-        except Exception as e:
-            logger.warning(
-                "Cannot parse {}.".format(filename),
-                extra={'identifier': str(filename),
-                       'category': "inconsistency"})
-            raise DataInconsistencyError(*e.args)
-
-        self.check_columns(df, filename=filename)
-        df = self.check_missing(df, filename=filename)
-
-        if len(self.unique_keys) > 0:
-            df = self.check_unique(df, filename=filename)
-
-        return df
+    def read_file(self, filename, **kwargs):
+        raise NotImplementedError()

     def check_columns(self, df, filename=None):
         """
@@ -346,3 +305,70 @@ class XLSImporter(object):
                 okay = False

         return df
+
+    def check_dataframe(self, df, filename):
+        self.check_columns(df, filename=filename)
+        df = self.check_missing(df, filename=filename)
+
+        if len(self.unique_keys) > 0:
+            df = self.check_unique(df, filename=filename)
+
+
+class XLSImporter(TableImporter):
+    def read_file(self, filename, **kwargs):
+        return self.read_xls(filename=filename, **kwargs)
+
+    def read_xls(self, filename, **kwargs):
+        """
+        converts an xls file into a Pandas DataFrame.
+
+        The converters of the XLSImporter object are used.
+
+        Raises: DataInconsistencyError
+        """
+        try:
+            xls_file = pd.io.excel.ExcelFile(filename)
+        except (XLRDError, ValueError) as e:
+            logger.warning(
+                "Cannot read \n{}.\nError:{}".format(filename,
+                                                     str(e)),
+                extra={'identifier': str(filename),
+                       'category': "inconsistency"})
+            raise DataInconsistencyError(*e.args)
+
+        if len(xls_file.sheet_names) > 1:
+            # Multiple sheets is the default now. Only show in debug
+            logger.debug(
+                "Excel file {} contains multiple sheets. "
+                "All but the first are being ignored.".format(filename))
+
+        try:
+            df = xls_file.parse(converters=self.converters, **kwargs)
+        except Exception as e:
+            logger.warning(
+                "Cannot parse {}.".format(filename),
+                extra={'identifier': str(filename),
+                       'category': "inconsistency"})
+            raise DataInconsistencyError(*e.args)
+
+        self.check_dataframe(df, filename)
+
+        return df
+
+
+class CSVImporter(TableImporter):
+    def read_file(self, filename, sep=",", **kwargs):
+        df = pd.read_csv(filename, sep=sep, converters=self.converters,
+                         **kwargs)
+        self.check_dataframe(df, filename)
+
+        return df
+
+
+class TSVImporter(TableImporter):
+    def read_file(self, filename, **kwargs):
+        df = pd.read_csv(filename, sep="\t", converters=self.converters,
+                         **kwargs)
+        self.check_dataframe(df, filename)
+
+        return df
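
The new importers follow a small template pattern: TableImporter holds the converters and the consistency checks, and each subclass only implements read_file(). As a sketch (not part of this merge request), a further format could be supported with one more subclass; SemicolonImporter and its separator below are hypothetical:

    import pandas as pd

    class SemicolonImporter(TableImporter):
        # Hypothetical importer for semicolon-separated files.
        def read_file(self, filename, **kwargs):
            # Parse the table with pandas, applying the per-column converters.
            df = pd.read_csv(filename, sep=";", converters=self.converters,
                             **kwargs)
            # Reuse the shared column / missing-value / uniqueness checks.
            self.check_dataframe(df, filename)

            return df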
@@ -30,6 +30,9 @@ from caosadvancedtools.datainconsistency import DataInconsistencyError
 from caosadvancedtools.table_importer import (XLSImporter, assure_name_format,
                                               date_converter,
                                               datetime_converter,
+                                              TableImporter,
+                                              TSVImporter,
+                                              CSVImporter,
                                               incomplete_date_converter,
                                               win_path_converter,
                                               win_path_list_converter,
@@ -78,12 +81,12 @@ class ConverterTest(unittest.TestCase):
     @pytest.mark.xfail(reason="To be fixed, see Issue #34")
     def test_datetime(self):
         test_file = os.path.join(os.path.dirname(__file__), "date.xlsx")
-        self.importer = XLSImporter(converters={'d': datetime_converter,
+        importer = XLSImporter(converters={'d': datetime_converter,
                                            }, obligatory_columns=['d'])

         xls_file = pd.io.excel.ExcelFile(test_file)
         df = xls_file.parse()
-        df = self.importer.read_xls(test_file)
+        df = importer.read_xls(test_file)
         assert df.shape[0] == 2
         # TODO datatypes are different; fix it
         assert df.d.iloc[0] == datetime.datetime(1980, 12, 31, 13, 24, 23)
@@ -91,7 +94,7 @@ class ConverterTest(unittest.TestCase):
     def test_date_xlsx(self):
         """Test with .xlsx in order to check openpyxl engine."""
         test_file = os.path.join(os.path.dirname(__file__), "date.xlsx")
-        self.importer = XLSImporter(converters={'a': date_converter,
+        importer = XLSImporter(converters={'a': date_converter,
                                            'b': date_converter,
                                            'c': partial(date_converter,
                                                         fmt="%d.%m.%y")
@@ -99,14 +102,14 @@ class ConverterTest(unittest.TestCase):
         xls_file = pd.io.excel.ExcelFile(test_file)
         df = xls_file.parse()
-        df = self.importer.read_xls(test_file)
+        df = importer.read_xls(test_file)
         assert df.shape[0] == 2
         assert df.a.iloc[0] == df.b.iloc[0] == df.c.iloc[0]

     def test_date_xls(self):
         """Test with .xls in order to check xlrd engine."""
         test_file = os.path.join(os.path.dirname(__file__), "date.xls")
-        self.importer = XLSImporter(converters={'a': date_converter,
+        importer = XLSImporter(converters={'a': date_converter,
                                            'b': date_converter,
                                            'c': partial(date_converter,
                                                         fmt="%d.%m.%y")
@@ -114,7 +117,7 @@ class ConverterTest(unittest.TestCase):
         xls_file = pd.io.excel.ExcelFile(test_file)
         df = xls_file.parse()
-        df = self.importer.read_xls(test_file)
+        df = importer.read_xls(test_file)
         assert df.shape[0] == 2
         assert df.a.iloc[0] == df.b.iloc[0] == df.c.iloc[0]
@@ -137,9 +140,9 @@ class ConverterTest(unittest.TestCase):
                                            fmts={"%Y": "%Y"})


-class XLSImporterTest(unittest.TestCase):
+class TableImporterTest(unittest.TestCase):
     def setUp(self):
-        self.importer = XLSImporter(
+        self.importer_kwargs = dict(
             converters={'a': str, 'b': int, 'c': float, 'd': yes_no_converter},
             obligatory_columns=['a', 'b'], unique_keys=[('a', 'b')])
         self.valid_df = pd.DataFrame(
@@ -147,37 +150,64 @@ class XLSImporterTest(unittest.TestCase):
     def test_missing_col(self):
         df = pd.DataFrame(columns=['a', 'b'])
-        self.assertRaises(ValueError, self.importer.check_columns, df)
-        self.importer.check_columns(self.valid_df)
+        importer = TableImporter(**self.importer_kwargs)
+        self.assertRaises(ValueError, importer.check_columns, df)
+        importer.check_columns(self.valid_df)

     def test_missing_val(self):
-        self.importer.check_missing(self.valid_df)
+        importer = TableImporter(**self.importer_kwargs)
+        importer.check_missing(self.valid_df)
         df = pd.DataFrame([[None, np.nan, 2.0, 'yes'],
                            [None, 1, 2.0, 'yes'],
                            ['a', np.nan, 2.0, 'yes'],
                            ['b', 5, 3.0, 'no']],
                           columns=['a', 'b', 'c', 'd'])
-        df_new = self.importer.check_missing(df)
+        df_new = importer.check_missing(df)
         self.assertEqual(df_new.shape[0], 1)
         self.assertEqual(df_new.shape[1], 4)
         self.assertEqual(df_new.iloc[0].b, 5)

+    def test_unique(self):
+        importer = TableImporter(**self.importer_kwargs)
+        importer.check_missing(self.valid_df)
+        df = pd.DataFrame([['b', 5, 3.0, 'no'], ['b', 5, 3.0, 'no']],
+                          columns=['a', 'b', 'c', 'd'])
+        df_new = importer.check_unique(df)
+        self.assertEqual(df_new.shape[0], 1)
+
+
+class XLSImporterTest(TableImporterTest):
     def test_full(self):
         """ test full run with example data """
         tmp = NamedTemporaryFile(delete=False, suffix=".xlsx")
         tmp.close()
         self.valid_df.to_excel(tmp.name)
-        self.importer.read_xls(tmp.name)
-
-    def test_unique(self):
-        self.importer.check_missing(self.valid_df)
-        df = pd.DataFrame([['b', 5, 3.0, 'no'], ['b', 5, 3.0, 'no']],
-                          columns=['a', 'b', 'c', 'd'])
-        df_new = self.importer.check_unique(df)
-        self.assertEqual(df_new.shape[0], 1)
+        importer = XLSImporter(**self.importer_kwargs)
+        importer.read_file(tmp.name)

     def test_raise(self):
+        importer = XLSImporter(**self.importer_kwargs)
         tmp = NamedTemporaryFile(delete=False, suffix=".lol")
         tmp.close()
-        self.assertRaises(DataInconsistencyError, self.importer.read_xls,
+        self.assertRaises(DataInconsistencyError, importer.read_xls,
                           tmp.name)
+
+
+class CSVImporterTest(TableImporterTest):
+    def test_full(self):
+        """ test full run with example data """
+        tmp = NamedTemporaryFile(delete=False, suffix=".csv")
+        tmp.close()
+        self.valid_df.to_csv(tmp.name)
+        importer = CSVImporter(**self.importer_kwargs)
+        importer.read_file(tmp.name)
+
+
+class TSVImporterTest(TableImporterTest):
+    def test_full(self):
+        """ test full run with example data """
+        tmp = NamedTemporaryFile(delete=False, suffix=".tsv")
+        tmp.close()
+        self.valid_df.to_csv(tmp.name, sep="\t")
+        importer = TSVImporter(**self.importer_kwargs)
+        importer.read_file(tmp.name)