Skip to content
Snippets Groups Projects
Commit de569ba6 authored by Henrik tom Wörden's avatar Henrik tom Wörden
Browse files

ENH: add table importer implementations for csv

There are actually two classes TSVImporter and CSVImporter.
parent df623eef
No related branches found
No related tags found
1 merge request: !22 "Release 0.3"
......@@ -145,12 +145,7 @@ def win_path_converter(val):
return path.as_posix()
class TSVImporter(object):
    """Placeholder for an importer of tab separated tables (not implemented)."""

    def __init__(self, converters, obligatory_columns=None,
                 unique_columns=None):
        """
        converters: accepted for interface compatibility; not used.
        obligatory_columns: accepted for interface compatibility; not used.
        unique_columns: accepted for interface compatibility; not used.

        Raises: NotImplementedError (always)
        """
        # The defaults are None instead of list literals: a list default is
        # created once and shared between all calls.
        raise NotImplementedError()
class XLSImporter(object):
class TableImporter(object):
def __init__(self, converters, obligatory_columns=None, unique_keys=None):
"""
converters: dict with column names as keys and converter functions as
......@@ -168,50 +163,14 @@ class XLSImporter(object):
"""
self.sup = SuppressKnown()
self.required_columns = list(converters.keys())
self.obligatory_columns = [] if obligatory_columns is None else obligatory_columns
self.obligatory_columns = ([]
if obligatory_columns is None
else obligatory_columns)
self.unique_keys = [] if unique_keys is None else unique_keys
self.converters = converters
def read_xls(self, filename, **kwargs):
    """
    Load an Excel file and hand back its content as a Pandas DataFrame.

    The file is parsed with the converters of this importer; additional
    keyword arguments are passed on to the parse call. Afterwards
    check_columns and check_missing are applied and, if unique keys are
    configured, check_unique as well.

    Raises: DataInconsistencyError if the file cannot be opened or parsed.
    """
    try:
        workbook = pd.io.excel.ExcelFile(filename)
    except (XLRDError, ValueError) as err:
        read_failure = "Cannot read \n{}.\nError:{}".format(filename, str(err))
        logger.warning(read_failure,
                       extra={'identifier': str(filename),
                              'category': "inconsistency"})
        raise DataInconsistencyError(*err.args)

    sheet_count = len(workbook.sheet_names)

    if sheet_count > 1:
        # Files with several sheets are common, hence only a debug message.
        logger.debug("Excel file {} contains multiple sheets. "
                     "All but the first are being ignored.".format(filename))

    try:
        frame = workbook.parse(converters=self.converters, **kwargs)
    except Exception as err:
        parse_failure = "Cannot parse {}.".format(filename)
        logger.warning(parse_failure,
                       extra={'identifier': str(filename),
                              'category': "inconsistency"})
        raise DataInconsistencyError(*err.args)

    self.check_columns(frame, filename=filename)
    frame = self.check_missing(frame, filename=filename)

    if len(self.unique_keys) != 0:
        frame = self.check_unique(frame, filename=filename)

    return frame
def read_file(self, filename, **kwargs):
    """
    Read a table from `filename`; to be provided by the subclasses.

    Raises: NotImplementedError
    """
    raise NotImplementedError()
def check_columns(self, df, filename=None):
"""
......@@ -306,3 +265,70 @@ class XLSImporter(object):
okay = False
return df
def check_dataframe(self, df, filename):
    """
    Applies the checks of this importer to a DataFrame.

    check_columns and check_missing are always called; check_unique is only
    called if unique keys are configured.

    df: the DataFrame that shall be checked
    filename: name of the file the DataFrame was read from; it is passed
              on to the individual checks

    Returns: the DataFrame as returned by the last check that was applied
    """
    self.check_columns(df, filename=filename)
    df = self.check_missing(df, filename=filename)

    if len(self.unique_keys) > 0:
        df = self.check_unique(df, filename=filename)

    # check_missing and check_unique hand back a DataFrame of their own;
    # it has to be returned, otherwise their result is silently dropped.
    return df
class XLSImporter(TableImporter):
    """Table importer that reads Excel files."""

    def read_file(self, filename, **kwargs):
        """
        Reads `filename` as an Excel file; see read_xls for details.
        """
        return self.read_xls(filename=filename, **kwargs)

    def read_xls(self, filename, **kwargs):
        """
        converts an xls file into a Pandas DataFrame.

        The converters of the XLSImporter object are used. Additional
        keyword arguments are passed on to the parse call.

        Returns: the DataFrame after the checks of check_dataframe

        Raises: DataInconsistencyError
        """
        try:
            xls_file = pd.io.excel.ExcelFile(filename)
        except (XLRDError, ValueError) as e:
            logger.warning(
                "Cannot read \n{}.\nError:{}".format(filename,
                                                     str(e)),
                extra={'identifier': str(filename),
                       'category': "inconsistency"})
            # Keep the original error attached as the explicit cause.
            raise DataInconsistencyError(*e.args) from e

        if len(xls_file.sheet_names) > 1:
            # Multiple sheets is the default now. Only show in debug
            logger.debug(
                "Excel file {} contains multiple sheets. "
                "All but the first are being ignored.".format(filename))

        try:
            df = xls_file.parse(converters=self.converters, **kwargs)
        except Exception as e:
            logger.warning(
                "Cannot parse {}.".format(filename),
                extra={'identifier': str(filename),
                       'category': "inconsistency"})
            raise DataInconsistencyError(*e.args) from e

        # Use the frame handed back by the checks, so that whatever
        # check_missing/check_unique return is not lost. If check_dataframe
        # returns None, fall back to the frame that was parsed.
        checked = self.check_dataframe(df, filename)

        return df if checked is None else checked
class CSVImporter(TableImporter):
    """Table importer for text files with semicolon separated columns."""

    def read_file(self, filename, **kwargs):
        """
        Reads a semicolon separated file into a Pandas DataFrame.

        The converters of the importer are used; additional keyword
        arguments are passed on to pandas.read_csv.

        Returns: the DataFrame after the checks of check_dataframe
        """
        df = pd.read_csv(filename, sep=";", converters=self.converters,
                         **kwargs)

        # Use the frame handed back by the checks, so that whatever
        # check_missing/check_unique return is not lost. If check_dataframe
        # returns None, fall back to the frame that was read.
        checked = self.check_dataframe(df, filename)

        return df if checked is None else checked
class TSVImporter(TableImporter):
    """Table importer for text files with tab separated columns."""

    def read_file(self, filename, **kwargs):
        """
        Reads a tab separated file into a Pandas DataFrame.

        The converters of the importer are used; additional keyword
        arguments are passed on to pandas.read_csv.

        Returns: the DataFrame after the checks of check_dataframe
        """
        df = pd.read_csv(filename, sep="\t", converters=self.converters,
                         **kwargs)

        # Use the frame handed back by the checks, so that whatever
        # check_missing/check_unique return is not lost. If check_dataframe
        # returns None, fall back to the frame that was read.
        checked = self.check_dataframe(df, filename)

        return df if checked is None else checked
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment