From 4ad7df786ca90bf1f030584336acd9f6c886c319 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Henrik=20tom=20W=C3=B6rden?= <h.tomwoerden@indiscale.com> Date: Tue, 14 Jul 2020 17:15:03 +0200 Subject: [PATCH] MAINT: include proper error handling when reading xls files --- src/caosadvancedtools/table_importer.py | 41 ++++++++++++++++++++----- 1 file changed, 34 insertions(+), 7 deletions(-) diff --git a/src/caosadvancedtools/table_importer.py b/src/caosadvancedtools/table_importer.py index 13305ac0..3b67e932 100755 --- a/src/caosadvancedtools/table_importer.py +++ b/src/caosadvancedtools/table_importer.py @@ -21,8 +21,10 @@ import logging import numpy as np import pandas as pd - from caosadvancedtools.suppressKnown import SuppressKnown +from xlrd import XLRDError + +from .datainconsistency import DataInconsistencyError logger = logging.getLogger("caosadvancedtools") @@ -68,16 +70,35 @@ class XLS_Importer(object): self.converters = converters def read_xls(self, filename): - xls_file = pd.io.excel.ExcelFile(filename) - - if len(xls_file .sheet_names) > 1: + try: + xls_file = pd.io.excel.ExcelFile(filename) + except XLRDError as e: + errmsg = ("Cannot read {}.".format(filename) if "\n"+filename+"\n" + else "the file.") + logger.warning( + errmsg, + extra={'identifier': str(filename), + 'category': "inconsistency"}) + raise DataInconsistencyError(*e.args) + + if len(xls_file.sheet_names) > 1: # Multiple sheets is the default now. Only show in debug logger.debug("Excel file {} contains multiple sheets. " "All but the first are being ignored.".format( filename )) - df = xls_file.parse(converters=self.converters) + try: + df = xls_file.parse(converters=self.converters) + except Exception as e: + errmsg = ("Cannot parse {}.".format(filename) if "\n"+filename+"\n" + else "the file.") + logger.warning( + errmsg, + extra={'identifier': str(filename), + 'category': "inconsistency"}) + raise DataInconsistencyError(*e.args) + self.check_columns(df, filename=filename) df = self.check_missing(df, filename=filename) @@ -90,8 +111,14 @@ class XLS_Importer(object): for col in self.required_columns: if col not in df.columns: errmsg = "Column '{}' missing in ".format(col) - errmsg += "{}.".format(filename) if filename else "the file." - raise ValueError(errmsg) + errmsg += ("{}.".format(filename) if "\n"+filename+"\n" + else "the file.") + errmsg += "Stopping to treat this file..." + logger.warning( + errmsg, + extra={'identifier': str(filename), + 'category': "inconsistency"}) + raise DataInconsistencyError(errmsg) def check_unique(self, df, filename=None): df = df.copy() -- GitLab