From 4ad7df786ca90bf1f030584336acd9f6c886c319 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Henrik=20tom=20W=C3=B6rden?= <h.tomwoerden@indiscale.com>
Date: Tue, 14 Jul 2020 17:15:03 +0200
Subject: [PATCH] MAINT: include proper error handling when reading xls files

---
 src/caosadvancedtools/table_importer.py | 41 ++++++++++++++++++++-----
 1 file changed, 34 insertions(+), 7 deletions(-)

diff --git a/src/caosadvancedtools/table_importer.py b/src/caosadvancedtools/table_importer.py
index 13305ac0..3b67e932 100755
--- a/src/caosadvancedtools/table_importer.py
+++ b/src/caosadvancedtools/table_importer.py
@@ -21,8 +21,10 @@ import logging
 
 import numpy as np
 import pandas as pd
-
 from caosadvancedtools.suppressKnown import SuppressKnown
+from xlrd import XLRDError
+
+from .datainconsistency import DataInconsistencyError
 
 logger = logging.getLogger("caosadvancedtools")
 
@@ -68,16 +70,35 @@ class XLS_Importer(object):
         self.converters = converters
 
     def read_xls(self, filename):
-        xls_file = pd.io.excel.ExcelFile(filename)
-
-        if len(xls_file .sheet_names) > 1:
+        try:
+            xls_file = pd.io.excel.ExcelFile(filename)
+        except XLRDError as e:
+            # Plain truthiness test; fall back to a generic name if empty.
+            errmsg = "Cannot read {}.".format(filename or "the file")
+            logger.warning(
+                errmsg,
+                extra={'identifier': str(filename),
+                       'category': "inconsistency"})
+            raise DataInconsistencyError(*e.args) from e
+
+        if len(xls_file.sheet_names) > 1:
             # Multiple sheets is the default now. Only show in debug
             logger.debug("Excel file {} contains multiple sheets. "
                          "All but the first are being ignored.".format(
                              filename
                          ))
 
-        df = xls_file.parse(converters=self.converters)
+        try:
+            df = xls_file.parse(converters=self.converters)
+        except Exception as e:
+            # Plain truthiness test; fall back to a generic name if empty.
+            errmsg = "Cannot parse {}.".format(filename or "the file")
+            logger.warning(
+                errmsg,
+                extra={'identifier': str(filename),
+                       'category': "inconsistency"})
+            raise DataInconsistencyError(*e.args) from e
+
         self.check_columns(df, filename=filename)
         df = self.check_missing(df, filename=filename)
 
@@ -90,8 +111,14 @@ class XLS_Importer(object):
         for col in self.required_columns:
             if col not in df.columns:
                 errmsg = "Column '{}' missing in ".format(col)
-                errmsg += "{}.".format(filename) if filename else "the file."
-                raise ValueError(errmsg)
+                # Test filename directly: concatenating it fails on non-str.
+                errmsg += "{}.".format(filename) if filename else "the file."
+                errmsg += " Stopping to treat this file..."
+                logger.warning(
+                    errmsg,
+                    extra={'identifier': str(filename),
+                           'category': "inconsistency"})
+                raise DataInconsistencyError(errmsg)
 
     def check_unique(self, df, filename=None):
         df = df.copy()
-- 
GitLab