From 1f4c5e314a7bd430792bbd06620204f3396ddd78 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Henrik=20tom=20W=C3=B6rden?= <h.tomwoerden@indiscale.com>
Date: Tue, 14 Jul 2020 14:01:05 +0200
Subject: [PATCH] ENH: add utilities for importing tables

---
 src/caosadvancedtools/table_importer.py | 148 ++++++++++++++++++++++++
 tox.ini                                 |   2 +
 unittests/test_table_importer.py        |  83 +++++++++++++
 3 files changed, 233 insertions(+)
 create mode 100755 src/caosadvancedtools/table_importer.py
 create mode 100644 unittests/test_table_importer.py

diff --git a/src/caosadvancedtools/table_importer.py b/src/caosadvancedtools/table_importer.py
new file mode 100755
index 00000000..c859f675
--- /dev/null
+++ b/src/caosadvancedtools/table_importer.py
@@ -0,0 +1,148 @@
+#!/usr/bin/env python
+# encoding: utf-8
+#
+# Copyright (C) 2020 Henrik tom Wörden
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <https://www.gnu.org/licenses/>.
+
+
+import logging
+
+import numpy as np
+import pandas as pd
+
+from caosadvancedtools.suppressable import Suppressable
+
+logger = logging.getLogger("caosadvancedtools")
+
+
+def name_converter(name):
+    """Return str(name); raise ValueError unless it has exactly one comma."""
+    name = str(name)
+    if len(name.split(",")) != 2:
+        raise ValueError("Name field should be 'LastName, FirstName'. "
+                         "The supplied value was '{}'.".format(name))
+
+    return name
+
+
+def yes_no_converter(val):
+    """Map 'yes'/'no' (any case) to True/False, else raise ValueError."""
+    answer = str(val).lower()
+    if answer in ("yes", "no"):
+        return answer == "yes"
+
+    raise ValueError(
+        "Field should be 'Yes' or 'No', but is '{}'.".format(val))
+
+
+class XLS_Importer(object):
+    def __init__(self, converters, obligatory_columns=None,
+                 unique_columns=None):
+        """Store the converters and the column checks used by read_xls.
+
+        converters: dict with column names as keys and converter functions as
+                    values
+                    This dict also defines what columns are required to exist
+                    through the existing keys. The converter functions are
+                    applied to the cell values. They should also check for
+                    ValueErrors, such that a separate value check is not
+                    necessary.
+        obligatory_columns: list of column names, optional
+                            each listed column must not have missing values
+        unique_columns: list of column names, optional
+                        rows repeating an earlier value combination of
+                        these columns are dropped by read_xls
+        """
+        self.sup = Suppressable(logger=logger)
+        self.required_columns = list(converters.keys())
+        # None (not []) is the default so instances never share one list
+        self.obligatory_columns = (
+            [] if obligatory_columns is None else obligatory_columns)
+        self.unique_columns = [] if unique_columns is None else unique_columns
+        self.converters = converters
+
+    def read_xls(self, filename):
+        """Return the checked rows of the first sheet of `filename`."""
+        # the with block closes the workbook even if parsing raises
+        with pd.io.excel.ExcelFile(filename) as xls_file:
+            if len(xls_file.sheet_names) > 1:
+                # Multiple sheets is the default now. Only show in debug
+                logger.debug("Excel file {} contains multiple sheets. "
+                             "All but the first are being ignored."
+                             .format(filename))
+            df = xls_file.parse(converters=self.converters)
+
+        self.check_columns(df, filename=filename)
+        df = self.check_missing(df, filename=filename)
+
+        if len(self.unique_columns) > 0:
+            df = self.check_unique(df, filename=filename)
+
+        return df
+
+    def check_columns(self, df, filename=None):
+        """Raise ValueError if a column named in the converters is missing."""
+        for col in self.required_columns:
+            if col not in df.columns:
+                errmssg = "Column '{}' missing in ".format(col)
+                errmssg += "{}.".format(filename) if filename else "the file."
+                raise ValueError(errmssg)
+
+    def check_unique(self, df, filename=None):
+        """Return a copy of df without rows repeating a unique_columns value
+        combination seen in an earlier row (reported via self.sup.warning)."""
+        df = df.copy()
+        uniques = []
+        for index, row in df.iterrows():
+            element = tuple(row.loc[key] for key in self.unique_columns)
+
+            if element in uniques:
+                errmssg = (
+                    "The {}. row contains the values '{}'.\nThis value "
+                    "combination should be unique, but was used in a previous "
+                    "row in\n").format(index+1, element)
+                errmssg += "{}.".format(filename) if filename else "the file."
+                errmssg += "\nThis row will be ignored!"
+
+                self.sup.warning(errmssg, identifier=filename,
+                                 category="inconsistency")
+                df = df.drop(index)
+            else:
+                uniques.append(element)
+
+        return df
+
+    def check_missing(self, df, filename=None):
+        """Return a copy of df without rows lacking obligatory values; rows
+        lacking only some of them are also reported via self.sup.warning."""
+        df = df.copy()
+        for index, row in df.iterrows():
+            missing = [key for key in self.obligatory_columns
+                       if pd.isnull(row.loc[key])]
+            if not missing:
+                continue
+            # fully empty rows are dropped silently, others with warnings
+            if len(missing) < len(self.obligatory_columns):
+                for key in missing:
+                    errmssg = (
+                        "Required information is missing ({}) in {}. row"
+                        " (without header) of "
+                        "file:\n{}".format(key, index+1, filename))
+                    self.sup.warning(errmssg, identifier=filename,
+                                     category="inconsistency")
+            # drop once per row: a second drop of the same label would raise
+            df = df.drop(index)
+
+        return df
diff --git a/tox.ini b/tox.ini
index 57cda000..e40e0ad3 100644
--- a/tox.ini
+++ b/tox.ini
@@ -7,4 +7,6 @@ deps=nose
     caosdb
     pytest
     pytest-cov
+    openpyxl
+    xlrd
 commands=py.test --cov=caosadvancedtools -vv {posargs}
diff --git a/unittests/test_table_importer.py b/unittests/test_table_importer.py
new file mode 100644
index 00000000..49debb43
--- /dev/null
+++ b/unittests/test_table_importer.py
@@ -0,0 +1,83 @@
+#!/usr/bin/env python
+# encoding: utf-8
+#
+# Copyright (C) 2020 Henrik tom Wörden
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <https://www.gnu.org/licenses/>.
+
+
+import unittest
+from tempfile import NamedTemporaryFile
+
+import numpy as np
+import pandas as pd
+from caosadvancedtools.table_importer import (XLS_Importer, name_converter,
+                                              yes_no_converter)
+
+
+class ConverterTest(unittest.TestCase):
+    """Unit tests for the cell value converter functions."""
+
+    def test_yes_no(self):
+        """yes/no in any letter case convert; other words raise."""
+        for word in ("YES", "Yes", "yes"):
+            self.assertTrue(yes_no_converter(word))
+        for word in ("No", "no"):
+            self.assertTrue(not yes_no_converter(word))
+        for word in ("nope", "FALSE", "TRUE", "True", "true"):
+            self.assertRaises(ValueError, yes_no_converter, word)
+
+    def test_name_converter(self):
+        """A 'Last, First' name passes through; a name without comma raises."""
+        self.assertEqual(name_converter("MÃ¼stermann, Max"), "MÃ¼stermann, Max")
+        self.assertRaises(ValueError, name_converter, "Max Mustermann")
+
+
+class XLS_ImporterTest(unittest.TestCase):
+    def setUp(self):
+        self.importer = XLS_Importer(
+            converters={'a': str, 'b': int, 'c': float, 'd': yes_no_converter},
+            obligatory_columns=['a', 'b'], unique_columns=['a', 'b'])
+        self.valid_df = pd.DataFrame(
+            [['a', 1, 2.0, 'yes']], columns=['a', 'b', 'c', 'd'])
+
+    def test_missing_col(self):
+        df = pd.DataFrame(columns=['a', 'b'])
+        self.assertRaises(ValueError, self.importer.check_columns, df)
+        self.importer.check_columns(self.valid_df)
+
+    def test_missing_val(self):
+        self.importer.check_missing(self.valid_df)
+        df = pd.DataFrame(
+            [[None, np.nan, 2.0, 'yes'], [None, 1, 2.0, 'yes'],
+             ['a', np.nan, 2.0, 'yes'], ['b', 5, 3.0, 'no']],
+            columns=['a', 'b', 'c', 'd'])
+        df_new = self.importer.check_missing(df)
+        # only the last row has both obligatory values
+        self.assertEqual(df_new.shape, (1, 4))
+        self.assertEqual(df_new.iloc[0].b, 5)
+
+    def test_full(self):
+        import os  # local import: only needed to clean up the temp file
+        tmp = NamedTemporaryFile(delete=False, suffix=".xlsx")
+        tmp.close()
+        self.addCleanup(os.remove, tmp.name)  # delete=False leaves the file
+        self.valid_df.to_excel(tmp.name)
+        self.assertEqual(self.importer.read_xls(tmp.name).shape[0], 1)
+
+    def test_unique(self):
+        df = pd.DataFrame([['b', 5, 3.0, 'no'], ['b', 5, 3.0, 'no']],
+                          columns=['a', 'b', 'c', 'd'])
+        df_new = self.importer.check_unique(df)
+        self.assertEqual(df_new.shape[0], 1)
-- 
GitLab