From 1f4c5e314a7bd430792bbd06620204f3396ddd78 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Henrik=20tom=20W=C3=B6rden?= <h.tomwoerden@indiscale.com> Date: Tue, 14 Jul 2020 14:01:05 +0200 Subject: [PATCH] ENH: add utilities for importing tables --- src/caosadvancedtools/table_importer.py | 148 ++++++++++++++++++++++++ tox.ini | 2 + unittests/test_table_importer.py | 83 +++++++++++++ 3 files changed, 233 insertions(+) create mode 100755 src/caosadvancedtools/table_importer.py create mode 100644 unittests/test_table_importer.py diff --git a/src/caosadvancedtools/table_importer.py b/src/caosadvancedtools/table_importer.py new file mode 100755 index 00000000..c859f675 --- /dev/null +++ b/src/caosadvancedtools/table_importer.py @@ -0,0 +1,148 @@ +#!/usr/bin/env python +# encoding: utf-8 +# +# Copyright (C) 2020 Henrik tom Wörden +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <https://www.gnu.org/licenses/>. + + +import logging + +import numpy as np +import pandas as pd + +from caosadvancedtools.suppressable import Suppressable + +logger = logging.getLogger("caosadvancedtools") + + +def name_converter(name): + name = str(name) + + if len(name.split(",")) != 2: + raise ValueError("Name field should be 'LastName, FirstName'." 
+ "The supplied value was '{}'.".format(name)) + + return name + + +def yes_no_converter(val): + if str(val).lower() == "yes": + return True + elif str(val).lower() == "no": + return False + else: + raise ValueError( + "Field should be 'Yes' or 'No', but is '{}'.".format(val)) + + +class XLS_Importer(object): + def __init__(self, converters, obligatory_columns=[], unique_columns=[]): + """ + + converters: dict with column names as keys and converter functions as + values + This dict also defines what columns are required to exist + through the existing keys. The converter functions are + applied to the cell values. They should also check for + ValueErrors, such that a separate value check is not + necessary. + obligatory_columns: list of column names, optional + each listed column must not have missing values + """ + self.sup = Suppressable(logger=logger) + self.required_columns = list(converters.keys()) + self.obligatory_columns = obligatory_columns + self.unique_columns = unique_columns + self.converters = converters + + def read_xls(self, filename): + xls_file = pd.io.excel.ExcelFile(filename) + + if len(xls_file .sheet_names) > 1: + # Multiple sheets is the default now. Only show in debug + logger.debug("Excel file {} contains multiple sheets. " + "All but the first are being ignored.".format( + filename + )) + + df = xls_file.parse(converters=self.converters) + self.check_columns(df, filename=filename) + df = self.check_missing(df, filename=filename) + + if len(self.unique_columns) > 0: + df = self.check_unique(df, filename=filename) + + return df + + def check_columns(self, df, filename=None): + for col in self.required_columns: + if col not in df.columns: + errmssg = "Column '{}' missing in ".format(col) + errmssg += "{}.".format(filename) if filename else "the file." 
+ raise ValueError(errmssg) + + def check_unique(self, df, filename=None): + df = df.copy() + uniques = [] + + for index, row in df.iterrows(): + element = tuple(row.loc[key] for key in self.unique_columns) + + if element in uniques: + errmssg = ( + "The {}. row contains the values '{}'.\nThis value " + "combination should be unique, but was used in a previous " + "row in\n").format(index+1, element) + errmssg += "{}.".format(filename) if filename else "the file." + errmssg += "\nThis row will be ignored!" + + self.sup.warning(errmssg, identifier=filename, + category="inconsistency") + df = df.drop(index) + else: + uniques.append(element) + + return df + + def check_missing(self, df, filename=None): + df = df.copy() + + for index, row in df.iterrows(): + # if none of the relevant information is given, skip + + if np.array([pd.isnull(row.loc[key]) for key in + self.obligatory_columns]).all(): + + df = df.drop(index) + + continue + + # if any of the relevant information is missing, report it + + for key in self.obligatory_columns: + + if pd.isnull(row.loc[key]): + errmssg = ( + "Required information is missing ({}) in {}. 
row" + " (without header) of " + "file:\n{}".format(key, index+1, filename)) + + self.sup.warning(errmssg, identifier=filename, + category="inconsistency") + df = df.drop(index) + + continue + + return df diff --git a/tox.ini b/tox.ini index 57cda000..e40e0ad3 100644 --- a/tox.ini +++ b/tox.ini @@ -7,4 +7,6 @@ deps=nose caosdb pytest pytest-cov + openpyxl + xlrd commands=py.test --cov=caosadvancedtools -vv {posargs} diff --git a/unittests/test_table_importer.py b/unittests/test_table_importer.py new file mode 100644 index 00000000..49debb43 --- /dev/null +++ b/unittests/test_table_importer.py @@ -0,0 +1,83 @@ +#!/usr/bin/env python +# encoding: utf-8 +# +# Copyright (C) 2020 Henrik tom Wörden +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <https://www.gnu.org/licenses/>. 
+ + +import unittest +from tempfile import NamedTemporaryFile + +import numpy as np +import pandas as pd +from caosadvancedtools.table_importer import (XLS_Importer, name_converter, + yes_no_converter) + + +class ConverterTest(unittest.TestCase): + def test_yes_no(self): + self.assertTrue(yes_no_converter("YES")) + self.assertTrue(yes_no_converter("Yes")) + self.assertTrue(yes_no_converter("yes")) + self.assertTrue(not yes_no_converter("No")) + self.assertTrue(not yes_no_converter("no")) + self.assertRaises(ValueError, yes_no_converter, "nope") + self.assertRaises(ValueError, yes_no_converter, "FALSE") + self.assertRaises(ValueError, yes_no_converter, "TRUE") + self.assertRaises(ValueError, yes_no_converter, "True") + self.assertRaises(ValueError, yes_no_converter, "true") + + def test_name_converter(self): + self.assertEqual(name_converter("Müstermann, Max"), "Müstermann, Max") + self.assertRaises(ValueError, name_converter, "Max Mustermann") + + +class XLS_ImporterTest(unittest.TestCase): + def setUp(self): + self.importer = XLS_Importer( + converters={'a': str, 'b': int, 'c': float, 'd': yes_no_converter}, + obligatory_columns=['a', 'b'], unique_columns=['a', 'b']) + self.valid_df = pd.DataFrame( + [['a', 1, 2.0, 'yes']], columns=['a', 'b', 'c', 'd']) + + def test_missing_col(self): + df = pd.DataFrame(columns=['a', 'b']) + self.assertRaises(ValueError, self.importer.check_columns, df) + self.importer.check_columns(self.valid_df) + + def test_missing_val(self): + self.importer.check_missing(self.valid_df) + df = pd.DataFrame([[None, np.nan, 2.0, 'yes'], + [None, 1, 2.0, 'yes'], + ['a', np.nan, 2.0, 'yes'], + ['b', 5, 3.0, 'no']], + columns=['a', 'b', 'c', 'd']) + df_new = self.importer.check_missing(df) + self.assertEqual(df_new.shape[0], 1) + self.assertEqual(df_new.shape[1], 4) + self.assertEqual(df_new.iloc[0].b, 5) + + def test_full(self): + tmp = NamedTemporaryFile(delete=False, suffix=".xlsx") + tmp.close() + self.valid_df.to_excel(tmp.name) + 
self.importer.read_xls(tmp.name) + + def test_unique(self): + self.importer.check_missing(self.valid_df) + df = pd.DataFrame([['b', 5, 3.0, 'no'], ['b', 5, 3.0, 'no']], + columns=['a', 'b', 'c', 'd']) + df_new = self.importer.check_unique(df) + self.assertEqual(df_new.shape[0], 1) -- GitLab