# test_table_importer.py

#!/usr/bin/env python
# encoding: utf-8
#
# Copyright (C) 2020 Henrik tom Wörden
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.


import datetime
import os
import unittest
from functools import partial
from tempfile import NamedTemporaryFile

import numpy as np
import pandas as pd
from caosadvancedtools.datainconsistency import DataInconsistencyError
from caosadvancedtools.table_importer import (XLSImporter, assure_name_format,
                                              date_converter,
                                              datetime_converter,
                                              incomplete_date_converter,
                                              win_path_converter,
                                              win_path_list_converter,
                                              yes_no_converter)


class ConverterTest(unittest.TestCase):
    """Tests for the stand-alone value converters of ``table_importer``."""

    def test_yes_no(self):
        """Check that yes/no spellings map to truthy/falsy and others raise."""
        for value in ("YES", "Yes", "yes"):
            self.assertTrue(yes_no_converter(value))

        for value in ("No", "no"):
            self.assertFalse(yes_no_converter(value))

        # Anything that is not a yes/no spelling is rejected, including
        # the textual booleans.
        for value in ("nope", "FALSE", "TRUE", "True", "true"):
            self.assertRaises(ValueError, yes_no_converter, value)

    def test_assure_name_format(self):
        """Check that "Last, First" passes through and "First Last" raises."""
        self.assertEqual(assure_name_format("Müstermann, Max"),
                         "Müstermann, Max")
        self.assertRaises(ValueError, assure_name_format, "Max Mustermann")

    def test_winpath(self):
        """Check conversion of backslash paths to forward-slash paths."""
        self.assertRaises(ValueError, win_path_converter, "/hallo/python")
        self.assertEqual(win_path_converter(r"\this\computer"),
                         "/this/computer")
        self.assertEqual(win_path_list_converter(r"\this\computer"),
                         ["/this/computer"])
        self.assertEqual(
            win_path_list_converter(r"\this\computer,\this\computer"),
            ["/this/computer", "/this/computer"])

    def test_datetime(self):
        """Read column ``d`` of ``date.xlsx`` with ``datetime_converter``."""
        test_file = os.path.join(os.path.dirname(__file__), "date.xlsx")
        importer = XLSImporter(converters={'d': datetime_converter},
                               obligatory_columns=['d'])

        df = importer.read_xls(test_file)

        self.assertEqual(df.shape[0], 2)
        self.assertEqual(df.d.iloc[0],
                         datetime.datetime(1980, 12, 31, 13, 24, 23))

    def test_date(self):
        """Read columns ``a``, ``b``, ``c`` of ``date.xlsx`` as dates.

        Column ``c`` is parsed with an explicit ``%d.%m.%y`` format; the first
        row of all three columns is expected to hold the same date.
        """
        test_file = os.path.join(os.path.dirname(__file__), "date.xlsx")
        importer = XLSImporter(
            converters={'a': date_converter,
                        'b': date_converter,
                        'c': partial(date_converter, fmt="%d.%m.%y")},
            obligatory_columns=['a'])

        df = importer.read_xls(test_file)

        self.assertEqual(df.shape[0], 2)
        self.assertEqual(df.a.iloc[0], df.b.iloc[0])
        self.assertEqual(df.b.iloc[0], df.c.iloc[0])

    def test_inc_date(self):
        """Check ``incomplete_date_converter`` for year, month and day input.

        ``fmts`` maps the output format (key) to the format used for parsing
        the input (value).
        """
        # The results must actually be asserted; a bare ``==`` expression
        # would be evaluated and silently discarded.
        self.assertEqual(
            incomplete_date_converter("2020", fmts={"%Y": "%Y"}),
            "2020")
        self.assertEqual(
            incomplete_date_converter("02/2020",
                                      fmts={"%Y": "%Y", "%Y-%m": "%m/%Y"}),
            "2020-02")
        self.assertEqual(
            incomplete_date_converter("02/02/2020",
                                      fmts={"%Y": "%Y", "%Y-%m": "%m/%Y",
                                            "%Y-%m-%d": "%d/%m/%Y"}),
            "2020-02-02")
        self.assertEqual(
            incomplete_date_converter("2020",
                                      fmts={"%Y": "%Y", "%Y-%m": "%m/%Y",
                                            "%Y-%m-%d": "%d/%m/%Y"}),
            "2020")
        self.assertRaises(RuntimeError,
                          incomplete_date_converter,
                          "2020e",
                          fmts={"%Y": "%Y"})


class XLSImporterTest(unittest.TestCase):
    """Tests for the column, missing-value and uniqueness checks of
    ``XLSImporter`` and for reading files via ``read_xls``."""

    def setUp(self):
        """Create an importer for columns a-d and a one-row valid frame."""
        self.importer = XLSImporter(
            converters={'a': str, 'b': int, 'c': float, 'd': yes_no_converter},
            obligatory_columns=['a', 'b'], unique_keys=[('a', 'b')])
        self.valid_df = pd.DataFrame(
            [['a', 1, 2.0, 'yes']], columns=['a', 'b', 'c', 'd'])

    def _make_temp_file(self, suffix):
        """Return the path of a new, closed temporary file.

        The file is created with ``delete=False`` and closed right away so it
        can be reopened by name; removal is registered as a test cleanup so
        no file is left behind after the test.
        """
        tmp = NamedTemporaryFile(delete=False, suffix=suffix)
        tmp.close()
        self.addCleanup(os.remove, tmp.name)

        return tmp.name

    def test_missing_col(self):
        """A frame lacking converter columns must be rejected."""
        df = pd.DataFrame(columns=['a', 'b'])
        self.assertRaises(ValueError, self.importer.check_columns, df)
        self.importer.check_columns(self.valid_df)

    def test_missing_val(self):
        """Rows with missing values in obligatory columns are dropped."""
        self.importer.check_missing(self.valid_df)
        df = pd.DataFrame([[None, np.nan, 2.0, 'yes'],
                           [None, 1, 2.0, 'yes'],
                           ['a', np.nan, 2.0, 'yes'],
                           ['b', 5, 3.0, 'no']],
                          columns=['a', 'b', 'c', 'd'])
        df_new = self.importer.check_missing(df)
        # Only the last row has both 'a' and 'b' set.
        self.assertEqual(df_new.shape[0], 1)
        self.assertEqual(df_new.shape[1], 4)
        self.assertEqual(df_new.iloc[0].b, 5)

    def test_full(self):
        """ test full run with example data """
        path = self._make_temp_file(".xlsx")
        self.valid_df.to_excel(path)
        self.importer.read_xls(path)

    def test_unique(self):
        """Rows duplicated with respect to the unique keys are reduced to
        one."""
        self.importer.check_missing(self.valid_df)
        df = pd.DataFrame([['b', 5, 3.0, 'no'], ['b', 5, 3.0, 'no']],
                          columns=['a', 'b', 'c', 'd'])
        df_new = self.importer.check_unique(df)
        self.assertEqual(df_new.shape[0], 1)

    def test_raise(self):
        """Reading an empty file with a ``.lol`` suffix must raise
        ``DataInconsistencyError``."""
        path = self._make_temp_file(".lol")
        self.assertRaises(DataInconsistencyError, self.importer.read_xls,
                          path)