Skip to content
Snippets Groups Projects
Commit 1f4c5e31 authored by Henrik tom Wörden's avatar Henrik tom Wörden
Browse files

ENH: add utilities for importing tables

parent 283f8a3d
No related branches found
No related tags found
1 merge request!22Release 0.3
#!/usr/bin/env python
# encoding: utf-8
#
# Copyright (C) 2020 Henrik tom Wörden
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
import logging
import numpy as np
import pandas as pd
from caosadvancedtools.suppressable import Suppressable
logger = logging.getLogger("caosadvancedtools")
def name_converter(name):
    """Validate that a cell holds a name of the form 'LastName, FirstName'.

    The value is converted to ``str`` and must contain exactly one comma.

    Returns the value as a string (unchanged apart from the conversion).
    Raises ValueError if the string does not split into exactly two parts
    at the comma.
    """
    name = str(name)

    if len(name.split(",")) != 2:
        # The two sentences are separate literals; the space between them
        # has to be written explicitly.
        raise ValueError("Name field should be 'LastName, FirstName'. "
                         "The supplied value was '{}'.".format(name))

    return name
def yes_no_converter(val):
    """Translate a 'Yes'/'No' cell value (case-insensitive) into a bool.

    Returns True for 'yes' and False for 'no'.
    Raises ValueError for every other value.
    """
    answer = str(val).lower()

    if answer not in ("yes", "no"):
        raise ValueError(
            "Field should be 'Yes' or 'No', but is '{}'.".format(val))

    return answer == "yes"
class XLS_Importer(object):
    """Read an Excel table into a pandas DataFrame and validate its rows.

    Validation consists of three steps (see ``read_xls``): all required
    columns must exist, rows with missing obligatory values are removed and
    rows that repeat a value combination of the unique columns are removed.
    Removed rows are reported via ``self.sup.warning``.
    """

    def __init__(self, converters, obligatory_columns=None,
                 unique_columns=None):
        """
        converters: dict with column names as keys and converter functions as
                    values
                    This dict also defines what columns are required to exist
                    through the existing keys. The converter functions are
                    applied to the cell values. They should also check for
                    ValueErrors, such that a separate value check is not
                    necessary.
        obligatory_columns: list of column names, optional
                            each listed column must not have missing values
        unique_columns: list of column names, optional
                        the value combination of the listed columns must not
                        occur in more than one row
        """
        self.sup = Suppressable(logger=logger)
        self.required_columns = list(converters.keys())
        # None instead of [] as default: a mutable default would be shared
        # between all instances.
        self.obligatory_columns = (
            [] if obligatory_columns is None else obligatory_columns)
        self.unique_columns = (
            [] if unique_columns is None else unique_columns)
        self.converters = converters

    def read_xls(self, filename):
        """Parse the first sheet of ``filename`` and return the checked table.

        The converters are applied while parsing. Afterwards the columns are
        checked, incomplete rows are removed and, if unique columns were
        configured, duplicate rows are removed.

        Raises ValueError if a required column is missing (and whatever the
        converter functions raise for invalid cell values).
        """
        # The context manager makes sure the file handle is released again.
        with pd.io.excel.ExcelFile(filename) as xls_file:
            if len(xls_file.sheet_names) > 1:
                # Multiple sheets is the default now. Only show in debug
                logger.debug("Excel file {} contains multiple sheets. "
                             "All but the first are being ignored.".format(
                                 filename
                             ))

            df = xls_file.parse(converters=self.converters)

        self.check_columns(df, filename=filename)
        df = self.check_missing(df, filename=filename)

        if len(self.unique_columns) > 0:
            df = self.check_unique(df, filename=filename)

        return df

    def check_columns(self, df, filename=None):
        """Raise ValueError if one of the required columns is not in ``df``.

        ``filename`` is only used for the error message.
        """
        for col in self.required_columns:
            if col not in df.columns:
                errmssg = "Column '{}' missing in ".format(col)
                errmssg += "{}.".format(filename) if filename else "the file."

                raise ValueError(errmssg)

    def check_unique(self, df, filename=None):
        """Return a copy of ``df`` without rows that repeat a unique key.

        The key of a row is the tuple of its values in the unique columns.
        The first row with a given key is kept; every later one is reported
        as a warning and removed. ``df`` itself is not modified.
        """
        if len(self.unique_columns) == 0:
            # Without unique columns every row would have the same (empty)
            # key and all but the first row would be thrown away.
            return df.copy()

        # A list (not a set) is used on purpose: cell values are not
        # guaranteed to be hashable.
        seen = []
        duplicates = []

        for index, row in df.iterrows():
            element = tuple(row.loc[key] for key in self.unique_columns)

            if element in seen:
                errmssg = (
                    "The {}. row contains the values '{}'.\nThis value "
                    "combination should be unique, but was used in a previous "
                    "row in\n").format(index+1, element)
                errmssg += "{}.".format(filename) if filename else "the file."
                errmssg += "\nThis row will be ignored!"

                self.sup.warning(errmssg, identifier=filename,
                                 category="inconsistency")
                duplicates.append(index)
            else:
                seen.append(element)

        if not duplicates:
            return df.copy()

        return df.drop(duplicates)

    def check_missing(self, df, filename=None):
        """Return a copy of ``df`` without rows lacking obligatory values.

        Rows in which all obligatory columns are empty are removed silently.
        Rows in which only some obligatory values are missing are removed
        as well, but one warning is issued per missing column. ``df`` itself
        is not modified.
        """
        if len(self.obligatory_columns) == 0:
            # Nothing is obligatory, so nothing can be missing. (An "all
            # values missing" test over zero columns would be true for
            # every row.)
            return df.copy()

        incomplete = []

        for index, row in df.iterrows():
            missing = [key for key in self.obligatory_columns
                       if pd.isnull(row.loc[key])]

            if not missing:
                continue

            # The row is dropped exactly once, no matter how many values
            # are missing.
            incomplete.append(index)

            # if none of the relevant information is given, skip
            if len(missing) == len(self.obligatory_columns):
                continue

            # if any of the relevant information is missing, report it
            for key in missing:
                errmssg = (
                    "Required information is missing ({}) in {}. row"
                    " (without header) of "
                    "file:\n{}".format(key, index+1, filename))
                self.sup.warning(errmssg, identifier=filename,
                                 category="inconsistency")

        if not incomplete:
            return df.copy()

        return df.drop(incomplete)
......@@ -7,4 +7,6 @@ deps=nose
caosdb
pytest
pytest-cov
openpyxl
xlrd
commands=py.test --cov=caosadvancedtools -vv {posargs}
#!/usr/bin/env python
# encoding: utf-8
#
# Copyright (C) 2020 Henrik tom Wörden
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
import unittest
from tempfile import NamedTemporaryFile
import numpy as np
import pandas as pd
from caosadvancedtools.table_importer import (XLS_Importer, name_converter,
yes_no_converter)
class ConverterTest(unittest.TestCase):
    """Tests for the stand-alone cell converter functions."""

    def test_yes_no(self):
        """'yes'/'no' are accepted in any case; everything else raises."""
        for accepted in ("YES", "Yes", "yes"):
            self.assertTrue(yes_no_converter(accepted))

        for declined in ("No", "no"):
            self.assertFalse(yes_no_converter(declined))

        for invalid in ("nope", "FALSE", "TRUE", "True", "true"):
            with self.assertRaises(ValueError):
                yes_no_converter(invalid)

    def test_name_converter(self):
        """A 'LastName, FirstName' value passes through; others raise."""
        valid_name = "Müstermann, Max"
        self.assertEqual(name_converter(valid_name), "Müstermann, Max")

        with self.assertRaises(ValueError):
            name_converter("Max Mustermann")
class XLS_ImporterTest(unittest.TestCase):
    """Tests for the checks of XLS_Importer and for reading a real file."""

    def setUp(self):
        """Create an importer and a one-row table that passes all checks."""
        self.importer = XLS_Importer(
            converters={'a': str, 'b': int, 'c': float, 'd': yes_no_converter},
            obligatory_columns=['a', 'b'], unique_columns=['a', 'b'])
        self.valid_df = pd.DataFrame(
            [['a', 1, 2.0, 'yes']], columns=['a', 'b', 'c', 'd'])

    def test_missing_col(self):
        """A table lacking required columns is rejected, a complete one not."""
        df = pd.DataFrame(columns=['a', 'b'])
        self.assertRaises(ValueError, self.importer.check_columns, df)
        self.importer.check_columns(self.valid_df)

    def test_missing_val(self):
        """Rows with missing obligatory values ('a', 'b') are removed."""
        self.importer.check_missing(self.valid_df)
        df = pd.DataFrame([[None, np.nan, 2.0, 'yes'],
                           [None, 1, 2.0, 'yes'],
                           ['a', np.nan, 2.0, 'yes'],
                           ['b', 5, 3.0, 'no']],
                          columns=['a', 'b', 'c', 'd'])
        df_new = self.importer.check_missing(df)
        # only the last row is complete
        self.assertEqual(df_new.shape[0], 1)
        self.assertEqual(df_new.shape[1], 4)
        self.assertEqual(df_new.iloc[0].b, 5)

    def test_full(self):
        """Write the valid table to an Excel file and read it back."""
        import os  # only needed here, for removing the temporary file

        # delete=False + close(): the file is reopened by name below.
        tmp = NamedTemporaryFile(delete=False, suffix=".xlsx")
        tmp.close()
        # Remove the file again, even if the test fails.
        self.addCleanup(os.remove, tmp.name)

        self.valid_df.to_excel(tmp.name)
        df = self.importer.read_xls(tmp.name)

        self.assertEqual(df.shape[0], 1)
        self.assertEqual(df.iloc[0].b, 1)

    def test_unique(self):
        """A row repeating the ('a', 'b') combination is removed."""
        # a table without duplicates stays as it is
        self.assertEqual(
            self.importer.check_unique(self.valid_df).shape[0], 1)

        df = pd.DataFrame([['b', 5, 3.0, 'no'], ['b', 5, 3.0, 'no']],
                          columns=['a', 'b', 'c', 'd'])
        df_new = self.importer.check_unique(df)
        self.assertEqual(df_new.shape[0], 1)
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment