Skip to content
Snippets Groups Projects
Commit 79018027 authored by Timm Fitschen's avatar Timm Fitschen
Browse files

Merge branch 'table' into 'dev'

ENH: add utilities for importing tables

See merge request caosdb/caosdb-advanced-user-tools!35
parents 4b1e3dae 2b138494
No related branches found
No related tags found
1 merge request!22Release 0.3
......@@ -4,3 +4,5 @@ __pycache__
*cache.db
*.egg-info
.docker/cert
version.py
.eggs/
......@@ -9,6 +9,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
### Added ###
- New class to collect possible problems with the data model
- New class for checking and importing tables
### Changed ###
- The WebUIHandler is now a python logging formatter.
......
#!/usr/bin/env python
# encoding: utf-8
#
# Copyright (C) 2020 Henrik tom Wörden
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
import logging
import numpy as np
import pandas as pd
from caosadvancedtools.suppressable import Suppressable
# Logger registered under the name "caosadvancedtools"; XLSImporter hands it
# to Suppressable and uses it for its own debug output.
logger = logging.getLogger("caosadvancedtools")
def name_converter(name):
    """Validate that a cell holds a name of the form 'LastName, FirstName'.

    The value is converted to ``str`` and must contain exactly one comma.

    Returns the value as a string (unchanged apart from the ``str``
    conversion).

    Raises ValueError if the value does not consist of exactly two
    comma-separated parts.
    """
    name = str(name)

    # Exactly one comma <=> splitting on "," yields exactly two parts.
    if name.count(",") != 1:
        # The trailing space after the first sentence keeps the two sentences
        # of the message from running into each other.
        raise ValueError("Name field should be 'LastName, FirstName'. "
                         "The supplied value was '{}'.".format(name))

    return name
def yes_no_converter(val):
    """Translate a 'Yes'/'No' cell value (case-insensitive) into a bool.

    Returns True for "yes" and False for "no".

    Raises ValueError for every other value.
    """
    normalized = str(val).lower()

    if normalized not in ("yes", "no"):
        raise ValueError(
            "Field should be 'Yes' or 'No', but is '{}'.".format(val))

    return normalized == "yes"
class XLSImporter(object):
    """Read the first sheet of an Excel file into a checked DataFrame.

    After parsing, the table is checked for required columns, for rows with
    missing obligatory values and (optionally) for rows that repeat a value
    combination which is supposed to be unique. Offending rows are reported
    via ``self.sup.warning`` and removed from the returned table.
    """

    def __init__(self, converters, obligatory_columns=None, unique_keys=None):
        """
        converters: dict with column names as keys and converter functions as
                    values
                    This dict also defines what columns are required to exist
                    through the existing keys. The converter functions are
                    applied to the cell values. They should also check for
                    ValueErrors, such that a separate value check is not
                    necessary.
        obligatory_columns: list of column names, optional
                            each listed column must not have missing values
        unique_keys: list of column name tuples, optional
                     the value combination of the columns of each tuple must
                     not occur in more than one row; the values need to be
                     hashable
        """
        self.sup = Suppressable(logger=logger)
        self.required_columns = list(converters.keys())
        self.obligatory_columns = [] if obligatory_columns is None else obligatory_columns
        self.unique_keys = [] if unique_keys is None else unique_keys
        self.converters = converters

    def read_xls(self, filename):
        """Parse the first sheet of `filename` and return the checked table.

        Raises ValueError (from check_columns) if a required column is
        missing.
        """
        # The context manager closes the underlying file handle again.
        with pd.io.excel.ExcelFile(filename) as xls_file:
            if len(xls_file.sheet_names) > 1:
                # Multiple sheets is the default now. Only show in debug
                logger.debug("Excel file %s contains multiple sheets. "
                             "All but the first are being ignored.", filename)

            df = xls_file.parse(converters=self.converters)

        self.check_columns(df, filename=filename)
        df = self.check_missing(df, filename=filename)

        if self.unique_keys:
            df = self.check_unique(df, filename=filename)

        return df

    def check_columns(self, df, filename=None):
        """Raise ValueError if a required column is not part of `df`.

        `filename` is only used for the error message.
        """
        for col in self.required_columns:
            if col not in df.columns:
                errmssg = "Column '{}' missing in ".format(col)
                errmssg += "{}.".format(filename) if filename else "the file."
                raise ValueError(errmssg)

    def check_unique(self, df, filename=None):
        """Return a copy of `df` without rows that repeat a unique key.

        For every column tuple in `self.unique_keys` the first row with a
        given value combination is kept; each later row with the same
        combination is reported as a warning and dropped.
        """
        df = df.copy()

        for unique_columns in self.unique_keys:
            # Start with a fresh collection per key: value tuples of
            # different keys are unrelated and must not be compared.
            seen = set()
            duplicates = []
            subtable = df[list(unique_columns)]

            for index, row in subtable.iterrows():
                element = tuple(row)

                if element not in seen:
                    seen.add(element)
                    continue

                errmssg = (
                    "The {}. row contains the values '{}'.\nThis value "
                    "combination should be unique, but was used in a previous "
                    "row in\n").format(index+1, element)
                errmssg += "{}.".format(filename) if filename else "the file."
                errmssg += "\nThis row will be ignored!"
                self.sup.warning(errmssg, identifier=filename,
                                 category="inconsistency")
                duplicates.append(index)

            # Rows dropped here are no longer considered for the next key.
            df = df.drop(duplicates)

        return df

    def check_missing(self, df, filename=None):
        """Return a copy of `df` without rows lacking obligatory values.

        Rows in which all obligatory columns are empty are dropped silently;
        rows in which only some are empty are dropped after one warning per
        missing column.
        """
        df = df.copy()

        # Without obligatory columns there is nothing that could be missing.
        # (An "all values are missing" test is vacuously true in that case
        # and would discard every row.)
        if not self.obligatory_columns:
            return df

        incomplete = []

        for index, row in df.iterrows():
            missing = [key for key in self.obligatory_columns
                       if pd.isnull(row.loc[key])]

            if not missing:
                continue

            # Each incomplete row is dropped exactly once, no matter how many
            # of its values are missing.
            incomplete.append(index)

            # if none of the relevant information is given, skip silently
            if len(missing) == len(self.obligatory_columns):
                continue

            # if any of the relevant information is missing, report it
            for key in missing:
                errmssg = (
                    "Required information is missing ({}) in {}. row"
                    " (without header) of "
                    "file:\n{}".format(key, index+1, filename))
                self.sup.warning(errmssg, identifier=filename,
                                 category="inconsistency")

        return df.drop(incomplete)
......@@ -7,4 +7,6 @@ deps=nose
caosdb
pytest
pytest-cov
openpyxl
xlrd
commands=py.test --cov=caosadvancedtools -vv {posargs}
#!/usr/bin/env python
# encoding: utf-8
#
# Copyright (C) 2020 Henrik tom Wörden
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
import unittest
from tempfile import NamedTemporaryFile
import numpy as np
import pandas as pd
from caosadvancedtools.table_importer import (XLSImporter, name_converter,
yes_no_converter)
class ConverterTest(unittest.TestCase):
    """Tests for the cell converter functions of the table importer."""

    def test_yes_no(self):
        """'yes'/'no' are accepted in any capitalization, nothing else is."""
        for truthy in ("YES", "Yes", "yes"):
            self.assertTrue(yes_no_converter(truthy))

        for falsy in ("No", "no"):
            self.assertFalse(yes_no_converter(falsy))

        for invalid in ("nope", "FALSE", "TRUE", "True", "true"):
            with self.assertRaises(ValueError):
                yes_no_converter(invalid)

    def test_name_converter(self):
        """A 'LastName, FirstName' value passes, a name without comma fails."""
        valid_name = "Müstermann, Max"
        self.assertEqual(name_converter(valid_name), "Müstermann, Max")

        with self.assertRaises(ValueError):
            name_converter("Max Mustermann")
class XLSImporterTest(unittest.TestCase):
    """Tests for the column, missing-value and uniqueness checks."""

    def setUp(self):
        """Create an importer with four columns and a table with one valid row."""
        self.importer = XLSImporter(
            converters={'a': str, 'b': int, 'c': float, 'd': yes_no_converter},
            obligatory_columns=['a', 'b'], unique_keys=[('a', 'b')])
        self.valid_df = pd.DataFrame(
            [['a', 1, 2.0, 'yes']], columns=['a', 'b', 'c', 'd'])

    def test_missing_col(self):
        """A table lacking required columns is rejected, a complete one passes."""
        df = pd.DataFrame(columns=['a', 'b'])
        self.assertRaises(ValueError, self.importer.check_columns, df)
        self.importer.check_columns(self.valid_df)

    def test_missing_val(self):
        """Rows with missing obligatory values are removed from the table."""
        self.importer.check_missing(self.valid_df)
        df = pd.DataFrame([[None, np.nan, 2.0, 'yes'],
                           [None, 1, 2.0, 'yes'],
                           ['a', np.nan, 2.0, 'yes'],
                           ['b', 5, 3.0, 'no']],
                          columns=['a', 'b', 'c', 'd'])
        df_new = self.importer.check_missing(df)
        self.assertEqual(df_new.shape[0], 1)
        self.assertEqual(df_new.shape[1], 4)
        self.assertEqual(df_new.iloc[0].b, 5)

    def test_missing_val_without_obligatory_columns(self):
        """An importer without obligatory columns must not drop any row."""
        importer = XLSImporter(converters={'a': str})
        df_new = importer.check_missing(self.valid_df)
        self.assertEqual(df_new.shape[0], 1)

    def test_several_missing_vals(self):
        """A row lacking some, but not all, obligatory values is dropped once."""
        importer = XLSImporter(converters={'a': str, 'b': int, 'c': float},
                               obligatory_columns=['a', 'b', 'c'])
        df = pd.DataFrame([[None, np.nan, 2.0],
                           ['b', 5, 3.0]],
                          columns=['a', 'b', 'c'])
        df_new = importer.check_missing(df)
        self.assertEqual(df_new.shape[0], 1)
        self.assertEqual(df_new.iloc[0].a, 'b')

    def test_full(self):
        """Write the valid table to an Excel file and read it back."""
        # Local import: only this test needs it.
        import os

        tmp = NamedTemporaryFile(delete=False, suffix=".xlsx")
        tmp.close()
        # delete=False keeps the file on disk after close(), so it has to be
        # removed explicitly once the test is done (also if it fails).
        self.addCleanup(os.remove, tmp.name)

        self.valid_df.to_excel(tmp.name)
        df = self.importer.read_xls(tmp.name)
        self.assertEqual(df.shape[0], 1)

    def test_unique(self):
        """The second of two rows with the same unique key is removed."""
        df = pd.DataFrame([['b', 5, 3.0, 'no'], ['b', 5, 3.0, 'no']],
                          columns=['a', 'b', 'c', 'd'])
        df_new = self.importer.check_unique(df)
        self.assertEqual(df_new.shape[0], 1)

    def test_unique_independent_keys(self):
        """Equal values in the columns of different unique keys do not clash."""
        importer = XLSImporter(converters={'a': int, 'b': int},
                               unique_keys=[('a',), ('b',)])
        df = pd.DataFrame([[1, 2], [2, 3]], columns=['a', 'b'])
        df_new = importer.check_unique(df)
        self.assertEqual(df_new.shape[0], 2)
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment