Skip to content
Snippets Groups Projects
Verified Commit 985cb826 authored by Daniel Hornung's avatar Daniel Hornung
Browse files

FIX: Be less strict about numeric datatypes from xlsx files.

parent cc64104c
No related branches found
No related tags found
2 merge requests!39Release 0.4.0,!34TableConverter now converts int to float and vice versa to match the desired dtype.
......@@ -14,6 +14,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
### Changed ###
- `TableConverter` now converts int to float and vice versa to match the desired dtype.
### Deprecated ###
### Removed ###
......
......@@ -279,6 +279,8 @@ class Crawler(object):
except DataInconsistencyError as e:
logger.debug(traceback.format_exc())
logger.debug(e)
# TODO: Generally: in which cases should exceptions be raised? When is
# errors_occured set to True? The expected behavior must be documented.
except Exception as e:
try:
DataModelProblems.evaluate_exception(e)
......
......@@ -205,27 +205,32 @@ def string_in_list(val, options, ignore_case=True):
return val
class TableImporter(object):
class TableImporter():
"""Abstract base class for importing data from tables.
"""
def __init__(self, converters, obligatory_columns=None, unique_keys=None,
datatypes=None):
"""
converters: dict with column names as keys and converter functions as
values
This dict also defines what columns are required to exist
through the existing keys. The converter functions are
applied to the cell values. They should also check for
ValueErrors, such that a separate value check is not
necessary.
obligatory_columns: list of column names, optional
each listed column must not have missing values
unique_columns : list of column names that in
combination must be unique; i.e. each row has a
unique combination of values in those columns.
datatypes: dict with column names as keys and datatypes as values
All non-null values will be checked whether they have the
provided datatype.
This dict also defines what columns are required to exist
through the existing keys.
Parameters
----------
converters : dict
Dict with column names as keys and converter functions as values. This dict also defines
what columns are required to exist through the existing keys. The converter functions are
applied to the cell values. They should also check for ValueErrors, such that a separate
value check is not necessary.
obligatory_columns : list, optional
List of column names, each listed column must not have missing values.
unique_keys : list, optional
List of column names that in combination must be unique: each row has a unique
combination of values in those columns.
datatypes : dict, optional
Dict with column names as keys and datatypes as values. All non-null values will be
checked whether they have the provided datatype. This dict also defines what columns are
required to exist through the existing keys.
"""
if converters is None:
......@@ -247,11 +252,14 @@ class TableImporter(object):
raise NotImplementedError()
def check_columns(self, df, filename=None):
"""
checks whether all required columns, i.e. columns for which converters
were defined exist.
"""Check whether all required columns exist.
Required columns are columns for which converters are defined.
Raises
------
DataInconsistencyError
Raises: DataInconsistencyError
"""
for col in self.required_columns:
......@@ -267,12 +275,11 @@ class TableImporter(object):
raise DataInconsistencyError(errmsg)
def check_unique(self, df, filename=None):
"""
Check whether value combinations that shall be unique for each row are
unique.
"""Check whether value combinations that shall be unique for each row are unique.
If a second row is found, that uses the same combination of values as a
previous one, the second one is removed.
"""
df = df.copy()
uniques = []
......@@ -299,13 +306,33 @@ class TableImporter(object):
return df
def check_datatype(self, df, filename=None):
"""
Check for each column whether non-null fields have the correct
datatype.
"""
def check_datatype(self, df, filename=None, strict=False):
"""Check for each column whether non-null fields have the correct datatype.
.. note::
If columns are float, but should be integer or vice versa, this method converts the
respective columns in place.
Parameters
----------
strict: boolean, optional
If False (the default), try to convert columns, otherwise raise an error.
"""
for key, datatype in self.datatypes.items():
# Check for castable numeric types first: We unconditionally cast float to int and vice
# versa, because CaosDB does not have different sizes anyway.
col_dtype = df.dtypes[key]
if not strict and not np.issubdtype(col_dtype, datatype):
issub = np.issubdtype
# These special cases should be fine.
if ((issub(col_dtype, np.integer) and issub(datatype, np.floating))
or (issub(col_dtype, np.floating) and issub(datatype, np.integer))):
df[key] = df[key].astype(datatype)
# Now check each element
for idx, val in df.loc[
pd.notnull(df.loc[:, key]), key].iteritems():
......@@ -326,6 +353,11 @@ class TableImporter(object):
Check in each row whether obligatory fields are empty or null.
Rows that have missing values are removed.
Returns
-------
out : pandas.DataFrame
The input DataFrame with incomplete rows removed.
"""
df = df.copy()
......@@ -362,10 +394,26 @@ class TableImporter(object):
return df
def check_dataframe(self, df, filename):
def check_dataframe(self, df, filename=None, strict=False):
"""Check if the dataframe conforms to the restrictions.
Checked restrictions are: Columns, data types, uniqueness requirements.
Parameters
----------
df: pandas.DataFrame
The dataframe to be checked.
filename: string, optional
The file name, only used for output in case of problems.
strict: boolean, optional
If False (the default), try to convert columns, otherwise raise an error.
"""
self.check_columns(df, filename=filename)
df = self.check_missing(df, filename=filename)
self.check_datatype(df, filename=filename)
self.check_datatype(df, filename=filename, strict=strict)
if len(self.unique_keys) > 0:
df = self.check_unique(df, filename=filename)
......@@ -378,8 +426,7 @@ class XLSImporter(TableImporter):
return self.read_xls(filename=filename, **kwargs)
def read_xls(self, filename, **kwargs):
"""
converts an xls file into a Pandas DataFrame.
"""Convert an xls file into a Pandas DataFrame.
The converters of the XLSImporter object are used.
......
File added
......@@ -23,7 +23,6 @@ import unittest
from functools import partial
from tempfile import NamedTemporaryFile
import caosdb as db
import numpy as np
import pandas as pd
import pytest
......@@ -211,6 +210,21 @@ class XLSImporterTest(TableImporterTest):
self.assertRaises(DataInconsistencyError, importer.read_xls,
tmp.name)
def test_datatypes(self):
    """Check datatype handling of columns read from an xlsx file.

    Ints declared as floats (and vice versa) must be converted to the
    declared dtype instead of being rejected.
    """
    # Columns in the test sheet, mapped to the dtype they shall have.
    expected_types = {
        "float_as_float": float,
        "int_as_float": float,
        "int_as_int": int,
        "float_as_int": int,
    }
    imp = XLSImporter(converters={},
                      obligatory_columns=["float_as_float"],
                      datatypes=expected_types)
    xlsx_path = os.path.join(os.path.dirname(__file__), "data", "datatypes.xlsx")
    df = imp.read_xls(xlsx_path)
    # An int column declared as float must come back as a float subtype.
    assert np.issubdtype(df.loc[0, "int_as_float"], float)
    # A float column declared as int is cast in place; rounding to 6 is
    # acceptable here.
    assert df.loc[1, "float_as_int"] == 6
class CSVImporterTest(TableImporterTest):
def test_full(self):
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment