Commit cb9b86f6 authored by Florian Spreckelsen

Merge branch 'f-convert-int-float' into 'dev'

TableConverter now converts int to float and vice versa to match the desired dtype.

See merge request !34
parents cc64104c 61a31fff
2 merge requests: !39 Release 0.4.0, !34 TableConverter now converts int to float and vice versa to match the desired dtype.
Pipeline #19892 failed
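
In essence (a minimal standalone sketch of the new behavior, with a made-up `length` column; see the `check_datatype` diff below for the real implementation): a column that arrives from a spreadsheet as integer is now cast to the desired float dtype instead of failing the datatype check.

```python
import numpy as np
import pandas as pd

# A column read from a sheet as int64, while the data model wants float:
df = pd.DataFrame({"length": [1, 2, 3]})
desired = float

# The cast that check_datatype now performs when strict=False:
if np.issubdtype(df["length"].dtype, np.integer) and np.issubdtype(desired, np.floating):
    df["length"] = df["length"].astype(desired)

assert np.issubdtype(df["length"].dtype, np.floating)
```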
@@ -107,7 +107,7 @@ style:
   stage: style
   image: $CI_REGISTRY_IMAGE
   script:
-    - autopep8 -ar --diff --exit-code --exclude swagger_client .
+    - make style
   allow_failure: true

 unittest:
...
@@ -14,6 +14,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 ### Changed ###

+- `TableConverter` now converts int to float and vice versa to match the desired dtype.
+
 ### Deprecated ###
 ### Removed ###
...
@@ -34,3 +34,7 @@ install:
 unittest:
 	pytest-3 unittests
+
+style:
+	autopep8 -ar --diff --exit-code --exclude swagger_client .
+
+.PHONY: style
 [pytest]
 testpaths = unittests
 addopts = -vv
+python_paths = src
@@ -279,6 +279,8 @@ class Crawler(object):
             except DataInconsistencyError as e:
                 logger.debug(traceback.format_exc())
                 logger.debug(e)
+                # TODO: Generally: in which cases should exceptions be raised? When is
+                # errors_occured set to True? The expected behavior must be documented.
             except Exception as e:
                 try:
                     DataModelProblems.evaluate_exception(e)
...
@@ -205,27 +205,33 @@ def string_in_list(val, options, ignore_case=True):
     return val


-class TableImporter(object):
+class TableImporter():
+    """Abstract base class for importing data from tables.
+    """
+
     def __init__(self, converters, obligatory_columns=None, unique_keys=None,
                  datatypes=None):
         """
-        converters: dict with column names as keys and converter functions as
-                    values
-                    This dict also defines what columns are required to exist
-                    through the existing keys. The converter functions are
-                    applied to the cell values. They should also check for
-                    ValueErrors, such that a separate value check is not
-                    necessary.
-        obligatory_columns: list of column names, optional
-                            each listed column must not have missing values
-        unique_columns : list of column names that in
-                         combination must be unique; i.e. each row has a
-                         unique combination of values in those columns.
-        datatypes: dict with column names as keys and datatypes as values
-                   All non-null values will be checked whether they have the
-                   provided datatype.
-                   This dict also defines what columns are required to exist
-                   through the existing keys.
+        Parameters
+        ----------
+        converters : dict
+            Dict with column names as keys and converter functions as values. This dict also
+            defines what columns are required to exist through the existing keys. The converter
+            functions are applied to the cell values. They should also check for ValueErrors,
+            such that a separate value check is not necessary.
+
+        obligatory_columns : list, optional
+            List of column names; each listed column must not have missing values.
+
+        unique_keys : list, optional
+            List of column names that in combination must be unique: each row has a unique
+            combination of values in those columns.
+
+        datatypes : dict, optional
+            Dict with column names as keys and datatypes as values. All non-null values will be
+            checked whether they have the provided datatype. This dict also defines what columns
+            are required to exist through the existing keys.
         """
         if converters is None:
@@ -247,11 +253,14 @@ class TableImporter(object):
         raise NotImplementedError()

     def check_columns(self, df, filename=None):
-        """
-        checks whether all required columns, i.e. columns for which converters
-        were defined exist.
+        """Check whether all required columns exist.

-        Raises: DataInconsistencyError
+        Required columns are columns for which converters are defined.
+
+        Raises
+        ------
+        DataInconsistencyError
         """
         for col in self.required_columns:
@@ -267,12 +276,11 @@ class TableImporter(object):
             raise DataInconsistencyError(errmsg)

     def check_unique(self, df, filename=None):
-        """
-        Check whether value combinations that shall be unique for each row are
-        unique.
+        """Check whether value combinations that shall be unique for each row are unique.

         If a second row is found, that uses the same combination of values as a
         previous one, the second one is removed.
         """
         df = df.copy()
         uniques = []
@@ -299,13 +307,32 @@ class TableImporter(object):
         return df

-    def check_datatype(self, df, filename=None):
-        """
-        Check for each column whether non-null fields are have the correct
-        datatype.
-        """
+    def check_datatype(self, df, filename=None, strict=False):
+        """Check for each column whether non-null fields have the correct datatype.
+
+        .. note::
+
+          If columns are integer, but should be float, this method converts the respective
+          columns in place.
+
+        Parameters
+        ----------
+        strict: boolean, optional
+          If False (the default), try to convert columns, otherwise raise an error.
+        """
         for key, datatype in self.datatypes.items():
+            # Check for castable numeric types first: We unconditionally cast int to the default
+            # float, because CaosDB does not have different sizes anyway.
+            col_dtype = df.dtypes[key]
+            if not strict and not np.issubdtype(col_dtype, datatype):
+                issub = np.issubdtype
+                # These special cases should be fine.
+                if issub(col_dtype, np.integer) and issub(datatype, np.floating):
+                    df[key] = df[key].astype(datatype)
+
+            # Now check each element
             for idx, val in df.loc[
                     pd.notnull(df.loc[:, key]), key].iteritems():
@@ -326,6 +353,11 @@ class TableImporter(object):
         Check in each row whether obligatory fields are empty or null.

         Rows that have missing values are removed.
+
+        Returns
+        -------
+        out : pandas.DataFrame
+          The input DataFrame with incomplete rows removed.
         """
         df = df.copy()
@@ -362,10 +394,26 @@ class TableImporter(object):
         return df

-    def check_dataframe(self, df, filename):
+    def check_dataframe(self, df, filename=None, strict=False):
+        """Check if the dataframe conforms to the restrictions.
+
+        Checked restrictions are: Columns, data types, uniqueness requirements.
+
+        Parameters
+        ----------
+        df: pandas.DataFrame
+          The dataframe to be checked.
+        filename: string, optional
+          The file name, only used for output in case of problems.
+        strict: boolean, optional
+          If False (the default), try to convert columns, otherwise raise an error.
+        """
         self.check_columns(df, filename=filename)
         df = self.check_missing(df, filename=filename)
-        self.check_datatype(df, filename=filename)
+        self.check_datatype(df, filename=filename, strict=strict)
         if len(self.unique_keys) > 0:
             df = self.check_unique(df, filename=filename)
@@ -378,8 +426,7 @@ class XLSImporter(TableImporter):
         return self.read_xls(filename=filename, **kwargs)

     def read_xls(self, filename, **kwargs):
-        """
-        converts an xls file into a Pandas DataFrame.
+        """Convert an xls file into a Pandas DataFrame.

         The converters of the XLSImporter object are used.
...
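
The casting rule added to `check_datatype` above, extracted into a self-contained sketch (the function name `cast_int_columns` and the sample data are illustrative, not part of the module):

```python
import numpy as np
import pandas as pd

def cast_int_columns(df, datatypes, strict=False):
    """Cast integer columns to float where a float dtype is desired.

    Mirrors the new check in TableImporter.check_datatype: unless strict
    is set, an int column whose desired dtype is float is converted in
    place; CaosDB does not distinguish integer sizes anyway.
    """
    for key, datatype in datatypes.items():
        col_dtype = df.dtypes[key]
        if not strict and not np.issubdtype(col_dtype, datatype):
            if np.issubdtype(col_dtype, np.integer) and np.issubdtype(datatype, np.floating):
                df[key] = df[key].astype(datatype)
    return df

df = cast_int_columns(pd.DataFrame({"a": [1, 2], "b": [0.5, 1.5]}),
                      {"a": float, "b": float})
print(df.dtypes)  # both columns are float64 now
```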
File added: `unittests/data/datatypes.xlsx` (binary)
@@ -23,7 +23,6 @@ import unittest
 from functools import partial
 from tempfile import NamedTemporaryFile

-import caosdb as db
 import numpy as np
 import pandas as pd
 import pytest
@@ -211,6 +210,19 @@ class XLSImporterTest(TableImporterTest):
         self.assertRaises(DataInconsistencyError, importer.read_xls,
                           tmp.name)

+    def test_datatypes(self):
+        """Test datatypes in columns."""
+        importer = XLSImporter(converters={},
+                               obligatory_columns=["float_as_float"],
+                               datatypes={
+                                   "float_as_float": float,
+                                   "int_as_float": float,
+                                   "int_as_int": int,
+                               }
+                               )
+        df = importer.read_xls(os.path.join(os.path.dirname(__file__), "data", "datatypes.xlsx"))
+        assert np.issubdtype(df.loc[0, "int_as_float"], float)
+

 class CSVImporterTest(TableImporterTest):
     def test_full(self):
...
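
For reference, the new `datatypes.xlsx` fixture could be generated along these lines (hypothetical; only the column names are given by the test above, the cell values and sheet layout are assumptions):

```python
import pandas as pd

df = pd.DataFrame({
    "float_as_float": [1.5],  # stored as float, desired dtype float
    "int_as_float": [2],      # stored as int, desired dtype float -> gets cast
    "int_as_int": [3],        # stored as int, desired dtype int
})
# Writing .xlsx requires an engine such as openpyxl.
df.to_excel("unittests/data/datatypes.xlsx", index=False)
```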