diff --git a/CHANGELOG.md b/CHANGELOG.md
index 44629bd9b80b9bfd6a8d6a991fe52c8ce5ed3919..a6b2de738c79b3ad38c6bf77a2abb3611a6511eb 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -14,6 +14,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ### Changed ###
 
+- `TableImporter` now converts int to float and vice versa to match the desired dtype.
+
 ### Deprecated ###
 
 ### Removed ###
diff --git a/src/caosadvancedtools/crawler.py b/src/caosadvancedtools/crawler.py
index 5d91d85cbbbff5b6f64ce9a9de1f29ca603d3b8a..87b91a52a6034e906766a56ded787416e5c0027d 100644
--- a/src/caosadvancedtools/crawler.py
+++ b/src/caosadvancedtools/crawler.py
@@ -279,6 +279,8 @@ class Crawler(object):
             except DataInconsistencyError as e:
                 logger.debug(traceback.format_exc())
                 logger.debug(e)
+            # TODO: Generally: in which cases should exceptions be raised? When is
+            # errors_occured set to True? The expected behavior must be documented.
             except Exception as e:
                 try:
                     DataModelProblems.evaluate_exception(e)
diff --git a/src/caosadvancedtools/table_importer.py b/src/caosadvancedtools/table_importer.py
index 0b55252bbf4d65cde1ffdf0711f396dda0f29546..654d28b41a8bd6fc2ad1d71b7deafa71e0ff21a0 100755
--- a/src/caosadvancedtools/table_importer.py
+++ b/src/caosadvancedtools/table_importer.py
@@ -205,27 +205,32 @@ def string_in_list(val, options, ignore_case=True):
     return val
 
 
-class TableImporter(object):
+class TableImporter():
+    """Abstract base class for importing data from tables.
+    """
     def __init__(self, converters, obligatory_columns=None, unique_keys=None,
                  datatypes=None):
         """
-        converters: dict with column names as keys and converter functions as
-                    values
-                    This dict also defines what columns are required to exist
-                    throught the existing keys. The converter functions are
-                    applied to the cell values. They should also check for
-                    ValueErrors, such that a separate value check is not
-                    necessary.
-        obligatory_columns: list of column names, optional
-                            each listed column must not have missing values
-        unique_columns : list of column names that in
-                            combination must be unique; i.e. each row has a
-                            unique combination of values in those columns.
-        datatypes: dict with column names as keys and datatypes as values
-                   All non-null values will be checked whether they have the
-                   provided datatype.
-                   This dict also defines what columns are required to exist
-                   throught the existing keys.
+        Parameters
+        ----------
+        converters : dict
+            Dict with column names as keys and converter functions as values. This dict also
+            defines which columns are required to exist, through its keys. The converter functions
+            are applied to the cell values. They should also check the values and raise a
+            ValueError for invalid entries, so that a separate value check is not necessary.
+
+        obligatory_columns : list, optional
+            List of column names; each listed column must not have missing values.
+
+        unique_keys : list, optional
+            List of column names that in combination must be unique, i.e. each row has a unique
+            combination of values in those columns.
+
+        datatypes : dict, optional
+            Dict with column names as keys and datatypes as values. All non-null values will be
+            checked against the provided datatype. This dict also defines which columns are
+            required to exist, through its keys.
+
         """
 
         if converters is None:
@@ -247,11 +252,14 @@ class TableImporter(object):
         raise NotImplementedError()
 
     def check_columns(self, df, filename=None):
-        """
-        checks whether all required columns, i.e. columns for which converters
-        were defined exist.
+ """Check whether all required columns exist. + + Required columns are columns for which converters are defined. + + Raises + ------ + DataInconsistencyError - Raises: DataInconsistencyError """ for col in self.required_columns: @@ -267,12 +275,11 @@ class TableImporter(object): raise DataInconsistencyError(errmsg) def check_unique(self, df, filename=None): - """ - Check whether value combinations that shall be unique for each row are - unique. + """Check whether value combinations that shall be unique for each row are unique. If a second row is found, that uses the same combination of values as a previous one, the second one is removed. + """ df = df.copy() uniques = [] @@ -299,13 +306,33 @@ class TableImporter(object): return df - def check_datatype(self, df, filename=None): - """ - Check for each column whether non-null fields are have the correct - datatype. - """ + def check_datatype(self, df, filename=None, strict=False): + """Check for each column whether non-null fields have the correct datatype. + + .. note:: + + If columns are float, but should be integer or vice versa, this method converts the + respective columns in place. + Parameters + ---------- + + strict: boolean, optional + If False (the default), try to convert columns, otherwise raise an error. + + """ for key, datatype in self.datatypes.items(): + # Check for castable numeric types first: We unconditionally cast float to int and vice + # versa, because CaosDB does not have different sizes anyway. + col_dtype = df.dtypes[key] + if not strict and not np.issubdtype(col_dtype, datatype): + issub = np.issubdtype + # These special cases should be fine. + if ((issub(col_dtype, np.integer) and issub(datatype, np.floating)) + or (issub(col_dtype, np.floating) and issub(datatype, np.integer))): + df[key] = df[key].astype(datatype) + + # Now check each element for idx, val in df.loc[ pd.notnull(df.loc[:, key]), key].iteritems(): @@ -326,6 +353,11 @@ class TableImporter(object): Check in each row whether obligatory fields are empty or null. Rows that have missing values are removed. + + Returns + ------- + out : pandas.DataFrame + The input DataFrame with incomplete rows removed. """ df = df.copy() @@ -362,10 +394,26 @@ class TableImporter(object): return df - def check_dataframe(self, df, filename): + def check_dataframe(self, df, filename=None, strict=False): + """Check if the dataframe conforms to the restrictions. + + Checked restrictions are: Columns, data types, uniqueness requirements. + + Parameters + ---------- + + df: pandas.DataFrame + The dataframe to be checked. + + filename: string, optional + The file name, only used for output in case of problems. + + strict: boolean, optional + If False (the default), try to convert columns, otherwise raise an error. + """ self.check_columns(df, filename=filename) df = self.check_missing(df, filename=filename) - self.check_datatype(df, filename=filename) + self.check_datatype(df, filename=filename, strict=strict) if len(self.unique_keys) > 0: df = self.check_unique(df, filename=filename) @@ -378,8 +426,7 @@ class XLSImporter(TableImporter): return self.read_xls(filename=filename, **kwargs) def read_xls(self, filename, **kwargs): - """ - converts an xls file into a Pandas DataFrame. + """Convert an xls file into a Pandas DataFrame. The converters of the XLSImporter object are used. 
diff --git a/unittests/data/datatypes.xlsx b/unittests/data/datatypes.xlsx
new file mode 100644
index 0000000000000000000000000000000000000000..34fc4cf43092a68b630e0e04ebc43609b8a0b17b
Binary files /dev/null and b/unittests/data/datatypes.xlsx differ
diff --git a/unittests/test_table_importer.py b/unittests/test_table_importer.py
index 9c8a379d8c12def32c04cf82c5e09c0f5f6f175c..f2727472798c9605313c3f4aacc4755530a6862d 100644
--- a/unittests/test_table_importer.py
+++ b/unittests/test_table_importer.py
@@ -23,7 +23,6 @@ import unittest
 from functools import partial
 from tempfile import NamedTemporaryFile
 
-import caosdb as db
 import numpy as np
 import pandas as pd
 import pytest
@@ -211,6 +210,21 @@ class XLSImporterTest(TableImporterTest):
         self.assertRaises(DataInconsistencyError, importer.read_xls,
                           tmp.name)
 
+    def test_datatypes(self):
+        """Test datatypes in columns."""
+        importer = XLSImporter(converters={},
+                               obligatory_columns=["float_as_float"],
+                               datatypes={
+                                   "float_as_float": float,
+                                   "int_as_float": float,
+                                   "int_as_int": int,
+                                   "float_as_int": int,
+                               }
+                               )
+        df = importer.read_xls(os.path.join(os.path.dirname(__file__), "data", "datatypes.xlsx"))
+        assert np.issubdtype(df.loc[0, "int_as_float"], float)
+        assert df.loc[1, "float_as_int"] == 6  # This is an acceptable rounding error.
+
 
 class CSVImporterTest(TableImporterTest):
     def test_full(self):
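The same checks can be sketched without the binary fixture. The values below are invented and do not reflect the contents of datatypes.xlsx, and the snippet assumes that `check_dataframe()` still returns the checked DataFrame, as it did before this patch.

import numpy as np
import pandas as pd

from caosadvancedtools.table_importer import XLSImporter

importer = XLSImporter(converters={},
                       obligatory_columns=["float_as_float"],
                       datatypes={"float_as_float": float,
                                  "int_as_float": float,
                                  "float_as_int": int})

df = pd.DataFrame({"float_as_float": [1.5, 2.5],
                   "int_as_float": [1, 2],       # ints, but float is requested
                   "float_as_int": [3.0, 6.0]})  # floats, but int is requested

# check_dataframe() forwards the new `strict` keyword to check_datatype();
# with the default strict=False both mismatching columns are coerced.
checked = importer.check_dataframe(df)
assert np.issubdtype(checked["int_as_float"].dtype, np.floating)
assert checked.loc[1, "float_as_int"] == 6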