diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index f620aeffd5146254bf630645eaded34d69f35f1c..75520091e64c870c9975174fda183c5d8fd5880b 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -107,7 +107,7 @@ style: stage: style image: $CI_REGISTRY_IMAGE script: - - autopep8 -ar --diff --exit-code --exclude swagger_client . + - make style allow_failure: true unittest: diff --git a/CHANGELOG.md b/CHANGELOG.md index 44629bd9b80b9bfd6a8d6a991fe52c8ce5ed3919..a6b2de738c79b3ad38c6bf77a2abb3611a6511eb 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -14,6 +14,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Changed ### +- `TableImporter` now converts int columns to float to match the desired dtype. + ### Deprecated ### ### Removed ### diff --git a/Makefile b/Makefile index 7609444bd4fd3a8ce980eca0bc3993b3cf2e168f..52ac04456cf59a24334003d4a0af9055dd3b11ec 100644 --- a/Makefile +++ b/Makefile @@ -34,3 +34,7 @@ install: unittest: pytest-3 unittests + +style: + autopep8 -ar --diff --exit-code --exclude swagger_client . +.PHONY: style diff --git a/pytest.ini b/pytest.ini index 211913fa06d4e0a46c9c9024e147c5313e4746e1..e65efaf9aaf061a8a1ec0040f87d682536fac4c2 100644 --- a/pytest.ini +++ b/pytest.ini @@ -1,4 +1,3 @@ [pytest] testpaths = unittests addopts = -vv -python_paths = src diff --git a/src/caosadvancedtools/crawler.py b/src/caosadvancedtools/crawler.py index 5d91d85cbbbff5b6f64ce9a9de1f29ca603d3b8a..87b91a52a6034e906766a56ded787416e5c0027d 100644 --- a/src/caosadvancedtools/crawler.py +++ b/src/caosadvancedtools/crawler.py @@ -279,6 +279,8 @@ class Crawler(object): except DataInconsistencyError as e: logger.debug(traceback.format_exc()) logger.debug(e) + # TODO: Generally: in which cases should exceptions be raised? When is + # errors_occured set to True? The expected behavior must be documented.
except Exception as e: try: DataModelProblems.evaluate_exception(e) diff --git a/src/caosadvancedtools/table_importer.py b/src/caosadvancedtools/table_importer.py index 0b55252bbf4d65cde1ffdf0711f396dda0f29546..1f515e78e3ddbd198fa0336589a359ba9154f038 100755 --- a/src/caosadvancedtools/table_importer.py +++ b/src/caosadvancedtools/table_importer.py @@ -205,27 +205,33 @@ def string_in_list(val, options, ignore_case=True): return val -class TableImporter(object): +class TableImporter(): + """Abstract base class for importing data from tables. + """ + def __init__(self, converters, obligatory_columns=None, unique_keys=None, datatypes=None): """ - converters: dict with column names as keys and converter functions as - values - This dict also defines what columns are required to exist - throught the existing keys. The converter functions are - applied to the cell values. They should also check for - ValueErrors, such that a separate value check is not - necessary. - obligatory_columns: list of column names, optional - each listed column must not have missing values - unique_columns : list of column names that in - combination must be unique; i.e. each row has a - unique combination of values in those columns. - datatypes: dict with column names as keys and datatypes as values - All non-null values will be checked whether they have the - provided datatype. - This dict also defines what columns are required to exist - throught the existing keys. + Parameters + ---------- + converters : dict + Dict with column names as keys and converter functions as values. This dict also defines + what columns are required to exist through the existing keys. The converter functions are + applied to the cell values. They should also check for ValueErrors, such that a separate + value check is not necessary. + + obligatory_columns : list, optional + List of column names, each listed column must not have missing values.
+ + unique_keys : list, optional + List of column names that in combination must be unique: each row has a unique + combination of values in those columns. + + datatypes : dict, optional + Dict with column names as keys and datatypes as values. All non-null values will be + checked whether they have the provided datatype. This dict also defines what columns are + required to exist through the existing keys. + """ if converters is None: @@ -247,11 +253,14 @@ class TableImporter(object): raise NotImplementedError() def check_columns(self, df, filename=None): - """ - checks whether all required columns, i.e. columns for which converters - were defined exist. + """Check whether all required columns exist. + + Required columns are columns for which converters are defined. + + Raises + ------ + DataInconsistencyError - Raises: DataInconsistencyError """ for col in self.required_columns: @@ -267,12 +276,11 @@ class TableImporter(object): raise DataInconsistencyError(errmsg) def check_unique(self, df, filename=None): - """ - Check whether value combinations that shall be unique for each row are - unique. + """Check whether value combinations that shall be unique for each row are unique. If a second row is found, that uses the same combination of values as a previous one, the second one is removed. + """ df = df.copy() uniques = [] @@ -299,13 +307,32 @@ class TableImporter(object): return df - def check_datatype(self, df, filename=None): - """ - Check for each column whether non-null fields are have the correct - datatype. - """ + def check_datatype(self, df, filename=None, strict=False): + """Check for each column whether non-null fields have the correct datatype. + + .. note:: + If columns are integer, but should be float, this method converts the respective columns + in place. + + Parameters + ---------- + + strict: boolean, optional + If False (the default), try to convert columns, otherwise raise an error.
+ + """ for key, datatype in self.datatypes.items(): + # Check for castable numeric types first: We unconditionally cast int to the default + # float, because CaosDB does not have different sizes anyway. + col_dtype = df.dtypes[key] + if not strict and not np.issubdtype(col_dtype, datatype): + issub = np.issubdtype + # These special cases should be fine. + if issub(col_dtype, np.integer) and issub(datatype, np.floating): + df[key] = df[key].astype(datatype) + + # Now check each element for idx, val in df.loc[ pd.notnull(df.loc[:, key]), key].iteritems(): @@ -326,6 +353,11 @@ class TableImporter(object): Check in each row whether obligatory fields are empty or null. Rows that have missing values are removed. + + Returns + ------- + out : pandas.DataFrame + The input DataFrame with incomplete rows removed. """ df = df.copy() @@ -362,10 +394,26 @@ class TableImporter(object): return df - def check_dataframe(self, df, filename): + def check_dataframe(self, df, filename=None, strict=False): + """Check if the dataframe conforms to the restrictions. + + Checked restrictions are: Columns, data types, uniqueness requirements. + + Parameters + ---------- + + df: pandas.DataFrame + The dataframe to be checked. + + filename: string, optional + The file name, only used for output in case of problems. + + strict: boolean, optional + If False (the default), try to convert columns, otherwise raise an error. + """ self.check_columns(df, filename=filename) df = self.check_missing(df, filename=filename) - self.check_datatype(df, filename=filename) + self.check_datatype(df, filename=filename, strict=strict) if len(self.unique_keys) > 0: df = self.check_unique(df, filename=filename) @@ -378,8 +426,7 @@ class XLSImporter(TableImporter): return self.read_xls(filename=filename, **kwargs) def read_xls(self, filename, **kwargs): - """ - converts an xls file into a Pandas DataFrame. + """Convert an xls file into a Pandas DataFrame. The converters of the XLSImporter object are used. 
diff --git a/unittests/data/datatypes.xlsx b/unittests/data/datatypes.xlsx new file mode 100644 index 0000000000000000000000000000000000000000..34fc4cf43092a68b630e0e04ebc43609b8a0b17b Binary files /dev/null and b/unittests/data/datatypes.xlsx differ diff --git a/unittests/test_table_importer.py b/unittests/test_table_importer.py index 9c8a379d8c12def32c04cf82c5e09c0f5f6f175c..4c7d044ef1de877cf4072034c96aca7113f75cc0 100644 --- a/unittests/test_table_importer.py +++ b/unittests/test_table_importer.py @@ -23,7 +23,6 @@ import unittest from functools import partial from tempfile import NamedTemporaryFile -import caosdb as db import numpy as np import pandas as pd import pytest @@ -211,6 +210,19 @@ class XLSImporterTest(TableImporterTest): self.assertRaises(DataInconsistencyError, importer.read_xls, tmp.name) + def test_datatypes(self): + """Test datatypes in columns.""" + importer = XLSImporter(converters={}, + obligatory_columns=["float_as_float"], + datatypes={ + "float_as_float": float, + "int_as_float": float, + "int_as_int": int, + } + ) + df = importer.read_xls(os.path.join(os.path.dirname(__file__), "data", "datatypes.xlsx")) + assert np.issubdtype(df.loc[0, "int_as_float"], float) + class CSVImporterTest(TableImporterTest): def test_full(self):