diff --git a/CHANGELOG.md b/CHANGELOG.md index 3763e6b9cac9e154af51b9958facaf0d4f58d8a1..cc8e6c0190855d135614a7d1e9bfb73a34cee77b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -27,6 +27,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Fixed ### - Rendering of entities in static HTML sites created by the crawler. +- [#63](https://gitlab.com/linkahead/linkahead-advanced-user-tools/-/issues/63) + The `TableImporter` now reports the row and column in which each + `TypeError` occurred. ### Security ### diff --git a/src/caosadvancedtools/table_importer.py b/src/caosadvancedtools/table_importer.py index 5efd0500a4c5a797a27a92caf0cd2a49165fddd2..cd1b206f7ebbe7730692a3a6a7137e4aa467a5eb 100755 --- a/src/caosadvancedtools/table_importer.py +++ b/src/caosadvancedtools/table_importer.py @@ -31,7 +31,7 @@ import logging import pathlib from datetime import datetime -import caosdb as db +import linkahead as db import numpy as np import pandas as pd from xlrd import XLRDError @@ -537,6 +537,46 @@ class CSVImporter(TableImporter): extra={'identifier': str(filename), 'category': "inconsistency"}) raise DataInconsistencyError(*ve.args) + except TypeError as te: + # Iterate through the columns and rows to identify + # problematic cells with wrong types. + df = pd.read_csv(filename, sep=sep, + converters=applicable_converters, dtype=None, + **kwargs) + error_dict = {} + columns_with_errors = [] + for key, dtype in self.datatypes.items(): + if key not in df.columns: + continue + try: + df[key].astype(dtype) + except (TypeError, ValueError): + columns_with_errors.append(key) + if not columns_with_errors: + # We may have run into any other TypeError not caused + # by wrong datatypes within the table. + raise te + for ii, row in df.iterrows(): + for name in columns_with_errors: + try: + # we need to check with astype to provoke the + # same errors, but that only works on + # Dataframes, so cast value to list to + # DataFrame. 
+ pd.DataFrame([row[name]]).astype(self.datatypes[name]) + except (TypeError, ValueError): + if ii not in error_dict: + error_dict[ii] = [] + error_dict[ii].append( + (name, str(self.datatypes[name]).strip("<>"), str(type(row[name])).strip("<>")) + ) + msg = "Elements with wrong datatypes encountered:\n" + for ii, error_list in error_dict.items(): + msg += f"* row {ii}:\n" + for err in error_list: + msg += f" * column \"{err[0]}\": Expected \"{err[1]}\" but found \"{err[2]}\".\n" + msg += '\n' + raise DataInconsistencyError(msg) df = self.check_dataframe(df, filename) diff --git a/unittests/test_table_importer.py b/unittests/test_table_importer.py index 6d445056b240e5ede6c52cb055cdde86cfb6d3d7..0abc28bba17dfbcf8f0ce59a15e51ace68db9167 100644 --- a/unittests/test_table_importer.py +++ b/unittests/test_table_importer.py @@ -379,6 +379,50 @@ class CSVImporterTest(TableImporterTest): assert df["int_with_gaps"].dtype == "Int64" assert df["float"].dtype == float + def test_wrong_datatype_type_errors(self): + """Test for + https://gitlab.com/linkahead/linkahead-advanced-user-tools/-/issues/63: + Highlight rows and columns in which type errors occur. + + """ + tmpfile = NamedTemporaryFile(delete=False, suffix=".csv") + with open(tmpfile.name, 'w') as tmp: + # Wrong types in row 2, columns 1 and 2, and row 4, column 2. + tmp.write( + "int,float\n" + "1,2.3\n" + "4.5,word\n" + "0,1.2\n" + "-12,12+3j\n" + ) + kwargs = { + "datatypes": { + "int": int, + "float": float, + "not-in-table": str # An unused datatype definition must not cause problems. 
+ }, + "obligatory_columns": ["int"], + "converters": {} + } + importer = CSVImporter(**kwargs) + with pytest.raises(DataInconsistencyError) as die: + df = importer.read_file(tmpfile.name) + msg = str(die.value) + print("\n" + msg) + assert "Elements with wrong datatypes encountered:\n" in msg + # Errors in rows 1 and 3, no errors in 2 and 4 + assert "* row 1:\n" in msg + assert "* row 2:\n" not in msg + assert "* row 3:\n" in msg + assert "* row 4:\n" not in msg + row_1_msgs, row_3_msgs = msg.split("* row 1:\n")[1].split("* row 3:\n") + # exactly 2 errors in row 1, exactly 1 in row 3 + assert len(row_1_msgs.strip().split('\n')) == 2 + assert len(row_3_msgs.strip().split('\n')) == 1 + assert " * column \"int\"" in row_1_msgs + assert " * column \"float\"" in row_1_msgs + assert " * column \"float\"" in row_3_msgs + class TSVImporterTest(TableImporterTest): def test_full(self):