diff --git a/src/caosadvancedtools/table_importer.py b/src/caosadvancedtools/table_importer.py index 3e0e8991caba512fefc7c3289d55c365775bc439..9e84d332f80f3488911dd9af92698fae554c7104 100755 --- a/src/caosadvancedtools/table_importer.py +++ b/src/caosadvancedtools/table_importer.py @@ -16,6 +16,17 @@ # You should have received a copy of the GNU Affero General Public License # along with this program. If not, see <https://www.gnu.org/licenses/>. +""" +This module allows reading table files like tsv and xls. They are converted to +a Pandas DataFrame and checked whether they comply with the rules provided. +For example, a list of column names that have to exist can be provided. + +This module also implements some converters that can be applied to cell +entries. + +Those converters can also be used to apply checks on the entries. TODO: Should +this be separated? +""" import logging @@ -30,6 +41,9 @@ logger = logging.getLogger("caosadvancedtools") def name_converter(name): + """ + checks whether a string can be interpreted as 'LastName, FirstName' + """ name = str(name) if len(name.split(",")) != 2: @@ -40,6 +54,12 @@ def name_converter(name): def yes_no_converter(val): + """ + converts a string to True or False if possible. + + Allowed field values are yes and no. 
+ """ + if str(val).lower() == "yes": return True elif str(val).lower() == "no": @@ -49,10 +69,14 @@ def yes_no_converter(val): "Field should be 'Yes' or 'No', but is '{}'.".format(val)) +class TSV_Importer(object): + def __init__(self, converters, obligatory_columns=[], unique_columns=[]): + raise NotImplementedError() + + class XLS_Importer(object): def __init__(self, converters, obligatory_columns=[], unique_columns=[]): """ - converters: dict with column names as keys and converter functions as values This dict also defines what columns are required to exist @@ -70,6 +94,11 @@ class XLS_Importer(object): self.converters = converters def read_xls(self, filename): + """ + converts an xls file into a Pandas DataFrame. + + The converters of the XLS_Importer object are used. + """ try: xls_file = pd.io.excel.ExcelFile(filename) except XLRDError as e: @@ -82,10 +111,9 @@ class XLS_Importer(object): if len(xls_file.sheet_names) > 1: # Multiple sheets is the default now. Only show in debug - logger.debug("Excel file {} contains multiple sheets. " - "All but the first are being ignored.".format( - filename - )) + logger.debug( + "Excel file {} contains multiple sheets. " + "All but the first are being ignored.".format(filename)) try: df = xls_file.parse(converters=self.converters) @@ -105,6 +133,11 @@ class XLS_Importer(object): return df def check_columns(self, df, filename=None): + """ + checks whether all required columns, i.e. columns for which converters + were defined exist. + """ + for col in self.required_columns: if col not in df.columns: errmsg = "Column '{}' missing in ".format(col) @@ -118,6 +151,13 @@ class XLS_Importer(object): raise DataInconsistencyError(errmsg) def check_unique(self, df, filename=None): + """ + Check whether value combinations that shall be unique for each row are + unique. + + If a second row is found, that uses the same combination of values as a + previous one, the second one is removed. 
+ """ df = df.copy() uniques = [] @@ -141,6 +181,11 @@ class XLS_Importer(object): return df def check_missing(self, df, filename=None): + """ + Check in each row whether obligatory fields are empty or null. + + Rows that have missing values are removed. + """ df = df.copy() for index, row in df.iterrows():