DOC: added docstrings

afc902e0 · Henrik tom Wörden · 0b4ddcd1 · afc902e0
Commit afc902e0 authored Jul 15, 2020 by Henrik tom Wörden
--- a/src/caosadvancedtools/table_importer.py
+++ b/src/caosadvancedtools/table_importer.py
@@ -16,6 +16,17 @@
 # You should have received a copy of the GNU Affero General Public License
 # along with this program. If not, see <https://www.gnu.org/licenses/>.
+"""
+This module allows reading table files like tsv and xls. They are converted to
+a Pandas DataFrame and checked for compliance with the rules provided.
+For example, a list of column names that have to exist can be provided.
+This module also implements some converters that can be applied to cell
+entries.
+Those converters can also be used to apply checks on the entries. TODO: Should
+this be separated?
+"""
 import logging
@@ -30,6 +41,9 @@ logger = logging.getLogger("caosadvancedtools")
 def name_converter(name):
+    """
+    checks whether a string can be interpreted as 'LastName, FirstName'
+    """
    name = str(name)
    if len(name.split(",")) != 2:
@@ -40,6 +54,12 @@ def name_converter(name):
 def yes_no_converter(val):
+    """
+    converts a string to True or False if possible.
+    Allowed field values are yes and no.
+    """
    if str(val).lower() == "yes":
        return True
    elif str(val).lower() == "no":
@@ -49,10 +69,14 @@ def yes_no_converter(val):
            "Field should be 'Yes' or 'No', but is '{}'.".format(val))
+class TSV_Importer(object):
+    def __init__(self, converters, obligatory_columns=[], unique_columns=[]):
+        raise NotImplementedError()
 class XLS_Importer(object):
    def __init__(self, converters, obligatory_columns=[], unique_columns=[]):
        """
        converters: dict with column names as keys and converter functions as
                    values
                    This dict also defines what columns are required to exist
@@ -70,6 +94,11 @@ class XLS_Importer(object):
        self.converters = converters
    def read_xls(self, filename):
+        """
+        converts an xls file into a Pandas DataFrame.
+        The converters of the XLS_Importer object are used.
+        """
        try:
            xls_file = pd.io.excel.ExcelFile(filename)
        except XLRDError as e:
@@ -82,10 +111,9 @@ class XLS_Importer(object):
        if len(xls_file.sheet_names) > 1:
            # Multiple sheets is the default now. Only show in debug
-            logger.debug("Excel file {} contains multiple sheets. "
+            logger.debug(
-                         "All but the first are being ignored.".format(
+                "Excel file {} contains multiple sheets. "
-                             filename
+                "All but the first are being ignored.".format(filename))
-                         ))
        try:
            df = xls_file.parse(converters=self.converters)
@@ -105,6 +133,11 @@ class XLS_Importer(object):
        return df
    def check_columns(self, df, filename=None):
+        """
+        checks whether all required columns, i.e. columns for which converters
+        were defined, exist.
+        """
        for col in self.required_columns:
            if col not in df.columns:
                errmsg = "Column '{}' missing in ".format(col)
@@ -118,6 +151,13 @@ class XLS_Importer(object):
                raise DataInconsistencyError(errmsg)
    def check_unique(self, df, filename=None):
+        """
+        Check whether value combinations that shall be unique for each row are
+        unique.
+        If a second row is found that uses the same combination of values as a
+        previous one, the second one is removed.
+        """
        df = df.copy()
        uniques = []
@@ -141,6 +181,11 @@ class XLS_Importer(object):
        return df
    def check_missing(self, df, filename=None):
+        """
+        Check in each row whether obligatory fields are empty or null.
+        Rows that have missing values are removed.
+        """
        df = df.copy()
        for index, row in df.iterrows():