Skip to content
Snippets Groups Projects
Commit afc902e0 authored by Henrik tom Wörden's avatar Henrik tom Wörden
Browse files

DOC: added docstrings

parent 0b4ddcd1
No related branches found
No related tags found
1 merge request!22Release 0.3
......@@ -16,6 +16,17 @@
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
"""
This module reads table files such as tsv and xls. They are converted to
a Pandas DataFrame and checked for compliance with the rules provided.
For example, a list of column names that have to exist can be provided.
This module also implements some converters that can be applied to cell
entries.
Those converters can also be used to apply checks on the entries. TODO: Should
this be separated?
"""
import logging
......@@ -30,6 +41,9 @@ logger = logging.getLogger("caosadvancedtools")
def name_converter(name):
"""
checks whether a string can be interpreted as 'LastName, FirstName'
"""
name = str(name)
if len(name.split(",")) != 2:
......@@ -40,6 +54,12 @@ def name_converter(name):
def yes_no_converter(val):
"""
converts a string to True or False if possible.
Allowed field values are yes and no.
"""
if str(val).lower() == "yes":
return True
elif str(val).lower() == "no":
......@@ -49,10 +69,14 @@ def yes_no_converter(val):
"Field should be 'Yes' or 'No', but is '{}'.".format(val))
class TSV_Importer(object):
    """
    Placeholder for a TSV table importer.

    Not implemented yet: creating an instance always raises
    NotImplementedError.
    """

    def __init__(self, converters, obligatory_columns=None,
                 unique_columns=None):
        """
        converters: accepted for signature compatibility, currently unused
        obligatory_columns: currently unused; defaults to None rather than a
                            mutable list shared between calls
        unique_columns: currently unused; defaults to None rather than a
                        mutable list shared between calls

        Raises NotImplementedError unconditionally.
        """
        raise NotImplementedError()
class XLS_Importer(object):
def __init__(self, converters, obligatory_columns=[], unique_columns=[]):
"""
converters: dict with column names as keys and converter functions as
values
This dict also defines what columns are required to exist
......@@ -70,6 +94,11 @@ class XLS_Importer(object):
self.converters = converters
def read_xls(self, filename):
"""
converts an xls file into a Pandas DataFrame.
The converters of the XLS_Importer object are used.
"""
try:
xls_file = pd.io.excel.ExcelFile(filename)
except XLRDError as e:
......@@ -82,10 +111,9 @@ class XLS_Importer(object):
if len(xls_file.sheet_names) > 1:
# Multiple sheets is the default now. Only show in debug
logger.debug("Excel file {} contains multiple sheets. "
"All but the first are being ignored.".format(
filename
))
logger.debug(
"Excel file {} contains multiple sheets. "
"All but the first are being ignored.".format(filename))
try:
df = xls_file.parse(converters=self.converters)
......@@ -105,6 +133,11 @@ class XLS_Importer(object):
return df
def check_columns(self, df, filename=None):
"""
checks whether all required columns, i.e. columns for which converters
were defined, exist.
"""
for col in self.required_columns:
if col not in df.columns:
errmsg = "Column '{}' missing in ".format(col)
......@@ -118,6 +151,13 @@ class XLS_Importer(object):
raise DataInconsistencyError(errmsg)
def check_unique(self, df, filename=None):
"""
Check whether value combinations that shall be unique for each row are
unique.
If a second row is found that uses the same combination of values as a
previous one, the second one is removed.
"""
df = df.copy()
uniques = []
......@@ -141,6 +181,11 @@ class XLS_Importer(object):
return df
def check_missing(self, df, filename=None):
"""
Check in each row whether obligatory fields are empty or null.
Rows that have missing values are removed.
"""
df = df.copy()
for index, row in df.iterrows():
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment