Skip to content
GitLab
Explore
Sign in
Register
Primary navigation
Search or go to…
Project
C
caosdb-advanced-user-tools
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Iterations
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Locked files
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Package Registry
Container Registry
Model registry
Operate
Environments
Terraform modules
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Code review analytics
Issue analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
caosdb
Software
caosdb-advanced-user-tools
Commits
afc902e0
Commit
afc902e0
authored
4 years ago
by
Henrik tom Wörden
Browse files
Options
Downloads
Patches
Plain Diff
DOC: added docstrings
parent
0b4ddcd1
No related branches found
Branches containing commit
No related tags found
Tags containing commit
1 merge request
!22
Release 0.3
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
src/caosadvancedtools/table_importer.py
+50
-5
50 additions, 5 deletions
src/caosadvancedtools/table_importer.py
with
50 additions
and
5 deletions
src/caosadvancedtools/table_importer.py
+
50
−
5
View file @
afc902e0
...
...
@@ -16,6 +16,17 @@
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
"""
This module allows reading table files like tsv and xls. They are converted to
a Pandas DataFrame and checked for compliance with the rules provided.
For example, a list of column names that have to exist can be provided.
This module also implements some converters that can be applied to cell
entries.
Those converters can also be used to apply checks on the entries. TODO: Should
this be separated?
"""
import
logging
...
...
@@ -30,6 +41,9 @@ logger = logging.getLogger("caosadvancedtools")
def
name_converter
(
name
):
"""
checks whether a string can be interpreted as
'
LastName, FirstName
'
"""
name
=
str
(
name
)
if
len
(
name
.
split
(
"
,
"
))
!=
2
:
...
...
@@ -40,6 +54,12 @@ def name_converter(name):
def
yes_no_converter
(
val
):
"""
converts a string to True or False if possible.
Allowed field values are yes and no.
"""
if
str
(
val
).
lower
()
==
"
yes
"
:
return
True
elif
str
(
val
).
lower
()
==
"
no
"
:
...
...
@@ -49,10 +69,14 @@ def yes_no_converter(val):
"
Field should be
'
Yes
'
or
'
No
'
, but is
'
{}
'
.
"
.
format
(
val
))
class
TSV_Importer
(
object
):
def
__init__
(
self
,
converters
,
obligatory_columns
=
[],
unique_columns
=
[]):
raise
NotImplementedError
()
class
XLS_Importer
(
object
):
def
__init__
(
self
,
converters
,
obligatory_columns
=
[],
unique_columns
=
[]):
"""
converters: dict with column names as keys and converter functions as
values
This dict also defines what columns are required to exist
...
...
@@ -70,6 +94,11 @@ class XLS_Importer(object):
self
.
converters
=
converters
def
read_xls
(
self
,
filename
):
"""
converts an xls file into a Pandas DataFrame.
The converters of the XLS_Importer object are used.
"""
try
:
xls_file
=
pd
.
io
.
excel
.
ExcelFile
(
filename
)
except
XLRDError
as
e
:
...
...
@@ -82,10 +111,9 @@ class XLS_Importer(object):
if
len
(
xls_file
.
sheet_names
)
>
1
:
# Multiple sheets is the default now. Only show in debug
logger
.
debug
(
"
Excel file {} contains multiple sheets.
"
"
All but the first are being ignored.
"
.
format
(
filename
))
logger
.
debug
(
"
Excel file {} contains multiple sheets.
"
"
All but the first are being ignored.
"
.
format
(
filename
))
try
:
df
=
xls_file
.
parse
(
converters
=
self
.
converters
)
...
...
@@ -105,6 +133,11 @@ class XLS_Importer(object):
return
df
def
check_columns
(
self
,
df
,
filename
=
None
):
"""
checks whether all required columns, i.e. columns for which converters
were defined, exist.
"""
for
col
in
self
.
required_columns
:
if
col
not
in
df
.
columns
:
errmsg
=
"
Column
'
{}
'
missing in
"
.
format
(
col
)
...
...
@@ -118,6 +151,13 @@ class XLS_Importer(object):
raise
DataInconsistencyError
(
errmsg
)
def
check_unique
(
self
,
df
,
filename
=
None
):
"""
Check whether value combinations that shall be unique for each row are
unique.
If a second row is found that uses the same combination of values as a
previous one, the second one is removed.
"""
df
=
df
.
copy
()
uniques
=
[]
...
...
@@ -141,6 +181,11 @@ class XLS_Importer(object):
return
df
def
check_missing
(
self
,
df
,
filename
=
None
):
"""
Check in each row whether obligatory fields are empty or null.
Rows that have missing values are removed.
"""
df
=
df
.
copy
()
for
index
,
row
in
df
.
iterrows
():
...
...
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment