Skip to content
GitLab
Explore
Sign in
Register
Primary navigation
Search or go to…
Project
CaosDB Crawler
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Iterations
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Locked files
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Package registry
Container registry
Model registry
Operate
Environments
Terraform modules
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Code review analytics
Issue analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
caosdb
Software
CaosDB Crawler
Commits
b0c1dc7a
Verified
Commit
b0c1dc7a
authored
11 months ago
by
Daniel Hornung
Browse files
Options
Downloads
Patches
Plain Diff
ENH: data model and cfood generator for csv files.
parent
0a244951
Branches
Branches containing commit
Tags
Tags containing commit
2 merge requests
!178
FIX: #96 Better error output for crawl.py script.
,
!171
sav/spss converter
Pipeline
#51749
passed with warnings
11 months ago
Stage: info
Stage: setup
Stage: cert
Stage: style
Stage: test
Changes
3
Pipelines
1
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
setup.cfg
+1
-0
1 addition, 0 deletions
setup.cfg
src/caoscrawler/scripts/__init__.py
+0
-0
0 additions, 0 deletions
src/caoscrawler/scripts/__init__.py
src/caoscrawler/scripts/generators.py
+221
-0
221 additions, 0 deletions
src/caoscrawler/scripts/generators.py
with
222 additions
and
0 deletions
setup.cfg
+
1
−
0
View file @
b0c1dc7a
...
...
@@ -41,6 +41,7 @@ per-file-ignores = __init__.py:F401
console_scripts
=
caosdb-crawler
=
caoscrawler.crawl:main
spss_to_datamodel
=
caoscrawler.conv_impl.spss:spss_to_datamodel_main
csv_to_datamodel
=
caoscrawler.scripts.generators:csv_to_datamodel_main
[options.extras_require]
h5_crawler
=
...
...
This diff is collapsed.
Click to expand it.
src/caoscrawler/scripts/__init__.py
0 → 100644
+
0
−
0
View file @
b0c1dc7a
This diff is collapsed.
Click to expand it.
src/caoscrawler/scripts/generators.py
0 → 100644
+
221
−
0
View file @
b0c1dc7a
# This file is a part of the LinkAhead Project.
#
# Copyright (C) 2024 IndiScale GmbH <info@indiscale.com>
# Copyright (C) 2024 Daniel Hornung <d.hornung@indiscale.com>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
"""
Scripts and functions to generate datamodel yaml files and cfood skeletons.
For example from actual data files.
"""
import
argparse
import
csv
from
collections
import
OrderedDict
from
string
import
Template
from
typing
import
Optional
import
pandas
as
pd
import
yaml
# Comment header of the generated data model file.  The ``[]{infile}`` placeholder is filled in
# via ``_CustomTemplate`` with the name of the parsed input file.
DM_TEMPLATE = (
    '# auto-generated data model from file "[]{infile}".\n'
    "# To insert a datamodel into LinkAhead, run:\n"
    "#\n"
    "# python3 -m caosadvancedtools.models.parser datamodel.yaml --sync\n"
)
# Skeleton of the RecordType section of the generated data model.
#
# The description is quoted because a plain yaml scalar must not contain ": "; unquoted, the
# generated file could not be parsed.
#
# The string ends with the indentation of the first property line: the property definitions
# are appended directly after it.
HEADER_RT = """
###############
# RecordTypes #
###############

DummyRT:
  description: "Note: Change name and enter description."
  recommended_properties:
""" + "    "
# Skeleton of the generated cfood.
#
# - ``${...}`` and ``$$`` belong to the cfood macro syntax and must end up verbatim in the
#   output.
# - Only ``[]{file}`` and ``[]{match}`` are placeholders of this module (see the custom
#   delimiter of ``_CustomTemplate``).
# - The template ends with the ``subtree: !macro`` line (10 spaces deep); the macro invocations
#   are appended below it with 12 and 14 spaces of indentation.
CFOOD_TEMPLATE = """
---
metadata:
  macros:
  - !defmacro
    # Simple column value -> property rule
    name: ColumnValue
    params:
      name: null
      belongsto: BaseElement
      type: TextElement
    definition:
      ${name}:
        type: ${type}
        match_name: ^${name}$$
        match_value: (?P<val>.*)
        records:
          ${belongsto}:
            ${name}: $$val
  - !defmacro
    # column value -> reference property
    name: ColumnValueReference
    params:
      name: null
      reftype: null # RecordType of the reference
      belongsto: BaseElement
      type: TextElement # References are always text, right?
    definition:
      ${name}:
        type: ${type}
        match_name: ^${name}$$
        match_value: (?P<val>.*)
        records:
          ${reftype}:
            name: $$val
          ${belongsto}:
            ${name}: $$${reftype}
  - !defmacro
    # Same as "ColumnValue", but also give name of property.
    name: ColumnValuePropname
    params:
      name: null
      propname: null
      belongsto: BaseElement
      type: TextElement
    definition:
      ${name}:
        type: ${type}
        match_name: ^${name}$$
        match_value: (?P<val>.*)
        records:
          ${belongsto}:
            ${propname}: $$val
---
directory: # corresponds to the directory given to the crawler
  type: Directory
  match: .* # we do not care how it is named here
  subtree:
    # This is the file
    thisfile:
      type: []{file}
      match: []{match}
      subtree:
        entry:
          type: Dict
          match: .* # Name is irrelevant
          records:
            MyParent:
          subtree: !macro
"""
class _CustomTemplate(Template):
    """``string.Template`` whose placeholders are introduced by ``[]`` instead of ``$``.

    The yaml templates of this module use "$" too much themselves, so a different delimiter is
    needed to mark the few places which shall actually be substituted, e.g. ``[]{infile}``.
    """

    delimiter = "[]"
def csv_to_datamodel(infile: str, outfile: str, cfood: Optional[str] = None):
    """Parse the input csv and create basic datamodel in ``outfile``.

    Delimiter, quote character and escape character are sniffed from the beginning of the input
    file.  Each column becomes one recommended property, its datatype (BOOLEAN, DOUBLE, INTEGER
    or TEXT) is derived from the dtype which pandas assigns to the column.

    Parameters
    ----------
    infile: str
      Name of the csv/tsv file to read.

    outfile: str
      Name of the yaml file into which the data model is written.

    cfood: str, optional
      If given, also create a cfood skeleton in the file with this name.
    """
    max_sniff = 50000
    with open(infile, encoding="utf-8") as f_infile:
        sample = f_infile.read(max_sniff)
    try:
        sniffed = csv.Sniffer().sniff(sample)
    except csv.Error:
        # The sniffer gives up when it finds no consistent delimiter, e.g. for files with a
        # single column.  Fall back to the standard csv conventions instead of failing.
        sniffed = csv.excel
    df = pd.read_table(infile, sep=sniffed.delimiter, quotechar=sniffed.quotechar,
                       escapechar=sniffed.escapechar)

    properties = OrderedDict()
    for colname in df.columns:
        col_dtype = df[colname].dtype
        if pd.api.types.is_bool_dtype(col_dtype):
            datatype = "BOOLEAN"
        elif pd.api.types.is_float_dtype(col_dtype):
            datatype = "DOUBLE"
        elif pd.api.types.is_integer_dtype(col_dtype):
            datatype = "INTEGER"
        else:
            datatype = "TEXT"
        properties[colname] = {"datatype": datatype}

    property_lines = yaml.dump(dict(properties),  # from OrderedDict to dict
                               allow_unicode=True,
                               sort_keys=False).splitlines(keepends=True)
    # HEADER_RT is followed directly by the first property line; all further lines get their
    # indentation from the join.
    result = (_CustomTemplate(DM_TEMPLATE).substitute({"infile": infile})
              + HEADER_RT
              + "    ".join(property_lines))
    with open(outfile, encoding="utf-8", mode="w") as myfile:
        myfile.write(result)

    #################
    # cfood section #
    #################
    if not cfood:
        return

    # Converter element types for the non-text datatypes.  TEXT needs no explicit type.
    element_types = {
        "BOOLEAN": "BooleanElement",
        "INTEGER": "IntegerElement",
        "DOUBLE": "FloatElement",
    }
    defs_col_value: list[str] = []
    defs_col_value_ref: list[str] = []
    prefix = " " * 14
    for name, propdef in properties.items():
        datatype = propdef["datatype"]
        # NOTE(review): column names are inserted verbatim; names containing yaml syntax
        # characters would presumably need quoting -- confirm.
        def_str = prefix + f"- name: {name}\n"
        if datatype == "TEXT":
            defs_col_value.append(def_str)
        elif datatype in element_types:
            def_str += prefix + f"  type: {element_types[datatype]}\n"
            defs_col_value.append(def_str)
        else:
            # Any other datatype is taken as the RecordType of a reference.  The type detection
            # above only yields the four primitive datatypes, so this is currently not used.
            def_str += prefix + f"  reftype: {datatype}\n"
            defs_col_value_ref.append(def_str)

    # The file name pattern must escape the dot (not the bracket), so that it matches names
    # ending in ".csv" or ".tsv".
    cfood_str = (_CustomTemplate(CFOOD_TEMPLATE).substitute({"file": "CSVTableConverter",
                                                             "match": r".*\.[ct]sv"})
                 + prefix[2:] + "ColumnValue:\n" + "".join(defs_col_value)
                 + prefix[2:] + "ColumnValueReference:\n" + "".join(defs_col_value_ref)
                 )
    with open(cfood, encoding="utf-8", mode="w") as myfile:
        myfile.write(cfood_str)
def _parse_args_csv():
    """Set up the command line interface of the csv tool and parse the arguments.

    Returns
    -------
    out: argparse.Namespace
      The parsed arguments, with the attributes ``infile``, ``outfile`` and ``cfood``.
    """
    # (option strings, keyword arguments) for each command line option, in help output order.
    argument_specs = (
        (("-i", "--input"),
         {"help": "The input file.", "required": True, "dest": "infile"}),
        (("-o", "--outfile"),
         {"help": "Yaml filename to save the result", "required": True}),
        (("--cfood",),
         {"help": "Yaml filename to create cfood output in", "required": False}),
    )
    cli = argparse.ArgumentParser(description="Create datamodel and cfood from CSV files.")
    for option_strings, options in argument_specs:
        cli.add_argument(*option_strings, **options)

    return cli.parse_args()
def csv_to_datamodel_main():
    """Entry point for csv data handling on the command line.

    Parses the command line arguments and passes them on to ``csv_to_datamodel``.
    """
    cli_args = _parse_args_csv()
    csv_to_datamodel(infile=cli_args.infile,
                     outfile=cli_args.outfile,
                     cfood=cli_args.cfood)
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment