Skip to content
Snippets Groups Projects
Verified Commit a4d62851 authored by Daniel Hornung's avatar Daniel Hornung
Browse files

ENH: Added spss_to_datamodel script.

parent eb52eb19
No related branches found
No related tags found
2 merge requests: !178 "FIX: #96 Better error output for crawl.py script." and !171 "sav/spss converter"
Pipeline #51032 failed
......@@ -10,6 +10,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
### Added ###
* Support for Python 3.12 and experimental support for 3.13
* `spss_to_datamodel` script.
### Changed ###
......
......@@ -40,6 +40,7 @@ per-file-ignores = __init__.py:F401
[options.entry_points]
console_scripts =
caosdb-crawler = caoscrawler.crawl:main
spss_to_datamodel = caoscrawler.conv_impl.spss:spss_to_datamodel_main
[options.extras_require]
h5_crawler =
......
......@@ -18,14 +18,23 @@
"""Converter for SAV files (stored by SPSS)."""
import argparse
import pandas as pd
import pyreadstat
import yaml
from .. import converters
from ..stores import GeneralStore
from ..structure_elements import (File, StructureElement)
# Maps the readstat variable type of a column to the datatype name written into the datamodel.
TYPES = {"double": "DOUBLE", "string": "TEXT"}
class SPSSConverter(converters.TableConverter):
"""Converter for SAV files (stored by SPSS)."""
......@@ -44,3 +53,94 @@ class SPSSConverter(converters.TableConverter):
# embed()
return self._children_from_dataframe(df)
def _collect_properties(meta) -> tuple[dict[str, dict[str, str]], dict[str, list[str]]]:
    """Derive the property definitions and enum value lists from pyreadstat metadata."""
    enums: dict[str, list[str]] = {}
    properties: dict[str, dict[str, str]] = {}

    for name in meta.column_names:
        prop = {"datatype": TYPES[meta.readstat_variable_types[name]]}
        desc = meta.column_names_to_labels.get(name)
        if desc and desc != name:
            prop["description"] = desc
        # Handle categorical variables
        if var_label := meta.variable_to_label.get(name):
            vvl = meta.variable_value_labels[name]
            # reproducible (and sensible) order
            label_values = [vvl[key] for key in sorted(vvl)]
            # Value lists are stored only once.  If an enum with the same values exists
            # already, refer to it, so that the datatype never names an enum which is
            # missing from the generated datamodel.
            enum_name = next((known for known, known_values in enums.items()
                              if known_values == label_values), None)
            if enum_name is None:
                enum_name = var_label
                enums[enum_name] = label_values
            prop["datatype"] = enum_name
        properties[name] = prop

    return properties, enums


def _render_datamodel(savfile: str, properties: dict, enums: dict[str, list[str]]) -> str:
    """Return the datamodel as YAML text, preceded by explanatory header comments."""
    parts = [
        f'# auto-generated data model from file "{savfile}".\n',
        "# To insert a datamodel into LinkAhead, run:\n",
        "#\n",
        "# python3 -m caosadvancedtools.models.parser datamodel.yaml --sync\n",
        "#\n",
        "# Code for creating enum records:\n",
        "#\n",
    ]
    # Commented-out Python snippet per enum which creates one Record for each value.
    for name, values in enums.items():
        parts += [
            "# cont = db.Container()\n",
            f"# for value in {values!r}:\n",
            f'#     rec = db.Record(name=value).add_parent(name="{name}")\n',
            "#     cont.append(rec)\n",
            "# cont.insert()\n",
        ]
    parts.append("#\n")

    # Actual datamodel
    parts.append("\n#########\n# Enums #\n#########\n")
    for name, values in enums.items():
        parts.append(f"{name}:\n  description:\n  # possible values: {values}\n")

    parts.append(
        "\n"
        "###############\n"
        "# RecordTypes #\n"
        "###############\n"
        "DummyRT:\n"
        # The value is quoted because a plain YAML scalar must not contain ": ".
        '  description: "Note: Change name and enter description."\n'
        "  recommended_properties:\n"
    )
    # Indent the dumped properties so that they are nested below ``recommended_properties``.
    dumped = yaml.dump(properties, allow_unicode=True, sort_keys=False)
    parts.extend(f"    {line}" for line in dumped.splitlines(keepends=True))

    return "".join(parts)


def spss_to_yaml(savfile: str, yamlfile: str) -> None:
    """Parse the *.sav file and write a basic datamodel to ``yamlfile``.

    Only the metadata of ``savfile`` is read.  Each column becomes a property below the
    placeholder RecordType ``DummyRT``.  Columns which have a value label set (categorical
    variables) use an enum RecordType as datatype instead of a primitive type; value label
    sets with identical values share one enum.

    Parameters
    ----------
    savfile : str
        The SPSS file to read.
    yamlfile : str
        The file to which the datamodel is written.  It is opened in write mode with UTF-8
        encoding.

    Raises
    ------
    KeyError
        If the readstat variable type of a column has no entry in ``TYPES``.
    """
    _, meta = pyreadstat.read_sav(savfile, metadataonly=True)
    properties, enums = _collect_properties(meta)
    output = _render_datamodel(savfile, properties, enums)

    with open(yamlfile, encoding="utf-8", mode="w") as myfile:
        myfile.write(output)
def _parse_arguments():
    """Read the command line options.

    Both options are mandatory.  Returns the namespace created by ``argparse``, with the
    attributes ``input`` and ``outfile``.
    """
    cli = argparse.ArgumentParser(description=__doc__)
    # (flags, help text) of all options
    options = (
        (('-i', '--input'), "The *.sav file."),
        (('-o', '--outfile'), "Yaml filename to save the result"),
    )
    for flags, help_text in options:
        cli.add_argument(*flags, help=help_text, required=True)
    return cli.parse_args()
def spss_to_datamodel_main():
    """Script entry point: convert the given *.sav file and report the written file."""
    options = _parse_arguments()
    target = options.outfile
    spss_to_yaml(savfile=options.input, yamlfile=target)
    print(f"Written datamodel to: {target}")
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment