Skip to content
Snippets Groups Projects
Verified Commit a4d62851 authored by Daniel Hornung's avatar Daniel Hornung
Browse files

ENH: Added spss_to_datamodel script.

parent eb52eb19
No related branches found
No related tags found
2 merge requests: !178 "FIX: #96 Better error output for crawl.py script." and !171 "sav/spss converter"
Pipeline #51032 failed
...@@ -10,6 +10,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ...@@ -10,6 +10,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
### Added ### ### Added ###
* Support for Python 3.12 and experimental support for 3.13 * Support for Python 3.12 and experimental support for 3.13
* `spss_to_datamodel` script.
### Changed ### ### Changed ###
......
...@@ -40,6 +40,7 @@ per-file-ignores = __init__.py:F401 ...@@ -40,6 +40,7 @@ per-file-ignores = __init__.py:F401
[options.entry_points] [options.entry_points]
console_scripts = console_scripts =
caosdb-crawler = caoscrawler.crawl:main caosdb-crawler = caoscrawler.crawl:main
spss_to_datamodel = caoscrawler.conv_impl.spss:spss_to_datamodel_main
[options.extras_require] [options.extras_require]
h5_crawler = h5_crawler =
......
...@@ -18,14 +18,23 @@ ...@@ -18,14 +18,23 @@
"""Converter for SAV files (stored by SPSS).""" """Converter for SAV files (stored by SPSS)."""
import argparse
import pandas as pd import pandas as pd
import pyreadstat import pyreadstat
import yaml
from .. import converters from .. import converters
from ..stores import GeneralStore from ..stores import GeneralStore
from ..structure_elements import (File, StructureElement) from ..structure_elements import (File, StructureElement)
# Datatype names used in the generated data model, keyed by the variable type names that
# are looked up in ``meta.readstat_variable_types``.
TYPES = {"double": "DOUBLE", "string": "TEXT"}
class SPSSConverter(converters.TableConverter): class SPSSConverter(converters.TableConverter):
"""Converter for SAV files (stored by SPSS).""" """Converter for SAV files (stored by SPSS)."""
...@@ -44,3 +53,94 @@ class SPSSConverter(converters.TableConverter): ...@@ -44,3 +53,94 @@ class SPSSConverter(converters.TableConverter):
# embed() # embed()
return self._children_from_dataframe(df) return self._children_from_dataframe(df)
def spss_to_yaml(savfile: str, yamlfile: str) -> None:
    """Parse the *.sav file and write a basic data model to ``yamlfile``.

    Only the metadata of ``savfile`` is read.  Every column becomes a property of the
    placeholder RecordType ``DummyRT``.  Columns which have a value-label set (categorical
    variables) get an enum RecordType as datatype instead of a plain type from ``TYPES``.

    Parameters
    ----------
    savfile : str
        Path of the SPSS ``*.sav`` file to read.
    yamlfile : str
        Path of the YAML file to write.  It is opened in ``"w"`` mode, so existing
        content is replaced.

    Raises
    ------
    KeyError
        If a column has a variable type which is not a key of ``TYPES``.
    """
    _, meta = pyreadstat.read_sav(savfile, metadataonly=True)
    properties, enums = _collect_datamodel(meta)
    # Render completely before opening the file, so a failure does not truncate ``yamlfile``.
    output = _render_datamodel(savfile, properties, enums)

    with open(yamlfile, encoding="utf-8", mode="w") as myfile:
        myfile.write(output)


def _collect_datamodel(meta):
    """Return ``(properties, enums)`` built from the pyreadstat metadata ``meta``.

    ``properties`` maps column names to dicts with a ``datatype`` and, if the column label
    differs from the column name, a ``description``.  ``enums`` maps enum names to the
    list of their value labels.
    """
    enums: dict[str, list[str]] = {}
    properties = {}

    for name in meta.column_names:
        vartype = meta.readstat_variable_types[name]
        try:
            datatype = TYPES[vartype]
        except KeyError as err:
            raise KeyError(
                f"Unsupported variable type '{vartype}' of column '{name}'") from err
        prop = {"datatype": datatype}

        desc = meta.column_names_to_labels.get(name)
        if desc and desc != name:
            prop["description"] = desc

        # Handle categorical variables
        if var_label := meta.variable_to_label.get(name):
            vvl = meta.variable_value_labels[name]
            # reproducible (and sensible) order
            label_values = [vvl[key] for key in sorted(vvl)]
            # Identical value lists share one enum.  The property must then refer to the
            # enum which is actually emitted, not to its own (unregistered) label-set name.
            enum_name = next(
                (known for known, known_values in enums.items()
                 if known_values == label_values),
                None)
            if enum_name is None:
                enum_name = var_label
                enums[enum_name] = label_values
            prop["datatype"] = enum_name

        properties[name] = prop

    return properties, enums


def _render_datamodel(savfile, properties, enums):
    """Return the text of the data model file for ``properties`` and ``enums``."""
    parts = [
        f'# auto-generated data model from file "{savfile}".\n'
        "# To insert a datamodel into LinkAhead, run:\n"
        "#\n"
        "# python3 -m caosadvancedtools.models.parser datamodel.yaml --sync\n"
        "#\n"
        "# Code for creating enum records:\n"
        "#\n"
    ]

    # Commented-out Python snippet per enum; the loop body is indented so that the
    # snippet is runnable once the leading "# " is removed.
    for name, values in enums.items():
        parts.append(
            "# cont = db.Container()\n"
            f"# for value in {values!r}:\n"
            f'#     rec = db.Record(name=value).add_parent(name="{name}")\n'
            "#     cont.append(rec)\n"
            "# cont.insert()\n"
        )
    parts.append("#\n")

    # Actual datamodel
    parts.append(
        "\n"
        "#########\n"
        "# Enums #\n"
        "#########\n"
    )
    for name, values in enums.items():
        parts.append(
            f"{name}:\n"
            "  description:\n"
            f"  # possible values: {values}\n"
        )

    parts.append(
        "\n"
        "###############\n"
        "# RecordTypes #\n"
        "###############\n"
        "DummyRT:\n"
        # Quoted: a plain YAML scalar must not contain ": ".
        '  description: "Note: Change name and enter description."\n'
        "  recommended_properties:\n"
    )
    dumped = yaml.dump(properties, allow_unicode=True, sort_keys=False)
    # Nest the dumped properties below ``recommended_properties``.
    parts.extend(f"    {line}" for line in dumped.splitlines(keepends=True))

    return "".join(parts)
def _parse_arguments(argv=None):
    """Parse the command line arguments.

    Parameters
    ----------
    argv : list of str, optional
        The arguments to parse.  With the default ``None``, argparse reads the arguments
        of the running process (``sys.argv[1:]``).

    Returns
    -------
    argparse.Namespace
        Namespace with the attributes ``input`` (the *.sav file) and ``outfile`` (the
        YAML file name for the result).  Both options are required.
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument('-i', '--input', help="The *.sav file.", required=True)
    parser.add_argument('-o', '--outfile', help="Yaml filename to save the result", required=True)

    return parser.parse_args(argv)
def spss_to_datamodel_main():
    """Entry point of the ``spss_to_datamodel`` console script.

    Takes the input and output file names from the command line, writes the data model
    and prints the name of the written file.
    """
    cli = _parse_arguments()
    target = cli.outfile
    spss_to_yaml(savfile=cli.input, yamlfile=target)
    print(f"Written datamodel to: {target}")
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment