sav/spss converter

Merged: Daniel Hornung requested to merge f-sav-converter into dev
Files changed: 2
@@ -21,6 +21,7 @@
import argparse
from collections import OrderedDict
import numpy as np
import pandas as pd
import pyreadstat
import yaml
@@ -53,15 +54,58 @@ class SPSSConverter(converters.TableConverter):
        # The default dtype backend "numpy_nullable" does not handle dates well.
        # Note that pandas.ArrowDtype is considered experimental (in Pandas 2.2).
        df = pd.io.spss.read_spss(element.path, dtype_backend="pyarrow")
        dtypes = read_column_types(element.path)
        # Fix datetime columns: normalize missing values (NA/NaT) to None so
        # they can be handled downstream.  Assign the result back to the
        # DataFrame; in-place modification of a `df.loc[:, name]` slice is not
        # guaranteed to propagate.
        for name, dtype in dtypes.items():
            if dtype != "DATETIME":
                continue
            df[name] = df[name].fillna(np.nan).replace([np.nan], [None])
        return self._children_from_dataframe(df)
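As an aside, the normalization in the loop above can be sketched on a free-standing DataFrame. This is only an illustration: the column name and values are made up, and depending on the pandas version the replace step may emit a FutureWarning about downcasting.

import numpy as np
import pandas as pd

# Made-up example column; in the converter the data comes from
# pd.io.spss.read_spss(..., dtype_backend="pyarrow").
df = pd.DataFrame({"visit_date": pd.to_datetime(["2021-05-01", None])})
# Missing entries become None; the column is upcast to dtype "object".
df["visit_date"] = df["visit_date"].fillna(np.nan).replace([np.nan], [None])
print(df["visit_date"].tolist())  # [Timestamp('2021-05-01 00:00:00'), None]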
def read_column_types(savfile: Optional[str] = None, meta: Optional[Any] = None) -> dict[str, str]:
    """Read a SAV file and return the column types.

    Optionally, take metadata from a previous reading instead of opening the
    file again.

    Parameters
    ----------
    savfile : Optional[str]
        The SAV file to read.
    meta : Optional[Any]
        The metadata result from ``pyreadstat.read_sav(...)``.

    Returns
    -------
    out : dict[str, str]
        The column names and types.
    """
    if meta is None:
        _, meta = pyreadstat.read_sav(savfile, metadataonly=True)
    elif savfile is not None:
        raise ValueError("Only one of `savfile` and `meta` may be given.")
    dtypes: dict[str, str] = {}
    for name in meta.column_names:
        datatype = ORIGINAL_TYPES.get(meta.original_variable_types[name],
                                      READSTAT_TYPES[meta.readstat_variable_types[name]])
        dtypes[name] = datatype
    return dtypes
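For orientation, this is how read_column_types can be called; the file name and the example mapping in the comment are hypothetical.

# Hypothetical usage; "survey.sav" and the resulting mapping are invented.
dtypes = read_column_types("survey.sav")
# e.g. {"age": "DOUBLE", "name": "TEXT", "visit_date": "DATETIME"}

# Or reuse metadata from an earlier pyreadstat call:
_, meta = pyreadstat.read_sav("survey.sav", metadataonly=True)
dtypes = read_column_types(meta=meta)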
def spss_to_yaml(savfile: str, yamlfile: str, cfood: Optional[str] = None) -> None:
    """Parse the SAV file and create a basic datamodel in ``yamlfile``.
@@ -71,6 +115,7 @@ cfood: str
        If given, also create a cfood skeleton.
    """
    _, meta = pyreadstat.read_sav(savfile, metadataonly=True)
    dtypes = read_column_types(meta=meta)
    cfood_str = """
---
@@ -147,13 +192,10 @@ directory: # corresponds to the directory given to the crawler
    properties = OrderedDict()
    for name in meta.column_names:
        prop = {
            "datatype": dtypes[name],
        }
        desc = meta.column_names_to_labels.get(name)
        if desc and desc != name:
            prop["description"] = desc
        # Handle categorical variables
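To make the loop's result concrete, one generated entry might look as follows; the column name and label are invented for illustration.

# Hypothetical entry in `properties` for a column "age" labelled "Age in years":
# {
#     "datatype": "DOUBLE",
#     "description": "Age in years",
# }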
@@ -246,7 +288,7 @@ DummyRT:
    # Which type?
    if propdef["datatype"] == "DOUBLE":
        dtype = "FloatElement"
    elif propdef["datatype"] in ("TEXT", "DATETIME"):
        dtype = None
    else:
        reftype = propdef["datatype"]
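Finally, a minimal sketch of invoking the conversion entry point, assuming placeholder file names.

# Placeholder file names; writes datamodel.yml and a cfood skeleton.
spss_to_yaml("survey.sav", "datamodel.yml", cfood="cfood.yml")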