diff --git a/src/caoscrawler/conv_impl/spss.py b/src/caoscrawler/conv_impl/spss.py
index 619ed9ab210cfc6ea27b45d3160d5eb880a9b7eb..5bd980728b61cfc65818ffb833018ce036fc5a9d 100644
--- a/src/caoscrawler/conv_impl/spss.py
+++ b/src/caoscrawler/conv_impl/spss.py
@@ -21,6 +21,7 @@
 import argparse
 from collections import OrderedDict
 
+import numpy as np
 import pandas as pd
 import pyreadstat
 import yaml
@@ -53,15 +54,58 @@ class SPSSConverter(converters.TableConverter):
         # The default dtype backend "numpy_nullable" does not handle dates well.
         # Note that pandas.ArrowDtype is considered experimental (in Pandas 2.2).
         df = pd.io.spss.read_spss(element.path, dtype_backend="pyarrow")
+        dtypes = read_column_types(element.path)
 
         # if element.path.endswith(".sav"):
         #     sav_df, meta = pyreadstat.read_sav(element.path, metadataonly=True)
         # from IPython import embed
         # embed()
+        df.drop(range(15, len(df.index)), inplace=True)
+
+        # Fix datetime columns
+        for name, dtype in dtypes.items():
+            if dtype != "DATETIME":
+                continue
+            col = df.loc[:, name]
+            col.fillna(np.nan, inplace=True)
+            col.replace([np.nan], [None], inplace=True)
+
+        # from IPython import embed
+        # embed()
 
         return self._children_from_dataframe(df)
 
 
+def read_column_types(savfile: Optional[str] = None, meta: Optional = None) -> dict[str, str]:
+    """Read SAV file and return the column types.
+
+Optionally, take data from a previous reading.
+
+Parameters
+----------
+savfile : Optional[str]
+    The SAV file to read.
+
+meta : Optional
+    The meta data result from `pyreadstat.read_sav(...)`.
+
+Returns
+-------
+out : dict[str, str]
+    The column names and types.
+    """
+    if not meta:
+        _, meta = pyreadstat.read_sav(savfile, metadataonly=True)
+    elif savfile is not None:
+        raise ValueError("Only one of `savfile` and `meta` must be given.")
+    dtypes: dict[str, str] = {}
+    for name in meta.column_names:
+        datatype = ORIGINAL_TYPES.get(meta.original_variable_types[name],
+                                      READSTAT_TYPES[meta.readstat_variable_types[name]])
+        dtypes[name] = datatype
+    return dtypes
+
+
 def spss_to_yaml(savfile: str, yamlfile: str, cfood: Optional[str] = None) -> None:
     """Parse the *.sav and create basic datamodel in ``yamlfile``.
 
@@ -71,6 +115,7 @@ cfood: str
     If given, also create a cfood skeleton.
     """
     _, meta = pyreadstat.read_sav(savfile, metadataonly=True)
+    dtypes = read_column_types(meta=meta)
 
     cfood_str = """
 ---
@@ -147,13 +192,10 @@ directory: # corresponds to the directory given to the crawler
 
     properties = OrderedDict()
     for name in meta.column_names:
-        datatype = ORIGINAL_TYPES.get(meta.original_variable_types[name],
-                                      READSTAT_TYPES[meta.readstat_variable_types[name]])
         prop = {
-            "datatype": datatype,
+            "datatype": dtypes[name],
         }
         desc = meta.column_names_to_labels.get(name)
-
         if desc and desc != name:
             prop["description"] = desc
         # Handle categorial variables
@@ -246,7 +288,7 @@ DummyRT:
         # Which type?
         if propdef["datatype"] == "DOUBLE":
             dtype = "FloatElement"
-        elif propdef["datatype"] == "TEXT":
+        elif propdef["datatype"] in ("TEXT", "DATETIME"):
             dtype = None
         else:
             reftype = propdef["datatype"]
diff --git a/src/doc/converters.rst b/src/doc/converters.rst
index 88ccee77cff97dc91a4470ca1774d8f2181da048..ce172d0c0ce0411516a6ffa75538bb42d1bb104b 100644
--- a/src/doc/converters.rst
+++ b/src/doc/converters.rst
@@ -155,6 +155,9 @@ The following StructureElement types are typically created by the DictElement co
 - ListElement
 - DictElement
 
+Note that you may use ``TextElement`` for anything that exists in a text format that can be
+interpreted by the server, such as date and datetime strings in ISO-8601 format.
+
 Scalar Value Converters
 =======================
 `BooleanElementConverter`, `FloatElementConverter`, `TextElementConverter`, and