sav/spss converter

Merged: Daniel Hornung requested to merge f-sav-converter into dev
Files changed: 2
@@ -21,6 +21,7 @@
import argparse
from collections import OrderedDict
import numpy as np
import pandas as pd
import pyreadstat
import yaml
@@ -53,15 +54,58 @@ class SPSSConverter(converters.TableConverter):
        # The default dtype backend "numpy_nullable" does not handle dates well.
        # Note that pandas.ArrowDtype is considered experimental (in Pandas 2.2).
        df = pd.io.spss.read_spss(element.path, dtype_backend="pyarrow")
        dtypes = read_column_types(element.path)
        # Fix datetime columns: normalize missing values (NA/NaT) to None so
        # they can be handled downstream.  Assign the result back to the
        # DataFrame; in-place modification of a `df.loc[:, name]` slice is not
        # guaranteed to propagate.
        for name, dtype in dtypes.items():
            if dtype != "DATETIME":
                continue
            df[name] = df[name].fillna(np.nan).replace([np.nan], [None])
        return self._children_from_dataframe(df)
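As an aside, the normalization in the loop above can be sketched on a free-standing DataFrame. This is only an illustration: the column name and values are made up, and depending on the pandas version the replace step may emit a FutureWarning about downcasting.

import numpy as np
import pandas as pd

# Made-up example column; in the converter the data comes from
# pd.io.spss.read_spss(..., dtype_backend="pyarrow").
df = pd.DataFrame({"visit_date": pd.to_datetime(["2021-05-01", None])})
# Missing entries become None; the column is upcast to dtype "object".
df["visit_date"] = df["visit_date"].fillna(np.nan).replace([np.nan], [None])
print(df["visit_date"].tolist())  # [Timestamp('2021-05-01 00:00:00'), None]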
def read_column_types(savfile: Optional[str] = None, meta: Optional[Any] = None) -> dict[str, str]:
    """Read a SAV file and return the column types.

    Optionally, take metadata from a previous reading instead of opening the
    file again.

    Parameters
    ----------
    savfile : Optional[str]
        The SAV file to read.
    meta : Optional[Any]
        The metadata result from ``pyreadstat.read_sav(...)``.

    Returns
    -------
    out : dict[str, str]
        The column names and types.
    """
    if meta is None:
        _, meta = pyreadstat.read_sav(savfile, metadataonly=True)
    elif savfile is not None:
        raise ValueError("Only one of `savfile` and `meta` may be given.")
    dtypes: dict[str, str] = {}
    for name in meta.column_names:
        datatype = ORIGINAL_TYPES.get(meta.original_variable_types[name],
                                      READSTAT_TYPES[meta.readstat_variable_types[name]])
        dtypes[name] = datatype
    return dtypes
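For orientation, this is how read_column_types can be called; the file name and the example mapping in the comment are hypothetical.

# Hypothetical usage; "survey.sav" and the resulting mapping are invented.
dtypes = read_column_types("survey.sav")
# e.g. {"age": "DOUBLE", "name": "TEXT", "visit_date": "DATETIME"}

# Or reuse metadata from an earlier pyreadstat call:
_, meta = pyreadstat.read_sav("survey.sav", metadataonly=True)
dtypes = read_column_types(meta=meta)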
def spss_to_yaml(savfile: str, yamlfile: str, cfood: Optional[str] = None) -> None:
    """Parse the SAV file and create a basic datamodel in ``yamlfile``.
@@ -71,6 +115,7 @@ cfood: str
        If given, also create a cfood skeleton.
    """
    _, meta = pyreadstat.read_sav(savfile, metadataonly=True)
    dtypes = read_column_types(meta=meta)
    cfood_str = """
---
@@ -147,13 +192,10 @@ directory: # corresponds to the directory given to the crawler
    properties = OrderedDict()
    for name in meta.column_names:
        prop = {
            "datatype": dtypes[name],
        }
        desc = meta.column_names_to_labels.get(name)
        if desc and desc != name:
            prop["description"] = desc
        # Handle categorical variables
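To make the loop's result concrete, one generated entry might look as follows; the column name and label are invented for illustration.

# Hypothetical entry in `properties` for a column "age" labelled "Age in years":
# {
#     "datatype": "DOUBLE",
#     "description": "Age in years",
# }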
@@ -246,7 +288,7 @@ DummyRT:
    # Which type?
    if propdef["datatype"] == "DOUBLE":
        dtype = "FloatElement"
    elif propdef["datatype"] in ("TEXT", "DATETIME"):
        dtype = None
    else:
        reftype = propdef["datatype"]
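Finally, a minimal sketch of invoking the conversion entry point, assuming placeholder file names.

# Placeholder file names; writes datamodel.yml and a cfood skeleton.
spss_to_yaml("survey.sav", "datamodel.yml", cfood="cfood.yml")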