From db4e6348bf97956d2408ea1fd7988752b6029769 Mon Sep 17 00:00:00 2001
From: Daniel <d.hornung@indiscale.com>
Date: Wed, 12 Jun 2024 16:24:26 +0200
Subject: [PATCH] ENH: Trying out the other backend.

---
 src/caoscrawler/conv_impl/spss.py | 4 +++-
 src/caoscrawler/converters.py     | 2 +-
 unittests/test_spss_converter.py  | 7 ++++---
 3 files changed, 8 insertions(+), 5 deletions(-)

diff --git a/src/caoscrawler/conv_impl/spss.py b/src/caoscrawler/conv_impl/spss.py
index 824d4f8e..2d102707 100644
--- a/src/caoscrawler/conv_impl/spss.py
+++ b/src/caoscrawler/conv_impl/spss.py
@@ -47,7 +47,9 @@ class SPSSConverter(converters.TableConverter):
 
     def create_children(self, values: GeneralStore, element: StructureElement) -> list:
         assert isinstance(element, File)
-        df = pd.io.spss.read_spss(element.path)
+        # Use pyarrow: the default dtype backend "numpy_nullable" does not handle dates well.
+        # Note that pandas.ArrowDtype is still considered experimental (as of pandas 2.2).
+        df = pd.io.spss.read_spss(element.path, dtype_backend="pyarrow")
 
         # if element.path.endswith(".sav"):
         #     sav_df, meta = pyreadstat.read_sav(element.path, metadataonly=True)
diff --git a/src/caoscrawler/converters.py b/src/caoscrawler/converters.py
index 89457327..dad33402 100644
--- a/src/caoscrawler/converters.py
+++ b/src/caoscrawler/converters.py
@@ -188,7 +188,7 @@ out: tuple
         if "value" not in value:
             # TODO: how do we handle this case? Just ignore?
             #       or disallow?
-            raise NotImplementedError()
+            raise NotImplementedError(f"This definition has no \"value\": {value}")
         propvalue = value["value"]
         # can be "single", "list" or "multiproperty"
         collection_mode = value["collection_mode"]
diff --git a/unittests/test_spss_converter.py b/unittests/test_spss_converter.py
index c86fb17e..7ffc18db 100644
--- a/unittests/test_spss_converter.py
+++ b/unittests/test_spss_converter.py
@@ -62,7 +62,7 @@ def test_spss_converter(converter_registry):
     spss_dir = UNITTESTDIR / "test_tables" / "spss"
     for sav_file, length, thistype in [
             (File("sample.sav", spss_dir / "sample.sav"), 5, str),
-            (File("sample.sav", spss_dir / "sample_large.sav"), 485, float),
+            (File("sample.sav", spss_dir / "sample_large.sav"), 485, int),
     ]:
         m = converter.match(sav_file)
         assert m is not None
@@ -70,6 +70,7 @@ def test_spss_converter(converter_registry):
 
         children = converter.create_children(None, sav_file)
         assert len(children) == length
+
         for ii, child in enumerate(children):
             assert child.__class__ == DictElement
             assert child.name == str(ii)
@@ -78,5 +79,5 @@ def test_spss_converter(converter_registry):
             assert isinstance(my_dict["mydate"], datetime.date) or np.isnan(my_dict["mydate"])
             assert isinstance(my_dict["dtime"], datetime.datetime) or np.isnan(my_dict["dtime"])
             assert isinstance(my_dict["mytime"], datetime.time) or np.isnan(my_dict["mytime"])
-            assert isinstance(my_dict["mylabl"], thistype)
-            assert isinstance(my_dict["myord"], thistype)
+            assert isinstance(my_dict["mylabl"], thistype), f"{type(my_dict['mylabl'])}"
+            assert isinstance(my_dict["myord"], thistype), f"{type(my_dict['myord'])}"
-- 
GitLab