diff --git a/src/caoscrawler/conv_impl/spss.py b/src/caoscrawler/conv_impl/spss.py index 824d4f8eb27f5adf63d14cc045e53f788cae88f9..2d1027075bffc1245ff9fe12bcb1f73e4c378948 100644 --- a/src/caoscrawler/conv_impl/spss.py +++ b/src/caoscrawler/conv_impl/spss.py @@ -47,7 +47,9 @@ class SPSSConverter(converters.TableConverter): def create_children(self, values: GeneralStore, element: StructureElement) -> list: assert isinstance(element, File) - df = pd.io.spss.read_spss(element.path) + # The default dtype backend "numpy_nullable" does not handle dates well. + # Note that pandas.ArrowDtype is considered experimental (in Pandas 2.2). + df = pd.io.spss.read_spss(element.path, dtype_backend="pyarrow") # if element.path.endswith(".sav"): # sav_df, meta = pyreadstat.read_sav(element.path, metadataonly=True) diff --git a/src/caoscrawler/converters.py b/src/caoscrawler/converters.py index 8945732776d51da5c924f19e938a68007c668704..dad3340224fcc85a60929e8cca4f92c319e50aad 100644 --- a/src/caoscrawler/converters.py +++ b/src/caoscrawler/converters.py @@ -188,7 +188,7 @@ out: tuple if "value" not in value: # TODO: how do we handle this case? Just ignore? # or disallow? - raise NotImplementedError() + raise NotImplementedError(f"This definition has no \"value\": {value}") propvalue = value["value"] # can be "single", "list" or "multiproperty" collection_mode = value["collection_mode"] diff --git a/unittests/test_spss_converter.py b/unittests/test_spss_converter.py index c86fb17e7814e4ccebf7b2978e7d91f289e90395..7ffc18dba43a6f7cd3c9fbc9273da349b4ec3c6e 100644 --- a/unittests/test_spss_converter.py +++ b/unittests/test_spss_converter.py @@ -62,7 +62,7 @@ def test_spss_converter(converter_registry): spss_dir = UNITTESTDIR / "test_tables" / "spss" for sav_file, length, thistype in [ (File("sample.sav", spss_dir / "sample.sav"), 5, str), - (File("sample.sav", spss_dir / "sample_large.sav"), 485, float), + (File("sample.sav", spss_dir / "sample_large.sav"), 485, int), ]: m = converter.match(sav_file) assert m is not None @@ -70,6 +70,7 @@ def test_spss_converter(converter_registry): children = converter.create_children(None, sav_file) assert len(children) == length + for ii, child in enumerate(children): assert child.__class__ == DictElement assert child.name == str(ii) @@ -78,5 +79,5 @@ def test_spss_converter(converter_registry): assert isinstance(my_dict["mydate"], datetime.date) or np.isnan(my_dict["mydate"]) assert isinstance(my_dict["dtime"], datetime.datetime) or np.isnan(my_dict["dtime"]) assert isinstance(my_dict["mytime"], datetime.time) or np.isnan(my_dict["mytime"]) - assert isinstance(my_dict["mylabl"], thistype) - assert isinstance(my_dict["myord"], thistype) + assert isinstance(my_dict["mylabl"], thistype), f"{type(my_dict['mylabl'])}" + assert isinstance(my_dict["myord"], thistype), f"{type(my_dict['myord'])}"