From db4e6348bf97956d2408ea1fd7988752b6029769 Mon Sep 17 00:00:00 2001 From: Daniel <d.hornung@indiscale.com> Date: Wed, 12 Jun 2024 16:24:26 +0200 Subject: [PATCH] ENH: Trying out the other backend. --- src/caoscrawler/conv_impl/spss.py | 4 +++- src/caoscrawler/converters.py | 2 +- unittests/test_spss_converter.py | 7 ++++--- 3 files changed, 8 insertions(+), 5 deletions(-) diff --git a/src/caoscrawler/conv_impl/spss.py b/src/caoscrawler/conv_impl/spss.py index 824d4f8e..2d102707 100644 --- a/src/caoscrawler/conv_impl/spss.py +++ b/src/caoscrawler/conv_impl/spss.py @@ -47,7 +47,9 @@ class SPSSConverter(converters.TableConverter): def create_children(self, values: GeneralStore, element: StructureElement) -> list: assert isinstance(element, File) - df = pd.io.spss.read_spss(element.path) + # The default dtype backend "numpy_nullable" does not handle dates well. + # Note that pandas.ArrowDtype is considered experimental (in Pandas 2.2). + df = pd.io.spss.read_spss(element.path, dtype_backend="pyarrow") # if element.path.endswith(".sav"): # sav_df, meta = pyreadstat.read_sav(element.path, metadataonly=True) diff --git a/src/caoscrawler/converters.py b/src/caoscrawler/converters.py index 89457327..dad33402 100644 --- a/src/caoscrawler/converters.py +++ b/src/caoscrawler/converters.py @@ -188,7 +188,7 @@ out: tuple if "value" not in value: # TODO: how do we handle this case? Just ignore? # or disallow? - raise NotImplementedError() + raise NotImplementedError(f"This definition has no \"value\": {value}") propvalue = value["value"] # can be "single", "list" or "multiproperty" collection_mode = value["collection_mode"] diff --git a/unittests/test_spss_converter.py b/unittests/test_spss_converter.py index c86fb17e..7ffc18db 100644 --- a/unittests/test_spss_converter.py +++ b/unittests/test_spss_converter.py @@ -62,7 +62,7 @@ def test_spss_converter(converter_registry): spss_dir = UNITTESTDIR / "test_tables" / "spss" for sav_file, length, thistype in [ (File("sample.sav", spss_dir / "sample.sav"), 5, str), - (File("sample.sav", spss_dir / "sample_large.sav"), 485, float), + (File("sample.sav", spss_dir / "sample_large.sav"), 485, int), ]: m = converter.match(sav_file) assert m is not None @@ -70,6 +70,7 @@ def test_spss_converter(converter_registry): children = converter.create_children(None, sav_file) assert len(children) == length + for ii, child in enumerate(children): assert child.__class__ == DictElement assert child.name == str(ii) @@ -78,5 +79,5 @@ def test_spss_converter(converter_registry): assert isinstance(my_dict["mydate"], datetime.date) or np.isnan(my_dict["mydate"]) assert isinstance(my_dict["dtime"], datetime.datetime) or np.isnan(my_dict["dtime"]) assert isinstance(my_dict["mytime"], datetime.time) or np.isnan(my_dict["mytime"]) - assert isinstance(my_dict["mylabl"], thistype) - assert isinstance(my_dict["myord"], thistype) + assert isinstance(my_dict["mylabl"], thistype), f"{type(my_dict['mylabl'])}" + assert isinstance(my_dict["myord"], thistype), f"{type(my_dict['myord'])}" -- GitLab