From 67764f97bc13c46f9bf24afc43435eeee85b3e0c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Henrik=20tom=20W=C3=B6rden?= <h.tomwoerden@indiscale.com>
Date: Mon, 13 May 2024 10:45:34 +0200
Subject: [PATCH] ENH: add sav converter

---
 .docker/Dockerfile                          |   2 +-
 .gitlab-ci.yml                              |   4 +-
 setup.cfg                                   |   5 +-
 src/caoscrawler/__init__.py                 |   5 +
 src/caoscrawler/conv_impl/__init__.py       |   0
 src/caoscrawler/conv_impl/sav.py            |  37 ++++
 src/caoscrawler/converters.py               | 132 ++++++------
 src/caoscrawler/version.py                  |   8 +-
 src/doc/converters.rst                      |   2 +
 unittests/test_converters.py                |   2 +-
 unittests/test_sav_converter.py             |  82 ++++++++
 unittests/test_tables/spss/CITATION.cff     |  11 +
 unittests/test_tables/spss/LICENSE          | 210 ++++++++++++++++++++
 unittests/test_tables/spss/sample.sav       | Bin 0 -> 1651 bytes
 unittests/test_tables/spss/sample_large.sav | Bin 0 -> 27895 bytes
 15 files changed, 426 insertions(+), 74 deletions(-)
 create mode 100644 src/caoscrawler/conv_impl/__init__.py
 create mode 100644 src/caoscrawler/conv_impl/sav.py
 create mode 100644 unittests/test_sav_converter.py
 create mode 100644 unittests/test_tables/spss/CITATION.cff
 create mode 100644 unittests/test_tables/spss/LICENSE
 create mode 100644 unittests/test_tables/spss/sample.sav
 create mode 100644 unittests/test_tables/spss/sample_large.sav

diff --git a/.docker/Dockerfile b/.docker/Dockerfile
index 539ac0d4..a4d9ce69 100644
--- a/.docker/Dockerfile
+++ b/.docker/Dockerfile
@@ -30,7 +30,7 @@ RUN rm -r /git/.git
 # Install pycaosdb.ini for the tests
 RUN mv /git/.docker/tester_pycaosdb.ini /git/integrationtests/pycaosdb.ini
 
-RUN cd /git/ && pip3 install .
+RUN cd /git/ && pip3 install .[h5_crawler,spss]
 
 WORKDIR /git/integrationtests
 # wait for server,
diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 9ec1727d..ff313673 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -130,7 +130,7 @@ unittest_py3.8:
     # TODO: Use f-branch logic here
     - pip install git+https://gitlab.indiscale.com/caosdb/src/caosdb-pylib.git@dev
     - pip install git+https://gitlab.indiscale.com/caosdb/src/caosdb-advanced-user-tools.git@dev
-    - pip install .[h5-crawler]
+    - pip install .[h5_crawler,spss]
     # actual test
     - caosdb-crawler --help
     - pytest --cov=caosdb -vv ./unittests
@@ -166,7 +166,7 @@ unittest_py3.13:
     # TODO: Use f-branch logic here
     - pip install git+https://gitlab.indiscale.com/caosdb/src/caosdb-pylib.git@dev
     - pip install git+https://gitlab.indiscale.com/caosdb/src/caosdb-advanced-user-tools.git@dev
-    - pip install .[h5-crawler]
+    - pip install .[h5_crawler,spss]
     # actual test
     - caosdb-crawler --help
     - pytest --cov=caosdb -vv ./unittests
diff --git a/setup.cfg b/setup.cfg
index 1b4a9185..4a9c8265 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -25,6 +25,7 @@ install_requires =
     odfpy #make optional
     packaging
     pandas
+    pyarrow  # Will be required by Pandas >= 3.0.
     pyyaml
     yaml-header-tools >= 0.2.1
 
@@ -41,6 +42,8 @@ console_scripts =
   caosdb-crawler = caoscrawler.crawl:main
 
 [options.extras_require]
-h5-crawler =
+h5_crawler =
            h5py >= 3.8
            numpy
+spss =
+     pandas[spss]
diff --git a/src/caoscrawler/__init__.py b/src/caoscrawler/__init__.py
index 05bad0b5..9c5e3743 100644
--- a/src/caoscrawler/__init__.py
+++ b/src/caoscrawler/__init__.py
@@ -1,4 +1,9 @@
+from . import converters
+from .conv_impl.sav import SAVConverter
 from .crawl import Crawler, SecurityMode
 from .version import CfoodRequiredVersionError, get_caoscrawler_version
 
 __version__ = get_caoscrawler_version()
+
+# Convenience members #########################################################
+converters.SAVConverter = SAVConverter
diff --git a/src/caoscrawler/conv_impl/__init__.py b/src/caoscrawler/conv_impl/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/src/caoscrawler/conv_impl/sav.py b/src/caoscrawler/conv_impl/sav.py
new file mode 100644
index 00000000..8308719b
--- /dev/null
+++ b/src/caoscrawler/conv_impl/sav.py
@@ -0,0 +1,37 @@
+# This file is a part of the LinkAhead Project.
+#
+# Copyright (C) 2024 IndiScale GmbH <info@indiscale.com>
+# Copyright (C) 2024 Daniel Hornung <d.hornung@indiscale.com>
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <https://www.gnu.org/licenses/>.
+
+"""Converter for SAV files (stored by SPSS)."""
+
+# import pyreadstat  # Maybe use this if we need more metadata
+import pandas as pd
+
+from .. import converters
+from ..stores import GeneralStore, RecordStore
+from ..structure_elements import (BooleanElement, DictElement, Directory, File,
+                                  FloatElement, IntegerElement, JSONFile,
+                                  ListElement, NoneElement, StructureElement,
+                                  TextElement)
+
+
+class SAVConverter(converters.TableConverter):
+    """Converter for SAV files (stored by SPSS)."""
+
+    def create_children(self, values: GeneralStore, element: StructureElement):
+        df = pd.io.spss.read_spss(element.path)
+        return self._children_from_dataframe(df)
diff --git a/src/caoscrawler/converters.py b/src/caoscrawler/converters.py
index e0ca0f9b..296bfeed 100644
--- a/src/caoscrawler/converters.py
+++ b/src/caoscrawler/converters.py
@@ -1,11 +1,11 @@
-#!/usr/bin/env python3
 # encoding: utf-8
 #
-# ** header v3.0
-# This file is a part of the CaosDB Project.
+# This file is a part of the LinkAhead Project.
 #
+# Copyright (C) 2024 Indiscale GmbH <info@indiscale.com>
 # Copyright (C) 2021 Henrik tom Wörden
-#               2021 Alexander Schlemmer
+# Copyright (C) 2021 Alexander Schlemmer
+# Copyright (C) 2024 Daniel Hornung <d.hornung@indiscale.com>
 #
 # This program is free software: you can redistribute it and/or modify
 # it under the terms of the GNU Affero General Public License as
@@ -19,9 +19,8 @@
 #
 # You should have received a copy of the GNU Affero General Public License
 # along with this program. If not, see <https://www.gnu.org/licenses/>.
-#
-# ** end header
-#
+
+"""Converters take structure elements and create Records and new structure elements from them."""
 
 from __future__ import annotations
 
@@ -34,7 +33,7 @@ import warnings
 from abc import ABCMeta, abstractmethod
 from inspect import signature
 from string import Template
-from typing import Any, List, Optional, Tuple, Union
+from typing import Any, Callable, Optional, Union
 
 import linkahead as db
 import pandas as pd
@@ -134,8 +133,8 @@ def replace_variables(propvalue: Any, values: GeneralStore):
     This function replaces variables in property values (and possibly other locations,
     where the crawler can replace cfood-internal variables).
 
-    If `propvalue` is a single variable name preceeded with a '$' (e.g. '$var' or '${var}'), then
-    the corresponding value stored in `values` is returned.
+    If ``propvalue`` is a single variable name preceded by a ``$`` (e.g. ``$var`` or ``${var}``),
+    then the corresponding value stored in ``values`` is returned.
     In any other case the variable substitution is carried out as defined by string templates
     and a new string with the replaced variables is returned.
     """
@@ -160,16 +159,16 @@ def handle_value(value: Union[dict, str, list], values: GeneralStore):
     add as an additional property (multiproperty).
 
     Variable names (starting with a "$") are replaced by the corresponding value stored in the
-    `values` GeneralStore.
+    ``values`` GeneralStore.
 
 Parameters
 ----------
 
-value:
-  - if str, the value to be interpreted. E.g. "4", "hallo" or "$a" etc.
-  - if dict, must have keys "value" and "collection_mode". The returned tuple is directly
+value: Union[dict, str, list]
+  - If *str*, the value to be interpreted. E.g. "4", "hello" or "$a" etc.
+  - If *dict*, must have keys ``value`` and ``collection_mode``. The returned tuple is directly
     created from the corresponding values.
-  - if list, each element is checked for replacement and the resulting list will be used
+  - If *list*, each element is checked for replacement and the resulting list will be used
     as (list) value for the property
 
 Returns
@@ -181,7 +180,7 @@ out: tuple
     """
     # @review Florian Spreckelsen 2022-05-13
 
-    if type(value) == dict:
+    if isinstance(value, dict):
         if "value" not in value:
             # TODO: how do we handle this case? Just ignore?
             #       or disallow?
@@ -189,7 +188,7 @@ out: tuple
         propvalue = value["value"]
         # can be "single", "list" or "multiproperty"
         collection_mode = value["collection_mode"]
-    elif type(value) == str:
+    elif isinstance(value, str):
         propvalue = value
         collection_mode = "single"
         if propvalue.startswith("+"):
@@ -198,7 +197,7 @@ out: tuple
         elif propvalue.startswith("*"):
             collection_mode = "multiproperty"
             propvalue = propvalue[1:]
-    elif type(value) == list:
+    elif isinstance(value, list):
         # TODO: (for review)
         #       This is a bit dirty right now and needed for
         #       being able to directly set list values. Semantics is, however, a bit
@@ -209,7 +208,7 @@ out: tuple
         propvalue = list()
         for element in value:
             # Do the element-wise replacement only, when its type is string:
-            if type(element) == str:
+            if isinstance(element, str):
                 propvalue.append(replace_variables(element, values))
             else:
                 propvalue.append(element)
@@ -286,9 +285,7 @@ def create_records(values: GeneralStore, records: RecordStore, def_records: dict
                     propvalue = os.path.normpath(propvalue)
                 setattr(c_record, key.lower(), propvalue)
             else:
-
                 if c_record.get_property(key) is None:
-
                     if collection_mode == "list":
                         c_record.add_property(name=key, value=[propvalue])
                     elif (collection_mode == "multiproperty" or
@@ -322,10 +319,13 @@ class Converter(object, metaclass=ABCMeta):
 
         Parameters
         ----------
-        definition: dict, Please refer to ``src/doc/converters.rst`` to learn about the structure
-                    that the definition dict must have.
-        converter_registry: dict, A dictionary that contains converter names as keys and dicts as
-                            values. Those value dicts have the keys 'converter' and 'package'.
+        definition: dict
+          Please refer to ``src/doc/converters.rst`` to learn about the structure that the
+          definition dict must have.
+        converter_registry: dict
+          A dictionary that contains converter names as keys and dicts as values. Those value dicts
+          have the keys 'converter', 'package' and 'class'.  'converter' is the class name,
+          'package' the module name and 'class' the converter class object itself.
         """
 
         self.definition = definition
@@ -363,7 +363,7 @@ class Converter(object, metaclass=ABCMeta):
 
     @staticmethod
     def converter_factory(definition: dict, name: str, converter_registry: dict):
-        """creates a Converter instance of the appropriate class.
+        """Create a Converter instance of the appropriate class.
 
         The `type` key in the `definition` defines the Converter class which is being used.
         """
@@ -424,7 +424,7 @@ class Converter(object, metaclass=ABCMeta):
                 pass
         """
 
-        if not "transform" in self.definition:
+        if "transform" not in self.definition:
             return
         for transformer_key, transformer in self.definition["transform"].items():
             in_value = replace_variables(transformer["in"], values)
@@ -460,8 +460,7 @@ class Converter(object, metaclass=ABCMeta):
             values[match.group('varname')] = out_value
 
     @abstractmethod
-    def create_children(self, values: GeneralStore,
-                        element: StructureElement):
+    def create_children(self, values: GeneralStore, element: StructureElement):
         pass
 
     def create_records(self, values: GeneralStore, records: RecordStore,
@@ -477,7 +476,7 @@ class Converter(object, metaclass=ABCMeta):
                               self.definition["records"])
 
     def filter_children(self, children_with_strings:
-                        List[Tuple[StructureElement, str]], expr: str,
+                        list[tuple[StructureElement, str]], expr: str,
                         group: str, rule: str):
         """Filter children according to regexp `expr` and `rule`."""
 
@@ -515,8 +514,8 @@ class Converter(object, metaclass=ABCMeta):
                                  result: Optional[dict]):
         """ Template for the debugging output for the match function """
         msg = "\n--------" + name + "-----------\n"
-        for re, ma in zip(regexp, matched):
-            msg += "matching reg:\t" + re + "\n"
+        for exp, ma in zip(regexp, matched):
+            msg += "matching reg:\t" + exp + "\n"
             msg += "matching val:\t" + ma + "\n"
             msg += "---------\n"
         if result is None:
@@ -620,7 +619,7 @@ class DirectoryConverter(Converter):
 
         element: A directory (of type Directory) which will be traversed.
         """
-        children: List[StructureElement] = []
+        children: list[StructureElement] = []
 
         for name in sorted(os.listdir(element.path)):
             path = os.path.join(element.path, name)
@@ -660,7 +659,7 @@ class SimpleFileConverter(Converter):
 class FileConverter(SimpleFileConverter):
     def __init__(self, *args, **kwargs):
         warnings.warn(DeprecationWarning(
-            "This class is depricated. Please use SimpleFileConverter."))
+            "This class is deprecated. Please use SimpleFileConverter."))
         super().__init__(*args, **kwargs)
 
 
@@ -693,12 +692,12 @@ class MarkdownFileConverter(SimpleFileConverter):
                 "Error during the validation (yaml header cannot be read) of the markdown file "
                 "located at the following node in the data structure:\n"
                 "{}\nError:\n{}".format(path, err))
-        children: List[StructureElement] = []
+        children: list[StructureElement] = []
 
         for name, entry in header.items():
-            if type(entry) == list:
+            if isinstance(entry, list):
                 children.append(ListElement(name, entry))
-            elif type(entry) == str:
+            elif isinstance(entry, str):
                 children.append(TextElement(name, entry))
             else:
                 if generalStore is not None and self.name in generalStore:
@@ -814,14 +813,14 @@ class DictElementConverter(Converter):
 class DictConverter(DictElementConverter):
     def __init__(self, *args, **kwargs):
         warnings.warn(DeprecationWarning(
-            "This class is depricated. Please use DictConverter."))
+            "This class is deprecated. Please use DictElementConverter."))
         super().__init__(*args, **kwargs)
 
 
 class DictDictElementConverter(DictElementConverter):
     def __init__(self, *args, **kwargs):
         warnings.warn(DeprecationWarning(
-            "This class is depricated. Please use DictElementConverter."))
+            "This class is deprecated. Please use DictElementConverter."))
         super().__init__(*args, **kwargs)
 
 
@@ -886,7 +885,7 @@ out:
     """
     if "match_name" in definition:
         if "match" in definition:
-            raise RuntimeError(f"Do not supply both, 'match_name' and 'match'.")
+            raise RuntimeError("Do not supply both, 'match_name' and 'match'.")
 
         m1 = re.match(definition["match_name"], name)
         if m1 is None:
@@ -1009,7 +1008,7 @@ class BooleanElementConverter(_AbstractScalarValueElementConverter):
 class DictBooleanElementConverter(BooleanElementConverter):
     def __init__(self, *args, **kwargs):
         warnings.warn(DeprecationWarning(
-            "This class is depricated. Please use BooleanElementConverter."))
+            "This class is deprecated. Please use BooleanElementConverter."))
         super().__init__(*args, **kwargs)
 
 
@@ -1025,7 +1024,7 @@ class FloatElementConverter(_AbstractScalarValueElementConverter):
 class DictFloatElementConverter(FloatElementConverter):
     def __init__(self, *args, **kwargs):
         warnings.warn(DeprecationWarning(
-            "This class is depricated. Please use FloatElementConverter."))
+            "This class is deprecated. Please use FloatElementConverter."))
         super().__init__(*args, **kwargs)
 
 
@@ -1040,7 +1039,7 @@ class TextElementConverter(_AbstractScalarValueElementConverter):
     def __init__(self, definition, *args, **kwargs):
         if "match" in definition:
             raise ValueError("""
-The 'match' key will in future be used to match a potential name of a TextElement. Please use
+The 'match' key is used to match a potential name of a TextElement. Please use
 the 'match_value' key to match the value of the TextElement and 'match_name' for matching the name.
 """)
 
@@ -1050,7 +1049,7 @@ the 'match_value' key to match the value of the TextElement and 'match_name' for
 class DictTextElementConverter(TextElementConverter):
     def __init__(self, *args, **kwargs):
         warnings.warn(DeprecationWarning(
-            "This class is depricated. Please use TextElementConverter."))
+            "This class is deprecated. Please use TextElementConverter."))
         super().__init__(*args, **kwargs)
 
 
@@ -1066,7 +1065,7 @@ class IntegerElementConverter(_AbstractScalarValueElementConverter):
 class DictIntegerElementConverter(IntegerElementConverter):
     def __init__(self, *args, **kwargs):
         warnings.warn(DeprecationWarning(
-            "This class is depricated. Please use IntegerElementConverter."))
+            "This class is deprecated. Please use IntegerElementConverter."))
         super().__init__(*args, **kwargs)
 
 
@@ -1108,7 +1107,7 @@ class ListElementConverter(Converter):
 class DictListElementConverter(ListElementConverter):
     def __init__(self, *args, **kwargs):
         warnings.warn(DeprecationWarning(
-            "This class is depricated. Please use ListElementConverter."))
+            "This class is deprecated. Please use ListElementConverter."))
         super().__init__(*args, **kwargs)
 
 
@@ -1122,15 +1121,22 @@ class TableConverter(Converter):
     The rows can be matched using a DictElementConverter.
 
     """
-    @abstractmethod
-    def get_options(self):
-        """
-        This method needs to be overwritten by the specific table converter to provide
-        information about the possible options.
+
+    def get_options(self) -> dict:
+        """Get specific options, e.g. from ``self.definition``.
+
+This method may be overwritten by the specific table converter to provide information about the
+possible options.  Implementors may use ``TableConverter._get_options(...)`` to get (and convert)
+options from ``self.definition``.
+
+Returns
+-------
+out: dict
+  An options dict.
         """
-        pass
+        return {}
 
-    def _get_options(self, possible_options):
+    def _get_options(self, possible_options: list[tuple[str, Callable]]) -> dict:
         option_dict = dict()
         for opt_name, opt_conversion in possible_options:
             if opt_name in self.definition:
@@ -1158,6 +1164,14 @@ class TableConverter(Converter):
             return None
         return m.groupdict()
 
+    @staticmethod
+    def _children_from_dataframe(dataframe: pd.DataFrame):
+        child_elements = list()
+        for index, row in dataframe.iterrows():
+            child_elements.append(
+                DictElement(str(index), row.to_dict()))
+        return child_elements
+
 
 class XLSXTableConverter(TableConverter):
     """
@@ -1187,11 +1201,7 @@ class XLSXTableConverter(TableConverter):
         if not isinstance(element, File):
             raise RuntimeError("Element must be a File.")
         table = pd.read_excel(element.path, **self.get_options())
-        child_elements = list()
-        for index, row in table.iterrows():
-            child_elements.append(
-                DictElement(str(index), row.to_dict()))
-        return child_elements
+        return self._children_from_dataframe(table)
 
 
 class CSVTableConverter(TableConverter):
@@ -1216,11 +1226,7 @@ class CSVTableConverter(TableConverter):
         if not isinstance(element, File):
             raise RuntimeError("Element must be a File.")
         table = pd.read_csv(element.path, **self.get_options())
-        child_elements = list()
-        for index, row in table.iterrows():
-            child_elements.append(
-                DictElement(str(index), row.to_dict()))
-        return child_elements
+        return self._children_from_dataframe(table)
 
 
 class DateElementConverter(TextElementConverter):
diff --git a/src/caoscrawler/version.py b/src/caoscrawler/version.py
index fdc83234..0b72dd65 100644
--- a/src/caoscrawler/version.py
+++ b/src/caoscrawler/version.py
@@ -17,11 +17,7 @@
 # You should have received a copy of the GNU Affero General Public License
 # along with this program. If not, see <https://www.gnu.org/licenses/>.
 #
-try:
-    from importlib import metadata as importlib_metadata
-except ImportError:  # Python<3.8 dowesn"t support this so use
-    import importlib_metadata
-
+from importlib import metadata as importlib_metadata
 from packaging.version import parse as parse_version
 from warnings import warn
 
@@ -43,7 +39,7 @@ def check_cfood_version(metadata: dict):
     if not metadata or "crawler-version" not in metadata:
 
         msg = """
-No crawler version specified in cfood definition, so there is now guarantee that
+No crawler version specified in cfood definition, so there is no guarantee that
 the cfood definition matches the installed crawler version.
 
 Specifying a version is highly recommended to ensure that the definition works
diff --git a/src/doc/converters.rst b/src/doc/converters.rst
index 9b28c9a6..637c6635 100644
--- a/src/doc/converters.rst
+++ b/src/doc/converters.rst
@@ -55,6 +55,8 @@ element. If the structure element matches (this is a combination of a typecheck
 match, see :py:class:`~caoscrawler.converters.Converter` for details) the converter is used
 to generate records (see :py:meth:`~caoscrawler.converters.Converter.create_records`) and to possibly process a subtree, as defined by the function :func:`caoscrawler.converters.create_children`.
 
+**match** *TODO*
+
 **records** is a dict of definitions that define the semantic structure
 (see details below).
 
diff --git a/unittests/test_converters.py b/unittests/test_converters.py
index 52ece13d..0e90fab3 100644
--- a/unittests/test_converters.py
+++ b/unittests/test_converters.py
@@ -140,7 +140,7 @@ def test_markdown_converter(converter_registry):
     converter = MarkdownFileConverter({"match": "(.*)"}, "TestMarkdownFileConverter",
                                       converter_registry)
 
-    with pytest.raises(ConverterValidationError) as err:
+    with pytest.raises(ConverterValidationError):
         converter.create_children(None, File("test_tool.py", UNITTESTDIR / "test_crawler.py"))
 
     m = converter.match(test_readme)
diff --git a/unittests/test_sav_converter.py b/unittests/test_sav_converter.py
new file mode 100644
index 00000000..3cc72038
--- /dev/null
+++ b/unittests/test_sav_converter.py
@@ -0,0 +1,82 @@
+# This file is a part of the LinkAhead Project.
+#
+# Copyright (C) 2024 IndiScale GmbH <info@indiscale.com>
+# Copyright (C) 2024 Daniel Hornung <d.hornung@indiscale.com>
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <https://www.gnu.org/licenses/>.
+
+"""Testing converter for SAV files."""
+
+import datetime
+import importlib
+import re
+from pathlib import Path
+
+import numpy as np
+import pytest
+
+from caoscrawler.converters import (
+    ConverterValidationError,
+    SAVConverter,
+)
+from caoscrawler.structure_elements import (BooleanElement, DictElement,
+                                            Directory, File, FloatElement,
+                                            IntegerElement, ListElement,
+                                            TextElement)
+
+UNITTESTDIR = Path(__file__).parent
+
+
+@pytest.fixture
+def converter_registry():
+    converter_registry: dict[str, dict[str, str]] = {
+        "Directory": {
+            "converter": "DirectoryConverter",
+            "package": "caoscrawler.converters"},
+    }
+
+    for key, value in converter_registry.items():
+        module = importlib.import_module(value["package"])
+        value["class"] = getattr(module, value["converter"])
+    return converter_registry
+
+
+def test_sav_converter(converter_registry):
+    converter = SAVConverter({
+        "match": ("sample.sav")
+    },
+        "ThisConverterNameIsIrrelevant", converter_registry
+    )
+
+    spss_dir = UNITTESTDIR / "test_tables" / "spss"
+    for sav_file, length, thistype in [
+            (File("sample.sav", spss_dir / "sample.sav"), 5, str),
+            (File("sample.sav", spss_dir / "sample_large.sav"), 485, float),
+    ]:
+        m = converter.match(sav_file)
+        assert m is not None
+        assert len(m) == 0
+
+        children = converter.create_children(None, sav_file)
+        assert len(children) == length
+        for ii, child in enumerate(children):
+            assert child.__class__ == DictElement
+            assert child.name == str(ii)
+            my_dict = child.value
+            assert isinstance(my_dict["mychar"], str)
+            assert isinstance(my_dict["mydate"], datetime.date) or np.isnan(my_dict["mydate"])
+            assert isinstance(my_dict["dtime"], datetime.datetime) or np.isnan(my_dict["dtime"])
+            assert isinstance(my_dict["mytime"], datetime.time) or np.isnan(my_dict["mytime"])
+            assert isinstance(my_dict["mylabl"], thistype)
+            assert isinstance(my_dict["myord"], thistype)
diff --git a/unittests/test_tables/spss/CITATION.cff b/unittests/test_tables/spss/CITATION.cff
new file mode 100644
index 00000000..140fcc07
--- /dev/null
+++ b/unittests/test_tables/spss/CITATION.cff
@@ -0,0 +1,11 @@
+cff-version: 1.2.0
+message: "If you use this software, please cite it as below."
+authors:
+- family-names: "Fajardo"
+  given-names: "Otto"
+  orcid: "https://orcid.org/0000-0002-3363-9287"
+title: "Pyreadstat"
+version: 1.2.7
+doi: 10.5281/zenodo.6612282
+date-released: 2018-09-24
+url: "https://github.com/Roche/pyreadstat"
diff --git a/unittests/test_tables/spss/LICENSE b/unittests/test_tables/spss/LICENSE
new file mode 100644
index 00000000..a2f94b1a
--- /dev/null
+++ b/unittests/test_tables/spss/LICENSE
@@ -0,0 +1,210 @@
+Test data files were copied from [pyreadstat](https://github.com/Roche/pyreadstat); they are
+licensed under the Apache License, cited below.
+
+Copyright (C) 2018-2024 Otto Fajardo
+Copyright (C) 2024 Indiscale GmbH <info@indiscale.com>
+Copyright (C) 2024 Daniel Hornung <d.hornung@indiscale.com>
+
+pyreadstat license:
+---------------------------------------------------------------------------
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+
+   END OF TERMS AND CONDITIONS
+
+   APPENDIX: How to apply the Apache License to your work.
+
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+
+   Copyright [yyyy] [name of copyright owner]
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
\ No newline at end of file
diff --git a/unittests/test_tables/spss/sample.sav b/unittests/test_tables/spss/sample.sav
new file mode 100644
index 0000000000000000000000000000000000000000..20d0c5ce6689a60adfa329a17b4347274e9a863b
GIT binary patch
literal 1651
zcmY#!^D%PJP}WrNbn;aQ4hRlb2o7-!@eB^}bPiT9Gto`TEK%?cRtV3`OUW-URxmQv
zGXOya7+_*xU|?rpU|?i`09FXaz`zjcU}&b`SemY2Xkln>Wn^S!Yz)&xF3rHe0J0Bc
z7YKu`Vq|1sU}R+Qjdb>K3{p_wWME)O&PXguOfE?+QczF;sRyZpsb}I~WdLE{NWV~D
z1qB6=+w)3uQ;RZ_G1M|}sWE`CZ={Q3h^vAE3j+f~N@7VWx|s|jVhkYc65{FWs-U32
z0aaI$nTw$oWH$)=M*289`9SQ?NlZ%3Nln49AEXY1eIxyYT%c~vFG|VGOU%Jg%fKPZ
z0K&eJ5J!OA3bLOW8fKs{U|{%Q&*Gbyld7Pg0O2z@u(_q?f<!>-L16@nKad^}hN$^q
z&zzHA4vI1epTU7GH#H@*G#6?HhXYGSW_pGKBz!<-urV+&fH26+;{4oHg_6|b5`~h~
ziV}syVuif?lGI|5apYnR*SwO{qSO=xLsMPH(sW%T149c<at#B!FTW%swMZd3KQ}iu
zuY@AEft*2A_kq$UC@w%@2})Oz&`1DL7nmUV4ix5~H2H)Dk{4lWLHhoK!Fw?I|Nr~?
ze_+=8{~&X@85kHqYDJ;>7nFBE=7RKr>;jpK&gO%vMHUCyjZHns4qmAGXa)vo9<|M_
z1Z7Z8a1ON1t;{RU<%Hxg+uTY}_TmKRBij^E^5ul&6WiR%oW!IYPH;}J&8^HYO5ud0
zblcoYkUEf60t^fcAZLJ_19CdZE1={IqCxUPP^WY-Ffc&eV5Q;^l%JETp>CkgrKt~g
z3c3);L8xM2ccBVFT!$`}UzCC_1ac~h7|2~BP<Qb#Fff!u%2{1QBU7X2pfF}&VEFs@
z@88tazkd@!iAzC2Vb)9#_-N0-z%WO07N-*fgMf$}m}X#bc<|66399bz-@j>TfB*h9
z27}Ka^%ecKj3D(NCWC1P28Gx!4#`mUX&~U#f8g)m)by0UfB!NtFl<rRVRM4fDah*5
tzU>Ev>jKUw$G>T*fB*jd{~yK&(GEFp96-ekNDIi=G^c)t)Kst%1_1cfG6nzu

literal 0
HcmV?d00001

diff --git a/unittests/test_tables/spss/sample_large.sav b/unittests/test_tables/spss/sample_large.sav
new file mode 100644
index 0000000000000000000000000000000000000000..b0c16c1390a15a4f62a859ade76aa17b89c6ae40
GIT binary patch
literal 27895
zcmY#!^D%PJP}Wok4hRlbaB&QARB-e3aaGV&$S5f(D7MnqPtPpLC{5B!&d=2k&#X!;
zO7Trh)(=WeObISYEKy)$U|?WpK!B%=Fm|Mafw6*Lewl)ifuVtwfrXWksRD(7fq?;J
zAKVI%Rg8=b42+BnzLCxzjzJ1w_ke_9n2Cdx0fc=c{X%^eFy$GzMHoQXH`2v1#1)IY
zh!_Kdh!}%Qh^MbB7X2WzLD)Ca$I;0LQ$I)!gnc9ZgIuuaXW$TJ0Ab%qhy|dqWM*Js
zU}0cj;9y{25P^Awfq~)w|Ns9%eg|PtIK2D|3454YkUo%_|Nq}ZsQ>Tl|AATW|AWls
zhMFr1H4|h$lnJ&AWG;vg5+{cZvV)g_fdQm9nt=ftC$_nj$r*`7oZuL+&8^HU&E<rI
zwry@@N@7VWCpc_vQ%W*(Q#m2wXq#J^lbDpl2@W;e+{*l-6i!G8+2&S))PbxLfCdpL
zC_q+%oDNHV&p~0qz`&3QO>whkg1|?61_p*XlCwCS7#IXZ<iIpY*9VX|0|Ntuc6jj6
zAqlF_7z{px<SY7X8A19!Oa{{+eGCp@k^w?1#C~x|hU!ZL0jK^03=9lg)OFaLkZ8C)
z3=AAF{R}C{`qIAb2bsHoGs+Q}W`NoQcW2HUhg4*JPW=wx*hWPEe~^BddJqkh9}O2!
z$Ux$IG+f}t#b|ng<lE7F1S)Js%NKC@3knKQX#pxHK(xbXxQvDisGJzBr$^f_i1yuR
zyKc0-3(gOt{Qyuu2~<-;`bDt%b2MB=!v)-(HXiLygL22{I0I-L1{4yAemICX8ZM*Z
zGCE!d$_XFrN5|_xA@F{5Tng0R939^Ujl(mHj=w<W1z_#!(QzqAc);dMM#E)v{s>fi
zg7E0P8YmQC?d#F`#?g6L$b>PdIgvIxUN<^k#~?7eo<TuDVRT#yQBRDHzl@H*jE=uR
z@&jn}0koWMbiEWLJU}F<zd0H%pb!`xmx8t<N7qY%1|3G%1&yu?0%wTP@fS!v!7#e6
z85|I!<1a|#F0l3RqwA$c$6rRrUq;7Yzyp$_<1e6!0#Z+mj=zkKzl@H*K=K2qmpwZE
zGCIxx?pBSie;Hl>!oa{Vy3RvE0h}L3_aA}#51^W2be#t%6d>)#(S7Bk`|U^1Apqrv
z(fvmZ3=E^^R*atK0XlyKX}lFuPmGQ;jE*ymjx&JjiP3#FuyGbpXn@vtz-XBG==v8>
zh(O9sQ23073n&Ce$ECpi@X>Lp(Q&EKaVhXT9ymXYj!TV>OO2k3gUAn~>pZ~uVf6f$
z(eqzM*LfiN5u@WTqvJ25<1di>0P0PSuJah}PlNjpqy6d8{`6>n8qt3k?N5XI51^W2
zbe#t%6ky}fqvv@b?JGf=-y98>(QpA5pOAHOqvJ2&@^f_jWpw;ybo>QTe~ym7jNI`T
E0L-mwHUIzs

literal 0
HcmV?d00001

-- 
GitLab