From 67764f97bc13c46f9bf24afc43435eeee85b3e0c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Henrik=20tom=20W=C3=B6rden?= <h.tomwoerden@indiscale.com> Date: Mon, 13 May 2024 10:45:34 +0200 Subject: [PATCH] ENH: add sav converter --- .docker/Dockerfile | 2 +- .gitlab-ci.yml | 4 +- setup.cfg | 5 +- src/caoscrawler/__init__.py | 5 + src/caoscrawler/conv_impl/__init__.py | 0 src/caoscrawler/conv_impl/sav.py | 37 ++++ src/caoscrawler/converters.py | 132 ++++++------ src/caoscrawler/version.py | 8 +- src/doc/converters.rst | 2 + unittests/test_converters.py | 2 +- unittests/test_sav_converter.py | 82 ++++++++ unittests/test_tables/spss/CITATION.cff | 11 + unittests/test_tables/spss/LICENSE | 210 ++++++++++++++++++++ unittests/test_tables/spss/sample.sav | Bin 0 -> 1651 bytes unittests/test_tables/spss/sample_large.sav | Bin 0 -> 27895 bytes 15 files changed, 426 insertions(+), 74 deletions(-) create mode 100644 src/caoscrawler/conv_impl/__init__.py create mode 100644 src/caoscrawler/conv_impl/sav.py create mode 100644 unittests/test_sav_converter.py create mode 100644 unittests/test_tables/spss/CITATION.cff create mode 100644 unittests/test_tables/spss/LICENSE create mode 100644 unittests/test_tables/spss/sample.sav create mode 100644 unittests/test_tables/spss/sample_large.sav diff --git a/.docker/Dockerfile b/.docker/Dockerfile index 539ac0d4..a4d9ce69 100644 --- a/.docker/Dockerfile +++ b/.docker/Dockerfile @@ -30,7 +30,7 @@ RUN rm -r /git/.git # Install pycaosdb.ini for the tests RUN mv /git/.docker/tester_pycaosdb.ini /git/integrationtests/pycaosdb.ini -RUN cd /git/ && pip3 install . 
+RUN cd /git/ && pip3 install .[h5_crawler,spss] WORKDIR /git/integrationtests # wait for server, diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 9ec1727d..ff313673 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -130,7 +130,7 @@ unittest_py3.8: # TODO: Use f-branch logic here - pip install git+https://gitlab.indiscale.com/caosdb/src/caosdb-pylib.git@dev - pip install git+https://gitlab.indiscale.com/caosdb/src/caosdb-advanced-user-tools.git@dev - - pip install .[h5-crawler] + - pip install .[h5_crawler,spss] # actual test - caosdb-crawler --help - pytest --cov=caosdb -vv ./unittests @@ -166,7 +166,7 @@ unittest_py3.13: # TODO: Use f-branch logic here - pip install git+https://gitlab.indiscale.com/caosdb/src/caosdb-pylib.git@dev - pip install git+https://gitlab.indiscale.com/caosdb/src/caosdb-advanced-user-tools.git@dev - - pip install .[h5-crawler] + - pip install .[h5_crawler,spss] # actual test - caosdb-crawler --help - pytest --cov=caosdb -vv ./unittests diff --git a/setup.cfg b/setup.cfg index 1b4a9185..4a9c8265 100644 --- a/setup.cfg +++ b/setup.cfg @@ -25,6 +25,7 @@ install_requires = odfpy #make optional packaging pandas + pyarrow # Will be required by Pandas >= 3.0. pyyaml yaml-header-tools >= 0.2.1 @@ -41,6 +42,8 @@ console_scripts = caosdb-crawler = caoscrawler.crawl:main [options.extras_require] -h5-crawler = +h5_crawler = h5py >= 3.8 numpy +spss = + pandas[spss] diff --git a/src/caoscrawler/__init__.py b/src/caoscrawler/__init__.py index 05bad0b5..9c5e3743 100644 --- a/src/caoscrawler/__init__.py +++ b/src/caoscrawler/__init__.py @@ -1,4 +1,9 @@ +from . 
import converters +from .conv_impl.sav import SAVConverter from .crawl import Crawler, SecurityMode from .version import CfoodRequiredVersionError, get_caoscrawler_version __version__ = get_caoscrawler_version() + +# Convenience members ######################################################### +converters.SAVConverter = SAVConverter diff --git a/src/caoscrawler/conv_impl/__init__.py b/src/caoscrawler/conv_impl/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/caoscrawler/conv_impl/sav.py b/src/caoscrawler/conv_impl/sav.py new file mode 100644 index 00000000..8308719b --- /dev/null +++ b/src/caoscrawler/conv_impl/sav.py @@ -0,0 +1,37 @@ +# This file is a part of the LinkAhead Project. +# +# Copyright (C) 2024 IndiScale GmbH <info@indiscale.com> +# Copyright (C) 2024 Daniel Hornung <d.hornung@indiscale.com> +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <https://www.gnu.org/licenses/>. + +"""Converter for SAV files (stored by SPSS).""" + +# import pyreadstat # Maybe use this if we need more metadata +import pandas as pd + +from .. 
import converters +from ..stores import GeneralStore, RecordStore +from ..structure_elements import (BooleanElement, DictElement, Directory, File, + FloatElement, IntegerElement, JSONFile, + ListElement, NoneElement, StructureElement, + TextElement) + + +class SAVConverter(converters.TableConverter): + """Converter for SAV files (stored by SPSS).""" + + def create_children(self, values: GeneralStore, element: StructureElement): + df = pd.io.spss.read_spss(element.path) + return self._children_from_dataframe(df) diff --git a/src/caoscrawler/converters.py b/src/caoscrawler/converters.py index e0ca0f9b..296bfeed 100644 --- a/src/caoscrawler/converters.py +++ b/src/caoscrawler/converters.py @@ -1,11 +1,11 @@ -#!/usr/bin/env python3 # encoding: utf-8 # -# ** header v3.0 -# This file is a part of the CaosDB Project. +# This file is a part of the LinkAhead Project. # +# Copyright (C) 2024 Indiscale GmbH <info@indiscale.com> # Copyright (C) 2021 Henrik tom Wörden -# 2021 Alexander Schlemmer +# Copyright (C) 2021 Alexander Schlemmer +# Copyright (C) 2024 Daniel Hornung <d.hornung@indiscale.com> # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU Affero General Public License as @@ -19,9 +19,8 @@ # # You should have received a copy of the GNU Affero General Public License # along with this program. If not, see <https://www.gnu.org/licenses/>. 
-# -# ** end header -# + +"""Converters take structure elements and create Records and new structure elements from them.""" from __future__ import annotations @@ -34,7 +33,7 @@ import warnings from abc import ABCMeta, abstractmethod from inspect import signature from string import Template -from typing import Any, List, Optional, Tuple, Union +from typing import Any, Callable, Optional, Union import linkahead as db import pandas as pd @@ -134,8 +133,8 @@ def replace_variables(propvalue: Any, values: GeneralStore): This function replaces variables in property values (and possibly other locations, where the crawler can replace cfood-internal variables). - If `propvalue` is a single variable name preceeded with a '$' (e.g. '$var' or '${var}'), then - the corresponding value stored in `values` is returned. + If ``propvalue`` is a single variable name preceded by a ``$`` (e.g. ``$var`` or ``${var}``), + then the corresponding value stored in ``values`` is returned. In any other case the variable substitution is carried out as defined by string templates and a new string with the replaced variables is returned. """ @@ -160,16 +159,16 @@ def handle_value(value: Union[dict, str, list], values: GeneralStore): add as an additional property (multiproperty). Variable names (starting with a "$") are replaced by the corresponding value stored in the - `values` GeneralStore. + ``values`` GeneralStore. Parameters ---------- -value: - - if str, the value to be interpreted. E.g. "4", "hallo" or "$a" etc. - - if dict, must have keys "value" and "collection_mode". The returned tuple is directly +value: Union[dict, str, list] + - If *str*, the value to be interpreted. E.g. "4", "hello" or "$a" etc. + - If *dict*, must have keys ``value`` and ``collection_mode``. The returned tuple is directly created from the corresponding values. 
- - if list, each element is checked for replacement and the resulting list will be used + - If *list*, each element is checked for replacement and the resulting list will be used as (list) value for the property Returns @@ -181,7 +180,7 @@ out: tuple """ # @review Florian Spreckelsen 2022-05-13 - if type(value) == dict: + if isinstance(value, dict): if "value" not in value: # TODO: how do we handle this case? Just ignore? # or disallow? @@ -189,7 +188,7 @@ out: tuple propvalue = value["value"] # can be "single", "list" or "multiproperty" collection_mode = value["collection_mode"] - elif type(value) == str: + elif isinstance(value, str): propvalue = value collection_mode = "single" if propvalue.startswith("+"): @@ -198,7 +197,7 @@ out: tuple elif propvalue.startswith("*"): collection_mode = "multiproperty" propvalue = propvalue[1:] - elif type(value) == list: + elif isinstance(value, list): # TODO: (for review) # This is a bit dirty right now and needed for # being able to directly set list values. Semantics is, however, a bit @@ -209,7 +208,7 @@ out: tuple propvalue = list() for element in value: # Do the element-wise replacement only, when its type is string: - if type(element) == str: + if isinstance(element, str): propvalue.append(replace_variables(element, values)) else: propvalue.append(element) @@ -286,9 +285,7 @@ def create_records(values: GeneralStore, records: RecordStore, def_records: dict propvalue = os.path.normpath(propvalue) setattr(c_record, key.lower(), propvalue) else: - if c_record.get_property(key) is None: - if collection_mode == "list": c_record.add_property(name=key, value=[propvalue]) elif (collection_mode == "multiproperty" or @@ -322,10 +319,13 @@ class Converter(object, metaclass=ABCMeta): Parameters ---------- - definition: dict, Please refer to ``src/doc/converters.rst`` to learn about the structure - that the definition dict must have. - converter_registry: dict, A dictionary that contains converter names as keys and dicts as - values. 
Those value dicts have the keys 'converter' and 'package'. + definition: dict + Please refer to ``src/doc/converters.rst`` to learn about the structure that the + definition dict must have. + converter_registry: dict + A dictionary that contains converter names as keys and dicts as values. Those value dicts + have the keys 'converter', 'package' and 'class'. 'converter' is the class name, + 'package' the module and 'class' the class instance of converters. """ self.definition = definition @@ -363,7 +363,7 @@ class Converter(object, metaclass=ABCMeta): @staticmethod def converter_factory(definition: dict, name: str, converter_registry: dict): - """creates a Converter instance of the appropriate class. + """Create a Converter instance of the appropriate class. The `type` key in the `definition` defines the Converter class which is being used. """ @@ -424,7 +424,7 @@ class Converter(object, metaclass=ABCMeta): pass """ - if not "transform" in self.definition: + if "transform" not in self.definition: return for transformer_key, transformer in self.definition["transform"].items(): in_value = replace_variables(transformer["in"], values) @@ -460,8 +460,7 @@ class Converter(object, metaclass=ABCMeta): values[match.group('varname')] = out_value @abstractmethod - def create_children(self, values: GeneralStore, - element: StructureElement): + def create_children(self, values: GeneralStore, element: StructureElement): pass def create_records(self, values: GeneralStore, records: RecordStore, @@ -477,7 +476,7 @@ class Converter(object, metaclass=ABCMeta): self.definition["records"]) def filter_children(self, children_with_strings: - List[Tuple[StructureElement, str]], expr: str, + list[tuple[StructureElement, str]], expr: str, group: str, rule: str): """Filter children according to regexp `expr` and `rule`.""" @@ -515,8 +514,8 @@ class Converter(object, metaclass=ABCMeta): result: Optional[dict]): """ Template for the debugging output for the match function """ msg = 
"\n--------" + name + "-----------\n" - for re, ma in zip(regexp, matched): - msg += "matching reg:\t" + re + "\n" + for exp, ma in zip(regexp, matched): + msg += "matching reg:\t" + exp + "\n" msg += "matching val:\t" + ma + "\n" msg += "---------\n" if result is None: @@ -620,7 +619,7 @@ class DirectoryConverter(Converter): element: A directory (of type Directory) which will be traversed. """ - children: List[StructureElement] = [] + children: list[StructureElement] = [] for name in sorted(os.listdir(element.path)): path = os.path.join(element.path, name) @@ -660,7 +659,7 @@ class SimpleFileConverter(Converter): class FileConverter(SimpleFileConverter): def __init__(self, *args, **kwargs): warnings.warn(DeprecationWarning( - "This class is depricated. Please use SimpleFileConverter.")) + "This class is deprecated. Please use SimpleFileConverter.")) super().__init__(*args, **kwargs) @@ -693,12 +692,12 @@ class MarkdownFileConverter(SimpleFileConverter): "Error during the validation (yaml header cannot be read) of the markdown file " "located at the following node in the data structure:\n" "{}\nError:\n{}".format(path, err)) - children: List[StructureElement] = [] + children: list[StructureElement] = [] for name, entry in header.items(): - if type(entry) == list: + if isinstance(entry, list): children.append(ListElement(name, entry)) - elif type(entry) == str: + elif isinstance(entry, str): children.append(TextElement(name, entry)) else: if generalStore is not None and self.name in generalStore: @@ -814,14 +813,14 @@ class DictElementConverter(Converter): class DictConverter(DictElementConverter): def __init__(self, *args, **kwargs): warnings.warn(DeprecationWarning( - "This class is depricated. Please use DictConverter.")) + "This class is deprecated. 
Please use DictElementConverter.")) super().__init__(*args, **kwargs) class DictDictElementConverter(DictElementConverter): def __init__(self, *args, **kwargs): warnings.warn(DeprecationWarning( - "This class is depricated. Please use DictElementConverter.")) + "This class is deprecated. Please use DictElementConverter.")) super().__init__(*args, **kwargs) @@ -886,7 +885,7 @@ out: """ if "match_name" in definition: if "match" in definition: - raise RuntimeError(f"Do not supply both, 'match_name' and 'match'.") + raise RuntimeError("Do not supply both, 'match_name' and 'match'.") m1 = re.match(definition["match_name"], name) if m1 is None: @@ -1009,7 +1008,7 @@ class BooleanElementConverter(_AbstractScalarValueElementConverter): class DictBooleanElementConverter(BooleanElementConverter): def __init__(self, *args, **kwargs): warnings.warn(DeprecationWarning( - "This class is depricated. Please use BooleanElementConverter.")) + "This class is deprecated. Please use BooleanElementConverter.")) super().__init__(*args, **kwargs) @@ -1025,7 +1024,7 @@ class FloatElementConverter(_AbstractScalarValueElementConverter): class DictFloatElementConverter(FloatElementConverter): def __init__(self, *args, **kwargs): warnings.warn(DeprecationWarning( - "This class is depricated. Please use FloatElementConverter.")) + "This class is deprecated. Please use FloatElementConverter.")) super().__init__(*args, **kwargs) @@ -1040,7 +1039,7 @@ class TextElementConverter(_AbstractScalarValueElementConverter): def __init__(self, definition, *args, **kwargs): if "match" in definition: raise ValueError(""" -The 'match' key will in future be used to match a potential name of a TextElement. Please use +The 'match' key is used to match a potential name of a TextElement. Please use the 'match_value' key to match the value of the TextElement and 'match_name' for matching the name. 
""") @@ -1050,7 +1049,7 @@ the 'match_value' key to match the value of the TextElement and 'match_name' for class DictTextElementConverter(TextElementConverter): def __init__(self, *args, **kwargs): warnings.warn(DeprecationWarning( - "This class is depricated. Please use TextElementConverter.")) + "This class is deprecated. Please use TextElementConverter.")) super().__init__(*args, **kwargs) @@ -1066,7 +1065,7 @@ class IntegerElementConverter(_AbstractScalarValueElementConverter): class DictIntegerElementConverter(IntegerElementConverter): def __init__(self, *args, **kwargs): warnings.warn(DeprecationWarning( - "This class is depricated. Please use IntegerElementConverter.")) + "This class is deprecated. Please use IntegerElementConverter.")) super().__init__(*args, **kwargs) @@ -1108,7 +1107,7 @@ class ListElementConverter(Converter): class DictListElementConverter(ListElementConverter): def __init__(self, *args, **kwargs): warnings.warn(DeprecationWarning( - "This class is depricated. Please use ListElementConverter.")) + "This class is deprecated. Please use ListElementConverter.")) super().__init__(*args, **kwargs) @@ -1122,15 +1121,22 @@ class TableConverter(Converter): The rows can be matched using a DictElementConverter. """ - @abstractmethod - def get_options(self): - """ - This method needs to be overwritten by the specific table converter to provide - information about the possible options. + + def get_options(self) -> dict: + """Get specific options, e.g. from ``self.definition``. + +This method may be overwritten by the specific table converter to provide information about the +possible options. Implementors may use ``TableConverter._get_options(...)`` to get (and convert) +options from ``self.definition``. + +Returns +------- +out: dict + An options dict. 
""" - pass + return {} - def _get_options(self, possible_options): + def _get_options(self, possible_options: list[tuple[str, Callable]]) -> dict: option_dict = dict() for opt_name, opt_conversion in possible_options: if opt_name in self.definition: @@ -1158,6 +1164,14 @@ class TableConverter(Converter): return None return m.groupdict() + @staticmethod + def _children_from_dataframe(dataframe: pd.DataFrame): + child_elements = list() + for index, row in dataframe.iterrows(): + child_elements.append( + DictElement(str(index), row.to_dict())) + return child_elements + class XLSXTableConverter(TableConverter): """ @@ -1187,11 +1201,7 @@ class XLSXTableConverter(TableConverter): if not isinstance(element, File): raise RuntimeError("Element must be a File.") table = pd.read_excel(element.path, **self.get_options()) - child_elements = list() - for index, row in table.iterrows(): - child_elements.append( - DictElement(str(index), row.to_dict())) - return child_elements + return self._children_from_dataframe(table) class CSVTableConverter(TableConverter): @@ -1216,11 +1226,7 @@ class CSVTableConverter(TableConverter): if not isinstance(element, File): raise RuntimeError("Element must be a File.") table = pd.read_csv(element.path, **self.get_options()) - child_elements = list() - for index, row in table.iterrows(): - child_elements.append( - DictElement(str(index), row.to_dict())) - return child_elements + return self._children_from_dataframe(table) class DateElementConverter(TextElementConverter): diff --git a/src/caoscrawler/version.py b/src/caoscrawler/version.py index fdc83234..0b72dd65 100644 --- a/src/caoscrawler/version.py +++ b/src/caoscrawler/version.py @@ -17,11 +17,7 @@ # You should have received a copy of the GNU Affero General Public License # along with this program. If not, see <https://www.gnu.org/licenses/>. 
# -try: - from importlib import metadata as importlib_metadata -except ImportError: # Python<3.8 dowesn"t support this so use - import importlib_metadata - +from importlib import metadata as importlib_metadata from packaging.version import parse as parse_version from warnings import warn @@ -43,7 +39,7 @@ def check_cfood_version(metadata: dict): if not metadata or "crawler-version" not in metadata: msg = """ -No crawler version specified in cfood definition, so there is now guarantee that +No crawler version specified in cfood definition, so there is no guarantee that the cfood definition matches the installed crawler version. Specifying a version is highly recommended to ensure that the definition works diff --git a/src/doc/converters.rst b/src/doc/converters.rst index 9b28c9a6..637c6635 100644 --- a/src/doc/converters.rst +++ b/src/doc/converters.rst @@ -55,6 +55,8 @@ element. If the structure element matches (this is a combination of a typecheck match, see :py:class:`~caoscrawler.converters.Converter` for details) the converter is used to generate records (see :py:meth:`~caoscrawler.converters.Converter.create_records`) and to possibly process a subtree, as defined by the function :func:`caoscrawler.converters.create_children`. +**match** *TODO* + **records** is a dict of definitions that define the semantic structure (see details below). 
diff --git a/unittests/test_converters.py b/unittests/test_converters.py index 52ece13d..0e90fab3 100644 --- a/unittests/test_converters.py +++ b/unittests/test_converters.py @@ -140,7 +140,7 @@ def test_markdown_converter(converter_registry): converter = MarkdownFileConverter({"match": "(.*)"}, "TestMarkdownFileConverter", converter_registry) - with pytest.raises(ConverterValidationError) as err: + with pytest.raises(ConverterValidationError): converter.create_children(None, File("test_tool.py", UNITTESTDIR / "test_crawler.py")) m = converter.match(test_readme) diff --git a/unittests/test_sav_converter.py b/unittests/test_sav_converter.py new file mode 100644 index 00000000..3cc72038 --- /dev/null +++ b/unittests/test_sav_converter.py @@ -0,0 +1,82 @@ +# This file is a part of the LinkAhead Project. +# +# Copyright (C) 2024 IndiScale GmbH <info@indiscale.com> +# Copyright (C) 2024 Daniel Hornung <d.hornung@indiscale.com> +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <https://www.gnu.org/licenses/>. 
+ +"""Testing converter for SAV files.""" + +import datetime +import importlib +import re +from pathlib import Path + +import numpy as np +import pytest + +from caoscrawler.converters import ( + ConverterValidationError, + SAVConverter, +) +from caoscrawler.structure_elements import (BooleanElement, DictElement, + Directory, File, FloatElement, + IntegerElement, ListElement, + TextElement) + +UNITTESTDIR = Path(__file__).parent + + +@pytest.fixture +def converter_registry(): + converter_registry: dict[str, dict[str, str]] = { + "Directory": { + "converter": "DirectoryConverter", + "package": "caoscrawler.converters"}, + } + + for key, value in converter_registry.items(): + module = importlib.import_module(value["package"]) + value["class"] = getattr(module, value["converter"]) + return converter_registry + + +def test_sav_converter(converter_registry): + converter = SAVConverter({ + "match": ("sample.sav") + }, + "ThisConverterNameIsIrrelevant", converter_registry + ) + + spss_dir = UNITTESTDIR / "test_tables" / "spss" + for sav_file, length, thistype in [ + (File("sample.sav", spss_dir / "sample.sav"), 5, str), + (File("sample.sav", spss_dir / "sample_large.sav"), 485, float), + ]: + m = converter.match(sav_file) + assert m is not None + assert len(m) == 0 + + children = converter.create_children(None, sav_file) + assert len(children) == length + for ii, child in enumerate(children): + assert child.__class__ == DictElement + assert child.name == str(ii) + my_dict = child.value + assert isinstance(my_dict["mychar"], str) + assert isinstance(my_dict["mydate"], datetime.date) or np.isnan(my_dict["mydate"]) + assert isinstance(my_dict["dtime"], datetime.datetime) or np.isnan(my_dict["dtime"]) + assert isinstance(my_dict["mytime"], datetime.time) or np.isnan(my_dict["mytime"]) + assert isinstance(my_dict["mylabl"], thistype) + assert isinstance(my_dict["myord"], thistype) diff --git a/unittests/test_tables/spss/CITATION.cff b/unittests/test_tables/spss/CITATION.cff new 
file mode 100644 index 00000000..140fcc07 --- /dev/null +++ b/unittests/test_tables/spss/CITATION.cff @@ -0,0 +1,11 @@ +cff-version: 1.2.0 +message: "If you use this software, please cite it as below." +authors: +- family-names: "Fajardo" + given-names: "Otto" + orcid: "https://orcid.org/0000-0002-3363-9287" +title: "Pyreadstat" +version: 1.2.7 +doi: 10.5281/zenodo.6612282 +date-released: 2018-09-24 +url: "https://github.com/Roche/pyreadstat" diff --git a/unittests/test_tables/spss/LICENSE b/unittests/test_tables/spss/LICENSE new file mode 100644 index 00000000..a2f94b1a --- /dev/null +++ b/unittests/test_tables/spss/LICENSE @@ -0,0 +1,210 @@ +Test data files were copied from [pyreadstat](https://github.com/Roche/pyreadstat), they are +licensed under the Apache License, cited below. + +Copyright (C) 2018-2024 Otto Fajardo +Copyright (C) 2024 Indiscale GmbH <info@indiscale.com> +Copyright (C) 2024 Daniel Hornung <d.hornung@indiscale.com> + +pyreadstat license: +--------------------------------------------------------------------------- + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. 
+ + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. 
For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. 
If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. 
You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. 
Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. 
+ + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. \ No newline at end of file diff --git a/unittests/test_tables/spss/sample.sav b/unittests/test_tables/spss/sample.sav new file mode 100644 index 0000000000000000000000000000000000000000..20d0c5ce6689a60adfa329a17b4347274e9a863b GIT binary patch literal 1651 zcmY#!^D%PJP}WrNbn;aQ4hRlb2o7-!@eB^}bPiT9Gto`TEK%?cRtV3`OUW-URxmQv zGXOya7+_*xU|?rpU|?i`09FXaz`zjcU}&b`SemY2Xkln>Wn^S!Yz)&xF3rHe0J0Bc z7YKu`Vq|1sU}R+Qjdb>K3{p_wWME)O&PXguOfE?+QczF;sRyZpsb}I~WdLE{NWV~D z1qB6=+w)3uQ;RZ_G1M|}sWE`CZ={Q3h^vAE3j+f~N@7VWx|s|jVhkYc65{FWs-U32 z0aaI$nTw$oWH$)=M*289`9SQ?NlZ%3Nln49AEXY1eIxyYT%c~vFG|VGOU%Jg%fKPZ z0K&eJ5J!OA3bLOW8fKs{U|{%Q&*Gbyld7Pg0O2z@u(_q?f<!>-L16@nKad^}hN$^q z&zzHA4vI1epTU7GH#H@*G#6?HhXYGSW_pGKBz!<-urV+&fH26+;{4oHg_6|b5`~h~ ziV}syVuif?lGI|5apYnR*SwO{qSO=xLsMPH(sW%T149c<at#B!FTW%swMZd3KQ}iu zuY@AEft*2A_kq$UC@w%@2})Oz&`1DL7nmUV4ix5~H2H)Dk{4lWLHhoK!Fw?I|Nr~? 
ze_+=8{~&X@85kHqYDJ;>7nFBE=7RKr>;jpK&gO%vMHUCyjZHns4qmAGXa)vo9<|M_ z1Z7Z8a1ON1t;{RU<%Hxg+uTY}_TmKRBij^E^5ul&6WiR%oW!IYPH;}J&8^HYO5ud0 zblcoYkUEf60t^fcAZLJ_19CdZE1={IqCxUPP^WY-Ffc&eV5Q;^l%JETp>CkgrKt~g z3c3);L8xM2ccBVFT!$`}UzCC_1ac~h7|2~BP<Qb#Fff!u%2{1QBU7X2pfF}&VEFs@ z@88tazkd@!iAzC2Vb)9#_-N0-z%WO07N-*fgMf$}m}X#bc<|66399bz-@j>TfB*h9 z27}Ka^%ecKj3D(NCWC1P28Gx!4#`mUX&~U#f8g)m)by0UfB!NtFl<rRVRM4fDah*5 tzU>Ev>jKUw$G>T*fB*jd{~yK&(GEFp96-ekNDIi=G^c)t)Kst%1_1cfG6nzu literal 0 HcmV?d00001 diff --git a/unittests/test_tables/spss/sample_large.sav b/unittests/test_tables/spss/sample_large.sav new file mode 100644 index 0000000000000000000000000000000000000000..b0c16c1390a15a4f62a859ade76aa17b89c6ae40 GIT binary patch literal 27895 zcmY#!^D%PJP}Wok4hRlbaB&QARB-e3aaGV&$S5f(D7MnqPtPpLC{5B!&d=2k&#X!; zO7Trh)(=WeObISYEKy)$U|?WpK!B%=Fm|Mafw6*Lewl)ifuVtwfrXWksRD(7fq?;J zAKVI%Rg8=b42+BnzLCxzjzJ1w_ke_9n2Cdx0fc=c{X%^eFy$GzMHoQXH`2v1#1)IY zh!_Kdh!}%Qh^MbB7X2WzLD)Ca$I;0LQ$I)!gnc9ZgIuuaXW$TJ0Ab%qhy|dqWM*Js zU}0cj;9y{25P^Awfq~)w|Ns9%eg|PtIK2D|3454YkUo%_|Nq}ZsQ>Tl|AATW|AWls zhMFr1H4|h$lnJ&AWG;vg5+{cZvV)g_fdQm9nt=ftC$_nj$r*`7oZuL+&8^HU&E<rI zwry@@N@7VWCpc_vQ%W*(Q#m2wXq#J^lbDpl2@W;e+{*l-6i!G8+2&S))PbxLfCdpL zC_q+%oDNHV&p~0qz`&3QO>whkg1|?61_p*XlCwCS7#IXZ<iIpY*9VX|0|Ntuc6jj6 zAqlF_7z{px<SY7X8A19!Oa{{+eGCp@k^w?1#C~x|hU!ZL0jK^03=9lg)OFaLkZ8C) z3=AAF{R}C{`qIAb2bsHoGs+Q}W`NoQcW2HUhg4*JPW=wx*hWPEe~^BddJqkh9}O2! 
z$Ux$IG+f}t#b|ng<lE7F1S)Js%NKC@3knKQX#pxHK(xbXxQvDisGJzBr$^f_i1yuR zyKc0-3(gOt{Qyuu2~<-;`bDt%b2MB=!v)-(HXiLygL22{I0I-L1{4yAemICX8ZM*Z zGCE!d$_XFrN5|_xA@F{5Tng0R939^Ujl(mHj=w<W1z_#!(QzqAc);dMM#E)v{s>fi zg7E0P8YmQC?d#F`#?g6L$b>PdIgvIxUN<^k#~?7eo<TuDVRT#yQBRDHzl@H*jE=uR z@&jn}0koWMbiEWLJU}F<zd0H%pb!`xmx8t<N7qY%1|3G%1&yu?0%wTP@fS!v!7#e6 z85|I!<1a|#F0l3RqwA$c$6rRrUq;7Yzyp$_<1e6!0#Z+mj=zkKzl@H*K=K2qmpwZE zGCIxx?pBSie;Hl>!oa{Vy3RvE0h}L3_aA}#51^W2be#t%6d>)#(S7Bk`|U^1Apqrv z(fvmZ3=E^^R*atK0XlyKX}lFuPmGQ;jE*ymjx&JjiP3#FuyGbpXn@vtz-XBG==v8> zh(O9sQ23073n&Ce$ECpi@X>Lp(Q&EKaVhXT9ymXYj!TV>OO2k3gUAn~>pZ~uVf6f$ z(eqzM*LfiN5u@WTqvJ25<1di>0P0PSuJah}PlNjpqy6d8{`6>n8qt3k?N5XI51^W2 zbe#t%6ky}fqvv@b?JGf=-y98>(QpA5pOAHOqvJ2&@^f_jWpw;ybo>QTe~ym7jNI`T E0L-mwHUIzs literal 0 HcmV?d00001 -- GitLab