diff --git a/CHANGELOG.md b/CHANGELOG.md
index 49b2e2c7519894f8922ab7947ad5002979638ce7..d72aaa8539bbc3dbe6d3b90b819b73f87e0aba1f 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -10,6 +10,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 ### Added ###

 * Support for Python 3.12 and experimental support for 3.13
+* `spss_to_datamodel` script.

 ### Changed ###

diff --git a/setup.cfg b/setup.cfg
index 4a9c82657bf453081cb4843bd955a333c73b6390..3e91d053a67fc5b92202b0f77b02511b1f04a0ec 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -40,6 +40,7 @@ per-file-ignores = __init__.py:F401
 [options.entry_points]
 console_scripts =
     caosdb-crawler = caoscrawler.crawl:main
+    spss_to_datamodel = caoscrawler.conv_impl.spss:spss_to_datamodel_main

 [options.extras_require]
 h5_crawler =
diff --git a/src/caoscrawler/conv_impl/spss.py b/src/caoscrawler/conv_impl/spss.py
index f5ae4582d31c59d624048d757c9a42ce807491f6..903dea277de0b4659b130a8c02fea28810159407 100644
--- a/src/caoscrawler/conv_impl/spss.py
+++ b/src/caoscrawler/conv_impl/spss.py
@@ -18,14 +18,23 @@

 """Converter for SAV files (stored by SPSS)."""

+import argparse
+
 import pandas as pd
 import pyreadstat
+import yaml

 from .. import converters
 from ..stores import GeneralStore
 from ..structure_elements import (File, StructureElement)


+TYPES = {
+    "double": "DOUBLE",
+    "string": "TEXT",
+}
+
+
 class SPSSConverter(converters.TableConverter):
     """Converter for SAV files (stored by SPSS)."""

@@ -44,3 +53,94 @@ class SPSSConverter(converters.TableConverter):
         # embed()

         return self._children_from_dataframe(df)
+
+
+def spss_to_yaml(savfile: str, yamlfile: str) -> None:
+    """Parse the *.sav and create basic datamodel in ``yamlfile``.
+    """
+    _, meta = pyreadstat.read_sav(savfile, metadataonly=True)
+
+    enums: dict[str, list[str]] = {}
+    properties = {}
+
+    for name in meta.column_names:
+        prop = {
+            "datatype": TYPES[meta.readstat_variable_types[name]],
+        }
+        desc = meta.column_names_to_labels.get(name)
+        if desc and desc != name:
+            prop["description"] = desc
+        # Handle categorial variables
+        if var_label := meta.variable_to_label.get(name):
+            prop["datatype"] = var_label
+            vvl = meta.variable_value_labels[name]
+            # reproducible (and sensible) order
+            label_values = [vvl[key] for key in sorted(vvl.keys())]
+            if label_values not in enums.values():
+                enums[var_label] = label_values
+        properties[name] = prop
+
+    output = f"""# auto-generated data model from file "{savfile}".
+# To insert a datamodel into LinkAhead, run:
+#
+# python3 -m caosadvancedtools.models.parser datamodel.yaml --sync
+
+#
+# Code for creating enum records:
+#
+"""
+    for name, values in enums.items():
+        for line in f"""
+cont = db.Container()
+for value in {repr(values)}:
+    rec = db.Record(name=value).add_parent(name="{name}")
+    cont.append(rec)
+cont.insert()
+    """.splitlines(keepends=True):
+            if line.strip():
+                output += f"# {line}"
+        output += "#\n"
+
+    # Actual datamodel
+    output += """
+#########
+# Enums #
+#########
+
+"""
+    for name, values in enums.items():
+        output += f"""{name}:
+  description:
+  # possible values: {values}\n"""
+
+    output += ("""
+###############
+# RecordTypes #
+###############
+
+DummyRT:
+  description: Note: Change name and enter description.
+  recommended_properties:
+    """
+               + "    ".join(yaml.dump(properties,
+                                       allow_unicode=True,
+                                       sort_keys=False).splitlines(keepends=True)))
+
+    with open(yamlfile, encoding="utf-8", mode="w") as myfile:
+        myfile.write(output)
+
+
+def _parse_arguments():
+    """Parse the arguments."""
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument('-i', '--input', help="The *.sav file.", required=True)
+    parser.add_argument('-o', '--outfile', help="Yaml filename to save the result", required=True)
+
+    return parser.parse_args()
+
+
+def spss_to_datamodel_main():
+    """The main function of this script."""
+    args = _parse_arguments()
+    spss_to_yaml(savfile=args.input, yamlfile=args.outfile)
+    print(f"Written datamodel to: {args.outfile}")
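
For reference, a minimal usage sketch of the functionality added above. The file names `example.sav` and `datamodel.yaml` are placeholders, and the command-line form assumes the package is installed so that the `spss_to_datamodel` console script registered in setup.cfg is on the PATH:

    # Command-line form (placeholder paths):
    #     spss_to_datamodel -i example.sav -o datamodel.yaml
    #
    # Programmatic form, calling the same function the script wraps:
    from caoscrawler.conv_impl.spss import spss_to_yaml

    # Reads only the SPSS metadata (metadataonly=True) and writes a YAML datamodel skeleton.
    spss_to_yaml(savfile="example.sav", yamlfile="datamodel.yaml")

The generated YAML is only a starting point: the `DummyRT` record type still needs to be renamed and described before the datamodel is inserted with `python3 -m caosadvancedtools.models.parser datamodel.yaml --sync`, as noted in the header of the generated file.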