From 2f20561baf4cb972b9c3631ce112cedfec86ce60 Mon Sep 17 00:00:00 2001 From: Daniel <d.hornung@indiscale.com> Date: Fri, 19 Apr 2024 11:27:56 +0200 Subject: [PATCH] WIP: XLSX reader. --- .../table_json_conversion/convert.py | 75 +++++++++++++++++-- .../table_json_conversion/test_read_data.py | 68 +++++++++++++++++ 2 files changed, 135 insertions(+), 8 deletions(-) create mode 100644 unittests/table_json_conversion/test_read_data.py diff --git a/src/caosadvancedtools/table_json_conversion/convert.py b/src/caosadvancedtools/table_json_conversion/convert.py index 344c7ad1..a19f10fb 100644 --- a/src/caosadvancedtools/table_json_conversion/convert.py +++ b/src/caosadvancedtools/table_json_conversion/convert.py @@ -4,7 +4,6 @@ # This file is a part of the LinkAhead Project. # # Copyright (C) 2024 Indiscale GmbH <info@indiscale.com> -# Copyright (C) 2024 Henrik tom Wörden <h.tomwoerden@indiscale.com> # Copyright (C) 2024 Daniel Hornung <d.hornung@indiscale.com> # # This program is free software: you can redistribute it and/or modify @@ -20,18 +19,78 @@ # You should have received a copy of the GNU Affero General Public License # along with this program. If not, see <https://www.gnu.org/licenses/>. -from fill_xlsx import read_or_dict +from typing import Any, BinaryIO, Dict, List, Optional, TextIO, Union + +import caosadvancedtools.table_json_conversion.xlsx_utils as xlsx_utils +from . import fill_xlsx +from .fill_xlsx import read_or_dict +from openpyxl import load_workbook, Workbook +from openpyxl.worksheet.worksheet import Worksheet class XLSXConverter: - def __init__(self, schema: Union[dict, str, TextIO]): - """ + """Class for conversion from XLSX to JSON. + +For a detailed description of the required formatting of the XLSX files, see ``specs.md`` in the +documentation. + """ + def __init__(self, xlsx: Union[str, BinaryIO], schema: Union[dict, str, TextIO]): + """ Parameters ---------- +xlsx: Union[str, BinaryIO] + Path to the XLSX file or opened file object. + schema: Union[dict, str, TextIO] - If given, validate the date against this schema first. This raises an exception if the validation - fails. May be given as dict with the schema, path to a json file as string or as a readable file - like object. - """ + Schema for validation of XLSX content. +""" + self._workbook = load_workbook(xlsx) self._schema = read_or_dict(schema) + self._handled_sheets = set() + self._result = {} + + def to_dict(self) -> dict: + """Convert the xlsx contents to a dict. + +Returns +------- +out: dict + A dict representing the JSON with the extracted data. + """ + self._handled_sheets = set() + self._result = {} + for sheetname in self._workbook.sheetnames: + if sheetname not in self._handled_sheets: + self._handle_sheet(self._workbook.get_sheet_by_name(sheetname)) + + def _handle_sheet(self, sheet: Worksheet) -> None: + """Add the contents of the sheet to the result. + """ + path_rows = xlsx_utils.get_path_rows(sheet) + row_type_column = xlsx_utils.get_row_type_column_index(sheet) + foreign_columns = xlsx_utils.get_foreign_key_columns(sheet) + # Parent element: longest common path shared among any foreign column and all the data columns + parent = xlsx_utils.get_parent_path(sheet) + # from IPython import embed + # embed() + + +def to_dict(xlsx: Union[str, BinaryIO], schema: Union[dict, str, TextIO]) -> dict: + """Convert the xlsx contents to a dict, it must follow a schema. + +Parameters +---------- +xlsx: Union[str, BinaryIO] + Path to the XLSX file or opened file object. + +schema: Union[dict, str, TextIO] + Schema for validation of XLSX content. + +Returns +------- +out: dict + A dict representing the JSON with the extracted data. + """ + converter = XLSXConverter(xlsx, schema) + return converter.to_dict() diff --git a/unittests/table_json_conversion/test_read_data.py b/unittests/table_json_conversion/test_read_data.py new file mode 100644 index 00000000..1b1b49d0 --- /dev/null +++ b/unittests/table_json_conversion/test_read_data.py @@ -0,0 +1,68 @@ +# encoding: utf-8 +# +# This file is a part of the LinkAhead Project. +# +# Copyright (C) 2024 Indiscale GmbH <info@indiscale.com> +# Copyright (C) 2024 Daniel Hornung <d.hornung@indiscale.com> +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <https://www.gnu.org/licenses/>. + +import json +import os +import re +import tempfile + +import jsonschema.exceptions as schema_exc +import pytest +import caosadvancedtools.table_json_conversion.convert as convert +from openpyxl import load_workbook + +from .utils import compare_workbooks + + +def rfp(*pathcomponents): + """Return full path, a shorthand convenience function. + """ + return os.path.join(os.path.dirname(__file__), *pathcomponents) + + +def fill_and_compare(json_file: str, template_file: str, known_good: str, + schema: str = None, custom_output: str = None): + """Fill the data into a template and compare to a known good. + +Parameters: +----------- +schema: str, optional, + Json schema to validate against. +custom_output: str, optional + If given, write to this file and drop into an IPython shell. For development only. + """ + with tempfile.TemporaryDirectory() as tmpdir: + outfile = os.path.join(tmpdir, 'test.xlsx') + assert not os.path.exists(outfile) + if custom_output is not None: + outfile = custom_output + # fill_template(data=json_file, template=template_file, result=outfile, + # validation_schema=schema) + assert os.path.exists(outfile) + generated = load_workbook(outfile) # workbook can be read + known_good_wb = load_workbook(known_good) + compare_workbooks(generated, known_good_wb) + + +def test_simple(): + # convert.to_dict(xlsx=rfp("data/simple_data.xlsx"), schema=rfp("data/simple_schema.json")) + convert.to_dict(xlsx=rfp("data/multiple_refs_data.xlsx"), schema=rfp("data/multiple_refs_schema.json")) + # conv = XLSXConverter(schema=rfp("data/simple_schema.json")) + # result = conv.to_dict(rfp("data/simple_template.xlsx")) -- GitLab