Skip to content
Snippets Groups Projects
Verified Commit 2f20561b authored by Daniel Hornung's avatar Daniel Hornung
Browse files

WIP: XLSX reader.

parent 4f616ada
No related branches found
No related tags found
2 merge requests: !107 "Release v0.11.0", !102 "ENH: XLSX reader"
Pipeline #50031 passed with warnings
......@@ -4,7 +4,6 @@
# This file is a part of the LinkAhead Project.
#
# Copyright (C) 2024 Indiscale GmbH <info@indiscale.com>
# Copyright (C) 2024 Henrik tom Wörden <h.tomwoerden@indiscale.com>
# Copyright (C) 2024 Daniel Hornung <d.hornung@indiscale.com>
#
# This program is free software: you can redistribute it and/or modify
......@@ -20,18 +19,78 @@
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
from fill_xlsx import read_or_dict
from typing import Any, BinaryIO, Dict, List, Optional, TextIO, Union
import caosadvancedtools.table_json_conversion.xlsx_utils as xlsx_utils
from . import fill_xlsx
from .fill_xlsx import read_or_dict
from openpyxl import load_workbook, Workbook
from openpyxl.worksheet.worksheet import Worksheet
class XLSXConverter:
    """Class for conversion from XLSX to JSON.

    For a detailed description of the required formatting of the XLSX files, see ``specs.md`` in the
    documentation.
    """

    def __init__(self, xlsx: Union[str, BinaryIO], schema: Union[dict, str, TextIO]):
        """
        Parameters
        ----------
        xlsx: Union[str, BinaryIO]
          Path to the XLSX file or opened file object.

        schema: Union[dict, str, TextIO]
          Schema for validation of XLSX content.
        """
        self._workbook = load_workbook(xlsx)
        self._schema = read_or_dict(schema)
        # Both attributes are reset at the start of every ``to_dict()`` call.
        self._handled_sheets: set = set()
        self._result: dict = {}

    def to_dict(self) -> dict:
        """Convert the xlsx contents to a dict.

        Returns
        -------
        out: dict
          A dict representing the JSON with the extracted data.
        """
        # Start from a clean state so that repeated calls do not accumulate old data.
        self._handled_sheets = set()
        self._result = {}
        for sheetname in self._workbook.sheetnames:
            if sheetname not in self._handled_sheets:
                # Item access replaces the deprecated ``Workbook.get_sheet_by_name``.
                self._handle_sheet(self._workbook[sheetname])
        return self._result

    def _handle_sheet(self, sheet: "Worksheet") -> None:
        """Add the contents of the sheet to the result.

        NOTE: Work in progress.  The sheet metadata is looked up here, but nothing is written to
        ``self._result`` or ``self._handled_sheets`` yet.

        Parameters
        ----------
        sheet: Worksheet
          The worksheet to be evaluated.
        """
        path_rows = xlsx_utils.get_path_rows(sheet)
        row_type_column = xlsx_utils.get_row_type_column_index(sheet)
        foreign_columns = xlsx_utils.get_foreign_key_columns(sheet)
        # Parent element: longest common path shared among any foreign column and all the data columns
        parent = xlsx_utils.get_parent_path(sheet)
        # TODO: Use the values collected above to fill ``self._result``.
def to_dict(xlsx: Union[str, BinaryIO], schema: Union[dict, str, TextIO]) -> dict:
    """Convenience wrapper: convert an XLSX file, which must follow a schema, to a dict.

    This builds an ``XLSXConverter`` from the arguments and returns the result of its
    ``to_dict()`` method.

    Parameters
    ----------
    xlsx: Union[str, BinaryIO]
      Path to the XLSX file or opened file object.

    schema: Union[dict, str, TextIO]
      Schema for validation of XLSX content.

    Returns
    -------
    out: dict
      A dict representing the JSON with the extracted data.
    """
    return XLSXConverter(xlsx, schema).to_dict()
# encoding: utf-8
#
# This file is a part of the LinkAhead Project.
#
# Copyright (C) 2024 Indiscale GmbH <info@indiscale.com>
# Copyright (C) 2024 Daniel Hornung <d.hornung@indiscale.com>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
import json
import os
import re
import tempfile
import jsonschema.exceptions as schema_exc
import pytest
import caosadvancedtools.table_json_conversion.convert as convert
from openpyxl import load_workbook
from .utils import compare_workbooks
def rfp(*pathcomponents):
    """Shorthand: join ``pathcomponents`` onto the directory which contains this file.

    Returns
    -------
    out: str
      The directory of this file, extended by the given path components.
    """
    here = os.path.dirname(__file__)
    return os.path.join(here, *pathcomponents)
def fill_and_compare(json_file: str, template_file: str, known_good: str,
                     schema: str = None, custom_output: str = None):
    """Load the output workbook and compare it to a known good workbook.

    NOTE: The step which fills the template is currently commented out, so this function itself
    does not write anything to the output path.

    Parameters
    ----------
    json_file: str
      Only referenced by the disabled fill step.
    template_file: str
      Only referenced by the disabled fill step.
    known_good: str
      Path of the workbook to compare against.
    schema: str, optional
      Only referenced by the disabled fill step.
    custom_output: str, optional
      If given, this path is used as output file instead of a file in a temporary directory.
    """
    with tempfile.TemporaryDirectory() as workdir:
        default_target = os.path.join(workdir, 'test.xlsx')
        # The temporary directory is fresh, so the default target must not exist yet.
        assert not os.path.exists(default_target)
        target = default_target if custom_output is None else custom_output
        # Disabled fill step:
        # fill_template(data=json_file, template=template_file, result=target,
        #               validation_schema=schema)
        assert os.path.exists(target)
        produced_wb = load_workbook(target)  # workbook can be read
        reference_wb = load_workbook(known_good)
        compare_workbooks(produced_wb, reference_wb)
def test_simple():
    """Smoke test: the conversion of the ``multiple_refs`` example runs without raising."""
    # Alternative data set: "data/simple_data.xlsx" with "data/simple_schema.json".
    xlsx_path = rfp("data/multiple_refs_data.xlsx")
    schema_path = rfp("data/multiple_refs_schema.json")
    convert.to_dict(xlsx=xlsx_path, schema=schema_path)
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment