Skip to content
Snippets Groups Projects
Verified commit 2f20561b, authored by Daniel Hornung
Browse files

WIP: XLSX reader.

parent 4f616ada
No related branches found
No related tags found
2 merge requests: !107 "Release v0.11.0", !102 "ENH: XLSX reader"
Pipeline #50031 passed with warnings
...@@ -4,7 +4,6 @@ ...@@ -4,7 +4,6 @@
# This file is a part of the LinkAhead Project. # This file is a part of the LinkAhead Project.
# #
# Copyright (C) 2024 Indiscale GmbH <info@indiscale.com> # Copyright (C) 2024 Indiscale GmbH <info@indiscale.com>
# Copyright (C) 2024 Henrik tom Wörden <h.tomwoerden@indiscale.com>
# Copyright (C) 2024 Daniel Hornung <d.hornung@indiscale.com> # Copyright (C) 2024 Daniel Hornung <d.hornung@indiscale.com>
# #
# This program is free software: you can redistribute it and/or modify # This program is free software: you can redistribute it and/or modify
...@@ -20,18 +19,78 @@ ...@@ -20,18 +19,78 @@
# You should have received a copy of the GNU Affero General Public License # You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>. # along with this program. If not, see <https://www.gnu.org/licenses/>.
from fill_xlsx import read_or_dict from typing import Any, BinaryIO, Dict, List, Optional, TextIO, Union
import caosadvancedtools.table_json_conversion.xlsx_utils as xlsx_utils
from . import fill_xlsx
from .fill_xlsx import read_or_dict
from openpyxl import load_workbook, Workbook
from openpyxl.worksheet.worksheet import Worksheet
class XLSXConverter:
    """Class for conversion from XLSX to JSON.

    For a detailed description of the required formatting of the XLSX files, see ``specs.md`` in the
    documentation.
    """

    def __init__(self, xlsx: Union[str, BinaryIO], schema: Union[dict, str, TextIO]):
        """Load the workbook and the schema.

        Parameters
        ----------
        xlsx: Union[str, BinaryIO]
          Path to the XLSX file or opened file object.

        schema: Union[dict, str, TextIO]
          Schema for validation of XLSX content.
        """
        self._workbook = load_workbook(xlsx)
        self._schema = read_or_dict(schema)
        self._handled_sheets: set = set()
        self._result: dict = {}

    def to_dict(self) -> dict:
        """Convert the xlsx contents to a dict.

        The internal state (handled sheets and result) is reset at the start of every call.

        Returns
        -------
        out: dict
          A dict representing the JSON with the extracted data.
        """
        self._handled_sheets = set()
        self._result = {}
        for sheetname in self._workbook.sheetnames:
            # Checked on every iteration because the set is instance state which sheet handling
            # may extend while the loop is running.
            if sheetname not in self._handled_sheets:
                # Item access replaces the deprecated ``Workbook.get_sheet_by_name``.
                self._handle_sheet(self._workbook[sheetname])
        return self._result

    def _handle_sheet(self, sheet: "Worksheet") -> None:
        """Add the contents of the sheet to the result.

        NOTE(review): work in progress.  The layout information is collected below, but nothing is
        written to ``self._result`` yet and the sheet is not recorded in ``self._handled_sheets``.

        Parameters
        ----------
        sheet: Worksheet
          The worksheet to be handled.
        """
        path_rows = xlsx_utils.get_path_rows(sheet)
        row_type_column = xlsx_utils.get_row_type_column_index(sheet)
        foreign_columns = xlsx_utils.get_foreign_key_columns(sheet)
        # Parent element: longest common path shared among any foreign column and all the data columns
        parent = xlsx_utils.get_parent_path(sheet)
def to_dict(xlsx: Union[str, BinaryIO], schema: Union[dict, str, TextIO]) -> dict:
    """Read an XLSX file and return its contents as a dict; the file must follow ``schema``.

    Convenience wrapper: creates an ``XLSXConverter`` for the given arguments and returns the
    result of its ``to_dict`` method.

    Parameters
    ----------
    xlsx: Union[str, BinaryIO]
      Path to the XLSX file or opened file object.

    schema: Union[dict, str, TextIO]
      Schema for validation of XLSX content.

    Returns
    -------
    out: dict
      A dict representing the JSON with the extracted data.
    """
    return XLSXConverter(xlsx, schema).to_dict()
# encoding: utf-8
#
# This file is a part of the LinkAhead Project.
#
# Copyright (C) 2024 Indiscale GmbH <info@indiscale.com>
# Copyright (C) 2024 Daniel Hornung <d.hornung@indiscale.com>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
import json
import os
import re
import tempfile
import jsonschema.exceptions as schema_exc
import pytest
import caosadvancedtools.table_json_conversion.convert as convert
from openpyxl import load_workbook
from .utils import compare_workbooks
def rfp(*pathcomponents):
    """Shorthand helper: join ``pathcomponents`` onto the directory which contains this file.
    """
    here = os.path.dirname(__file__)
    return os.path.join(here, *pathcomponents)
def fill_and_compare(json_file: str, template_file: str, known_good: str,
                     schema: str = None, custom_output: str = None):
    """Compare a generated workbook against a known good workbook.

    NOTE(review): the step which fills the template is currently commented out, so ``json_file``,
    ``template_file`` and ``schema`` are unused and the existence check below can only succeed if
    ``custom_output`` points to a file which already exists.

    Parameters
    ----------
    json_file: str
      Data file for the (disabled) fill step.
    template_file: str
      Template file for the (disabled) fill step.
    known_good: str
      Path of the workbook to compare against.
    schema: str, optional
      Json schema to validate against.
    custom_output: str, optional
      If given, use this file instead of a temporary one.  For development only.
    """
    with tempfile.TemporaryDirectory() as workdir:
        default_target = os.path.join(workdir, 'test.xlsx')
        assert not os.path.exists(default_target)
        target = default_target if custom_output is None else custom_output
        # Disabled fill step, kept for reference:
        # fill_template(data=json_file, template=template_file, result=target,
        #               validation_schema=schema)
        assert os.path.exists(target)
        produced = load_workbook(target)  # workbook can be read
    reference = load_workbook(known_good)
    compare_workbooks(produced, reference)
def test_simple():
    """Smoke test: convert the ``multiple_refs`` example workbook using its schema."""
    # TODO: the ``simple_data.xlsx`` / ``simple_schema.json`` pair is not exercised yet, and the
    # returned dict is not checked.
    workbook_path = rfp("data/multiple_refs_data.xlsx")
    schema_path = rfp("data/multiple_refs_schema.json")
    convert.to_dict(xlsx=workbook_path, schema=schema_path)
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment