From 2f20561baf4cb972b9c3631ce112cedfec86ce60 Mon Sep 17 00:00:00 2001
From: Daniel <d.hornung@indiscale.com>
Date: Fri, 19 Apr 2024 11:27:56 +0200
Subject: [PATCH] WIP: XLSX reader.

---
 .../table_json_conversion/convert.py          | 75 +++++++++++++++++--
 .../table_json_conversion/test_read_data.py   | 68 +++++++++++++++++
 2 files changed, 135 insertions(+), 8 deletions(-)
 create mode 100644 unittests/table_json_conversion/test_read_data.py

diff --git a/src/caosadvancedtools/table_json_conversion/convert.py b/src/caosadvancedtools/table_json_conversion/convert.py
index 344c7ad1..a19f10fb 100644
--- a/src/caosadvancedtools/table_json_conversion/convert.py
+++ b/src/caosadvancedtools/table_json_conversion/convert.py
@@ -4,7 +4,6 @@
 # This file is a part of the LinkAhead Project.
 #
 # Copyright (C) 2024 Indiscale GmbH <info@indiscale.com>
-# Copyright (C) 2024 Henrik tom Wörden <h.tomwoerden@indiscale.com>
 # Copyright (C) 2024 Daniel Hornung <d.hornung@indiscale.com>
 #
 # This program is free software: you can redistribute it and/or modify
@@ -20,18 +19,78 @@
 # You should have received a copy of the GNU Affero General Public License
 # along with this program. If not, see <https://www.gnu.org/licenses/>.
 
-from fill_xlsx import read_or_dict
+from typing import Any, BinaryIO, Dict, List, Optional, TextIO, Union
+
+import caosadvancedtools.table_json_conversion.xlsx_utils as xlsx_utils
+from . import fill_xlsx
+from .fill_xlsx import read_or_dict
+from openpyxl import load_workbook, Workbook
+from openpyxl.worksheet.worksheet import Worksheet
 
 
 class XLSXConverter:
-    def __init__(self, schema: Union[dict, str, TextIO]):
-        """
+    """Class for conversion from XLSX to JSON.
+
+For a detailed description of the required formatting of the XLSX files, see ``specs.md`` in the
+documentation.
+    """
 
+    def __init__(self, xlsx: Union[str, BinaryIO], schema: Union[dict, str, TextIO]):
+        """Load the workbook and read the schema.
+
 Parameters
 ----------
+xlsx: Union[str, BinaryIO]
+  Path to the XLSX file or opened file object.
+
 schema: Union[dict, str, TextIO]
-  If given, validate the date against this schema first.  This raises an exception if the validation
-  fails.  May be given as dict with the schema, path to a json file as string or as a readable file
-  like object.
-        """
+  Schema for validation of XLSX content.
+        """
+        self._workbook = load_workbook(xlsx)
         self._schema = read_or_dict(schema)
+        self._handled_sheets = set()
+        self._result = {}
+
+    def to_dict(self) -> dict:
+        """Convert the xlsx contents to a dict.
+
+Returns
+-------
+out: dict
+  A dict representing the JSON with the extracted data.
+        """
+        self._handled_sheets = set()
+        self._result = {}
+        for sheetname in self._workbook.sheetnames:
+            if sheetname not in self._handled_sheets:
+                self._handle_sheet(self._workbook[sheetname])
+        return self._result
+
+    def _handle_sheet(self, sheet: Worksheet) -> None:
+        """Add the contents of the sheet to the result (WIP: nothing is extracted yet)."""
+        path_rows = xlsx_utils.get_path_rows(sheet)
+        row_type_column = xlsx_utils.get_row_type_column_index(sheet)
+        foreign_columns = xlsx_utils.get_foreign_key_columns(sheet)
+        # Parent element: longest common path shared among any foreign column and all the data columns
+        parent = xlsx_utils.get_parent_path(sheet)
+        self._handled_sheets.add(sheet.title)
+
+
+def to_dict(xlsx: Union[str, BinaryIO], schema: Union[dict, str, TextIO]) -> dict:
+    """Read an XLSX file and return its contents as a dict, the file must follow a schema.
+
+Parameters
+----------
+xlsx: Union[str, BinaryIO]
+  The XLSX file, either as a path or as an opened file object.
+
+schema: Union[dict, str, TextIO]
+  Schema for validation of the XLSX content.
+
+Returns
+-------
+out: dict
+  The extracted data, as a dict which represents the JSON.  This is the return value of
+  ``XLSXConverter(xlsx, schema).to_dict()``.
+    """
+    return XLSXConverter(xlsx, schema).to_dict()
diff --git a/unittests/table_json_conversion/test_read_data.py b/unittests/table_json_conversion/test_read_data.py
new file mode 100644
index 00000000..1b1b49d0
--- /dev/null
+++ b/unittests/table_json_conversion/test_read_data.py
@@ -0,0 +1,68 @@
+# encoding: utf-8
+#
+# This file is a part of the LinkAhead Project.
+#
+# Copyright (C) 2024 Indiscale GmbH <info@indiscale.com>
+# Copyright (C) 2024 Daniel Hornung <d.hornung@indiscale.com>
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <https://www.gnu.org/licenses/>.
+
+import json
+import os
+import re
+import tempfile
+
+import jsonschema.exceptions as schema_exc
+import pytest
+import caosadvancedtools.table_json_conversion.convert as convert
+from openpyxl import load_workbook
+
+from .utils import compare_workbooks
+
+
+def rfp(*pathcomponents):
+    """Shorthand: join the path components onto the directory which contains this file."""
+    here = os.path.dirname(__file__)
+    return os.path.join(here, *pathcomponents)
+
+
+def fill_and_compare(json_file: str, template_file: str, known_good: str,
+                     schema: str = None, custom_output: str = None):
+    """Fill the data into a template and compare to a known good.
+
+Parameters
+----------
+schema: str, optional
+  Json schema to validate against.
+custom_output: str, optional
+  If given, use this path instead of the temporary output file.  For development only.
+    """
+    with tempfile.TemporaryDirectory() as tmpdir:
+        outfile = os.path.join(tmpdir, 'test.xlsx')
+        assert not os.path.exists(outfile)
+        if custom_output is not None:
+            outfile = custom_output
+        # fill_template(data=json_file, template=template_file, result=outfile,
+        #               validation_schema=schema)
+        assert os.path.exists(outfile)  # NOTE(review): fill is disabled, so nothing creates outfile
+        generated = load_workbook(outfile)  # workbook can be read
+    known_good_wb = load_workbook(known_good)
+    compare_workbooks(generated, known_good_wb)
+
+
+def test_simple():
+    """Smoke test: converting the ``multiple_refs`` example data must not raise."""
+    # convert.to_dict(xlsx=rfp("data/simple_data.xlsx"), schema=rfp("data/simple_schema.json"))
+    convert.to_dict(xlsx=rfp("data/multiple_refs_data.xlsx"),
+                    schema=rfp("data/multiple_refs_schema.json"))
-- 
GitLab