ENH: basic labfolder parsing

9fce241f · Henrik tom Wörden · 77e4e7f7 · 9fce241f · 9fce241f · 9fce241f
Commit 9fce241f authored 5 years ago by Henrik tom Wörden
--- a/src/caosadvancedtools/converter/__init__.py
+++ b/src/caosadvancedtools/converter/__init__.py
--- a/src/caosadvancedtools/converter/import_labfolder.py
+++ b/src/caosadvancedtools/converter/import_labfolder.py
+#!/usr/bin/env python3
+#
+# This file is a part of the CaosDB Project.
+#
+# Copyright (c) 2020 IndiScale GmbH
+# Copyright (c) 2020 Daniel Hornung <d.hornung@indiscale.com>
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <https://www.gnu.org/licenses/>.
+#
+""" Imports labfolder export """
+import argparse
+import os
+import re
+import shutil
+import subprocess
+import sys
+import tempfile
+import time
+import warnings
+from io import BytesIO, StringIO
+import requests
+import yaml
+from bs4 import BeautifulSoup
+import caosdb as db
+def handle_starttag(self, tag, attrs):
+    """Handle an opening tag: report project/entry ids and collect records.
+
+    ``attrs`` is iterated as a sequence of ``(name, value)`` pairs; a
+    repeated attribute name raises ``Exception("multiple")``.  A tag with a
+    ``data-id`` attribute is printed as a project when it has the class
+    ``eln_project_content``; otherwise it is printed as an entry and a new
+    ``db.Record`` is appended to ``self.records``.  For a tag with the class
+    ``dd_entry_cell_content`` a ``text`` property is added to the most
+    recently appended record.  ``tag`` is not used.
+    """
+    seen = {}
+    for pair in attrs:
+        key = pair[0]
+        if key in seen:
+            raise Exception("multiple")
+        seen[key] = pair[1]
+    if "data-id" in seen:
+        is_project = self.has_class(seen, "eln_project_content")
+        label = "Project:" if is_project else "Entry:"
+        print(label, seen["data-id"])
+        if not is_project:
+            # Every non-project element with a data-id starts a new record.
+            new_record = db.Record()
+            self.records.append(new_record)
+    if not self.has_class(seen, "dd_entry_cell_content"):
+        return
+    # NOTE(review): the value is read from an *attribute* named
+    # "dd_entry_cell_content", although that string is used as a class name
+    # just above -- confirm which value was intended here.
+    self.records[-1].add_property("text",
+                                  seen["dd_entry_cell_content"])
+def has_class(ele, name):
+    """Return whether ``name`` is one of the classes of ``ele``.
+
+    Parameters
+    ----------
+    ele : object
+        Element exposing its attributes as the mapping ``ele.attrib``.
+    name : str
+        Class name to look for.
+
+    Returns
+    -------
+    bool
+        True if the whitespace-separated ``class`` attribute of ``ele``
+        contains ``name``; False if it does not or if ``ele`` has no
+        ``class`` attribute.
+    """
+    if "class" not in ele.attrib:
+        return False
+    # Split on any run of whitespace: class names may be separated by tabs,
+    # newlines or repeated spaces, which ``split(" ")`` does not tokenize
+    # correctly (it would also make the empty string look like a class).
+    return name in ele.attrib["class"].split()
+def main(args):
+    """Parse a labfolder HTML export and print a summary of each entry block.
+
+    The file named by ``args.file`` is parsed with BeautifulSoup.  Within the
+    first element whose id is ``eln_project_content``, every
+    ``epb_content_container`` entry is searched for ``dd_entry_cell`` blocks.
+    Each block is classified by the first marker class found inside it (text
+    editor, file download, data elements, table, empty element) and a short
+    description is printed; for blocks matching none of these the block's
+    attributes are printed instead.
+
+    Parameters
+    ----------
+    args : argparse.Namespace
+        Must provide ``file``, the path of the HTML file to read.
+
+    Raises
+    ------
+    ValueError
+        If ``args.file`` does not exist.
+    IndexError
+        If the document contains no element with id ``eln_project_content``
+        or a table container has no ``table-el-filename`` element.
+    """
+    if not os.path.exists(args.file):
+        raise ValueError("File does not exist")
+    with open(args.file) as inpu:
+        text = inpu.read()
+    tree = BeautifulSoup(text, features="lxml")
+    project = tree.find_all(id="eln_project_content")[0]
+    for entry in project.find_all(attrs={'class': 'epb_content_container'}):
+        for block in entry.find_all(attrs={'class': 'dd_entry_cell'}):
+            # If every text field had the class dd_text_entry, checking
+            # ``'dd_text_entry' in block['class']`` would be sufficient.
+            # Instead we check whether an editor field exists.
+            editor = block.find_all(attrs={'class': 'redactor_editor'})
+            if editor:
+                print("\n\n## is text ##")
+                print(editor[0].getText())
+                continue
+            download = block.find_all(
+                attrs={'class': 'dd_entry_cell_file_download'})
+            if download:
+                # The file name is stored on the parent of the download
+                # element.
+                print("\n\nreferences file:\n",
+                      (download[0].parent).attrs['data-filename'])
+                continue
+            elements = block.find_all(
+                attrs={'class': 'data-element-display'})
+            if elements:
+                print("\n\nhas data elements:")
+                for el in elements:
+                    print(el.getText())
+                continue
+            tables = block.find_all(
+                attrs={'class': 'table-el-container'})
+            if tables:
+                print("\n\ntable:\n",
+                      (tables[0]).find_all(
+                          attrs={'class': 'table-el-filename'}
+                      )[0].getText().strip())
+                continue
+            empty = block.find_all(
+                attrs={'class': 'dd_entry_empty_element'})
+            # Test ``empty`` here: testing ``tables`` again could never
+            # succeed, because non-empty ``tables`` already hit ``continue``.
+            if empty:
+                print("\n\nempty")
+                continue
+            # Unrecognized block type: show its attributes for inspection.
+            print(block.attrs)
+if __name__ == "__main__":
+    # Script entry point: the single positional ``file`` argument is
+    # optional and falls back to the default path below when omitted.
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "file",
+        nargs="?",
+        default=("projects/My private projects_0/"
+                 "118217_Example project/index.html"),
+    )
+    args = parser.parse_args()
+    # The return value of main() is passed on as the process exit status.
+    sys.exit(main(args))
--- a/src/caosadvancedtools/converter/labfolder_example.html
+++ b/src/caosadvancedtools/converter/labfolder_example.html