Skip to content
Snippets Groups Projects
Commit 9fce241f authored by Henrik tom Wörden's avatar Henrik tom Wörden
Browse files

ENH: basic labfolder parsing

parent 77e4e7f7
No related branches found
No related tags found
No related merge requests found
#!/usr/bin/env python3
#
# This file is a part of the CaosDB Project.
#
# Copyright (c) 2020 IndiScale GmbH
# Copyright (c) 2020 Daniel Hornung <d.hornung@indiscale.com>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
#
""" Imports labfolder export """
import argparse
import os
import re
import shutil
import subprocess
import sys
import tempfile
import time
import warnings
from io import BytesIO, StringIO
import requests
import yaml
from bs4 import BeautifulSoup
import caosdb as db
def handle_starttag(self, tag, attrs):
    """Inspect the attributes of a start tag and track labfolder entries.

    ``attrs`` is iterated as a sequence of indexable pairs whose first item
    is the attribute name and whose second item is its value.  ``tag`` is
    not used.

    Tags carrying a ``data-id`` attribute are reported on stdout as either a
    project or an entry; for entries a new ``db.Record`` is appended to
    ``self.records``.  Tags for which ``self.has_class`` reports the
    ``dd_entry_cell_content`` class add a ``text`` property to the most
    recently appended record.

    Raises ``Exception("multiple")`` if an attribute name occurs twice.
    """
    attributes = {}
    for pair in attrs:
        key = pair[0]
        # Duplicate attribute names are treated as a hard error.
        if key in attributes:
            raise Exception("multiple")
        attributes[key] = pair[1]

    if "data-id" in attributes:
        if not self.has_class(attributes, "eln_project_content"):
            print("Entry:", attributes["data-id"])
            new_record = db.Record()
            self.records.append(new_record)
        else:
            print("Project:", attributes["data-id"])

    if self.has_class(attributes, "dd_entry_cell_content"):
        # NOTE(review): this reads an *attribute* named like the CSS class
        # and relies on ``self.records`` being non-empty -- confirm intent.
        latest = self.records[-1]
        latest.add_property("text", attributes["dd_entry_cell_content"])
def has_class(ele, name):
    """Return whether ``name`` is one of the classes of ``ele``.

    The classes are read from ``ele.attrib["class"]`` and split on single
    spaces.  An element without a ``class`` attribute has no classes, so the
    result is ``False``.
    """
    attributes = ele.attrib
    if "class" in attributes:
        class_names = attributes["class"].split(" ")
        return name in class_names
    return False
def main(args):
    """Parse a labfolder HTML export and print a summary of its content.

    The file ``args.file`` is parsed with BeautifulSoup.  Within the first
    element whose id is ``eln_project_content``, every ``dd_entry_cell``
    block of every ``epb_content_container`` entry is classified by the
    first of the following that it contains, and reported on stdout:

    1. a ``redactor_editor`` element (text block),
    2. a ``dd_entry_cell_file_download`` element (file reference),
    3. ``data-element-display`` elements (data elements),
    4. a ``table-el-container`` element (table),
    5. a ``dd_entry_empty_element`` element (empty block).

    Blocks matching none of these have their attributes printed instead.

    ``args`` must provide a ``file`` attribute holding the path of the HTML
    file.  Raises ``ValueError`` if that file does not exist.
    """
    if not os.path.exists(args.file):
        raise ValueError("File does not exist")

    with open(args.file) as inpu:
        text = inpu.read()

    tree = BeautifulSoup(text, features="lxml")
    project = tree.find_all(id="eln_project_content")[0]

    for entry in project.find_all(attrs={'class': 'epb_content_container'}):
        for block in entry.find_all(attrs={'class': 'dd_entry_cell'}):
            # If all text field would have the class dd_text_entry the
            # following would be sufficient:
            # if 'dd_text_entry' in block['class']:
            # instead we check whether an editor field exists.
            editor = block.find_all(attrs={'class': 'redactor_editor'})
            if editor:
                print("\n\n## is text ##")
                print(editor[0].getText())
                continue

            download = block.find_all(
                attrs={'class': 'dd_entry_cell_file_download'})
            if download:
                # The file name is stored on the parent of the download
                # element.
                print("\n\nreferences file:\n",
                      (download[0].parent).attrs['data-filename'])
                continue

            elements = block.find_all(
                attrs={'class': 'data-element-display'})
            if elements:
                print("\n\nhas data elements:")
                for el in elements:
                    print(el.getText())
                continue

            tables = block.find_all(
                attrs={'class': 'table-el-container'})
            if tables:
                print("\n\ntable:\n",
                      (tables[0]).find_all(
                          attrs={'class': 'table-el-filename'}
                      )[0].getText().strip())
                continue

            empty = block.find_all(
                attrs={'class': 'dd_entry_empty_element'})
            # This must test ``empty``: ``tables`` is always empty here
            # because non-empty tables were already handled above.
            if empty:
                print("\n\nempty")
                continue

            # Unknown block type: dump its attributes for inspection.
            print(block.attrs)
            # import IPython
            # IPython.embed()
if __name__ == "__main__":
    # Command line entry point: a single optional positional argument names
    # the HTML file to parse; main()'s return value becomes the exit status.
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "file",
        nargs="?",
        default=("projects/My private projects_0/"
                 "118217_Example project/index.html"),
    )
    args = parser.parse_args()
    exit_status = main(args)
    sys.exit(exit_status)
This diff is collapsed.
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment