Skip to content
Snippets Groups Projects
Commit 32f2070b authored by Henrik tom Wörden
Browse files

ENH: basic conversion labfolder project to records

parent 9fce241f
No related branches found
No related tags found
No related merge requests found
...@@ -19,7 +19,7 @@ ...@@ -19,7 +19,7 @@
# along with this program. If not, see <https://www.gnu.org/licenses/>. # along with this program. If not, see <https://www.gnu.org/licenses/>.
# #
""" Imports labfolder export """ """ Imports labfolder exports """
import argparse import argparse
import os import os
...@@ -38,101 +38,130 @@ from bs4 import BeautifulSoup ...@@ -38,101 +38,130 @@ from bs4 import BeautifulSoup
import caosdb as db import caosdb as db
#crawler = Crawler()
def handle_starttag(self, tag, attrs):
att_dict = {}
for attr in attrs: def create_project(project):
if attr[0] in att_dict: dbproject = db.Record(name=project.attrs['data-name'])
raise Exception("multiple") dbproject.add_parent(name="Project")
att_dict[attr[0]] = attr[1] dbproject.add_property(name="projectId", value=project.attrs['data-id'])
# crawler.cached_find_identifiables([dbproject])
if "data-id" in att_dict: return dbproject
if self.has_class(att_dict, "eln_project_content"):
print("Project:", att_dict["data-id"])
else:
print("Entry:", att_dict["data-id"])
rec = db.Record()
self.records.append(rec)
if self.has_class(att_dict, "dd_entry_cell_content"):
self.records[-1].add_property("text",
att_dict["dd_entry_cell_content"])
def get_author_from_entry(entry):
person = db.Record()
person.add_parent(name="Person")
resp = entry.find_all(attrs={'class': 'author_name'})
def has_class(ele, name): for name in ["firstname", "lastname"]:
if "class" not in ele.attrib: person.add_property(
return False name=name,
value=resp[0].find_all(attrs={'class': 'author_'+name})[0].getText())
# crawler.cached_find_identifiables([person])
return name in ele.attrib["class"].split(" ") return person
def main(args): def val_or_none(stuff):
"""The main function.""" if len(stuff) == 0:
return None
else:
return stuff[0].getText()
if not os.path.exists(args.file):
raise ValueError("File does not exist")
with open(args.file) as inpu: def add_property_from_data_element(dbrecord, element):
text = inpu.read() unit = val_or_none(element.find_all(attrs={'class': 'element-unit'}))
title = val_or_none(element.find_all(attrs={'class': 'element-title'}))
quant = val_or_none(element.find_all(attrs={'class': 'element-quantity'}))
val = val_or_none(element.find_all(attrs={'class': 'element-value'}))
tree = BeautifulSoup(text, features="lxml") if quant is not None:
project = tree.find_all(id="eln_project_content")[0] title = title+"-"+quant
dbrecord.add_property(name=title, value=val, unit=unit)
def create_entry(entry, dbproject):
dbrecord = db.Record()
dbrecord.add_parent(name="LabbookEntry")
dbrecord.add_property(name="Project", value=dbproject)
dbrecord.add_property(name="entryId", value=entry.attrs['data-id'])
# crawler.cached_find_identifiables([dbrecord])
person = get_author_from_entry(entry)
dbrecord.add_property(name="responsible", value=person)
for block in entry.find_all(attrs={'class': 'dd_entry_cell'}):
# If all text field would have the class dd_text_entry the
# following would be sufficient:
# if 'dd_text_entry' in block['class']:
# instead we check whether an editor field exists.
editor = block.find_all(attrs={'class': 'redactor_editor'})
for entry in project.find_all(attrs={'class': 'epb_content_container'}): if len(editor) > 0:
for block in entry.find_all(attrs={'class': 'dd_entry_cell'}): dbrecord.add_property(name="textElement", value=editor[0].getText())
# If all text field would have the class dd_text_entry the
# following would be sufficient:
# if 'dd_text_entry' in block['class']:
# instead we check whether an editor field exists.
editor = block.find_all(attrs={'class': 'redactor_editor'})
if len(editor) > 0: continue
print("\n\n## is text ##")
print(editor[0].getText())
continue download = block.find_all(
attrs={'class': 'dd_entry_cell_file_download'})
download = block.find_all( if len(download) > 0:
attrs={'class': 'dd_entry_cell_file_download'}) local_path = (download[0].parent).attrs['data-filename']
f = db.File(path=local_path,
file=local_path)
dbrecord.add_property(name="accompaningFile", value=f)
if len(download) > 0: continue
print("\n\nreferences file:\n",
(download[0].parent).attrs['data-filename'])
continue elements = block.find_all(
attrs={'class': 'data-element-display'})
elements = block.find_all( if len(elements) > 0:
attrs={'class': 'data-element-display'}) for el in elements:
add_property_from_data_element(dbrecord=dbrecord, element=el)
if len(elements) > 0: continue
print("\n\nhas data elements:")
for el in elements: tables = block.find_all(
print(el.getText()) attrs={'class': 'table-el-container'})
continue if len(tables) > 0:
local_path = (tables[0]).find_all(
attrs={'class': 'table-el-filename'}
)[0].getText().strip()
f = db.File(path=local_path, file=local_path)
dbrecord.add_property(name="table", value=f)
tables = block.find_all( continue
attrs={'class': 'table-el-container'})
if len(tables) > 0: empty = block.find_all(
print("\n\ntable:\n", attrs={'class': 'dd_entry_empty_element'})
(tables[0]).find_all(
attrs={'class': 'table-el-filename'}
)[0].getText().strip())
continue if len(tables) > 0:
print("\n\nempty")
empty = block.find_all( continue
attrs={'class': 'dd_entry_empty_element'})
if len(tables) > 0: print(dbrecord)
print("\n\nempty")
continue
print(block.attrs) def main(args):
"""The main function."""
if not os.path.exists(args.file):
raise ValueError("File does not exist")
with open(args.file) as inpu:
text = inpu.read()
tree = BeautifulSoup(text, features="lxml")
project = tree.find_all(id="eln_project_content")[0]
dbproject = create_project(project)
for entry in project.find_all(lambda x: x.has_attr('data-id')):
create_entry(entry, dbproject)
# import IPython # import IPython
# IPython.embed() # IPython.embed()
...@@ -140,7 +169,6 @@ def main(args): ...@@ -140,7 +169,6 @@ def main(args):
if __name__ == "__main__": if __name__ == "__main__":
parser = argparse.ArgumentParser() parser = argparse.ArgumentParser()
parser.add_argument("file", default="projects/My private projects_0/" parser.add_argument("file", default="./labfolder_example.html", nargs="?")
"118217_Example project/index.html", nargs="?")
args = parser.parse_args() args = parser.parse_args()
sys.exit(main(args)) sys.exit(main(args))
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment