# NOTE(review): reconstructed from a whitespace-collapsed diff of
# src/caosadvancedtools/converter/import_labfolder.py.  Only the lines shown
# in the diff hunks were available; the beginning of the license header and
# any imports between the two hunks are not visible here.
#
# along with this program. If not, see <https://www.gnu.org/licenses/>.
#

""" Imports labfolder exports """

import argparse
import os
# ``sys`` is used by the ``__main__`` guard below; its import is not inside
# the visible hunks, so it is stated here explicitly.
import sys

from bs4 import BeautifulSoup

import caosdb as db

# TODO(review): disabled in the original -- crawler = Crawler()


def _find_by_class(node, css_class):
    """Return all descendants of ``node`` that carry the class ``css_class``."""

    return node.find_all(attrs={'class': css_class})


def create_project(project):
    """Create the ``Project`` record for a labfolder project element.

    Parameters
    ----------
    project
        Parsed HTML element whose ``attrs`` contain ``data-name`` and
        ``data-id`` (presumably a ``bs4`` tag -- only ``.attrs`` is used).

    Returns
    -------
    db.Record
        Record named after ``data-name`` with parent ``Project`` and a
        ``projectId`` property holding ``data-id``.

    Raises
    ------
    KeyError
        If one of the two attributes is missing.
    """
    dbproject = db.Record(name=project.attrs['data-name'])
    dbproject.add_parent(name="Project")
    dbproject.add_property(name="projectId", value=project.attrs['data-id'])
    # TODO(review): disabled in the original --
    # crawler.cached_find_identifiables([dbproject])

    return dbproject


def get_author_from_entry(entry):
    """Create a ``Person`` record from the author block of an entry.

    The first element with class ``author_name`` is searched for elements
    with the classes ``author_firstname`` and ``author_lastname``; their
    text becomes the ``firstname`` and ``lastname`` properties.

    Parameters
    ----------
    entry
        Parsed HTML element of one labbook entry.

    Returns
    -------
    db.Record
        Record with parent ``Person``.

    Raises
    ------
    IndexError
        If the entry has no ``author_name`` element or one of the two name
        parts is missing.
    """
    person = db.Record()
    person.add_parent(name="Person")
    author = _find_by_class(entry, 'author_name')[0]

    for name in ["firstname", "lastname"]:
        name_part = _find_by_class(author, 'author_' + name)[0]
        person.add_property(name=name, value=name_part.getText())
    # TODO(review): disabled in the original --
    # crawler.cached_find_identifiables([person])

    return person


def val_or_none(stuff):
    """Return the text of the first element of ``stuff``, or ``None``.

    Parameters
    ----------
    stuff
        Sequence of elements offering ``getText()`` (e.g. the result of
        ``find_all``); may be empty.
    """

    if not stuff:
        return None

    return stuff[0].getText()


def add_property_from_data_element(dbrecord, element):
    """Add one labfolder data element as a property of ``dbrecord``.

    The property name is the text of the ``element-title`` child; when an
    ``element-quantity`` child exists its text is appended as
    ``<title>-<quantity>``.  Value and unit are taken from the
    ``element-value`` and ``element-unit`` children.  Every part that is
    absent is passed on as ``None``.

    Parameters
    ----------
    dbrecord : db.Record
        Record that receives the property.
    element
        Parsed HTML element of one data element.
    """
    unit = val_or_none(_find_by_class(element, 'element-unit'))
    title = val_or_none(_find_by_class(element, 'element-title'))
    quant = val_or_none(_find_by_class(element, 'element-quantity'))
    val = val_or_none(_find_by_class(element, 'element-value'))

    if quant is not None:
        # The original concatenated unconditionally, which raised a
        # TypeError for an element that has a quantity but no title.
        title = quant if title is None else title + "-" + quant
    dbrecord.add_property(name=title, value=val, unit=unit)


def create_entry(entry, dbproject):
    """Create a ``LabbookEntry`` record from one entry and print it.

    Each ``dd_entry_cell`` block of the entry is classified by the first
    matching child class and turned into a property:

    * ``redactor_editor``             -> ``textElement`` (text of the editor)
    * ``dd_entry_cell_file_download`` -> ``accompaningFile`` (``db.File``)
    * ``data-element-display``        -> one property per data element
    * ``table-el-container``          -> ``table`` (``db.File``)
    * ``dd_entry_empty_element``      -> nothing is added, a note is printed

    Blocks matching none of these are skipped silently.

    Parameters
    ----------
    entry
        Parsed HTML element of the entry; must have a ``data-id`` attribute.
    dbproject : db.Record
        Project record the entry refers to.
    """
    dbrecord = db.Record()
    dbrecord.add_parent(name="LabbookEntry")
    dbrecord.add_property(name="Project", value=dbproject)
    dbrecord.add_property(name="entryId", value=entry.attrs['data-id'])
    # TODO(review): disabled in the original --
    # crawler.cached_find_identifiables([dbrecord])

    person = get_author_from_entry(entry)
    dbrecord.add_property(name="responsible", value=person)

    for block in _find_by_class(entry, 'dd_entry_cell'):
        # If all text fields had the class dd_text_entry, the following
        # would be sufficient:
        # if 'dd_text_entry' in block['class']:
        # Instead we check whether an editor field exists.
        editor = _find_by_class(block, 'redactor_editor')

        if editor:
            dbrecord.add_property(name="textElement",
                                  value=editor[0].getText())

            continue

        download = _find_by_class(block, 'dd_entry_cell_file_download')

        if download:
            # The file name is stored on the parent of the download element.
            local_path = download[0].parent.attrs['data-filename']
            f = db.File(path=local_path, file=local_path)
            dbrecord.add_property(name="accompaningFile", value=f)

            continue

        elements = _find_by_class(block, 'data-element-display')

        if elements:
            for el in elements:
                add_property_from_data_element(dbrecord=dbrecord, element=el)

            continue

        tables = _find_by_class(block, 'table-el-container')

        if tables:
            filename = _find_by_class(tables[0], 'table-el-filename')[0]
            local_path = filename.getText().strip()
            f = db.File(path=local_path, file=local_path)
            dbrecord.add_property(name="table", value=f)

            continue

        empty = _find_by_class(block, 'dd_entry_empty_element')

        # The original tested ``tables`` here, which is always empty at this
        # point, so empty elements were never recognised.
        if empty:
            print("\n\nempty")

            continue

    # NOTE(review): the indentation of this statement was lost in the
    # collapsed source; printing once per entry is assumed -- confirm.
    print(dbrecord)


def main(args):
    """The main function.

    Reads the labfolder HTML export ``args.file``, creates the project
    record from the first element with id ``eln_project_content`` and one
    entry record for every descendant that has a ``data-id`` attribute.

    Parameters
    ----------
    args : argparse.Namespace
        Must provide ``file``, the path of the HTML export.

    Raises
    ------
    ValueError
        If ``args.file`` does not exist.
    IndexError
        If the document has no ``eln_project_content`` element.
    """

    if not os.path.exists(args.file):
        raise ValueError("File does not exist")

    with open(args.file) as inpu:
        text = inpu.read()

    tree = BeautifulSoup(text, features="lxml")
    project = tree.find_all(id="eln_project_content")[0]
    dbproject = create_project(project)

    for entry in project.find_all(lambda x: x.has_attr('data-id')):
        create_entry(entry, dbproject)

    # import IPython
    # IPython.embed()


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("file", default="./labfolder_example.html", nargs="?")
    args = parser.parse_args()
    sys.exit(main(args))