diff --git a/.docker/Dockerfile b/.docker/Dockerfile index 9e2185058e709dc2af0d9b9497fa8b11adc8f2b9..9d2c5aedb1dbab515acce88e7736ed3f6f5ec72f 100644 --- a/.docker/Dockerfile +++ b/.docker/Dockerfile @@ -11,8 +11,12 @@ RUN apt-get update && \ libxml2 \ -y COPY .docker/wait-for-it.sh /wait-for-it.sh +ADD https://gitlab.com/api/v4/projects/13656973/repository/branches/master \ + pylib_version.json RUN git clone https://gitlab.com/caosdb/caosdb-pylib.git && \ cd caosdb-pylib && pip3 install . +ADD https://gitlab.com/api/v4/projects/13656965/repository/branches/master \ + model_version.json RUN git clone https://gitlab.com/caosdb/caosdb-models.git && \ cd caosdb-models && pip3 install . ADD https://gitlab.com/api/v4/projects/13601752/repository/branches/master \ diff --git a/.gitignore b/.gitignore index 51c99625da09cb13797014cd255e40caf2aa85bf..c68adb8f5d3109980fa44fc9dc5fcd21d570d6df 100644 --- a/.gitignore +++ b/.gitignore @@ -3,3 +3,4 @@ __pycache__ .coverage cache.db *.egg-info +.docker/cert diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 81614024972d8c1dba0f6babcf5bd39f980df96c..4cf9b51f775fd5c6a4ed5a92f0861089518e6293 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -72,8 +72,8 @@ build-testenv: tags: [cached-dind] image: docker:18.09 stage: setup - only: - - schedules + #only: + #- schedules script: - df -h - docker login -u gitlab-ci-token -p $CI_JOB_TOKEN $CI_REGISTRY @@ -104,5 +104,5 @@ style: stage: style image: $CI_REGISTRY_IMAGE script: - - autopep8 -r --diff --exit-code . + - autopep8 -ar --diff --exit-code . allow_failure: true diff --git a/integrationtests/full_test/example_table.csv b/integrationtests/full_test/example_table.csv new file mode 100644 index 0000000000000000000000000000000000000000..5af49105d8b200bf21ddc177d77f830f1c118aab --- /dev/null +++ b/integrationtests/full_test/example_table.csv @@ -0,0 +1,3 @@ +firstName,lastName,email +Henrik,tom Wörden,henrik@indiscale.com +Max,Mustermann,max@mustermann.eu diff --git a/integrationtests/full_test/test.sh b/integrationtests/full_test/test.sh index f107b7a850a492aebb00e620b31a40f1ee95fd1a..f7e7191b7a26702ab0742ef8ab53ec4efe7518be 100755 --- a/integrationtests/full_test/test.sh +++ b/integrationtests/full_test/test.sh @@ -1,4 +1,8 @@ ls rm -rf cache.db +echo "Filling the database" ./filldb.sh +echo "Testing the crawler database" py.test-3 test_crawler.py +echo "Testing im and export" +python3 test_im_und_export.py diff --git a/integrationtests/full_test/test_im_und_export.py b/integrationtests/full_test/test_im_und_export.py new file mode 100644 index 0000000000000000000000000000000000000000..d6fe43ebd70cdc2663ff700730274510ed409fdc --- /dev/null +++ b/integrationtests/full_test/test_im_und_export.py @@ -0,0 +1,35 @@ +#!/usr/bin/env python3 +import os +import unittest +from tempfile import TemporaryDirectory + +import caosdb as db + +from caosadvancedtools.export_related import export +from caosadvancedtools.import_from_xml import import_xml + +if __name__ == "__main__": + print("Conducting im- and export tests") + rec = db.execute_query("FIND 2019-02-03_really_cool_finding", unique=True) + directory = TemporaryDirectory() + export(rec.id, directory=directory.name) + # delete everything + rec = db.execute_query("FIND record which was inserted by me") + prop = db.execute_query("FIND property which was inserted by me") + rt = db.execute_query("FIND recordtype which was inserted by me") + fi = db.execute_query("FIND file which was inserted by me") + c = db.Container() + c.extend(rec+prop+rt+fi) + c.delete() + assert 0 == len(db.execute_query("FIND File which is stored at " + "**/poster.pdf")) + import_xml(os.path.join(directory.name, "caosdb_data.xml"), interactive=False) + + # The following tests the existence of some required entities. + # However, this is not a full list. + db.execute_query("FIND 2019-02-03_really_cool_finding", unique=True) + db.execute_query("FIND RecordType Poster", unique=True) + db.execute_query("FIND RecordType Analysis", unique=True) + db.execute_query("FIND RecordType Person", unique=True) + db.execute_query("FIND Record Person with firstname=Only", unique=True) + db.execute_query("FIND File which is stored at **/poster.pdf", unique=True) diff --git a/integrationtests/single_tests/test_crawler.py b/integrationtests/single_tests/test_crawler.py index 1647b8ccc9a61e371a00c563f08fb36bb3bab979..9e5f4905f286ea3fe53d69c88adf4cd9b82c7690 100644 --- a/integrationtests/single_tests/test_crawler.py +++ b/integrationtests/single_tests/test_crawler.py @@ -85,7 +85,7 @@ class CrawlerTest(unittest.TestCase): for el in [self.exp, self.ana, self.pub, self.rts]: try: el.delete() - except: + except BaseException: pass @@ -104,5 +104,5 @@ class CrawlerTestExist(CrawlerTest): for el in [self.exp, self.ana, self.pub, self.rts]: try: el.delete() - except: + except BaseException: pass diff --git a/setup.py b/setup.py index 251bc5c1a579af8695974e432180e5f24eae93e4..117821aaa7e18e80aa3a2060bdd6995b5cdb2938 100755 --- a/setup.py +++ b/setup.py @@ -32,7 +32,7 @@ setup(name='caosadvancedtools', author_email='henrik.tom-woerden@ds.mpg.de', packages=find_packages('src'), package_dir={'': 'src'}, - install_requires=[], + install_requires=["caosdb>=0.3.0", "caosmodels>=0.1.0"], extras_require={}, tests_require=["pytest"], ) diff --git a/src/caosadvancedtools/converter/labfolder_api.py b/src/caosadvancedtools/converter/labfolder_api.py index de83fc87f15c90fb33cace786bfa34bdbccc9415..82b2f4f4a042ba3b9350c3f9a87121914f27e0e0 100644 --- a/src/caosadvancedtools/converter/labfolder_api.py +++ b/src/caosadvancedtools/converter/labfolder_api.py @@ -114,7 +114,7 @@ class Importer(object): try: element = self.connection.retrieve_element(element_id, el_type=el_type) - except: + except BaseException: print("Could not retrieve: ", element_id) return diff --git a/src/caosadvancedtools/export_related.py b/src/caosadvancedtools/export_related.py new file mode 100755 index 0000000000000000000000000000000000000000..47fe2f4900add818e940fa81466bb9c98a2f0223 --- /dev/null +++ b/src/caosadvancedtools/export_related.py @@ -0,0 +1,150 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +# +# ** header v3.0 +# This file is a part of the CaosDB Project. +# +# Copyright (C) 2020 IndiScale GmbH, Henrik tom Wörden +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <https://www.gnu.org/licenses/>. +# +# ** end header +# +""" +This file allows to create an xml representation of a complete dataset. +Using the given entity all related entities are collected and saved in a way +that the data can be imported in another CaosDB instance. + +Files that are smaller than 1MB are saved in a downloads folder and can be +imported along with the entities themselves. +""" +import argparse +import os + +import caosdb as db +from caosdb.apiutils import apply_to_ids, retrieve_entities_with_ids +from caosdb.common.datatype import get_id_of_datatype, is_reference +from lxml import etree + + +def get_ids_of_related_entities(entity): + """ returns a list of ids of entities that related to the given one. + + Related means in this context, that it is kind of necessary for the + representation of this entity: ids of properties and parents as well as the + ids of referenced entities. + """ + entities = [] + + for par in entity.parents: + entities.append(par.id) + + for prop in entity.properties: + entities.append(prop.id) + isref = is_reference(prop.datatype) + + if isref: + if isinstance(prop.value, list) and len(prop.value) > 0: + entities.extend([int(el) for el in prop.value]) + elif prop.value is not None: + entities.append(int(prop.value)) + + if prop.datatype not in [db.FILE, db.REFERENCE, db.LIST(db.FILE), + db.LIST(db.REFERENCE)]: + entities.append(get_id_of_datatype(prop.datatype)) + + return entities + + +def recursively_collect_related(entity): + """ collects all related entities. + Starting from a single entity the related entities are retrieved (see + get_ids_of_related_entities) and then the related entities of those are + retrieved and so forth. + This is usefull to create a collection of kind of related dataset + """ + all_entities = db.Container() + all_entities.append(entity) + ids = set([entity.id]) + new_entities = [entity] + + while new_entities: + new_ids = set() + + for ent in new_entities: + new_ids.update(get_ids_of_related_entities(ent)) + new_ids = new_ids - ids + new_entities = retrieve_entities_with_ids(list(new_ids)) + ids.update([e.id for e in new_entities]) + all_entities.extend(new_entities) + + return all_entities + + +def invert_ids(entities): + apply_to_ids(entities, lambda x: x*-1) + + +def export(rec_id, directory="."): + if not isinstance(rec_id, int): + raise ValueError("rec_id needs to be an integer") + ent = db.execute_query("FIND {}".format(rec_id), unique=True) + cont = recursively_collect_related(ent) + + directory = os.path.abspath(directory) + dl_dir = os.path.join(directory, "downloads") + + if not os.path.exists(dl_dir): + os.makedirs(dl_dir) + + for el in cont: + if isinstance(el, db.File) and el.size < 1e6: + target = os.path.join(dl_dir, el.path[1:]) + os.makedirs(os.path.dirname(target), exist_ok=True) + try: + el.download(target) + print("Downloaded:", target) + except BaseException: + print("Failed download of:", target) + + invert_ids(cont) + xml = etree.tounicode(cont.to_xml( + local_serialization=True), pretty_print=True) + + with open(os.path.join(directory, "caosdb_data.xml"), "w") as fi: + fi.write(xml) + + +def defineParser(): + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument( + '-i', + '--id', + type=int, + required=True, + help='the id of the record that shall be copied and then changed') + parser.add_argument( + '-d', + '--directory', + default=".", + help='the directory where the xml file and the downloads are saved') + + return parser + + +if __name__ == "__main__": + parser = defineParser() + args = parser.parse_args() + + export(args.id, directory=args.directory) diff --git a/src/caosadvancedtools/import_from_xml.py b/src/caosadvancedtools/import_from_xml.py new file mode 100755 index 0000000000000000000000000000000000000000..9942a9a9f38de90d62471cc86d32c25d55c9cba9 --- /dev/null +++ b/src/caosadvancedtools/import_from_xml.py @@ -0,0 +1,129 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +# +# ** header v3.0 +# This file is a part of the CaosDB Project. +# +# Copyright (C) 2020 IndiScale GmbH, Henrik tom Wörden +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <https://www.gnu.org/licenses/>. +# +# ** end header +# +""" +This file allows to import a dataset stored in a xml representation and +corresponding files. + +The export should have been done with export_related.py +""" +import argparse +import os +from tempfile import NamedTemporaryFile + +import caosdb as db +from caosdb.apiutils import apply_to_ids +from caosmodels.data_model import DataModel + + +def create_dummy_file(text="Please ask the administrator for this file."): + tmpfile = NamedTemporaryFile(delete=False) + tmpfile.close() + with open(tmpfile.name, "w") as tm: + tm.write(text) + + return tmpfile.name + + +def import_xml(filename, rerun=False, interactive=True): + """ + filename: path to the xml file with the data to be inserted + rerun: boolean; if true, files are not inserted as paths would conflict. + """ + cont = db.Container() + with open(filename) as fi: + cont = cont.from_xml(fi.read()) + + tmpfile = create_dummy_file() + model = [] + + files = [] + + # add files to files list and properties and record types to model + + for el in cont: + if isinstance(el, db.File): + el._checksum = None + target = os.path.join("downloads", el.path[1:]) + + if os.path.exists(target): + el.file = target + else: + el.file = tmpfile + files.append(el) + + if (isinstance(el, db.Property) or isinstance(el, db.RecordType)): + model.append(el) + + # remove entities of the model from the container + + for el in model+files: + cont.remove(el) + + id_mapping = {} + + for el in model+files: + id_mapping[el.id] = el + + # insert/update the model + datamodel = DataModel() + datamodel.extend(model) + datamodel.sync_data_model(noquestion=not interactive) + + # insert files + + if not rerun: + for _, el in enumerate(files): + r = el.insert(unique=False) + else: + for _, el in enumerate(files): + el.id = None + el.retrieve() + + def replace_by_new(old): + if old in id_mapping: + return id_mapping[old].id + else: + return old + + # set the ids of already inserted entities in the container + apply_to_ids(cont, replace_by_new) + + cont.insert(unique=False) + + +def defineParser(): + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument("file", help='file to be imported') + parser.add_argument("--rerun", help='if this script is run at least a' + ' second time and files are already inserted', + action="store_true") + + return parser + + +if __name__ == "__main__": + parser = defineParser() + args = parser.parse_args() + + import_xml(args.file, args.rerun) diff --git a/src/caosadvancedtools/pandoc_header_tools.py b/src/caosadvancedtools/pandoc_header_tools.py index 4c6dfb21836f4e960fadd084336f053edbc4be41..262defd2e46ea1a6fbe80ab6c476bb8f311cc9a5 100644 --- a/src/caosadvancedtools/pandoc_header_tools.py +++ b/src/caosadvancedtools/pandoc_header_tools.py @@ -147,7 +147,7 @@ it is not at the beginning, it must be preceded by a blank line. raise ParseErrorsInHeader(filename, e) # except yaml.error.MarkedYAMLError as e: # raise NoValidHeader(filename) - if type(yaml_part) != dict: + if not isinstance(yaml_part, dict): raise NoValidHeader(filename) return (found_1, found_2, clean_header(yaml_part)) @@ -229,7 +229,7 @@ def clean_header(header): for k, v in header.items(): # Plain string is put into list - if type(v) == str: + if isinstance(v, str): header[k] = [v] return header diff --git a/src/caosadvancedtools/table_converter.py b/src/caosadvancedtools/table_converter.py index c2d0ac38cd59826c384a053d2e32ffcc6eafa9e5..76f4dfcdb5f040d81d923289a7a730806ad8681b 100644 --- a/src/caosadvancedtools/table_converter.py +++ b/src/caosadvancedtools/table_converter.py @@ -24,9 +24,8 @@ import argparse import re import sys -import pandas as pd - import caosdb as db +import pandas as pd def from_tsv(filename, recordtype): @@ -81,6 +80,10 @@ def from_table(spreadsheet, recordtype): rec.add_parent(name=recordtype) for key, value in row.iteritems(): + if key.lower() == "description": + rec.description = value + continue + if (pd.notnull(value) and (not isinstance(value, str) or value.strip() != "")): regexp = r"(.*)\[(.*)\].*" diff --git a/unittests/test_cache.py b/unittests/test_cache.py index c1c92330b5fba47b0a19a89913ded43ef59d3197..985ac15ca52a06c6e00c13c6d87adcb8d21f1595 100644 --- a/unittests/test_cache.py +++ b/unittests/test_cache.py @@ -37,7 +37,7 @@ class CacheTest(unittest.TestCase): def test_hash(self): ent = db.Record() - assert type(Cache.hash_entity(ent)) is str + assert isinstance(Cache.hash_entity(ent), str) assert (Cache.hash_entity(ent) != Cache.hash_entity(db.Record().add_parent("lol"))) @@ -48,7 +48,7 @@ class CacheTest(unittest.TestCase): ent_hash = Cache.hash_entity(ent) ent2_hash = Cache.hash_entity(ent2) self.cache.insert(ent2_hash, 1235) - assert type(self.cache.check_existing(ent2_hash)) is int + assert isinstance(self.cache.check_existing(ent2_hash), int) assert self.cache.check_existing(ent_hash) is None def test_hirarchy(self): diff --git a/unittests/test_read_md_header.py b/unittests/test_read_md_header.py index 21c641f769958de731962ab0b9f40c670f1abe9c..994f8f16b6158914ff87134f3efd6f157dea6736 100644 --- a/unittests/test_read_md_header.py +++ b/unittests/test_read_md_header.py @@ -53,7 +53,7 @@ data: ... """ header = get_header(to_file(file_content)) - assert type(header["data"]) is list + assert isinstance(header["data"], list) file_content = """ --- data: @@ -61,4 +61,4 @@ data: ... """ header = get_header(to_file(file_content)) - assert type(header["data"]) is list + assert isinstance(header["data"], list)