Skip to content
Snippets Groups Projects
Commit 3bb35450 authored by Henrik tom Wörden's avatar Henrik tom Wörden
Browse files

Merge branch 'im_und_export' into 'master'

Im und export

See merge request caosdb/caosdb-advanced-user-tools!16
parents d714a388 955153da
No related branches found
No related tags found
No related merge requests found
Showing with 344 additions and 15 deletions
......@@ -11,8 +11,12 @@ RUN apt-get update && \
libxml2 \
-y
COPY .docker/wait-for-it.sh /wait-for-it.sh
ADD https://gitlab.com/api/v4/projects/13656973/repository/branches/master \
pylib_version.json
RUN git clone https://gitlab.com/caosdb/caosdb-pylib.git && \
cd caosdb-pylib && pip3 install .
ADD https://gitlab.com/api/v4/projects/13656965/repository/branches/master \
model_version.json
RUN git clone https://gitlab.com/caosdb/caosdb-models.git && \
cd caosdb-models && pip3 install .
ADD https://gitlab.com/api/v4/projects/13601752/repository/branches/master \
......
......@@ -3,3 +3,4 @@ __pycache__
.coverage
cache.db
*.egg-info
.docker/cert
......@@ -72,8 +72,8 @@ build-testenv:
tags: [cached-dind]
image: docker:18.09
stage: setup
only:
- schedules
#only:
#- schedules
script:
- df -h
- docker login -u gitlab-ci-token -p $CI_JOB_TOKEN $CI_REGISTRY
......@@ -104,5 +104,5 @@ style:
stage: style
image: $CI_REGISTRY_IMAGE
script:
- autopep8 -r --diff --exit-code .
- autopep8 -ar --diff --exit-code .
allow_failure: true
firstName,lastName,email
Henrik,tom Wörden,henrik@indiscale.com
Max,Mustermann,max@mustermann.eu
# Integration-test driver: fills a CaosDB instance and runs the test suites.
# Show the working directory contents (debugging aid for CI logs).
ls
# Drop the crawler's cache so every run starts from a clean state.
rm -rf cache.db
echo "Filling the database"
./filldb.sh
echo "Testing the crawler database"
py.test-3 test_crawler.py
echo "Testing im and export"
python3 test_im_und_export.py
#!/usr/bin/env python3
import os
import unittest
from tempfile import TemporaryDirectory
import caosdb as db
from caosadvancedtools.export_related import export
from caosadvancedtools.import_from_xml import import_xml
if __name__ == "__main__":
    # Integration test: export a record with everything related to it,
    # wipe the server, re-import, and check that key entities came back.
    # Requires a running CaosDB instance that was filled by filldb.sh.
    print("Conducting im- and export tests")
    rec = db.execute_query("FIND 2019-02-03_really_cool_finding", unique=True)
    directory = TemporaryDirectory()
    export(rec.id, directory=directory.name)
    # delete everything this user inserted so the import starts from scratch
    rec = db.execute_query("FIND record which was inserted by me")
    prop = db.execute_query("FIND property which was inserted by me")
    rt = db.execute_query("FIND recordtype which was inserted by me")
    fi = db.execute_query("FIND file which was inserted by me")
    c = db.Container()
    c.extend(rec+prop+rt+fi)
    c.delete()
    # sanity check: the poster file must be gone after the deletion
    assert 0 == len(db.execute_query("FIND File which is stored at "
                                     "**/poster.pdf"))
    # re-import the previously exported data (non-interactive for CI)
    import_xml(os.path.join(directory.name, "caosdb_data.xml"), interactive=False)
    # The following tests the existence of some required entities.
    # However, this is not a full list.
    db.execute_query("FIND 2019-02-03_really_cool_finding", unique=True)
    db.execute_query("FIND RecordType Poster", unique=True)
    db.execute_query("FIND RecordType Analysis", unique=True)
    db.execute_query("FIND RecordType Person", unique=True)
    db.execute_query("FIND Record Person with firstname=Only", unique=True)
    db.execute_query("FIND File which is stored at **/poster.pdf", unique=True)
......@@ -85,7 +85,7 @@ class CrawlerTest(unittest.TestCase):
for el in [self.exp, self.ana, self.pub, self.rts]:
try:
el.delete()
except:
except BaseException:
pass
......@@ -104,5 +104,5 @@ class CrawlerTestExist(CrawlerTest):
for el in [self.exp, self.ana, self.pub, self.rts]:
try:
el.delete()
except:
except BaseException:
pass
......@@ -32,7 +32,7 @@ setup(name='caosadvancedtools',
author_email='henrik.tom-woerden@ds.mpg.de',
packages=find_packages('src'),
package_dir={'': 'src'},
install_requires=[],
install_requires=["caosdb>=0.3.0", "caosmodels>=0.1.0"],
extras_require={},
tests_require=["pytest"],
)
......@@ -114,7 +114,7 @@ class Importer(object):
try:
element = self.connection.retrieve_element(element_id, el_type=el_type)
except:
except BaseException:
print("Could not retrieve: ", element_id)
return
......
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
#
# ** header v3.0
# This file is a part of the CaosDB Project.
#
# Copyright (C) 2020 IndiScale GmbH, Henrik tom Wörden
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
#
# ** end header
#
"""
This file allows to create an xml representation of a complete dataset.
Using the given entity all related entities are collected and saved in a way
that the data can be imported in another CaosDB instance.
Files that are smaller than 1MB are saved in a downloads folder and can be
imported along with the entities themselves.
"""
import argparse
import os
import caosdb as db
from caosdb.apiutils import apply_to_ids, retrieve_entities_with_ids
from caosdb.common.datatype import get_id_of_datatype, is_reference
from lxml import etree
def get_ids_of_related_entities(entity):
    """Return a list of ids of entities related to the given one.

    Related means in this context that the entity is kind of necessary for
    the representation of *entity*: the ids of its parents and properties,
    the ids of referenced entities, and the ids of custom reference
    datatypes.
    """
    related = []

    for parent in entity.parents:
        related.append(parent.id)

    for prop in entity.properties:
        related.append(prop.id)

        if not is_reference(prop.datatype):
            continue

        # collect the ids of the referenced entities themselves
        value = prop.value
        if isinstance(value, list) and len(value) > 0:
            related.extend(int(item) for item in value)
        elif value is not None:
            related.append(int(value))

        # a non-generic reference datatype is itself an entity to collect
        generic_types = [db.FILE, db.REFERENCE,
                         db.LIST(db.FILE), db.LIST(db.REFERENCE)]
        if prop.datatype not in generic_types:
            related.append(get_id_of_datatype(prop.datatype))

    return related
def recursively_collect_related(entity):
    """Collect all entities transitively related to *entity*.

    Starting from the single given entity, the related entities are
    retrieved (see get_ids_of_related_entities), then the entities related
    to those, and so forth — a breadth-first walk over the reference graph.
    This is useful to create a collection of a kind of related dataset.
    """
    collected = db.Container()
    collected.append(entity)
    seen_ids = set([entity.id])
    frontier = [entity]

    while frontier:
        # gather candidate ids from the current frontier, drop known ones
        candidate_ids = set()
        for ent in frontier:
            candidate_ids.update(get_ids_of_related_entities(ent))
        candidate_ids -= seen_ids

        # fetch the next layer and record it
        frontier = retrieve_entities_with_ids(list(candidate_ids))
        seen_ids.update(e.id for e in frontier)
        collected.extend(frontier)

    return collected
def invert_ids(entities):
    """Negate the id of every entity in *entities* (in place).

    Negative ids mark the entities as not-yet-existing on the target
    server when the exported data is imported elsewhere.
    """
    apply_to_ids(entities, lambda eid: -eid)
def export(rec_id, directory="."):
    """Export the record with id *rec_id* together with everything related.

    All related entities are collected recursively and written as XML to
    ``caosdb_data.xml`` inside *directory*.  Files smaller than 1 MB are
    downloaded into a ``downloads`` subfolder so they can be imported along
    with the entities.

    Parameters
    ----------
    rec_id : int
        id of the record to export.
    directory : str
        target directory for the XML file and the downloads.

    Raises
    ------
    ValueError
        if *rec_id* is not an integer.
    """
    if not isinstance(rec_id, int):
        raise ValueError("rec_id needs to be an integer")
    ent = db.execute_query("FIND {}".format(rec_id), unique=True)
    cont = recursively_collect_related(ent)

    directory = os.path.abspath(directory)
    dl_dir = os.path.join(directory, "downloads")
    # exist_ok avoids a race between the existence check and the creation
    os.makedirs(dl_dir, exist_ok=True)

    for el in cont:
        # only download small files (< 1 MB); larger ones are referenced only
        if isinstance(el, db.File) and el.size < 1e6:
            # el.path starts with "/"; strip it to build a relative path
            target = os.path.join(dl_dir, el.path[1:])
            os.makedirs(os.path.dirname(target), exist_ok=True)
            try:
                el.download(target)
                print("Downloaded:", target)
            except Exception:
                # best effort: a failed download must not abort the export,
                # but Exception (not BaseException) keeps Ctrl-C working
                print("Failed download of:", target)

    # negative ids mark the entities as new for a later import
    invert_ids(cont)

    xml = etree.tounicode(cont.to_xml(
        local_serialization=True), pretty_print=True)
    with open(os.path.join(directory, "caosdb_data.xml"), "w") as fi:
        fi.write(xml)
def defineParser():
    """Build and return the command line parser for the export script."""
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument(
        '-i', '--id', type=int, required=True,
        help='the id of the record that shall be copied and then changed')
    parser.add_argument(
        '-d', '--directory', default=".",
        help='the directory where the xml file and the downloads are saved')
    return parser
if __name__ == "__main__":
    # Parse the command line and run the export.
    arguments = defineParser().parse_args()
    export(arguments.id, directory=arguments.directory)
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
#
# ** header v3.0
# This file is a part of the CaosDB Project.
#
# Copyright (C) 2020 IndiScale GmbH, Henrik tom Wörden
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
#
# ** end header
#
"""
This file allows to import a dataset stored in a xml representation and
corresponding files.
The export should have been done with export_related.py
"""
import argparse
import os
from tempfile import NamedTemporaryFile
import caosdb as db
from caosdb.apiutils import apply_to_ids
from caosmodels.data_model import DataModel
def create_dummy_file(text="Please ask the administrator for this file."):
    """Create a persistent placeholder file containing *text*.

    The file is NOT deleted automatically; the caller is responsible for
    cleaning it up.  Returns the path of the created file.
    """
    with NamedTemporaryFile(mode="w", delete=False) as placeholder:
        placeholder.write(text)
    return placeholder.name
def import_xml(filename, rerun=False, interactive=True):
    """Import a dataset from an xml file produced by export_related.py.

    Files referenced by file entities are looked up in a ``downloads``
    folder relative to the current working directory; entities whose file
    is missing there get a dummy placeholder file instead.

    Parameters
    ----------
    filename : str
        path to the xml file with the data to be inserted.
    rerun : bool
        if True, files are not inserted (their paths would conflict with
        the first run) but retrieved from the server instead.
    interactive : bool
        if True, the data model synchronisation asks for confirmation
        before changing the model on the server.
    """
    cont = db.Container()
    with open(filename) as fi:
        cont = cont.from_xml(fi.read())
    tmpfile = create_dummy_file()

    model = []
    files = []
    # add files to files list and properties and record types to model
    for el in cont:
        if isinstance(el, db.File):
            # checksum belongs to the original server; it must be recomputed
            el._checksum = None
            # el.path starts with "/"; strip it to build a relative path
            target = os.path.join("downloads", el.path[1:])
            if os.path.exists(target):
                el.file = target
            else:
                # the original file was not exported; use the placeholder
                el.file = tmpfile
            files.append(el)
        if isinstance(el, (db.Property, db.RecordType)):
            model.append(el)

    # remove entities of the model from the container
    for el in model + files:
        cont.remove(el)

    # remember the old (exported) ids so references can be rewritten later
    id_mapping = {el.id: el for el in model + files}

    # insert/update the model
    datamodel = DataModel()
    datamodel.extend(model)
    datamodel.sync_data_model(noquestion=not interactive)

    # insert (or, on a rerun, retrieve) the files
    if not rerun:
        for el in files:
            el.insert(unique=False)
    else:
        for el in files:
            el.id = None
            el.retrieve()

    def replace_by_new(old):
        """Map an old id to the id of the freshly inserted entity."""
        if old in id_mapping:
            return id_mapping[old].id
        else:
            return old

    # set the ids of already inserted entities in the container
    apply_to_ids(cont, replace_by_new)
    cont.insert(unique=False)
def defineParser():
    """Build and return the command line parser for the import script."""
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("file", help='file to be imported')
    parser.add_argument(
        "--rerun",
        action="store_true",
        help='if this script is run at least a second time and files are '
             'already inserted')
    return parser
if __name__ == "__main__":
    # Parse arguments and run the import.
    cli_args = defineParser().parse_args()
    import_xml(cli_args.file, cli_args.rerun)
......@@ -147,7 +147,7 @@ it is not at the beginning, it must be preceded by a blank line.
raise ParseErrorsInHeader(filename, e)
# except yaml.error.MarkedYAMLError as e:
# raise NoValidHeader(filename)
if type(yaml_part) != dict:
if not isinstance(yaml_part, dict):
raise NoValidHeader(filename)
return (found_1, found_2, clean_header(yaml_part))
......@@ -229,7 +229,7 @@ def clean_header(header):
for k, v in header.items():
# Plain string is put into list
if type(v) == str:
if isinstance(v, str):
header[k] = [v]
return header
......
......@@ -24,9 +24,8 @@ import argparse
import re
import sys
import pandas as pd
import caosdb as db
import pandas as pd
def from_tsv(filename, recordtype):
......@@ -81,6 +80,10 @@ def from_table(spreadsheet, recordtype):
rec.add_parent(name=recordtype)
for key, value in row.iteritems():
if key.lower() == "description":
rec.description = value
continue
if (pd.notnull(value) and
(not isinstance(value, str) or value.strip() != "")):
regexp = r"(.*)\[(.*)\].*"
......
......@@ -37,7 +37,7 @@ class CacheTest(unittest.TestCase):
def test_hash(self):
ent = db.Record()
assert type(Cache.hash_entity(ent)) is str
assert isinstance(Cache.hash_entity(ent), str)
assert (Cache.hash_entity(ent) !=
Cache.hash_entity(db.Record().add_parent("lol")))
......@@ -48,7 +48,7 @@ class CacheTest(unittest.TestCase):
ent_hash = Cache.hash_entity(ent)
ent2_hash = Cache.hash_entity(ent2)
self.cache.insert(ent2_hash, 1235)
assert type(self.cache.check_existing(ent2_hash)) is int
assert isinstance(self.cache.check_existing(ent2_hash), int)
assert self.cache.check_existing(ent_hash) is None
def test_hirarchy(self):
......
......@@ -53,7 +53,7 @@ data:
...
"""
header = get_header(to_file(file_content))
assert type(header["data"]) is list
assert isinstance(header["data"], list)
file_content = """
---
data:
......@@ -61,4 +61,4 @@ data:
...
"""
header = get_header(to_file(file_content))
assert type(header["data"]) is list
assert isinstance(header["data"], list)
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment