Skip to content
Snippets Groups Projects
Commit 3bb35450 authored by Henrik tom Wörden's avatar Henrik tom Wörden
Browse files

Merge branch 'im_und_export' into 'master'

Im und export

See merge request caosdb/caosdb-advanced-user-tools!16
parents d714a388 955153da
No related branches found
No related tags found
No related merge requests found
Showing with 344 additions and 15 deletions
......@@ -11,8 +11,12 @@ RUN apt-get update && \
libxml2 \
-y
COPY .docker/wait-for-it.sh /wait-for-it.sh
ADD https://gitlab.com/api/v4/projects/13656973/repository/branches/master \
pylib_version.json
RUN git clone https://gitlab.com/caosdb/caosdb-pylib.git && \
cd caosdb-pylib && pip3 install .
ADD https://gitlab.com/api/v4/projects/13656965/repository/branches/master \
model_version.json
RUN git clone https://gitlab.com/caosdb/caosdb-models.git && \
cd caosdb-models && pip3 install .
ADD https://gitlab.com/api/v4/projects/13601752/repository/branches/master \
......
......@@ -3,3 +3,4 @@ __pycache__
.coverage
cache.db
*.egg-info
.docker/cert
......@@ -72,8 +72,8 @@ build-testenv:
tags: [cached-dind]
image: docker:18.09
stage: setup
only:
- schedules
#only:
#- schedules
script:
- df -h
- docker login -u gitlab-ci-token -p $CI_JOB_TOKEN $CI_REGISTRY
......@@ -104,5 +104,5 @@ style:
stage: style
image: $CI_REGISTRY_IMAGE
script:
- autopep8 -r --diff --exit-code .
- autopep8 -ar --diff --exit-code .
allow_failure: true
firstName,lastName,email
Henrik,tom Wörden,henrik@indiscale.com
Max,Mustermann,max@mustermann.eu
# Integration-test driver: fills a CaosDB instance and runs the test suites.
# Show the working directory contents (debugging aid for CI logs).
ls
# Drop the crawler's cache so every run starts from a clean state.
rm -rf cache.db
echo "Filling the database"
./filldb.sh
echo "Testing the crawler database"
py.test-3 test_crawler.py
echo "Testing im and export"
python3 test_im_und_export.py
#!/usr/bin/env python3
import os
import unittest
from tempfile import TemporaryDirectory
import caosdb as db
from caosadvancedtools.export_related import export
from caosadvancedtools.import_from_xml import import_xml
if __name__ == "__main__":
    # Integration test: export a record with everything related to it,
    # wipe the server, re-import, and check that key entities came back.
    # Requires a running CaosDB instance that was filled by filldb.sh.
    print("Conducting im- and export tests")
    rec = db.execute_query("FIND 2019-02-03_really_cool_finding", unique=True)
    directory = TemporaryDirectory()
    export(rec.id, directory=directory.name)
    # delete everything this user inserted so the import starts from scratch
    rec = db.execute_query("FIND record which was inserted by me")
    prop = db.execute_query("FIND property which was inserted by me")
    rt = db.execute_query("FIND recordtype which was inserted by me")
    fi = db.execute_query("FIND file which was inserted by me")
    c = db.Container()
    c.extend(rec+prop+rt+fi)
    c.delete()
    # sanity check: the poster file must be gone after the deletion
    assert 0 == len(db.execute_query("FIND File which is stored at "
                                     "**/poster.pdf"))
    # re-import the previously exported data (non-interactive for CI)
    import_xml(os.path.join(directory.name, "caosdb_data.xml"), interactive=False)
    # The following tests the existence of some required entities.
    # However, this is not a full list.
    db.execute_query("FIND 2019-02-03_really_cool_finding", unique=True)
    db.execute_query("FIND RecordType Poster", unique=True)
    db.execute_query("FIND RecordType Analysis", unique=True)
    db.execute_query("FIND RecordType Person", unique=True)
    db.execute_query("FIND Record Person with firstname=Only", unique=True)
    db.execute_query("FIND File which is stored at **/poster.pdf", unique=True)
......@@ -85,7 +85,7 @@ class CrawlerTest(unittest.TestCase):
for el in [self.exp, self.ana, self.pub, self.rts]:
try:
el.delete()
except:
except BaseException:
pass
......@@ -104,5 +104,5 @@ class CrawlerTestExist(CrawlerTest):
for el in [self.exp, self.ana, self.pub, self.rts]:
try:
el.delete()
except:
except BaseException:
pass
......@@ -32,7 +32,7 @@ setup(name='caosadvancedtools',
author_email='henrik.tom-woerden@ds.mpg.de',
packages=find_packages('src'),
package_dir={'': 'src'},
install_requires=[],
install_requires=["caosdb>=0.3.0", "caosmodels>=0.1.0"],
extras_require={},
tests_require=["pytest"],
)
......@@ -114,7 +114,7 @@ class Importer(object):
try:
element = self.connection.retrieve_element(element_id, el_type=el_type)
except:
except BaseException:
print("Could not retrieve: ", element_id)
return
......
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
#
# ** header v3.0
# This file is a part of the CaosDB Project.
#
# Copyright (C) 2020 IndiScale GmbH, Henrik tom Wörden
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
#
# ** end header
#
"""
This file allows to create an xml representation of a complete dataset.
Using the given entity all related entities are collected and saved in a way
that the data can be imported in another CaosDB instance.
Files that are smaller than 1MB are saved in a downloads folder and can be
imported along with the entities themselves.
"""
import argparse
import os
import caosdb as db
from caosdb.apiutils import apply_to_ids, retrieve_entities_with_ids
from caosdb.common.datatype import get_id_of_datatype, is_reference
from lxml import etree
def get_ids_of_related_entities(entity):
    """Return a list of ids of entities related to the given one.

    Related means in this context that the entity is kind of necessary for
    the representation of *entity*: the ids of its parents and properties,
    the ids of referenced entities, and the ids of custom reference
    datatypes.
    """
    related = []

    for parent in entity.parents:
        related.append(parent.id)

    for prop in entity.properties:
        related.append(prop.id)

        if not is_reference(prop.datatype):
            continue

        # collect the ids of the referenced entities themselves
        value = prop.value
        if isinstance(value, list) and len(value) > 0:
            related.extend(int(item) for item in value)
        elif value is not None:
            related.append(int(value))

        # a non-generic reference datatype is itself an entity to collect
        generic_types = [db.FILE, db.REFERENCE,
                         db.LIST(db.FILE), db.LIST(db.REFERENCE)]
        if prop.datatype not in generic_types:
            related.append(get_id_of_datatype(prop.datatype))

    return related
def recursively_collect_related(entity):
    """Collect all entities transitively related to *entity*.

    Starting from the single given entity, the related entities are
    retrieved (see get_ids_of_related_entities), then the entities related
    to those, and so forth — a breadth-first walk over the reference graph.
    This is useful to create a collection of a kind of related dataset.
    """
    collected = db.Container()
    collected.append(entity)
    seen_ids = set([entity.id])
    frontier = [entity]

    while frontier:
        # gather candidate ids from the current frontier, drop known ones
        candidate_ids = set()
        for ent in frontier:
            candidate_ids.update(get_ids_of_related_entities(ent))
        candidate_ids -= seen_ids

        # fetch the next layer and record it
        frontier = retrieve_entities_with_ids(list(candidate_ids))
        seen_ids.update(e.id for e in frontier)
        collected.extend(frontier)

    return collected
def invert_ids(entities):
    """Negate the id of every entity in *entities* (in place).

    Negative ids mark the entities as not-yet-existing on the target
    server when the exported data is imported elsewhere.
    """
    apply_to_ids(entities, lambda eid: -eid)
def export(rec_id, directory="."):
    """Export the record with id *rec_id* together with everything related.

    All related entities are collected recursively and written as XML to
    ``caosdb_data.xml`` inside *directory*.  Files smaller than 1 MB are
    downloaded into a ``downloads`` subfolder so they can be imported along
    with the entities.

    Parameters
    ----------
    rec_id : int
        id of the record to export.
    directory : str
        target directory for the XML file and the downloads.

    Raises
    ------
    ValueError
        if *rec_id* is not an integer.
    """
    if not isinstance(rec_id, int):
        raise ValueError("rec_id needs to be an integer")
    ent = db.execute_query("FIND {}".format(rec_id), unique=True)
    cont = recursively_collect_related(ent)

    directory = os.path.abspath(directory)
    dl_dir = os.path.join(directory, "downloads")
    # exist_ok avoids a race between the existence check and the creation
    os.makedirs(dl_dir, exist_ok=True)

    for el in cont:
        # only download small files (< 1 MB); larger ones are referenced only
        if isinstance(el, db.File) and el.size < 1e6:
            # el.path starts with "/"; strip it to build a relative path
            target = os.path.join(dl_dir, el.path[1:])
            os.makedirs(os.path.dirname(target), exist_ok=True)
            try:
                el.download(target)
                print("Downloaded:", target)
            except Exception:
                # best effort: a failed download must not abort the export,
                # but Exception (not BaseException) keeps Ctrl-C working
                print("Failed download of:", target)

    # negative ids mark the entities as new for a later import
    invert_ids(cont)

    xml = etree.tounicode(cont.to_xml(
        local_serialization=True), pretty_print=True)
    with open(os.path.join(directory, "caosdb_data.xml"), "w") as fi:
        fi.write(xml)
def defineParser():
    """Build and return the command line parser for the export script."""
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument(
        '-i', '--id', type=int, required=True,
        help='the id of the record that shall be copied and then changed')
    parser.add_argument(
        '-d', '--directory', default=".",
        help='the directory where the xml file and the downloads are saved')
    return parser
if __name__ == "__main__":
    # Parse the command line and run the export.
    arguments = defineParser().parse_args()
    export(arguments.id, directory=arguments.directory)
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
#
# ** header v3.0
# This file is a part of the CaosDB Project.
#
# Copyright (C) 2020 IndiScale GmbH, Henrik tom Wörden
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
#
# ** end header
#
"""
This file allows to import a dataset stored in a xml representation and
corresponding files.
The export should have been done with export_related.py
"""
import argparse
import os
from tempfile import NamedTemporaryFile
import caosdb as db
from caosdb.apiutils import apply_to_ids
from caosmodels.data_model import DataModel
def create_dummy_file(text="Please ask the administrator for this file."):
    """Create a persistent placeholder file containing *text*.

    The file is NOT deleted automatically; the caller is responsible for
    cleaning it up.  Returns the path of the created file.
    """
    with NamedTemporaryFile(mode="w", delete=False) as placeholder:
        placeholder.write(text)
    return placeholder.name
def import_xml(filename, rerun=False, interactive=True):
    """Import a dataset from an xml file produced by export_related.py.

    Files referenced by file entities are looked up in a ``downloads``
    folder relative to the current working directory; entities whose file
    is missing there get a dummy placeholder file instead.

    Parameters
    ----------
    filename : str
        path to the xml file with the data to be inserted.
    rerun : bool
        if True, files are not inserted (their paths would conflict with
        the first run) but retrieved from the server instead.
    interactive : bool
        if True, the data model synchronisation asks for confirmation
        before changing the model on the server.
    """
    cont = db.Container()
    with open(filename) as fi:
        cont = cont.from_xml(fi.read())
    tmpfile = create_dummy_file()

    model = []
    files = []
    # add files to files list and properties and record types to model
    for el in cont:
        if isinstance(el, db.File):
            # checksum belongs to the original server; it must be recomputed
            el._checksum = None
            # el.path starts with "/"; strip it to build a relative path
            target = os.path.join("downloads", el.path[1:])
            if os.path.exists(target):
                el.file = target
            else:
                # the original file was not exported; use the placeholder
                el.file = tmpfile
            files.append(el)
        if isinstance(el, (db.Property, db.RecordType)):
            model.append(el)

    # remove entities of the model from the container
    for el in model + files:
        cont.remove(el)

    # remember the old (exported) ids so references can be rewritten later
    id_mapping = {el.id: el for el in model + files}

    # insert/update the model
    datamodel = DataModel()
    datamodel.extend(model)
    datamodel.sync_data_model(noquestion=not interactive)

    # insert (or, on a rerun, retrieve) the files
    if not rerun:
        for el in files:
            el.insert(unique=False)
    else:
        for el in files:
            el.id = None
            el.retrieve()

    def replace_by_new(old):
        """Map an old id to the id of the freshly inserted entity."""
        if old in id_mapping:
            return id_mapping[old].id
        else:
            return old

    # set the ids of already inserted entities in the container
    apply_to_ids(cont, replace_by_new)
    cont.insert(unique=False)
def defineParser():
    """Build and return the command line parser for the import script."""
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("file", help='file to be imported')
    parser.add_argument(
        "--rerun",
        action="store_true",
        help='if this script is run at least a second time and files are '
             'already inserted')
    return parser
if __name__ == "__main__":
    # Parse arguments and run the import.
    cli_args = defineParser().parse_args()
    import_xml(cli_args.file, cli_args.rerun)
......@@ -147,7 +147,7 @@ it is not at the beginning, it must be preceded by a blank line.
raise ParseErrorsInHeader(filename, e)
# except yaml.error.MarkedYAMLError as e:
# raise NoValidHeader(filename)
if type(yaml_part) != dict:
if not isinstance(yaml_part, dict):
raise NoValidHeader(filename)
return (found_1, found_2, clean_header(yaml_part))
......@@ -229,7 +229,7 @@ def clean_header(header):
for k, v in header.items():
# Plain string is put into list
if type(v) == str:
if isinstance(v, str):
header[k] = [v]
return header
......
......@@ -24,9 +24,8 @@ import argparse
import re
import sys
import pandas as pd
import caosdb as db
import pandas as pd
def from_tsv(filename, recordtype):
......@@ -81,6 +80,10 @@ def from_table(spreadsheet, recordtype):
rec.add_parent(name=recordtype)
for key, value in row.iteritems():
if key.lower() == "description":
rec.description = value
continue
if (pd.notnull(value) and
(not isinstance(value, str) or value.strip() != "")):
regexp = r"(.*)\[(.*)\].*"
......
......@@ -37,7 +37,7 @@ class CacheTest(unittest.TestCase):
def test_hash(self):
ent = db.Record()
assert type(Cache.hash_entity(ent)) is str
assert isinstance(Cache.hash_entity(ent), str)
assert (Cache.hash_entity(ent) !=
Cache.hash_entity(db.Record().add_parent("lol")))
......@@ -48,7 +48,7 @@ class CacheTest(unittest.TestCase):
ent_hash = Cache.hash_entity(ent)
ent2_hash = Cache.hash_entity(ent2)
self.cache.insert(ent2_hash, 1235)
assert type(self.cache.check_existing(ent2_hash)) is int
assert isinstance(self.cache.check_existing(ent2_hash), int)
assert self.cache.check_existing(ent_hash) is None
def test_hirarchy(self):
......
......@@ -53,7 +53,7 @@ data:
...
"""
header = get_header(to_file(file_content))
assert type(header["data"]) is list
assert isinstance(header["data"], list)
file_content = """
---
data:
......@@ -61,4 +61,4 @@ data:
...
"""
header = get_header(to_file(file_content))
assert type(header["data"]) is list
assert isinstance(header["data"], list)
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment