Skip to content
Snippets Groups Projects
Commit dc8ba294 authored by Henrik tom Wörden's avatar Henrik tom Wörden
Browse files

ENH: add caosdbignore file to loadfiles

parent 3c8e6ded
No related branches found
No related tags found
2 merge requests!59FIX: if multiple updates for one entity exist, the retrieve would result in an...,!55ENH: include caosdbignore
Pipeline #28018 failed
lol
~README.md
stuff
lol
......@@ -5,7 +5,7 @@ python3 -m caosadvancedtools.loadFiles /opt/caosdb/mnt/extroot/ExperimentalData
python3 -m caosadvancedtools.loadFiles /opt/caosdb/mnt/extroot/DataAnalysis
python3 -m caosadvancedtools.loadFiles /opt/caosdb/mnt/extroot/SimulationData
python3 -m caosadvancedtools.loadFiles /opt/caosdb/mnt/extroot/Publications
python3 -m caosadvancedtools.loadFiles /opt/caosdb/mnt/extroot/Software
python3 -m caosadvancedtools.loadFiles -c caosdbignore -l $PWD/extroot/Software /opt/caosdb/mnt/extroot/Software
python3 insert_model.py
python3 insert_some.py
python3 crawl.py /
......@@ -33,6 +33,15 @@ def get_entity_with_id(eid):
return db.execute_query("FIND "+str(eid), unique=True)
class LoadFilesTest(unittest.TestCase):
    # Checks the server content after the loadFiles calls of the test setup.

    def test_lol(self):
        # check whether ignored files were inserted: none of the queries
        # below may return a File entity
        ignored_queries = (
            "FIND FILE WHICH IS STORED AT '**/lol'",
            "FIND FILE WHICH IS STORED AT '**/~README.md'",
        )
        for query in ignored_queries:
            res = db.execute_query(query)
            assert len(res) == 0
class CrawlerTest(unittest.TestCase):
def test_experiment(self):
......
......@@ -25,10 +25,14 @@
import argparse
import logging
import os
import math
import sys
import re
from argparse import ArgumentParser
from tempfile import NamedTemporaryFile
import shutil
import caosdb as db
logger = logging.getLogger(__name__)
......@@ -46,37 +50,113 @@ def convert_size(size):
return '%s %s' % (s, size_name[i])
def loadpath(path, include, exclude, prefix, dryrun, forceAllowSymlinks):
# NOTE(review): this looks like the removed (pre-change) side of the diff hunk;
# `loadpath` is defined again further down with the additional parameters
# `caosdbignore` and `localpath` -- confirm against the repository.
#
# For a dry run the files are only retrieved, not inserted.
if dryrun:
logger.info("Performin a dryrun!")
# The value of the "InsertFilesInDir" flag is assembled like a command line:
# optional "-p <prefix>", "-e <exclude>", "-i <include>" and
# "--force-allow-symlinks", followed by the path itself.
files = db.Container().retrieve(
unique=False,
raise_exception_on_error=True,
flags={"InsertFilesInDir": ("-p " + prefix + " " if prefix else "")
+ ("-e " + exclude + " " if exclude else "")
+ ("-i " + include + " " if include else "")
+ ("--force-allow-symlinks " if forceAllowSymlinks else "")
+ path})
def combine_ignore_files(caosdbignore, localignore, dirname=None):
    """Concatenate two ignore files into a new temporary file.

    The content of ``caosdbignore`` is written first, directly followed by the
    content of ``localignore``.  The temporary file is created with
    ``delete=False``; removing it is the responsibility of the caller.

    Parameters
    ----------
    caosdbignore : str
        Path of the base ignore file.
    localignore : str
        Path of the ignore file whose content is appended.
    dirname : str, optional
        Directory in which the temporary file is created.  ``None`` lets
        ``NamedTemporaryFile`` pick its default directory.

    Returns
    -------
    str
        Path of the new file; its basename starts with ``.caosdbignore``.
    """
    # The context manager closes the handle even if one of the inputs cannot
    # be opened or read.
    with NamedTemporaryFile(delete=False, mode="w", dir=dirname,
                            prefix=".caosdbignore") as tmp:
        for source in (caosdbignore, localignore):
            with open(source, "r") as handle:
                # NOTE(review): the contents are joined without a separator,
                # so a base file lacking a trailing newline merges its last
                # line with the first local line -- confirm this is intended.
                tmp.write(handle.read())
    return tmp.name
def _is_within(path, directory):
    """Return True if ``path`` is ``directory`` itself or lies below it."""
    # Joining with "" appends exactly one trailing separator, so "/a/bc" is
    # not mistaken for a child of "/a/b" as a plain prefix test would do.
    return path == directory or path.startswith(os.path.join(directory, ""))


def compile_file_list(caosdbignore, localpath):
    """Collect all files below ``localpath`` that are not ignored.

    The rules of ``caosdbignore`` apply to the whole tree.  Whenever a
    directory contains a file named ``.caosdbignore``, its rules are appended
    to the rules currently in effect and apply to that directory and
    everything below it.

    Parameters
    ----------
    caosdbignore : str
        Path of the base ignore file (gitignore syntax).
    localpath : str
        Root directory of the tree that is walked.

    Returns
    -------
    list of str
        Paths (``os.path.join(root, name)``) of all files that the rules in
        effect for their directory do not match.
    """
    from gitignore_parser import parse_gitignore

    base_matches = parse_gitignore(caosdbignore)
    non_ignored_files = []
    # Stack of (directory, combined temporary ignore file, matcher) for every
    # directory on the current walk path that has its own .caosdbignore.
    ignore_stack = []
    try:
        for root, _dirs, files in os.walk(localpath):
            # Drop the rules of directories that the walk has left and delete
            # their temporary combined ignore files.
            while ignore_stack and not _is_within(root, ignore_stack[-1][0]):
                os.remove(ignore_stack.pop()[1])

            if ignore_stack:
                _, current_ignore, matches = ignore_stack[-1]
            else:
                current_ignore, matches = caosdbignore, base_matches

            if ".caosdbignore" in files:
                current_ignore = combine_ignore_files(
                    current_ignore,
                    os.path.join(root, ".caosdbignore"),
                    # due to the logic of gitignore_parser the file
                    # has to be written to this folder
                    dirname=root)
                matches = parse_gitignore(current_ignore)
                ignore_stack.append((root, current_ignore, matches))

            for fi in files:
                fullpath = os.path.join(root, fi)
                if not matches(fullpath):
                    non_ignored_files.append(fullpath)
    finally:
        # Temporary files of directories that were still "open" when the walk
        # ended (or failed) must not be left behind in the data tree.
        for _, tmpfile, _ in ignore_stack:
            os.remove(tmpfile)
    return non_ignored_files
def create_re_for_file_list(files, localroot, remoteroot):
    """Build one regular expression matching the given files on the remote side.

    For every file the path components between ``localroot`` and the file are
    turned into nested optional groups appended to the escaped ``remoteroot``.
    Thus the file itself and each of its parent directories up to
    ``remoteroot`` match.  The expressions of all files are joined as
    alternatives.

    Parameters
    ----------
    files : iterable of str
        Local paths of the files; expected to start with ``localroot``.
    localroot : str
        Local root directory; a trailing slash is ignored.
    remoteroot : str
        Path that replaces ``localroot`` in the resulting expression.

    Returns
    -------
    str
        Pattern of the form ``^(alt1|alt2|...)$``; ``^()$`` for no files.
    """
    # os.path.dirname never returns a trailing slash (except for "/"), so a
    # root given as "/data/" would otherwise never stop the loop below and its
    # own components would end up in the expression.
    stop = localroot.rstrip("/") or localroot

    alternatives = []
    for fi in files:
        path = fi
        reg = ""
        # Walk from the file upwards and wrap what has been collected so far
        # into the optional group of the parent component.
        while path != stop and path != "/" and path != "":
            reg = "(/" + re.escape(os.path.basename(path)) + reg + ")?"
            path = os.path.dirname(path)
        alternatives.append(re.escape(remoteroot) + reg)
    return "^(" + "|".join(alternatives) + ")$"
def loadpath(path, include, exclude, prefix, dryrun, forceAllowSymlinks, caosdbignore=None,
localpath=None):
# Make the files below `path` known to the server through the
# "InsertFilesInDir" flag: retrieve only for a dry run, insert otherwise.
# If `caosdbignore` is given, the local tree `localpath` is scanned and the
# non-ignored files are translated into include expressions.
#
# NOTE(review): the indentation of this span is lost and it appears to
# interleave removed and added lines of a diff; see the notes below.
if caosdbignore:
# Files below `localpath` that are not matched by the ignore rules.
filelist = compile_file_list(caosdbignore, localpath)
# NOTE(review): prints every file to stdout -- looks like debugging output.
for fi in filelist:
print(fi)
fulllist = filelist
index = 0
# Each include expression covers at most `step_size` files.
# NOTE(review): presumably chosen to limit the length of a single
# expression -- TODO confirm.
step_size = 3
includes = []
while index < len(fulllist):
subset = fulllist[index:min(index+step_size, len(fulllist))]
includes.append(create_re_for_file_list(subset, localpath, path))
index += step_size
else:
# NOTE(review): the insert call and the size summary that follow look like
# the removed pre-change body; presumably the new `else` branch consists only
# of `includes = [include]` further down -- confirm against the repository.
# new files (inserting them using the insertFilesInDir feature of
# the server, which inserts files via symlinks)
files = db.Container().insert(
unique=False,
raise_exception_on_error=True,
flags={"InsertFilesInDir": ("-p " + prefix + " " if prefix else "")
+ ("-e " + exclude + " " if exclude else "")
+ ("-i " + include + " " if include else "")
+ ("--force-allow-symlinks " if forceAllowSymlinks else "")
+ path})
totalsize = 0 # collecting total size of all new files
for f in files:
totalsize += f.size
logger.info("Made in total {} new files with a combined size of {} "
"accessible.".format(len(files), convert_size(totalsize)))
# Without an ignore file the single user-supplied include pattern is used.
includes = [include]
# One server request per include expression; note that the loop variable
# reuses the name of the `include` parameter.
for include in includes:
print(include)
if dryrun:
logger.info("Performin a dryrun!")
# The flag value is assembled like a command line: optional "-p <prefix>",
# "-e <exclude>", "-i <include>" and "--force-allow-symlinks", then the path.
files = db.Container().retrieve(
unique=False,
raise_exception_on_error=True,
flags={"InsertFilesInDir": ("-p " + prefix + " " if prefix else "")
+ ("-e " + exclude + " " if exclude else "")
+ ("-i " + include + " " if include else "")
+ ("--force-allow-symlinks " if forceAllowSymlinks else "")
+ path})
else:
# new files (inserting them using the insertFilesInDir feature of
# the server, which inserts files via symlinks)
files = db.Container().insert(
unique=False,
raise_exception_on_error=True,
flags={"InsertFilesInDir": ("-p " + prefix + " " if prefix else "")
+ ("-e " + exclude + " " if exclude else "")
+ ("-i " + include + " " if include else "")
+ ("--force-allow-symlinks " if forceAllowSymlinks else "")
+ path})
totalsize = 0 # collecting total size of all new files
for f in files:
totalsize += f.size
# The summary is both printed and logged.
print("Made in total {} new files with a combined size of {} "
"accessible.".format(len(files), convert_size(totalsize)))
logger.info("Made in total {} new files with a combined size of {} "
"accessible.".format(len(files), convert_size(totalsize)))
return
......@@ -91,6 +171,18 @@ def main(argv=None):
# Setup argument parser
parser = ArgumentParser()
parser.add_argument("-p", "--prefix", dest="prefix",
help="store files with this prefix into the server's"
" file system.")
parser.add_argument("-c", "--caosdbignore", help="""
Path to a caosdbignore file that defines which files shall be included and which do not.
The syntax is the same as in a gitignore file. You must also provide the localpath option
since the check is done locally.
"""
)
parser.add_argument("-l", "--localpath", help="Path to the root directory on this machine. "
"This is needed if a caosdbignore file is used since the check is done "
"locally")
parser.add_argument("-i", "--include", dest="include",
help="""
only include paths matching this regex pattern.
......@@ -104,9 +196,6 @@ exclude is given preference over include.
parser.add_argument("-e", "--exclude", dest="exclude",
help="exclude paths matching this regex pattern.",
metavar="RE")
parser.add_argument("-p", "--prefix", dest="prefix",
help="store files with this prefix into the server's"
" file system.")
parser.add_argument("-d", "--dry-run", dest="dryrun", action="store_true",
help="Just simulate the insertion of the files.")
parser.add_argument('-t', '--timeout', dest="timeout",
......@@ -127,6 +216,15 @@ exclude is given preference over include.
"directory tree.", action="store_true")
args = parser.parse_args()
if args.caosdbignore and (args.exclude or args.include):
raise ValueError("Do not use a caosdbignore file and in- or exclude simultaneously!")
if args.caosdbignore and not args.localpath:
raise ValueError("To use caosdbignore you must supply a local path!")
if args.localpath and (args.exclude or args.include):
raise ValueError("Do not use a localpath and in- or exclude simultaneously!")
con = db.get_connection()
con.timeout = float(args.timeout)
con._login()
......@@ -137,8 +235,9 @@ exclude is given preference over include.
exclude=args.exclude,
prefix=args.prefix,
dryrun=args.dryrun,
forceAllowSymlinks=args.forceAllowSymlinks,
caosdbignore=args.caosdbignore,
localpath=args.localpath,
)
return 0
......
README.md
.caosdbignore*
!README.md
#!/usr/bin/env python3
# encoding: utf-8
#
# ** header v3.0
# This file is a part of the CaosDB Project.
#
# Copyright (C) 2022 Indiscale GmbH <info@indiscale.com>
# Copyright (C) 2022 Henrik tom Wörden <h.tomwoerden@indiscale.com>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
#
# ** end header
#
import os
import re
from tempfile import NamedTemporaryFile
import unittest
from caosadvancedtools.loadFiles import compile_file_list, create_re_for_file_list, combine_ignore_files
BASEDIR = os.path.dirname(os.path.realpath(__file__))
class Caosdbignore(unittest.TestCase):
    # Unit tests for the caosdbignore helpers imported from
    # caosadvancedtools.loadFiles.

    def test_compile(self):
        # The example ignore file applied to the test data must leave exactly
        # these three files.
        files = compile_file_list(os.path.join(BASEDIR, "caosdbignore-example"),
                                  os.path.join(BASEDIR, "data"))
        assert len(files) == 3
        assert os.path.join(BASEDIR, "data", "datatypes.xlsx") in files
        assert os.path.join(BASEDIR, "data", "README.xlsx") in files
        assert os.path.join(
            BASEDIR, "data",
            "Publications/Posters/2019-02-03_something/README.md") in files

    def test_regex(self):
        # The second path contains regex meta characters that must be escaped.
        files = [r"/dies/ist/simple", r"/dies/eh(er)/nich?t"]
        regex = create_re_for_file_list(files, "/dies", "/dies")
        assert re.match(regex, files[0]) is not None
        assert re.match(regex, files[1]) is not None
        # parent directories of the files match as well
        assert re.match(regex, "/dies/ist") is not None
        assert re.match(regex, "/die") is None
        assert re.match(regex, files[0]+files[1]) is None
        assert re.match(regex, "d") is None

    def test_combine(self):
        sources = []
        for _ in range(2):
            with NamedTemporaryFile(delete=False, mode="w") as tmp:
                tmp.write("ha")
            # delete=False keeps the file on disk; remove it after the test
            # no matter how the test ends.
            self.addCleanup(os.remove, tmp.name)
            sources.append(tmp.name)
        fi_new = combine_ignore_files(sources[0], sources[1])
        self.addCleanup(os.remove, fi_new)
        with open(fi_new, "r") as fi:
            assert "haha" == fi.read()
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment