Skip to content
Snippets Groups Projects
Commit dc8ba294 authored by Henrik tom Wörden's avatar Henrik tom Wörden
Browse files

ENH: add caosdbignore file to loadfiles

parent 3c8e6ded
No related branches found
No related tags found
2 merge requests!59FIX: if multiple updates for one entity exist, the retrieve would result in an...,!55ENH: include caosdbignore
Pipeline #28018 failed
lol
~README.md
stuff
lol
...@@ -5,7 +5,7 @@ python3 -m caosadvancedtools.loadFiles /opt/caosdb/mnt/extroot/ExperimentalData ...@@ -5,7 +5,7 @@ python3 -m caosadvancedtools.loadFiles /opt/caosdb/mnt/extroot/ExperimentalData
python3 -m caosadvancedtools.loadFiles /opt/caosdb/mnt/extroot/DataAnalysis python3 -m caosadvancedtools.loadFiles /opt/caosdb/mnt/extroot/DataAnalysis
python3 -m caosadvancedtools.loadFiles /opt/caosdb/mnt/extroot/SimulationData python3 -m caosadvancedtools.loadFiles /opt/caosdb/mnt/extroot/SimulationData
python3 -m caosadvancedtools.loadFiles /opt/caosdb/mnt/extroot/Publications python3 -m caosadvancedtools.loadFiles /opt/caosdb/mnt/extroot/Publications
python3 -m caosadvancedtools.loadFiles /opt/caosdb/mnt/extroot/Software python3 -m caosadvancedtools.loadFiles -c caosdbignore -l $PWD/extroot/Software /opt/caosdb/mnt/extroot/Software
python3 insert_model.py python3 insert_model.py
python3 insert_some.py python3 insert_some.py
python3 crawl.py / python3 crawl.py /
...@@ -33,6 +33,15 @@ def get_entity_with_id(eid): ...@@ -33,6 +33,15 @@ def get_entity_with_id(eid):
return db.execute_query("FIND "+str(eid), unique=True) return db.execute_query("FIND "+str(eid), unique=True)
class LoadFilesTest(unittest.TestCase):
    """Integration checks for files that were loaded with an ignore file."""

    def test_lol(self):
        """Query the server for the ignored files and expect no hits."""
        # check whether ignored files were inserted
        queries = (
            "FIND FILE WHICH IS STORED AT '**/lol'",
            "FIND FILE WHICH IS STORED AT '**/~README.md'",
        )
        for query in queries:
            found = db.execute_query(query)
            assert len(found) == 0
class CrawlerTest(unittest.TestCase): class CrawlerTest(unittest.TestCase):
def test_experiment(self): def test_experiment(self):
......
...@@ -25,10 +25,14 @@ ...@@ -25,10 +25,14 @@
import argparse import argparse
import logging import logging
import os
import math import math
import sys import sys
import re
from argparse import ArgumentParser from argparse import ArgumentParser
from tempfile import NamedTemporaryFile
import shutil
import caosdb as db import caosdb as db
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
...@@ -46,8 +50,82 @@ def convert_size(size): ...@@ -46,8 +50,82 @@ def convert_size(size):
return '%s %s' % (s, size_name[i]) return '%s %s' % (s, size_name[i])
def combine_ignore_files(caosdbignore, localignore, dirname=None):
    """Concatenate two ignore files into a newly created temporary file.

    The content of ``caosdbignore`` is written first, directly followed by the
    content of ``localignore``.  Both are copied verbatim: no separator is
    inserted, so the first file should end with a newline if its last rule
    must not merge with the first rule of the second file.

    Parameters
    ----------
    caosdbignore : str
        Path to the base ignore file.
    localignore : str
        Path to the ignore file whose rules are appended.
    dirname : str, optional
        Directory in which the temporary file is created.  ``None`` lets
        ``tempfile`` choose its default directory.

    Returns
    -------
    str
        Path of the combined file.  Its name starts with ``.caosdbignore``.
        The file is NOT deleted automatically; the caller has to remove it.

    Raises
    ------
    OSError
        If one of the input files cannot be read.  The partially written
        temporary file is removed before the error is propagated.
    """
    tmp = NamedTemporaryFile(delete=False, mode="w", dir=dirname, prefix=".caosdbignore")
    try:
        # ``with tmp`` guarantees that the handle is closed, even on errors.
        with tmp:
            for source in (caosdbignore, localignore):
                with open(source, "r") as handle:
                    tmp.write(handle.read())
    except Exception:
        # Do not leave a half-written ignore file behind.
        os.remove(tmp.name)
        raise
    return tmp.name
def compile_file_list(caosdbignore, localpath):
    """Return all files below ``localpath`` that are not ignored.

    The directory tree is walked top-down.  The rules of ``caosdbignore``
    apply everywhere.  Whenever a directory contains a file named
    ``.caosdbignore``, its rules are appended to the rules currently in effect
    and the combination applies to that directory and everything below it.

    Parameters
    ----------
    caosdbignore : str
        Path to the base ignore file (parsed with ``gitignore_parser``).
    localpath : str
        Root directory of the tree that is searched.

    Returns
    -------
    list of str
        Paths (``os.path.join`` of the walked directory and the file name) of
        all files that do not match the applicable ignore rules.

    Notes
    -----
    Combined ignore files are written as temporary files into the directories
    that contain a ``.caosdbignore`` file.  All of them are removed again
    before this function returns, also if an error occurs.
    """
    from gitignore_parser import parse_gitignore

    non_ignored_files = []
    # Stack of (directory, combined temporary ignore file); the last entry
    # belongs to the innermost directory that has its own ``.caosdbignore``.
    ignore_files = []
    try:
        for root, _dirs, files in os.walk(localpath):
            # Drop combined files of directories that have been left.  The
            # comparison uses the directory plus a trailing separator so that
            # a sibling such as "/a/bc" is not mistaken for a child of "/a/b".
            while ignore_files:
                active_dir = ignore_files[-1][0]
                if root == active_dir or root.startswith(os.path.join(active_dir, "")):
                    break
                os.remove(ignore_files[-1][1])
                ignore_files.pop()

            current_ignore = ignore_files[-1][1] if ignore_files else caosdbignore

            if ".caosdbignore" in files:
                current_ignore = combine_ignore_files(current_ignore,
                                                      os.path.join(root, ".caosdbignore"),
                                                      # due to the logic of gitignore_parser the
                                                      # file has to be written to this folder
                                                      dirname=root)
                ignore_files.append((root, current_ignore))

            matches = parse_gitignore(current_ignore)
            for fi in files:
                fullpath = os.path.join(root, fi)
                if not matches(fullpath):
                    non_ignored_files.append(fullpath)
    finally:
        # Remove the temporary files that are still active at the end of the
        # walk (or after an error); otherwise they would litter the data tree.
        while ignore_files:
            _, leftover = ignore_files.pop()
            try:
                os.remove(leftover)
            except FileNotFoundError:
                pass

    return non_ignored_files
def create_re_for_file_list(files, localroot, remoteroot):
    """Build a regular expression that matches the given files on the remote side.

    For every file, the part of its path below ``localroot`` is appended to
    ``remoteroot``.  Each path component is wrapped in a nested optional group,
    so the expression also matches every parent directory of the file down to
    ``remoteroot`` itself.  The expressions of all files are joined as
    alternatives and anchored with ``^`` and ``$``.

    Parameters
    ----------
    files : iterable of str
        Local paths of the files that shall be matched.
    localroot : str
        Local directory that corresponds to ``remoteroot``.  A trailing "/" is
        ignored.
    remoteroot : str
        Path that replaces ``localroot`` in the resulting expression.

    Returns
    -------
    str
        The regular expression.  For an empty ``files`` it is ``"^()$"``.
    """
    # ``os.path.dirname`` never yields a trailing slash, so a root such as
    # "/data/" would never be reached by the walk below and the complete local
    # path would end up in the expression.
    localroot = localroot.rstrip("/") or localroot

    alternatives = []
    for fi in files:
        path = fi
        reg = ""
        # Walk from the file upwards and wrap what was collected so far into
        # the optional group of the parent component.
        while path != localroot and path != "/" and path != "":
            reg = "(/" + re.escape(os.path.basename(path)) + reg + ")?"
            path = os.path.dirname(path)
        alternatives.append(re.escape(remoteroot) + reg)

    return "^(" + "|".join(alternatives) + ")$"
def loadpath(path, include, exclude, prefix, dryrun, forceAllowSymlinks, caosdbignore=None,
localpath=None):
if caosdbignore:
filelist = compile_file_list(caosdbignore, localpath)
for fi in filelist:
print(fi)
fulllist = filelist
index = 0
step_size = 3
includes = []
while index < len(fulllist):
subset = fulllist[index:min(index+step_size, len(fulllist))]
includes.append(create_re_for_file_list(subset, localpath, path))
index += step_size
else:
includes = [include]
for include in includes:
print(include)
if dryrun: if dryrun:
logger.info("Performin a dryrun!") logger.info("Performin a dryrun!")
files = db.Container().retrieve( files = db.Container().retrieve(
...@@ -75,6 +153,8 @@ def loadpath(path, include, exclude, prefix, dryrun, forceAllowSymlinks): ...@@ -75,6 +153,8 @@ def loadpath(path, include, exclude, prefix, dryrun, forceAllowSymlinks):
for f in files: for f in files:
totalsize += f.size totalsize += f.size
print("Made in total {} new files with a combined size of {} "
"accessible.".format(len(files), convert_size(totalsize)))
logger.info("Made in total {} new files with a combined size of {} " logger.info("Made in total {} new files with a combined size of {} "
"accessible.".format(len(files), convert_size(totalsize))) "accessible.".format(len(files), convert_size(totalsize)))
...@@ -91,6 +171,18 @@ def main(argv=None): ...@@ -91,6 +171,18 @@ def main(argv=None):
# Setup argument parser # Setup argument parser
parser = ArgumentParser() parser = ArgumentParser()
parser.add_argument("-p", "--prefix", dest="prefix",
help="store files with this prefix into the server's"
" file system.")
parser.add_argument("-c", "--caosdbignore", help="""
Path to a caosdbignore file that defines which files shall be included and which do not.
The syntax is the same as in a gitignore file. You must also provide the localpath option
since the check is done locally.
"""
)
parser.add_argument("-l", "--localpath", help="Path to the root directory on this machine. "
"This is needed if a caosdbignore file is used since the check is done "
"locally")
parser.add_argument("-i", "--include", dest="include", parser.add_argument("-i", "--include", dest="include",
help=""" help="""
only include paths matching this regex pattern. only include paths matching this regex pattern.
...@@ -104,9 +196,6 @@ exclude is given preference over include. ...@@ -104,9 +196,6 @@ exclude is given preference over include.
parser.add_argument("-e", "--exclude", dest="exclude", parser.add_argument("-e", "--exclude", dest="exclude",
help="exclude paths matching this regex pattern.", help="exclude paths matching this regex pattern.",
metavar="RE") metavar="RE")
parser.add_argument("-p", "--prefix", dest="prefix",
help="store files with this prefix into the server's"
" file system.")
parser.add_argument("-d", "--dry-run", dest="dryrun", action="store_true", parser.add_argument("-d", "--dry-run", dest="dryrun", action="store_true",
help="Just simulate the insertion of the files.") help="Just simulate the insertion of the files.")
parser.add_argument('-t', '--timeout', dest="timeout", parser.add_argument('-t', '--timeout', dest="timeout",
...@@ -127,6 +216,15 @@ exclude is given preference over include. ...@@ -127,6 +216,15 @@ exclude is given preference over include.
"directory tree.", action="store_true") "directory tree.", action="store_true")
args = parser.parse_args() args = parser.parse_args()
if args.caosdbignore and (args.exclude or args.include):
raise ValueError("Do not use a caosdbignore file and in- or exclude simultaneously!")
if args.caosdbignore and not args.localpath:
raise ValueError("To use caosdbignore you must supply a local path!")
if args.localpath and (args.exclude or args.include):
raise ValueError("Do not use a localpath and in- or exclude simultaneously!")
con = db.get_connection() con = db.get_connection()
con.timeout = float(args.timeout) con.timeout = float(args.timeout)
con._login() con._login()
...@@ -137,8 +235,9 @@ exclude is given preference over include. ...@@ -137,8 +235,9 @@ exclude is given preference over include.
exclude=args.exclude, exclude=args.exclude,
prefix=args.prefix, prefix=args.prefix,
dryrun=args.dryrun, dryrun=args.dryrun,
forceAllowSymlinks=args.forceAllowSymlinks, forceAllowSymlinks=args.forceAllowSymlinks,
caosdbignore=args.caosdbignore,
localpath=args.localpath,
) )
return 0 return 0
......
README.md
.caosdbignore*
!README.md
#!/usr/bin/env python3
# encoding: utf-8
#
# ** header v3.0
# This file is a part of the CaosDB Project.
#
# Copyright (C) 2022 Indiscale GmbH <info@indiscale.com>
# Copyright (C) 2022 Henrik tom Wörden <h.tomwoerden@indiscale.com>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
#
# ** end header
#
import os
import re
from tempfile import NamedTemporaryFile
import unittest
from caosadvancedtools.loadFiles import compile_file_list, create_re_for_file_list, combine_ignore_files
# Directory that contains this test module (symlinks resolved); the test data
# used below is addressed relative to it.
BASEDIR = os.path.dirname(os.path.realpath(__file__))
class Caosdbignore(unittest.TestCase):
    """Unit tests for the caosdbignore helpers of ``caosadvancedtools.loadFiles``."""

    def test_compile(self):
        """Only the files not excluded by the example ignore file are listed."""
        files = compile_file_list(os.path.join(BASEDIR, "caosdbignore-example"),
                                  os.path.join(BASEDIR, "data"))
        assert len(files) == 3
        assert os.path.join(BASEDIR, "data", "datatypes.xlsx") in files
        assert os.path.join(BASEDIR, "data", "README.xlsx") in files
        assert os.path.join(BASEDIR, "data",
                            "Publications/Posters/2019-02-03_something/README.md") in files

    def test_regex(self):
        """The regex matches the files and their parent folders, nothing else."""
        # the second path contains regex meta characters that must be escaped
        files = [r"/dies/ist/simple", r"/dies/eh(er)/nich?t"]
        regex = create_re_for_file_list(files, "/dies", "/dies")
        assert re.match(regex, files[0]) is not None
        assert re.match(regex, files[1]) is not None
        assert re.match(regex, "/dies/ist") is not None
        assert re.match(regex, "/die") is None
        assert re.match(regex, files[0]+files[1]) is None
        assert re.match(regex, "d") is None

    def test_combine(self):
        """The combined file holds the content of both inputs in order."""
        sources = []
        for _ in range(2):
            with NamedTemporaryFile(delete=False, mode="w") as tmp:
                tmp.write("ha")
            # delete=False keeps the file on disk, so remove it after the test
            self.addCleanup(os.remove, tmp.name)
            sources.append(tmp.name)

        fi_new = combine_ignore_files(sources[0], sources[1])
        # the combined file is not removed by combine_ignore_files either
        self.addCleanup(os.remove, fi_new)
        with open(fi_new, "r") as fi:
            assert "haha" == fi.read()
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment