From dc8ba2949a8a3651cfa0a0146bd22adc3883bcda Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Henrik=20tom=20W=C3=B6rden?= <h.tomwoerden@indiscale.com> Date: Wed, 7 Sep 2022 13:39:41 +0200 Subject: [PATCH] ENH: add caosdbignore file to loadfiles --- integrationtests/caosdbignore | 2 + .../2019-02-03_v0.1/~README.md | 1 + .../extroot/Software/2010_TestSoftware/lol | 2 + integrationtests/filldb.sh | 2 +- integrationtests/test_crawler_with_cfoods.py | 9 + src/caosadvancedtools/loadFiles.py | 167 ++++++++++++++---- unittests/caosdbignore-example | 2 + .../2019-02-03_something/.caosdbignore | 1 + unittests/test_caosdbignore.py | 68 +++++++ 9 files changed, 219 insertions(+), 35 deletions(-) create mode 100644 integrationtests/caosdbignore create mode 100644 integrationtests/extroot/Software/2010_TestSoftware/2019-02-03_v0.1/~README.md create mode 100644 integrationtests/extroot/Software/2010_TestSoftware/lol create mode 100644 unittests/caosdbignore-example create mode 100644 unittests/data/Publications/Posters/2019-02-03_something/.caosdbignore create mode 100644 unittests/test_caosdbignore.py diff --git a/integrationtests/caosdbignore b/integrationtests/caosdbignore new file mode 100644 index 00000000..84b36362 --- /dev/null +++ b/integrationtests/caosdbignore @@ -0,0 +1,2 @@ +lol +~README.md diff --git a/integrationtests/extroot/Software/2010_TestSoftware/2019-02-03_v0.1/~README.md b/integrationtests/extroot/Software/2010_TestSoftware/2019-02-03_v0.1/~README.md new file mode 100644 index 00000000..f2e41136 --- /dev/null +++ b/integrationtests/extroot/Software/2010_TestSoftware/2019-02-03_v0.1/~README.md @@ -0,0 +1 @@ +stuff diff --git a/integrationtests/extroot/Software/2010_TestSoftware/lol b/integrationtests/extroot/Software/2010_TestSoftware/lol new file mode 100644 index 00000000..afd4a162 --- /dev/null +++ b/integrationtests/extroot/Software/2010_TestSoftware/lol @@ -0,0 +1,2 @@ +lol + diff --git a/integrationtests/filldb.sh b/integrationtests/filldb.sh index 9f55365e..9ee9060e 100755 --- a/integrationtests/filldb.sh +++ b/integrationtests/filldb.sh @@ -5,7 +5,7 @@ python3 -m caosadvancedtools.loadFiles /opt/caosdb/mnt/extroot/ExperimentalData python3 -m caosadvancedtools.loadFiles /opt/caosdb/mnt/extroot/DataAnalysis python3 -m caosadvancedtools.loadFiles /opt/caosdb/mnt/extroot/SimulationData python3 -m caosadvancedtools.loadFiles /opt/caosdb/mnt/extroot/Publications -python3 -m caosadvancedtools.loadFiles /opt/caosdb/mnt/extroot/Software +python3 -m caosadvancedtools.loadFiles -c caosdbignore -l $PWD/extroot/Software /opt/caosdb/mnt/extroot/Software python3 insert_model.py python3 insert_some.py python3 crawl.py / diff --git a/integrationtests/test_crawler_with_cfoods.py b/integrationtests/test_crawler_with_cfoods.py index 4efef87c..19b1f8ff 100755 --- a/integrationtests/test_crawler_with_cfoods.py +++ b/integrationtests/test_crawler_with_cfoods.py @@ -33,6 +33,15 @@ def get_entity_with_id(eid): return db.execute_query("FIND "+str(eid), unique=True) +class LoadFilesTest(unittest.TestCase): + def test_lol(self): + # check whether ignored files were insered + res = db.execute_query("FIND FILE WHICH IS STORED AT '**/lol'") + assert len(res) == 0 + res = db.execute_query("FIND FILE WHICH IS STORED AT '**/~README.md'") + assert len(res) == 0 + + class CrawlerTest(unittest.TestCase): def test_experiment(self): diff --git a/src/caosadvancedtools/loadFiles.py b/src/caosadvancedtools/loadFiles.py index 445374b3..ffeda159 100755 --- a/src/caosadvancedtools/loadFiles.py +++ b/src/caosadvancedtools/loadFiles.py @@ -25,10 +25,14 @@ import argparse import logging +import os import math import sys +import re from argparse import ArgumentParser +from tempfile import NamedTemporaryFile +import shutil import caosdb as db logger = logging.getLogger(__name__) @@ -46,37 +50,113 @@ def convert_size(size): return '%s %s' % (s, size_name[i]) -def loadpath(path, include, exclude, prefix, dryrun, forceAllowSymlinks): - - if dryrun: - logger.info("Performin a dryrun!") - files = db.Container().retrieve( - unique=False, - raise_exception_on_error=True, - flags={"InsertFilesInDir": ("-p " + prefix + " " if prefix else "") - + ("-e " + exclude + " " if exclude else "") - + ("-i " + include + " " if include else "") - + ("--force-allow-symlinks " if forceAllowSymlinks else "") - + path}) +def combine_ignore_files(caosdbignore, localignore, dirname=None): + tmp = NamedTemporaryFile(delete=False, mode="w", dir=dirname, prefix=".caosdbignore") + with open(caosdbignore, "r") as base: + tmp.write(base.read()) + with open(localignore, "r") as local: + tmp.write(local.read()) + tmp.close() + return tmp.name + + +def compile_file_list(caosdbignore, localpath): + from gitignore_parser import parse_gitignore + matches = parse_gitignore(caosdbignore) + current_ignore = caosdbignore + non_ignored_files = [] + ignore_files = [] + for root, dirs, files in os.walk(localpath): + while len(ignore_files) > 0 and not root.startswith(ignore_files[-1][0]): + shutil.os.remove(ignore_files[-1][1]) + ignore_files.pop() + if len(ignore_files) > 0: + current_ignore = ignore_files[-1][1] + matches = parse_gitignore(current_ignore) + else: + current_ignore = caosdbignore + matches = parse_gitignore(current_ignore) + if ".caosdbignore" in files: + current_ignore = combine_ignore_files(current_ignore, + os.path.join(root, ".caosdbignore"), + # due to the logic of gitignore_parser the file + # has to be written to this folder + dirname=root) + ignore_files.append((root, current_ignore)) + matches = parse_gitignore(current_ignore) + for fi in files: + fullpath = os.path.join(root, fi) + if not matches(fullpath): + non_ignored_files.append(fullpath) + return non_ignored_files + + +def create_re_for_file_list(files, localroot, remoteroot): + regexp = "" + for fi in files: + path = fi + reg = "" + while path != localroot and path != "/" and path != "": + print(path, localroot) + reg = "(/"+re.escape(os.path.basename(path)) + reg + ")?" + path = os.path.dirname(path) + regexp += "|" + re.escape(remoteroot) + reg + return "^("+regexp[1:]+")$" + + +def loadpath(path, include, exclude, prefix, dryrun, forceAllowSymlinks, caosdbignore=None, + localpath=None): + + if caosdbignore: + filelist = compile_file_list(caosdbignore, localpath) + for fi in filelist: + print(fi) + + fulllist = filelist + + index = 0 + step_size = 3 + includes = [] + while index < len(fulllist): + subset = fulllist[index:min(index+step_size, len(fulllist))] + includes.append(create_re_for_file_list(subset, localpath, path)) + index += step_size else: - # new files (inserting them using the insertFilesInDir feature of - # the server, which inserts files via symlinks) - files = db.Container().insert( - unique=False, - raise_exception_on_error=True, - flags={"InsertFilesInDir": ("-p " + prefix + " " if prefix else "") - + ("-e " + exclude + " " if exclude else "") - + ("-i " + include + " " if include else "") - + ("--force-allow-symlinks " if forceAllowSymlinks else "") - + path}) - - totalsize = 0 # collecting total size of all new files - - for f in files: - totalsize += f.size - - logger.info("Made in total {} new files with a combined size of {} " - "accessible.".format(len(files), convert_size(totalsize))) + includes = [include] + + for include in includes: + print(include) + if dryrun: + logger.info("Performin a dryrun!") + files = db.Container().retrieve( + unique=False, + raise_exception_on_error=True, + flags={"InsertFilesInDir": ("-p " + prefix + " " if prefix else "") + + ("-e " + exclude + " " if exclude else "") + + ("-i " + include + " " if include else "") + + ("--force-allow-symlinks " if forceAllowSymlinks else "") + + path}) + else: + # new files (inserting them using the insertFilesInDir feature of + # the server, which inserts files via symlinks) + files = db.Container().insert( + unique=False, + raise_exception_on_error=True, + flags={"InsertFilesInDir": ("-p " + prefix + " " if prefix else "") + + ("-e " + exclude + " " if exclude else "") + + ("-i " + include + " " if include else "") + + ("--force-allow-symlinks " if forceAllowSymlinks else "") + + path}) + + totalsize = 0 # collecting total size of all new files + + for f in files: + totalsize += f.size + + print("Made in total {} new files with a combined size of {} " + "accessible.".format(len(files), convert_size(totalsize))) + logger.info("Made in total {} new files with a combined size of {} " + "accessible.".format(len(files), convert_size(totalsize))) return @@ -91,6 +171,18 @@ def main(argv=None): # Setup argument parser parser = ArgumentParser() + parser.add_argument("-p", "--prefix", dest="prefix", + help="store files with this prefix into the server's" + " file system.") + parser.add_argument("-c", "--caosdbignore", help=""" +Path to a caosdbignore file that defines which files shall be included and which do not. +The syntax is the same as in a gitignore file. You must also provide the localpath option +since the check is done locally. +""" + ) + parser.add_argument("-l", "--localpath", help="Path to the root directory on this machine. " + "This is needed if a caosdbignore file is used since the check is done " + "locally") parser.add_argument("-i", "--include", dest="include", help=""" only include paths matching this regex pattern. @@ -104,9 +196,6 @@ exclude is given preference over include. parser.add_argument("-e", "--exclude", dest="exclude", help="exclude paths matching this regex pattern.", metavar="RE") - parser.add_argument("-p", "--prefix", dest="prefix", - help="store files with this prefix into the server's" - " file system.") parser.add_argument("-d", "--dry-run", dest="dryrun", action="store_true", help="Just simulate the insertion of the files.") parser.add_argument('-t', '--timeout', dest="timeout", @@ -127,6 +216,15 @@ exclude is given preference over include. "directory tree.", action="store_true") args = parser.parse_args() + if args.caosdbignore and (args.exclude or args.include): + raise ValueError("Do not use a caosdbignore file and in- or exclude simultaneously!") + + if args.caosdbignore and not args.localpath: + raise ValueError("To use caosdbignore you must supply a local path!") + + if args.localpath and (args.exclude or args.include): + raise ValueError("Do not use a localpath and in- or exclude simultaneously!") + con = db.get_connection() con.timeout = float(args.timeout) con._login() @@ -137,8 +235,9 @@ exclude is given preference over include. exclude=args.exclude, prefix=args.prefix, dryrun=args.dryrun, - forceAllowSymlinks=args.forceAllowSymlinks, + caosdbignore=args.caosdbignore, + localpath=args.localpath, ) return 0 diff --git a/unittests/caosdbignore-example b/unittests/caosdbignore-example new file mode 100644 index 00000000..04500046 --- /dev/null +++ b/unittests/caosdbignore-example @@ -0,0 +1,2 @@ +README.md +.caosdbignore* diff --git a/unittests/data/Publications/Posters/2019-02-03_something/.caosdbignore b/unittests/data/Publications/Posters/2019-02-03_something/.caosdbignore new file mode 100644 index 00000000..43b9cb20 --- /dev/null +++ b/unittests/data/Publications/Posters/2019-02-03_something/.caosdbignore @@ -0,0 +1 @@ +!README.md diff --git a/unittests/test_caosdbignore.py b/unittests/test_caosdbignore.py new file mode 100644 index 00000000..9394bf0c --- /dev/null +++ b/unittests/test_caosdbignore.py @@ -0,0 +1,68 @@ +#!/usr/bin/env python3 +# encoding: utf-8 +# +# ** header v3.0 +# This file is a part of the CaosDB Project. +# +# Copyright (C) 2022 Indiscale GmbH <info@indiscale.com> +# Copyright (C) 2022 Henrik tom Wörden <h.tomwoerden@indiscale.com> +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <https://www.gnu.org/licenses/>. +# +# ** end header +# + + +import os +import re +from tempfile import NamedTemporaryFile +import unittest + +from caosadvancedtools.loadFiles import compile_file_list, create_re_for_file_list, combine_ignore_files + +BASEDIR = os.path.dirname(os.path.realpath(__file__)) + + +class Caosdbignore(unittest.TestCase): + def setUp(self): + pass + + def test_compile(self): + files = compile_file_list(os.path.join(BASEDIR, "caosdbignore-example"), + os.path.join(BASEDIR, "data")) + assert len(files) == 3 + assert os.path.join(BASEDIR, "data", "datatypes.xlsx") in files + assert os.path.join(BASEDIR, "data", "README.xlsx") in files + assert os.path.join(BASEDIR, "data", "Publications/Posters/2019-02-03_something/README.md") in files + + def test_regex(self): + files = [r"/dies/ist/simple", r"/dies/eh(er)/nich?t"] + regex = create_re_for_file_list(files, "/dies", "/dies") + assert re.match(regex, files[0]) is not None + assert re.match(regex, files[1]) is not None + assert re.match(regex, "/dies/ist") is not None + assert re.match(regex, "/die") is None + assert re.match(regex, files[0]+files[1]) is None + assert re.match(regex, "d") is None + + def test_combine(self): + fi1 = NamedTemporaryFile(delete=False, mode="w") + fi1.write("ha") + fi1.close() + fi2 = NamedTemporaryFile(delete=False, mode="w") + fi2.write("ha") + fi2.close() + fi_new = combine_ignore_files(fi1.name, fi2.name) + with open(fi_new, "r") as fi: + assert "haha" == fi.read() -- GitLab