From dc8ba2949a8a3651cfa0a0146bd22adc3883bcda Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Henrik=20tom=20W=C3=B6rden?= <h.tomwoerden@indiscale.com>
Date: Wed, 7 Sep 2022 13:39:41 +0200
Subject: [PATCH] ENH: add caosdbignore file to loadfiles

---
 integrationtests/caosdbignore                 |   2 +
 .../2019-02-03_v0.1/~README.md                |   1 +
 .../extroot/Software/2010_TestSoftware/lol    |   2 +
 integrationtests/filldb.sh                    |   2 +-
 integrationtests/test_crawler_with_cfoods.py  |   9 +
 src/caosadvancedtools/loadFiles.py            | 167 ++++++++++++++----
 unittests/caosdbignore-example                |   2 +
 .../2019-02-03_something/.caosdbignore        |   1 +
 unittests/test_caosdbignore.py                |  68 +++++++
 9 files changed, 219 insertions(+), 35 deletions(-)
 create mode 100644 integrationtests/caosdbignore
 create mode 100644 integrationtests/extroot/Software/2010_TestSoftware/2019-02-03_v0.1/~README.md
 create mode 100644 integrationtests/extroot/Software/2010_TestSoftware/lol
 create mode 100644 unittests/caosdbignore-example
 create mode 100644 unittests/data/Publications/Posters/2019-02-03_something/.caosdbignore
 create mode 100644 unittests/test_caosdbignore.py

diff --git a/integrationtests/caosdbignore b/integrationtests/caosdbignore
new file mode 100644
index 00000000..84b36362
--- /dev/null
+++ b/integrationtests/caosdbignore
@@ -0,0 +1,2 @@
+lol
+~README.md
diff --git a/integrationtests/extroot/Software/2010_TestSoftware/2019-02-03_v0.1/~README.md b/integrationtests/extroot/Software/2010_TestSoftware/2019-02-03_v0.1/~README.md
new file mode 100644
index 00000000..f2e41136
--- /dev/null
+++ b/integrationtests/extroot/Software/2010_TestSoftware/2019-02-03_v0.1/~README.md
@@ -0,0 +1 @@
+stuff
diff --git a/integrationtests/extroot/Software/2010_TestSoftware/lol b/integrationtests/extroot/Software/2010_TestSoftware/lol
new file mode 100644
index 00000000..afd4a162
--- /dev/null
+++ b/integrationtests/extroot/Software/2010_TestSoftware/lol
@@ -0,0 +1,2 @@
+lol
+
diff --git a/integrationtests/filldb.sh b/integrationtests/filldb.sh
index 9f55365e..9ee9060e 100755
--- a/integrationtests/filldb.sh
+++ b/integrationtests/filldb.sh
@@ -5,7 +5,7 @@ python3 -m caosadvancedtools.loadFiles /opt/caosdb/mnt/extroot/ExperimentalData
 python3 -m caosadvancedtools.loadFiles /opt/caosdb/mnt/extroot/DataAnalysis
 python3 -m caosadvancedtools.loadFiles /opt/caosdb/mnt/extroot/SimulationData
 python3 -m caosadvancedtools.loadFiles /opt/caosdb/mnt/extroot/Publications
-python3 -m caosadvancedtools.loadFiles /opt/caosdb/mnt/extroot/Software
+python3 -m caosadvancedtools.loadFiles -c caosdbignore -l $PWD/extroot/Software /opt/caosdb/mnt/extroot/Software
 python3 insert_model.py 
 python3 insert_some.py
 python3 crawl.py /
diff --git a/integrationtests/test_crawler_with_cfoods.py b/integrationtests/test_crawler_with_cfoods.py
index 4efef87c..19b1f8ff 100755
--- a/integrationtests/test_crawler_with_cfoods.py
+++ b/integrationtests/test_crawler_with_cfoods.py
@@ -33,6 +33,15 @@ def get_entity_with_id(eid):
     return db.execute_query("FIND "+str(eid), unique=True)
 
 
+class LoadFilesTest(unittest.TestCase):
+    def test_lol(self):
+        # check whether ignored files were inserted
+        res = db.execute_query("FIND FILE WHICH IS STORED AT '**/lol'")
+        assert len(res) == 0
+        res = db.execute_query("FIND FILE WHICH IS STORED AT '**/~README.md'")
+        assert len(res) == 0
+
+
 class CrawlerTest(unittest.TestCase):
     def test_experiment(self):
 
diff --git a/src/caosadvancedtools/loadFiles.py b/src/caosadvancedtools/loadFiles.py
index 445374b3..ffeda159 100755
--- a/src/caosadvancedtools/loadFiles.py
+++ b/src/caosadvancedtools/loadFiles.py
@@ -25,10 +25,14 @@
 
 import argparse
 import logging
+import os
 import math
 import sys
+import re
 from argparse import ArgumentParser
+from tempfile import NamedTemporaryFile
 
+import shutil
 import caosdb as db
 
 logger = logging.getLogger(__name__)
@@ -46,37 +50,113 @@ def convert_size(size):
     return '%s %s' % (s, size_name[i])
 
 
-def loadpath(path, include, exclude, prefix, dryrun, forceAllowSymlinks):
-
-    if dryrun:
-        logger.info("Performin a dryrun!")
-        files = db.Container().retrieve(
-            unique=False,
-            raise_exception_on_error=True,
-            flags={"InsertFilesInDir": ("-p " + prefix + " " if prefix else "")
-                   + ("-e " + exclude + " " if exclude else "")
-                   + ("-i " + include + " " if include else "")
-                   + ("--force-allow-symlinks " if forceAllowSymlinks else "")
-                   + path})
+def combine_ignore_files(caosdbignore, localignore, dirname=None):
+    tmp = NamedTemporaryFile(delete=False, mode="w", dir=dirname, prefix=".caosdbignore")
+    with open(caosdbignore, "r") as base:
+        tmp.write(base.read())
+    with open(localignore, "r") as local:
+        tmp.write(local.read())
+    tmp.close()
+    return tmp.name
+
+
+def compile_file_list(caosdbignore, localpath):
+    from gitignore_parser import parse_gitignore
+    matches = parse_gitignore(caosdbignore)
+    current_ignore = caosdbignore
+    non_ignored_files = []
+    ignore_files = []
+    for root, dirs, files in os.walk(localpath):
+        while len(ignore_files) > 0 and not root.startswith(ignore_files[-1][0]):
+            shutil.os.remove(ignore_files[-1][1])
+            ignore_files.pop()
+        if len(ignore_files) > 0:
+            current_ignore = ignore_files[-1][1]
+            matches = parse_gitignore(current_ignore)
+        else:
+            current_ignore = caosdbignore
+            matches = parse_gitignore(current_ignore)
+        if ".caosdbignore" in files:
+            current_ignore = combine_ignore_files(current_ignore,
+                                                  os.path.join(root, ".caosdbignore"),
+                                                  # due to the logic of gitignore_parser the file
+                                                  # has to be written to this folder
+                                                  dirname=root)
+            ignore_files.append((root, current_ignore))
+            matches = parse_gitignore(current_ignore)
+        for fi in files:
+            fullpath = os.path.join(root, fi)
+            if not matches(fullpath):
+                non_ignored_files.append(fullpath)
+    return non_ignored_files
+
+
+def create_re_for_file_list(files, localroot, remoteroot):
+    regexp = ""
+    for fi in files:
+        path = fi
+        reg = ""
+        while path != localroot and path != "/" and path != "":
+            print(path, localroot)
+            reg = "(/"+re.escape(os.path.basename(path)) + reg + ")?"
+            path = os.path.dirname(path)
+        regexp += "|" + re.escape(remoteroot) + reg
+    return "^("+regexp[1:]+")$"
+
+
+def loadpath(path, include, exclude, prefix, dryrun, forceAllowSymlinks, caosdbignore=None,
+             localpath=None):
+
+    if caosdbignore:
+        filelist = compile_file_list(caosdbignore, localpath)
+        for fi in filelist:
+            print(fi)
+
+        fulllist = filelist
+
+        index = 0
+        step_size = 3
+        includes = []
+        while index < len(fulllist):
+            subset = fulllist[index:min(index+step_size, len(fulllist))]
+            includes.append(create_re_for_file_list(subset, localpath, path))
+            index += step_size
     else:
-        # new files (inserting them using the insertFilesInDir feature of
-        # the server, which inserts files via symlinks)
-        files = db.Container().insert(
-            unique=False,
-            raise_exception_on_error=True,
-            flags={"InsertFilesInDir": ("-p " + prefix + " " if prefix else "")
-                   + ("-e " + exclude + " " if exclude else "")
-                   + ("-i " + include + " " if include else "")
-                   + ("--force-allow-symlinks " if forceAllowSymlinks else "")
-                   + path})
-
-    totalsize = 0  # collecting total size of all new files
-
-    for f in files:
-        totalsize += f.size
-
-    logger.info("Made in total {} new files with a combined size of {} "
-                "accessible.".format(len(files), convert_size(totalsize)))
+        includes = [include]
+
+    for include in includes:
+        print(include)
+        if dryrun:
+            logger.info("Performin a dryrun!")
+            files = db.Container().retrieve(
+                unique=False,
+                raise_exception_on_error=True,
+                flags={"InsertFilesInDir": ("-p " + prefix + " " if prefix else "")
+                       + ("-e " + exclude + " " if exclude else "")
+                       + ("-i " + include + " " if include else "")
+                       + ("--force-allow-symlinks " if forceAllowSymlinks else "")
+                       + path})
+        else:
+            # new files (inserting them using the insertFilesInDir feature of
+            # the server, which inserts files via symlinks)
+            files = db.Container().insert(
+                unique=False,
+                raise_exception_on_error=True,
+                flags={"InsertFilesInDir": ("-p " + prefix + " " if prefix else "")
+                       + ("-e " + exclude + " " if exclude else "")
+                       + ("-i " + include + " " if include else "")
+                       + ("--force-allow-symlinks " if forceAllowSymlinks else "")
+                       + path})
+
+        totalsize = 0  # collecting total size of all new files
+
+        for f in files:
+            totalsize += f.size
+
+        print("Made in total {} new files with a combined size of {} "
+              "accessible.".format(len(files), convert_size(totalsize)))
+        logger.info("Made in total {} new files with a combined size of {} "
+                    "accessible.".format(len(files), convert_size(totalsize)))
 
     return
 
@@ -91,6 +171,18 @@ def main(argv=None):
 
     # Setup argument parser
     parser = ArgumentParser()
+    parser.add_argument("-p", "--prefix", dest="prefix",
+                        help="store files with this prefix into the server's"
+                        " file system.")
+    parser.add_argument("-c", "--caosdbignore", help="""
+Path to a caosdbignore file that defines which files shall be included and which do not.
+The syntax is the same as in a gitignore file. You must also provide the localpath option
+since the check is done locally.
+"""
+                        )
+    parser.add_argument("-l", "--localpath", help="Path to the root directory on this machine. "
+                        "This is needed if a caosdbignore file is used since the check is done "
+                        "locally")
     parser.add_argument("-i", "--include", dest="include",
                         help="""
 only include paths matching this regex pattern.
@@ -104,9 +196,6 @@ exclude is given preference over include.
     parser.add_argument("-e", "--exclude", dest="exclude",
                         help="exclude paths matching this regex pattern.",
                         metavar="RE")
-    parser.add_argument("-p", "--prefix", dest="prefix",
-                        help="store files with this prefix into the server's"
-                        " file system.")
     parser.add_argument("-d", "--dry-run", dest="dryrun", action="store_true",
                         help="Just simulate the insertion of the files.")
     parser.add_argument('-t', '--timeout', dest="timeout",
@@ -127,6 +216,15 @@ exclude is given preference over include.
                         "directory tree.", action="store_true")
     args = parser.parse_args()
 
+    if args.caosdbignore and (args.exclude or args.include):
+        raise ValueError("Do not use a caosdbignore file and in- or exclude simultaneously!")
+
+    if args.caosdbignore and not args.localpath:
+        raise ValueError("To use caosdbignore you must supply a local path!")
+
+    if args.localpath and (args.exclude or args.include):
+        raise ValueError("Do not use a localpath and in- or exclude simultaneously!")
+
     con = db.get_connection()
     con.timeout = float(args.timeout)
     con._login()
@@ -137,8 +235,9 @@ exclude is given preference over include.
         exclude=args.exclude,
         prefix=args.prefix,
         dryrun=args.dryrun,
-
         forceAllowSymlinks=args.forceAllowSymlinks,
+        caosdbignore=args.caosdbignore,
+        localpath=args.localpath,
     )
 
     return 0
diff --git a/unittests/caosdbignore-example b/unittests/caosdbignore-example
new file mode 100644
index 00000000..04500046
--- /dev/null
+++ b/unittests/caosdbignore-example
@@ -0,0 +1,2 @@
+README.md
+.caosdbignore*
diff --git a/unittests/data/Publications/Posters/2019-02-03_something/.caosdbignore b/unittests/data/Publications/Posters/2019-02-03_something/.caosdbignore
new file mode 100644
index 00000000..43b9cb20
--- /dev/null
+++ b/unittests/data/Publications/Posters/2019-02-03_something/.caosdbignore
@@ -0,0 +1 @@
+!README.md
diff --git a/unittests/test_caosdbignore.py b/unittests/test_caosdbignore.py
new file mode 100644
index 00000000..9394bf0c
--- /dev/null
+++ b/unittests/test_caosdbignore.py
@@ -0,0 +1,68 @@
+#!/usr/bin/env python3
+# encoding: utf-8
+#
+# ** header v3.0
+# This file is a part of the CaosDB Project.
+#
+# Copyright (C) 2022 Indiscale GmbH <info@indiscale.com>
+# Copyright (C) 2022 Henrik tom Wörden <h.tomwoerden@indiscale.com>
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <https://www.gnu.org/licenses/>.
+#
+# ** end header
+#
+
+
+import os
+import re
+from tempfile import NamedTemporaryFile
+import unittest
+
+from caosadvancedtools.loadFiles import compile_file_list, create_re_for_file_list, combine_ignore_files
+
+BASEDIR = os.path.dirname(os.path.realpath(__file__))
+
+
+class Caosdbignore(unittest.TestCase):
+    def setUp(self):
+        pass
+
+    def test_compile(self):
+        files = compile_file_list(os.path.join(BASEDIR, "caosdbignore-example"),
+                                  os.path.join(BASEDIR, "data"))
+        assert len(files) == 3
+        assert os.path.join(BASEDIR, "data", "datatypes.xlsx") in files
+        assert os.path.join(BASEDIR, "data", "README.xlsx") in files
+        assert os.path.join(BASEDIR, "data", "Publications/Posters/2019-02-03_something/README.md") in files
+
+    def test_regex(self):
+        files = [r"/dies/ist/simple", r"/dies/eh(er)/nich?t"]
+        regex = create_re_for_file_list(files, "/dies", "/dies")
+        assert re.match(regex, files[0]) is not None
+        assert re.match(regex, files[1]) is not None
+        assert re.match(regex, "/dies/ist") is not None
+        assert re.match(regex, "/die") is None
+        assert re.match(regex, files[0]+files[1]) is None
+        assert re.match(regex, "d") is None
+
+    def test_combine(self):
+        fi1 = NamedTemporaryFile(delete=False, mode="w")
+        fi1.write("ha")
+        fi1.close()
+        fi2 = NamedTemporaryFile(delete=False, mode="w")
+        fi2.write("ha")
+        fi2.close()
+        fi_new = combine_ignore_files(fi1.name, fi2.name)
+        with open(fi_new, "r") as fi:
+            assert "haha" == fi.read()
-- 
GitLab