Skip to content
Snippets Groups Projects
Commit 9be86a63 authored by Florian Spreckelsen's avatar Florian Spreckelsen
Browse files

Merge branch 'f-caosdbignore' into 'dev'

ENH: include caosdbignore

See merge request !55
parents a2471d5e e9609364
Branches
Tags
2 merge requests!59FIX: if multiple updates for one entity exist, the retrieve would result in an...,!55ENH: include caosdbignore
Pipeline #28915 failed
......@@ -25,7 +25,7 @@ ADD https://gitlab.com/api/v4/projects/13656973/repository/branches/dev \
RUN git clone https://gitlab.com/caosdb/caosdb-pylib.git && \
cd caosdb-pylib && git checkout dev && pip3 install .
# At least recommonmark 0.6 required.
RUN pip3 install -U html2text pycodestyle pylint recommonmark sphinx-rtd-theme
RUN pip3 install -U html2text pycodestyle pylint recommonmark sphinx-rtd-theme gitignore-parser
COPY . /git
RUN rm -r /git/.git \
&& mv /git/.docker/pycaosdb.ini /git/integrationtests
......
......@@ -10,6 +10,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- Unauthorized inserts can now be cached. Note that the Crawler cannot postpone
inserts but the Cache has the functionality now.
- caosdbignore: You can add one or more `.caosdbignore` files to the directory
structure that you want to make available in CaosDB and then run loadFiles.
The syntax is that of `.gitignore` files. For more information see the `loadFiles`
section of the Crawler in the documentation.
### Changed ###
......
lol
~README.md
stuff
lol
......@@ -5,7 +5,7 @@ python3 -m caosadvancedtools.loadFiles /opt/caosdb/mnt/extroot/ExperimentalData
python3 -m caosadvancedtools.loadFiles /opt/caosdb/mnt/extroot/DataAnalysis
python3 -m caosadvancedtools.loadFiles /opt/caosdb/mnt/extroot/SimulationData
python3 -m caosadvancedtools.loadFiles /opt/caosdb/mnt/extroot/Publications
python3 -m caosadvancedtools.loadFiles /opt/caosdb/mnt/extroot/Software
python3 -m caosadvancedtools.loadFiles -c caosdbignore -l $PWD/extroot/Software /opt/caosdb/mnt/extroot/Software
python3 insert_model.py
python3 insert_some.py
python3 crawl.py /
......@@ -33,6 +33,15 @@ def get_entity_with_id(eid):
return db.execute_query("FIND "+str(eid), unique=True)
class LoadFilesTest(unittest.TestCase):
    """Checks the result of the loadFiles run against the server."""

    def test_lol(self):
        """Files named ``lol`` or ``~README.md`` must not exist on the server."""
        # Each query looks for a file that is expected to have been ignored
        # during loading; every one of them has to come back empty.
        queries_for_ignored_files = (
            "FIND FILE WHICH IS STORED AT '**/lol'",
            "FIND FILE WHICH IS STORED AT '**/~README.md'",
        )
        for query in queries_for_ignored_files:
            found = db.execute_query(query)
            assert len(found) == 0
class CrawlerTest(unittest.TestCase):
def test_experiment(self):
......
......@@ -162,6 +162,7 @@ def setup_package():
"xlrd>=2.0",
],
extras_require={"h5-crawler": ["h5py>=3.3.0", ],
"gitignore-parser ": ["gitignore-parser >=0.1.0", ],
},
packages=find_packages('src'),
package_dir={'': 'src'},
......
......@@ -25,10 +25,14 @@
import argparse
import logging
import os
import math
import sys
import re
from argparse import ArgumentParser
from tempfile import NamedTemporaryFile
import shutil
import caosdb as db
logger = logging.getLogger(__name__)
......@@ -46,8 +50,104 @@ def convert_size(size):
return '%s %s' % (s, size_name[i])
def loadpath(path, include, exclude, prefix, dryrun, forceAllowSymlinks):
def combine_ignore_files(caosdbignore, localignore, dirname=None):
    """Concatenate two ignore files into a new temporary file.

    The content of ``caosdbignore`` is written first, directly followed by
    the content of ``localignore``. No separator is inserted between the two
    parts.

    Parameters
    ----------
    caosdbignore : str
        Path of the base ignore file.
    localignore : str
        Path of the ignore file whose content is appended.
    dirname : str, optional
        Directory in which the combined file is created. If None, the
        default directory of the ``tempfile`` module is used.

    Returns
    -------
    str
        Path of the newly created file. Its name starts with
        ``.caosdbignore``. The file is NOT deleted automatically; the caller
        is responsible for removing it.
    """
    # The context manager guarantees that the handle is closed even when
    # reading one of the inputs fails; delete=False keeps the file on disk.
    with NamedTemporaryFile(delete=False, mode="w",
                            dir=dirname, prefix=".caosdbignore") as tmp:
        for source in (caosdbignore, localignore):
            with open(source, "r") as handle:
                tmp.write(handle.read())

    return tmp.name
def _is_in_subtree(path, top):
    """Return True if ``path`` is ``top`` itself or lies below ``top``."""
    # A plain ``startswith`` would also accept siblings that merely share a
    # name prefix (e.g. "/a/foobar" for top "/a/foo"), hence the separator.
    return path == top or path.startswith(top.rstrip(os.sep) + os.sep)


def compile_file_list(caosdbignore, localpath):
    """Create a list of all files under localpath that are not ignored.

    The tree below ``localpath`` is walked top-down. The rules of
    ``caosdbignore`` apply everywhere. If a directory contains a file named
    ``.caosdbignore``, its content is appended to the rules that are currently
    in effect and the combined rules apply to that directory and its subtree.

    Parameters
    ----------
    caosdbignore : str
        Path of the global ignore file (gitignore syntax).
    localpath : str
        Root directory of the tree that is searched.

    Returns
    -------
    list of str
        Paths (``os.path.join`` of directory and file name) of all files that
        are not matched by the ignore rules in effect for their directory.
    """
    from gitignore_parser import parse_gitignore

    non_ignored_files = []
    # Stack of (directory, path of combined temporary ignore file); the last
    # entry holds the rules that are in effect for the current directory.
    ignore_files = []

    try:
        for root, _dirs, files in os.walk(localpath):
            # remove local ignore files that no longer apply to the current
            # subtree (branch switch)
            while ignore_files and not _is_in_subtree(root, ignore_files[-1][0]):
                os.remove(ignore_files.pop()[1])

            # use the global one if there are no more local ones
            current_ignore = ignore_files[-1][1] if ignore_files else caosdbignore

            # create a new local ignore file
            if ".caosdbignore" in files:
                current_ignore = combine_ignore_files(
                    current_ignore,
                    os.path.join(root, ".caosdbignore"),
                    # due to the logic of gitignore_parser the file
                    # has to be written to this folder
                    dirname=root)
                ignore_files.append((root, current_ignore))

            matches = parse_gitignore(current_ignore)

            # actually append files that are not ignored
            for fi in files:
                fullpath = os.path.join(root, fi)
                if not matches(fullpath):
                    non_ignored_files.append(fullpath)
    finally:
        # Temporary ignore files that are still on the stack when the walk
        # ends (or fails) would otherwise be left behind in the data tree.
        for _root, tmp_ignore in ignore_files:
            try:
                os.remove(tmp_ignore)
            except FileNotFoundError:
                pass

    return non_ignored_files
def create_re_for_file_list(files, localroot, remoteroot):
    """Create a regular expression matching the given files and their parents.

    For every path in ``files`` the prefix ``localroot`` is replaced by
    ``remoteroot``. The resulting expression matches each of these remote
    paths completely, as well as every parent directory of them down to (and
    including) ``remoteroot``.

    Parameters
    ----------
    files : iterable of str
        Local file paths, expected to start with ``localroot``.
    localroot : str
        Local prefix that is stripped from the paths. A trailing slash is
        ignored.
    remoteroot : str
        Prefix that replaces ``localroot`` in the expression. It is escaped
        and used literally.

    Returns
    -------
    str
        An anchored regular expression (``^(...)$``) with one alternative per
        entry of ``files``.
    """
    # os.path.dirname never yields a trailing slash (except for "/"), so a
    # localroot like "/data/" could otherwise never terminate the loop below.
    if localroot and len(localroot) > 1:
        localroot = localroot.rstrip("/") or "/"

    alternatives = []
    for fi in files:
        path = fi
        reg = ""
        # Walk from the file up to localroot and nest one optional group per
        # path component, so that every parent directory matches as well.
        while path != localroot and path != "/" and path != "":
            reg = "(/" + re.escape(os.path.basename(path)) + reg + ")?"
            path = os.path.dirname(path)
        alternatives.append(re.escape(remoteroot) + reg)

    return "^(" + "|".join(alternatives) + ")$"
def loadpath(path, include, exclude, prefix, dryrun, forceAllowSymlinks, caosdbignore=None,
localpath=None):
if caosdbignore:
# create list of files and create regular expression for small chunks
filelist = compile_file_list(caosdbignore, localpath)
fulllist = filelist
index = 0
step_size = 3
includes = []
while index < len(fulllist):
subset = fulllist[index:min(index+step_size, len(fulllist))]
includes.append(create_re_for_file_list(subset, localpath, path))
index += step_size
else:
includes = [include]
# if no caosdbignore file is used, this iterates over a single include
for include in includes:
if dryrun:
logger.info("Performin a dryrun!")
files = db.Container().retrieve(
......@@ -75,6 +175,8 @@ def loadpath(path, include, exclude, prefix, dryrun, forceAllowSymlinks):
for f in files:
totalsize += f.size
print("Made in total {} new files with a combined size of {} "
"accessible.".format(len(files), convert_size(totalsize)))
logger.info("Made in total {} new files with a combined size of {} "
"accessible.".format(len(files), convert_size(totalsize)))
......@@ -91,6 +193,18 @@ def main(argv=None):
# Setup argument parser
parser = ArgumentParser()
parser.add_argument("-p", "--prefix", dest="prefix",
help="store files with this prefix into the server's"
" file system.")
parser.add_argument("-c", "--caosdbignore", help="""
Path to a caosdbignore file that defines which files shall be included and which do not.
The syntax is the same as in a gitignore file. You must also provide the localpath option
since the check is done locally.
"""
)
parser.add_argument("-l", "--localpath", help="Path to the root directory on this machine. "
"This is needed if a caosdbignore file is used since the check is done "
"locally")
parser.add_argument("-i", "--include", dest="include",
help="""
only include paths matching this regex pattern.
......@@ -104,9 +218,6 @@ exclude is given preference over include.
parser.add_argument("-e", "--exclude", dest="exclude",
help="exclude paths matching this regex pattern.",
metavar="RE")
parser.add_argument("-p", "--prefix", dest="prefix",
help="store files with this prefix into the server's"
" file system.")
parser.add_argument("-d", "--dry-run", dest="dryrun", action="store_true",
help="Just simulate the insertion of the files.")
parser.add_argument('-t', '--timeout', dest="timeout",
......@@ -127,6 +238,17 @@ exclude is given preference over include.
"directory tree.", action="store_true")
args = parser.parse_args()
if args.caosdbignore and (args.exclude or args.include):
raise ValueError(
"Do not use a caosdbignore file and in- or exclude simultaneously!")
if args.caosdbignore and not args.localpath:
raise ValueError("To use caosdbignore you must supply a local path!")
if args.localpath and (args.exclude or args.include):
raise ValueError(
"Do not use a localpath and in- or exclude simultaneously!")
con = db.get_connection()
con.timeout = float(args.timeout)
con._login()
......@@ -137,8 +259,9 @@ exclude is given preference over include.
exclude=args.exclude,
prefix=args.prefix,
dryrun=args.dryrun,
forceAllowSymlinks=args.forceAllowSymlinks,
caosdbignore=args.caosdbignore,
localpath=args.localpath,
)
return 0
......
......@@ -121,6 +121,14 @@ as seen by the CaosDB server (The actual path may vary. This is the used
in the LinkAhead distribution of CaosDB). In this case the root file
system as seen from within the CaosDB docker process is used.
You can provide a ``.caosdbignore`` file as a commandline option to the above
loadFiles command. The syntax of that file is the same as for `gitignore
<https://git-scm.com/docs/gitignore>`_ files. Note that you can have additional
``.caosdbignore`` files at lower levels which are appended to the current ignore
file and only affect the respective subtree.
Extending the Crawlers
======================
......
......@@ -7,6 +7,7 @@ deps=nose
git+https://gitlab.indiscale.com/caosdb/src/caosdb-pylib.git@dev
pytest
pytest-cov
gitignore-parser
openpyxl
xlrd == 1.2
h5py
......
README.md
.caosdbignore*
!README.md
#!/usr/bin/env python3
# encoding: utf-8
#
# ** header v3.0
# This file is a part of the CaosDB Project.
#
# Copyright (C) 2022 Indiscale GmbH <info@indiscale.com>
# Copyright (C) 2022 Henrik tom Wörden <h.tomwoerden@indiscale.com>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
#
# ** end header
#
import os
import re
from tempfile import NamedTemporaryFile
import unittest
from caosadvancedtools.loadFiles import compile_file_list, create_re_for_file_list, combine_ignore_files
BASEDIR = os.path.dirname(os.path.realpath(__file__))
class Caosdbignore(unittest.TestCase):
    """Unit tests for the caosdbignore helpers of ``loadFiles``."""

    def setUp(self):
        pass

    def test_compile(self):
        """Only the files not excluded by the example ignore file are listed."""
        files = compile_file_list(os.path.join(BASEDIR, "caosdbignore-example"),
                                  os.path.join(BASEDIR, "data"))
        assert len(files) == 3
        assert os.path.join(BASEDIR, "data", "datatypes.xlsx") in files
        assert os.path.join(BASEDIR, "data", "README.xlsx") in files
        assert os.path.join(BASEDIR, "data", "Publications/Posters/2019-02-03_something/README.md") in files

    def test_regex(self):
        """The regex matches the files and their parents, special chars escaped."""
        files = [r"/dies/ist/simple", r"/dies/eh(er)/nich?t"]
        regex = create_re_for_file_list(files, "/dies", "/dies")
        assert re.match(regex, files[0]) is not None
        assert re.match(regex, files[1]) is not None
        assert re.match(regex, "/dies/ist") is not None
        assert re.match(regex, "/die") is None
        assert re.match(regex, files[0]+files[1]) is None
        assert re.match(regex, "d") is None

    def test_combine(self):
        """The combined file holds the content of both inputs, in order."""
        fi1 = NamedTemporaryFile(delete=False, mode="w")
        # delete=False keeps the files on disk; remove them after the test
        self.addCleanup(os.remove, fi1.name)
        fi1.write("ha")
        fi1.close()
        fi2 = NamedTemporaryFile(delete=False, mode="w")
        self.addCleanup(os.remove, fi2.name)
        fi2.write("ha")
        fi2.close()
        fi_new = combine_ignore_files(fi1.name, fi2.name)
        self.addCleanup(os.remove, fi_new)
        with open(fi_new, "r") as fi:
            assert "haha" == fi.read()
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment