Commit 479e5545 authored by Florian Spreckelsen

Merge branch 'release-0.6' into 'main'

FIX: if multiple updates for one entity exist, the retrieve would result in an...

See merge request !59
parents a176344d ad1fde54
Tags v0.6.0
Pipeline #29213 passed
Showing with 532 additions and 124 deletions
......@@ -25,7 +25,7 @@ ADD https://gitlab.com/api/v4/projects/13656973/repository/branches/dev \
RUN git clone https://gitlab.com/caosdb/caosdb-pylib.git && \
cd caosdb-pylib && git checkout dev && pip3 install .
# At least recommonmark 0.6 required.
RUN pip3 install -U html2text pycodestyle pylint recommonmark sphinx-rtd-theme
RUN pip3 install -U html2text pycodestyle pylint recommonmark sphinx-rtd-theme gitignore-parser
COPY . /git
RUN rm -r /git/.git \
&& mv /git/.docker/pycaosdb.ini /git/integrationtests
......
......@@ -129,6 +129,7 @@ unittest:
image: $CI_REGISTRY_IMAGE
needs: [build-testenv]
script:
- python3 -c "import caosdb; print('CaosDB Version:', caosdb.__version__)"
- tox
# Build the sphinx documentation and make it ready for deployment by Gitlab Pages
......
......@@ -4,12 +4,24 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
## [0.6.0] - 2022-10-11 ##
(Florian Spreckelsen)
### Added ###
- Unauthorized inserts can now be cached. Note that the Crawler itself cannot
yet postpone inserts, but the Cache now provides this functionality.
- caosdbignore: You can add one or more `.caosdbignore` files to the directory
structure that you want to make available in CaosDB and then run loadFiles
(see the example below). The syntax is that of `.gitignore` files. For more
information see the `loadFiles` section of the Crawler in the documentation.
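For example (the invocation below is taken from the integration tests added in
this merge request; adjust the paths to your own setup):

```sh
python3 -m caosadvancedtools.loadFiles -c caosdbignore -l $PWD/extroot/Software /opt/caosdb/mnt/extroot/Software
```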
## [0.5.0] - 2022-09-05 ##
(Florian Spreckelsen)
### Added ###
- You can now use `python -m caosadvancedtools.models.parser model_file` to
- You can now use `python -m caosadvancedtools.models.parser model_file` to
parse and potentially synchronize data models.
### Deprecated ###
......
......@@ -11,7 +11,7 @@ git clone 'https://gitlab.com/caosdb/caosdb-advanced-user-tools'
Dependencies will be installed automatically if you use the below described
procedure.
- `caosdb>=0.6.0`
- `openpyxl>=3.0.0`
- `openpyxl>=3.0.7`
- `xlrd>=1.2.0`
- `pandas>=1.2.0`
- `numpy>=1.17.3`
......
......@@ -41,3 +41,7 @@ guidelines of the CaosDB Project
11. After the merge of main to dev, start a new development version by
setting `ISRELEASED` to `False` and by increasing at least the `MICRO`
version in [setup.py](./setup.py) and preparing CHANGELOG.md.
12. Create releases on gitlab.com and gitlab.indiscale.com that contain (at
least) the most recent section of the CHANGELOG as the description and link
to the PyPi package.
lol
~README.md
stuff
lol
......@@ -5,7 +5,7 @@ python3 -m caosadvancedtools.loadFiles /opt/caosdb/mnt/extroot/ExperimentalData
python3 -m caosadvancedtools.loadFiles /opt/caosdb/mnt/extroot/DataAnalysis
python3 -m caosadvancedtools.loadFiles /opt/caosdb/mnt/extroot/SimulationData
python3 -m caosadvancedtools.loadFiles /opt/caosdb/mnt/extroot/Publications
python3 -m caosadvancedtools.loadFiles /opt/caosdb/mnt/extroot/Software
python3 -m caosadvancedtools.loadFiles -c caosdbignore -l $PWD/extroot/Software /opt/caosdb/mnt/extroot/Software
python3 insert_model.py
python3 insert_some.py
python3 crawl.py /
......@@ -16,7 +16,8 @@ fi
OUT=/tmp/crawler.output
ls
cat pycaosdb.ini
rm -rf cache.db
python3 -c "import caosdb; print('CaosDB Version:', caosdb.__version__)"
rm -rf /tmp/caosdb_identifiable_cache.db
set -e
echo "Clearing database"
python3 clear_database.py
......
......@@ -33,6 +33,15 @@ def get_entity_with_id(eid):
return db.execute_query("FIND "+str(eid), unique=True)
class LoadFilesTest(unittest.TestCase):
def test_lol(self):
# check that the ignored files were not inserted
res = db.execute_query("FIND FILE WHICH IS STORED AT '**/lol'")
assert len(res) == 0
res = db.execute_query("FIND FILE WHICH IS STORED AT '**/~README.md'")
assert len(res) == 0
class CrawlerTest(unittest.TestCase):
def test_experiment(self):
......
......@@ -46,7 +46,7 @@ from setuptools import find_packages, setup
########################################################################
MAJOR = 0
MINOR = 5
MINOR = 6
MICRO = 0
PRE = "" # e.g. rc0, alpha.1, 0.beta-23
ISRELEASED = True
......@@ -157,11 +157,12 @@ def setup_package():
install_requires=["caosdb>=0.7.0",
"jsonschema>=4.4.0",
"numpy>=1.17.3",
"openpyxl>=3.0.0",
"openpyxl>=3.0.7",
"pandas>=1.2.0",
"xlrd>=2.0",
],
extras_require={"h5-crawler": ["h5py>=3.3.0", ],
"gitignore-parser ": ["gitignore-parser >=0.1.0", ],
},
packages=find_packages('src'),
package_dir={'': 'src'},
......
......@@ -23,11 +23,14 @@
#
# ** end header
# TODO this is implementing a cache on client side. Should it be on
# server side?
# Note: This is implementing a cache on client side. It would be great if the server would provide
# something to replace this.
import os
import sqlite3
from copy import deepcopy
from abc import ABC, abstractmethod
from hashlib import sha256
import warnings
import caosdb as db
from lxml import etree
......@@ -64,59 +67,181 @@ def get_pretty_xml(cont):
return etree.tounicode(xml, pretty_print=True)
class Cache(object):
"""
stores identifiables (as a hash of xml) and their respective ID.
class AbstractCache(ABC):
def __init__(self, db_file=None, force_creation=False):
"""
db_file: The path of the database file.
This allows retrieving the Record corresponding to an identifiable
without querying.
"""
If force_creation is set to True, the file will be created
regardless of whether a file already exists at the same path.
"""
def __init__(self, db_file=None):
if db_file is None:
self.db_file = "cache.db"
tmppath = tempfile.gettempdir()
self.db_file = os.path.join(tmppath, self.get_default_file_name())
else:
self.db_file = db_file
if not os.path.exists(self.db_file):
if not os.path.exists(self.db_file) or force_creation:
self.create_cache()
else:
self.check_cache()
@abstractmethod
def get_cache_schema_version(self):
"""
A method that has to be overloaded to return the version of the
SQLite database schema. The version is stored in the column 'schema' of the
table 'version'. Increase this number whenever the cache tables change.
"""
pass
@abstractmethod
def create_cache(self):
"""
Provide an overloaded function here that creates the cache in
the most recent version.
"""
pass
@abstractmethod
def get_default_file_name(self):
"""
Supply a default file name for the cache here.
"""
pass
def check_cache(self):
"""
Check whether the cache in db file self.db_file exists and conforms
to the latest database schema.
If it does not exist, it will be created using the newest database schema.
If it exists, but the schema is outdated, an exception will be raised.
"""
try:
current_schema = self.get_cache_version()
except sqlite3.OperationalError:
current_schema = 1
if current_schema > self.get_cache_schema_version():
raise RuntimeError(
"Cache is corrupt or was created with a future version of this program.")
elif current_schema < self.get_cache_schema_version():
raise RuntimeError("Cache version too old. Please remove the current cache file:\n"
+ self.db_file)
def get_cache_version(self):
"""
Return the version of the cache stored in self.db_file.
The version is stored as the only entry in the column 'schema' of the table 'version'.
"""
try:
conn = sqlite3.connect(self.db_file)
c = conn.cursor()
c.execute("SELECT schema FROM version")
version_row = c.fetchall()
if len(version_row) != 1:
raise RuntimeError("Cache version table broken.")
return version_row[0][0]
finally:
conn.close()
def run_sql_commands(self, commands, fetchall=False):
"""
Run a list of SQL commands on self.db_file.
commands: list of sql commands (tuples) to execute
fetchall: When True, run fetchall as last command and return the results.
Otherwise nothing is returned.
"""
conn = sqlite3.connect(self.db_file)
c = conn.cursor()
c.execute(
'''CREATE TABLE identifiables (digest text primary key, caosdb_id integer)''')
for sql in commands:
c.execute(*sql)
if fetchall:
results = c.fetchall()
conn.commit()
conn.close()
if fetchall:
return results
class IdentifiableCache(AbstractCache):
"""
stores identifiables (as a hash of xml) and their respective ID.
This allows retrieving the Record corresponding to an identifiable
without querying.
"""
def get_cache_schema_version(self):
return 2
def get_default_file_name(self):
return "caosdb_identifiable_cache.db"
def __init__(self, db_file=None, force_creation=False):
super().__init__(db_file, force_creation)
def create_cache(self):
"""
Create a new SQLITE cache file in self.db_file.
Two tables will be created:
- identifiables is the actual cache.
- version is a table with version information about the cache.
"""
self.run_sql_commands([
('''CREATE TABLE identifiables (digest TEXT PRIMARY KEY, caosdb_id INTEGER, caosdb_version TEXT)''',),
('''CREATE TABLE version (schema INTEGER)''',),
("INSERT INTO version VALUES (?)", (self.get_cache_schema_version(),))])
@staticmethod
def hash_entity(ent):
xml = get_pretty_xml(ent)
"""
Format an entity as "pretty" XML and return the SHA256 hash.
"""
xml = get_pretty_xml(deepcopy(ent))
digest = sha256(xml.encode("utf-8")).hexdigest()
return digest
def insert(self, ent_hash, ent_id):
conn = sqlite3.connect(self.db_file)
c = conn.cursor()
c.execute('''INSERT INTO identifiables VALUES (?, ?)''',
(ent_hash, ent_id))
conn.commit()
conn.close()
def insert(self, ent_hash, ent_id, ent_version):
"""
Insert a new cache entry.
ent_hash: Hash of the entity. Should be generated with Cache.hash_entity
ent_id: ID of the entity
ent_version: Version string of the entity
"""
self.run_sql_commands([
('''INSERT INTO identifiables VALUES (?, ?, ?)''',
(ent_hash, ent_id, ent_version))])
def check_existing(self, ent_hash):
conn = sqlite3.connect(self.db_file)
c = conn.cursor()
c.execute('''Select * FROM identifiables WHERE digest=?''',
(ent_hash,))
res = c.fetchone()
conn.commit()
conn.close()
"""
Check the cache for a hash.
ent_hash: The hash to search for.
Return the ID and the version ID of the hashed entity.
Return None if no entity with that hash is in the cache.
"""
res = self.run_sql_commands([('''Select * FROM identifiables WHERE digest=?''',
(ent_hash,))], True)
if res is None:
return res
if len(res) == 0:
return None
else:
return res[1]
return res[0][1:]
def update_ids_from_cache(self, entities):
""" sets ids of those entities that are in cache
......@@ -131,7 +256,7 @@ class Cache(object):
eid = self.check_existing(ehash)
if eid is not None:
ent.id = eid
ent.id = eid[0]
return hashes
......@@ -141,25 +266,75 @@ class Cache(object):
The hashes must correspond to the entities in the list
"""
# Check whether all entities have IDs and versions:
for ent in entities:
if ent.id is None:
raise RuntimeError("Entity has no ID.")
if ent.version is None or ent.version.id is None:
raise RuntimeError("Entity has no version ID.")
for ehash, ent in zip(hashes, entities):
if self.check_existing(ehash) is None:
self.insert(ehash, ent.id)
self.insert(ehash, ent.id, ent.version.id)
def validate_cache(self, entities=None):
"""
Runs through all entities stored in the cache and checks
whether the version still matches the most recent version.
Non-matching entities will be removed from the cache.
entities: When set to a db.Container or a list of Entities, the IDs from
the cache are not retrieved from the CaosDB server; instead, the versions
stored in the cache are checked against the versions contained in that
collection. Only cache entries that have a corresponding entity in the
collection are checked; all others are ignored. Useful for testing.
Return a list of invalidated entries or an empty list if no elements have been invalidated.
"""
res = self.run_sql_commands([(
"SELECT caosdb_id, caosdb_version FROM identifiables", ())], True)
class UpdateCache(Cache):
if entities is None:
# TODO this might become a problem. If many entities are cached,
# then all of them are retrieved here...
ids = [c_id for c_id, _ in res]
ids = set(ids)
entities = db.Container()
entities.extend([db.Entity(id=c_id) for c_id in ids])
entities.retrieve()
v = {c_id: c_version for c_id, c_version in res}
invalidate_list = []
for ent in entities:
if ent.version.id != v[ent.id]:
invalidate_list.append(ent.id)
self.run_sql_commands([(
"DELETE FROM identifiables WHERE caosdb_id IN ({})".format(
", ".join([str(caosdb_id) for caosdb_id in invalidate_list])), ())])
return invalidate_list
class UpdateCache(AbstractCache):
"""
stores unauthorized updates
stores unauthorized inserts and updates
If the Guard is set to a mode that does not allow an update, the update can
be stored in this cache such that it can be authorized and done later.
If the Guard is set to a mode that does not allow an insert or update, the insert or update can
be stored in this cache such that it can be authorized and performed later.
"""
def __init__(self, db_file=None):
if db_file is None:
tmppath = tempfile.gettempdir()
tmpf = os.path.join(tmppath, "crawler_update_cache.db")
db_file = tmpf
super().__init__(db_file=db_file)
def get_cache_schema_version(self):
return 3
def get_default_file_name(self):
return "/tmp/crawler_update_cache.db"
@staticmethod
def get_previous_version(cont):
......@@ -174,41 +349,65 @@ class UpdateCache(Cache):
return old_ones
def insert(self, cont, run_id):
"""Insert a pending, unauthorized update
def insert(self, cont, run_id, insert=False):
"""Insert a pending, unauthorized insert or update
Parameters
----------
cont: Container with the records to be updated containing the desired
cont: Container with the records to be inserted or updated containing the desired
version, i.e. the state after the update.
run_id: int
The id of the crawler run
insert: bool
Whether the entities in the container shall be inserted or updated.
"""
cont = put_in_container(cont)
old_ones = UpdateCache.get_previous_version(cont)
if insert:
old_ones = ""
else:
old_ones = UpdateCache.get_previous_version(cont)
new_ones = cont
old_hash = Cache.hash_entity(old_ones)
if insert:
old_hash = ""
else:
old_hash = Cache.hash_entity(old_ones)
new_hash = Cache.hash_entity(new_ones)
conn = sqlite3.connect(self.db_file)
c = conn.cursor()
c.execute('''INSERT INTO updates VALUES (?, ?, ?, ?, ?)''',
(old_hash, new_hash, str(old_ones), str(new_ones),
str(run_id)))
conn.commit()
conn.close()
self.run_sql_commands([('''INSERT INTO updates VALUES (?, ?, ?, ?, ?)''',
(old_hash, new_hash, str(old_ones), str(new_ones),
str(run_id)))])
def create_cache(self):
""" initialize the cache """
conn = sqlite3.connect(self.db_file)
c = conn.cursor()
c.execute('''CREATE TABLE updates (olddigest text, newdigest text,
oldrep text, newrep text, run_id text,
primary key (olddigest, newdigest, run_id))''')
conn.commit()
conn.close()
self.run_sql_commands([
('''CREATE TABLE updates (olddigest TEXT, newdigest TEXT, oldrep TEXT,
newrep TEXT, run_id TEXT, primary key (olddigest, newdigest, run_id))''',),
('''CREATE TABLE version (schema INTEGER)''',),
("INSERT INTO version VALUES (?)", (self.get_cache_schema_version(),))])
def get(self, run_id, querystring):
""" returns the pending updates for a given run id
Parameters:
-----------
run_id: the id of the crawler run
querystring: the sql query
"""
return self.run_sql_commands([(querystring, (str(run_id),))], fetchall=True)
def get_inserts(self, run_id):
""" returns the pending updates for a given run id
Parameters:
-----------
run_id: the id of the crawler run
"""
return self.get(run_id, '''Select * FROM updates WHERE olddigest='' AND run_id=?''')
def get_updates(self, run_id):
""" returns the pending updates for a given run id
......@@ -218,12 +417,10 @@ class UpdateCache(Cache):
run_id: the id of the crawler run
"""
conn = sqlite3.connect(self.db_file)
c = conn.cursor()
c.execute('''Select * FROM updates WHERE run_id=?''',
(str(run_id),))
res = c.fetchall()
conn.commit()
conn.close()
return self.get(run_id, '''Select * FROM updates WHERE olddigest!='' AND run_id=?''')
return res
class Cache(IdentifiableCache):
def __init__(self, *args, **kwargs):
warnings.warn(DeprecationWarning("This class is deprecated. Please use IdentifiableCache."))
super().__init__(*args, **kwargs)
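A minimal usage sketch of the reworked cache API (a sketch only: the query
string, the record and the run id are illustrative assumptions, not part of
this merge request; a running CaosDB instance with a record carrying version
information is assumed):

```python
import caosdb as db
from caosadvancedtools.cache import IdentifiableCache, UpdateCache

# Identifiable cache: maps a hash of an entity's XML to its ID and version.
cache = IdentifiableCache()   # default file is created in the system temp directory
rec = db.execute_query("FIND Record Experiment", unique=True)  # hypothetical query
digest = IdentifiableCache.hash_entity(rec)
if cache.check_existing(digest) is None:
    # remember ID and version ID so later runs can skip the retrieval
    cache.insert(digest, rec.id, rec.version.id)
# invalidate entries whose version has changed on the server in the meantime
cache.validate_cache()

# Update cache: store unauthorized inserts/updates for later authorization.
update_cache = UpdateCache()
cont = db.Container()
cont.append(rec)
update_cache.insert(cont, run_id="1234")            # a pending update
pending_updates = update_cache.get_updates("1234")
pending_inserts = update_cache.get_inserts("1234")  # empty here
```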
......@@ -50,7 +50,7 @@ from sqlite3 import IntegrityError
import caosdb as db
from caosdb.exceptions import BadQueryError
from .cache import Cache, UpdateCache, get_pretty_xml
from .cache import IdentifiableCache, UpdateCache, get_pretty_xml
from .cfood import RowCFood, add_files, get_ids_for_entities_with_names
from .datainconsistency import DataInconsistencyError
from .datamodel_problems import DataModelProblems
......@@ -190,7 +190,8 @@ class Crawler(object):
self.filterKnown.reset(cat)
if self.use_cache:
self.cache = Cache(db_file=cache_file)
self.cache = IdentifiableCache(db_file=cache_file)
self.cache.validate_cache()
def iteritems(self):
""" generates items to be crawled with an index"""
......@@ -208,28 +209,70 @@ class Crawler(object):
run_id: the id of the crawler run
"""
cache = UpdateCache()
inserts = cache.get_inserts(run_id)
all_inserts = 0
all_updates = 0
for _, _, _, new, _ in inserts:
new_cont = db.Container()
new_cont = new_cont.from_xml(new)
new_cont.insert(unique=False)
logger.info("Successfully inserted {} records!".format(len(new_cont)))
all_inserts += len(new_cont)
logger.info("Finished with authorized updates.")
changes = cache.get_updates(run_id)
for _, _, old, new, _ in changes:
current = db.Container()
new_cont = db.Container()
new_cont = new_cont.from_xml(new)
ids = []
tmp = []
update_incomplete = False
# remove duplicate entities
for el in new_cont:
if el.id not in ids:
ids.append(el.id)
tmp.append(el)
else:
update_incomplete = True
new_cont = tmp
if new_cont[0].version:
valids = db.Container()
nonvalids = db.Container()
for ent in new_cont:
remote_ent = db.Entity(id=ent.id).retrieve()
if ent.version == remote_ent.version:
valids.append(remote_ent)
else:
update_incomplete = True
nonvalids.append(remote_ent)
valids.update(unique=False)
logger.info("Successfully updated {} records!".format(
len(valids)))
logger.info("{} Records were not updated because the version in the server "
"changed!".format(len(nonvalids)))
all_updates += len(valids)
else:
current = db.Container()
for ent in new_cont:
current.append(db.execute_query("FIND {}".format(ent.id),
unique=True))
current_xml = get_pretty_xml(current)
for ent in new_cont:
current.append(db.Entity(id=ent.id).retrieve())
current_xml = get_pretty_xml(current)
# check whether previous version equals current version
# if not, the update must not be done
# check whether previous version equals current version
# if not, the update must not be done
if current_xml != old:
continue
if current_xml != old:
continue
new_cont.update(unique=False)
logger.info("Successfully updated {} records!".format(
len(new_cont)))
new_cont.update(unique=False)
logger.info("Successfully updated {} records!".format(
len(new_cont)))
all_updates += len(new_cont)
logger.info("Some updates could not be applied. Crawler has to rerun.")
logger.info("Finished with authorized updates.")
return all_inserts, all_updates
def collect_cfoods(self):
"""
......
......@@ -25,10 +25,14 @@
import argparse
import logging
import os
import math
import sys
import re
from argparse import ArgumentParser
from tempfile import NamedTemporaryFile
import shutil
import caosdb as db
logger = logging.getLogger(__name__)
......@@ -46,37 +50,135 @@ def convert_size(size):
return '%s %s' % (s, size_name[i])
def loadpath(path, include, exclude, prefix, dryrun, forceAllowSymlinks):
def combine_ignore_files(caosdbignore, localignore, dirname=None):
"""appends the contents of localignore to caosdbignore and saves the result
and returns the name
if dryrun:
logger.info("Performin a dryrun!")
files = db.Container().retrieve(
unique=False,
raise_exception_on_error=True,
flags={"InsertFilesInDir": ("-p " + prefix + " " if prefix else "")
+ ("-e " + exclude + " " if exclude else "")
+ ("-i " + include + " " if include else "")
+ ("--force-allow-symlinks " if forceAllowSymlinks else "")
+ path})
"""
tmp = NamedTemporaryFile(delete=False, mode="w",
dir=dirname, prefix=".caosdbignore")
with open(caosdbignore, "r") as base:
tmp.write(base.read())
with open(localignore, "r") as local:
tmp.write(local.read())
tmp.close()
return tmp.name
def compile_file_list(caosdbignore, localpath):
"""creates a list of files that contain all files under localpath except
those excluded by caosdbignore
"""
from gitignore_parser import parse_gitignore
matches = parse_gitignore(caosdbignore)
current_ignore = caosdbignore
non_ignored_files = []
ignore_files = []
for root, dirs, files in os.walk(localpath):
# remove local ignore files that no longer apply to the current subtree (branch switch)
while len(ignore_files) > 0 and not root.startswith(ignore_files[-1][0]):
shutil.os.remove(ignore_files[-1][1])
ignore_files.pop()
# use the global one if there are no more local ones
if len(ignore_files) > 0:
current_ignore = ignore_files[-1][1]
matches = parse_gitignore(current_ignore)
else:
current_ignore = caosdbignore
matches = parse_gitignore(current_ignore)
# create a new local ignore file
if ".caosdbignore" in files:
current_ignore = combine_ignore_files(current_ignore,
os.path.join(
root, ".caosdbignore"),
# due to the logic of gitignore_parser the file
# has to be written to this folder
dirname=root)
ignore_files.append((root, current_ignore))
matches = parse_gitignore(current_ignore)
# actually append files that are not ignored
for fi in files:
fullpath = os.path.join(root, fi)
if not matches(fullpath):
non_ignored_files.append(fullpath)
return non_ignored_files
def create_re_for_file_list(files, localroot, remoteroot):
"""creates a regular expression that matches file paths contained in the
files argument and all parent directories. The prefix localroot is replaced
by the prefix remoteroot.
"""
regexp = ""
for fi in files:
path = fi
reg = ""
while path != localroot and path != "/" and path != "":
reg = "(/"+re.escape(os.path.basename(path)) + reg + ")?"
path = os.path.dirname(path)
regexp += "|" + re.escape(remoteroot) + reg
return "^("+regexp[1:]+")$"
def loadpath(path, include, exclude, prefix, dryrun, forceAllowSymlinks, caosdbignore=None,
localpath=None):
if caosdbignore:
# create list of files and create regular expression for small chunks
filelist = compile_file_list(caosdbignore, localpath)
fulllist = filelist
index = 0
step_size = 3
includes = []
while index < len(fulllist):
subset = fulllist[index:min(index+step_size, len(fulllist))]
includes.append(create_re_for_file_list(subset, localpath, path))
index += step_size
else:
# new files (inserting them using the insertFilesInDir feature of
# the server, which inserts files via symlinks)
files = db.Container().insert(
unique=False,
raise_exception_on_error=True,
flags={"InsertFilesInDir": ("-p " + prefix + " " if prefix else "")
+ ("-e " + exclude + " " if exclude else "")
+ ("-i " + include + " " if include else "")
+ ("--force-allow-symlinks " if forceAllowSymlinks else "")
+ path})
includes = [include]
totalsize = 0 # collecting total size of all new files
# if no caosdbignore file is used, this iterates over a single include
for include in includes:
if dryrun:
logger.info("Performin a dryrun!")
files = db.Container().retrieve(
unique=False,
raise_exception_on_error=True,
flags={"InsertFilesInDir": ("-p " + prefix + " " if prefix else "")
+ ("-e " + exclude + " " if exclude else "")
+ ("-i " + include + " " if include else "")
+ ("--force-allow-symlinks " if forceAllowSymlinks else "")
+ path})
else:
# new files (inserting them using the insertFilesInDir feature of
# the server, which inserts files via symlinks)
files = db.Container().insert(
unique=False,
raise_exception_on_error=True,
flags={"InsertFilesInDir": ("-p " + prefix + " " if prefix else "")
+ ("-e " + exclude + " " if exclude else "")
+ ("-i " + include + " " if include else "")
+ ("--force-allow-symlinks " if forceAllowSymlinks else "")
+ path})
for f in files:
totalsize += f.size
totalsize = 0 # collecting total size of all new files
logger.info("Made in total {} new files with a combined size of {} "
"accessible.".format(len(files), convert_size(totalsize)))
for f in files:
totalsize += f.size
print("Made in total {} new files with a combined size of {} "
"accessible.".format(len(files), convert_size(totalsize)))
logger.info("Made in total {} new files with a combined size of {} "
"accessible.".format(len(files), convert_size(totalsize)))
return
......@@ -91,6 +193,18 @@ def main(argv=None):
# Setup argument parser
parser = ArgumentParser()
parser.add_argument("-p", "--prefix", dest="prefix",
help="store files with this prefix into the server's"
" file system.")
parser.add_argument("-c", "--caosdbignore", help="""
Path to a caosdbignore file that defines which files shall be included and which shall not.
The syntax is the same as in a gitignore file. You must also provide the localpath option
since the check is done locally.
"""
)
parser.add_argument("-l", "--localpath", help="Path to the root directory on this machine. "
"This is needed if a caosdbignore file is used since the check is done "
"locally")
parser.add_argument("-i", "--include", dest="include",
help="""
only include paths matching this regex pattern.
......@@ -104,9 +218,6 @@ exclude is given preference over include.
parser.add_argument("-e", "--exclude", dest="exclude",
help="exclude paths matching this regex pattern.",
metavar="RE")
parser.add_argument("-p", "--prefix", dest="prefix",
help="store files with this prefix into the server's"
" file system.")
parser.add_argument("-d", "--dry-run", dest="dryrun", action="store_true",
help="Just simulate the insertion of the files.")
parser.add_argument('-t', '--timeout', dest="timeout",
......@@ -127,6 +238,17 @@ exclude is given preference over include.
"directory tree.", action="store_true")
args = parser.parse_args()
if args.caosdbignore and (args.exclude or args.include):
raise ValueError(
"Do not use a caosdbignore file and in- or exclude simultaneously!")
if args.caosdbignore and not args.localpath:
raise ValueError("To use caosdbignore you must supply a local path!")
if args.localpath and (args.exclude or args.include):
raise ValueError(
"Do not use a localpath and in- or exclude simultaneously!")
con = db.get_connection()
con.timeout = float(args.timeout)
con._login()
......@@ -137,8 +259,9 @@ exclude is given preference over include.
exclude=args.exclude,
prefix=args.prefix,
dryrun=args.dryrun,
forceAllowSymlinks=args.forceAllowSymlinks,
caosdbignore=args.caosdbignore,
localpath=args.localpath,
)
return 0
......
......@@ -27,9 +27,9 @@ copyright = '2021, IndiScale GmbH'
author = 'Daniel Hornung'
# The short X.Y version
version = '0.5.0'
version = '0.6.0'
# The full version, including alpha/beta/rc tags
release = '0.5.0'
release = '0.6.0'
# -- General configuration ---------------------------------------------------
......
......@@ -121,6 +121,14 @@ as seen by the CaosDB server (The actual path may vary. This is the used
in the LinkAhead distribution of CaosDB). In this case the root file
system as seen from within the CaosDB docker process is used.
You can provide a ``.caosdbignore`` file as a command line option to the above
loadFiles command (as illustrated below). The syntax of that file is the same as
for `gitignore <https://git-scm.com/docs/gitignore>`_ files. Note that you can
have additional ``.caosdbignore`` files at lower levels; these are appended to
the current ignore file and take effect for the respective subtree.
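For instance, a ``.caosdbignore`` along the following lines (a hypothetical
example; any gitignore pattern is allowed) hides editor backup files and a
``tmp`` subdirectory from ``loadFiles``::

   # backup files such as ~README.md
   ~*
   # scratch data that should not be inserted
   tmp/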
Extending the Crawlers
======================
......
......@@ -7,7 +7,8 @@ deps=nose
git+https://gitlab.indiscale.com/caosdb/src/caosdb-pylib.git@dev
pytest
pytest-cov
openpyxl
gitignore-parser
openpyxl >= 3.0.7
xlrd == 1.2
h5py
commands=py.test --cov=caosadvancedtools -vv {posargs}
......
README.md
.caosdbignore*
!README.md