Commit 9dafdd9f authored by Henrik tom Wörden

Merge branch 'f-unify-notifications' into 'dev'

ENH: Allow crawler_main to operate on a list of paths

See merge request !208
parents 493d1acc 02325fd4
Part of 2 merge requests: !217 (TST: Make NamedTemporaryFiles Windows-compatible) and !208 (ENH: Allow crawler_main to operate on a list of paths)
Pipeline #58960 passed
......@@ -23,6 +23,13 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
variables to `int`, `float`, `str` and `bool`.
- Transformer function definitions in the cfood now support variable
  substitutions.
- `crawler_main` and `scanner.scan_directory` now also accept a list of
  directories to be crawled (see the usage sketch below this excerpt).
  Note that passing a list of directories is currently incompatible with
  `securityMode=SecurityMode.RETRIEVE` or
  `securityMode=SecurityMode.INSERT`, since the functionality to
  authorize pending inserts or updates does not support path lists yet
  and will raise a `NotImplementedError` for now.
- `match_newer_than_file` option for `DirectoryConverter`: A reference
file containing (only) an ISO-formatted datetime string can be
specified here. Directories with this option won't match if all
......
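A minimal usage sketch of the new list support (not part of the merge request itself): the directory and file names below are placeholders, and the call mirrors the integration test shown further down.

from caoscrawler.crawl import crawler_main, SecurityMode

# Crawl two directories in a single run. According to the changelog entry
# above, SecurityMode.UPDATE is currently the only security mode that
# accepts a list of paths.
ret = crawler_main(
    ["/data/dir1", "/data/dir2"],  # list of paths instead of a single string
    cfood_file_name="cfood.yml",
    identifiables_definition_file="identifiable.yml",
    securityMode=SecurityMode.UPDATE,
)
assert ret == 0  # crawler_main returns 0 on success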
# This file is a part of the LinkAhead Project.
#
# Copyright (C) 2024 Indiscale GmbH <info@indiscale.com>
# 2024 Florian Spreckelsen <f.spreckelsen@indiscale.com>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
#
import logging
import tempfile
from pathlib import Path
import linkahead as db
from caoscrawler import crawl
from caoscrawler.crawl import (crawler_main, SecurityMode)
from linkahead.utils.register_tests import clear_database, set_test_key
set_test_key("10b128cf8a1372f30aa3697466bb55e76974e0c16a599bb44ace88f19c8f61e2")

INTTESTDIR = Path(__file__).parent


def test_list_of_paths(clear_database, monkeypatch):
    # Mock the status record
    dummy_status = {
        "n_calls": 0
    }

    def _mock_update_status_record(run_id, n_inserts, n_updates, status):
        print("Update mocked status")
        dummy_status["run_id"] = run_id
        dummy_status["n_inserts"] = n_inserts
        dummy_status["n_updates"] = n_updates
        dummy_status["status"] = status
        dummy_status["n_calls"] += 1
    monkeypatch.setattr(crawl, "_update_status_record", _mock_update_status_record)

    # mock SSS environment
    monkeypatch.setenv("SHARED_DIR", tempfile.gettempdir())

    # We need only one dummy RT
    rt = db.RecordType(name="TestType").insert()

    basepath = INTTESTDIR / "test_data" / "crawler_main_with_list_of_dirs"
    dirlist = [basepath / "dir1", basepath / "dir2"]
    crawler_main(
        dirlist,
        cfood_file_name=basepath / "cfood.yml",
        identifiables_definition_file=basepath / "identifiable.yml"
    )
    recs = db.execute_query("FIND TestType")
    assert len(recs) == 2
    assert "Test1" in [r.name for r in recs]
    assert "Test2" in [r.name for r in recs]

    assert dummy_status["n_inserts"] == 2
    assert dummy_status["n_updates"] == 0
    assert dummy_status["status"] == "OK"
    assert dummy_status["n_calls"] == 1


def test_not_implemented_list_with_authorization(caplog, clear_database):
    rt = db.RecordType(name="TestType").insert()

    basepath = INTTESTDIR / "test_data" / "crawler_main_with_list_of_dirs"
    dirlist = [basepath / "dir1", basepath / "dir2"]

    # This is not implemented yet, so check log for correct error.
    ret = crawler_main(
        dirlist,
        cfood_file_name=basepath / "cfood.yml",
        identifiables_definition_file=basepath / "identifiable.yml",
        securityMode=SecurityMode.RETRIEVE
    )
    # crawler_main hides the error, but has a non-zero return code and
    # errors in the log:
    assert ret != 0
    err_tuples = [t for t in caplog.record_tuples if t[1] == logging.ERROR]
    assert len(err_tuples) == 1
    assert "currently implemented only for single paths, not for lists of paths" in err_tuples[0][2]

    # No inserts after the errors
    assert len(db.execute_query("FIND TestType")) == 0
---
metadata:
  crawler-version: 0.10.2
---
BaseDirElement:
  type: Directory
  match: ^dir(?P<dir_number>[0-9]+)$$
  records:
    TestType:
      name: Test$dir_number
TestType:
  - name
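The two YAML snippets above are the cfood and the identifiable definition used by the tests: each directory matching the dir pattern produces one TestType record named via the Test$dir_number template, and TestType records are identified by their name. A sketch of the expected outcome for the test directories dir1 and dir2, written with the linkahead client:

import linkahead as db

# Records the crawler should create for dir1 and dir2 (sketch only):
expected = [
    db.Record(name="Test1").add_parent(name="TestType"),
    db.Record(name="Test2").add_parent(name="TestType"),
]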
# This file is a part of the CaosDB Project.
# This file is a part of the LinkAhead Project.
#
# Copyright (C) 2022 Indiscale GmbH <info@indiscale.com>
# 2022 Florian Spreckelsen <f.spreckelsen@indiscale.com>
......
......@@ -621,7 +621,7 @@ one with the entities that need to be updated and the other with entities to be
crawled_data: Optional[list[db.Record]] = None,
no_insert_RTs: Optional[list[str]] = None,
no_update_RTs: Optional[list[str]] = None,
path_for_authorized_run: Optional[str] = "",
path_for_authorized_run: Optional[Union[str, list[str]]] = "",
):
"""
This function applies several stages:
......@@ -643,7 +643,7 @@ one with the entities that need to be updated and the other with entities to be
no_update_RTs : list[str], optional
list of RecordType names. Records that have one of those RecordTypes
as parent will not be updated
path_for_authorized_run : str, optional
path_for_authorized_run : str or list[str], optional
only used if there are changes that need authorization before being
applied. The form for rerunning the crawler with the authorization
of these changes will be generated with this path. See
......@@ -661,6 +661,12 @@ one with the entities that need to be updated and the other with entities to be
"use for example the Scanner to create this data."))
crawled_data = self.crawled_data
if isinstance(path_for_authorized_run, list) and self.securityMode != SecurityMode.UPDATE:
raise NotImplementedError(
"Authorization of inserts and updates is currently implemented only "
"for single paths, not for lists of paths."
)
to_be_inserted, to_be_updated = self._split_into_inserts_and_updates(
SyncGraph(crawled_data, self.identifiableAdapter))
......@@ -1004,7 +1010,7 @@ def _store_dry_run_data(ins, upd):
"update": updates}))
def crawler_main(crawled_directory_path: str,
def crawler_main(crawled_directory_path: Union[str, list[str]],
cfood_file_name: str,
identifiables_definition_file: Optional[str] = None,
debug: bool = False,
......@@ -1022,8 +1028,8 @@ def crawler_main(crawled_directory_path: str,
Parameters
----------
crawled_directory_path : str
path to be crawled
crawled_directory_path : str or list[str]
path(s) to be crawled
cfood_file_name : str
filename of the cfood to be used
identifiables_definition_file : str
......@@ -1115,42 +1121,28 @@ def crawler_main(crawled_directory_path: str,
crawler.run_id)
_update_status_record(crawler.run_id, len(inserts), len(updates), status="OK")
return 0
except ForbiddenTransaction as err:
logger.debug(traceback.format_exc())
logger.error(err)
_update_status_record(crawler.run_id, 0, 0, status="FAILED")
return 1
except ConverterValidationError as err:
logger.debug(traceback.format_exc())
logger.error(err)
_update_status_record(crawler.run_id, 0, 0, status="FAILED")
return 1
except ImpossibleMergeError as err:
logger.debug(traceback.format_exc())
logger.error(
"Encountered conflicting information when creating Records from the crawled "
f"data:\n\n{err}"
)
_update_status_record(crawler.run_id, 0, 0, status="FAILED")
return 1
except TransactionError as err:
logger.debug(traceback.format_exc())
logger.error(err)
logger.error("Transaction error details:")
for suberr in err.errors:
logger.error("---")
logger.error(suberr.msg)
logger.error(suberr.entity)
return 1
except Exception as err:
logger.debug(traceback.format_exc())
logger.error(err)
if "SHARED_DIR" in os.environ:
# pylint: disable=E0601
domain = get_config_setting("public_host_url")
logger.error("Unexpected Error: Please tell your administrator about this and provide "
f"the following path.\n{get_shared_resource_link(domain, debuglog_public)}")
# Special treatment for known error types
if isinstance(err, ImpossibleMergeError):
logger.error(
"Encountered conflicting information when creating Records from the crawled "
f"data:\n\n{err}"
)
elif isinstance(err, TransactionError):
logger.error("Transaction error details:")
for suberr in err.errors:
logger.error("---")
logger.error(suberr.msg)
logger.error(suberr.entity)
# Unknown errors get a special message
elif not isinstance(err, (ConverterValidationError, ForbiddenTransaction)):
if "SHARED_DIR" in os.environ:
# pylint: disable=E0601
domain = get_config_setting("public_host_url")
logger.error("Unexpected Error: Please tell your administrator about this and provide "
f"the following path.\n{get_shared_resource_link(domain, debuglog_public)}")
_update_status_record(crawler.run_id, 0, 0, status="FAILED")
return 1
......@@ -1174,6 +1166,7 @@ def parse_args():
"This file will only be generated if this option is set.")
parser.add_argument("--debug", required=False, action="store_true",
help="Path name of the cfood yaml file to be used.")
# TODO allow to provide multiple directories to be crawled on the commandline
parser.add_argument("crawled_directory_path",
help="The subtree of files below the given path will "
"be considered. Use '/' for everything.")
......
......@@ -421,7 +421,7 @@ def scanner(items: list[StructureElement],
# --------------------------------------------------------------------------------
def scan_directory(dirname: str, crawler_definition_path: str,
def scan_directory(dirname: Union[str, list[str]], crawler_definition_path: str,
restricted_path: Optional[list[str]] = None,
debug_tree: Optional[DebugTree] = None):
""" Crawl a single directory.
......@@ -434,10 +434,12 @@ def scan_directory(dirname: str, crawler_definition_path: str,
Parameters
----------
dirname: str or list[str]
directory or list of directories to be scanned
restricted_path: optional, list of strings
Traverse the data tree only along the given path. When the end of the given path
is reached, traverse the full tree as normal. See docstring of 'scanner' for
more details.
Traverse the data tree only along the given path. When the end
of the given path is reached, traverse the full tree as
normal. See docstring of 'scanner' for more details.
Returns
-------
......@@ -455,26 +457,31 @@ def scan_directory(dirname: str, crawler_definition_path: str,
if not dirname:
raise ValueError(
"You have to provide a non-empty path for crawling.")
dir_structure_name = os.path.basename(dirname)
# TODO: needs to be covered somewhere else
crawled_directory = dirname
if not dir_structure_name and dirname.endswith('/'):
if dirname == '/':
# Crawling the entire file system
dir_structure_name = "root"
else:
# dirname had a trailing '/'
dir_structure_name = os.path.basename(dirname[:-1])
return scan_structure_elements(Directory(dir_structure_name,
dirname),
crawler_definition,
converter_registry,
restricted_path=restricted_path,
debug_tree=debug_tree,
registered_transformer_functions=registered_transformer_functions
)
if not isinstance(dirname, list):
dirname = [dirname]
dir_element_list = []
for dname in dirname:
dir_structure_name = os.path.basename(dname)
# TODO: needs to be covered somewhere else
crawled_directory = dname
if not dir_structure_name and dname.endswith(os.path.sep):
if dname == os.path.sep:
# Crawling the entire file system
dir_structure_name = "root"
else:
# dirname had a trailing '/'
dir_structure_name = os.path.basename(dname[:-1])
dir_element_list.append(Directory(dir_structure_name, dname))
return scan_structure_elements(
dir_element_list,
crawler_definition,
converter_registry,
restricted_path=restricted_path,
debug_tree=debug_tree,
registered_transformer_functions=registered_transformer_functions
)
def scan_structure_elements(items: Union[list[StructureElement], StructureElement],
......
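To round off the scanner changes, a short sketch of calling scan_directory with the new list support; the paths and the cfood file name are placeholders, and the exact shape of the return value is described in the (truncated) docstring above:

from caoscrawler.scanner import scan_directory

# Both calls are valid now: a single path or a list of paths.
data_single = scan_directory("/data/dir1", "cfood.yml")
data_many = scan_directory(["/data/dir1", "/data/dir2"], "cfood.yml")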