From 444f98310092c168ddd34c81d059c9274487d8f5 Mon Sep 17 00:00:00 2001
From: Florian Spreckelsen <f.spreckelsen@indiscale.com>
Date: Wed, 4 Dec 2024 14:11:43 +0100
Subject: [PATCH 1/8] MAINT: Reduce code duplication in error handling

---
 src/caoscrawler/crawl.py | 52 +++++++++++++++-------------------
 1 file changed, 19 insertions(+), 33 deletions(-)

diff --git a/src/caoscrawler/crawl.py b/src/caoscrawler/crawl.py
index a79e4434..8ca84502 100644
--- a/src/caoscrawler/crawl.py
+++ b/src/caoscrawler/crawl.py
@@ -1115,42 +1115,28 @@ def crawler_main(crawled_directory_path: str,
                               crawler.run_id)
         _update_status_record(crawler.run_id, len(inserts), len(updates), status="OK")
         return 0
-    except ForbiddenTransaction as err:
-        logger.debug(traceback.format_exc())
-        logger.error(err)
-        _update_status_record(crawler.run_id, 0, 0, status="FAILED")
-        return 1
-    except ConverterValidationError as err:
-        logger.debug(traceback.format_exc())
-        logger.error(err)
-        _update_status_record(crawler.run_id, 0, 0, status="FAILED")
-        return 1
-    except ImpossibleMergeError as err:
-        logger.debug(traceback.format_exc())
-        logger.error(
-            "Encountered conflicting information when creating Records from the crawled "
-            f"data:\n\n{err}"
-        )
-        _update_status_record(crawler.run_id, 0, 0, status="FAILED")
-        return 1
-    except TransactionError as err:
-        logger.debug(traceback.format_exc())
-        logger.error(err)
-        logger.error("Transaction error details:")
-        for suberr in err.errors:
-            logger.error("---")
-            logger.error(suberr.msg)
-            logger.error(suberr.entity)
-        return 1
     except Exception as err:
         logger.debug(traceback.format_exc())
         logger.error(err)
-
-        if "SHARED_DIR" in os.environ:
-            # pylint: disable=E0601
-            domain = get_config_setting("public_host_url")
-            logger.error("Unexpected Error: Please tell your administrator about this and provide "
-                         f"the following path.\n{get_shared_resource_link(domain, debuglog_public)}")
+        # Special treatment for known error types
+        if isinstance(err, ImpossibleMergeError):
+            logger.error(
+                "Encountered conflicting information when creating Records from the crawled "
+                f"data:\n\n{err}"
+            )
+        elif isinstance(err, TransactionError):
+            logger.error("Transaction error details:")
+            for suberr in err.errors:
+                logger.error("---")
+                logger.error(suberr.msg)
+                logger.error(suberr.entity)
+        # Unknown errors get a special message
+        elif not isinstance(err, (ConverterValidationError, ForbiddenTransaction)):
+            if "SHARED_DIR" in os.environ:
+                # pylint: disable=E0601
+                domain = get_config_setting("public_host_url")
+                logger.error("Unexpected Error: Please tell your administrator about this and provide "
+                             f"the following path.\n{get_shared_resource_link(domain, debuglog_public)}")
         _update_status_record(crawler.run_id, 0, 0, status="FAILED")
         return 1
-- 
GitLab

From 0cd3c0c17221791adf22493204c83e8bd42142a7 Mon Sep 17 00:00:00 2001
From: Florian Spreckelsen <f.spreckelsen@indiscale.com>
Date: Thu, 5 Dec 2024 17:26:04 +0100
Subject: [PATCH 2/8] ENH: Support list of directories for crawler_main and
 scan_directory

---
 src/caoscrawler/crawl.py   | 20 ++++++++++----
 src/caoscrawler/scanner.py | 55 +++++++++++++++++++++-----------------
 2 files changed, 46 insertions(+), 29 deletions(-)

diff --git a/src/caoscrawler/crawl.py b/src/caoscrawler/crawl.py
index 8ca84502..fd0beaa2 100644
--- a/src/caoscrawler/crawl.py
+++ b/src/caoscrawler/crawl.py
@@ -621,7 +621,7 @@ one with the entities that need to be updated and the other with entities to be
                  crawled_data: Optional[list[db.Record]] = None,
                  no_insert_RTs: Optional[list[str]] = None,
                  no_update_RTs: Optional[list[str]] = None,
-                 path_for_authorized_run: Optional[str] = "",
+                 path_for_authorized_run: Optional[Union[str, list[str]]] = "",
                  ):
         """
         This function applies several stages:
@@ -643,7 +643,7 @@ one with the entities that need to be updated and the other with entities to be
         no_update_RTs : list[str], optional
            list of RecordType names. Records that have one of those RecordTypes
            as parent will not be updated
-        path_for_authorized_run : str, optional
+        path_for_authorized_run : str or list[str], optional
            only used if there are changes that need authorization before being
            applied. The form for rerunning the crawler with the authorization
            of these changes will be generated with this path. See
@@ -718,11 +718,21 @@ one with the entities that need to be updated and the other with entities to be
             update_cache = UpdateCache()
             pending_inserts = update_cache.get_inserts(self.run_id)
             if pending_inserts:
+                if isinstance(path_for_authorized_run, list):
+                    raise NotImplementedError(
+                        "Authorization of inserts is currently implemented only for single paths, "
+                        "not for lists of paths."
+                    )
                 Crawler.inform_about_pending_changes(
                     pending_inserts, self.run_id, path_for_authorized_run)

             pending_updates = update_cache.get_updates(self.run_id)
             if pending_updates:
+                if isinstance(path_for_authorized_run, list):
+                    raise NotImplementedError(
+                        "Authorization of updates is currently implemented only for single paths, "
+                        "not for lists of paths."
+                    )
                 Crawler.inform_about_pending_changes(
                     pending_updates, self.run_id, path_for_authorized_run)

@@ -1004,7 +1014,7 @@ def _store_dry_run_data(ins, upd):
                                   "update": updates}))


-def crawler_main(crawled_directory_path: str,
+def crawler_main(crawled_directory_path: Union[str, list[str]],
                  cfood_file_name: str,
                  identifiables_definition_file: Optional[str] = None,
                  debug: bool = False,
@@ -1022,8 +1032,8 @@ def crawler_main(crawled_directory_path: Union[str, list[str]],

     Parameters
     ----------
-    crawled_directory_path : str
-        path to be crawled
+    crawled_directory_path : str or list[str]
+        path(s) to be crawled
     cfood_file_name : str
         filename of the cfood to be used
     identifiables_definition_file : str
diff --git a/src/caoscrawler/scanner.py b/src/caoscrawler/scanner.py
index 89bd1c04..6b4d7a12 100644
--- a/src/caoscrawler/scanner.py
+++ b/src/caoscrawler/scanner.py
@@ -421,7 +421,7 @@ def scanner(items: list[StructureElement],
 # --------------------------------------------------------------------------------


-def scan_directory(dirname: str, crawler_definition_path: str,
+def scan_directory(dirname: Union[str, list[str]], crawler_definition_path: str,
                    restricted_path: Optional[list[str]] = None,
                    debug_tree: Optional[DebugTree] = None):
     """ Crawl a single directory.
@@ -434,10 +434,12 @@ def scan_directory(dirname: Union[str, list[str]], crawler_definition_path: str,

     Parameters
     ----------
+    dirname: str or list[str]
+        directory or list of directories to be scanned
     restricted_path: optional, list of strings
-        Traverse the data tree only along the given path. When the end of the given path
-        is reached, traverse the full tree as normal. See docstring of 'scanner' for
-        more details.
+        Traverse the data tree only along the given path. When the end
+        of the given path is reached, traverse the full tree as
+        normal. See docstring of 'scanner' for more details.

     Returns
     -------
@@ -455,26 +457,31 @@ def scan_directory(dirname: Union[str, list[str]], crawler_definition_path: str,
     if not dirname:
         raise ValueError(
             "You have to provide a non-empty path for crawling.")
-    dir_structure_name = os.path.basename(dirname)
-
-    # TODO: needs to be covered somewhere else
-    crawled_directory = dirname
-    if not dir_structure_name and dirname.endswith('/'):
-        if dirname == '/':
-            # Crawling the entire file system
-            dir_structure_name = "root"
-        else:
-            # dirname had a trailing '/'
-            dir_structure_name = os.path.basename(dirname[:-1])
-
-    return scan_structure_elements(Directory(dir_structure_name,
-                                             dirname),
-                                   crawler_definition,
-                                   converter_registry,
-                                   restricted_path=restricted_path,
-                                   debug_tree=debug_tree,
-                                   registered_transformer_functions=registered_transformer_functions
-                                   )
+    if not isinstance(dirname, list):
+        dirname = [dirname]
+    dir_element_list = []
+    for dname in dirname:
+        dir_structure_name = os.path.basename(dname)
+
+        # TODO: needs to be covered somewhere else
+        crawled_directory = dname
+        if not dir_structure_name and dname.endswith('/'):
+            if dname == '/':
+                # Crawling the entire file system
+                dir_structure_name = "root"
+            else:
+                # dirname had a trailing '/'
+                dir_structure_name = os.path.basename(dname[:-1])
+        dir_element_list.append(Directory(dir_structure_name, dname))
+
+    return scan_structure_elements(
+        dir_element_list,
+        crawler_definition,
+        converter_registry,
+        restricted_path=restricted_path,
+        debug_tree=debug_tree,
+        registered_transformer_functions=registered_transformer_functions
+    )


 def scan_structure_elements(items: Union[list[StructureElement], StructureElement],
-- 
GitLab
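
Taken together, patches 1 and 2 mean the scanner entry points accept either a
single path or a list of paths. A minimal usage sketch (the directory names and
the cfood file name are illustrative, not taken from the patches):

    from caoscrawler.scanner import scan_directory

    # Single directory, as before:
    data = scan_directory("/data/run1", "cfood.yml")

    # New: several directories in one call. Each entry is wrapped in its own
    # Directory structure element before being handed to scan_structure_elements.
    data = scan_directory(["/data/run1", "/data/run2"], "cfood.yml")
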
From f0a56809211896b0387ffd0b656d5668b85d6466 Mon Sep 17 00:00:00 2001
From: Florian Spreckelsen <f.spreckelsen@indiscale.com>
Date: Thu, 5 Dec 2024 18:39:21 +0100
Subject: [PATCH 3/8] TST: Add integration test for crawler_main with list of
 dirs

---
 integrationtests/test_crawler_main.py         | 91 +++++++++++++++++++
 .../crawler_main_with_list_of_dirs/cfood.yml  | 10 ++
 .../dir1/.gitkeep                             |  0
 .../dir2/.gitkeep                             |  0
 .../identifiable.yml                          |  2 +
 integrationtests/test_issues.py               |  2 +-
 6 files changed, 104 insertions(+), 1 deletion(-)
 create mode 100644 integrationtests/test_crawler_main.py
 create mode 100644 integrationtests/test_data/crawler_main_with_list_of_dirs/cfood.yml
 create mode 100644 integrationtests/test_data/crawler_main_with_list_of_dirs/dir1/.gitkeep
 create mode 100644 integrationtests/test_data/crawler_main_with_list_of_dirs/dir2/.gitkeep
 create mode 100644 integrationtests/test_data/crawler_main_with_list_of_dirs/identifiable.yml

diff --git a/integrationtests/test_crawler_main.py b/integrationtests/test_crawler_main.py
new file mode 100644
index 00000000..3c0ec57e
--- /dev/null
+++ b/integrationtests/test_crawler_main.py
@@ -0,0 +1,91 @@
+# This file is a part of the LinkAhead Project.
+#
+# Copyright (C) 2024 Indiscale GmbH <info@indiscale.com>
+#               2024 Florian Spreckelsen <f.spreckelsen@indiscale.com>
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <https://www.gnu.org/licenses/>.
+#
+
+import logging
+
+from pathlib import Path
+
+import linkahead as db
+
+from caoscrawler import crawl
+from caoscrawler.crawl import (crawler_main, SecurityMode)
+from linkahead.utils.register_tests import clear_database, set_test_key
+
+set_test_key("10b128cf8a1372f30aa3697466bb55e76974e0c16a599bb44ace88f19c8f61e2")
+
+INTTESTDIR = Path(__file__).parent
+
+
+def test_list_of_paths(clear_database, monkeypatch):
+
+    # Mock the status record
+    dummy_status = {
+        "n_calls": 0
+    }
+
+    def _mock_update_status_record(run_id, n_inserts, n_updates, status):
+        print("Update mocked status")
+        dummy_status["run_id"] = run_id
+        dummy_status["n_inserts"] = n_inserts
+        dummy_status["n_updates"] = n_updates
+        dummy_status["status"] = status
+        dummy_status["n_calls"] += 1
+    monkeypatch.setattr(crawl, "_update_status_record", _mock_update_status_record)
+
+    # mock SSS environment
+    monkeypatch.setenv("SHARED_DIR", "/tmp")
+
+    # We need only one dummy RT
+    rt = db.RecordType(name="TestType").insert()
+    basepath = INTTESTDIR / "test_data" / "crawler_main_with_list_of_dirs"
+    dirlist = [basepath / "dir1", basepath / "dir2"]
+    crawler_main(
+        dirlist,
+        cfood_file_name=basepath / "cfood.yml",
+        identifiables_definition_file=basepath / "identifiable.yml"
+    )
+    recs = db.execute_query("FIND TestType")
+    assert len(recs) == 2
+    assert "Test1" in [r.name for r in recs]
+    assert "Test2" in [r.name for r in recs]
+
+    assert dummy_status["n_inserts"] == 2
+    assert dummy_status["n_updates"] == 0
+    assert dummy_status["status"] == "OK"
+    assert dummy_status["n_calls"] == 1
+
+
+def test_not_implemented_list_with_authorization(caplog, clear_database):
+
+    rt = db.RecordType(name="TestType").insert()
+    basepath = INTTESTDIR / "test_data" / "crawler_main_with_list_of_dirs"
+    dirlist = [basepath / "dir1", basepath / "dir2"]
+
+    # This is not implemented yet, so check log for correct error.
+    crawler_main(
+        dirlist,
+        cfood_file_name=basepath / "cfood.yml",
+        identifiables_definition_file=basepath / "identifiable.yml",
+        securityMode=SecurityMode.RETRIEVE
+    )
+    err_tuples = [t for t in caplog.record_tuples if t[1] == logging.ERROR]
+    assert len(err_tuples) == 1
+    assert "currently implemented only for single paths, not for lists of paths" in err_tuples[0][2]
+    # No inserts after the errors
+    assert len(db.execute_query("FIND TestType")) == 0
diff --git a/integrationtests/test_data/crawler_main_with_list_of_dirs/cfood.yml b/integrationtests/test_data/crawler_main_with_list_of_dirs/cfood.yml
new file mode 100644
index 00000000..c7f22ce0
--- /dev/null
+++ b/integrationtests/test_data/crawler_main_with_list_of_dirs/cfood.yml
@@ -0,0 +1,10 @@
+---
+metadata:
+  crawler-version: 0.10.2
+---
+BaseDirElement:
+  type: Directory
+  match: ^dir(?P<dir_number>[0-9]+)$$
+  records:
+    TestType:
+      name: Test$dir_number
diff --git a/integrationtests/test_data/crawler_main_with_list_of_dirs/dir1/.gitkeep b/integrationtests/test_data/crawler_main_with_list_of_dirs/dir1/.gitkeep
new file mode 100644
index 00000000..e69de29b
diff --git a/integrationtests/test_data/crawler_main_with_list_of_dirs/dir2/.gitkeep b/integrationtests/test_data/crawler_main_with_list_of_dirs/dir2/.gitkeep
new file mode 100644
index 00000000..e69de29b
diff --git a/integrationtests/test_data/crawler_main_with_list_of_dirs/identifiable.yml b/integrationtests/test_data/crawler_main_with_list_of_dirs/identifiable.yml
new file mode 100644
index 00000000..6d608cec
--- /dev/null
+++ b/integrationtests/test_data/crawler_main_with_list_of_dirs/identifiable.yml
@@ -0,0 +1,2 @@
+TestType:
+  - name
diff --git a/integrationtests/test_issues.py b/integrationtests/test_issues.py
index cb1e2e09..c699e0ab 100644
--- a/integrationtests/test_issues.py
+++ b/integrationtests/test_issues.py
@@ -1,4 +1,4 @@
-# This file is a part of the CaosDB Project.
+# This file is a part of the LinkAhead Project.
 #
 # Copyright (C) 2022 Indiscale GmbH <info@indiscale.com>
 #               2022 Florian Spreckelsen <f.spreckelsen@indiscale.com>
-- 
GitLab
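
A note on the test cfood above: the doubled `$$` in the match pattern most
likely avoids a single `$` being read as a cfood variable reference (and even
taken verbatim, a doubled end anchor is harmless in a regular expression, so
the match behaves the same either way). The effective match logic, sketched
with plain `re` against the test's directory names:

    import re

    # Effective pattern applied to each directory name:
    pattern = re.compile(r"^dir(?P<dir_number>[0-9]+)$")

    match = pattern.match("dir1")
    assert match is not None
    # The named group feeds the $dir_number substitution in the cfood,
    # producing the record names Test1 and Test2 that the test asserts:
    assert "Test" + match.group("dir_number") == "Test1"
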
+# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <https://www.gnu.org/licenses/>. +# + +import logging + +from pathlib import Path + +import linkahead as db + +from caoscrawler import crawl +from caoscrawler.crawl import (crawler_main, SecurityMode) +from linkahead.utils.register_tests import clear_database, set_test_key + +set_test_key("10b128cf8a1372f30aa3697466bb55e76974e0c16a599bb44ace88f19c8f61e2") + +INTTESTDIR = Path(__file__).parent + + +def test_list_of_paths(clear_database, monkeypatch): + + # Mock the status record + dummy_status = { + "n_calls": 0 + } + + def _mock_update_status_record(run_id, n_inserts, n_updates, status): + print("Update mocked status") + dummy_status["run_id"] = run_id + dummy_status["n_inserts"] = n_inserts + dummy_status["n_updates"] = n_updates + dummy_status["status"] = status + dummy_status["n_calls"] += 1 + monkeypatch.setattr(crawl, "_update_status_record", _mock_update_status_record) + + # mock SSS environment + monkeypatch.setenv("SHARED_DIR", "/tmp") + + # We need only one dummy RT + rt = db.RecordType(name="TestType").insert() + basepath = INTTESTDIR / "test_data" / "crawler_main_with_list_of_dirs" + dirlist = [basepath / "dir1", basepath / "dir2"] + crawler_main( + dirlist, + cfood_file_name=basepath / "cfood.yml", + identifiables_definition_file=basepath / "identifiable.yml" + ) + recs = db.execute_query("FIND TestType") + assert len(recs) == 2 + assert "Test1" in [r.name for r in recs] + assert "Test2" in [r.name for r in recs] + + assert dummy_status["n_inserts"] == 2 + assert dummy_status["n_updates"] == 0 + assert dummy_status["status"] == "OK" + assert dummy_status["n_calls"] == 1 + + +def test_not_implemented_list_with_authorization(caplog, clear_database): + + rt = db.RecordType(name="TestType").insert() + basepath = INTTESTDIR / "test_data" / "crawler_main_with_list_of_dirs" + dirlist = [basepath / "dir1", basepath / "dir2"] + + # This is not implemented yet, so check log for correct error. 
From ff0a883a67052f97d0681903c1c24a09048d173c Mon Sep 17 00:00:00 2001
From: Florian Spreckelsen <f.spreckelsen@indiscale.com>
Date: Fri, 6 Dec 2024 11:21:35 +0100
Subject: [PATCH 5/8] DOC: Update changelog

---
 CHANGELOG.md | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 4fe40138..fe302156 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -23,6 +23,13 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
   variables to `int`, `float`, `str` and `bool`.
 - Transformer function definition in the cfood support variable
   substitutions now.
+- `crawler_main` and `scanner.scan_directory` now support lists of
+  directories to be crawled, too. Note that giving a list of
+  directories is currently incompatible with
+  `securityMode=SecurityMode.RETRIEVE` or
+  `securityMode=SecurityMode.INSERT` since the functionality to
+  authorize pending inserts or updates doesn't support path lists yet
+  and will raise a NotImplementedError for now.

 ### Changed ###
-- 
GitLab
From 4de47f4ab588e3367369918075d9d0b9292d11a5 Mon Sep 17 00:00:00 2001
From: Florian Spreckelsen <f.spreckelsen@indiscale.com>
Date: Fri, 6 Dec 2024 11:28:39 +0100
Subject: [PATCH 6/8] TST: Test crawler_main return code

---
 integrationtests/test_crawler_main.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/integrationtests/test_crawler_main.py b/integrationtests/test_crawler_main.py
index 3c0ec57e..793dd6ed 100644
--- a/integrationtests/test_crawler_main.py
+++ b/integrationtests/test_crawler_main.py
@@ -78,12 +78,15 @@ def test_not_implemented_list_with_authorization(caplog, clear_database):
     dirlist = [basepath / "dir1", basepath / "dir2"]

     # This is not implemented yet, so check log for correct error.
-    crawler_main(
+    ret = crawler_main(
         dirlist,
         cfood_file_name=basepath / "cfood.yml",
         identifiables_definition_file=basepath / "identifiable.yml",
         securityMode=SecurityMode.RETRIEVE
     )
+    # crawler_main hides the error, but has a non-zero return code and
+    # errors in the log:
+    assert ret != 0
     err_tuples = [t for t in caplog.record_tuples if t[1] == logging.ERROR]
     assert len(err_tuples) == 1
     assert "currently implemented only for single paths, not for lists of paths" in err_tuples[0][2]
-- 
GitLab
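
The return-code contract tested here comes from patch 1: `crawler_main` logs
any error itself and signals failure only through its return value (0 on
success, 1 on failure). A hypothetical wrapper script (all file names are
placeholders) would simply forward that code to the shell:

    import sys

    from caoscrawler.crawl import crawler_main, SecurityMode

    # crawler_main does not raise on crawl errors; callers check the return code.
    sys.exit(crawler_main(
        ["dir1", "dir2"],
        cfood_file_name="cfood.yml",
        securityMode=SecurityMode.UPDATE,
    ))
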
From 3ebadf374f81f41e5e1aac8c0ecedba6bc71c6e9 Mon Sep 17 00:00:00 2001
From: Florian Spreckelsen <f.spreckelsen@indiscale.com>
Date: Fri, 6 Dec 2024 17:17:03 +0100
Subject: [PATCH 7/8] MAINT: Use platform-independent tmp and paths

---
 integrationtests/test_crawler_main.py | 3 ++-
 src/caoscrawler/scanner.py            | 4 ++--
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/integrationtests/test_crawler_main.py b/integrationtests/test_crawler_main.py
index 793dd6ed..a2eebf4f 100644
--- a/integrationtests/test_crawler_main.py
+++ b/integrationtests/test_crawler_main.py
@@ -18,6 +18,7 @@
 #

 import logging
+import tempfile

 from pathlib import Path

@@ -49,7 +50,7 @@ def test_list_of_paths(clear_database, monkeypatch):
     monkeypatch.setattr(crawl, "_update_status_record", _mock_update_status_record)

     # mock SSS environment
-    monkeypatch.setenv("SHARED_DIR", "/tmp")
+    monkeypatch.setenv("SHARED_DIR", tempfile.gettempdir())

     # We need only one dummy RT
     rt = db.RecordType(name="TestType").insert()
diff --git a/src/caoscrawler/scanner.py b/src/caoscrawler/scanner.py
index 6b4d7a12..af1f4173 100644
--- a/src/caoscrawler/scanner.py
+++ b/src/caoscrawler/scanner.py
@@ -465,8 +465,8 @@ def scan_directory(dirname: Union[str, list[str]], crawler_definition_path: str,

         # TODO: needs to be covered somewhere else
         crawled_directory = dname
-        if not dir_structure_name and dname.endswith('/'):
-            if dname == '/':
+        if not dir_structure_name and dname.endswith(os.path.sep):
+            if dname == os.path.sep:
                 # Crawling the entire file system
                 dir_structure_name = "root"
             else:
-- 
GitLab

From caa680aee3b474bf8f153e4f476c8b087402333c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Henrik=20tom=20W=C3=B6rden?= <h.tomwoerden@indiscale.com>
Date: Tue, 10 Dec 2024 13:34:00 +0100
Subject: [PATCH 8/8] DOC: add comment (TODO)

---
 src/caoscrawler/crawl.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/caoscrawler/crawl.py b/src/caoscrawler/crawl.py
index e3dd0488..a939b2ff 100644
--- a/src/caoscrawler/crawl.py
+++ b/src/caoscrawler/crawl.py
@@ -1166,6 +1166,7 @@ def parse_args():
                         "This file will only be generated if this option is set.")
     parser.add_argument("--debug", required=False, action="store_true",
                         help="Path name of the cfood yaml file to be used.")
+    # TODO allow to provide multiple directories to be crawled on the commandline
     parser.add_argument("crawled_directory_path",
                         help="The subtree of files below the given path will "
                         "be considered. Use '/' for everything.")
-- 
GitLab
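
The portability changes in patch 7 boil down to two standard-library
substitutions; a quick sketch of their behavior:

    import os
    import tempfile

    tmp = tempfile.gettempdir()  # e.g. "/tmp" on Linux, a per-user temp dir on Windows
    sep = os.path.sep            # "/" on POSIX, "\\" on Windows

    # The trailing-separator handling in scan_directory then behaves the same
    # on every platform: an empty basename marks a trailing separator.
    dname = "data" + sep
    assert not os.path.basename(dname)
    assert dname.endswith(os.path.sep)
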