Commit 9dafdd9f authored by Henrik tom Wörden

Merge branch 'f-unify-notifications' into 'dev'

ENH: Allow crawler_main to operate on a list of paths

See merge request !208
parents 493d1acc 02325fd4
Part of 2 merge requests: !217 (TST: Make NamedTemporaryFiles Windows-compatible) and !208 (ENH: Allow crawler_main to operate on a list of paths)
Pipeline #58960 passed
......@@ -23,6 +23,13 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
variables to `int`, `float`, `str` and `bool`.
- Transformer function definitions in the cfood now support variable
  substitutions.
- `crawler_main` and `scanner.scan_directory` now also accept a list of
  directories to be crawled (see the usage sketch below this excerpt).
  Note that passing a list of directories is currently incompatible with
  `securityMode=SecurityMode.RETRIEVE` or
  `securityMode=SecurityMode.INSERT`, since the functionality to
  authorize pending inserts or updates does not support path lists yet
  and will raise a `NotImplementedError` for now.
- `match_newer_than_file` option for `DirectoryConverter`: A reference
file containing (only) an ISO-formatted datetime string can be
specified here. Directories with this option won't match if all
......
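A minimal usage sketch of the new list support (not part of the merge request itself): the directory and file names below are placeholders, and the call mirrors the integration test shown further down.

from caoscrawler.crawl import crawler_main, SecurityMode

# Crawl two directories in a single run. According to the changelog entry
# above, SecurityMode.UPDATE is currently the only security mode that
# accepts a list of paths.
ret = crawler_main(
    ["/data/dir1", "/data/dir2"],  # list of paths instead of a single string
    cfood_file_name="cfood.yml",
    identifiables_definition_file="identifiable.yml",
    securityMode=SecurityMode.UPDATE,
)
assert ret == 0  # crawler_main returns 0 on success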
# This file is a part of the LinkAhead Project.
#
# Copyright (C) 2024 Indiscale GmbH <info@indiscale.com>
# 2024 Florian Spreckelsen <f.spreckelsen@indiscale.com>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
#
import logging
import tempfile
from pathlib import Path
import linkahead as db
from caoscrawler import crawl
from caoscrawler.crawl import (crawler_main, SecurityMode)
from linkahead.utils.register_tests import clear_database, set_test_key
set_test_key("10b128cf8a1372f30aa3697466bb55e76974e0c16a599bb44ace88f19c8f61e2")

INTTESTDIR = Path(__file__).parent


def test_list_of_paths(clear_database, monkeypatch):
    # Mock the status record
    dummy_status = {
        "n_calls": 0
    }

    def _mock_update_status_record(run_id, n_inserts, n_updates, status):
        print("Update mocked status")
        dummy_status["run_id"] = run_id
        dummy_status["n_inserts"] = n_inserts
        dummy_status["n_updates"] = n_updates
        dummy_status["status"] = status
        dummy_status["n_calls"] += 1
    monkeypatch.setattr(crawl, "_update_status_record", _mock_update_status_record)

    # mock SSS environment
    monkeypatch.setenv("SHARED_DIR", tempfile.gettempdir())

    # We need only one dummy RT
    rt = db.RecordType(name="TestType").insert()

    basepath = INTTESTDIR / "test_data" / "crawler_main_with_list_of_dirs"
    dirlist = [basepath / "dir1", basepath / "dir2"]
    crawler_main(
        dirlist,
        cfood_file_name=basepath / "cfood.yml",
        identifiables_definition_file=basepath / "identifiable.yml"
    )
    recs = db.execute_query("FIND TestType")
    assert len(recs) == 2
    assert "Test1" in [r.name for r in recs]
    assert "Test2" in [r.name for r in recs]

    assert dummy_status["n_inserts"] == 2
    assert dummy_status["n_updates"] == 0
    assert dummy_status["status"] == "OK"
    assert dummy_status["n_calls"] == 1


def test_not_implemented_list_with_authorization(caplog, clear_database):
    rt = db.RecordType(name="TestType").insert()

    basepath = INTTESTDIR / "test_data" / "crawler_main_with_list_of_dirs"
    dirlist = [basepath / "dir1", basepath / "dir2"]

    # This is not implemented yet, so check log for correct error.
    ret = crawler_main(
        dirlist,
        cfood_file_name=basepath / "cfood.yml",
        identifiables_definition_file=basepath / "identifiable.yml",
        securityMode=SecurityMode.RETRIEVE
    )
    # crawler_main hides the error, but has a non-zero return code and
    # errors in the log:
    assert ret != 0
    err_tuples = [t for t in caplog.record_tuples if t[1] == logging.ERROR]
    assert len(err_tuples) == 1
    assert "currently implemented only for single paths, not for lists of paths" in err_tuples[0][2]

    # No inserts after the errors
    assert len(db.execute_query("FIND TestType")) == 0
---
metadata:
  crawler-version: 0.10.2
---
BaseDirElement:
  type: Directory
  match: ^dir(?P<dir_number>[0-9]+)$$
  records:
    TestType:
      name: Test$dir_number
TestType:
  - name
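The two YAML snippets above are the cfood and the identifiable definition used by the tests: each directory matching the dir pattern produces one TestType record named via the Test$dir_number template, and TestType records are identified by their name. A sketch of the expected outcome for the test directories dir1 and dir2, written with the linkahead client:

import linkahead as db

# Records the crawler should create for dir1 and dir2 (sketch only):
expected = [
    db.Record(name="Test1").add_parent(name="TestType"),
    db.Record(name="Test2").add_parent(name="TestType"),
]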
# This file is a part of the CaosDB Project.
# This file is a part of the LinkAhead Project.
#
# Copyright (C) 2022 Indiscale GmbH <info@indiscale.com>
# 2022 Florian Spreckelsen <f.spreckelsen@indiscale.com>
......
......@@ -621,7 +621,7 @@ one with the entities that need to be updated and the other with entities to be
crawled_data: Optional[list[db.Record]] = None,
no_insert_RTs: Optional[list[str]] = None,
no_update_RTs: Optional[list[str]] = None,
path_for_authorized_run: Optional[str] = "",
path_for_authorized_run: Optional[Union[str, list[str]]] = "",
):
"""
This function applies several stages:
......@@ -643,7 +643,7 @@ one with the entities that need to be updated and the other with entities to be
no_update_RTs : list[str], optional
list of RecordType names. Records that have one of those RecordTypes
as parent will not be updated
path_for_authorized_run : str, optional
path_for_authorized_run : str or list[str], optional
only used if there are changes that need authorization before being
applied. The form for rerunning the crawler with the authorization
of these changes will be generated with this path. See
......@@ -661,6 +661,12 @@ one with the entities that need to be updated and the other with entities to be
"use for example the Scanner to create this data."))
crawled_data = self.crawled_data
if isinstance(path_for_authorized_run, list) and self.securityMode != SecurityMode.UPDATE:
raise NotImplementedError(
"Authorization of inserts and updates is currently implemented only "
"for single paths, not for lists of paths."
)
to_be_inserted, to_be_updated = self._split_into_inserts_and_updates(
SyncGraph(crawled_data, self.identifiableAdapter))
......@@ -1004,7 +1010,7 @@ def _store_dry_run_data(ins, upd):
"update": updates}))
def crawler_main(crawled_directory_path: str,
def crawler_main(crawled_directory_path: Union[str, list[str]],
cfood_file_name: str,
identifiables_definition_file: Optional[str] = None,
debug: bool = False,
......@@ -1022,8 +1028,8 @@ def crawler_main(crawled_directory_path: str,
Parameters
----------
crawled_directory_path : str
path to be crawled
crawled_directory_path : str or list[str]
path(s) to be crawled
cfood_file_name : str
filename of the cfood to be used
identifiables_definition_file : str
......@@ -1115,42 +1121,28 @@ def crawler_main(crawled_directory_path: str,
crawler.run_id)
_update_status_record(crawler.run_id, len(inserts), len(updates), status="OK")
return 0
except ForbiddenTransaction as err:
logger.debug(traceback.format_exc())
logger.error(err)
_update_status_record(crawler.run_id, 0, 0, status="FAILED")
return 1
except ConverterValidationError as err:
logger.debug(traceback.format_exc())
logger.error(err)
_update_status_record(crawler.run_id, 0, 0, status="FAILED")
return 1
except ImpossibleMergeError as err:
logger.debug(traceback.format_exc())
logger.error(
"Encountered conflicting information when creating Records from the crawled "
f"data:\n\n{err}"
)
_update_status_record(crawler.run_id, 0, 0, status="FAILED")
return 1
except TransactionError as err:
logger.debug(traceback.format_exc())
logger.error(err)
logger.error("Transaction error details:")
for suberr in err.errors:
logger.error("---")
logger.error(suberr.msg)
logger.error(suberr.entity)
return 1
except Exception as err:
logger.debug(traceback.format_exc())
logger.error(err)
if "SHARED_DIR" in os.environ:
# pylint: disable=E0601
domain = get_config_setting("public_host_url")
logger.error("Unexpected Error: Please tell your administrator about this and provide "
f"the following path.\n{get_shared_resource_link(domain, debuglog_public)}")
# Special treatment for known error types
if isinstance(err, ImpossibleMergeError):
logger.error(
"Encountered conflicting information when creating Records from the crawled "
f"data:\n\n{err}"
)
elif isinstance(err, TransactionError):
logger.error("Transaction error details:")
for suberr in err.errors:
logger.error("---")
logger.error(suberr.msg)
logger.error(suberr.entity)
# Unknown errors get a special message
elif not isinstance(err, (ConverterValidationError, ForbiddenTransaction)):
if "SHARED_DIR" in os.environ:
# pylint: disable=E0601
domain = get_config_setting("public_host_url")
logger.error("Unexpected Error: Please tell your administrator about this and provide "
f"the following path.\n{get_shared_resource_link(domain, debuglog_public)}")
_update_status_record(crawler.run_id, 0, 0, status="FAILED")
return 1
......@@ -1174,6 +1166,7 @@ def parse_args():
"This file will only be generated if this option is set.")
parser.add_argument("--debug", required=False, action="store_true",
help="Path name of the cfood yaml file to be used.")
# TODO allow to provide multiple directories to be crawled on the commandline
parser.add_argument("crawled_directory_path",
help="The subtree of files below the given path will "
"be considered. Use '/' for everything.")
......
......@@ -421,7 +421,7 @@ def scanner(items: list[StructureElement],
# --------------------------------------------------------------------------------
def scan_directory(dirname: str, crawler_definition_path: str,
def scan_directory(dirname: Union[str, list[str]], crawler_definition_path: str,
restricted_path: Optional[list[str]] = None,
debug_tree: Optional[DebugTree] = None):
""" Crawl a single directory.
......@@ -434,10 +434,12 @@ def scan_directory(dirname: str, crawler_definition_path: str,
Parameters
----------
dirname: str or list[str]
directory or list of directories to be scanned
restricted_path: optional, list of strings
Traverse the data tree only along the given path. When the end of the given path
is reached, traverse the full tree as normal. See docstring of 'scanner' for
more details.
Traverse the data tree only along the given path. When the end
of the given path is reached, traverse the full tree as
normal. See docstring of 'scanner' for more details.
Returns
-------
......@@ -455,26 +457,31 @@ def scan_directory(dirname: str, crawler_definition_path: str,
if not dirname:
raise ValueError(
"You have to provide a non-empty path for crawling.")
dir_structure_name = os.path.basename(dirname)
# TODO: needs to be covered somewhere else
crawled_directory = dirname
if not dir_structure_name and dirname.endswith('/'):
if dirname == '/':
# Crawling the entire file system
dir_structure_name = "root"
else:
# dirname had a trailing '/'
dir_structure_name = os.path.basename(dirname[:-1])
return scan_structure_elements(Directory(dir_structure_name,
dirname),
crawler_definition,
converter_registry,
restricted_path=restricted_path,
debug_tree=debug_tree,
registered_transformer_functions=registered_transformer_functions
)
if not isinstance(dirname, list):
dirname = [dirname]
dir_element_list = []
for dname in dirname:
dir_structure_name = os.path.basename(dname)
# TODO: needs to be covered somewhere else
crawled_directory = dname
if not dir_structure_name and dname.endswith(os.path.sep):
if dname == os.path.sep:
# Crawling the entire file system
dir_structure_name = "root"
else:
# dirname had a trailing '/'
dir_structure_name = os.path.basename(dname[:-1])
dir_element_list.append(Directory(dir_structure_name, dname))
return scan_structure_elements(
dir_element_list,
crawler_definition,
converter_registry,
restricted_path=restricted_path,
debug_tree=debug_tree,
registered_transformer_functions=registered_transformer_functions
)
def scan_structure_elements(items: Union[list[StructureElement], StructureElement],
......
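To round off the scanner changes, a short sketch of calling scan_directory with the new list support; the paths and the cfood file name are placeholders, and the exact shape of the return value is described in the (truncated) docstring above:

from caoscrawler.scanner import scan_directory

# Both calls are valid now: a single path or a list of paths.
data_single = scan_directory("/data/dir1", "cfood.yml")
data_many = scan_directory(["/data/dir1", "/data/dir2"], "cfood.yml")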