diff --git a/CHANGELOG.md b/CHANGELOG.md
index bb5c7f18cd3c9e110fe618d951cff373e5454702..2ba6c84749478314882a4131754bf9cc7fc5b184 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -19,6 +19,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ### Fixed ###
 
+* Use `urllib.parse.urljoin` to generate link addresses in status
+  mails, preventing wrong addresses, e.g., due to superfluous `/`.
+
 ### Security ###
 
 ### Documentation ###
diff --git a/src/caoscrawler/crawl.py b/src/caoscrawler/crawl.py
index 0f23acfdfde2a863a66f25901a85748b538f5d04..c761d730842647a1ec401966c4221811dda1cd9c 100644
--- a/src/caoscrawler/crawl.py
+++ b/src/caoscrawler/crawl.py
@@ -39,6 +39,7 @@ import sys
 import traceback
 import uuid
 import warnings
+
 from argparse import RawTextHelpFormatter
 from copy import deepcopy
 from datetime import datetime
@@ -72,6 +73,7 @@ from .scanner import (create_converter_registry, initialize_converters,
 from .stores import GeneralStore
 from .structure_elements import StructureElement
 from .sync_graph import SyncGraph
+from .utils import get_shared_resource_link
 
 logger = logging.getLogger(__name__)
 
@@ -750,16 +752,17 @@ one with the entities that need to be updated and the other with entities to be
         # Sending an Email with a link to a form to authorize updates is
         if get_config_setting("send_crawler_notifications"):
             filename = OldCrawler.save_form([el[3] for el in pending_changes], path, run_id)
-            text = """Dear Curator,
+            link_address = get_shared_resource_link(db.configuration.get_config()[
+                "Connection"]["url"], filename)
+            changes = "\n".join([el[3] for el in pending_changes])
+            text = f"""Dear Curator,
     there where changes that need your authorization. Please check the following
     carefully and if the changes are ok, click on the following link:
 
-    {url}/Shared/{filename}
+    {link_address}
 
     {changes}
-    """.format(url=db.configuration.get_config()["Connection"]["url"],
-               filename=filename,
-               changes="\n".join([el[3] for el in pending_changes]))
+    """
             try:
                 fro = get_config_setting("sendmail_from_address")
                 to = get_config_setting("sendmail_to_address")
@@ -899,7 +902,7 @@ the CaosDB Crawler successfully crawled the data and
         if get_config_setting("create_crawler_status_records"):
             text += ("You can checkout the CrawlerRun Record for more information:\n"
                      f"{domain}/Entity/?P=0L10&query=find%20crawlerrun%20with%20run_id=%27{run_id}%27\n\n")
-        text += (f"You can download the logfile here:\n{domain}/Shared/" + logfile)
+        text += (f"You can download the logfile here:\n{get_shared_resource_link(domain, logfile)}")
         send_mail(
             from_addr=get_config_setting("sendmail_from_address"),
             to=get_config_setting("sendmail_to_address"),
@@ -1059,7 +1062,7 @@ def crawler_main(crawled_directory_path: str,
         userlog_public, htmluserlog_public, debuglog_public = configure_server_side_logging()
         # TODO make this optional
         _create_status_record(
-            get_config_setting("public_host_url") + "/Shared/" + htmluserlog_public,
+            get_shared_resource_link(get_config_setting("public_host_url"), htmluserlog_public),
            crawler.run_id)
     else:  # setup stdout logging for other cases
         root_logger = logging.getLogger()
@@ -1128,7 +1131,7 @@ def crawler_main(crawled_directory_path: str,
             # pylint: disable=E0601
             domain = get_config_setting("public_host_url")
             logger.error("Unexpected Error: Please tell your administrator about this and provide "
-                         f"the following path.\n{domain}/Shared/" + debuglog_public)
+                         f"the following path.\n{get_shared_resource_link(domain, debuglog_public)}")
         _update_status_record(crawler.run_id, 0, 0, status="FAILED")
         return 1
 
diff --git a/src/caoscrawler/utils.py b/src/caoscrawler/utils.py
index 096fde9b573f4ff60995498144cad3589ce7dbb2..d9a5af839068a2582859aad1b51fbc8b9713d5d1 100644
--- a/src/caoscrawler/utils.py
+++ b/src/caoscrawler/utils.py
@@ -26,7 +26,10 @@
 # Some utility functions, e.g. for extending pylib.
 
 import sys
+
+from posixpath import join as posixjoin
 from typing import Optional
+from urllib.parse import urljoin
 
 import linkahead as db
 
@@ -69,3 +72,18 @@ def MissingImport(name: str, hint: str = "", err: Optional[Exception] = None) ->
     _DummyClass.__name__ = name
 
     return _DummyClass
+
+
+def get_shared_resource_link(host_url, filename):
+    """Return a link address which is basically {host_url}/Shared/{filename}.
+
+    Use ``urllib.parse.urljoin`` and ``posixpath.join`` to prevent missing or extra ``/`` and the like.
+
+    """
+
+    if not host_url.endswith('/'):
+        # Add a trailing '/' so that urljoin doesn't remove the context root.
+        host_url += '/'
+    # Use posixjoin to always have '/' in links, even when running on
+    # Windows systems.
+    return urljoin(host_url, posixjoin("Shared/", filename))
diff --git a/unittests/test_utilities.py b/unittests/test_utilities.py
index dfb79c8b6b10909952174cf24c3aa9198f3b7743..15e84a609149ac602ee80b7357f7622566563792 100644
--- a/unittests/test_utilities.py
+++ b/unittests/test_utilities.py
@@ -22,7 +22,7 @@
 import pytest
 
 from caoscrawler.crawl import split_restricted_path
-from caoscrawler.utils import MissingImport
+from caoscrawler.utils import get_shared_resource_link, MissingImport
 
 
 def test_split_restricted_path():
@@ -66,3 +66,17 @@ def test_dummy_class():
     assert "(Not Important)" in msg
     orig_msg = str(err_info.value.__cause__)
     assert orig_msg == "Old error"
+
+
+def test_shared_resource_link():
+
+    assert get_shared_resource_link(
+        "https://example.com/", "file.txt") == "https://example.com/Shared/file.txt"
+    assert get_shared_resource_link(
+        "https://example.com", "file.txt") == "https://example.com/Shared/file.txt"
+    assert get_shared_resource_link(
+        "https://example.com", "path/to/file.txt") == "https://example.com/Shared/path/to/file.txt"
+    assert get_shared_resource_link(
+        "https://example.com/context-root", "path/to/file.txt") == "https://example.com/context-root/Shared/path/to/file.txt"
+    assert get_shared_resource_link(
+        "https://example.com/context-root/", "path/to/file.txt") == "https://example.com/context-root/Shared/path/to/file.txt"
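
A minimal sketch (plain Python, for illustration) of the standard-library behaviour that get_shared_resource_link relies on: urljoin discards the last path segment of a base URL that has no trailing '/', which is why the helper appends one before joining, and posixpath.join keeps forward slashes even on Windows.

from posixpath import join as posixjoin
from urllib.parse import urljoin

# Without a trailing '/', urljoin treats "context-root" as a file name and drops it.
assert urljoin("https://example.com/context-root",
               "Shared/file.txt") == "https://example.com/Shared/file.txt"

# With a trailing '/', the context root is preserved in the generated link.
assert urljoin("https://example.com/context-root/",
               "Shared/file.txt") == "https://example.com/context-root/Shared/file.txt"

# posixjoin always joins with '/', independent of the operating system.
assert posixjoin("Shared/", "path/to/file.txt") == "Shared/path/to/file.txt"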