diff --git a/src/caoscrawler/crawl.py b/src/caoscrawler/crawl.py index e916e34fa4de53731d037a4e6838722926628402..542a35c5582d1a3fbb63fd92bdd5d9e82f6db9b1 100644 --- a/src/caoscrawler/crawl.py +++ b/src/caoscrawler/crawl.py @@ -45,7 +45,6 @@ from copy import deepcopy from datetime import datetime from enum import Enum from typing import Any, List, Optional, Union -from urllib.parse import urljoin import linkahead as db import yaml @@ -74,6 +73,7 @@ from .scanner import (create_converter_registry, initialize_converters, from .stores import GeneralStore from .structure_elements import StructureElement from .sync_graph import SyncGraph +from .utils import get_shared_resource_link logger = logging.getLogger(__name__) @@ -752,8 +752,8 @@ one with the entities that need to be updated and the other with entities to be # Sending an Email with a link to a form to authorize updates is if get_config_setting("send_crawler_notifications"): filename = OldCrawler.save_form([el[3] for el in pending_changes], path, run_id) - link_address = urljoin(db.configuration.get_config()[ - "Connection"]["url"], os.path.join("/Shared/", filename)) + link_address = get_shared_resource_link(db.configuration.get_config()[ + "Connection"]["url"], filename) text = """Dear Curator, there where changes that need your authorization. 
Please check the following carefully and if the changes are ok, click on the following link: @@ -902,8 +902,7 @@ the CaosDB Crawler successfully crawled the data and if get_config_setting("create_crawler_status_records"): text += ("You can checkout the CrawlerRun Record for more information:\n" f"{domain}/Entity/?P=0L10&query=find%20crawlerrun%20with%20run_id=%27{run_id}%27\n\n") - text += (f"You can download the logfile here:\n{ urljoin(domain, os.path.join('/Shared/', logfile))}") + text += (f"You can download the logfile here:\n{get_shared_resource_link(domain, logfile)}") send_mail( from_addr=get_config_setting("sendmail_from_address"), to=get_config_setting("sendmail_to_address"), @@ -1063,7 +1062,7 @@ def crawler_main(crawled_directory_path: str, userlog_public, htmluserlog_public, debuglog_public = configure_server_side_logging() # TODO make this optional _create_status_record( - urljoin(get_config_setting("public_host_url"), os.path.join("/Shared/", htmluserlog_public)), + get_shared_resource_link(get_config_setting("public_host_url"), htmluserlog_public), crawler.run_id) else: # setup stdout logging for other cases root_logger = logging.getLogger() @@ -1132,7 +1131,7 @@ def crawler_main(crawled_directory_path: str, # pylint: disable=E0601 domain = get_config_setting("public_host_url") logger.error("Unexpected Error: Please tell your administrator about this and provide " - f"the following path.\n{urljoin(domain, os.path.join('/Shared/', debuglog_public))}") + f"the following path.\n{get_shared_resource_link(domain, debuglog_public)}") _update_status_record(crawler.run_id, 0, 0, status="FAILED") return 1 diff --git a/src/caoscrawler/utils.py b/src/caoscrawler/utils.py index 096fde9b573f4ff60995498144cad3589ce7dbb2..65a1bb7cfcb58af851ced6e7d339282c05d6bdab 100644 --- a/src/caoscrawler/utils.py +++ b/src/caoscrawler/utils.py @@ -25,8 +25,11 @@ # Some utility functions, e.g. for extending pylib. 
+import os import sys + from typing import Optional +from urllib.parse import urljoin import linkahead as db @@ -69,3 +72,13 @@ def MissingImport(name: str, hint: str = "", err: Optional[Exception] = None) -> _DummyClass.__name__ = name return _DummyClass + + +def get_shared_resource_link(host_url, filename): + """Return a link address which is basically {host_url}/Shared/{filename}. + + Use urllib.parse.urljoin and os.path.join to prevent missing or extra ``/`` and the like. + + """ + + return urljoin(host_url, os.path.join("/Shared/", filename)) diff --git a/unittests/test_utilities.py b/unittests/test_utilities.py index dfb79c8b6b10909952174cf24c3aa9198f3b7743..89b0b40cf42e9313cc8bef311942e87c26f1a611 100644 --- a/unittests/test_utilities.py +++ b/unittests/test_utilities.py @@ -22,7 +22,7 @@ import pytest from caoscrawler.crawl import split_restricted_path -from caoscrawler.utils import MissingImport +from caoscrawler.utils import get_shared_resource_link, MissingImport def test_split_restricted_path(): @@ -66,3 +66,13 @@ def test_dummy_class(): assert "(Not Important)" in msg orig_msg = str(err_info.value.__cause__) assert orig_msg == "Old error" + + +def test_shared_resource_link(): + + assert get_shared_resource_link( + "https://example.com/", "file.txt") == "https://example.com/Shared/file.txt" + assert get_shared_resource_link( + "https://example.com", "file.txt") == "https://example.com/Shared/file.txt" + assert get_shared_resource_link( + "https://example.com", "path/to/file.txt") == "https://example.com/Shared/path/to/file.txt"