# NOTE(review): presumably the property names that the converters treat
# specially -- confirm against the full module.
SPECIAL_PROPERTIES = ("description", "name", "id", "path", "file", "checksum", "size")

# Identifier of a crawler variable: one word character that is not a decimal
# digit (any Unicode letter or "_"), followed by any number of word characters
# or dots.  The first character is deliberately NOT written as ``\D``: that
# class matches every non-digit, so "}", "-", " " or "$" would be accepted as
# the start of a variable name (e.g. "${}" would parse as the variable "}").
ID_PATTERN = r"[^\W\d][.\w]*"

# Matches a string that consists of exactly one variable reference, written
# either as ``$name`` or ``${name}``.  The identifier is available as the named
# group ``varname``.  Both braces are individually optional in this pattern, so
# it does not check that they are balanced.
SINGLE_VAR_RE = re.compile(r"^\$(\{)?(?P<varname>" + ID_PATTERN + r")(\})?$")

logger = logging.getLogger(__name__)


class CrawlerTemplate(Template):
    """``string.Template`` whose braced placeholders accept ``ID_PATTERN``.

    Only ``braceidpattern`` is overridden, so ``${...}`` placeholders may
    contain dots and non-ASCII word characters, while unbraced ``$name``
    placeholders keep the default identifier pattern of ``string.Template``.
    """
    # This also adds a dot to the default pattern.
    # See: https://docs.python.org/3/library/string.html#template-strings
    # Default flags is re.IGNORECASE
    braceidpattern = ID_PATTERN
def test_issue_93():
    """https://gitlab.com/linkahead/linkahead-crawler/-/issues/93

    cfood.yaml does not allow umlaut in $expression

    Checks that variable names containing dots, a leading underscore or
    non-ASCII letters are replaced, both by ``replace_variables`` and by
    ``CrawlerTemplate.safe_substitute``.
    """
    values = GeneralStore()
    expressions = [
        "foo",
        "foo.bär",
        "_1",
        "Ä",
        "ųøîµ",
    ]
    for exp in expressions:
        values[exp] = f"This is {exp}"

    # ## Test preliminary check
    # The whole string is a single variable reference.
    # With braces
    for exp in expressions:
        assert replace_variables(f"${{{exp}}}", values) == f"This is {exp}"
    # Without braces
    for exp in expressions:
        assert replace_variables(f"${exp}", values) == f"This is {exp}"

    # ## Test actual replacement
    # The store is not modified any more, so fetch the mapping only once.
    storage = values.get_storage()
    for exp in expressions:
        # as-is
        propvalue_template = CrawlerTemplate(f"${{{exp}}}")
        assert propvalue_template.safe_substitute(**storage) == f"This is {exp}"

        # String embedded into context
        propvalue_template = CrawlerTemplate(
            f"some text before >> ${{{exp}}} << some text after")
        assert (propvalue_template.safe_substitute(**storage)
                == f"some text before >> This is {exp} << some text after")