@staticmethod
def replace_name_with_referenced_entity_id(prop: db.Property):
    """Replace entity names by entity IDs in the value of a reference property (in place).

    Method of ``Crawler`` in ``src/caoscrawler/crawl.py``.

    - If the Property has a LIST datatype, each element is treated separately, using the
      element datatype of the list for the lookup.
    - If the (element) datatype is not a reference, or is one of the generic reference
      datatypes FILE or REFERENCE, the value stays unchanged.
    - Values (or list elements) that are not strings stay unchanged.
    - A LIST property whose value is ``None`` stays unchanged.
    - Each string is looked up with the query
      ``FIND Entity <datatype> with name='<value>'`` and replaced by the ``id`` of the
      Entity that is returned.

    Parameters
    ----------
    prop : db.Property
        The property whose value shall be resolved. It is modified in place.

    Raises
    ------
    db.EmptyUniqueQueryError, db.QueryNotUniqueError
        If a lookup does not identify exactly one Entity. The error is logged and re-raised;
        ``prop.value`` is not modified in that case.
    """
    def is_specific_reference(datatype):
        # Generic references (FILE, REFERENCE) carry no RecordType to restrict the query.
        return (is_reference(datatype) and datatype != db.FILE
                and datatype != db.REFERENCE)

    def lookup_id(name, datatype):
        # NOTE(review): ``name`` is interpolated into the query without escaping; a name
        # containing a single quote will presumably break the query -- confirm.
        try:
            # the lookup raises an error if the result is not unique
            return cached_get_entity_by(
                query=f"FIND Entity {datatype} with name='{name}'").id
        except (db.EmptyUniqueQueryError, db.QueryNotUniqueError):
            logger.error("The Property %s with datatype=%s has the value %s and there is "
                         "no appropriate Entity with such a name.",
                         prop.name, prop.datatype, name)
            raise

    element_datatype = get_list_datatype(prop.datatype)

    if element_datatype is None:  # not a list
        # Check the value type first so that the datatype is only inspected for strings.
        if isinstance(prop.value, str) and is_specific_reference(prop.datatype):
            prop.value = lookup_id(prop.value, prop.datatype)
        return

    if prop.value is None or not is_specific_reference(element_datatype):
        return

    # Build the complete list before assigning: a failing lookup leaves ``prop`` untouched.
    prop.value = [lookup_id(el, element_datatype) if isinstance(el, str) else el
                  for el in prop.value]
def mock_get_entity_by_query(query=None):
    """Stand-in for ``cached_get_entity_by``.

    Every query yields the same Record (id 1111, name 'rec_name', parent 'RT'); without a
    query, ``None`` is returned.
    """
    if query is None:
        return None
    return db.Record(id=1111, name='rec_name').add_parent('RT')


@patch("caoscrawler.crawl.cached_get_entity_by",
       new=Mock(side_effect=mock_get_entity_by_query))
def test_replace_name_with_referenced_entity():
    """Names are replaced by IDs only for string values of non-generic reference datatypes."""
    test_text = 'lkajsdf'
    test_int = 134343
    test_id = 1111
    test_name = 'rec_name'

    replace = Crawler.replace_name_with_referenced_entity_id
    # The patch is active for the whole test, so this is the Mock installed above.
    lookup = caoscrawler.crawl.cached_get_entity_by

    # Scalar properties that must not be touched:
    # non-ref datatype, generic-ref datatype, file-ref datatype, non-str value
    untouched_scalars = (
        (db.TEXT, test_text),
        (db.REFERENCE, test_text),
        (db.FILE, test_text),
        ("RT", test_int),
    )
    for datatype, value in untouched_scalars:
        prop = db.Property(name='a', datatype=datatype, value=value)
        replace(prop)
        assert prop.value is value

    # no LinkAhead access until here
    assert lookup.call_count == 0

    # change Properties with custom dt and str value
    prop = db.Property(name='a', datatype="RT", value=test_name)
    replace(prop)
    assert isinstance(prop.value, int)
    assert prop.value == test_id
    assert lookup.call_count == 1

    # LIST properties that must not be touched:
    # non-ref datatype, generic-ref datatype, file-ref datatype, non-str value
    untouched_list_elements = (
        (db.TEXT, test_text),
        (db.REFERENCE, test_text),
        (db.FILE, test_text),
        ("RT", test_int),
    )
    for element_datatype, element in untouched_list_elements:
        prop = db.Property(name='a', datatype=db.LIST(element_datatype), value=[element])
        replace(prop)
        assert prop.value[0] is element

    # change Properties with custom dt and str value (LIST); non-str elements are kept
    mixed_values = [test_name, db.Record(name='hi'), test_name]
    prop = db.Property(name='a', datatype=db.LIST("RT"), value=mixed_values)
    replace(prop)
    first, second, third = prop.value[0], prop.value[1], prop.value[2]
    assert isinstance(first, int)
    assert first == test_id
    assert isinstance(second, db.Entity)
    assert second.name == "hi"
    assert isinstance(third, int)
    assert third == test_id
    assert lookup.call_count == 3