diff --git a/src/caoscrawler/crawl.py b/src/caoscrawler/crawl.py
index 32ad2a34e4f63c1cf05ea1760eb434a06fffa7de..50fbcac90a09ff77087d0c11a79513ee1772d061 100644
--- a/src/caoscrawler/crawl.py
+++ b/src/caoscrawler/crawl.py
@@ -54,6 +54,7 @@ from caosdb.apiutils import (EntityMergeConflictError, compare_entities,
                              merge_entities)
 from caosdb.cached import cache_clear, cached_get_entity_by
 from caosdb.exceptions import EmptyUniqueQueryError
+from linkahead.common.datatype import get_list_datatype, is_reference
 
 from .config import get_config_setting
 from .converters import Converter, ConverterValidationError
@@ -298,7 +299,8 @@ class Crawler(object):
 
     def _has_reference_value_without_id(self, ident: Identifiable) -> bool:
         """
-        Returns True if there is at least one value in the properties attribute of ``ident`` which:
+        Returns True if there is at least one value in the properties and backrefs attributes of
+        ``ident`` which:
 
         a) is a reference property AND
         b) where the value is set to a
@@ -827,6 +829,38 @@ class Crawler(object):
                         "mode. This might lead to a failure of inserts that follow.")
             logger.info(parent_updates)
 
+    @staticmethod
+    def replace_name_with_referenced_entity_id(prop: db.Property):
+        """Replace entity names in the value of ``prop`` by the IDs of the named entities.
+
+        A string value of a property with a non-generic reference datatype (a reference
+        datatype that is neither FILE nor REFERENCE) is looked up as the name of an entity
+        of that datatype.  If exactly one such entity is found, the string is replaced by
+        its ID; otherwise it is kept unchanged.  The elements of LIST values are treated
+        individually.  ``prop`` is modified in place.
+        """
+        def is_specific_reference(datatype):
+            return (is_reference(datatype) and datatype != db.FILE
+                    and datatype != db.REFERENCE)
+
+        def resolve(datatype, value):
+            if not isinstance(value, str):
+                return value
+            try:
+                # cached_get_entity_by raises an error if the result is not unique
+                return cached_get_entity_by(
+                    query=f"FIND Entity {datatype} with name='{value}'").id
+            except (db.EmptyUniqueQueryError, db.QueryNotUniqueError):
+                return value
+
+        element_dt = get_list_datatype(prop.datatype)
+        if element_dt is None:  # not a list
+            if isinstance(prop.value, str) and is_specific_reference(prop.datatype):
+                prop.value = resolve(prop.datatype, prop.value)
+        elif (isinstance(prop.value, (list, tuple))
+              and is_specific_reference(element_dt)):
+            prop.value = [resolve(element_dt, el) for el in prop.value]
+
     @staticmethod
     def execute_inserts_in_list(to_be_inserted, securityMode,
                                 run_id: Optional[uuid.UUID] = None,
@@ -835,6 +869,7 @@ class Crawler(object):
             for prop in record.properties:
                 entity = cached_get_entity_by(name=prop.name)
                 _resolve_datatype(prop, entity)
+                Crawler.replace_name_with_referenced_entity_id(prop)
         logger.debug("INSERT")
         logger.debug(to_be_inserted)
         if len(to_be_inserted) > 0:
diff --git a/unittests/test_crawler.py b/unittests/test_crawler.py
index 91e0e86a6d6cf2967ab3567a2ef93b7ccde56e64..954d7636f20c4d31850943b851f4d772af370da8 100644
--- a/unittests/test_crawler.py
+++ b/unittests/test_crawler.py
@@ -33,6 +33,7 @@ from os.path import basename, dirname, join
 from pathlib import Path
 from unittest.mock import MagicMock, Mock, patch
 
+import caoscrawler
 import caosdb as db
 import caosdb.common.models as dbmodels
 import pytest
@@ -887,3 +888,102 @@ def test_detect_circular_dependency(crawler_mocked_identifiable_retrieve, caplog
     assert "Found circular dependency" in caplog.text
     assert "-------\na\n['C" in caplog.text
     caplog.clear()
+    crawler = crawler_mocked_identifiable_retrieve
+    crawler.identifiableAdapter.get_registered_identifiable = Mock(
+        side_effect=lambda x: db.Record().add_parent('C').add_property(name='C'))
+    a = db.Record(name='a').add_parent("C")
+    b = db.Record(name='b').add_parent("C").add_property(name="C", value=a)
+    c = db.Record(name='c').add_parent("C").add_property(name='D', value='e'
+                                                         ).add_property(name="C", value=b)
+    d = db.Record(name='c').add_parent("C")
+    a.add_property(name="C", value=c)
+    flat = [a, b, c]
+    circle = Crawler.detect_circular_dependency(flat)
+    assert [id(el) for el in circle] == [id(el) for el in [a, c, b, a]]
+
+    assert Crawler.detect_circular_dependency([d]) is None
+    with raises(RuntimeError):
+        _, _ = crawler.split_into_inserts_and_updates(flat)
+    caplog.set_level(logging.ERROR, logger="caoscrawler.converters")
+    assert "Found circular dependency" in caplog.text
+    assert "-------\na\n['C" in caplog.text
+    caplog.clear()
+
+
+def mock_get_entity_by_query(query=None):
+    if query is not None:
+        return db.Record(id=1111, name='rec_name').add_parent('RT')
+
+
+@patch("caoscrawler.crawl.cached_get_entity_by",
+       new=Mock(side_effect=mock_get_entity_by_query))
+def test_replace_name_with_referenced_entity():
+    test_text = 'lkajsdf'
+    test_int = 134343
+    test_id = 1111
+    test_name = 'rec_name'
+    test_query = "FIND Entity RT with name='rec_name'"
+
+    # do not touch Properties with non-ref datatype
+    prop = db.Property(name='a', datatype=db.TEXT, value=test_text)
+    Crawler.replace_name_with_referenced_entity_id(prop)
+    assert prop.value is test_text
+
+    # do not touch Properties with generic-ref datatype
+    prop = db.Property(name='a', datatype=db.REFERENCE, value=test_text)
+    Crawler.replace_name_with_referenced_entity_id(prop)
+    assert prop.value is test_text
+
+    # do not touch Properties with file-ref datatype
+    prop = db.Property(name='a', datatype=db.FILE, value=test_text)
+    Crawler.replace_name_with_referenced_entity_id(prop)
+    assert prop.value is test_text
+
+    # do not touch Properties with non-str values
+    prop = db.Property(name='a', datatype="RT", value=test_int)
+    Crawler.replace_name_with_referenced_entity_id(prop)
+    assert prop.value is test_int
+
+    # no LinkAhead access until here
+    assert caoscrawler.crawl.cached_get_entity_by.call_count == 0
+
+    # change Properties with custom dt and str value
+    prop = db.Property(name='a', datatype="RT", value=test_name)
+    Crawler.replace_name_with_referenced_entity_id(prop)
+    assert isinstance(prop.value, int)
+    assert prop.value == test_id
+    assert caoscrawler.crawl.cached_get_entity_by.call_count == 1
+    caoscrawler.crawl.cached_get_entity_by.assert_called_with(query=test_query)
+
+    # do not touch Properties with non-ref datatype (LIST)
+    prop = db.Property(name='a', datatype=db.LIST(db.TEXT), value=[test_text])
+    Crawler.replace_name_with_referenced_entity_id(prop)
+    assert prop.value[0] is test_text
+
+    # do not touch Properties with generic-ref datatype (LIST)
+    prop = db.Property(name='a', datatype=db.LIST(db.REFERENCE), value=[test_text])
+    Crawler.replace_name_with_referenced_entity_id(prop)
+    assert prop.value[0] is test_text
+
+    # do not touch Properties with file-ref datatype (LIST)
+    prop = db.Property(name='a', datatype=db.LIST(db.FILE), value=[test_text])
+    Crawler.replace_name_with_referenced_entity_id(prop)
+    assert prop.value[0] is test_text
+
+    # do not touch Properties with non-str values (LIST)
+    prop = db.Property(name='a', datatype=db.LIST("RT"), value=[test_int])
+    Crawler.replace_name_with_referenced_entity_id(prop)
+    assert prop.value[0] is test_int
+
+    # change Properties with custom dt and str value
+    prop = db.Property(name='a', datatype=db.LIST("RT"), value=[test_name, db.Record(name='hi'),
+                                                                test_name])
+    Crawler.replace_name_with_referenced_entity_id(prop)
+    assert isinstance(prop.value[0], int)
+    assert prop.value[0] == test_id
+    assert isinstance(prop.value[1], db.Entity)
+    assert prop.value[1].name == "hi"
+    assert isinstance(prop.value[2], int)
+    assert prop.value[2] == test_id
+    assert caoscrawler.crawl.cached_get_entity_by.call_count == 3
+    caoscrawler.crawl.cached_get_entity_by.assert_called_with(query=test_query)