diff --git a/CHANGELOG.md b/CHANGELOG.md index 7aff30041ad4d619467e267ef7999344d586d08f..109956118239e4ec417081f7061d60947ecf90a0 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -67,6 +67,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 * A corner case in `split_into_inserts_and_updates` whereby two records created in different places in the cfood definition would not be merged if both were identified by the same LinkAhead id +* [#87](https://gitlab.com/linkahead/linkahead-crawler/-/issues/87) Handle long strings more gracefully. The crawler sometimes runs into + [linkahead-server#101](https://gitlab.com/linkahead/linkahead-server/-/issues/101), this is now mitigated. ### Security ### diff --git a/integrationtests/basic_example/test_basic.py b/integrationtests/basic_example/test_basic.py index d3482a19f0c95912002b2ff68101623476d452ea..c906a81d86af56669f7c522169bceb3b5fcb3e01 100755 --- a/integrationtests/basic_example/test_basic.py +++ b/integrationtests/basic_example/test_basic.py @@ -112,11 +112,11 @@ def crawler_extended(ident): return cr, crawled_data, debug_tree -def test_ambigious_lookup(clear_database, usemodel, crawler, ident): +def test_ambiguous_lookup(clear_database, usemodel, crawler, ident): ins, ups = crawler[0].synchronize(crawled_data=crawler[1]) proj = db.execute_query("FIND Project WITH identifier='SpeedOfLight'", unique=True) - with pytest.raises(RuntimeError, match=".*unambigiously.*"): + with pytest.raises(RuntimeError, match=".*unambiguously.*"): print(crawler[0].identifiableAdapter.retrieve_identified_record_for_identifiable( Identifiable(properties={'project': proj.id}))) diff --git a/integrationtests/test_issues.py b/integrationtests/test_issues.py index 441edac5481585e483c94d61d864a1baaa139aa2..814e82ad75512ec8fe217294e1a9e86c6aa01ab3 100644 --- a/integrationtests/test_issues.py +++ b/integrationtests/test_issues.py @@ -16,24 +16,26 @@ # You should have received a copy of the GNU Affero General Public License # along with this program. If not, see <https://www.gnu.org/licenses/>. # -from pytest import fixture, mark +from pytest import fixture, mark, raises -import caosdb as db -from caosdb.cached import cache_clear +import linkahead as db +from linkahead.cached import cache_clear from caosadvancedtools.models.parser import parse_model_from_string from caoscrawler.crawl import Crawler +from caoscrawler.identifiable import Identifiable from caoscrawler.identifiable_adapters import CaosDBIdentifiableAdapter from caoscrawler.structure_elements import DictElement from caoscrawler.scanner import create_converter_registry, scan_structure_elements -from caosdb.utils.register_tests import clear_database, set_test_key +from linkahead.utils.register_tests import clear_database, set_test_key set_test_key("10b128cf8a1372f30aa3697466bb55e76974e0c16a599bb44ace88f19c8f61e2") @fixture(autouse=True) def clear_cache(): + """Clear the LinkAhead cache.""" cache_clear() @@ -266,3 +268,64 @@ Campaign: # Nothing to do for the existing ents assert len(ups) == 0 assert ins[0].name == event.name + + +def test_indiscale_87(clear_database): + """Handle long string queries gracefully. + + https://gitlab.com/linkahead/linkahead-crawler/-/issues/87 + """ + + prop = db.Property(name="str", datatype=db.TEXT).insert() + rt = db.RecordType(name="RT1").add_property(prop).insert() + strings = [ + "X123456789" * 26, + "X" * 260, + "X123456789" * 25 + "9876543210", + ] + recs = [ + db.Record().add_parent(rt).add_property(name="str", value=string).insert() + for string in strings + ] + idents = [ + Identifiable(record_type="RT1", properties={"str": string}) + for string in strings + ] + adapter = CaosDBIdentifiableAdapter() + for rec, ident in zip(recs, idents): + print(f"Testing: ...{rec.get_property('str').value[-10:]}") + retrieved = adapter.retrieve_identified_record_for_identifiable(ident) + # print(rec) + # print(retrieved) + print(db.apiutils.compare_entities(rec, retrieved)) + assert db.apiutils.empty_diff(rec, retrieved) + print("---") + + # add another, harmless, property + prop2 = db.Property(name="someint", datatype=db.INTEGER).insert() + rt.add_property(prop2).update() + string = "Y123456789" * 26 + numbers = [23, 42] + recs = [ + db.Record().add_parent(rt).add_property(name="str", value=string).add_property( + name="someint", value=number).insert() + for number in numbers + ] + idents = [Identifiable(record_type="RT1", properties={"str": string})] + # Ambiguous result + with raises(RuntimeError, match=".*unambiguously.*"): + retrieved = adapter.retrieve_identified_record_for_identifiable(idents[0]) + + # Upgrade new property to be identifying + idents = [ + Identifiable(record_type="RT1", properties={"str": string, "someint": number}) + for number in numbers + ] + for rec, ident in zip(recs, idents): + print(f"Testing: someint={rec.get_property('someint').value}") + retrieved = adapter.retrieve_identified_record_for_identifiable(ident) + # print(rec) + # print(retrieved) + print(db.apiutils.compare_entities(rec, retrieved)) + assert db.apiutils.empty_diff(rec, retrieved) + print("---") diff --git a/src/caoscrawler/crawl.py b/src/caoscrawler/crawl.py index d30472c9aa9745fb985358f86015f01286a41680..4990aecc025c290bed26e05d73833b756f904b46 100644 --- a/src/caoscrawler/crawl.py +++ b/src/caoscrawler/crawl.py @@ -183,6 +183,11 @@ def _treat_merge_error_of(newrecord, record): """ for this_p in newrecord.properties: that_p = record.get_property(this_p.name) + if that_p is None: + logger.debug(f"Property {this_p.name} does not exist in the second entity. Note that " + "this should not be the reason for the merge conflict.") + continue + if (isinstance(this_p.value, db.Entity) and isinstance(that_p.value, db.Entity)): if this_p.value.id is not None and that_p.value.id is not None: @@ -613,7 +618,8 @@ class Crawler(object): """ try: merge_entities( - newrecord, record, merge_references_with_empty_diffs=False, merge_id_with_resolved_entity=True) + newrecord, record, merge_references_with_empty_diffs=False, + merge_id_with_resolved_entity=True) except EntityMergeConflictError: _treat_merge_error_of(newrecord, record) # We cannot merge but it is none of the clear case where merge is @@ -637,6 +643,8 @@ class Crawler(object): """ registered_identifiable = self.identifiableAdapter.get_registered_identifiable(record) + if registered_identifiable is None: + return False refs = self.identifiableAdapter.get_identifying_referencing_entities(referencing_entities, registered_identifiable) if any([el is None for el in refs]): diff --git a/src/caoscrawler/identifiable_adapters.py b/src/caoscrawler/identifiable_adapters.py index 069afc668561ad6ba56bd48d00c6eb2a3c0e70d6..d94b6fca211fc26c2702b8f9f5ad4bc8993bd920 100644 --- a/src/caoscrawler/identifiable_adapters.py +++ b/src/caoscrawler/identifiable_adapters.py @@ -106,13 +106,16 @@ identifiabel, identifiable and identified record) for a Record. """ @staticmethod - def create_query_for_identifiable(ident: Identifiable): + def create_query_for_identifiable(ident: Identifiable, startswith: bool = False): """ This function is taken from the old crawler: caosdb-advanced-user-tools/src/caosadvancedtools/crawler.py uses the properties of ident to create a query that can determine whether the required record already exists. + + If ``startswith`` is True, use ``LIKE`` for long string values to test if the strings starts + with the first 200 characters of the value. """ query_string = "FIND RECORD " @@ -132,7 +135,9 @@ identifiabel, identifiable and identified record) for a Record. if len(ident.properties) > 0: query_string += " AND " - query_string += IdentifiableAdapter.create_property_query(ident) + query_string += IdentifiableAdapter.create_property_query(ident, startswith=startswith) + + # TODO Can these cases happen at all with the current code? if query_string.endswith(" AND WITH "): query_string = query_string[:-len(" AND WITH ")] if query_string.endswith(" AND "): @@ -140,15 +145,40 @@ identifiabel, identifiable and identified record) for a Record. return query_string @staticmethod - def create_property_query(entity: Identifiable): + def __create_pov_snippet(pname: str, pvalue, startswith: bool = False): + """Return something like ``'name'='some value'`` or ``'name' LIKE 'some*'``. + +If ``startswith`` is True, the value of strings will be cut off at 200 characters and a ``LIKE`` +operator will be used to find entities matching at the beginning. +""" + if startswith and isinstance(pvalue, str) and len(pvalue) > 200: + operator_value_str = f" LIKE '{escape_squoted_text(pvalue[:200])}*'" + else: + operator_value_str = "='" + convert_value(pvalue) + "'" + result = "'" + escape_squoted_text(pname) + "'" + operator_value_str + return result + + @staticmethod + def create_property_query(entity: Identifiable, startswith: bool = False): + """Create a POV query part with the entity's properties. + +Parameters +---------- + +entity: Identifiable + The Identifiable whose properties shall be used. + +startswith: bool, optional + If True, check string typed properties against the first 200 characters only. Default is False. + """ query_string = "" + pov = IdentifiableAdapter.__create_pov_snippet # Shortcut for pname, pvalue in entity.properties.items(): if pvalue is None: query_string += "'" + escape_squoted_text(pname) + "' IS NULL AND " elif isinstance(pvalue, list): for v in pvalue: - query_string += ("'" + escape_squoted_text(pname) + "'='" + - convert_value(v) + "' AND ") + query_string += pov(pname, v, startswith=startswith) + " AND " # TODO: (for review) # This code would allow for more complex identifiables with @@ -161,8 +191,7 @@ identifiabel, identifiable and identified record) for a Record. # IdentifiableAdapter.create_property_query(p.value) + # ") AND ") else: - query_string += ("'" + escape_squoted_text(pname) + "'='" + - convert_value(pvalue) + "' AND ") + query_string += pov(pname, pvalue, startswith=startswith) + " AND " # remove the last AND return query_string[:-4] @@ -549,10 +578,28 @@ class CaosDBIdentifiableAdapter(IdentifiableAdapter): def retrieve_identified_record_for_identifiable(self, identifiable: Identifiable): query_string = self.create_query_for_identifiable(identifiable) - candidates = cached_query(query_string) + try: + candidates = cached_query(query_string) + except db.exceptions.HTTPServerError as err: + query_string = self.create_query_for_identifiable(identifiable, startswith=True) + candidates = cached_query(query_string).copy() # Copy against cache poisoning + + # Test if the candidates really match all properties + for pname, pvalue in identifiable.properties.items(): + popme = [] + for i in range(len(candidates)): + this_prop = candidates[i].get_property(pname) + if this_prop is None: + popme.append(i) + continue + if not this_prop.value == pvalue: + popme.append(i) + for i in reversed(popme): + candidates.pop(i) + if len(candidates) > 1: raise RuntimeError( - f"Identifiable was not defined unambigiously.\n{query_string}\nReturned the " + f"Identifiable was not defined unambiguously.\n{query_string}\nReturned the " f"following {candidates}." f"Identifiable:\n{identifiable.record_type}{identifiable.properties}") if len(candidates) == 0: diff --git a/unittests/test_crawler.py b/unittests/test_crawler.py index 5eecb10630bb478ff9070ad97eb5acca9a963fab..26aa9e73c42e4d3923d01d83a47c8dfa7007f0a6 100644 --- a/unittests/test_crawler.py +++ b/unittests/test_crawler.py @@ -34,8 +34,8 @@ from pathlib import Path from unittest.mock import MagicMock, Mock, patch import caoscrawler -import caosdb as db -import caosdb.common.models as dbmodels +import linkahead as db +import linkahead.common.models as dbmodels import pytest import yaml from caoscrawler.crawl import (Crawler, SecurityMode, TreatedRecordLookUp, @@ -216,6 +216,13 @@ def test_split_into_inserts_and_updates_trivial(): crawler.split_into_inserts_and_updates([]) +def test_split_into_inserts_and_updates_unidentified(): + crawler = Crawler() + with raises(ValueError) as err: + crawler.split_into_inserts_and_updates([db.Record(name="recname").add_parent("someparent")]) + assert str(err.value).startswith("There is no identifying information.") + + def basic_retrieve_by_name_mock_up(rec, referencing_entities=None, known=None): """ returns a stored Record if rec.name is an existing key, None otherwise """ if rec.name in known: