diff --git a/CHANGELOG.md b/CHANGELOG.md index 33cd6e06a51a1dcec534e3817a99b4f1a509ce2f..30a4e9a1fd5070a584462e1769fbb9fa9fbd943a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -64,6 +64,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 * A corner case in `split_into_inserts_and_updates` whereby two records created in different places in the cfood definition would not be merged if both were identified by the same LinkAhead id +* [#87](https://gitlab.com/linkahead/linkahead-crawler/-/issues/87) Handle long strings more gracefully. The crawler sometimes runs into + [linkahead-server#101](https://gitlab.com/linkahead/linkahead-server/-/issues/101), this is now mitigated. ### Security ### diff --git a/integrationtests/basic_example/test_basic.py b/integrationtests/basic_example/test_basic.py index d3482a19f0c95912002b2ff68101623476d452ea..c906a81d86af56669f7c522169bceb3b5fcb3e01 100755 --- a/integrationtests/basic_example/test_basic.py +++ b/integrationtests/basic_example/test_basic.py @@ -112,11 +112,11 @@ def crawler_extended(ident): return cr, crawled_data, debug_tree -def test_ambigious_lookup(clear_database, usemodel, crawler, ident): +def test_ambiguous_lookup(clear_database, usemodel, crawler, ident): ins, ups = crawler[0].synchronize(crawled_data=crawler[1]) proj = db.execute_query("FIND Project WITH identifier='SpeedOfLight'", unique=True) - with pytest.raises(RuntimeError, match=".*unambigiously.*"): + with pytest.raises(RuntimeError, match=".*unambiguously.*"): print(crawler[0].identifiableAdapter.retrieve_identified_record_for_identifiable( Identifiable(properties={'project': proj.id}))) diff --git a/integrationtests/test_issues.py b/integrationtests/test_issues.py index 2ec772a2f7f3166325cfe4f6ab0ea7d8b9d82a69..814e82ad75512ec8fe217294e1a9e86c6aa01ab3 100644 --- a/integrationtests/test_issues.py +++ b/integrationtests/test_issues.py @@ -16,7 +16,7 @@ # You should have received a copy of the GNU Affero General Public License # along with this program. If not, see <https://www.gnu.org/licenses/>. # -from pytest import fixture, mark +from pytest import fixture, mark, raises import linkahead as db from linkahead.cached import cache_clear @@ -279,9 +279,9 @@ def test_indiscale_87(clear_database): prop = db.Property(name="str", datatype=db.TEXT).insert() rt = db.RecordType(name="RT1").add_property(prop).insert() strings = [ - "0123456789" * 26, - "0" * 260, - "0123456789" * 25 + "9876543210", + "X123456789" * 26, + "X" * 260, + "X123456789" * 25 + "9876543210", ] recs = [ db.Record().add_parent(rt).add_property(name="str", value=string).insert() @@ -293,4 +293,39 @@ def test_indiscale_87(clear_database): ] adapter = CaosDBIdentifiableAdapter() for rec, ident in zip(recs, idents): - assert adapter.retrieve_identified_record_for_identifiable(ident) == rec + print(f"Testing: ...{rec.get_property('str').value[-10:]}") + retrieved = adapter.retrieve_identified_record_for_identifiable(ident) + # print(rec) + # print(retrieved) + print(db.apiutils.compare_entities(rec, retrieved)) + assert db.apiutils.empty_diff(rec, retrieved) + print("---") + + # add another, harmless, property + prop2 = db.Property(name="someint", datatype=db.INTEGER).insert() + rt.add_property(prop2).update() + string = "Y123456789" * 26 + numbers = [23, 42] + recs = [ + db.Record().add_parent(rt).add_property(name="str", value=string).add_property( + name="someint", value=number).insert() + for number in numbers + ] + idents = [Identifiable(record_type="RT1", properties={"str": string})] + # Ambiguous result + with raises(RuntimeError, match=".*unambiguously.*"): + retrieved = adapter.retrieve_identified_record_for_identifiable(idents[0]) + + # Upgrade new property to be identifying + idents = [ + Identifiable(record_type="RT1", properties={"str": string, "someint": number}) + for number in numbers + ] + for rec, ident in zip(recs, idents): + print(f"Testing: someint={rec.get_property('someint').value}") + retrieved = adapter.retrieve_identified_record_for_identifiable(ident) + # print(rec) + # print(retrieved) + print(db.apiutils.compare_entities(rec, retrieved)) + assert db.apiutils.empty_diff(rec, retrieved) + print("---") diff --git a/src/caoscrawler/identifiable_adapters.py b/src/caoscrawler/identifiable_adapters.py index 069afc668561ad6ba56bd48d00c6eb2a3c0e70d6..e02dde269c70a1143c151f94f4847ca25f2802e3 100644 --- a/src/caoscrawler/identifiable_adapters.py +++ b/src/caoscrawler/identifiable_adapters.py @@ -106,13 +106,16 @@ identifiabel, identifiable and identified record) for a Record. """ @staticmethod - def create_query_for_identifiable(ident: Identifiable): + def create_query_for_identifiable(ident: Identifiable, startswith: bool = False): """ This function is taken from the old crawler: caosdb-advanced-user-tools/src/caosadvancedtools/crawler.py uses the properties of ident to create a query that can determine whether the required record already exists. + + If ``startswith`` is True, use ``LIKE`` for long string values to test if the strings starts + with the first 200 characters of the value. """ query_string = "FIND RECORD " @@ -132,7 +135,9 @@ identifiabel, identifiable and identified record) for a Record. if len(ident.properties) > 0: query_string += " AND " - query_string += IdentifiableAdapter.create_property_query(ident) + query_string += IdentifiableAdapter.create_property_query(ident, startswith=startswith) + + # TODO Can these cases happen at all with the current code? if query_string.endswith(" AND WITH "): query_string = query_string[:-len(" AND WITH ")] if query_string.endswith(" AND "): @@ -140,15 +145,41 @@ identifiabel, identifiable and identified record) for a Record. return query_string @staticmethod - def create_property_query(entity: Identifiable): + def __create_pov_snippet(pname: str, pvalue, startswith: bool = False): + """Return something like ``'name'='some value'`` or ``'name' LIKE 'some*'``. + +If ``startswith`` is True, the value of strings will be cut off at 200 characters and a ``LIKE`` +operator will be used to find entities matching at the beginning. +""" + if startswith and isinstance(pvalue, str) and len(pvalue) > 200: + operator_value_str = f" LIKE '{escape_squoted_text(pvalue[:200])}*'" + else: + operator_value_str = "='" + convert_value(pvalue) + "'" + result = "'" + escape_squoted_text(pname) + "'" + operator_value_str + return result + + + @staticmethod + def create_property_query(entity: Identifiable, startswith: bool = False): + """Create a POV query part with the entity's properties. + +Parameters +---------- + +entity: Identifiable + The Identifiable whose properties shall be used. + +startswith: bool, optional + If True, check string typed properties against the first 200 characters only. Default is False. + """ query_string = "" + pov = IdentifiableAdapter.__create_pov_snippet # Shortcut for pname, pvalue in entity.properties.items(): if pvalue is None: query_string += "'" + escape_squoted_text(pname) + "' IS NULL AND " elif isinstance(pvalue, list): for v in pvalue: - query_string += ("'" + escape_squoted_text(pname) + "'='" + - convert_value(v) + "' AND ") + query_string += pov(pname, v, startswith=startswith) + " AND " # TODO: (for review) # This code would allow for more complex identifiables with @@ -161,8 +192,7 @@ identifiabel, identifiable and identified record) for a Record. # IdentifiableAdapter.create_property_query(p.value) + # ") AND ") else: - query_string += ("'" + escape_squoted_text(pname) + "'='" + - convert_value(pvalue) + "' AND ") + query_string += pov(pname, pvalue, startswith=startswith) + " AND " # remove the last AND return query_string[:-4] @@ -549,10 +579,28 @@ class CaosDBIdentifiableAdapter(IdentifiableAdapter): def retrieve_identified_record_for_identifiable(self, identifiable: Identifiable): query_string = self.create_query_for_identifiable(identifiable) - candidates = cached_query(query_string) + try: + candidates = cached_query(query_string) + except db.exceptions.HTTPServerError as err: + query_string = self.create_query_for_identifiable(identifiable, startswith=True) + candidates = cached_query(query_string).copy() # Copy against cache poisoning + + # Test if the candidates really match all properties + for pname, pvalue in identifiable.properties.items(): + popme = [] + for i in range(len(candidates)): + this_prop = candidates[i].get_property(pname) + if this_prop is None: + popme.append(i) + continue + if not this_prop.value == pvalue: + popme.append(i) + for i in reversed(popme): + candidates.pop(i) + if len(candidates) > 1: raise RuntimeError( - f"Identifiable was not defined unambigiously.\n{query_string}\nReturned the " + f"Identifiable was not defined unambiguously.\n{query_string}\nReturned the " f"following {candidates}." f"Identifiable:\n{identifiable.record_type}{identifiable.properties}") if len(candidates) == 0: