Skip to content
Snippets Groups Projects
Code owners
Assign users and groups as approvers for specific file changes. Learn more.
test_issues.py 14.95 KiB
# This file is a part of the CaosDB Project.
#
# Copyright (C) 2022 Indiscale GmbH <info@indiscale.com>
#               2022 Florian Spreckelsen <f.spreckelsen@indiscale.com>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
#
import linkahead as db
from caosadvancedtools.models.parser import parse_model_from_string
from caoscrawler.crawl import Crawler
from caoscrawler.identifiable import Identifiable
from caoscrawler.identifiable_adapters import CaosDBIdentifiableAdapter
from caoscrawler.scanner import (create_converter_registry,
                                 scan_structure_elements)
from caoscrawler.structure_elements import DictElement
from linkahead.cached import cache_clear
from linkahead.utils.register_tests import clear_database, set_test_key
from pytest import fixture, mark, raises

import tempfile

# Register this module's test key through linkahead.utils.register_tests.
# NOTE(review): presumably the key has to match one configured on the target
# server before destructive fixtures such as ``clear_database`` may run --
# confirm against the register_tests documentation.
set_test_key("10b128cf8a1372f30aa3697466bb55e76974e0c16a599bb44ace88f19c8f61e2")


@fixture(autouse=True)
def clear_cache() -> None:
    """Clear the LinkAhead cache.

    The fixture is declared with ``autouse=True``, so pytest applies it to
    every test in this module without the test having to request it.
    """
    # Nothing is yielded or returned: all the work happens at setup time.
    cache_clear()


def test_issue_23(clear_database):
    """Check that an update keeps existing properties the crawler did not find.

    A record that already exists in the database with ``prop_a`` is crawled
    again, this time with ``prop_b`` only. After synchronization the stored
    record must carry both properties.

    See issue https://gitlab.com/caosdb/caosdb-crawler/-/issues/23

    """

    # Insert a simplistic model and a record of type TestType that has the
    # identifying property and prop_a, but not prop_b.
    record_type = db.RecordType(name="TestType")
    existing = db.Record(name="TestRec").add_parent(record_type)
    existing.add_property(name="identifying_prop", value="identifier")
    existing.add_property(name="prop_a", value="something")
    model = [
        db.Property(name="identifying_prop", datatype=db.TEXT),
        db.Property(name="prop_a", datatype=db.TEXT),
        db.Property(name="prop_b", datatype=db.TEXT),
        record_type,
        existing,
    ]
    db.Container().extend(model).insert()

    # The cfood defines a TestType record with identifying_prop and prop_b,
    # but without prop_a.
    ident_element = {
        "type": "TextElement",
        "match_name": "ident",
        "match_value": "(?P<ident_value>.*)",
        "records": {
            "TestType": {
                "identifying_prop": "$ident_value"
            }
        }
    }
    other_element = {
        "type": "TextElement",
        "match_name": "prop_b",
        "match_value": "(?P<other_value>.*)",
        "records": {
            "TestType": {
                "prop_b": "$other_value"
            }
        }
    }
    crawler_definition = {
        "DictTest": {
            "type": "DictElement",
            "match": "(.*)",
            "records": {
                "TestType": {}
            },
            "subtree": {
                "identifying_element": ident_element,
                "other_element": other_element,
            }
        }
    }

    # TestType records are identified by identifying_prop alone.
    adapter = CaosDBIdentifiableAdapter()
    identifiable_def = db.RecordType().add_parent(name="TestType")
    identifiable_def.add_property(name="identifying_prop")
    adapter.register_identifiable("TestType", identifiable_def)

    crawler = Crawler(identifiableAdapter=adapter)
    converter_registry = create_converter_registry(crawler_definition)

    # The dictionary to be crawled.
    test_dict = {
        "ident": "identifier",
        "prop_b": "something_else"
    }

    crawler.generate_run_id()
    root_element = DictElement("TestDict", test_dict)
    records = scan_structure_elements(
        root_element, crawler_definition, converter_registry)

    assert len(records) == 1
    crawled = records[0]
    assert crawled.parents[0].name == "TestType"
    expected_crawled = (
        ("identifying_prop", "identifier"),
        ("prop_b", "something_else"),
    )
    for prop_name, expected_value in expected_crawled:
        crawled_prop = crawled.get_property(prop_name)
        assert crawled_prop is not None
        assert crawled_prop.value == expected_value
    # No interaction with the database so far, hence no prop_a on the
    # crawled record.
    assert crawled.get_property("prop_a") is None

    # Synchronize with the database: this must update the existing record.
    inserts, updates = crawler.synchronize(crawled_data=records)
    assert len(inserts) == 0
    assert len(updates) == 1

    # Retrieve and check that name and properties have been merged correctly.
    retrieved = db.Record(id=existing.id).retrieve()
    assert retrieved.name == existing.name
    # Values that were already stored before the crawl ...
    for prop_name in ("identifying_prop", "prop_a"):
        stored_value = existing.get_property(prop_name).value
        assert retrieved.get_property(prop_name).value == stored_value
    # ... and values that came from the crawled record.
    for prop_name in ("identifying_prop", "prop_b"):
        crawled_value = crawled.get_property(prop_name).value
        assert retrieved.get_property(prop_name).value == crawled_value


def test_issue_83(clear_database):
    """Check that referenced entities may share a name.

    Names do not have to be unique for referenced entities as long as the
    name is not part of their identifiable, see
    https://gitlab.com/linkahead/linkahead-crawler/-/issues/83.

    """

    # Very simple data model.
    identifying_prop = db.Property(name="IdentifyingProp", datatype=db.INTEGER).insert()
    referenced_type = db.RecordType(name="ReferencedType").add_property(
        name=identifying_prop.name, importance=db.OBLIGATORY).insert()
    referencing_type = db.RecordType(name="ReferencingType").add_property(
        name=referenced_type.name, datatype=db.LIST(referenced_type.name)).insert()

    # ReferencedType is identified by IdentifyingProp (not by name),
    # ReferencingType is identified by name.
    adapter = CaosDBIdentifiableAdapter()
    identifiable_specs = (
        (referenced_type, identifying_prop.name),
        (referencing_type, "name"),
    )
    for record_type, identifying_name in identifiable_specs:
        definition = db.RecordType().add_parent(name=record_type.name)
        definition.add_property(name=identifying_name)
        adapter.register_identifiable(record_type.name, definition)

    crawler = Crawler(identifiableAdapter=adapter)

    def make_target(ident_value):
        # All targets deliberately share the same name.
        target = db.Record(name="RefTarget").add_parent(name=referenced_type.name)
        return target.add_property(name=identifying_prop.name, value=ident_value)

    def make_referencing(name, targets):
        referencing = db.Record(name=name).add_parent(name=referencing_type.name)
        return referencing.add_property(name=referenced_type.name, value=targets)

    ref_target1 = make_target(1)
    ref_target2 = make_target(2)
    referencing1 = make_referencing("Referencing1", [ref_target1])
    referencing2 = make_referencing("Referencing2", [ref_target2])
    referencing3 = make_referencing("Referencing3", [ref_target1, ref_target2])

    records = db.Container().extend(
        [ref_target1, ref_target2, referencing1, referencing2, referencing3])

    inserts, updates = crawler.synchronize(crawled_data=records, unique_names=False)
    assert len(inserts) == len(records)
    assert len(updates) == 0

    # Both targets must exist as distinct entities with identical names.
    retrieved_target1 = db.execute_query(
        f"FIND {referenced_type.name} WITH {identifying_prop.name}=1", unique=True)
    retrieved_target2 = db.execute_query(
        f"FIND {referenced_type.name} WITH {identifying_prop.name}=2", unique=True)
    assert retrieved_target2.name == retrieved_target1.name
    assert retrieved_target1.name == ref_target1.name
    assert retrieved_target1.id != retrieved_target2.id

    def stored_references(referencing):
        # Look up the stored referencing record and hand back its reference list.
        found = db.execute_query(
            f"FIND {referencing_type.name} WITH name={referencing.name}", unique=True)
        reference_prop = found.get_property(referenced_type.name)
        assert reference_prop is not None
        return reference_prop.value

    references1 = stored_references(referencing1)
    assert references1 == [retrieved_target1.id]
    assert references1 != [retrieved_target2.id]

    references2 = stored_references(referencing2)
    assert references2 == [retrieved_target2.id]
    assert references2 != [retrieved_target1.id]

    references3 = stored_references(referencing3)
    assert len(references3) == 2
    assert retrieved_target1.id in references3
    assert retrieved_target2.id in references3


def test_indiscale_113(clear_database):
    """Check that references to already existing records are resolved.

    Regression test for somewhat mysterious failures to resolve references
    in split_into_inserts_and_updates, see
    https://gitlab.indiscale.com/caosdb/src/caosdb-crawler/-/issues/113

    """

    # Create and insert a minimal data model.
    model_definition = """
Event:
  recommended_properties:
    Basis:
    Campaign:
Basis:
Campaign:
  recommended_properties:
    Basis:
"""
    parse_model_from_string(model_definition).sync_data_model(noquestion=True)

    # Every record type is identified by its name.
    adapter = CaosDBIdentifiableAdapter()
    for type_name in ("Event", "Basis", "Campaign"):
        definition = db.RecordType().add_parent(name=type_name)
        definition.add_property(name="name")
        adapter.register_identifiable(type_name, definition)

    crawler = Crawler(identifiableAdapter=adapter)

    # The event references basis and campaign, the campaign references the
    # basis.
    stored_basis = db.Record(name="Poseidon").add_parent(name="Basis")
    stored_campaign = db.Record(name="POS386").add_parent(name="Campaign")
    stored_campaign.add_property(name="Basis", value=stored_basis)
    event = db.Record(name="GeoB13952").add_parent(name="Event")
    event.add_property(name="Basis", value=stored_basis)
    event.add_property(name="Campaign", value=stored_campaign)

    # Basis and campaign already exist in the database before the crawl.
    db.Container().extend([stored_basis, stored_campaign]).insert()

    # Fresh objects for the same basis and campaign, so that the crawler has
    # to resolve them itself.
    fresh_basis = db.Record(name="Poseidon").add_parent(name="Basis")
    fresh_campaign = db.Record(name="POS386").add_parent(name="Campaign")
    fresh_campaign.add_property(name="Basis", value=fresh_basis)
    crawled = [event, fresh_basis, fresh_campaign]

    inserts, updates = crawler.synchronize(crawled_data=crawled, unique_names=False)
    # Only the event is new ...
    assert len(inserts) == 1
    # ... and the existing entities need no update.
    assert len(updates) == 0
    assert inserts[0].name == event.name


def test_indiscale_87(clear_database):
    """Handle long string queries gracefully.

    Records are identified by text values of 260 characters, first on their
    own and then combined with a second, integer property.

    https://gitlab.com/linkahead/linkahead-crawler/-/issues/87
    """

    text_prop = db.Property(name="str", datatype=db.TEXT).insert()
    rt = db.RecordType(name="RT1").add_property(text_prop).insert()
    adapter = CaosDBIdentifiableAdapter()

    def check_retrieval(record, identifiable):
        # The identifiable must lead back to exactly the inserted record.
        retrieved = adapter.retrieve_identified_record_for_identifiable(identifiable)
        print(db.apiutils.compare_entities(record, retrieved))
        assert db.apiutils.empty_diff(record, retrieved)
        print("---")

    # Three strings of equal length; the first and the last differ only in
    # their final ten characters.
    strings = [
        "X123456789" * 26,
        "X" * 260,
        "X123456789" * 25 + "9876543210",
    ]
    recs = []
    for string in strings:
        inserted = db.Record().add_parent(rt).add_property(name="str", value=string).insert()
        recs.append(inserted)
    idents = [
        Identifiable(record_type="RT1", properties={"str": string})
        for string in strings
    ]
    for rec, ident in zip(recs, idents):
        print(f"Testing: ...{rec.get_property('str').value[-10:]}")
        check_retrieval(rec, ident)

    # Add another, harmless, property.
    int_prop = db.Property(name="someint", datatype=db.INTEGER).insert()
    rt.add_property(int_prop).update()
    string = "Y123456789" * 26
    numbers = [23, 42]
    recs = []
    for number in numbers:
        record = db.Record().add_parent(rt).add_property(name="str", value=string)
        record.add_property(name="someint", value=number)
        recs.append(record.insert())

    # Both new records share the string, so it alone is ambiguous.
    ambiguous = Identifiable(record_type="RT1", properties={"str": string})
    with raises(RuntimeError, match=".*unambiguously.*"):
        adapter.retrieve_identified_record_for_identifiable(ambiguous)

    # Upgrade the new property to be identifying.
    idents = [
        Identifiable(record_type="RT1", properties={"str": string, "someint": number})
        for number in numbers
    ]
    for rec, ident in zip(recs, idents):
        print(f"Testing: someint={rec.get_property('someint').value}")
        check_retrieval(rec, ident)


def test_issue_14(clear_database):
    """Check that a required parent update happens before a dependent insert.

    Issue title: Some parent updates are required before inserts

    https://gitlab.indiscale.com/caosdb/src/caosdb-crawler/-/issues/14
    """

    # RT1 obligatorily references RT2.
    rt1 = db.RecordType(name="RT1")
    rt2 = db.RecordType(name="RT2").insert()
    rt1.add_property(rt2, importance=db.OBLIGATORY)
    rt1.insert()

    record = db.Record().add_parent(rt1)
    with tempfile.NamedTemporaryFile() as tmpf:
        # The file is stored in the database without any parent.
        stored_file = db.File(name="test_parent", path="parent_test/file.txt",
                              file=tmpf.name)
        stored_file.insert()

        # A clean new object describing the same file.
        crawled_file = db.File(name="test_parent", path="parent_test/file.txt",
                               file=tmpf.name)

    # The crawled version of the file has RT2 as parent and is referenced by
    # the new record.
    crawled_file.add_parent(rt2)
    record.add_property(name="RT2", value=crawled_file)

    # Current state in the database: file without parents and no records.
    file_before = db.File(name="test_parent").retrieve()
    assert len(file_before.parents) == 0
    assert len(db.execute_query("FIND Record")) == 0

    adapter = CaosDBIdentifiableAdapter()
    rt1_identifiable = db.RecordType().add_parent(name="RT1")
    rt1_identifiable.add_property(name="RT2")
    adapter.register_identifiable("RT1", rt1_identifiable)
    crawler = Crawler(identifiableAdapter=adapter)
    crawler.synchronize(crawled_data=[crawled_file, record])

    # The file gained its parent and the record referencing it was inserted.
    file_after = db.File(name="test_parent").retrieve()
    assert len(file_after.parents) == 1
    assert file_after.parents[0].name == "RT2"
    found_records = db.execute_query("FIND Record")
    assert len(found_records) == 1
    assert found_records[0].get_property("RT2").value == file_after.id