#!/usr/bin/env python3
# encoding: utf-8
#
# This file is a part of the CaosDB Project.
#
# Copyright (C) 2023 Indiscale GmbH <info@indiscale.com>
# Copyright (C) 2023 Henrik tom Wörden <h.tomwoerden@indiscale.com>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
#

"""
test the Crawler class
"""
import json
import logging
import os
import warnings
from copy import deepcopy
from functools import partial
from os.path import basename, dirname, join
from pathlib import Path
from unittest.mock import MagicMock, Mock, patch

import caoscrawler
import linkahead as db
import linkahead.common.models as dbmodels
import pytest
import yaml
from caosadvancedtools.models.parser import parse_model_from_string
from caoscrawler.crawl import (Crawler, SecurityMode, TreatedRecordLookUp,
                               _treat_deprecated_prefix, crawler_main,
                               split_restricted_path)
from caoscrawler.debug_tree import DebugTree
from caoscrawler.identifiable import Identifiable
from caoscrawler.identifiable_adapters import (CaosDBIdentifiableAdapter,
                                               IdentifiableAdapter,
                                               LocalStorageIdentifiableAdapter)
from caoscrawler.scanner import (create_converter_registry, scan_directory,
                                 scan_structure_elements)
from caoscrawler.stores import GeneralStore, RecordStore
from caoscrawler.structure_elements import (DictElement, DictListElement,
                                            DictTextElement, File)
from linkahead.apiutils import compare_entities
from linkahead.cached import cache_clear
from linkahead.exceptions import EmptyUniqueQueryError
from pytest import raises

UNITTESTDIR = Path(__file__).parent

EXAMPLE_SERVER_STATE = [
    db.Property(id=1, name='result', datatype=db.TEXT),
    db.Property(id=2, name='date', datatype=db.DATETIME),
    db.RecordType(id=3, name="Experiment"),
    db.RecordType(id=4, name="Analysis"),
    db.Record(id=5)
    .add_parent(name="Experiment", id=3)
    .add_property(name="date", value="2022-02-01")
    .add_property(name="result", value="FAIL"),
    db.Record(id=6)
    .add_parent(name="Experiment", id=3)
    .add_property(name="date", value="2022-02-02")
    .add_property(name="result", value="SUCCESS"),
    db.Record(id=7)
    .add_parent(name="Analysis", id=4)
    .add_property(name="date", value="2022-03-01")
    .add_property(name="result", value="homogeneous"),
    db.Record(id=8)
    .add_parent(name="Analysis", id=4)
    .add_property(name="date", value="2022-03-02")
    .add_property(name="result", value="heterogeneous"),
]
NEW_ELEMENT = (db.Record()
               .add_parent(name="Analysis", id=4)
               .add_property(name="date", value="2022-03-05")  # new date
               .add_property(name="result", value="homogeneous"))


def mock_get_entity_by(eid=None, name=None, path=None):
    """Return the first entity from EXAMPLE_SERVER_STATE that matches the given
    id, name or path; raise EmptyUniqueQueryError if nothing matches."""
    if eid is not None:
        candidates = [el for el in EXAMPLE_SERVER_STATE if el.id == eid]
    elif name is not None:
        candidates = [el for el in EXAMPLE_SERVER_STATE
                      if (el.name is not None and el.name.lower() == name.lower())]
    elif path is not None:
        candidates = [el for el in EXAMPLE_SERVER_STATE
                      if (el.path is not None and el.path == path)]
    else:
        return None
    if len(candidates) > 0:
        return candidates[0]
    raise EmptyUniqueQueryError("")


def mock_retrieve_record(identifiable: Identifiable):
    """ assumes that the identifiable is always only the date"""

    for record in EXAMPLE_SERVER_STATE:
        if (record.role == "Record" and "date" in identifiable.properties
                and record.get_property("date").value == identifiable.properties['date']):
            return record
    return None


def mock_cached_only_rt(query_string: str):
    """Always return an empty Container"""
    result = db.Container()
    lo_query = query_string.lower()
    if lo_query.startswith("find record ") or lo_query.startswith("find file "):
        return result
    model = parse_model_from_string("""
B:
  obligatory_properties:
    C:
      obligatory_properties:
        prop_other:
          datatype: INTEGER
    prop_ident:
      datatype: INTEGER
A:
  obligatory_properties:
    B:
      datatype: LIST<B>
    prop_ident:
""")
    if query_string == "FIND RECORDTYPE 'A'":
        model.get_deep("A").id = 1
        return result + [model.get_deep("A")]
    if query_string == "FIND RECORDTYPE 'B'":
        model.get_deep("A").id = 2
        return result + [model.get_deep("B")]
    raise NotImplementedError(f"Mock for this query is missing: {query_string}")


@pytest.fixture(autouse=True)
def clear_cache():
    cache_clear()


@pytest.mark.filterwarnings("ignore::DeprecationWarning")
def test_constructor():
    with warnings.catch_warnings(record=True) as w:
        # Ignore all warnings except DeprecationWarnings, which are always triggered.
        warnings.filterwarnings("ignore")
        warnings.filterwarnings("always", category=DeprecationWarning)

        Crawler(debug=True)
        assert issubclass(w[-1].category, DeprecationWarning)
        assert "The debug argument of the Crawler class" in str(w[-1].message)

        Crawler(generalStore=GeneralStore())
        assert issubclass(w[-1].category, DeprecationWarning)
        assert "The generalStore argument of the Crawler" in str(w[-1].message)


@pytest.mark.filterwarnings("ignore::DeprecationWarning")
def test_deprecated_functions():
    with warnings.catch_warnings(record=True) as w:
        # Ignore all warnings except DeprecationWarnings, which are always triggered.
        warnings.filterwarnings("ignore")
        warnings.filterwarnings("always", category=DeprecationWarning)
        cr = Crawler()
        cr.crawl_directory(UNITTESTDIR, UNITTESTDIR / "scifolder_cfood.yml")
        assert issubclass(w[-1].category, DeprecationWarning)
        assert "The function crawl_directory in the crawl" in str(w[-1].message)

        cr.start_crawling([], {}, {})
        assert issubclass(w[-1].category, DeprecationWarning)
        assert "The function start_crawling in the crawl module" in str(w[-1].message)

        cr.crawled_data
        assert issubclass(w[-1].category, DeprecationWarning)
        # "depricated" [sic] matches the wording of the warning message under test.
        assert "The use of self.crawled_data is depricated" in str(w[-1].message)


def test_check_whether_parent_exists():
    trivial_result = Crawler.check_whether_parent_exists([], [])
    assert len(trivial_result) == 0
    assert isinstance(trivial_result, list)

    trivial_result2 = Crawler.check_whether_parent_exists([db.Record(), db.Record()], [])
    assert len(trivial_result2) == 0
    assert isinstance(trivial_result2, list)

    # make sure that records with a matching parent are collected
    a_recs = Crawler.check_whether_parent_exists(
        [
            db.Record(id=1).add_parent("A"),
            db.Record(id=2).add_parent("B"),
            db.Record(id=3).add_parent("B"),
            db.Record(id=4).add_parent("A"),
        ], ["A"])
    a_recs_ids = [el.id for el in a_recs]
    assert 1 in a_recs_ids
    assert 4 in a_recs_ids


def test_remove_unnecessary_updates():
    # test trivial case
    upl = [db.Record().add_parent("A")]
    irs = [db.Record().add_parent("A")]
    updates = Crawler.remove_unnecessary_updates(upl, irs)
    assert len(updates) == 0

    # test property difference case
    # TODO this should work right?
    # upl = [db.Record().add_parent("A").add_property("a", 3)]
    # irs = [db.Record().add_parent("A")]  # ID should be s
    # Crawler.remove_unnecessary_updates(upl, irs)
    # assert len(upl) == 1

    # test value difference case
    upl = [db.Record().add_parent("A").add_property("a", 5)]
    irs = [db.Record().add_parent("A").add_property("a")]
    updates = Crawler.remove_unnecessary_updates(upl, irs)
    assert len(updates) == 1
    upl = [db.Record().add_parent("A").add_property("a", 5)]
    irs = [db.Record().add_parent("A").add_property("a", 5)]
    updates = Crawler.remove_unnecessary_updates(upl, irs)
    assert len(updates) == 0

    # test unit difference case
    upl = [db.Record().add_parent("A").add_property("a", unit='cm')]
    irs = [db.Record().add_parent("A").add_property("a")]
    updates = Crawler.remove_unnecessary_updates(upl, irs)
    assert len(updates) == 1

    # test None difference case
    upl = [db.Record().add_parent("A").add_property("a")]
    irs = [db.Record().add_parent("A").add_property("a", 5)]
    updates = Crawler.remove_unnecessary_updates(upl, irs)
    assert len(updates) == 1


def test_split_into_inserts_and_updates_trivial():
    crawler = Crawler()
    crawler.split_into_inserts_and_updates([])


def test_split_into_inserts_and_updates_unidentified():
    crawler = Crawler()
    with raises(ValueError) as err:
        crawler.split_into_inserts_and_updates([db.Record(name="recname").add_parent("someparent")])
    assert str(err.value).startswith("There is no identifying information.")


def basic_retrieve_by_name_mock_up(rec, referencing_entities=None, known=None):
    """ returns a stored Record if rec.name is an existing key, None otherwise """
    if rec.name in known:
        return known[rec.name]
    else:
        return None


@pytest.fixture
def crawler_mocked_identifiable_retrieve():
    crawler = Crawler()
    # TODO use minimal setup
    # mock retrieval of registered identifiables: return a Record with just a
    # parent and a name property
    crawler.identifiableAdapter.get_registered_identifiable = Mock(
        side_effect=lambda x: db.Record().add_parent(x.parents[0].name).add_property(name='name'))

    # Simulate remote server content by using the names to identify records
    # There is only a single known Record with name A
    crawler.identifiableAdapter.retrieve_identified_record_for_record = Mock(side_effect=partial(
        basic_retrieve_by_name_mock_up, known={"A": db.Record(id=1111, name="A")}))
    crawler.identifiableAdapter.retrieve_identified_record_for_identifiable = Mock(
        side_effect=partial(
            basic_retrieve_by_name_mock_up, known={"A": db.Record(id=1111, name="A")}))
    return crawler


def test_split_into_inserts_and_updates_single(crawler_mocked_identifiable_retrieve):
    crawler = crawler_mocked_identifiable_retrieve
    identlist = [Identifiable(name="A", record_type="C"), Identifiable(name="B", record_type="C")]
    entlist = [db.Record(name="A").add_parent(
        "C"), db.Record(name="B").add_parent("C")]

    assert crawler.treated_records_lookup.get_any(entlist[0], identlist[0]) is None
    assert crawler.treated_records_lookup.get_any(entlist[1], identlist[1]) is None
    assert not crawler._has_reference_value_without_id(identlist[0])
    assert not crawler._has_reference_value_without_id(identlist[1])
    assert crawler.identifiableAdapter.retrieve_identified_record_for_record(
        identlist[0]).id == 1111
    assert crawler.identifiableAdapter.retrieve_identified_record_for_record(
        identlist[1]) is None

    insert, update = crawler.split_into_inserts_and_updates(deepcopy(entlist))
    assert len(insert) == 1
    assert insert[0].name == "B"
    assert len(update) == 1
    assert update[0].name == "A"
    # if this ever fails, the mock up may be removed
    crawler.identifiableAdapter.get_registered_identifiable.assert_called()
    crawler.identifiableAdapter.retrieve_identified_record_for_identifiable.assert_called()


def test_split_into_inserts_and_updates_with_duplicate(crawler_mocked_identifiable_retrieve):
    crawler = crawler_mocked_identifiable_retrieve
    a = db.Record(name="A").add_parent("C")
    b = db.Record(name="B").add_parent("C")
    b.add_property("A", a)
    # This is identical to a and should be removed
    c = db.Record(name="A").add_parent("C")
    entlist = [a, b, c]
    insert, update = crawler.split_into_inserts_and_updates(deepcopy(entlist))
    assert len(insert) == 1
    assert insert[0].name == "B"
    assert len(update) == 1
    assert update[0].name == "A"
    # if this ever fails, the mock up may be removed
    crawler.identifiableAdapter.get_registered_identifiable.assert_called()
    crawler.identifiableAdapter.retrieve_identified_record_for_identifiable.assert_called()


def test_split_into_inserts_and_updates_with_ref(crawler_mocked_identifiable_retrieve):
    crawler = crawler_mocked_identifiable_retrieve
    # try it with a reference
    a = db.Record(name="A").add_parent("C")
    b = db.Record(name="B").add_parent("C")
    b.add_property("A", a)
    entlist = [a, b]
    insert, update = crawler.split_into_inserts_and_updates(entlist)
    assert len(insert) == 1
    assert insert[0].name == "B"
    assert len(update) == 1
    assert update[0].name == "A"
    # if this ever fails, the mock up may be removed
    crawler.identifiableAdapter.get_registered_identifiable.assert_called()
    crawler.identifiableAdapter.retrieve_identified_record_for_identifiable.assert_called()


def test_split_into_inserts_and_updates_with_circ():
    # try circular
    a = db.Record(name="A").add_parent("C")
    b = db.Record(name="B").add_parent("C")
    b.add_property("A", a)
    a.add_property("B", b)
    entlist = [a, b]
    # TODO this does not seem to be complete!


def test_split_into_inserts_and_updates_with_complex(crawler_mocked_identifiable_retrieve):
    crawler = crawler_mocked_identifiable_retrieve
    # Reference structure: b references f and a; g references a.
    a = db.Record(name="A").add_parent("C").add_property(
        'd', 13).add_property('e', "lskdjlsfdj")
    b = db.Record(name="B").add_parent("C")
    g = db.Record(name="G").add_parent("C")
    f = db.Record(name="F").add_parent("C")
    g.add_property("A", a)
    b.add_property("A", f)
    b.add_property("A", a)
    entlist = [a, b, g]
    insert, update = crawler.split_into_inserts_and_updates(entlist)
    assert len(insert) == 3
    assert "B" in [el.name for el in insert]
    assert len(update) == 1
    assert update[0].name == "A"
    # if this ever fails, the mock up may be removed
    crawler.identifiableAdapter.get_registered_identifiable.assert_called()
    crawler.identifiableAdapter.retrieve_identified_record_for_identifiable.assert_called()

    # TODO write test where the unresolved entity is not part of the identifiable


def test_split_into_inserts_and_updates_with_copy_attr(crawler_mocked_identifiable_retrieve):
    crawler = crawler_mocked_identifiable_retrieve
    # assume identifiable is only the name
    a = db.Record(name="A").add_parent("C")
    a.add_property("foo", 1)
    b = db.Record(name="A").add_parent("C")
    b.add_property("bar", 2)
    entlist = [a, b]
    insert, update = crawler.split_into_inserts_and_updates(entlist)

    assert update[0].get_property("bar").value == 2
    assert update[0].get_property("foo").value == 1
    # if this ever fails, the mock up may be removed
    crawler.identifiableAdapter.get_registered_identifiable.assert_called()
    crawler.identifiableAdapter.retrieve_identified_record_for_identifiable.assert_called()


@pytest.mark.xfail(reason="https://gitlab.com/linkahead/linkahead-crawler/-/issues/88")
@patch("caoscrawler.identifiable_adapters.cached_query",
       new=Mock(side_effect=mock_cached_only_rt))
def test_split_iiau_with_unmergeable_list_items():
    """Test for meaningful exception when referencing a list of unmergeable entities.

Datamodel
---------
A:
  B: LIST<B>
  prop_ident: INTEGER

B:
  prop_ident:
  C:

C:
  prop_other: INTEGER

Identifiables
-------------

id_A: [prop_ident]
id_B: [prop_ident, "is_referenced_by: A"]
id_C: [prop_other, "is_referenced_by: B"]

Data
----

b1: ("same", 23)
b2: ("same", 42)

a: ([b1, b2])
    """
    prop_ident = db.Property("prop_ident", datatype=db.INTEGER)
    prop_other = db.Property("prop_ident", datatype=db.INTEGER)
    rt_c = db.RecordType("C").add_property(prop_other)
    # Somehow it is necessary that `B` has a reference property.  It is unclear
    # whether `C` must have an identifiable as well.
    rt_b = db.RecordType("B").add_property(prop_ident).add_property("C")
    rt_a = db.RecordType("A").add_property(prop_ident).add_property("LIST<B>")

    ident_a = db.RecordType().add_parent("A").add_property("prop_ident")
    ident_b = db.RecordType().add_parent("B").add_property("prop_ident").add_property(
        "is_referenced_by", value="A")
    ident_c = db.RecordType().add_parent("C").add_property("prop_other").add_property(
        "is_referenced_by", value="B")

    rec_a = db.Record("a").add_parent(rt_a).add_property("prop_ident", value=1234)
    rec_b = []
    rec_c = []
    for value in [23, 42]:
        new_c = db.Record().add_parent(rt_c).add_property("prop_other", value=value)
        rec_c.append(new_c)
        rec_b.append(db.Record().add_parent(rt_b).add_property(
            "prop_ident", value=2020).add_property("C", value=new_c))
    rec_a.add_property("B", rec_b)

    ident_adapter = CaosDBIdentifiableAdapter()
    ident_adapter.register_identifiable("A", ident_a)
    ident_adapter.register_identifiable("B", ident_b)
    ident_adapter.register_identifiable("C", ident_c)

    crawler = Crawler(identifiableAdapter=ident_adapter)

    # This should give a merge conflict, and not
    # "Could not find referencing entities of type(s): A"

    with raises(RuntimeError) as rte:
        crawler.synchronize(commit_changes=False,
                            crawled_data=[rec_a, *rec_b, *rec_c])
    assert not isinstance(rte.value, NotImplementedError), \
        "Exception must not be NotImplementedError, but plain RuntimeError."
    assert "Could not find referencing entities" not in rte.value.args[0]
    assert "merging impossible" in rte.something
    # crawler.split_into_inserts_and_updates(ent_list=[rec_a, *rec_b, *rec_c])


def test_has_missing_object_in_references():
    crawler = Crawler()
    # Mock the registered identifiables: record types C and D have registered
    # identifiables (looked up by name below), everything else has none.
    crawler.identifiableAdapter.get_registered_identifiable = Mock(side_effect=partial(
        basic_retrieve_by_name_mock_up, known={"C": db.Record(name="C").add_parent("RTC")
                                               .add_property("d").add_property("name"),
                                               "D": db.Record(name="D").add_parent("RTD")
                                               .add_property("d").add_property("e").add_property("name"),
                                               }))

    # one reference with id -> check
    assert not crawler._has_missing_object_in_references(
        Identifiable(name="C", record_type="RTC", properties={'d': 123}), {})
    # one ref with Entity with id -> check
    rec = db.Record(id=123).add_parent("C")
    assert not crawler._has_missing_object_in_references(
        Identifiable(name="C", record_type="RTC", properties={'d': rec}), {id(rec): {'C': [None]}})
    # one ref with id one with Entity with id (mixed) -> check
    rec = db.Record(id=123).add_parent("RTC")
    assert not crawler._has_missing_object_in_references(
        Identifiable(name="C", record_type="RTD",
                     properties={'d': 123, 'b': rec}), {id(rec): {'C': [None]}})
    # entity to be referenced in the following
    a = db.Record(name="C").add_parent("C").add_property("d", 12311)
    # one ref with id one with Entity without id (but not identifying) -> fail
    assert not crawler._has_missing_object_in_references(
        Identifiable(name="C", record_type="RTC", properties={'d': 123, 'e': a}),
        {id(a): {'C': [None]}})

    # one ref with id one with Entity without id (mixed) -> fail
    assert not crawler._has_missing_object_in_references(
        Identifiable(name="D", record_type="RTD", properties={'d': 123, 'e': a}),
        {id(a): {'C': [None]}})

    crawler.treated_records_lookup.add(a, Identifiable(name="C", record_type="RTC",
                                                       properties={'d': 12311}))
    # one ref with id one with Entity without id but in cache -> check
    assert crawler._has_missing_object_in_references(
        Identifiable(name="D", record_type="RTD", properties={'d': 123, 'e': a}),
        {id(a): {'C': [None]}})

    # if this ever fails, the mock up may be removed
    crawler.identifiableAdapter.get_registered_identifiable.assert_called()


@pytest.mark.xfail()
def test_references_entities_without_ids():
    crawler = Crawler()
    assert not crawler._has_reference_value_without_id(db.Record().add_parent("Person")
                                                       .add_property('last_name', 123)
                                                       .add_property('first_name', 123))
    # id and rec with id
    assert not crawler._has_reference_value_without_id(db.Record().add_parent("Person")
                                                       .add_property('first_name', 123)
                                                       .add_property('last_name',
                                                                     db.Record(id=123)))
    # id and rec with id and one unneeded prop
    assert crawler._has_reference_value_without_id(db.Record().add_parent("Person")
                                                   .add_property('first_name', 123)
                                                   .add_property('stuff', db.Record())
                                                   .add_property('last_name', db.Record(id=123)))

    # one identifying prop is missing
    assert crawler._has_reference_value_without_id(db.Record().add_parent("Person")
                                                   .add_property('first_name', 123)
                                                   .add_property('last_name', db.Record()))


def test_replace_entities_with_ids():
    crawler = Crawler()
    a = (db.Record().add_parent("B").add_property("A", 12345)
         .add_property("B", db.Record(id=12345))
         .add_property("C", [db.Record(id=12345), 233324]))

    crawler.replace_entities_with_ids(a)
    assert a.get_property("A").value == 12345
    assert a.get_property("B").value == 12345
    assert a.get_property("C").value == [12345, 233324]


def reset_mocks(mocks):
    for mock in mocks:
        mock.reset_mock()


@ patch("caoscrawler.crawl.cached_get_entity_by",
        new=Mock(side_effect=mock_get_entity_by))
@ patch("caoscrawler.identifiable_adapters.cached_get_entity_by",
        new=Mock(side_effect=mock_get_entity_by))
@ patch("caoscrawler.identifiable_adapters.CaosDBIdentifiableAdapter."
        "retrieve_identified_record_for_identifiable",
        new=Mock(side_effect=mock_retrieve_record))
@ patch("caoscrawler.crawl.db.Container.insert")
@ patch("caoscrawler.crawl.db.Container.update")
def test_synchronization_no_commit(upmock, insmock):
    crawled_data = [r.copy() for r in EXAMPLE_SERVER_STATE if r.role == "Record"]
    # change one; add one
    crawled_data[-1].get_property('result').value = "wst"
    crawled_data.append(NEW_ELEMENT.copy())

    ident = CaosDBIdentifiableAdapter()
    ident.load_from_yaml_definition(UNITTESTDIR / "example_identifiables.yml")
    crawler = Crawler(securityMode=SecurityMode.UPDATE, identifiableAdapter=ident)
    ins, ups = crawler.synchronize(commit_changes=False, crawled_data=crawled_data)
    insmock.assert_not_called()
    upmock.assert_not_called()
    assert len(ins) == 1
    assert len(ups) == 1


@ patch("caoscrawler.crawl.cached_get_entity_by",
        new=Mock(side_effect=mock_get_entity_by))
@ patch("caoscrawler.identifiable_adapters.cached_get_entity_by",
        new=Mock(side_effect=mock_get_entity_by))
@ patch("caoscrawler.identifiable_adapters.CaosDBIdentifiableAdapter."
        "retrieve_identified_record_for_identifiable",
        new=Mock(side_effect=mock_retrieve_record))
@ patch("caoscrawler.crawl.db.Container.insert")
@ patch("caoscrawler.crawl.db.Container.update")
@ patch("caoscrawler.crawl.UpdateCache.insert")
def test_security_mode(updateCacheMock, upmock, insmock):
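    # Expected behavior, asserted below: with SecurityMode.RETRIEVE nothing is
    # committed (inserts and updates only land in the update cache); with
    # SecurityMode.INSERT inserts are committed while updates go to the cache.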
    # trivial case: nothing to do
    crawled_data = [r.copy() for r in EXAMPLE_SERVER_STATE if r.role == "Record"]
    crawler = Crawler(securityMode=SecurityMode.RETRIEVE)
    crawler.synchronize(commit_changes=True, crawled_data=crawled_data)
    assert crawler.run_id is not None
    insmock.assert_not_called()
    upmock.assert_not_called()
    updateCacheMock.assert_not_called()

    # RETRIEVE: insert only
    ident = CaosDBIdentifiableAdapter()
    ident.load_from_yaml_definition(UNITTESTDIR / "example_identifiables.yml")
    crawler = Crawler(securityMode=SecurityMode.RETRIEVE, identifiableAdapter=ident)

    # add a new entity
    crawled_data.append(NEW_ELEMENT.copy())

    # insert forbidden
    crawler.synchronize(commit_changes=True, crawled_data=crawled_data)
    assert crawler.run_id is not None
    insmock.assert_not_called()
    upmock.assert_not_called()
    assert updateCacheMock.call_count == 1
    # reset counts
    reset_mocks([updateCacheMock, insmock, upmock])
    # remove new record again
    crawled_data.pop()

    # RETRIEVE: update only
    crawler = Crawler(securityMode=SecurityMode.RETRIEVE)
    # change one element
    crawled_data[-1].get_property('result').value = "wst"
    crawler.synchronize(commit_changes=True, crawled_data=crawled_data)
    assert crawler.run_id is not None
    insmock.assert_not_called()
    upmock.assert_not_called()
    assert updateCacheMock.call_count == 1
    # reset counts
    reset_mocks([updateCacheMock, insmock, upmock])
    # reset value
    crawled_data[-1] = EXAMPLE_SERVER_STATE[-1].copy()

    # INSERT: insert only
    # add one element
    crawled_data.append(NEW_ELEMENT.copy())
    ident = CaosDBIdentifiableAdapter()
    ident.load_from_yaml_definition(UNITTESTDIR / "example_identifiables.yml")
    crawler = Crawler(securityMode=SecurityMode.INSERT, identifiableAdapter=ident)
    crawler.synchronize(commit_changes=True, crawled_data=crawled_data)
    assert crawler.run_id is not None
    insmock.assert_called_once()
    upmock.assert_not_called()
    updateCacheMock.assert_not_called()
    # reset counts
    reset_mocks([updateCacheMock, insmock, upmock])
    # remove new record again
    crawled_data.pop()

    # INSERT: update only
    crawler = Crawler(securityMode=SecurityMode.INSERT, identifiableAdapter=ident)
    # change one element
    crawled_data[-1].get_property('result').value = "wst"
    crawler.synchronize(commit_changes=True, crawled_data=crawled_data)
    assert crawler.run_id is not None
    insmock.assert_not_called()
    upmock.assert_not_called()
    updateCacheMock.assert_called_once()
    # reset counts
    reset_mocks([updateCacheMock, insmock, upmock])
    # reset value
    crawled_data[-1] = EXAMPLE_SERVER_STATE[-1].copy()

    # INSERT: insert and update
    ident = CaosDBIdentifiableAdapter()
    ident.load_from_yaml_definition(UNITTESTDIR / "example_identifiables.yml")
    crawler = Crawler(securityMode=SecurityMode.INSERT, identifiableAdapter=ident)
    # change one; add one
    crawled_data[-1].get_property('result').value = "wst"
    crawled_data.append(NEW_ELEMENT.copy())
    crawler.synchronize(commit_changes=True, crawled_data=crawled_data)
    assert crawler.run_id is not None
    insmock.assert_called_once()
    upmock.assert_not_called()
    updateCacheMock.assert_called_once()
    # reset counts
    reset_mocks([updateCacheMock, insmock, upmock])
    # restore original state
    crawled_data.pop()
    crawled_data[-1] = EXAMPLE_SERVER_STATE[-1].copy()


def test_create_reference_mapping():
    a = db.Record().add_parent("A")
    b = db.Record(id=132).add_parent("B").add_property('a', a)
    ref = Crawler.create_reference_mapping([a, b])
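    # The mapping goes from id(entity) to {parent name of a referencing entity:
    # list of referencing entity ids}: `a` is referenced by `b` (parent "B",
    # id 132), while nothing references `b`.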
    assert id(a) in ref
    assert id(b) in ref
    assert "B" in ref[id(a)]
    assert {} == ref[id(b)]
    assert ref[id(a)]["B"] == [132]


def test_create_flat_list():
    a = db.Record()
    b = db.Record()
    a.add_property(name="a", value=a)
    a.add_property(name="b", value=b)
    flat = Crawler.create_flat_list([a])
    assert len(flat) == 2
    assert a in flat
    assert b in flat
    c = db.Record()
    c.add_property(name="a", value=a)
    # This would cause recursion if it is not dealt with properly.
    a.add_property(name="c", value=c)
    flat = Crawler.create_flat_list([c])
    assert len(flat) == 3
    assert a in flat
    assert b in flat
    assert c in flat


@pytest.fixture
def crawler_mocked_for_backref_test():
    crawler = Crawler()
    # mock retrieval of registered identifiables, depending on the parent record type

    def get_reg_ident(x):
        if x.parents[0].name == "C":
            return db.Record().add_parent(x.parents[0].name).add_property(
                "is_referenced_by", value=["BR"]).add_property("name")
        elif x.parents[0].name == "D":
            return db.Record().add_parent(x.parents[0].name).add_property(
                "is_referenced_by", value=["BR", "BR2"]).add_property("name")
        else:
            return db.Record().add_parent(x.parents[0].name).add_property("name")
    crawler.identifiableAdapter.get_registered_identifiable = Mock(side_effect=get_reg_ident)
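    # In short: records with parent "C" are identified via a backreference from
    # "BR", records with parent "D" via backreferences from both "BR" and "BR2",
    # and all other records by name only.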

    # Simulate remote server content by using the names to identify records
    # There is only a single known Record with name A
    crawler.identifiableAdapter.retrieve_identified_record_for_record = Mock(side_effect=partial(
        basic_retrieve_by_name_mock_up, known={"A":
                                               db.Record(id=1111, name="A").add_parent("BR")}))
    crawler.identifiableAdapter.retrieve_identified_record_for_identifiable = Mock(
        side_effect=partial(
            basic_retrieve_by_name_mock_up, known={"A":
                                                   db.Record(id=1111, name="A").add_parent("BR")}))
    return crawler


def test_validation_error_print(caplog):
    caplog.set_level(logging.DEBUG, logger="caoscrawler.converters")
    # there should be no server interaction since we only test the behavior if a validation error
    # occurs during the data collection stage
    DATADIR = os.path.join(os.path.dirname(__file__), "test_data", "failing_validation")
    for fi in ["cfood.yml", "cfood2.yml"]:
        ret = crawler_main(DATADIR,
                           os.path.join(DATADIR, fi),
                           os.path.join(DATADIR, "identifiables.yml"),
                           True,
                           None,
                           False)
        assert "Couldn't validate" in caplog.text
        caplog.clear()


@ patch("caoscrawler.identifiable_adapters.get_children_of_rt",
        new=Mock(side_effect=lambda x: [x]))
def test_split_into_inserts_and_updates_backref(crawler_mocked_for_backref_test):
    crawler = crawler_mocked_for_backref_test
    identlist = [Identifiable(name="A", record_type="BR"),
                 Identifiable(name="B", record_type="C", backrefs=[db.Entity()])]
    referenced = db.Record(name="B").add_parent("C")
    entlist = [referenced, db.Record(name="A").add_parent("BR").add_property("ref", referenced), ]

    # Test without referencing object
    # currently a NotImplementedError is raised if necessary properties are missing.
    with raises(NotImplementedError):
        crawler.split_into_inserts_and_updates([db.Record(name="B").add_parent("C")])

    # identifiables were not yet checked
    assert crawler.treated_records_lookup.get_any(entlist[1], identlist[0]) is None
    assert crawler.treated_records_lookup.get_any(entlist[0], identlist[1]) is None
    # one with reference, one without
    assert not crawler._has_reference_value_without_id(identlist[0])
    assert crawler._has_reference_value_without_id(identlist[1])
    # one can be found remotely, one not
    assert crawler.identifiableAdapter.retrieve_identified_record_for_record(
        identlist[0]).id == 1111
    assert crawler.identifiableAdapter.retrieve_identified_record_for_record(
        identlist[1]) is None

    # check the split...
    insert, update = crawler.split_into_inserts_and_updates(deepcopy(entlist))
    # A was found remotely and is therefore in the update list
    assert len(update) == 1
    assert update[0].name == "A"
    # B does not exist on the (simulated) remote server
    assert len(insert) == 1
    assert insert[0].name == "B"


@ patch("caoscrawler.identifiable_adapters.get_children_of_rt",
        new=Mock(side_effect=lambda x: [x]))
def test_split_into_inserts_and_updates_mult_backref(crawler_mocked_for_backref_test):
    # test whether multiple references of the same record type are correctly used
    crawler = crawler_mocked_for_backref_test
    referenced = db.Record(name="B").add_parent("C")
    entlist = [referenced,
               db.Record(name="A").add_parent("BR").add_property("ref", referenced),
               db.Record(name="C").add_parent("BR").add_property("ref", referenced),
               ]

    # test whether both entities are listed in the backref attribute of the identifiable
    referencing_entities = crawler.create_reference_mapping(entlist)
    identifiable = crawler.identifiableAdapter.get_identifiable(
        referenced,
        referencing_entities[id(referenced)])
    assert len(identifiable.backrefs) == 2

    # check the split...
    insert, update = crawler.split_into_inserts_and_updates(deepcopy(entlist))
    assert len(update) == 1
    assert len(insert) == 2


@ patch("caoscrawler.identifiable_adapters.get_children_of_rt",
        new=Mock(side_effect=lambda x: [x]))
def test_split_into_inserts_and_updates_diff_backref(crawler_mocked_for_backref_test):
    # test whether multiple references from different record types are correctly used
    crawler = crawler_mocked_for_backref_test
    referenced = db.Record(name="B").add_parent("D")
    entlist = [referenced,
               db.Record(name="A").add_parent("BR").add_property("ref", referenced),
               db.Record(name="A").add_parent("BR2").add_property("ref", referenced),
               ]

    # test whether both entities are listed in the backref attribute of the identifiable
    referencing_entities = crawler.create_reference_mapping(entlist)
    identifiable = crawler.identifiableAdapter.get_identifiable(
        referenced,
        referencing_entities[id(referenced)])

    assert len(identifiable.backrefs) == 2

    # check the split...
    insert, update = crawler.split_into_inserts_and_updates(deepcopy(entlist))
    assert len(update) == 2
    assert len(insert) == 1


def mock_create_values(values, element):
    """No-op mock for IntegerElementConverter.create_values."""
    pass


@ patch("caoscrawler.converters.IntegerElementConverter.create_values")
def test_restricted_path(create_mock):
    """
    The restricted_path argument allows to ignroe part of the crawled data structure. Here, we make
    sure, that is that argument is provided, ideed only the given path of the tree is traversed.

    The check is done using the mock of the create_values function of the IntegerElementConverter.
    This function is only called if elements are being treated.
    """
    crawler_definition = {
        "DictTest": {
            "type": "DictElement",
            "match": "(.*)",
            "subtree": {
                "nextdict": {
                    "type": "DictElement",
                    "match": "(.*)",
                    "subtree": {
                        "int_element": {
                            "type": "IntegerElement",
                            "match_name": ".*",
                            "match_value": "(?P<int_value>.*)",
                            "records": {
                                "Dataset": {
                                    "Subject": "$int_value"
                                }
                            }
                        }
                    }
                }
            }
        }
    }

    crawler = Crawler()
    converter_registry = create_converter_registry(crawler_definition)

    # This structure is crawled
    test_dict = {
        "v1": {
            "a": 1,
            "b": 2,
        },
        "v2": {
            "c": 3,
            "d": 4,
        }
    }
    # first test without a restricted_path
    restricted_path = None
    records = scan_structure_elements(
        DictElement("TestDict", test_dict), crawler_definition, converter_registry,
        restricted_path
    )
    assert create_mock.call_count == 4
    create_mock.reset_mock()

    # test with a restricted_path but one that has no effect (single root element)
    # this also tests that the remainder of the tree is fully traversed
    restricted_path = ["TestDict"]
    records = scan_structure_elements(
        DictElement("TestDict", test_dict), crawler_definition, converter_registry,
        restricted_path
    )
    assert create_mock.call_count == 4
    create_mock.reset_mock()

    # test with a restricted_path that restricts the tree (single root element)
    restricted_path = ["TestDict", "v2"]
    records = scan_structure_elements(
        DictElement("TestDict", test_dict), crawler_definition, converter_registry,
        restricted_path
    )
    assert create_mock.call_count == 2
    create_mock.reset_mock()

    # test with a restricted_path that contains a bad element
    restricted_path = ["TestDict", "v3"]
    with raises(RuntimeError):
        records = scan_structure_elements(
            DictElement("TestDict", test_dict), crawler_definition, converter_registry,
            restricted_path
        )


def test_split_restricted_path():
    assert ["el"] == split_restricted_path("/el")
    assert ["el"] == split_restricted_path("/el/")
    assert ["el", "el"] == split_restricted_path("/el/el")


# Filter the warning because we want to have it here and this way it does not hinder running
# tests with -Werror.
@pytest.mark.filterwarnings("ignore:The prefix:DeprecationWarning")
def test_deprecated_prefix_option():
    """Test that calling the crawler's main function with the deprecated
    `prefix` option raises the correct errors and warnings.

    """

    with pytest.deprecated_call():
        crawler_main("./", UNITTESTDIR / "scifolder_cfood.yml", prefix="to/be/removed")

    # Check that crawler main terminates with an error
    assert 1 == crawler_main("./", UNITTESTDIR / "scifolder_cfood.yml", prefix="to/be/removed",
                             remove_prefix="to/be/removed")

    with raises(ValueError) as ve:

        _treat_deprecated_prefix(prefix="to/be/removed", remove_prefix="to/be/removed")

    assert "(deprecated) `prefix` and the `remove_prefix`" in str(ve.value)


def test_create_entity_summary():
    assert "" == Crawler.create_entity_summary([]).strip()

    entities = [
        db.Record(id=1).add_parent("A"),
        db.Record(id=4, name='a').add_parent("B"),
        db.Record(id=5).add_parent("A"),
        db.Record(id=6, name='b').add_parent("B"),
    ]
    text = Crawler.create_entity_summary(entities).strip()
    assert 'a' in text
    assert 'b' in text
    assert 'A:' in text
    assert 'B:' in text
    assert "<a href='/Entity/4'>a</a>, <a href='/Entity/6'>b</a>" in text


def test_detect_circular_dependency(crawler_mocked_identifiable_retrieve, caplog):
    crawler = crawler_mocked_identifiable_retrieve
    crawler.identifiableAdapter.get_registered_identifiable = Mock(
        side_effect=lambda x: db.Record().add_parent('C').add_property(name='C'))
    a = db.Record(name='a').add_parent("C")
    b = db.Record(name='b').add_parent("C").add_property(name="C", value=a)
    c = db.Record(name='c').add_parent("C").add_property(name='D', value='e'
                                                         ).add_property(name="C", value=b)
    d = db.Record(name='c').add_parent("C")
    a.add_property(name="C", value=c)
    flat = [a, b, c]
    circle = Crawler.detect_circular_dependency(flat)
    assert [id(el) for el in circle] == [id(el) for el in [a, c, b, a]]

    assert Crawler.detect_circular_dependency([d]) is None
    caplog.set_level(logging.ERROR, logger="caoscrawler.converters")
    with raises(RuntimeError):
        _, _ = crawler.split_into_inserts_and_updates(flat)
    assert "Found circular dependency" in caplog.text
    assert "\n--------\n\n> Parent: C\n\n>> Name: a\n[\'C\']" in caplog.text
    caplog.clear()


def mock_get_entity_by_query(query=None):
    if query is not None:
        return db.Record(id=1111, name='rec_name').add_parent('RT')


@ patch("caoscrawler.crawl.cached_get_entity_by",
        new=Mock(side_effect=mock_get_entity_by_query))
def test_replace_name_with_referenced_entity():
    test_text = 'lkajsdf'
    test_int = 134343
    test_id = 1111
    test_name = 'rec_name'

    # do not touch Properties with non-ref datatype
    prop = db.Property(name='a', datatype=db.TEXT, value=test_text)
    Crawler.replace_name_with_referenced_entity_id(prop)
    assert prop.value is test_text

    # do not touch Properties with generic-ref datatype
    prop = db.Property(name='a', datatype=db.REFERENCE, value=test_text)
    Crawler.replace_name_with_referenced_entity_id(prop)
    assert prop.value is test_text

    # do not touch Properties with file-ref datatype
    prop = db.Property(name='a', datatype=db.FILE, value=test_text)
    Crawler.replace_name_with_referenced_entity_id(prop)
    assert prop.value is test_text

    # do not touch Properties with non-str values
    prop = db.Property(name='a', datatype="RT", value=test_int)
    Crawler.replace_name_with_referenced_entity_id(prop)
    assert prop.value is test_int

    # no LinkAhead access up to this point
    assert caoscrawler.crawl.cached_get_entity_by.call_count == 0

    # change Properties with custom dt and str value
    prop = db.Property(name='a', datatype="RT", value=test_name)
    Crawler.replace_name_with_referenced_entity_id(prop)
    assert isinstance(prop.value, int)
    assert prop.value == test_id
    assert caoscrawler.crawl.cached_get_entity_by.call_count == 1

    # do not touch Properties with non-ref datatype (LIST)
    prop = db.Property(name='a', datatype=db.LIST(db.TEXT), value=[test_text])
    Crawler.replace_name_with_referenced_entity_id(prop)
    assert prop.value[0] is test_text

    # do not touch Properties with generic-ref datatype (LIST)
    prop = db.Property(name='a', datatype=db.LIST(db.REFERENCE), value=[test_text])
    Crawler.replace_name_with_referenced_entity_id(prop)
    assert prop.value[0] is test_text

    # do not touch Properties with file-ref datatype (LIST)
    prop = db.Property(name='a', datatype=db.LIST(db.FILE), value=[test_text])
    Crawler.replace_name_with_referenced_entity_id(prop)
    assert prop.value[0] is test_text

    # do not touch Properties with non-str values (LIST)
    prop = db.Property(name='a', datatype=db.LIST("RT"), value=[test_int])
    Crawler.replace_name_with_referenced_entity_id(prop)
    assert prop.value[0] is test_int

    # change Properties with custom dt and str value
    prop = db.Property(name='a', datatype=db.LIST("RT"), value=[test_name, db.Record(name='hi'),
                                                                test_name])
    Crawler.replace_name_with_referenced_entity_id(prop)
    assert isinstance(prop.value[0], int)
    assert prop.value[0] == test_id
    assert isinstance(prop.value[1], db.Entity)
    assert prop.value[1].name == "hi"
    assert isinstance(prop.value[2], int)
    assert prop.value[2] == test_id
    assert caoscrawler.crawl.cached_get_entity_by.call_count == 3


def test_treated_record_lookup():
    trlu = TreatedRecordLookUp()
    exist = db.Record(id=1)
    trlu.add(exist)
    assert len(trlu._existing) == 1
    # was added to existing
    assert trlu._existing[id(exist)] is exist
    # is in ID lookup
    assert trlu._id_look_up[exist.id] is exist
    # can be accessed via get_existing
    assert trlu.get_existing(db.Record(id=1)) is exist

    miss = db.Record()
    # exception when identifiable is missing
    with raises(RuntimeError):
        trlu.add(miss)
    ident = Identifiable(name='a')
    trlu.add(miss, ident)
    # was added to missing
    assert trlu._missing[id(miss)] is miss
    # is in ident lookup
    assert trlu._identifiable_look_up[ident.get_representation()] is miss
    # can be accessed via get_missing
    assert trlu.get_missing(db.Record(), Identifiable(name='a')) is miss

    fi = db.File(path='a', id=2)
    trlu.add(fi)
    assert len(trlu._existing) == 2
    # was added to existing
    assert trlu._existing[id(fi)] is fi
    # is in ID lookup
    assert trlu._id_look_up[fi.id] is fi
    # is in path lookup
    assert trlu._path_look_up[fi.path] is fi
    # can be accessed via get_existing
    assert trlu.get_existing(fi) is fi

    all_exi = trlu.get_existing_list()
    assert fi in all_exi
    assert exist in all_exi
    all_mi = trlu.get_missing_list()
    assert miss in all_mi

    # If a Record was added using the ID, the ID must be used to identify it even though later an
    # identifiable may be passed as well
    assert trlu.get_any(exist, Identifiable(name='b')) is exist

    fi2 = db.File(path='b')
    trlu.add(fi2)
    assert trlu.get_any(db.File(path='b'), Identifiable(name='c')) is fi2


def test_merge_entity_with_identifying_reference(crawler_mocked_identifiable_retrieve):
    # When one Python object representing a record is merged into another
    # Python object representing the same record, the former can be forgotten,
    # and references from it to other records must not play a role.
    crawler = crawler_mocked_identifiable_retrieve
    crawler.identifiableAdapter.get_registered_identifiable = Mock(
        side_effect=lambda x: db.Record().add_parent('C').add_property(name='name') if
        x.parents[0].name == "C" else
        db.Record().add_parent('D').add_property(name='is_referenced_by', value="*")
    )
    a = db.Record(name='a').add_parent("D")
    b = db.Record(name='b').add_parent("C")
    c = db.Record(name='b').add_parent("C").add_property(name="C", value=a)
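    # b and c carry the same name and parent and thus represent the same
    # record; after merging, c's reference to a must not affect the result.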
    flat = [a, c, b]
    _, _ = crawler.split_into_inserts_and_updates(flat)