    #!/usr/bin/env python3
    # encoding: utf-8
    #
    # This file is a part of the CaosDB Project.
    #
    # Copyright (C) 2023 Indiscale GmbH <info@indiscale.com>
    # Copyright (C) 2023 Henrik tom Wörden <h.tomwoerden@indiscale.com>
    #
    # This program is free software: you can redistribute it and/or modify
    # it under the terms of the GNU Affero General Public License as
    # published by the Free Software Foundation, either version 3 of the
    # License, or (at your option) any later version.
    #
    # This program is distributed in the hope that it will be useful,
    # but WITHOUT ANY WARRANTY; without even the implied warranty of
    # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    # GNU Affero General Public License for more details.
    #
    # You should have received a copy of the GNU Affero General Public License
    # along with this program. If not, see <https://www.gnu.org/licenses/>.
    #
    
    """
    Test the Crawler class.
    """
    import logging
    import os
    import warnings
    from copy import deepcopy
    from functools import partial
    from os.path import basename, dirname, join
    from pathlib import Path
    from unittest.mock import MagicMock, Mock, patch
    
    import linkahead as db
    import linkahead.common.models as dbmodels
    import pytest
    import yaml
    from caosadvancedtools.models.parser import parse_model_from_string
    from linkahead.apiutils import compare_entities
    from linkahead.cached import cache_clear
    from linkahead.exceptions import EmptyUniqueQueryError
    from pytest import raises
    
    import caoscrawler
    from caoscrawler.crawl import (Crawler, SecurityMode, _treat_deprecated_prefix,
                                   crawler_main, split_restricted_path)
    from caoscrawler.debug_tree import DebugTree
    from caoscrawler.exceptions import (ImpossibleMergeError,
                                        MissingIdentifyingProperty,
                                        MissingReferencingEntityError)
    from caoscrawler.identifiable import Identifiable
    from caoscrawler.identifiable_adapters import (CaosDBIdentifiableAdapter,
                                                   IdentifiableAdapter,
                                                   LocalStorageIdentifiableAdapter)
    from caoscrawler.scanner import (create_converter_registry, scan_directory,
                                     scan_structure_elements)
    from caoscrawler.stores import GeneralStore, RecordStore
    from caoscrawler.structure_elements import (DictElement, DictListElement,
                                                DictTextElement, File)
    from caoscrawler.sync_graph import SyncGraph
    
    UNITTESTDIR = Path(__file__).parent
    
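    # Simulated content of the remote server; the mock functions below resolve
    # entities against this list.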
    EXAMPLE_SERVER_STATE = [
        db.Property(id=1, name='result', datatype=db.TEXT),
        db.Property(id=2, name='date', datatype=db.DATETIME),
        db.RecordType(id=3, name="Experiment"),
        db.RecordType(id=4, name="Analysis"),
        db.Record(id=5)
        .add_parent(name="Experiment", id=3)
        .add_property(name="date", value="2022-02-01")
        .add_property(name="result", value="FAIL"),
        db.Record(id=6)
        .add_parent(name="Experiment", id=3)
        .add_property(name="date", value="2022-02-02")
        .add_property(name="result", value="SUCCESS"),
        db.Record(id=7)
        .add_parent(name="Analysis", id=4)
        .add_property(name="date", value="2022-03-01")
        .add_property(name="result", value="homogeneous"),
        db.Record(id=8)
        .add_parent(name="Analysis", id=4)
        .add_property(name="date", value="2022-03-02")
        .add_property(name="result", value="heterogeneous"),
    ]
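    # A record that does not yet exist in EXAMPLE_SERVER_STATE and therefore has
    # to be inserted during synchronization.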
    NEW_ELEMENT = (db.Record()
                   .add_parent(name="Analysis", id=4)
                   .add_property(name="date", value="2022-03-05")  # new date
                   .add_property(name="result", value="homogeneous"))
    
    
    def reset_mocks(mocks):
        for mock in mocks:
            mock.reset_mock()
    
    
    def mock_create_values(values, element):
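        """No-op stand-in for a converter's create_values method."""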
        pass
    
    
    def mock_get_entity_by_query(query=None):
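        """Return a fixed Record for any non-None query string."""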
        if query is not None:
            return db.Record(id=1111, name='rec_name').add_parent('RT')
    
    
    def mock_get_entity_by(eid=None, name=None, path=None):
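        """Look up an entity in EXAMPLE_SERVER_STATE by id, name or path,
        raising EmptyUniqueQueryError if nothing matches."""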
        if eid is not None:
            candidates = [el for el in EXAMPLE_SERVER_STATE if el.id == eid]
            if len(candidates) > 0:
                return candidates[0]
            else:
                raise EmptyUniqueQueryError("")
        if name is not None:
            candidates = [el for el in EXAMPLE_SERVER_STATE
                          if (el.name is not None and el.name.lower() == name.lower())]
            if len(candidates) > 0:
                return candidates[0]
            else:
                raise EmptyUniqueQueryError("")
        if path is not None:
            candidates = [el for el in EXAMPLE_SERVER_STATE
                          if (el.path is not None and el.path == path)]
            if len(candidates) > 0:
                return candidates[0]
            else:
                raise EmptyUniqueQueryError("")
    
    
    def basic_retrieve_by_name_mock_up(rec, referencing_entities=None, known=None):
        """ returns a stored Record if rec.name is an existing key, None otherwise """
        if rec.name in known:
            return known[rec.name]
        else:
            return None
    
    
    def mock_retrieve_record(identifiable: Identifiable):
        """ assumes that the identifiable is always only the date"""
    
        for record in EXAMPLE_SERVER_STATE:
            if (record.role == "Record" and "date" in identifiable.properties
                    and record.get_property("date").value == identifiable.properties['date']):
                return record
        return None
    
    
    def mock_cached_only_rt(query_string: str):
        """Always return an empty Container"""
        result = db.Container()
        lo_query = query_string.lower()
        if lo_query.startswith("find record ") or lo_query.startswith("find file "):
            return result
        model = parse_model_from_string("""
    B:
      obligatory_properties:
        C:
          obligatory_properties:
            prop_other:
              datatype: INTEGER
        prop_ident:
          datatype: INTEGER
    A:
      obligatory_properties:
        B:
          datatype: LIST<B>
        prop_ident:
    """)
        if query_string == "FIND RECORDTYPE 'A'":
            model.get_deep("A").id = 1
            return result + [model.get_deep("A")]
        if query_string == "FIND RECORDTYPE 'B'":
            model.get_deep("A").id = 2
            return result + [model.get_deep("B")]
        print(query_string)
        raise NotImplementedError(f"Mock for this case is missing: {query_string}")
    
    
    def mock_cached_only_rt_allow_empty(query_string: str):
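        """Like mock_cached_only_rt, but return an empty Container instead of
        raising NotImplementedError for queries without a dedicated mock."""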
        try:
            result = mock_cached_only_rt(query_string)
        except NotImplementedError:
            result = db.Container()
        return result
    
    
    @pytest.fixture(autouse=True)
    def clear_cache():
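        """Clear the linkahead query cache before each test so that cached
        results cannot leak between tests."""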
        cache_clear()
    
    
    @pytest.fixture
    def crawler_mocked_identifiable_retrieve():
        crawler = Crawler()
        # TODO use minimal setup
        # mock retrieval of registered identifiables: return Record with just a parent
        crawler.identifiableAdapter.get_registered_identifiable = Mock(
            side_effect=lambda x: db.Record().add_parent(x.parents[0].name).add_property(name='name'))
    
        # Simulate remote server content by using the names to identify records
        # There is only a single known Record with name A
        crawler.identifiableAdapter.retrieve_identified_record_for_identifiable = Mock(
            side_effect=partial(
                basic_retrieve_by_name_mock_up, known={"A": db.Record(id=1111, name="A")}))
        return crawler
    
    
    @pytest.fixture
    def crawler_mocked_for_backref_test():
        crawler = Crawler()
        # mock retrieval of registered identifiables: return Record with just a parent
    
        def get_reg_ident(x):
            if x.parents[0].name == "C":
                return db.Record().add_parent(x.parents[0].name).add_property(
                    "is_referenced_by", value=["BR"]).add_property("name")
            elif x.parents[0].name == "D":
                return db.Record().add_parent(x.parents[0].name).add_property(
                    "is_referenced_by", value=["BR", "BR2"]).add_property("name")
            else:
                return db.Record().add_parent(x.parents[0].name).add_property("name")
        crawler.identifiableAdapter.get_registered_identifiable = Mock(side_effect=get_reg_ident)
    
        # Simulate remote server content by using the names to identify records
        # There is only a single known Record with name A
        crawler.identifiableAdapter.retrieve_identified_record_for_identifiable = Mock(
            side_effect=partial(
                basic_retrieve_by_name_mock_up, known={"A":
                                                       db.Record(id=1111, name="A").add_parent("BR")}))
        return crawler
    
    
    @pytest.mark.filterwarnings("ignore::DeprecationWarning")
    def test_constructor():
        # tests that appropriate DeprecationWarnings are triggered by the constructor when deprecated
        # arguments are being passed.
        with warnings.catch_warnings(record=True) as w:
            # Ignore all warnings except DeprecationWarnings, which are always triggered.
            warnings.filterwarnings("ignore")
            warnings.filterwarnings("always", category=DeprecationWarning)
    
            Crawler(debug=True)
            assert issubclass(w[-1].category, DeprecationWarning)
            assert "The debug argument of the Crawler class" in str(w[-1].message)
    
            Crawler(generalStore=GeneralStore())
            assert issubclass(w[-1].category, DeprecationWarning)
            assert "The generalStore argument of the Crawler" in str(w[-1].message)
    
    
    @pytest.mark.filterwarnings("ignore::DeprecationWarning")
    def test_deprecated_functions():
        # tests that appropriate DeprecationWarnings are triggered by deprecated methods
        with warnings.catch_warnings(record=True) as w:
            # Ignore all warnings except DeprecationWarnings, which are always triggered.
            warnings.filterwarnings("ignore")
            warnings.filterwarnings("always", category=DeprecationWarning)
            cr = Crawler()
            cr.crawl_directory(UNITTESTDIR, UNITTESTDIR / "scifolder_cfood.yml")
            print(w)
            print(w[0].message)
            assert issubclass(w[-1].category, DeprecationWarning)
            assert "The function crawl_directory in the crawl" in str(w[-1].message)
    
            cr.start_crawling([], {}, {})
            assert issubclass(w[-1].category, DeprecationWarning)
            assert "The function start_crawling in the crawl module" in str(w[-1].message)
    
            cr.crawled_data
            assert issubclass(w[-1].category, DeprecationWarning)
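            # (sic) "depricated" matches the deprecation message emitted verbatim by the crawl module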
            assert "The use of self.crawled_data is depricated" in str(w[-1].message)
    
    
    def test_check_whether_parent_exists():
        trivial_result = Crawler.check_whether_parent_exists([], [])
        assert len(trivial_result) == 0
        assert isinstance(trivial_result, list)
    
        trivial_result2 = Crawler.check_whether_parent_exists([db.Record(), db.Record()], [])
        assert len(trivial_result2) == 0
        assert isinstance(trivial_result2, list)
    
        # make sure records with a matching parent are collected
        a_recs = Crawler.check_whether_parent_exists(
            [
                db.Record(id=1).add_parent("A"),
                db.Record(id=2).add_parent("B"),
                db.Record(id=3).add_parent("B"),
                db.Record(id=4).add_parent("A"),
            ], ["A"])
        a_recs_ids = [el.id for el in a_recs]
        assert 1 in a_recs_ids
        assert 4 in a_recs_ids
    
    
    def test_remove_unnecessary_updates():
        # test trivial case
        crawled_data = [db.Record().add_parent("A")]
        identified_records = [db.Record().add_parent("A")]
        updates = Crawler.remove_unnecessary_updates(crawled_data, identified_records)
        assert len(updates) == 0
    
        # test property difference case
        crawled_data = [db.Record().add_parent("A").add_property("a", 3)]
        identified_records = [db.Record().add_parent("A")]  # ID should be s
        Crawler.remove_unnecessary_updates(crawled_data, identified_records)
        assert len(crawled_data) == 1
    
        # test value difference case
        crawled_data = [db.Record().add_parent("A").add_property("a", 5)]
        identified_records = [db.Record().add_parent("A").add_property("a")]
        updates = Crawler.remove_unnecessary_updates(crawled_data, identified_records)
        assert len(updates) == 1
        crawled_data = [db.Record().add_parent("A").add_property("a", 5)]
        identified_records = [db.Record().add_parent("A").add_property("a", 5)]
        updates = Crawler.remove_unnecessary_updates(crawled_data, identified_records)
        assert len(updates) == 0
    
        # test unit difference case
        crawled_data = [db.Record().add_parent("A").add_property("a", unit='cm')]
        identified_records = [db.Record().add_parent("A").add_property("a")]
        updates = Crawler.remove_unnecessary_updates(crawled_data, identified_records)
        assert len(updates) == 1
    
        # test None difference case
        crawled_data = [db.Record().add_parent("A").add_property("a")]
        identified_records = [db.Record().add_parent("A").add_property("a", 5)]
        updates = Crawler.remove_unnecessary_updates(crawled_data, identified_records)
        assert len(updates) == 1
    
    
    def test_split_into_inserts_and_updates_trivial():
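        """Splitting an empty SyncGraph works without errors."""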
        crawler = Crawler()
        st = SyncGraph([], crawler.identifiableAdapter)
        crawler._split_into_inserts_and_updates(st)
    
    
    def test_split_into_inserts_and_updates_simple(crawler_mocked_identifiable_retrieve):
        # basic test that checks whether two records are correctly sorted to update and insert based on
        # whether an entity can be found using the identifiable
        crawler = crawler_mocked_identifiable_retrieve
        identlist = [Identifiable(name="A", record_type="C"), Identifiable(name="B", record_type="C")]
        entlist = [db.Record(name="A").add_parent("C"),
                   db.Record(name="B").add_parent("C")]
    
        st = SyncGraph(entlist, crawler.identifiableAdapter)
        # check setup
    
        insert, update = crawler._split_into_inserts_and_updates(st)
        assert len(insert) == 1
        assert insert[0].name == "B"
        assert len(update) == 1
        assert update[0].name == "A"
        # if this ever fails, the mock up may be removed
        crawler.identifiableAdapter.get_registered_identifiable.assert_called()
        crawler.identifiableAdapter.retrieve_identified_record_for_identifiable.assert_called()
    
    
    def test_split_into_inserts_and_updates_with_circ(crawler_mocked_identifiable_retrieve):
        # test trying to split circular dependency
        crawler = crawler_mocked_identifiable_retrieve
        crawler.identifiableAdapter.get_registered_identifiable = Mock(
            side_effect=lambda x: db.Record().add_parent('C').add_property(name='a')
        )
        # two records that reference each other via identifying properties
        a = db.Record().add_parent("C")
        b = db.Record().add_parent("C").add_property(name='a', value=a)
        a.add_property(name='a', value=b)
    
        st = SyncGraph([a, b], crawler.identifiableAdapter)
        with pytest.raises(RuntimeError):
            crawler._split_into_inserts_and_updates(st)
    
    
    def test_split_into_inserts_and_updates_with_complex(crawler_mocked_identifiable_retrieve):
        crawler = crawler_mocked_identifiable_retrieve
        #      A
        #      ^
        #      |
        # F <- B <- G
        a = db.Record(name="A").add_parent("C").add_property(
            'd', 13).add_property('e', "lskdjlsfdj")
        b = db.Record(name="B").add_parent("C")
        g = db.Record(name="G").add_parent("C")
        f = db.Record(name="F").add_parent("C")
        g.add_property("C", b)
        b.add_property("A", a)
        b.add_property("C", f)
        entlist = [a, b, g]
        st = SyncGraph(entlist, crawler.identifiableAdapter)
        insert, update = crawler._split_into_inserts_and_updates(st)
        assert len(insert) == 3
        assert "B" in [el.name for el in insert]
        assert len(update) == 1
        assert update[0].name == "A"
        # if this ever fails, the mock up may be removed
        crawler.identifiableAdapter.get_registered_identifiable.assert_called()
        crawler.identifiableAdapter.retrieve_identified_record_for_identifiable.assert_called()
    
        # TODO write test where the unresolved entity is not part of the identifiable
    
    
    @patch("caoscrawler.crawl.cached_get_entity_by",
           new=Mock(side_effect=mock_get_entity_by))
    @patch("caoscrawler.identifiable_adapters.cached_query",
           new=Mock(side_effect=mock_cached_only_rt))
    def test_split_iiau_with_unmergeable_list_items():
        """Test for meaningful exception when referencing a list of unmergeable entities.
    
    Datamodel
    ---------
    A:
      B: LIST<B>
      prop_ident: INTEGER
    
    B:
      prop_ident:
      C:
    
    C:
      prop_other: INTEGER
    
    Identifiables
    -------------
    
    id_A: [prop_ident]
    id_B: [prop_ident, "is_referenced_by: A"]
    id_C: [prop_other, "is_referenced_by: B"]
    
    Data
    ----
    
    c1: (23)
    c2: (42)
    
    b1: ("same", c1)
    b2: ("same", c2)
    
    a: ([b1, b2])
    
    
    
    - a can be identified.
    - bs can be identified with each other once a is identified
    - cs depend on b(s), but cannot be put in one Entity because they have conflicting properties
        """
        prop_ident = db.Property("prop_ident", datatype=db.INTEGER)
        prop_other = db.Property("prop_ident", datatype=db.INTEGER)
        rt_c = db.RecordType("C").add_property(prop_other)
        # For this test it is necessary that `B` has a reference property.  It is unclear
        # whether C must have an identifiable as well.
        rt_b = db.RecordType("B").add_property(prop_ident).add_property("C")
        rt_a = db.RecordType("A").add_property(prop_ident).add_property("LIST<B>")
    
        ident_a = db.RecordType().add_parent("A").add_property("prop_ident")
        ident_b = db.RecordType().add_parent("B").add_property("prop_ident").add_property(
            "is_referenced_by", value="A")
        ident_c = db.RecordType().add_parent("C").add_property("prop_other").add_property(
            "is_referenced_by", value="B")
    
        rec_a = db.Record("a").add_parent(rt_a).add_property("prop_ident", value=1234)
        rec_b = []
        rec_c = []
        for value in [23, 42]:
            new_c = db.Record().add_parent(rt_c).add_property("prop_other", value=value)
            rec_c.append(new_c)
            rec_b.append(db.Record().add_parent(rt_b).add_property(
                "prop_ident", value=2020).add_property("C", value=new_c))
        rec_a.add_property("B", rec_b)
    
        ident_adapter = CaosDBIdentifiableAdapter()
        ident_adapter.register_identifiable("A", ident_a)
        ident_adapter.register_identifiable("B", ident_b)
        ident_adapter.register_identifiable("C", ident_c)
    
        crawler = Crawler(identifiableAdapter=ident_adapter)
    
        st = SyncGraph(deepcopy([rec_a, *rec_b, *rec_c]), crawler.identifiableAdapter)
        assert st._identity_relies_on_unchecked_entity(st.nodes[0]) is False
        assert st._identity_relies_on_unchecked_entity(st.nodes[1])
        assert st._identity_relies_on_unchecked_entity(st.nodes[2])
        assert st._identity_relies_on_unchecked_entity(st.nodes[3])
        assert st._identity_relies_on_unchecked_entity(st.nodes[4])
        assert len(st.unchecked) == 5
    
        # The Cs cannot be merged due to different identifying properties
        # The Bs cannot be merged due to different references to Cs
        with raises(ImpossibleMergeError) as rte:
            crawler._split_into_inserts_and_updates(st)
    
        # The order of the Cs is random so we only know that they are the
        # last two elements but not in which order they have been tried to
        # be merged.
        assert "The problematic property is 'C' with values " in str(rte.value)
        assert f"'[{st.nodes[-2]}]'" in str(rte.value)
        assert f"'[{st.nodes[-1]}]'" in str(rte.value)
    
        # TODO
        # assert not isinstance(rte.value, NotImplementedError), \
        # "Exception must not be NotImplementedError, but plain RuntimeError."
        # assert "Could not find referencing entities" in rte.value.args[0]
        # assert "merge conflicts in the referencing" in rte.value.args[0]
    
    
    @patch("caoscrawler.identifiable_adapters.get_children_of_rt",
           new=Mock(side_effect=lambda x: [x]))
    def test_split_into_inserts_and_updates_backref(crawler_mocked_for_backref_test):
        # test that backrefs are appropriately considered in the identifiable
        crawler = crawler_mocked_for_backref_test
        identlist = [Identifiable(name="A", record_type="BR"),
                     Identifiable(name="B", record_type="C", backrefs=[db.Entity()])]
        referenced = db.Record(name="B").add_parent("C")
        entlist = [referenced, db.Record(name="A").add_parent("BR").add_property("ref", referenced), ]
    
        # Test without referencing object
        # a MissingReferencingEntityError is raised if the necessary referencing entity is missing.
        with raises(MissingReferencingEntityError):
            st = SyncGraph([db.Record(name="B").add_parent("C")], crawler.identifiableAdapter)
    
        # identifiables were not yet checked
        st = SyncGraph(entlist, crawler.identifiableAdapter)
        assert st.get_equivalent(st.nodes[1]) is None
        assert st.get_equivalent(st.nodes[0]) is None
        # one can be found remotely, one not
    
        # check the split...
        insert, update = crawler._split_into_inserts_and_updates(st)
        # A was found remotely and is therefore in the update list
        assert len(update) == 1
        assert update[0].name == "A"
        # B does not exist on the (simulated) remote server
        assert len(insert) == 1
        assert insert[0].name == "B"
    
    
    @patch("caoscrawler.identifiable_adapters.get_children_of_rt",
           new=Mock(side_effect=lambda x: [x]))
    def test_split_into_inserts_and_updates_mult_backref(crawler_mocked_for_backref_test):
        # test whether multiple references of the same record type are correctly used
        crawler = crawler_mocked_for_backref_test
        referenced = db.Record(name="B").add_parent("C")
        entlist = [referenced,
                   db.Record(id=1, name="A").add_parent("BR").add_property("ref", referenced),
                   db.Record(id=2, name="C").add_parent("BR").add_property("ref", referenced),
                   ]
    
        # test whether both entities are listed in the backref attribute of the identifiable
        st = SyncGraph(entlist, crawler.identifiableAdapter)
    
        identifiable = crawler.identifiableAdapter.get_identifiable(
            st.nodes[0],
            st.backward_references_backref[id(st.nodes[0])])
        assert len(identifiable.backrefs) == 2
    
        # check the split...
        insert, update = crawler._split_into_inserts_and_updates(st)
        assert len(update) == 2
        assert len(insert) == 1
    
    
    @patch("caoscrawler.identifiable_adapters.get_children_of_rt",
           new=Mock(side_effect=lambda x: [x]))
    def test_split_into_inserts_and_updates_diff_backref(crawler_mocked_for_backref_test):
        # test whether multiple references from different record types are correctly used
        crawler = crawler_mocked_for_backref_test
        referenced = db.Record(name="B").add_parent("D")
        entlist = [referenced,
                   db.Record(id=1, name="A").add_parent("BR").add_property("ref", referenced),
                   db.Record(id=2, name="A").add_parent("BR2").add_property("ref", referenced),
                   ]
    
        # test whether both entities are listed in the backref attribute of the identifiable
        st = SyncGraph(entlist, crawler.identifiableAdapter)
        identifiable = crawler.identifiableAdapter.get_identifiable(
            st.nodes[0],
            st.backward_references_backref[id(st.nodes[0])])
    
        assert len(identifiable.backrefs) == 2
    
        # check the split...
        insert, update = crawler._split_into_inserts_and_updates(st)
        assert len(update) == 2
        assert len(insert) == 1
    
    
    def test_replace_entities_with_ids():
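        """Entity objects in property values are replaced by their ids;
        plain values are left untouched."""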
        crawler = Crawler()
        a = (db.Record().add_parent("B").add_property("A", 12345)
             .add_property("B", db.Record(id=12345))
             .add_property("C", [db.Record(id=12345), 233324]))
    
        crawler.replace_entities_with_ids(a)
        assert a.get_property("A").value == 12345
        assert a.get_property("B").value == 12345
        assert a.get_property("C").value == [12345, 233324]
    
    
    @patch("caoscrawler.crawl.cached_get_entity_by",
           new=Mock(side_effect=mock_get_entity_by))
    @patch("caoscrawler.identifiable_adapters.cached_get_entity_by",
           new=Mock(side_effect=mock_get_entity_by))
    @patch("caoscrawler.identifiable_adapters.CaosDBIdentifiableAdapter."
           "retrieve_identified_record_for_identifiable",
           new=Mock(side_effect=mock_retrieve_record))
    @patch("caoscrawler.crawl.db.Container.insert")
    @patch("caoscrawler.crawl.db.Container.update")
    def test_synchronization_no_commit(upmock, insmock):
        crawled_data = [r.copy() for r in EXAMPLE_SERVER_STATE if r.role == "Record"]
        # change  one; add one
        crawled_data[-1].get_property('result').value = "wst"
        crawled_data.append(NEW_ELEMENT.copy())
    
        ident = CaosDBIdentifiableAdapter()
        ident.load_from_yaml_definition(UNITTESTDIR / "example_identifiables.yml")
        crawler = Crawler(securityMode=SecurityMode.UPDATE, identifiableAdapter=ident)
        ins, ups = crawler.synchronize(commit_changes=False, crawled_data=crawled_data)
        insmock.assert_not_called()
        upmock.assert_not_called()
        assert len(ins) == 1
        assert len(ups) == 1
    
    
    @patch("caoscrawler.crawl.cached_get_entity_by",
           new=Mock(side_effect=mock_get_entity_by))
    @patch("caoscrawler.identifiable_adapters.cached_get_entity_by",
           new=Mock(side_effect=mock_get_entity_by))
    @patch("caoscrawler.identifiable_adapters.CaosDBIdentifiableAdapter."
           "retrieve_identified_record_for_identifiable",
           new=Mock(side_effect=mock_retrieve_record))
    @patch("caoscrawler.crawl.db.Container.insert")
    @patch("caoscrawler.crawl.db.Container.update")
    @patch("caoscrawler.crawl.UpdateCache.insert")
    def test_security_mode(updateCacheMock, upmock, insmock):
        # trivial case: nothing to do
        crawled_data = [r.copy() for r in EXAMPLE_SERVER_STATE if r.role == "Record"]
        crawler = Crawler(securityMode=SecurityMode.RETRIEVE)
        crawler.synchronize(commit_changes=True, crawled_data=crawled_data)
        assert crawler.run_id is not None
        insmock.assert_not_called()
        upmock.assert_not_called()
        updateCacheMock.assert_not_called()
    
        # RETRIEVE: insert only
        ident = CaosDBIdentifiableAdapter()
        ident.load_from_yaml_definition(UNITTESTDIR / "example_identifiables.yml")
        crawler = Crawler(securityMode=SecurityMode.RETRIEVE, identifiableAdapter=ident)
    
        # add a new entity
        crawled_data.append(NEW_ELEMENT.copy())
    
        # insert forbidden
        crawler.synchronize(commit_changes=True, crawled_data=crawled_data)
        assert crawler.run_id is not None
        insmock.assert_not_called()
        upmock.assert_not_called()
        assert updateCacheMock.call_count == 1
        # reset counts
        reset_mocks([updateCacheMock, insmock, upmock])
        # remove new record again
        crawled_data.pop()
    
        # RETRIEVE: update only
        crawler = Crawler(securityMode=SecurityMode.RETRIEVE)
        # change one element
        crawled_data[-1].get_property('result').value = "wst"
        crawler.synchronize(commit_changes=True, crawled_data=crawled_data)
        assert crawler.run_id is not None
        insmock.assert_not_called()
        upmock.assert_not_called()
        assert updateCacheMock.call_count == 1
        # reset counts
        reset_mocks([updateCacheMock, insmock, upmock])
        # reset value
        crawled_data[-1] = EXAMPLE_SERVER_STATE[-1].copy()
    
        # INSERT: insert only
        # add one element
        crawled_data.append(NEW_ELEMENT.copy())
        ident = CaosDBIdentifiableAdapter()
        ident.load_from_yaml_definition(UNITTESTDIR / "example_identifiables.yml")
        crawler = Crawler(securityMode=SecurityMode.INSERT, identifiableAdapter=ident)
        crawler.synchronize(commit_changes=True, crawled_data=crawled_data)
        assert crawler.run_id is not None
        insmock.assert_called_once()
        upmock.assert_not_called()
        updateCacheMock.assert_not_called()
        # reset counts
        reset_mocks([updateCacheMock, insmock, upmock])
        # remove new record again
        crawled_data.pop()
    
        # INSERT: update only
        crawler = Crawler(securityMode=SecurityMode.INSERT, identifiableAdapter=ident)
        # change one element
        crawled_data[-1].get_property('result').value = "wst"
        crawler.synchronize(commit_changes=True, crawled_data=crawled_data)
        assert crawler.run_id is not None
        insmock.assert_not_called()
        upmock.assert_not_called()
        updateCacheMock.assert_called_once()
        # reset counts
        reset_mocks([updateCacheMock, insmock, upmock])
        # reset value
        crawled_data[-1] = EXAMPLE_SERVER_STATE[-1].copy()
    
        # INSERT: insert and update
        ident = CaosDBIdentifiableAdapter()
        ident.load_from_yaml_definition(UNITTESTDIR / "example_identifiables.yml")
        crawler = Crawler(securityMode=SecurityMode.INSERT, identifiableAdapter=ident)
        # change  one; add one
        crawled_data[-1].get_property('result').value = "wst"
        crawled_data.append(NEW_ELEMENT.copy())
        crawler.synchronize(commit_changes=True, crawled_data=crawled_data)
        assert crawler.run_id is not None
        insmock.assert_called_once()
        upmock.assert_not_called()
        updateCacheMock.assert_called_once()
        # reset counts
        reset_mocks([updateCacheMock, insmock, upmock])
        # restore original ident
        crawled_data.pop()
        crawled_data[-1] = EXAMPLE_SERVER_STATE[-1].copy()
    
    
    def test_validation_error_print(caplog):
        caplog.set_level(logging.DEBUG, logger="caoscrawler.converters")
        # There should be no server interaction since we only test the behavior when a validation
        # error occurs during the data collection stage.
        DATADIR = os.path.join(os.path.dirname(__file__), "test_data", "failing_validation")
        for fi in ["cfood.yml", "cfood2.yml"]:
            ret = crawler_main(DATADIR,
                               os.path.join(DATADIR, fi),
                               os.path.join(DATADIR, "identifiables.yml"),
                               True,
                               None,
                               False)
            assert "Couldn't validate" in caplog.text
            caplog.clear()
    
    
    @patch("caoscrawler.converters.IntegerElementConverter.create_values")
    def test_restricted_path(create_mock):
        """
        The restricted_path argument allows one to ignore parts of the crawled data structure. Here,
        we make sure that, if that argument is provided, indeed only the given path of the tree is
        traversed.
    
        The check is done using the mock of the create_values function of the IntegerElementConverter.
        This function is only called if elements are being treated.
        """
        crawler_definition = {
            "DictTest": {
                "type": "DictElement",
                "match": "(.*)",
                "subtree": {
                    "nextdict": {
                        "type": "DictElement",
                        "match": "(.*)",
                        "subtree": {
                            "int_element": {
                                "type": "IntegerElement",
                                "match_name": ".*",
                                "match_value": "(?P<int_value>.*)",
                                "records": {
                                    "Dataset": {
                                        "Subject": "$int_value"
                                    }
                                }
                            }
                        }
                    }
                }
            }
        }
    
        crawler = Crawler()
        converter_registry = create_converter_registry(crawler_definition)
    
        # This structure is crawled
        test_dict = {
            "v1": {
                "a": 1,
                "b": 2,
            },
            "v2": {
                "c": 3,
                "d": 4,
            }
        }
        # first test without a restricted_path
        restricted_path = None
        records = scan_structure_elements(
            DictElement("TestDict", test_dict), crawler_definition, converter_registry,
            restricted_path
        )
        assert create_mock.call_count == 4
        create_mock.reset_mock()
    
        # test with a restricted_path but one that has no effect (single root element)
        # this also tests that the remainder of the tree is fully traversed
        restricted_path = ["TestDict"]
        records = scan_structure_elements(
            DictElement("TestDict", test_dict), crawler_definition, converter_registry,
            restricted_path
        )
        assert create_mock.call_count == 4
        create_mock.reset_mock()
    
        # test with a restricted_path that restricts the tree (single root element)
        restricted_path = ["TestDict", "v2"]
        records = scan_structure_elements(
            DictElement("TestDict", test_dict), crawler_definition, converter_registry,
            restricted_path
        )
        assert create_mock.call_count == 2
        create_mock.reset_mock()
    
        # test with a restricted_path that contains a bad element
        restricted_path = ["TestDict", "v3"]
        with raises(RuntimeError):
            records = scan_structure_elements(
                DictElement("TestDict", test_dict), crawler_definition, converter_registry,
                restricted_path
            )
    
    
    def test_split_restricted_path():
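        """split_restricted_path splits on "/" and ignores empty segments."""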
        assert ["el"] == split_restricted_path("/el")
        assert ["el"] == split_restricted_path("/el/")
        assert ["el", "el"] == split_restricted_path("/el/el")
    
    
    # Filter the warning because it is expected here; this way it does not break running
    # tests with -Werror.
    @pytest.mark.filterwarnings("ignore:The prefix:DeprecationWarning")
    def test_deprecated_prefix_option():
        """Test that calling the crawler's main function with the deprecated
        `prefix` option raises the correct errors and warnings.
    
        """
    
        with pytest.deprecated_call():
            crawler_main("./", UNITTESTDIR / "scifolder_cfood.yml", prefix="to/be/removed")
    
        # Check that crawler main terminates with an error
        assert 1 == crawler_main("./", UNITTESTDIR / "scifolder_cfood.yml", prefix="to/be/removed",
                                 remove_prefix="to/be/removed")
    
        with raises(ValueError) as ve:
    
            _treat_deprecated_prefix(prefix="to/be/removed", remove_prefix="to/be/removed")
    
        assert "(deprecated) `prefix` and the `remove_prefix`" in str(ve.value)
    
    
    def test_create_entity_summary():
        assert "" == Crawler.create_entity_summary([]).strip()
    
        entities = [
            db.Record(id=1).add_parent("A"),
            db.Record(id=4, name='a').add_parent("B"),
            db.Record(id=5).add_parent("A"),
            db.Record(id=6, name='b').add_parent("B"),
        ]
        text = Crawler.create_entity_summary(entities).strip()
        assert 'a' in text
        assert 'b' in text
        assert 'A:' in text
        assert 'B:' in text
        assert "<a href='/Entity/4'>a</a>, <a href='/Entity/6'>b</a>" in text
    
    
    @patch("caoscrawler.crawl.cached_get_entity_by",
           new=Mock(side_effect=mock_get_entity_by_query))
    def test_replace_name_with_referenced_entity():
        test_text = 'lkajsdf'
        test_int = 134343
        test_id = 1111
        test_name = 'rec_name'
    
        # do not touch Properties with non-ref datatype
        prop = db.Property(name='a', datatype=db.TEXT, value=test_text)
        Crawler.replace_name_with_referenced_entity_id(prop)
        assert prop.value is test_text
    
        # do not touch Properties with generic-ref datatype
        prop = db.Property(name='a', datatype=db.REFERENCE, value=test_text)
        Crawler.replace_name_with_referenced_entity_id(prop)
        assert prop.value is test_text
    
        # do not touch Properties with file-ref datatype
        prop = db.Property(name='a', datatype=db.FILE, value=test_text)
        Crawler.replace_name_with_referenced_entity_id(prop)
        assert prop.value is test_text
    
        # do not touch Properties with non-str values
        prop = db.Property(name='a', datatype="RT", value=test_int)
        Crawler.replace_name_with_referenced_entity_id(prop)
        assert prop.value is test_int
    
        # no LinkAhead access up to this point
        assert caoscrawler.crawl.cached_get_entity_by.call_count == 0
    
        # change Properties with custom dt and str value
        prop = db.Property(name='a', datatype="RT", value=test_name)
        Crawler.replace_name_with_referenced_entity_id(prop)
        assert isinstance(prop.value, int)
        assert prop.value == test_id
        assert caoscrawler.crawl.cached_get_entity_by.call_count == 1
    
        # do not touch Properties with non-ref datatype (LIST)
        prop = db.Property(name='a', datatype=db.LIST(db.TEXT), value=[test_text])
        Crawler.replace_name_with_referenced_entity_id(prop)
        assert prop.value[0] is test_text
    
        # do not touch Properties with generic-ref datatype (LIST)
        prop = db.Property(name='a', datatype=db.LIST(db.REFERENCE), value=[test_text])
        Crawler.replace_name_with_referenced_entity_id(prop)
        assert prop.value[0] is test_text
    
        # do not touch Properties with file-ref datatype (LIST)
        prop = db.Property(name='a', datatype=db.LIST(db.FILE), value=[test_text])
        Crawler.replace_name_with_referenced_entity_id(prop)
        assert prop.value[0] is test_text
    
        # do not touch Properties with non-str values (LIST)
        prop = db.Property(name='a', datatype=db.LIST("RT"), value=[test_int])
        Crawler.replace_name_with_referenced_entity_id(prop)
        assert prop.value[0] is test_int
    
        # change Properties with custom dt and str value
        prop = db.Property(name='a', datatype=db.LIST("RT"), value=[test_name, db.Record(name='hi'),
                                                                    test_name])
        Crawler.replace_name_with_referenced_entity_id(prop)
        assert isinstance(prop.value[0], int)
        assert prop.value[0] == test_id
        assert isinstance(prop.value[1], db.Entity)
        assert prop.value[1].name == "hi"
        assert isinstance(prop.value[2], int)
        assert prop.value[2] == test_id
        assert caoscrawler.crawl.cached_get_entity_by.call_count == 3