-
Henrik tom Wörden authoredHenrik tom Wörden authored
Code owners
Assign users and groups as approvers for specific file changes. Learn more.
test_crawler.py 35.86 KiB
#!/usr/bin/env python3
# encoding: utf-8
#
# This file is a part of the CaosDB Project.
#
# Copyright (C) 2023 Indiscale GmbH <info@indiscale.com>
# Copyright (C) 2023 Henrik tom Wörden <h.tomwoerden@indiscale.com>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
#
"""
test the Crawler class
"""
import json
import logging
import os
import warnings
from copy import deepcopy
from functools import partial
from os.path import basename, dirname, join
from pathlib import Path
from unittest.mock import MagicMock, Mock, patch
import caosdb as db
import caosdb.common.models as dbmodels
import pytest
import yaml
from caoscrawler.crawl import (Crawler, SecurityMode, _treat_deprecated_prefix,
crawler_main, split_restricted_path)
from caoscrawler.debug_tree import DebugTree
from caoscrawler.identifiable import Identifiable
from caoscrawler.identifiable_adapters import (CaosDBIdentifiableAdapter,
IdentifiableAdapter,
LocalStorageIdentifiableAdapter)
from caoscrawler.scanner import (create_converter_registry, scan_directory,
scan_structure_elements)
from caoscrawler.stores import GeneralStore, RecordStore
from caoscrawler.structure_elements import (DictElement, DictListElement,
DictTextElement, File)
from caosdb.apiutils import compare_entities
from caosdb.cached import cache_clear
from caosdb.exceptions import EmptyUniqueQueryError
from pytest import raises
# Directory that contains this test module; cfood and identifiable definitions used by the
# tests below are addressed relative to it.
UNITTESTDIR = Path(__file__).parent
# Entities that the mock helpers in this module (mock_get_entity_by, mock_retrieve_record)
# answer from: two Properties, two RecordTypes and two Records for each RecordType.
EXAMPLE_SERVER_STATE = [
    db.Property(id=1, name='result', datatype=db.TEXT),
    db.Property(id=2, name='date', datatype=db.DATETIME),
    db.RecordType(id=3, name="Experiment"),
    db.RecordType(id=4, name="Analysis"),
    db.Record(id=5)
    .add_parent(name="Experiment", id=3)
    .add_property(name="date", value="2022-02-01")
    .add_property(name="result", value="FAIL"),
    db.Record(id=6)
    .add_parent(name="Experiment", id=3)
    .add_property(name="date", value="2022-02-02")
    .add_property(name="result", value="SUCCESS"),
    db.Record(id=7)
    .add_parent(name="Analysis", id=4)
    .add_property(name="date", value="2022-03-01")
    .add_property(name="result", value="homogeneous"),
    db.Record(id=8)
    .add_parent(name="Analysis", id=4)
    .add_property(name="date", value="2022-03-02")
    .add_property(name="result", value="heterogeneous"),
]
# A Record without id whose date differs from every Record in EXAMPLE_SERVER_STATE; the
# synchronization tests append copies of it to the crawled data.
NEW_ELEMENT = (db.Record()
               .add_parent(name="Analysis", id=4)
               .add_property(name="date", value="2022-03-05")  # new date
               .add_property(name="result", value="homogeneous"))
def mock_get_entity_by(eid=None, name=None, path=None):
    """Stand-in for ``cached_get_entity_by`` that answers from ``EXAMPLE_SERVER_STATE``.

    Only one criterion is evaluated: ``eid`` takes precedence over ``name`` (compared
    case-insensitively), which takes precedence over ``path``. The first matching entity is
    returned and ``EmptyUniqueQueryError`` is raised if the chosen criterion matches nothing.
    Without any criterion, ``None`` is returned.
    """
    if eid is not None:
        matches = [ent for ent in EXAMPLE_SERVER_STATE if ent.id == eid]
    elif name is not None:
        matches = [ent for ent in EXAMPLE_SERVER_STATE
                   if ent.name is not None and ent.name.lower() == name.lower()]
    elif path is not None:
        matches = [ent for ent in EXAMPLE_SERVER_STATE
                   if ent.path is not None and ent.path == path]
    else:
        return None

    if not matches:
        raise EmptyUniqueQueryError("")
    return matches[0]
@pytest.fixture(autouse=True)
def clear_cache():
    """Autouse fixture: call ``cache_clear()`` during the setup of every test."""
    cache_clear()
@pytest.mark.filterwarnings("ignore::DeprecationWarning")
def test_constructor():
    """The ``debug`` and ``generalStore`` constructor arguments emit DeprecationWarnings."""
    with warnings.catch_warnings(record=True) as caught:
        # Record every DeprecationWarning and drop all other warnings.
        warnings.filterwarnings("ignore")
        warnings.filterwarnings("always", category=DeprecationWarning)

        Crawler(debug=True)
        latest = caught[-1]
        assert issubclass(latest.category, DeprecationWarning)
        assert "The debug argument of the Crawler class" in str(latest.message)

        Crawler(generalStore=GeneralStore())
        latest = caught[-1]
        assert issubclass(latest.category, DeprecationWarning)
        assert "The generalStore argument of the Crawler" in str(latest.message)
@pytest.mark.filterwarnings("ignore::DeprecationWarning")
def test_deprecated_functions():
    """Deprecated ``Crawler`` members emit a DeprecationWarning with the expected text."""
    with warnings.catch_warnings(record=True) as caught:
        # Record every DeprecationWarning and drop all other warnings.
        warnings.filterwarnings("ignore")
        warnings.filterwarnings("always", category=DeprecationWarning)

        crawler = Crawler()

        crawler.crawl_directory(UNITTESTDIR, UNITTESTDIR / "scifolder_cfood.yml")
        print(caught)
        print(caught[0].message)
        latest = caught[-1]
        assert issubclass(latest.category, DeprecationWarning)
        assert "The function crawl_directory in the crawl" in str(latest.message)

        crawler.start_crawling([], {}, {})
        latest = caught[-1]
        assert issubclass(latest.category, DeprecationWarning)
        assert "The function start_crawling in the crawl module" in str(latest.message)

        # merely accessing the attribute is what triggers the warning checked below
        crawler.crawled_data
        latest = caught[-1]
        assert issubclass(latest.category, DeprecationWarning)
        assert "The use of self.crawled_data is depricated" in str(latest.message)
def test_check_whether_parent_exists():
    """Check which records ``Crawler.check_whether_parent_exists`` collects.

    Covers empty input, records without parents, and a mixed list of which only the records
    with parent "A" are expected in the result.
    """
    # no records and no parent names
    trivial_result = Crawler.check_whether_parent_exists([], [])
    assert len(trivial_result) == 0
    assert isinstance(trivial_result, list)

    # records without parents and no parent names; the result of *this* call is checked
    trivial_result2 = Crawler.check_whether_parent_exists([db.Record(), db.Record()], [])
    assert len(trivial_result2) == 0
    assert isinstance(trivial_result2, list)

    # make sure records with a matching parent are collected
    a_recs = Crawler.check_whether_parent_exists(
        [
            db.Record(id=1).add_parent("A"),
            db.Record(id=2).add_parent("B"),
            db.Record(id=3).add_parent("B"),
            db.Record(id=4).add_parent("A"),
        ], ["A"])
    a_recs_ids = [el.id for el in a_recs]
    assert 1 in a_recs_ids
    assert 4 in a_recs_ids
def test_remove_unnecessary_updates():
    """``Crawler.remove_unnecessary_updates`` keeps only records that differ from their peer.

    Each case passes a one-element list with a candidate record and a one-element list with
    the record it is compared against, and states how many updates must remain.
    """
    def record_a():
        return db.Record().add_parent("A")

    # TODO a property that exists on the candidate only (e.g. ``a = 3`` versus no property at
    # all) should presumably count as an update as well; this case is not covered yet.
    cases = [
        # trivial case: same parent, no properties
        (record_a(), record_a(), 0),
        # value difference
        (record_a().add_property("a", 5), record_a().add_property("a"), 1),
        # identical value
        (record_a().add_property("a", 5), record_a().add_property("a", 5), 0),
        # unit difference
        (record_a().add_property("a", unit='cm'), record_a().add_property("a"), 1),
        # candidate has no value while the other record has one
        (record_a().add_property("a"), record_a().add_property("a", 5), 1),
    ]
    for candidate, counterpart, expected in cases:
        updates = Crawler.remove_unnecessary_updates([candidate], [counterpart])
        assert len(updates) == expected
def test_split_into_inserts_and_updates_trivial():
    """Splitting an empty list of records must not raise."""
    Crawler().split_into_inserts_and_updates([])
def basic_retrieve_by_name_mock_up(rec, referencing_entities=None, known=None):
    """Return the stored entry for ``rec.name``, or ``None`` if there is none.

    Parameters
    ----------
    rec
        Any object with a ``name`` attribute.
    referencing_entities
        Accepted so that the signature matches the mocked adapter methods; not used.
    known
        Mapping from names to the objects to return. ``None`` (the default) is treated like
        an empty mapping instead of raising a ``TypeError``.
    """
    if known is None:
        return None
    return known.get(rec.name)
@pytest.fixture
def crawler_mocked_identifiable_retrieve():
    """Provide a ``Crawler`` whose identifiable adapter is mocked.

    The registered identifiable of any record consists of its first parent and the ``name``
    property. Both retrieval methods find exactly one record: the one named "A" (id 1111).
    """
    # TODO use minimal setup
    def registered_identifiable(x):
        # a Record with just the first parent of ``x`` and a 'name' property
        return db.Record().add_parent(x.parents[0].name).add_property(name='name')

    def lookup_by_name():
        # Simulated remote content, looked up by name; every mock gets its own Record object.
        return partial(basic_retrieve_by_name_mock_up,
                       known={"A": db.Record(id=1111, name="A")})

    crawler = Crawler()
    adapter = crawler.identifiableAdapter
    adapter.get_registered_identifiable = Mock(side_effect=registered_identifiable)
    adapter.retrieve_identified_record_for_record = Mock(side_effect=lookup_by_name())
    adapter.retrieve_identified_record_for_identifiable = Mock(side_effect=lookup_by_name())
    return crawler
def test_split_into_inserts_and_updates_single(crawler_mocked_identifiable_retrieve):
    """Record "A" (found by the mocked lookup) is an update, record "B" is an insert."""
    crawler = crawler_mocked_identifiable_retrieve
    adapter = crawler.identifiableAdapter
    names = ("A", "B")
    identifiables = [Identifiable(name=name, record_type="C") for name in names]
    records = [db.Record(name=name).add_parent("C") for name in names]

    # nothing is cached yet and there are no references that lack an id
    for identifiable in identifiables:
        assert crawler.get_from_any_cache(identifiable) is None
    for identifiable in identifiables:
        assert not crawler._has_reference_value_without_id(identifiable)

    # sanity check of the mocked lookup itself
    assert adapter.retrieve_identified_record_for_record(identifiables[0]).id == 1111
    assert adapter.retrieve_identified_record_for_record(identifiables[1]) is None

    insert, update = crawler.split_into_inserts_and_updates(deepcopy(records))
    assert [el.name for el in insert] == ["B"]
    assert [el.name for el in update] == ["A"]

    # if this ever fails, the mock up may be removed
    adapter.get_registered_identifiable.assert_called()
    adapter.retrieve_identified_record_for_identifiable.assert_called()
def test_split_into_inserts_and_updates_with_duplicate(crawler_mocked_identifiable_retrieve):
    """A second record identical to "A" must not produce a second update."""
    crawler = crawler_mocked_identifiable_retrieve
    adapter = crawler.identifiableAdapter

    rec_a = db.Record(name="A").add_parent("C")
    rec_b = db.Record(name="B").add_parent("C")
    rec_b.add_property("A", rec_a)
    # This is identical to rec_a and should be removed
    rec_a_again = db.Record(name="A").add_parent("C")

    insert, update = crawler.split_into_inserts_and_updates(
        deepcopy([rec_a, rec_b, rec_a_again]))
    assert [el.name for el in insert] == ["B"]
    assert [el.name for el in update] == ["A"]

    # if this ever fails, the mock up may be removed
    adapter.get_registered_identifiable.assert_called()
    adapter.retrieve_identified_record_for_identifiable.assert_called()
def test_split_into_inserts_and_updates_with_ref(crawler_mocked_identifiable_retrieve):
    """Same split as in the simple case, but "B" references "A" via a property."""
    crawler = crawler_mocked_identifiable_retrieve
    adapter = crawler.identifiableAdapter

    # try it with a reference
    rec_a = db.Record(name="A").add_parent("C")
    rec_b = db.Record(name="B").add_parent("C")
    rec_b.add_property("A", rec_a)

    insert, update = crawler.split_into_inserts_and_updates([rec_a, rec_b])
    assert [el.name for el in insert] == ["B"]
    assert [el.name for el in update] == ["A"]

    # if this ever fails, the mock up may be removed
    adapter.get_registered_identifiable.assert_called()
    adapter.retrieve_identified_record_for_identifiable.assert_called()
def test_split_into_inserts_and_updates_with_circ():
    """Build two records that reference each other; nothing is asserted yet."""
    # try circular
    rec_a = db.Record(name="A").add_parent("C")
    rec_b = db.Record(name="B").add_parent("C")
    rec_b.add_property("A", rec_a)
    rec_a.add_property("B", rec_b)
    entlist = [rec_a, rec_b]
    # TODO this does not seem to be complete!
def test_split_into_inserts_and_updates_with_complex(crawler_mocked_identifiable_retrieve):
    """Split a small reference graph in which "F" is reachable through "B" only."""
    crawler = crawler_mocked_identifiable_retrieve
    adapter = crawler.identifiableAdapter
    #      A
    #      ^
    #      |
    # F <- B <- G
    rec_a = db.Record(name="A").add_parent("C")
    rec_a.add_property('d', 13)
    rec_a.add_property('e', "lskdjlsfdj")
    rec_b = db.Record(name="B").add_parent("C")
    rec_g = db.Record(name="G").add_parent("C")
    rec_f = db.Record(name="F").add_parent("C")
    rec_g.add_property("A", rec_a)
    rec_b.add_property("A", rec_f)
    rec_b.add_property("A", rec_a)

    # F is deliberately not part of the list that is passed in
    insert, update = crawler.split_into_inserts_and_updates([rec_a, rec_b, rec_g])
    assert len(insert) == 3
    assert "B" in [el.name for el in insert]
    assert [el.name for el in update] == ["A"]

    # if this ever fails, the mock up may be removed
    adapter.get_registered_identifiable.assert_called()
    adapter.retrieve_identified_record_for_identifiable.assert_called()
# TODO write test where the unresolved entity is not part of the identifiable
def test_split_into_inserts_and_updates_with_copy_attr(crawler_mocked_identifiable_retrieve):
    """Properties of two records with the same identifiable end up on a single update."""
    crawler = crawler_mocked_identifiable_retrieve
    adapter = crawler.identifiableAdapter

    # assume identifiable is only the name
    with_foo = db.Record(name="A").add_parent("C")
    with_foo.add_property("foo", 1)
    with_bar = db.Record(name="A").add_parent("C")
    with_bar.add_property("bar", 2)

    insert, update = crawler.split_into_inserts_and_updates([with_foo, with_bar])
    merged = update[0]
    assert merged.get_property("bar").value == 2
    assert merged.get_property("foo").value == 1

    # if this ever fails, the mock up may be removed
    adapter.get_registered_identifiable.assert_called()
    adapter.retrieve_identified_record_for_identifiable.assert_called()
def test_has_missing_object_in_references():
    """Exercise ``Crawler._has_missing_object_in_references`` with mixed reference values."""
    crawler = Crawler()
    # Registered identifiables are looked up by the name of the object that is passed in;
    # only the names "C" and "D" are known.
    registered = {
        "C": db.Record(name="C").add_parent("RTC")
        .add_property("d").add_property("name"),
        "D": db.Record(name="D").add_parent("RTD")
        .add_property("d").add_property("e").add_property("name"),
    }
    crawler.identifiableAdapter.get_registered_identifiable = Mock(
        side_effect=partial(basic_retrieve_by_name_mock_up, known=registered))

    # one reference given as plain id -> nothing missing
    assert not crawler._has_missing_object_in_references(
        Identifiable(name="C", record_type="RTC", properties={'d': 123}), [])

    # one reference given as Entity with id -> nothing missing
    entity_with_id = db.Record(id=123).add_parent("C")
    assert not crawler._has_missing_object_in_references(
        Identifiable(name="C", record_type="RTC", properties={'d': entity_with_id}), [])

    # one plain id and one Entity with id (mixed) -> nothing missing
    assert not crawler._has_missing_object_in_references(
        Identifiable(name="C", record_type="RTD",
                     properties={'d': 123, 'b': db.Record(id=123).add_parent("RTC")}), [])

    # entity without id that is referenced in the following
    unresolved = db.Record(name="C").add_parent("C").add_property("d", 12311)

    # As long as ``unresolved`` is not in the remote-missing cache, both of the following
    # identifiables are expected to report nothing missing.
    assert not crawler._has_missing_object_in_references(
        Identifiable(name="C", record_type="RTC", properties={'d': 123, 'e': unresolved}), [])
    assert not crawler._has_missing_object_in_references(
        Identifiable(name="D", record_type="RTD", properties={'d': 123, 'e': unresolved}), [])

    crawler.add_to_remote_missing_cache(
        unresolved, Identifiable(name="C", record_type="RTC", properties={'d': 12311}))

    # now that ``unresolved`` is in the remote-missing cache, a missing object is reported
    assert crawler._has_missing_object_in_references(
        Identifiable(name="D", record_type="RTD", properties={'d': 123, 'e': unresolved}), [])

    # if this ever fails, the mock up may be removed
    crawler.identifiableAdapter.get_registered_identifiable.assert_called()
@pytest.mark.xfail()
def test_references_entities_without_ids():
    """Expected-to-fail checks of ``Crawler._has_reference_value_without_id``."""
    crawler = Crawler()

    def person():
        return db.Record().add_parent("Person")

    # only plain values
    candidate = person().add_property('last_name', 123).add_property('first_name', 123)
    assert not crawler._has_reference_value_without_id(candidate)

    # id and rec with id
    candidate = (person()
                 .add_property('first_name', 123)
                 .add_property('last_name', db.Record(id=123)))
    assert not crawler._has_reference_value_without_id(candidate)

    # id and rec with id and one unneeded prop
    candidate = (person()
                 .add_property('first_name', 123)
                 .add_property('stuff', db.Record())
                 .add_property('last_name', db.Record(id=123)))
    assert crawler._has_reference_value_without_id(candidate)

    # one identifying prop is missing
    candidate = (person()
                 .add_property('first_name', 123)
                 .add_property('last_name', db.Record()))
    assert crawler._has_reference_value_without_id(candidate)
def test_replace_entities_with_ids():
    """Entity values (also inside lists) are replaced by their ids; plain ids are kept."""
    record = db.Record().add_parent("B")
    record.add_property("A", 12345)
    record.add_property("B", db.Record(id=12345))
    record.add_property("C", [db.Record(id=12345), 233324])

    Crawler().replace_entities_with_ids(record)

    expected = {"A": 12345, "B": 12345, "C": [12345, 233324]}
    for prop_name, value in expected.items():
        assert record.get_property(prop_name).value == value
def reset_mocks(mocks):
    """Call ``reset_mock()`` on every element of ``mocks``, in order."""
    for single_mock in mocks:
        single_mock.reset_mock()
def mock_retrieve_record(identifiable: Identifiable):
    """Return the first Record of ``EXAMPLE_SERVER_STATE`` with the identifiable's date.

    Assumes that the identifiable always consists of the ``date`` property only. ``None`` is
    returned if no Record matches.
    """
    return next(
        (entity for entity in EXAMPLE_SERVER_STATE
         if entity.role == "Record"
         and entity.get_property("date").value == identifiable.properties['date']),
        None,
    )
@patch("caoscrawler.crawl.cached_get_entity_by",
       new=Mock(side_effect=mock_get_entity_by))
@patch("caoscrawler.identifiable_adapters.cached_get_entity_by",
       new=Mock(side_effect=mock_get_entity_by))
@patch("caoscrawler.identifiable_adapters.CaosDBIdentifiableAdapter."
       "retrieve_identified_record_for_identifiable",
       new=Mock(side_effect=mock_retrieve_record))
@patch("caoscrawler.crawl.db.Container.insert")
@patch("caoscrawler.crawl.db.Container.update")
def test_synchronization_no_commit(upmock, insmock):
    """With ``commit_changes=False`` the changes are returned but nothing is written."""
    records = [entity.copy() for entity in EXAMPLE_SERVER_STATE if entity.role == "Record"]
    # change one; add one
    records[-1].get_property('result').value = "wst"
    records.append(NEW_ELEMENT.copy())

    adapter = CaosDBIdentifiableAdapter()
    adapter.load_from_yaml_definition(UNITTESTDIR / "example_identifiables.yml")
    crawler = Crawler(securityMode=SecurityMode.UPDATE, identifiableAdapter=adapter)

    inserts, updates = crawler.synchronize(commit_changes=False, crawled_data=records)

    insmock.assert_not_called()
    upmock.assert_not_called()
    assert len(inserts) == 1
    assert len(updates) == 1
@patch("caoscrawler.crawl.cached_get_entity_by",
       new=Mock(side_effect=mock_get_entity_by))
@patch("caoscrawler.identifiable_adapters.cached_get_entity_by",
       new=Mock(side_effect=mock_get_entity_by))
@patch("caoscrawler.identifiable_adapters.CaosDBIdentifiableAdapter."
       "retrieve_identified_record_for_identifiable",
       new=Mock(side_effect=mock_retrieve_record))
@patch("caoscrawler.crawl.db.Container.insert")
@patch("caoscrawler.crawl.db.Container.update")
@patch("caoscrawler.crawl.UpdateCache.insert")
def test_security_mode(updateCacheMock, upmock, insmock):
    """Check which write paths ``Crawler.synchronize`` uses per security mode.

    ``insmock`` and ``upmock`` replace ``Container.insert`` and ``Container.update``;
    ``updateCacheMock`` replaces ``UpdateCache.insert``. For every scenario the test asserts
    which of the three mocks has been called. The scenarios share ``crawled_data``, which is
    restored to the unchanged server state after each of them.
    """
    # trivial case: nothing to do
    crawled_data = [r.copy() for r in EXAMPLE_SERVER_STATE if r.role == "Record"]
    print(crawled_data)
    crawler = Crawler(securityMode=SecurityMode.RETRIEVE)
    crawler.synchronize(commit_changes=True, crawled_data=crawled_data)
    assert crawler.run_id is not None
    insmock.assert_not_called()
    upmock.assert_not_called()
    updateCacheMock.assert_not_called()

    # RETRIEVE: the crawled data contain one new record (insert only)
    ident = CaosDBIdentifiableAdapter()
    ident.load_from_yaml_definition(UNITTESTDIR / "example_identifiables.yml")
    crawler = Crawler(securityMode=SecurityMode.RETRIEVE, identifiableAdapter=ident)
    # add a new entity
    crawled_data.append(NEW_ELEMENT.copy())
    # insert forbidden: nothing is written, one entry goes to the update cache
    crawler.synchronize(commit_changes=True, crawled_data=crawled_data)
    assert crawler.run_id is not None
    insmock.assert_not_called()
    upmock.assert_not_called()
    assert updateCacheMock.call_count == 1
    # reset counts
    reset_mocks([updateCacheMock, insmock, upmock])
    # remove new record again
    crawled_data.pop()

    # RETRIEVE: the crawled data contain one changed record (update only)
    crawler = Crawler(securityMode=SecurityMode.RETRIEVE)
    # change one element
    crawled_data[-1].get_property('result').value = "wst"
    crawler.synchronize(commit_changes=True, crawled_data=crawled_data)
    assert crawler.run_id is not None
    insmock.assert_not_called()
    upmock.assert_not_called()
    assert updateCacheMock.call_count == 1
    # reset counts
    reset_mocks([updateCacheMock, insmock, upmock])
    # reset value
    crawled_data[-1] = EXAMPLE_SERVER_STATE[-1].copy()

    # INSERT: insert only
    # add one element
    crawled_data.append(NEW_ELEMENT.copy())
    ident = CaosDBIdentifiableAdapter()
    ident.load_from_yaml_definition(UNITTESTDIR / "example_identifiables.yml")
    crawler = Crawler(securityMode=SecurityMode.INSERT, identifiableAdapter=ident)
    crawler.synchronize(commit_changes=True, crawled_data=crawled_data)
    assert crawler.run_id is not None
    insmock.assert_called_once()
    upmock.assert_not_called()
    updateCacheMock.assert_not_called()
    # reset counts
    reset_mocks([updateCacheMock, insmock, upmock])
    # remove new record again
    crawled_data.pop()

    # INSERT: update only (the adapter of the previous scenario is reused)
    crawler = Crawler(securityMode=SecurityMode.INSERT, identifiableAdapter=ident)
    # change one element
    crawled_data[-1].get_property('result').value = "wst"
    crawler.synchronize(commit_changes=True, crawled_data=crawled_data)
    assert crawler.run_id is not None
    insmock.assert_not_called()
    upmock.assert_not_called()
    updateCacheMock.assert_called_once()
    # reset counts
    reset_mocks([updateCacheMock, insmock, upmock])
    # reset value
    crawled_data[-1] = EXAMPLE_SERVER_STATE[-1].copy()

    # INSERT: insert and update
    ident = CaosDBIdentifiableAdapter()
    ident.load_from_yaml_definition(UNITTESTDIR / "example_identifiables.yml")
    crawler = Crawler(securityMode=SecurityMode.INSERT, identifiableAdapter=ident)
    # change one; add one
    crawled_data[-1].get_property('result').value = "wst"
    crawled_data.append(NEW_ELEMENT.copy())
    crawler.synchronize(commit_changes=True, crawled_data=crawled_data)
    assert crawler.run_id is not None
    # This used to read ``asser_called_once``: on a Mock that misspelling merely creates a
    # child mock and therefore never checked anything.
    insmock.assert_called_once()
    upmock.assert_not_called()
    updateCacheMock.assert_called_once()
    # reset counts
    reset_mocks([updateCacheMock, insmock, upmock])
    # restore the original crawled data
    crawled_data.pop()
    crawled_data[-1] = EXAMPLE_SERVER_STATE[-1].copy()
def test_create_reference_mapping():
    """The mapping is keyed by ``id()`` of the referenced record, then by referencing parent."""
    referenced = db.Record().add_parent("A")
    referencing = db.Record().add_parent("B").add_property('a', referenced)

    mapping = Crawler.create_reference_mapping([referenced, referencing])

    assert id(referenced) in mapping
    assert id(referencing) not in mapping
    by_parent = mapping[id(referenced)]
    assert "B" in by_parent
    assert by_parent["B"] == [referencing]
def test_create_flat_list():
    """Referenced records are collected once each, even with circular references."""
    first = db.Record()
    second = db.Record()
    # ``first`` references itself and ``second``
    first.add_property(name="a", value=first)
    first.add_property(name="b", value=second)

    flat = Crawler.create_flat_list([first])
    assert len(flat) == 2
    for expected in (first, second):
        assert expected in flat

    third = db.Record()
    third.add_property(name="a", value=first)
    # This would cause recursion if it is not dealt with properly.
    first.add_property(name="c", value=third)

    flat = Crawler.create_flat_list([third])
    assert len(flat) == 3
    for expected in (first, second, third):
        assert expected in flat
@pytest.fixture
def crawler_mocked_for_backref_test():
    """Provide a ``Crawler`` with a mocked adapter for the back-reference tests.

    Registered identifiables of the parents "C" and "D" carry an ``is_referenced_by``
    property (["BR"] and ["BR", "BR2"] respectively) in addition to ``name``; every other
    parent is identified by ``name`` only. Both retrieval methods find exactly one record:
    the one named "A" (id 1111, parent "BR").
    """
    referencing_types = {"C": ["BR"], "D": ["BR", "BR2"]}

    def get_reg_ident(x):
        parent_name = x.parents[0].name
        registered = db.Record().add_parent(parent_name)
        if parent_name in referencing_types:
            # hand out a fresh list on every call
            registered.add_property("is_referenced_by",
                                    value=list(referencing_types[parent_name]))
        return registered.add_property("name")

    def lookup_by_name():
        # Simulated remote content, looked up by name; every mock gets its own Record object.
        return partial(basic_retrieve_by_name_mock_up,
                       known={"A": db.Record(id=1111, name="A").add_parent("BR")})

    crawler = Crawler()
    adapter = crawler.identifiableAdapter
    adapter.get_registered_identifiable = Mock(side_effect=get_reg_ident)
    adapter.retrieve_identified_record_for_record = Mock(side_effect=lookup_by_name())
    adapter.retrieve_identified_record_for_identifiable = Mock(side_effect=lookup_by_name())
    return crawler
def test_validation_error_print(caplog):
    """A failing validation during scanning is reported in the log for both cfoods."""
    caplog.set_level(logging.DEBUG, logger="caoscrawler.converters")
    # there should be no server interaction since we only test the behavior if a validation
    # error occurs during the data collection stage
    data_dir = os.path.join(os.path.dirname(__file__), "test_data", "failing_validation")
    identifiables = os.path.join(data_dir, "identifiables.yml")
    for cfood_name in ("cfood.yml", "cfood2.yml"):
        crawler_main(data_dir,
                     os.path.join(data_dir, cfood_name),
                     identifiables,
                     True,
                     None,
                     False)
        assert "Couldn't validate" in caplog.text
        caplog.clear()
def test_split_into_inserts_and_updates_backref(crawler_mocked_for_backref_test):
    """Split a record that is identified via a back reference together with its referrer."""
    crawler = crawler_mocked_for_backref_test
    adapter = crawler.identifiableAdapter
    ident_a = Identifiable(name="A", record_type="BR")
    ident_b = Identifiable(name="B", record_type="C", backrefs=[db.Entity()])
    referenced = db.Record(name="B").add_parent("C")
    referencing = db.Record(name="A").add_parent("BR").add_property("ref", referenced)
    records = [referenced, referencing]

    # Test without referencing object
    # currently a NotImplementedError is raised if necessary properties are missing.
    with raises(NotImplementedError):
        crawler.split_into_inserts_and_updates([db.Record(name="B").add_parent("C")])

    # identifiables were not yet checked
    for identifiable in (ident_a, ident_b):
        assert crawler.get_from_any_cache(identifiable) is None

    # one with reference, one without
    assert not crawler._has_reference_value_without_id(ident_a)
    assert crawler._has_reference_value_without_id(ident_b)

    # one can be found remotely, one not
    assert adapter.retrieve_identified_record_for_record(ident_a).id == 1111
    assert adapter.retrieve_identified_record_for_record(ident_b) is None

    # check the split...
    insert, update = crawler.split_into_inserts_and_updates(deepcopy(records))
    # A was found remotely and is therefore in the update list
    assert len(update) == 1
    assert update[0].name == "A"
    # B does not exist on the (simulated) remote server
    assert len(insert) == 1
    assert insert[0].name == "B"
@patch("caoscrawler.identifiable_adapters.get_children_of_rt",
       new=Mock(side_effect=lambda x: [x]))
def test_split_into_inserts_and_updates_mult_backref(crawler_mocked_for_backref_test):
    """Several referencing records of the same record type are all used as back references."""
    crawler = crawler_mocked_for_backref_test
    referenced = db.Record(name="B").add_parent("C")
    referrers = [db.Record(name=name).add_parent("BR").add_property("ref", referenced)
                 for name in ("A", "C")]
    records = [referenced, *referrers]

    # test whether both entities are listed in the backref attribute of the identifiable
    referencing_entities = crawler.create_reference_mapping(records)
    identifiable = crawler.identifiableAdapter.get_identifiable(referenced,
                                                                referencing_entities)
    assert len(identifiable.backrefs) == 2

    # check the split...
    insert, update = crawler.split_into_inserts_and_updates(deepcopy(records))
    assert len(update) == 1
    assert len(insert) == 2
@patch("caoscrawler.identifiable_adapters.get_children_of_rt",
       new=Mock(side_effect=lambda x: [x]))
def test_split_into_inserts_and_updates_diff_backref(crawler_mocked_for_backref_test):
    """Referencing records of different record types are all used as back references."""
    crawler = crawler_mocked_for_backref_test
    referenced = db.Record(name="B").add_parent("D")
    referrers = [db.Record(name="A").add_parent(parent).add_property("ref", referenced)
                 for parent in ("BR", "BR2")]
    records = [referenced, *referrers]

    # test whether both entities are listed in the backref attribute of the identifiable
    referencing_entities = crawler.create_reference_mapping(records)
    identifiable = crawler.identifiableAdapter.get_identifiable(referenced,
                                                                referencing_entities)
    assert len(identifiable.backrefs) == 2

    # check the split...
    insert, update = crawler.split_into_inserts_and_updates(deepcopy(records))
    assert len(update) == 2
    assert len(insert) == 1
def mock_create_values(values, element):
    """Do nothing: a no-op with the ``(values, element)`` signature."""
    return None
@patch("caoscrawler.converters.IntegerElementConverter.create_values")
def test_restricted_path(create_mock):
    """
    The restricted_path argument allows to ignore parts of the crawled data structure. Here, we
    make sure that, if that argument is provided, indeed only the given path of the tree is
    traversed.

    The check is done using the mock of the create_values function of the
    IntegerElementConverter. This function is only called if elements are being treated.
    """
    int_element = {
        "type": "IntegerElement",
        "match_name": ".*",
        "match_value": "(?P<int_value>.*)",
        "records": {
            "Dataset": {
                "Subject": "$int_value"
            }
        }
    }
    nextdict = {
        "type": "DictElement",
        "match": "(.*)",
        "subtree": {"int_element": int_element},
    }
    crawler_definition = {
        "DictTest": {
            "type": "DictElement",
            "match": "(.*)",
            "subtree": {"nextdict": nextdict},
        }
    }

    # NOTE(review): this instance is not used below - confirm whether constructing it is
    # needed for the test.
    crawler = Crawler()
    converter_registry = create_converter_registry(crawler_definition)

    # This structure is crawled
    test_dict = {
        "v1": {
            "a": 1,
            "b": 2,
        },
        "v2": {
            "c": 3,
            "d": 4,
        }
    }

    def scan(restricted_path):
        return scan_structure_elements(
            DictElement("TestDict", test_dict), crawler_definition, converter_registry,
            restricted_path
        )

    traversing_cases = (
        # without a restricted_path: all four integers are treated
        (None, 4),
        # a restricted_path without effect (single root element); this also tests that the
        # remainder of the tree is fully traversed
        (["TestDict"], 4),
        # a restricted_path that restricts the tree (single root element)
        (["TestDict", "v2"], 2),
    )
    for restricted_path, expected_calls in traversing_cases:
        scan(restricted_path)
        assert create_mock.call_count == expected_calls
        create_mock.reset_mock()

    # test with a restricted_path that contains a bad element
    with raises(RuntimeError):
        scan(["TestDict", "v3"])
def test_split_restricted_path():
    """Leading and trailing slashes do not produce empty path elements."""
    expectations = (
        ("/el", ["el"]),
        ("/el/", ["el"]),
        ("/el/el", ["el", "el"]),
    )
    for raw_path, elements in expectations:
        assert elements == split_restricted_path(raw_path)
# The warning is expected here; filtering it keeps the test usable when the test suite is run
# with -Werror.
@pytest.mark.filterwarnings("ignore:The prefix:DeprecationWarning")
def test_deprecated_prefix_option():
    """Test that calling the crawler's main function with the deprecated
    `prefix` option raises the correct errors and warnings.

    """
    cfood = UNITTESTDIR / "scifolder_cfood.yml"
    old_prefix = "to/be/removed"

    with pytest.deprecated_call():
        crawler_main("./", cfood, prefix=old_prefix)

    # Check that crawler main terminates with an error
    return_code = crawler_main("./", cfood, prefix=old_prefix, remove_prefix=old_prefix)
    assert 1 == return_code

    with raises(ValueError) as ve:
        _treat_deprecated_prefix(prefix=old_prefix, remove_prefix=old_prefix)

    assert "(deprecated) `prefix` and the `remove_prefix`" in str(ve.value)
def test_create_entity_summary():
    """The summary is empty for no entities and lists named entities as links per parent."""
    assert "" == Crawler.create_entity_summary([]).strip()

    entities = [
        db.Record(id=1).add_parent("A"),
        db.Record(id=4, name='a').add_parent("B"),
        db.Record(id=5).add_parent("A"),
        db.Record(id=6, name='b').add_parent("B"),
    ]
    text = Crawler.create_entity_summary(entities).strip()

    expected_fragments = (
        'a',
        'b',
        'A:',
        'B:',
        "<a href='/Entity/4'>a</a>, <a href='/Entity/6'>b</a>",
    )
    for fragment in expected_fragments:
        assert fragment in text
def test_detect_circular_dependency(crawler_mocked_identifiable_retrieve, caplog):
    """A reference cycle a -> c -> b -> a is detected, reported and makes the split fail."""
    crawler = crawler_mocked_identifiable_retrieve
    # every record is identified by its property 'C'
    crawler.identifiableAdapter.get_registered_identifiable = Mock(
        side_effect=lambda x: db.Record().add_parent('C').add_property(name='C'))

    rec_a = db.Record(name='a').add_parent("C")
    rec_b = db.Record(name='b').add_parent("C").add_property(name="C", value=rec_a)
    rec_c = db.Record(name='c').add_parent("C")
    rec_c.add_property(name='D', value='e')
    rec_c.add_property(name="C", value=rec_b)
    # a record without any reference
    unconnected = db.Record(name='c').add_parent("C")
    # close the circle
    rec_a.add_property(name="C", value=rec_c)

    flat = [rec_a, rec_b, rec_c]
    circle = Crawler.detect_circular_dependency(flat)
    expected_circle = [rec_a, rec_c, rec_b, rec_a]
    assert [id(el) for el in circle] == [id(el) for el in expected_circle]

    assert Crawler.detect_circular_dependency([unconnected]) is None

    with raises(RuntimeError):
        _, _ = crawler.split_into_inserts_and_updates(flat)

    caplog.set_level(logging.ERROR, logger="caoscrawler.converters")
    assert "Found circular dependency" in caplog.text
    assert "-------\na\n['C" in caplog.text
    caplog.clear()