diff --git a/src/caoscrawler/crawl.py b/src/caoscrawler/crawl.py index 3a3b36b3758e7a492101424bc117128d95a6658c..4f0a692e4b8b8fa4bc0a90c7eae986466ca125ac 100644 --- a/src/caoscrawler/crawl.py +++ b/src/caoscrawler/crawl.py @@ -326,21 +326,21 @@ class Crawler(object): # This only might add properties of the postponed records to the already used ones. if len(st.unchecked) > 0: - circle = st.unchecked_contains_circular_dependency() - if circle is None: - logger.error("Failed, but found NO circular dependency. The data is as follows:" - + "\n".join([str(el) for el in st.unchecked]) - - ) - else: - logger.error("Found circular dependency (Note that this might include references " - "that are not identifying properties): " - + "\n".join([str(el) for el in st.unchecked]) - ) + # circle = st.unchecked_contains_circular_dependency() + # if circle is None: + # logger.error("Failed, but found NO circular dependency. The data is as follows:" + # + "\n".join([str(el) for el in st.unchecked]) + + # ) + # else: + # logger.error("Found circular dependency (Note that this might include references " + # "that are not identifying properties): " + # + "\n".join([str(el) for el in st.unchecked]) + # ) raise RuntimeError( - f"Could not finish split_into_inserts_and_updates. Circular dependency: " - f"{circle is not None}") + f"Could not finish split_into_inserts_and_updates. " + "It might be due to a circular dependency") return st.export_record_lists() diff --git a/unittests/test_crawler.py b/unittests/test_crawler.py index e0c2631e39c1a946492d26b4236101aca258ce59..ccd441e7b4a6c97898021f87026349f6ccc37eb5 100644 --- a/unittests/test_crawler.py +++ b/unittests/test_crawler.py @@ -90,6 +90,20 @@ NEW_ELEMENT = (db.Record() .add_property(name="result", value="homogeneous")) +def reset_mocks(mocks): + for mock in mocks: + mock.reset_mock() + + +def mock_create_values(values, element): + pass + + +def mock_get_entity_by_query(query=None): + if query is not None: + return db.Record(id=1111, name='rec_name').add_parent('RT') + + def mock_get_entity_by(eid=None, name=None, path=None): if eid is not None: candidates = [el for el in EXAMPLE_SERVER_STATE if el.id == eid] @@ -343,14 +357,20 @@ def test_split_into_inserts_and_updates_simple(crawler_mocked_identifiable_retri crawler.identifiableAdapter.retrieve_identified_record_for_identifiable.assert_called() -def test_split_into_inserts_and_updates_with_circ(): - # try circular - a = db.Record(name="A").add_parent("C") - b = db.Record(name="B").add_parent("C") - b.add_property("A", a) - a.add_property("B", b) - entlist = [a, b] - # TODO this does not seem to be complete! +def test_split_into_inserts_and_updates_with_circ(crawler_mocked_identifiable_retrieve): + # test trying to split circular dependency + crawler = crawler_mocked_identifiable_retrieve + crawler.identifiableAdapter.get_registered_identifiable = Mock( + side_effect=lambda x: db.Record().add_parent('C').add_property(name='a') + ) + # two records that reference each other via identifying properties + a = db.Record().add_parent("C") + b = db.Record().add_parent("C").add_property(name='a', value=a) + a.add_property(name='a', value=b) + + st = SyncGraph([a, b], crawler.identifiableAdapter) + with pytest.raises(RuntimeError): + crawler.split_into_inserts_and_updates(st) def test_split_into_inserts_and_updates_with_complex(crawler_mocked_identifiable_retrieve): @@ -381,24 +401,6 @@ def test_split_into_inserts_and_updates_with_complex(crawler_mocked_identifiable # TODO write test where the unresoled entity is not part of the identifiable -def test_split_into_inserts_and_updates_with_copy_attr(crawler_mocked_identifiable_retrieve): - crawler = crawler_mocked_identifiable_retrieve - # assume identifiable is only the name - a = db.Record(name="A").add_parent("C") - a.add_property("foo", 1) - b = db.Record(name="A").add_parent("C") - b.add_property("bar", 2) - entlist = [a, b] - st = SyncGraph(entlist, crawler.identifiableAdapter) - insert, update = crawler.split_into_inserts_and_updates(st) - - assert update[0].get_property("bar").value == 2 - assert update[0].get_property("foo").value == 1 - # if this ever fails, the mock up may be removed - crawler.identifiableAdapter.get_registered_identifiable.assert_called() - crawler.identifiableAdapter.retrieve_identified_record_for_identifiable.assert_called() - - @patch("caoscrawler.crawl.cached_get_entity_by", new=Mock(side_effect=mock_get_entity_by)) @patch("caoscrawler.identifiable_adapters.cached_query", @@ -436,6 +438,12 @@ b1: ("same", c1) b2: ("same", c2) a: ([b1, b2]) + + + +- a can be identified. +- bs can be identified with each other once a is identified +- cs depend on b(s), but cannot be put in one Entity because they have conflicting properties """ prop_ident = db.Property("prop_ident", datatype=db.INTEGER) prop_other = db.Property("prop_ident", datatype=db.INTEGER) @@ -469,28 +477,109 @@ a: ([b1, b2]) crawler = Crawler(identifiableAdapter=ident_adapter) st = SyncGraph(deepcopy([rec_a, *rec_b, *rec_c]), crawler.identifiableAdapter) - for node in st.nodes: - print(node.id, node.parents) assert st._identity_relies_on_unchecked_entity(st.nodes[0]) is False assert st._identity_relies_on_unchecked_entity(st.nodes[1]) assert st._identity_relies_on_unchecked_entity(st.nodes[2]) assert st._identity_relies_on_unchecked_entity(st.nodes[3]) assert st._identity_relies_on_unchecked_entity(st.nodes[4]) - print("SEEEEEEEEEEEEEEEET") - print(st.nodes[1].properties) - print(st.nodes[2].properties) assert len(st.unchecked) == 5 # The Cs cannot be merged due to different identifying properties - # The Bs cannot be merged due to differeng references to Cs + # The Bs cannot be merged due to different references to Cs with raises(ImpossibleMergeError) as rte: crawler.split_into_inserts_and_updates(st) + # TODO # assert not isinstance(rte.value, NotImplementedError), \ # "Exception must not be NotImplementedError, but plain RuntimeError." # assert "Could not find referencing entities" in rte.value.args[0] # assert "merge conflicts in the referencing" in rte.value.args[0] +@patch("caoscrawler.identifiable_adapters.get_children_of_rt", + new=Mock(side_effect=lambda x: [x])) +def test_split_into_inserts_and_updates_backref(crawler_mocked_for_backref_test): + # test that backrefs are appropriately considered in the identifiable + crawler = crawler_mocked_for_backref_test + identlist = [Identifiable(name="A", record_type="BR"), + Identifiable(name="B", record_type="C", backrefs=[db.Entity()])] + referenced = db.Record(name="B").add_parent("C") + entlist = [referenced, db.Record(name="A").add_parent("BR").add_property("ref", referenced), ] + + # Test without referencing object + # currently a RuntimeError is raised if necessary properties are missing. + with raises(MissingReferencingEntityError): + st = SyncGraph([db.Record(name="B").add_parent("C")], crawler.identifiableAdapter) + + # identifiables were not yet checked + st = SyncGraph(entlist, crawler.identifiableAdapter) + assert st.get_equivalent(st.nodes[1]) is None + assert st.get_equivalent(st.nodes[0]) is None + # one can be found remotely, one not + assert crawler.identifiableAdapter.retrieve_identified_record_for_record( + identlist[0]).id == 1111 + assert crawler.identifiableAdapter.retrieve_identified_record_for_record( + identlist[1]) is None + + # check the split... + insert, update = crawler.split_into_inserts_and_updates(st) + # A was found remotely and is therefore in the update list + assert len(update) == 1 + assert update[0].name == "A" + # B does not exist on the (simulated) remote server + assert len(insert) == 1 + assert insert[0].name == "B" + + +@patch("caoscrawler.identifiable_adapters.get_children_of_rt", + new=Mock(side_effect=lambda x: [x])) +def test_split_into_inserts_and_updates_mult_backref(crawler_mocked_for_backref_test): + # test whether multiple references of the same record type are correctly used + crawler = crawler_mocked_for_backref_test + referenced = db.Record(name="B").add_parent("C") + entlist = [referenced, + db.Record(id=1, name="A").add_parent("BR").add_property("ref", referenced), + db.Record(id=2, name="C").add_parent("BR").add_property("ref", referenced), + ] + + # test whether both entities are listed in the backref attribute of the identifiable + st = SyncGraph(entlist, crawler.identifiableAdapter) + + identifiable = crawler.identifiableAdapter.get_identifiable( + st.nodes[0], + st.backward_id_referenced_by[id(st.nodes[0])]) + assert len(identifiable.backrefs) == 2 + + # check the split... + insert, update = crawler.split_into_inserts_and_updates(st) + assert len(update) == 2 + assert len(insert) == 1 + + +@patch("caoscrawler.identifiable_adapters.get_children_of_rt", + new=Mock(side_effect=lambda x: [x])) +def test_split_into_inserts_and_updates_diff_backref(crawler_mocked_for_backref_test): + # test whether multiple references of the different record types are correctly used + crawler = crawler_mocked_for_backref_test + referenced = db.Record(name="B").add_parent("D") + entlist = [referenced, + db.Record(id=1, name="A").add_parent("BR").add_property("ref", referenced), + db.Record(id=2, name="A").add_parent("BR2").add_property("ref", referenced), + ] + + # test whether both entities are listed in the backref attribute of the identifiable + st = SyncGraph(entlist, crawler.identifiableAdapter) + identifiable = crawler.identifiableAdapter.get_identifiable( + st.nodes[0], + st.backward_id_referenced_by[id(st.nodes[0])]) + + assert len(identifiable.backrefs) == 2 + + # check the split... + insert, update = crawler.split_into_inserts_and_updates(st) + assert len(update) == 2 + assert len(insert) == 1 + + def test_replace_entities_with_ids(): crawler = Crawler() a = (db.Record().add_parent("B").add_property("A", 12345) @@ -503,20 +592,15 @@ def test_replace_entities_with_ids(): assert a.get_property("C").value == [12345, 233324] -def reset_mocks(mocks): - for mock in mocks: - mock.reset_mock() - - -@ patch("caoscrawler.crawl.cached_get_entity_by", - new=Mock(side_effect=mock_get_entity_by)) -@ patch("caoscrawler.identifiable_adapters.cached_get_entity_by", - new=Mock(side_effect=mock_get_entity_by)) -@ patch("caoscrawler.identifiable_adapters.CaosDBIdentifiableAdapter." - "retrieve_identified_record_for_identifiable", - new=Mock(side_effect=mock_retrieve_record)) -@ patch("caoscrawler.crawl.db.Container.insert") -@ patch("caoscrawler.crawl.db.Container.update") +@patch("caoscrawler.crawl.cached_get_entity_by", + new=Mock(side_effect=mock_get_entity_by)) +@patch("caoscrawler.identifiable_adapters.cached_get_entity_by", + new=Mock(side_effect=mock_get_entity_by)) +@patch("caoscrawler.identifiable_adapters.CaosDBIdentifiableAdapter." + "retrieve_identified_record_for_identifiable", + new=Mock(side_effect=mock_retrieve_record)) +@patch("caoscrawler.crawl.db.Container.insert") +@patch("caoscrawler.crawl.db.Container.update") def test_synchronization_no_commit(upmock, insmock): crawled_data = [r.copy() for r in EXAMPLE_SERVER_STATE if r.role == "Record"] # change one; add one @@ -580,9 +664,6 @@ def test_security_mode(updateCacheMock, upmock, insmock): assert crawler.run_id is not None insmock.assert_not_called() upmock.assert_not_called() - # import IPython - # IPython.embed() - # print(updateCacheMock.call_args_list) assert updateCacheMock.call_count == 1 # reset counts reset_mocks([updateCacheMock, insmock, upmock]) @@ -654,95 +735,7 @@ def test_validation_error_print(caplog): caplog.clear() -@ patch("caoscrawler.identifiable_adapters.get_children_of_rt", - new=Mock(side_effect=lambda x: [x])) -def test_split_into_inserts_and_updates_backref(crawler_mocked_for_backref_test): - crawler = crawler_mocked_for_backref_test - identlist = [Identifiable(name="A", record_type="BR"), - Identifiable(name="B", record_type="C", backrefs=[db.Entity()])] - referenced = db.Record(name="B").add_parent("C") - entlist = [referenced, db.Record(name="A").add_parent("BR").add_property("ref", referenced), ] - - # Test without referencing object - # currently a RuntimeError is raised if necessary properties are missing. - with raises(MissingReferencingEntityError): - st = SyncGraph([db.Record(name="B").add_parent("C")], crawler.identifiableAdapter) - - # identifiables were not yet checked - st = SyncGraph(entlist, crawler.identifiableAdapter) - assert st.get_equivalent(st.nodes[1]) is None - assert st.get_equivalent(st.nodes[0]) is None - # one can be found remotely, one not - assert crawler.identifiableAdapter.retrieve_identified_record_for_record( - identlist[0]).id == 1111 - assert crawler.identifiableAdapter.retrieve_identified_record_for_record( - identlist[1]) is None - - # check the split... - insert, update = crawler.split_into_inserts_and_updates(st) - # A was found remotely and is therefore in the update list - assert len(update) == 1 - assert update[0].name == "A" - # B does not exist on the (simulated) remote server - assert len(insert) == 1 - assert insert[0].name == "B" - - -@ patch("caoscrawler.identifiable_adapters.get_children_of_rt", - new=Mock(side_effect=lambda x: [x])) -def test_split_into_inserts_and_updates_mult_backref(crawler_mocked_for_backref_test): - # test whether multiple references of the same record type are correctly used - crawler = crawler_mocked_for_backref_test - referenced = db.Record(name="B").add_parent("C") - entlist = [referenced, - db.Record(id=1, name="A").add_parent("BR").add_property("ref", referenced), - db.Record(id=2, name="C").add_parent("BR").add_property("ref", referenced), - ] - - # test whether both entities are listed in the backref attribute of the identifiable - st = SyncGraph(entlist, crawler.identifiableAdapter) - - identifiable = crawler.identifiableAdapter.get_identifiable( - st.nodes[0], - st.backward_id_referenced_by[id(st.nodes[0])]) - assert len(identifiable.backrefs) == 2 - - # check the split... - insert, update = crawler.split_into_inserts_and_updates(st) - assert len(update) == 2 - assert len(insert) == 1 - - -@ patch("caoscrawler.identifiable_adapters.get_children_of_rt", - new=Mock(side_effect=lambda x: [x])) -def test_split_into_inserts_and_updates_diff_backref(crawler_mocked_for_backref_test): - # test whether multiple references of the different record types are correctly used - crawler = crawler_mocked_for_backref_test - referenced = db.Record(name="B").add_parent("D") - entlist = [referenced, - db.Record(id=1, name="A").add_parent("BR").add_property("ref", referenced), - db.Record(id=2, name="A").add_parent("BR2").add_property("ref", referenced), - ] - - # test whether both entities are listed in the backref attribute of the identifiable - st = SyncGraph(entlist, crawler.identifiableAdapter) - identifiable = crawler.identifiableAdapter.get_identifiable( - st.nodes[0], - st.backward_id_referenced_by[id(st.nodes[0])]) - - assert len(identifiable.backrefs) == 2 - - # check the split... - insert, update = crawler.split_into_inserts_and_updates(st) - assert len(update) == 2 - assert len(insert) == 1 - - -def mock_create_values(values, element): - pass - - -@ patch("caoscrawler.converters.IntegerElementConverter.create_values") +@patch("caoscrawler.converters.IntegerElementConverter.create_values") def test_restricted_path(create_mock): """ The restricted_path argument allows to ignroe part of the crawled data structure. Here, we make @@ -835,7 +828,7 @@ def test_split_restricted_path(): # Filter the warning because we want to have it here and this way it does not hinder running # tests with -Werror. -@ pytest.mark.filterwarnings("ignore:The prefix:DeprecationWarning") +@pytest.mark.filterwarnings("ignore:The prefix:DeprecationWarning") def test_deprecated_prefix_option(): """Test that calling the crawler's main function with the deprecated `prefix` option raises the correct errors and warnings. @@ -873,38 +866,8 @@ def test_create_entity_summary(): assert "<a href='/Entity/4'>a</a>, <a href='/Entity/6'>b</a>" in text -@pytest.mark.xfail -def test_detect_circular_dependency(crawler_mocked_identifiable_retrieve, caplog): - crawler = crawler_mocked_identifiable_retrieve - crawler.identifiableAdapter.get_registered_identifiable = Mock( - side_effect=lambda x: db.Record().add_parent('C').add_property(name='C')) - a = db.Record(name='a').add_parent("C") - b = db.Record(name='b').add_parent("C").add_property(name="C", value=a) - c = db.Record(name='c').add_parent("C").add_property(name='D', value='e' - ).add_property(name="C", value=b) - d = db.Record(name='c').add_parent("C") - a.add_property(name="C", value=c) - flat = [a, b, c] - circle = Crawler.detect_circular_dependency(flat) - assert [id(el) for el in circle] == [id(el) for el in [a, c, b, a]] - - assert Crawler.detect_circular_dependency([d]) is None - st = SyncGraph(flat, crawler.identifiableAdapter) - with raises(RuntimeError): - _, _ = crawler.split_into_inserts_and_updates(st) - caplog.set_level(logging.ERROR, logger="caoscrawler.converters") - assert "Found circular dependency" in caplog.text - assert "\n--------\n\n> Parent: C\n\n>> Name: a\n[\'C\']" in caplog.text - caplog.clear() - - -def mock_get_entity_by_query(query=None): - if query is not None: - return db.Record(id=1111, name='rec_name').add_parent('RT') - - -@ patch("caoscrawler.crawl.cached_get_entity_by", - new=Mock(side_effect=mock_get_entity_by_query)) +@patch("caoscrawler.crawl.cached_get_entity_by", + new=Mock(side_effect=mock_get_entity_by_query)) def test_replace_name_with_referenced_entity(): test_text = 'lkajsdf' test_int = 134343 @@ -972,21 +935,3 @@ def test_replace_name_with_referenced_entity(): assert isinstance(prop.value[2], int) assert prop.value[2] == test_id assert caoscrawler.crawl.cached_get_entity_by.call_count == 3 - - -def test_merge_entity_with_identifying_reference(crawler_mocked_identifiable_retrieve): - # When one python object representing a record is merged into another python object - # representing the same record, the former object can be forgotten and references from it to - # other records must not play a role - crawler = crawler_mocked_identifiable_retrieve - crawler.identifiableAdapter.get_registered_identifiable = Mock( - side_effect=lambda x: db.Record().add_parent('C').add_property(name='name') if - x.parents[0].name == "C" else - db.Record().add_parent('D').add_property(name='is_referenced_by', value="*") - ) - a = db.Record(name='a').add_parent("D") - b = db.Record(name='b').add_parent("C") - c = db.Record(name='b').add_parent("C").add_property(name="C", value=a) - flat = [a, c, b] - st = SyncGraph(flat, crawler.identifiableAdapter) - _, _ = crawler.split_into_inserts_and_updates(st) diff --git a/unittests/test_sync_graph.py b/unittests/test_sync_graph.py index 1fdab27b15a34939376d6b015fc8d7a185efb671..5ea2bee7e128c1c6234958710e48a720fe225f01 100644 --- a/unittests/test_sync_graph.py +++ b/unittests/test_sync_graph.py @@ -582,3 +582,30 @@ def test_ignoring_irrelevant_references(simple_adapter): assert len(st.unchecked) == 1 # now a nolonger relies on unchecked assert not st._identity_relies_on_unchecked_entity(st.nodes[0]) + +# 'is implementation insufficient' + + +@pytest.mark.xfail() +def test_detect_circular_dependency(crawler_mocked_identifiable_retrieve, caplog): + crawler = crawler_mocked_identifiable_retrieve + crawler.identifiableAdapter.get_registered_identifiable = Mock( + side_effect=lambda x: db.Record().add_parent('C').add_property(name='C')) + a = db.Record(name='a').add_parent("C") + b = db.Record(name='b').add_parent("C").add_property(name="C", value=a) + c = db.Record(name='c').add_parent("C").add_property(name='D', value='e' + ).add_property(name="C", value=b) + d = db.Record(name='c').add_parent("C") + a.add_property(name="C", value=c) + flat = [a, b, c] + circle = Crawler.detect_circular_dependency(flat) + assert [id(el) for el in circle] == [id(el) for el in [a, c, b, a]] + + assert Crawler.detect_circular_dependency([d]) is None + st = SyncGraph(flat, crawler.identifiableAdapter) + with raises(RuntimeError): + _, _ = crawler.split_into_inserts_and_updates(st) + caplog.set_level(logging.ERROR, logger="caoscrawler.converters") + assert "Found circular dependency" in caplog.text + assert "\n--------\n\n> Parent: C\n\n>> Name: a\n[\'C\']" in caplog.text + caplog.clear()