diff --git a/CHANGELOG.md b/CHANGELOG.md index 081b3a84035642a50d4163ec926281d409b42e66..51aa8cf0a3c8c2409b032c2f6f0ed7be299d9631 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,8 +9,23 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Added ### +* `apiutils.empty_diff` function that returns `True` if the diffs of two + entities found with the `compare_entitis` function are empty, `False` + otherwise. + ### Changed ### +* `apiutils.compare_entities` now has an optional `compare_referenced_records` + argument to compare referenced Entities recursively (fomerly, only the + referenced Python objects would be compared). The default is `False` to + recover the original behavior. +* `apiutils.merge_entities` now has an optional + `merge_references_with_empty_diffs` argument that determines whether a merge + of two entities will be performed if they reference identical records (w.r.t + th above `empty_diff` function). Formerly this would have caused a merge + conflict if the referenced record(s) were identical, but stored in different + Python objects. + ### Deprecated ### ### Removed ### diff --git a/src/caosdb/apiutils.py b/src/caosdb/apiutils.py index bd5b0eeca217e1f77d1bd5d5c60e18f33dd76212..9d2f3c4cc343305eaff986031dfe3adc8f45dbc2 100644 --- a/src/caosdb/apiutils.py +++ b/src/caosdb/apiutils.py @@ -188,9 +188,8 @@ def getCommitIn(folder): return t.readline().strip() -def compare_entities(old_entity: Entity, new_entity: Entity): - """ - Compare two entites. +def compare_entities(old_entity: Entity, new_entity: Entity, compare_referenced_records: bool = False): + """Compare two entites. Return a tuple of dictionaries, the first index belongs to additional information for old entity, the second index belongs to additional information for new entity. @@ -204,6 +203,23 @@ def compare_entities(old_entity: Entity, new_entity: Entity): - ... value (not implemented yet) In case of changed information the value listed under the respective key shows the value that is stored in the respective entity. + + If `compare_referenced_records` is `True`, also referenced entities will be + compared using this function (which is then called with + `compare_referenced_records = False` to prevent infinite recursion in case + of circular references). + + Parameters + ---------- + old_entity, new_entity : Entity + Entities to be compared + compare_referenced_records : bool, optional + Whether to compare referenced records in case of both, `old_entity` and + `new_entity`, have the same reference properties and both have a Record + object as value. If set to `False`, only the corresponding Python + objects are compared which may lead to unexpected behavior when + identical records are stored in different objects. Default is False. + """ olddiff: Dict[str, Any] = {"properties": {}, "parents": []} newdiff: Dict[str, Any] = {"properties": {}, "parents": []} @@ -270,9 +286,29 @@ def compare_entities(old_entity: Entity, new_entity: Entity): matching[0].unit if (prop.value != matching[0].value): - olddiff["properties"][prop.name]["value"] = prop.value - newdiff["properties"][prop.name]["value"] = \ - matching[0].value + # basic comparison of value objects says they are different + same_value = False + if compare_referenced_records: + # scalar reference + if isinstance(prop.value, Entity) and isinstance(matching[0].value, Entity): + # explicitely not recursive to prevent infinite recursion + same_value = empty_diff( + prop.value, matching[0].value, compare_referenced_records=False) + # list of references + elif isinstance(prop.value, list) and isinstance(matching[0].value, list): + # all elements in both lists actually are entity objects + # TODO: check, whether mixed cases can be allowed or should lead to an error + if all([isinstance(x, Entity) for x in prop.value]) and all([isinstance(x, Entity) for x in matching[0].value]): + # can't be the same if the lengths are different + if len(prop.value) == len(matching[0].value): + # do a one-by-one comparison; the values are the same, if all diffs are empty + same_value = all( + [empty_diff(x, y, False) for x, y in zip(prop.value, matching[0].value)]) + + if not same_value: + olddiff["properties"][prop.name]["value"] = prop.value + newdiff["properties"][prop.name]["value"] = \ + matching[0].value if (len(newdiff["properties"][prop.name]) == 0 and len(olddiff["properties"][prop.name]) == 0): @@ -300,7 +336,32 @@ def compare_entities(old_entity: Entity, new_entity: Entity): return (olddiff, newdiff) -def merge_entities(entity_a: Entity, entity_b: Entity): +def empty_diff(old_entity: Entity, new_entity: Entity, compare_referenced_records: bool = False): + """Check whether the `compare_entities` found any differences between + old_entity and new_entity. + + Parameters + ---------- + old_entity, new_entity : Entity + Entities to be compared + compare_referenced_records : bool, optional + Whether to compare referenced records in case of both, `old_entity` and + `new_entity`, have the same reference properties and both have a Record + object as value. + + """ + olddiff, newdiff = compare_entities( + old_entity, new_entity, compare_referenced_records) + for diff in [olddiff, newdiff]: + for key in diff: + if len(diff[key]) > 0: + # There is a difference somewhere in the diff + return False + # all elements of the two diffs were empty + return True + + +def merge_entities(entity_a: Entity, entity_b: Entity, merge_references_with_empty_diffs=True): """ Merge entity_b into entity_a such that they have the same parents and properties. @@ -314,13 +375,30 @@ def merge_entities(entity_a: Entity, entity_b: Entity): Returns entity_a. WARNING: This function is currently experimental and insufficiently tested. Use with care. + + Parameters + ---------- + entity_a, entity_b : Entity + The entities to be merged. entity_b will be merged into entity_a in place + merge_references_with_empty_diffs : bool, optional + Whether the merge is performed if entity_a and entity_b both reference + record(s) that may be different Python objects but have empty diffs. If + set to `False` a merge conflict will be raised in this case + instead. Default is True. + + Returns + ------- + entity_a : Entity + The initial entity_a after the in-place merge + """ logging.warning( "This function is currently experimental and insufficiently tested. Use with care.") # Compare both entities: - diff_r1, diff_r2 = compare_entities(entity_a, entity_b) + diff_r1, diff_r2 = compare_entities( + entity_a, entity_b, compare_referenced_records=merge_references_with_empty_diffs) # Go through the comparison and try to apply changes to entity_a: for key in diff_r2["parents"]: diff --git a/unittests/test_apiutils.py b/unittests/test_apiutils.py index 2ebdf95a3aa5ce76b983b2c3c47630e1a8884705..89be9b86ae840e39271cc1b2aca7f0e0a82100cc 100644 --- a/unittests/test_apiutils.py +++ b/unittests/test_apiutils.py @@ -30,7 +30,7 @@ import pytest import caosdb as db import caosdb.apiutils from caosdb.apiutils import (apply_to_ids, compare_entities, create_id_query, - resolve_reference, merge_entities) + empty_diff, resolve_reference, merge_entities) from caosdb.common.models import SPECIAL_ATTRIBUTES @@ -272,8 +272,10 @@ def test_copy_entities(): for i in [0, 1]: assert c.properties[i] is not r.properties[i] for special in SPECIAL_ATTRIBUTES: - assert getattr(c.properties[i], special) == getattr(r.properties[i], special) - assert c.get_importance(c.properties[i]) == r.get_importance(r.properties[i]) + assert getattr(c.properties[i], special) == getattr( + r.properties[i], special) + assert c.get_importance( + c.properties[i]) == r.get_importance(r.properties[i]) def test_merge_entities(): @@ -326,10 +328,12 @@ def test_merge_bug_109(): assert r_a.get_property("test_bug_property").value == [18, 19] assert "<Value>18</Value>\n <Value>19</Value>" in str(r_b) - assert "<Value>18</Value>\n <Value>19</Value>\n <Value>18</Value>\n <Value>19</Value>" not in str(r_b) + assert "<Value>18</Value>\n <Value>19</Value>\n <Value>18</Value>\n <Value>19</Value>" not in str( + r_b) assert "<Value>18</Value>\n <Value>19</Value>" in str(r_a) - assert "<Value>18</Value>\n <Value>19</Value>\n <Value>18</Value>\n <Value>19</Value>" not in str(r_a) + assert "<Value>18</Value>\n <Value>19</Value>\n <Value>18</Value>\n <Value>19</Value>" not in str( + r_a) @pytest.mark.xfail @@ -349,7 +353,116 @@ def test_bug_109(): assert r_a.get_property("test_bug_property").value == [18, 19] assert "<Value>18</Value>\n <Value>19</Value>" in str(r_b) - assert "<Value>18</Value>\n <Value>19</Value>\n <Value>18</Value>\n <Value>19</Value>" not in str(r_b) + assert "<Value>18</Value>\n <Value>19</Value>\n <Value>18</Value>\n <Value>19</Value>" not in str( + r_b) assert "<Value>18</Value>\n <Value>19</Value>" in str(r_a) - assert "<Value>18</Value>\n <Value>19</Value>\n <Value>18</Value>\n <Value>19</Value>" not in str(r_a) + assert "<Value>18</Value>\n <Value>19</Value>\n <Value>18</Value>\n <Value>19</Value>" not in str( + r_a) + + +def test_wrong_merge_conflict_reference(): + """Test a wrongly detected merge conflict in case of two records referencing + two different, but identical objects. + + """ + # Two identical license records will be referenced from both records to be + # merged + license_rt = db.RecordType(name="license") + license_rec_a = db.Record(name="CC-BY-3.0").add_parent(license_rt) + license_rec_b = db.Record(name="CC-BY-3.0").add_parent(license_rt) + + # two referencing records + dataset_rt = db.RecordType(name="Dataset") + title_prop = db.Property(name="title", datatype=db.TEXT) + doi_prop = db.Property(name="DOI", datatype=db.TEXT) + rec_a = db.Record().add_parent(dataset_rt) + rec_a.add_property(name=license_rt.name, + datatype=license_rt.name, value=license_rec_a) + rec_a.add_property(name=title_prop.name, value="Some dataset title") + + rec_b = db.Record().add_parent(dataset_rt) + rec_b.add_property(name=license_rt.name, + datatype=license_rt.name, value=license_rec_b) + rec_b.add_property(name=doi_prop.name, value="https://doi.org/12345.678") + + merge_entities(rec_a, rec_b) + assert rec_a.get_property(license_rt.name) is not None + assert rec_a.get_property(license_rt.name).value is not None + assert isinstance(rec_a.get_property(license_rt.name).value, db.Record) + assert rec_a.get_property(license_rt.name).value.name == license_rec_a.name + assert rec_a.get_property(license_rt.name).value.name == license_rec_b.name + assert rec_a.get_property("title").value == "Some dataset title" + assert rec_a.get_property("doi").value == "https://doi.org/12345.678" + + # Reset rec_a + rec_a = db.Record().add_parent(dataset_rt) + rec_a.add_property(name=license_rt.name, + datatype=license_rt.name, value=license_rec_a) + rec_a.add_property(name=title_prop.name, value="Some dataset title") + + # this does not compare referenced records, so it will fail + with pytest.raises(RuntimeError) as re: + merge_entities(rec_a, rec_b, merge_references_with_empty_diffs=False) + assert "Merge conflict" in str(re.value) + + # ... as should this, of course + rec_b.get_property(license_rt.name).value.name = "Another license" + with pytest.raises(RuntimeError) as re: + merge_entities(rec_a, rec_b) + assert "Merge conflict" in str(re.value) + + +def test_empty_diff(): + + rec_a = db.Record(name="A") + rec_b = db.Record(name="B") + + assert empty_diff(rec_a, rec_a) + assert not empty_diff(rec_a, rec_b) + + rec_a.add_parent(name="RT") + rec_b.add_parent(name="RT") + assert empty_diff(rec_a, rec_a) + assert not empty_diff(rec_a, rec_b) + + rec_b.name = "A" + assert empty_diff(rec_a, rec_b) + + rec_a.add_property(name="some_prop", value=1) + assert not empty_diff(rec_a, rec_b) + + rec_b.add_property(name="some_prop", value=1) + assert empty_diff(rec_a, rec_b) + + rec_b.get_property("some_prop").value = 2 + assert not empty_diff(rec_a, rec_b) + + rec_b.get_property("some_prop").value = 1 + rec_b.add_property(name="some_other_prop", value="Test") + assert not empty_diff(rec_a, rec_b) + + rec_a.add_property(name="some_other_prop", value="Test") + assert empty_diff(rec_a, rec_b) + + # reference identical records, but different Python Record objects + ref_rec_a = db.Record(name="Ref").add_parent(name="RefType") + ref_rec_b = db.Record(name="Ref").add_parent(name="RefType") + rec_a.add_property(name="RefType", datatype="RefType", value=ref_rec_a) + rec_b.add_property(name="RefType", datatype="RefType", value=ref_rec_b) + # the default is `compare_referenced_records=False`, so the diff shouldn't + # be empty (different Python objects are referenced.) + assert not empty_diff(rec_a, rec_b) + # when looking into the referenced record, the diffs should be empty again + assert empty_diff(rec_a, rec_b, compare_referenced_records=True) + + # The same for lists of references + rec_a.remove_property("RefType") + rec_b.remove_property("RefType") + assert empty_diff(rec_a, rec_b) + rec_a.add_property(name="RefType", datatype=db.LIST( + "RefType"), value=[ref_rec_a, ref_rec_a]) + rec_b.add_property(name="RefType", datatype=db.LIST( + "RefType"), value=[ref_rec_b, ref_rec_b]) + assert not empty_diff(rec_a, rec_b) + assert empty_diff(rec_a, rec_b, compare_referenced_records=True)