diff --git a/src/linkahead/apiutils.py b/src/linkahead/apiutils.py index 7ffd8ebb7baf2f2f9dd8bc0adb14fd9ff771cb42..c2c687f1b3e5a175ce1c06653f198aa58b2eac63 100644 --- a/src/linkahead/apiutils.py +++ b/src/linkahead/apiutils.py @@ -179,68 +179,56 @@ def getCommitIn(folder): return get_commit_in(folder) -def compare_entities(entity0: Entity, entity1: Entity, +def compare_entities(old_entity: Entity, + new_entity: Entity, compare_referenced_records: bool = False ) -> tuple[dict[str, Any], dict[str, Any]]: """Compare two entities. - Returns two dicts listing the differences between the two entities. The - order of the two returned dicts corresponds to the two input entities. - The dicts contain two keys, 'parents' and 'properties'. The list saved - under the 'parents' key contains those parents of the respective entity - that are missing in the other entity, and the 'properties' dict contains - properties and SPECIAL_ATTRIBUTES if they are missing or different from - their counterparts in the other entity. - - The value of the properties dict for each listed property is again a dict - detailing the differences between this property and its counterpart. - The characteristics that are checked to determine whether two properties - match are the following: + The following attributes are compared: + - parents + - properties + - all SPECIAL_ATTRIBUTES (see models.py) + + Returns a tuple of dictionaries, the first element in the tupel lists additional information of old + entity, the second elements lists to additional information of new entity. + + Additional information means in detail: + - Additional parents: key="parents", value=a list of names of parents that are present in the respective entity + - Information about properties (key="properties", value=dict with property names as keys): + - properties are included in the dict if they are missing in the other entity or if one of + the following differs: - datatype - - importance + - importance or - value - If any of these characteristics differ for a property, the respective - string (datatype, importance, value) is added as a key to the dict of the - property with its value being the characteristics value, - e.g. {"prop": {"value": 6, 'importance': 'SUGGESTED'}}. - If a property is of type LIST, the comparison is order-sensitive. - Comparison of multi-properties is not yet supported, so should either - entity have several instances of one Property, the comparison is aborted - and an error is raised. + In case of changed information the value listed under the respective key is the + value that is stored in the respective entity. - Two parents match, if TODO: get final decision on parent match criteria + If `compare_referenced_records` is `True`, also referenced entities will be + compared using this function (which is then called with + `compare_referenced_records = False` to prevent infinite recursion in case + of circular references). - Should referenced records not be checked for equality between the entities - but for equivalency, this is possible by setting the parameter - compare_referenced_records. + Parameters + ---------- + old_entity, new_entity : Entity + Entities to be compared + compare_referenced_records : bool, optional + Whether to compare referenced records in case of both, `old_entity` and + `new_entity`, have the same reference properties and both have a Record + object as value. If set to `False`, only the corresponding Python + objects are compared which may lead to unexpected behavior when + identical records are stored in different objects. Default is False. - Params - ------ - entity0 : Entity - First entity to be compared. - entity1 : Entity - Second entity to be compared. - compare_referenced_records: bool, default: False - If set to True, corresponding referenced - records are not checked for equality but for - equivalency, using this function. - compare_referenced_records is set to False for - these recursive calls, so references of - references need to be equal. If set to `False`, - only the Python objects are compared, which may - lead to unexpected behavior. """ - diff = ({"properties": {}, "parents": []}, - {"properties": {}, "parents": []}) + olddiff: dict[str, Any] = {"properties": {}, "parents": []} + newdiff: dict[str, Any] = {"properties": {}, "parents": []} - # ToDo: Is is wanted here? == would make more sense unless there are plans - # to do use this function in the equality check of entities - if entity0 is entity1: - return diff + if old_entity is new_entity: + return olddiff, newdiff - # ToDo: Same question as above, is not or != - if type(entity0) is not type(entity1): + if type(old_entity) is not type(new_entity): raise ValueError( "Comparison of different Entity types is not supported.") @@ -249,206 +237,135 @@ def compare_entities(entity0: Entity, entity1: Entity, if attr == "value": continue - attr0 = entity0.__getattribute__(attr) + oldattr = old_entity.__getattribute__(attr) # we consider "" and None to be nonexistent - attr0_unset = (attr0 == "" or attr0 is None) + old_entity_attr_na = (oldattr == "" or oldattr is None) - attr1 = entity1.__getattribute__(attr) + newattr = new_entity.__getattribute__(attr) # we consider "" and None to be nonexistent - attr1_unset = (attr1 == "" or attr1 is None) + new_entity_attr_na = (newattr == "" or newattr is None) # in both entities the current attribute is not set - if attr0_unset and attr1_unset: + if old_entity_attr_na and new_entity_attr_na: continue # treat datatype separately if one datatype is an object and the other - # a string or int, and therefore may be a name or id + # a string or int, and may be a name or id if attr == "datatype": - if not attr0_unset and not attr1_unset: - if isinstance(attr0, RecordType): - if attr0.name == attr1: + if not old_entity_attr_na and not new_entity_attr_na: + if isinstance(oldattr, RecordType): + if oldattr.name == newattr: continue - if str(attr0.id) == str(attr1): + if oldattr.id == newattr: continue - if isinstance(attr1, RecordType): - if attr1.name == attr0: + if isinstance(newattr, RecordType): + if newattr.name == oldattr: continue - if str(attr1.id) == str(attr0): + if newattr.id == oldattr: continue - # add to diff if attr has different values or is not set for one entity - if (attr0_unset != attr1_unset) or (attr0 != attr1): - if not attr0_unset: - diff[0][attr] = attr0 - if not attr1_unset: - diff[1][attr] = attr1 + # add to diff if attribute has different values or is not set for one entity + if ((old_entity_attr_na ^ new_entity_attr_na) + or (oldattr != newattr)): + if not old_entity_attr_na: + olddiff[attr] = oldattr + + if not new_entity_attr_na: + newdiff[attr] = newattr # compare value - ent0_val, ent1_val = entity0.value, entity1.value - if ent0_val != ent1_val: + if old_entity.value != new_entity.value: # though the values are not equal, they might be equivalent: same_value = False if compare_referenced_records: # both values are scalar references: - if isinstance(ent0_val, Entity) and isinstance(ent1_val, Entity): - # comp_referenced_records=False to prevent infinite recursion - # ToDo: Consider keeping a list of traversed entities rather - # than abort past first layer - same_value = empty_diff(ent0_val, ent1_val, False) + if isinstance(old_entity.value, Entity) and isinstance(new_entity.value, Entity): + # compare_referenced_records=False to prevent infinite recursion + same_value = empty_diff( + old_entity.value, new_entity.value, compare_referenced_records=False) # both values are a list of references: - elif isinstance(ent0_val, list) and isinstance(ent1_val, list): - # lists can't be the same if the lengths are different - if len(ent0_val) == len(ent1_val): - lists_match = True - for val0, val1 in zip(ent0_val, ent1_val): - if val0 == val1: - continue - if (isinstance(val0, Entity) - and isinstance(val1, Entity)): - if empty_diff(val0, val1, False): - continue - # ToDo: Henrik wasn't sure whether it would make sense - # to check int and str against an Entity id/name - # -> ask, but if uncommented the test - # test_merge_id_with_resolved_entity in - # test_apiutils fails - #if (isinstance(val0, Entity) - # and isinstance(val1, (int, str))): - # if (str(val0.id) == str(val1) - # or str(val0.name) == str(val1)): - # continue - #if (isinstance(val1, Entity) - # and isinstance(val0, (int, str))): - # if (str(val1.id) == str(val0) - # or str(val1.name) == str(val0)): - # continue - # val0 and val1 could not be matched - lists_match = False - break - if lists_match: - same_value = True + elif isinstance(old_entity.value, list) and isinstance(new_entity.value, list): + # if all elements in both lists are entity objects, check each pair for equality + # TODO: check whether mixed cases should be allowed or lead to an error + if (all([isinstance(x, Entity) for x in old_entity.value]) + and all([isinstance(x, Entity) for x in new_entity.value])): + # lists can't be the same if the lengths are different + if len(old_entity.value) == len(new_entity.value): + # the lists are the same if the diffs of each entry pair are empty + same_value = all( + [empty_diff(x, y, False) for x, y + in zip(old_entity.value, new_entity.value)]) if not same_value: - diff[0]["value"] = ent0_val - diff[1]["value"] = ent1_val + olddiff["value"] = old_entity.value + newdiff["value"] = new_entity.value # compare properties - for prop in entity0.properties: - matching = entity1.properties.filter(prop, check_wrapped=False) + for prop in old_entity.properties: + matching = [p for p in new_entity.properties if p.name.lower() == prop.name.lower()] if len(matching) == 0: - # entity1 has prop, entity0 does not - diff[0]["properties"][prop.name] = {} - elif len(matching) == 1: - diff[0]["properties"][prop.name] = {} - diff[1]["properties"][prop.name] = {} - propdiff = (diff[0]["properties"][prop.name], - diff[1]["properties"][prop.name]) - - # Recursive call to determine the differences between properties + # old_entity has prop, new_entity does not + olddiff["properties"][prop.name] = {} + elif len(matching) ==1: + olddiff["properties"][prop.name] = {} + newdiff["properties"][prop.name] = {} + oldpropdiff = olddiff["properties"][prop.name] + newpropdiff = newdiff["properties"][prop.name] + + # recursive call to determine the differences between properties # ToDo: This can lead to infinite recursion if two properties have - # themselves or each other as subproperties - would also be - # fixed by keeping a list of traversed entities I believe - # We should compare the wrapped properties instead of the - # wrapping entities if possible: - comp1, comp2 = prop, matching[0] - if (comp1._wrapped_entity is not None - and comp2._wrapped_entity is not None): - comp1, comp2 = comp1._wrapped_entity, comp2._wrapped_entity - # ToDo: compare_referenced_records is being ignored here, is this - # intended? If not, also fix in other places - od, nd = compare_entities(comp1, comp2, compare_referenced_records) - # ToDo: Do we care about parents and properties here? - # Note: If we can identify a subset of characteristics that we - # exclusively care about, maybe restrict the recursion to - # those checks at some point? Would be faster, and might - # replace the full recursions that we later call empty_diff - # on anyways - # as we do not care about parents and properties here, - # discard their entries + # each other as subproperties + od, nd = compare_entities(prop, matching[0], compare_referenced_records=compare_referenced_records) + # as we do not care about parents and properties here, discard their entries + # TODO do we? od.pop("parents") od.pop("properties") nd.pop("parents") nd.pop("properties") # use the remaining diff - propdiff[0].update(od) - propdiff[1].update(nd) + oldpropdiff.update(od) + newpropdiff.update(nd) - # As the importance of a property is an attribute of the record - # and not the property, it is not contained in the diff returned - # by compare_entities and needs to be added separately - if (entity0.get_importance(prop.name) != - entity1.get_importance(prop.name)): - propdiff[0]["importance"] = entity0.get_importance(prop.name) - propdiff[1]["importance"] = entity1.get_importance(prop.name) + # As the importance of a property is an attribute of the record and not + # the property, it is not contained in the diff returned by compare_entities + if old_entity.get_importance(prop.name) != new_entity.get_importance(prop.name): + oldpropdiff["importance"] = old_entity.get_importance(prop.name) + newpropdiff["importance"] = new_entity.get_importance(prop.name) # in case there is no difference, we remove the dict keys again - if len(propdiff[0]) == 0 and len(propdiff[1]) == 0: - diff[0]["properties"].pop(prop.name) - diff[1]["properties"].pop(prop.name) - + if len(newpropdiff) == 0 and len(oldpropdiff) == 0: + newdiff["properties"].pop(prop.name) + olddiff["properties"].pop(prop.name) else: raise NotImplementedError( "Comparison not implemented for multi-properties.") - # ToDo: Iterate and match analogously to parents? - - # we have not yet compared properties that do not exist in entity0 - for prop in entity1.properties: - # check how often the property appears in entity0 - num_prop_in_ent0 = len(entity0.properties.filter(prop)) - if num_prop_in_ent0 == 0: - # property is only present in entity0 - add to diff - diff[1]["properties"][prop.name] = {} - if num_prop_in_ent0 > 1: - # Check whether the property is present multiple times in entity0 + + # we have not yet compared properties that do not exist in old_entity + for prop in new_entity.properties: + # check how often the property appears in old_entity + num_old_prop = len([0 for p in old_entity.properties + if p.name.lower() == prop.name.lower()]) + if num_old_prop == 0: + # property is only present in new_entity - add to diff + newdiff["properties"][prop.name] = {} + if num_old_prop > 1: + # Check whether the property is present multiple times in old_entity # and raise error - result would be incorrect raise NotImplementedError( "Comparison not implemented for multi-properties.") # compare parents - for index, parents, other_entity in [(0, entity0.parents, entity1), - (1, entity1.parents, entity0)]: - for parent in parents: - # As parent is a wrapper entity, we also want the wrappers - matching = other_entity.parents.filter(parent, check_wrapped=False) - if len(matching) == 0: - diff[index]["parents"].append(parent.name) - continue - # ToDo: Should parents also have a differences dict? - # ToDo: What about duplicate parents? Is this already satisfactory? - # ToDo: Recheck whether matching id and name is intended to be a - # sufficient criterion for a parent match, if yes, delete - # this and uncomment+test the commented block below - match_found = False - for match in matching: - comp1, comp2 = parent, match - if (comp1._wrapped_entity is not None - and comp2._wrapped_entity is not None): - comp1, comp2 = comp1._wrapped_entity, comp2._wrapped_entity - if empty_diff(comp1, comp2, False): - # For RecordType check inheritance, otherwise it's a match - if not isinstance(parent, RecordType): - match_found = True - break - elif (parent._flags['inheritance'] == - match._flags['inheritance']): - match_found = True - break - if not match_found: - diff[index]["parents"].append(parent.name) - # if isinstance(parent, RecordType): - # # compare inheritance level only for RecordTypes - # matching_inheritance_found = False - # for potential in matching: - # if (parent._flags['inheritance'] == - # potential._flags['inheritance']): - # matching_inheritance_found = True - # break - # if not matching_inheritance_found: - # # ToDo: How to indicate that this is a non-matching - # # importance level instead of a missing parent? - # diff[index]["parents"].append(parent.name) - - return diff + # ToDo: Compare using filter function, compare inheritance level for RTs + # TODO we currently only use names for parents and property matching + for parent in old_entity.parents: + if len([0 for p in new_entity.parents if p.name.lower() == parent.name.lower()]) == 0: + olddiff["parents"].append(parent.name) + + for parent in new_entity.parents: + if len([0 for p in old_entity.parents if p.name.lower() == parent.name.lower()]) == 0: + newdiff["parents"].append(parent.name) + + return olddiff, newdiff def empty_diff(old_entity: Entity, new_entity: Entity,