Patch summary (text before the first "diff --git" header is commentary).

What the patch does:
  * identified_cache.py: adds a module-level TODO string and a helper
    _value_representation(value) that turns a property value into the string
    used by _create_hashable_string:
      - db.File            -> str(value.path)
      - db.Entity with id  -> str(value.id)
      - db.Entity w/o id   -> "PyID=" + str(id(value))
      - list               -> "[" + ", ".join(<recursive call per element>) + "]"
      - anything else      -> str(value)
    The inline isinstance chain in _create_hashable_string is replaced by a
    call to this helper.
  * test_identified_cache.py: re-wraps one over-long assert; the asserted
    expression is unchanged.

Review notes:
  * The removed code rendered lists with str(tmplist), which shows string
    elements quoted; the helper joins str() of each element, so quotes are
    dropped. ['a, b'] and ['a', 'b'] now both render as "[a, b]".
    NOTE(review): confirm this collision is acceptable for cache keys.
  * Python only guarantees id() to be unique among objects that are alive at
    the same time. NOTE(review): confirm the "PyID=" marker is never compared
    across objects with non-overlapping lifetimes (the inequality assertion in
    the test builds two temporaries one after the other).
  * The line structure of this patch was restored from a collapsed copy. Line
    counts match every hunk header, but leading indentation was inferred from
    Python block structure; if context does not match the target files, apply
    with whitespace-insensitive matching.

diff --git a/src/caoscrawler/identified_cache.py b/src/caoscrawler/identified_cache.py
index 0b9d7a47bdecc4094edb1296f4c04dfa083a2436..02618f31e432ab4b9d650ba77951503893254f1b 100644
--- a/src/caoscrawler/identified_cache.py
+++ b/src/caoscrawler/identified_cache.py
@@ -31,6 +31,32 @@ import caosdb as db
 
 from hashlib import sha256
 
+"""
+TODO: We need a general review:
+- How are entities identified with each other?
+- What happens if the identification fails?
+"""
+
+
+def _value_representation(value):
+
+    # TODO: (for review)
+    # This expansion of the hash function was introduced recently
+    # to allow the special case of Files as values of properties.
+    # We need to review the completeness of all the cases here, as the cache
+    # is crucial for correct identification of insertion and updates.
+    if isinstance(value, db.File):
+        return str(value.path)
+    elif isinstance(value, db.Entity):
+        if value.id is not None:
+            return str(value.id)
+        else:
+            return "PyID="+str(id(value))
+    elif isinstance(value, list):
+        return "["+", ".join([_value_representation(el) for el in value])+"]"
+    else:
+        return str(value)
+
 
 def _create_hashable_string(identifiable: db.Record):
     """
@@ -46,28 +72,11 @@ def _create_hashable_string(identifiable: db.Record):
         # sorted([p.name for p in identifiable.parents])
         raise RuntimeError("Cache entry can only be generated for entities with 1 parent.")
     rec_string = "P<{}>N<{}>".format(identifiable.parents[0].name, identifiable.name)
+    # TODO this structure neglects Properties if multiple exist for the same name
     for pname in sorted([p.name for p in identifiable.properties]):
-        value = str(identifiable.get_property(pname).value)
-
-        # TODO: (for review)
-        # This expansion of the hash function was introduced recently
-        # to allow the special case of Files as values of properties.
-        # We need to review the completeness of all the cases here, as the cache
-        # is crucial for correct identification of insertion and updates.
-        if isinstance(identifiable.get_property(pname).value, db.File):
-            value = str(identifiable.get_property(pname).value.path)
-        elif isinstance(identifiable.get_property(pname).value, db.Entity):
-            value = str(identifiable.get_property(pname).value.id)
-        elif isinstance(identifiable.get_property(pname).value, list):
-            tmplist = []
-            for val in identifiable.get_property(pname).value:
-                if isinstance(val, db.Entity):
-                    tmplist.append(val.id)
-                else:
-                    tmplist.append(val)
-            value = str(tmplist)
-
-        rec_string += "{}:".format(pname) + value
+
+        rec_string += ("{}:".format(pname) +
+                       _value_representation(identifiable.get_property(pname).value))
     return rec_string
 
 
diff --git a/unittests/test_identified_cache.py b/unittests/test_identified_cache.py
index 3809f5e45a063981f99f3370d4bdc0004c376193..aeb5f0afcd9fc9912579bf5320bbb36b52899f07 100644
--- a/unittests/test_identified_cache.py
+++ b/unittests/test_identified_cache.py
@@ -53,8 +53,10 @@ def test_create_hash():
         db.Record("A")
         .add_parent("B")
         .add_property('a', [db.Record(id=12), 11])) == "P<B>N<A>a:[12, 11]")
-    assert (_create_hashable_string(db.Record().add_parent("B").add_property('a', [db.Record()]))
-            != _create_hashable_string(db.Record().add_parent("B").add_property('a', [db.Record()])))
+    assert (_create_hashable_string(
+        db.Record().add_parent("B").add_property('a', [db.Record()]))
+        != _create_hashable_string(
+            db.Record().add_parent("B").add_property('a', [db.Record()])))
 
 
 def test_IdentifiedCache():