From 475174d70522ca37cedd38483c560c5d3815ebc0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Henrik=20tom=20W=C3=B6rden?= <h.tomwoerden@indiscale.com> Date: Fri, 21 Oct 2022 14:04:15 +0200 Subject: [PATCH] FIX: fix hash func --- src/caoscrawler/identified_cache.py | 51 +++++++++++++++++------------ unittests/test_identified_cache.py | 6 ++-- 2 files changed, 34 insertions(+), 23 deletions(-) diff --git a/src/caoscrawler/identified_cache.py b/src/caoscrawler/identified_cache.py index 0b9d7a47..02618f31 100644 --- a/src/caoscrawler/identified_cache.py +++ b/src/caoscrawler/identified_cache.py @@ -31,6 +31,32 @@ import caosdb as db from hashlib import sha256 +""" +TODO: We need a general review: +- How are entities identified with each other? +- What happens if the identification fails? +""" + + +def _value_representation(value): + + # TODO: (for review) + # This expansion of the hash function was introduced recently + # to allow the special case of Files as values of properties. + # We need to review the completeness of all the cases here, as the cache + # is crucial for correct identification of insertion and updates. + if isinstance(value, db.File): + return str(value.path) + elif isinstance(value, db.Entity): + if value.id is not None: + return str(value.id) + else: + return "PyID="+str(id(value)) + elif isinstance(value, list): + return "["+", ".join([_value_representation(el) for el in value])+"]" + else: + return str(value) + def _create_hashable_string(identifiable: db.Record): """ @@ -46,28 +72,11 @@ def _create_hashable_string(identifiable: db.Record): # sorted([p.name for p in identifiable.parents]) raise RuntimeError("Cache entry can only be generated for entities with 1 parent.") rec_string = "P<{}>N<{}>".format(identifiable.parents[0].name, identifiable.name) + # TODO this structure neglects Properties if multiple exist for the same name for pname in sorted([p.name for p in identifiable.properties]): - value = str(identifiable.get_property(pname).value) - - # TODO: (for review) - # This expansion of the hash function was introduced recently - # to allow the special case of Files as values of properties. - # We need to review the completeness of all the cases here, as the cache - # is crucial for correct identification of insertion and updates. - if isinstance(identifiable.get_property(pname).value, db.File): - value = str(identifiable.get_property(pname).value.path) - elif isinstance(identifiable.get_property(pname).value, db.Entity): - value = str(identifiable.get_property(pname).value.id) - elif isinstance(identifiable.get_property(pname).value, list): - tmplist = [] - for val in identifiable.get_property(pname).value: - if isinstance(val, db.Entity): - tmplist.append(val.id) - else: - tmplist.append(val) - value = str(tmplist) - - rec_string += "{}:".format(pname) + value + + rec_string += ("{}:".format(pname) + + _value_representation(identifiable.get_property(pname).value)) return rec_string diff --git a/unittests/test_identified_cache.py b/unittests/test_identified_cache.py index 3809f5e4..aeb5f0af 100644 --- a/unittests/test_identified_cache.py +++ b/unittests/test_identified_cache.py @@ -53,8 +53,10 @@ def test_create_hash(): db.Record("A") .add_parent("B") .add_property('a', [db.Record(id=12), 11])) == "P<B>N<A>a:[12, 11]") - assert (_create_hashable_string(db.Record().add_parent("B").add_property('a', [db.Record()])) - != _create_hashable_string(db.Record().add_parent("B").add_property('a', [db.Record()]))) + assert (_create_hashable_string( + db.Record().add_parent("B").add_property('a', [db.Record()])) + != _create_hashable_string( + db.Record().add_parent("B").add_property('a', [db.Record()]))) def test_IdentifiedCache(): -- GitLab