diff --git a/CHANGELOG.md b/CHANGELOG.md index e98dece18716f6a1034a9987caa7d38afa72837c..8cfed2795f8f91bde5053839b68871cdfbf47eb9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -16,9 +16,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Removed ### ### Fixed ### +* FIX: https://gitlab.com/caosdb/caosdb-crawler/-/issues/31 Identified cache: Hash is the same for Records without IDs * FIX: https://gitlab.com/caosdb/caosdb-crawler/-/issues/30 - ### Security ### ## [0.1.0] - 2022-10-11 diff --git a/src/caoscrawler/identified_cache.py b/src/caoscrawler/identified_cache.py index 0b9d7a47bdecc4094edb1296f4c04dfa083a2436..b5db8ba04fc813ea8aca0131c6265adab666b2e7 100644 --- a/src/caoscrawler/identified_cache.py +++ b/src/caoscrawler/identified_cache.py @@ -23,13 +23,52 @@ # ** end header # + """ -stores identified records and is able to detect duplicates +This module is a cache for Records where we checked the existence in a remote server using +identifiables. If the Record was found, this means that we identified the corresponding Record +in the remote server and the ID of the local object can be set. +To prevent querying the server again and again for the same objects, this cache allows storing +Records that were found on a remote server and those that were not (typically in separate caches). +The look up in the cache is done using a hash of a string representation. + +TODO: We need a general review: +- How are entities identified with each other? +- What happens if the identification fails? + +Checkout how this was done in the old crawler. """ import caosdb as db from hashlib import sha256 +from datetime import datetime + + +def _value_representation(value): + """returns the string representation of property values to be used in the hash function """ + + # TODO: (for review) + # This expansion of the hash function was introduced recently + # to allow the special case of Files as values of properties. + # We need to review the completeness of all the cases here, as the cache + # is crucial for correct identification of insertion and updates. + if value is None: + return "None" + elif isinstance(value, db.File): + return str(value.path) + elif isinstance(value, db.Entity): + if value.id is not None: + return str(value.id) + else: + return "PyID="+str(id(value)) + elif isinstance(value, list): + return "["+", ".join([_value_representation(el) for el in value])+"]" + elif (isinstance(value, str) or isinstance(value, int) or isinstance(value, float) + or isinstance(value, datetime)): + return str(value) + else: + raise ValueError(f"Unknown datatype of the value: {value}") def _create_hashable_string(identifiable: db.Record): @@ -46,28 +85,11 @@ def _create_hashable_string(identifiable: db.Record): # sorted([p.name for p in identifiable.parents]) raise RuntimeError("Cache entry can only be generated for entities with 1 parent.") rec_string = "P<{}>N<{}>".format(identifiable.parents[0].name, identifiable.name) + # TODO this structure neglects Properties if multiple exist for the same name for pname in sorted([p.name for p in identifiable.properties]): - value = str(identifiable.get_property(pname).value) - - # TODO: (for review) - # This expansion of the hash function was introduced recently - # to allow the special case of Files as values of properties. - # We need to review the completeness of all the cases here, as the cache - # is crucial for correct identification of insertion and updates. - if isinstance(identifiable.get_property(pname).value, db.File): - value = str(identifiable.get_property(pname).value.path) - elif isinstance(identifiable.get_property(pname).value, db.Entity): - value = str(identifiable.get_property(pname).value.id) - elif isinstance(identifiable.get_property(pname).value, list): - tmplist = [] - for val in identifiable.get_property(pname).value: - if isinstance(val, db.Entity): - tmplist.append(val.id) - else: - tmplist.append(val) - value = str(tmplist) - - rec_string += "{}:".format(pname) + value + + rec_string += ("{}:".format(pname) + + _value_representation(identifiable.get_property(pname).value)) return rec_string diff --git a/unittests/test_identified_cache.py b/unittests/test_identified_cache.py index 33add97d4309d87705144ec5331366d0bcd05541..aeb5f0afcd9fc9912579bf5320bbb36b52899f07 100644 --- a/unittests/test_identified_cache.py +++ b/unittests/test_identified_cache.py @@ -53,6 +53,10 @@ def test_create_hash(): db.Record("A") .add_parent("B") .add_property('a', [db.Record(id=12), 11])) == "P<B>N<A>a:[12, 11]") + assert (_create_hashable_string( + db.Record().add_parent("B").add_property('a', [db.Record()])) + != _create_hashable_string( + db.Record().add_parent("B").add_property('a', [db.Record()]))) def test_IdentifiedCache():