diff --git a/src/caoscrawler/identified_cache.py b/src/caoscrawler/identified_cache.py index 02618f31e432ab4b9d650ba77951503893254f1b..7b680b5ab69b74469c0ce064c91e56bbef7340ed 100644 --- a/src/caoscrawler/identified_cache.py +++ b/src/caoscrawler/identified_cache.py @@ -23,22 +23,30 @@ # ** end header # -""" -stores identified records and is able to detect duplicates -""" - -import caosdb as db - -from hashlib import sha256 """ +This module is a cache for Records where we checked the existence in a remote server using +identifiables. If the Record was found, this means that we identified the corresponding Record +in the remote server and the ID of the local object can be set. +To prevent querying the server again and again for the same objects, this cache allows storing +Records that were found on a remote server and those that were not (typically in separate caches). +The lookup in the cache is done using a hash of a string representation. + TODO: We need a general review: - How are entities identified with each other? - What happens if the identification fails? + +Check out how this was done in the old crawler. """ +import caosdb as db + +from hashlib import sha256 +from datetime import datetime + def _value_representation(value): + """returns the string representation of property values to be used in the hash function """ # TODO: (for review) # This expansion of the hash function was introduced recently @@ -54,8 +62,11 @@ def _value_representation(value): return "PyID="+str(id(value)) elif isinstance(value, list): return "["+", ".join([_value_representation(el) for el in value])+"]" - else: + elif (isinstance(value, str) or isinstance(value, int) or isinstance(value, float) + or isinstance(value, datetime)): return str(value) + else: + raise ValueError(f"Unknown datatype of the value: {value}") def _create_hashable_string(identifiable: db.Record):