From 4e9d33d1f55e620c3f6030d94d42cf7cd711a891 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Henrik=20tom=20W=C3=B6rden?= <h.tomwoerden@indiscale.com> Date: Fri, 21 Oct 2022 15:55:45 +0200 Subject: [PATCH] MAINT: docs and string conversion only for specific types --- src/caoscrawler/identified_cache.py | 27 +++++++++++++++++++-------- 1 file changed, 19 insertions(+), 8 deletions(-) diff --git a/src/caoscrawler/identified_cache.py b/src/caoscrawler/identified_cache.py index 02618f31..7b680b5a 100644 --- a/src/caoscrawler/identified_cache.py +++ b/src/caoscrawler/identified_cache.py @@ -23,22 +23,30 @@ # ** end header # -""" -stores identified records and is able to detect duplicates -""" - -import caosdb as db - -from hashlib import sha256 """ +This module is a cache for Records where we checked the existence in a remote server using +identifiables. If the Record was found, this means that we identified the corresponding Record +in the remote server and the ID of the local object can be set. +To prevent querying the server again and again for the same objects, this cache allows storing +Records that were found on a remote server and those that were not (typically in separate caches). +The lookup in the cache is done using a hash of a string representation. + TODO: We need a general review: - How are entities identified with each other? - What happens if the identification fails? + +Check out how this was done in the old crawler. 
""" +import caosdb as db + +from hashlib import sha256 +from datetime import datetime + def _value_representation(value): + """returns the string representation of property values to be used in the hash function """ # TODO: (for review) # This expansion of the hash function was introduced recently @@ -54,8 +62,11 @@ def _value_representation(value): return "PyID="+str(id(value)) elif isinstance(value, list): return "["+", ".join([_value_representation(el) for el in value])+"]" - else: + elif (isinstance(value, str) or isinstance(value, int) or isinstance(value, float) + or isinstance(value, datetime)): return str(value) + else: + raise ValueError(f"Unknown datatype of the value: {value}") def _create_hashable_string(identifiable: db.Record): -- GitLab