diff --git a/src/caoscrawler/identifiable.py b/src/caoscrawler/identifiable.py index dd6491eafec64a9309513f2a2cf573f1b62314b7..129bb9f36ddb5a44eb0491984bb4a70837449bed 100644 --- a/src/caoscrawler/identifiable.py +++ b/src/caoscrawler/identifiable.py @@ -65,7 +65,12 @@ class Identifiable(): @staticmethod def _value_representation(value) -> str: - """returns the string representation of property values to be used in the hash function """ + """returns the string representation of property values to be used in the hash function + + The string is the path of a File Entity, the CaosDB ID or Python ID of other Entities + (Python Id only if there is no CaosDB ID) and the string representation of bool, float, int + and str. + """ if value is None: return "None" diff --git a/src/caoscrawler/identified_cache.py b/src/caoscrawler/identified_cache.py index 878ae443bd21607408ed1dea05535507b34f014a..9df66e926903d0f975e1f6813a06b21207f75e6a 100644 --- a/src/caoscrawler/identified_cache.py +++ b/src/caoscrawler/identified_cache.py @@ -25,18 +25,7 @@ """ -This module is a cache for Records where we checked the existence in a remote server using -identifiables. If the Record was found, this means that we identified the corresponding Record -in the remote server and the ID of the local object can be set. -To prevent querying the server again and again for the same objects, this cache allows storing -Records that were found on a remote server and those that were not (typically in separate caches). -The look up in the cache is done using a hash of a string representation. - -TODO: We need a general review: -- How are entities identified with each other? -- What happens if the identification fails? - -Checkout how this was done in the old crawler. +see class docstring """ from .identifiable import Identifiable @@ -44,7 +33,23 @@ import caosdb as db class IdentifiedCache(object): - def __init__(self): + """ + This class is like a dictionary where the keys are Identifiables. 
When you check whether an + Identifiable exists as key, this class returns True not only if that exact Python object is + used as a key, but also if an Identifiable is used as key that is **equal** to the one being + considered (see __eq__ function of Identifiable). Similarly, if you do `cache[identifiable]` + you get the Record where the key is an Identifiable that is equal to the one in the square + brackets. + + This class is used for Records where we checked the existence in a remote server using + identifiables. If the Record was found, this means that we identified the corresponding Record + in the remote server and the ID of the local object can be set. + To prevent querying the server again and again for the same objects, this cache allows storing + Records that were found on a remote server and those that were not (typically in separate + caches). + """ + + def __init__(self): self._cache = {} self._identifiables = []