From 6055192a1495907ed632c1e3cbc9350fda24df88 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Henrik=20tom=20W=C3=B6rden?= <h.tomwoerden@indiscale.com>
Date: Mon, 14 Nov 2022 17:51:23 +0100
Subject: [PATCH] MAINT: include id and path in Identifiable

---
 src/caoscrawler/crawl.py            | 61 +++++++++++++++++------
 src/caoscrawler/identifiable.py     | 75 ++++++++++++++++++++++++++---
 src/caoscrawler/identified_cache.py | 62 ++++--------------------
 src/doc/concepts.rst                | 18 ++++---
 unittests/test_identified_cache.py  | 60 ++++++++---------------
 unittests/test_tool.py              | 13 +++--
 6 files changed, 163 insertions(+), 126 deletions(-)

diff --git a/src/caoscrawler/crawl.py b/src/caoscrawler/crawl.py
index e1cb2009..6e84b1bf 100644
--- a/src/caoscrawler/crawl.py
+++ b/src/caoscrawler/crawl.py
@@ -718,15 +718,45 @@ class Crawler(object):
         while resolved_references and len(flat) > 0:
             resolved_references = False
 
+            # For each element we try to find out whether we can find it in the server or whether
+            # it does not yet exist. Since a Record may reference other unkown Records it might not
+            # be possible to answer this right away.
+            # The following checks are done on each Record:
+            # 1. Can it be identified via an ID?
+            # 2. Can it be identified via a path?
+            # 3. Is it in the cache of already checked Records?
+            # 4. Can it be checked on the remote server?
+            # 5. Does it have to be new since a needed referenc is missing?
             for i in reversed(range(len(flat))):
                 record = flat[i]
                 identifiable = self.identifiableAdapter.get_identifiable(record)
 
                 # TODO remove if the exception is never raised
-                if (record.id is not None or record in to_be_inserted):
+                if record in to_be_inserted:
                     raise RuntimeError("This should not be reached since treated elements"
                                        "are removed from the list")
-                # Check whether this record is a duplicate that can be removed
+                # 1. Can it be identified via an ID?
+                elif record.id is not None:
+                    to_be_updated.append(record)
+                    self.add_to_remote_existing_cache(record)
+                    del flat[i]
+                # 2. Can it be identified via a path?
+                elif record.path is not None:
+                    exiting = self._get_entity_by_path(record.path)
+                    if exiting is None:
+                        to_be_inserted.append(record)
+                        self.add_to_remote_missing_cache(record)
+                        del flat[i]
+                    else:
+                        record.id = exiting.id
+                        # TODO check the following copying of _size and _checksum
+                        # Copy over checksum and size too if it is a file
+                        record._size = exiting._size
+                        record._checksum = exiting._checksum
+                        to_be_updated.append(record)
+                        self.add_to_remote_existing_cache(record)
+                        del flat[i]
+                # 3. Is it in the cache of already checked Records?
                 elif self.get_from_any_cache(identifiable) is not None:
                     # We merge the two in order to prevent loss of information
                     newrecord = self.get_from_any_cache(identifiable)
@@ -737,12 +767,11 @@ class Crawler(object):
                     del flat[i]
                     resolved_references = True
 
-                # can we check whether the record(identifiable) exists on the remote server?
+                # 4. Can it be checked on the remote server?
                 elif not self._has_reference_value_without_id(identifiable):
-                    # TODO: remove deepcopy?
                     identified_record = (
-                        self.identifiableAdapter.retrieve_identified_record_for_record(
-                            deepcopy(record)))
+                        self.identifiableAdapter.retrieve_identified_record_for_identifiable(
+                            identifiable))
                     if identified_record is None:
                         # identifiable does not exist remotely -> record needs to be inserted
                         to_be_inserted.append(record)
@@ -751,18 +780,14 @@ class Crawler(object):
                     else:
                         # side effect
                         record.id = identified_record.id
-                        # Copy over checksum and size too if it is a file
-                        if isinstance(record, db.File):
-                            record._size = identified_record._size
-                            record._checksum = identified_record._checksum
-
                         to_be_updated.append(record)
                         self.add_to_remote_existing_cache(record)
                         del flat[i]
                     resolved_references = True
 
-                # is it impossible to check this record because an identifiable references a
-                # missing record?
+                # 5. Does it have to be new since a needed referenc is missing?
+                # (Is it impossible to check this record because an identifiable references a
+                # missing record?)
                 elif self._has_missing_object_in_references(identifiable):
                     to_be_inserted.append(record)
                     self.add_to_remote_missing_cache(record)
@@ -901,6 +926,13 @@ class Crawler(object):
     def _get_entity_by_name(name):
         return db.Entity(name=name).retrieve()
 
+    @staticmethod
+    def _get_entity_by_path(path):
+        try:
+            return db.execute_query(f"FIND FILE WHICH IS STORED AT '{path}'", unique=True)
+        except db.exceptions.EmptyUniqueQueryError:
+            return None
+
     @staticmethod
     def _get_entity_by_id(id):
         return db.Entity(id=id).retrieve()
@@ -974,8 +1006,7 @@ class Crawler(object):
             self.replace_entities_with_ids(el)
 
         identified_records = [
-            self.identifiableAdapter.retrieve_identified_record_for_record(
-                record)
+            self.identifiableAdapter.retrieve_identified_record_for_record(record)
             for record in to_be_updated]
         # Merge with existing data to prevent unwanted overwrites
         to_be_updated = self._merge_properties_from_remote(to_be_updated,
diff --git a/src/caoscrawler/identifiable.py b/src/caoscrawler/identifiable.py
index 3d2fce0e..78dfe08f 100644
--- a/src/caoscrawler/identifiable.py
+++ b/src/caoscrawler/identifiable.py
@@ -20,6 +20,9 @@
 #
 
 from __future__ import annotations
+import caosdb as db
+from datetime import datetime
+from hashlib import sha256
 from typing import Union
 
 
@@ -27,9 +30,10 @@ class Identifiable():
     """
     The fingerprint of a Record in CaosDB.
 
-    This class contains the information that is used by the CaosDB Crawler to identify Records .
-    For example, in order to check whether a Record exits in the CaosDB Server, a query is created
-    using the information contained in the Identifiable.
+    This class contains the information that is used by the CaosDB Crawler to identify Records.
+    On one hand, this can be the ID or a Record or the path of a File.
+    On the other hand, in order to check whether a Record exits in the CaosDB Server, a query can
+    be created using the information contained in the Identifiable.
 
     Parameters
     ----------
@@ -42,14 +46,73 @@ class Identifiable():
     backrefs: list, TODO future
     """
 
-    def __init__(self, record_type: str = None, name: str = None, properties: dict = None,
-                 path: str = None, backrefs: list[Union[int, str]] = None):
+    def __init__(self, record_id: int = None, path: str = None, record_type: str = None,
+                 name: str = None, properties: dict = None,
+                 backrefs: list[Union[int, str]] = None):
+        self.record_id = record_id
+        self.path = path
         self.record_type = record_type
         self.name = name
         self.properties: dict = {}
         if properties is not None:
             self.properties = properties
-        self.path = path
         self.backrefs: list = []
         if backrefs is not None:
             self.backrefs = backrefs
+
+    def get_representation(self) -> str:
+        return sha256(Identifiable._create_hashable_string(self).encode('utf-8')).hexdigest()
+
+    @staticmethod
+    def _value_representation(value) -> str:
+        """returns the string representation of property values to be used in the hash function """
+
+        if value is None:
+            return "None"
+        elif isinstance(value, db.File):
+            return str(value.path)
+        elif isinstance(value, db.Entity):
+            if value.id is not None:
+                return str(value.id)
+            else:
+                return "PyID=" + str(id(value))
+        elif isinstance(value, list):
+            return "[" + ", ".join([Identifiable._value_representation(el) for el in value]) + "]"
+        elif (isinstance(value, str) or isinstance(value, int) or isinstance(value, float)
+              or isinstance(value, datetime)):
+            return str(value)
+        else:
+            raise ValueError(f"Unknown datatype of the value: {value}")
+
+    @staticmethod
+    def _create_hashable_string(identifiable: Identifiable) -> str:
+        """
+        creates a string from the attributes of an identifiable that can be hashed
+        """
+        rec_string = "P<{}>N<{}>".format(identifiable.record_type, identifiable.name)
+        # TODO this structure neglects Properties if multiple exist for the same name
+        for pname in sorted(identifiable.properties.keys()):
+            rec_string += ("{}:".format(pname) +
+                           Identifiable._value_representation(identifiable.properties[pname]))
+        return rec_string
+
+    def __eq__(self, other) -> bool:
+        """
+        Identifiables are equal if they belong to the same Record. Since ID and path are on their
+        own enough to identify the Record it is sufficient if those attributes are equal.
+        """
+        if not isinstance(other, Identifiable):
+            return False
+        if self.record_id is not None and self.record_id == other.record_id:
+            return True
+        elif (self.record_id is not None and other.record_id is not None
+              and self.record_id == other.record_id):
+            return False
+        elif self.path is not None and self.path == other.path:
+            return True
+        elif (self.path is not None and other.path is not None and self.path == other.id):
+            return False
+        elif self.get_representation() == other.get_representation():
+            return True
+        else:
+            return False
diff --git a/src/caoscrawler/identified_cache.py b/src/caoscrawler/identified_cache.py
index c9388798..878ae443 100644
--- a/src/caoscrawler/identified_cache.py
+++ b/src/caoscrawler/identified_cache.py
@@ -39,66 +39,22 @@ TODO: We need a general review:
 Checkout how this was done in the old crawler.
 """
 
-import caosdb as db
-
-from hashlib import sha256
-from datetime import datetime
-
 from .identifiable import Identifiable
-
-
-def _value_representation(value):
-    """returns the string representation of property values to be used in the hash function """
-
-    # TODO: (for review)
-    #       This expansion of the hash function was introduced recently
-    #       to allow the special case of Files as values of properties.
-    #       We need to review the completeness of all the cases here, as the cache
-    #       is crucial for correct identification of insertion and updates.
-    if value is None:
-        return "None"
-    elif isinstance(value, db.File):
-        return str(value.path)
-    elif isinstance(value, db.Entity):
-        if value.id is not None:
-            return str(value.id)
-        else:
-            return "PyID=" + str(id(value))
-    elif isinstance(value, list):
-        return "[" + ", ".join([_value_representation(el) for el in value]) + "]"
-    elif (isinstance(value, str) or isinstance(value, int) or isinstance(value, float)
-          or isinstance(value, datetime)):
-        return str(value)
-    else:
-        raise ValueError(f"Unknown datatype of the value: {value}")
-
-
-def _create_hashable_string(identifiable: Identifiable):
-    """
-    creates a string from the attributes of an identifiable that can be hashed
-    """
-    rec_string = "P<{}>N<{}>Path<{}>".format(identifiable.record_type, identifiable.name,
-                                             identifiable.path)
-    # TODO this structure neglects Properties if multiple exist for the same name
-    for pname in sorted(identifiable.properties.keys()):
-        rec_string += ("{}:".format(pname) +
-                       _value_representation(identifiable.properties[pname]))
-    return rec_string
-
-
-def _create_hash(identifiable: Identifiable) -> str:
-    return sha256(_create_hashable_string(identifiable).encode('utf-8')).hexdigest()
+import caosdb as db
 
 
 class IdentifiedCache(object):
     def __init__(self):
         self._cache = {}
+        self._identifiables = []
 
-    def __contains__(self, identifiable: db.Record):
-        return _create_hash(identifiable) in self._cache
+    def __contains__(self, identifiable: Identifiable):
+        return identifiable in self._identifiables
 
     def __getitem__(self, identifiable: db.Record):
-        return self._cache[_create_hash(identifiable)]
+        index = self._identifiables.index(identifiable)
+        return self._cache[id(self._identifiables[index])]
 
-    def add(self, record: db.Record, identifiable: db.Record):
-        self._cache[_create_hash(identifiable)] = record
+    def add(self, record: db.Record, identifiable: Identifiable):
+        self._cache[id(identifiable)] = record
+        self._identifiables.append(identifiable)
diff --git a/src/doc/concepts.rst b/src/doc/concepts.rst
index 0f602f7e..012f393b 100644
--- a/src/doc/concepts.rst
+++ b/src/doc/concepts.rst
@@ -42,30 +42,36 @@ In order to check whether a Record exits in the CaosDB Server, the CaosDB Crawle
 using the information contained in the Identifiable.
 
 For example, suppose a certain experiment is at most done once per day, then the identifiable could
-consist of the RecordType "SomeExperiment" and the Property "date".
+consist of the RecordType "SomeExperiment" (as a parent) and the Property "date".
 
-You can think of the properties that are used by the identifiable as dictionary. For each property
+You can think of the properties that are used by the identifiable as a dictionary. For each property
 name there can be one value. However, this value can be a list such that the created query can look
 like "FIND RECORD ParamenterSet WITH a=5 AND a=6". This is meaningful if there is a ParamenterSet
-with two Properties with the name 'a' or if 'a' is a list containing at least the values 5 and 6.
+with two Properties with the name 'a' (multi property) or if 'a' is a list containing at least the values 5 and 6.
 
 The path of a File object can serve as a Property that identifies files and similarly the name of 
 Records can be used.
 
-An identifiable can only use one RecordType even though the identified Records might have multiple
+In the current implementation an identifiable can only use one RecordType even though the identified Records might have multiple
 Parents.
 
-Relevant sources in:
+Relevant sources in
 
 - ``src/identifiable_adapters.py``
 - ``src/identifiable.py``
 
 RegisteredIdentifiables
 +++++++++++++++++++++++
-A Registered Identifiable is the blue print for Identifiables. RegisteredIdentifiables are
+A Registered Identifiable is the blue print for Identifiables. 
+You can think of registered identifiables as identifiables without concrete values for properties.
+RegisteredIdentifiables are
 associated with RecordTypes and define of what information an identifiable for that RecordType
 exists. There can be multiple Registered Identifiables for one RecordType.
 
+Identified Records
+++++++++++++++++++
+TODO
+
 The Crawler
 +++++++++++
 
diff --git a/unittests/test_identified_cache.py b/unittests/test_identified_cache.py
index 4a9ee2a8..dfddbbd9 100644
--- a/unittests/test_identified_cache.py
+++ b/unittests/test_identified_cache.py
@@ -27,40 +27,38 @@
 test identified_cache module
 """
 
-from caoscrawler.identified_cache import _create_hashable_string, IdentifiedCache
 import caosdb as db
 from caoscrawler.identifiable import Identifiable
-
-from caoscrawler.identified_cache import _create_hashable_string as create_hash_string
+from caoscrawler.identified_cache import IdentifiedCache
 
 
 def test_create_hash():
-    assert _create_hashable_string(
-        Identifiable(name="A", record_type="B")) == "P<B>N<A>Path<None>"
-    assert _create_hashable_string(
-        Identifiable(name="A", record_type="B", properties={'a': 5})) == "P<B>N<A>Path<None>a:5"
-    a = _create_hashable_string(
+    assert Identifiable._create_hashable_string(
+        Identifiable(name="A", record_type="B")) == "P<B>N<A>"
+    assert Identifiable._create_hashable_string(
+        Identifiable(name="A", record_type="B", properties={'a': 5})) == "P<B>N<A>a:5"
+    a = Identifiable._create_hashable_string(
         Identifiable(name="A", record_type="B", properties={'a': 4, 'b': 5}))
-    b = _create_hashable_string(
+    b = Identifiable._create_hashable_string(
         Identifiable(name="A", record_type="B", properties={'b': 5, 'a': 4}))
     assert a == b
     assert (
-        _create_hashable_string(
+        Identifiable._create_hashable_string(
             Identifiable(name="A", record_type="B", properties={'a': db.Record(id=12)})
-        ) == "P<B>N<A>Path<None>a:12")
-    a = _create_hashable_string(
+        ) == "P<B>N<A>a:12")
+    a = Identifiable._create_hashable_string(
         Identifiable(name="A", record_type="B", properties={'a': [db.Record(id=12)]}))
-    assert (a == "P<B>N<A>Path<None>a:[12]")
-    assert (_create_hashable_string(Identifiable(name="A", record_type="B", properties={'a': [12]})
-                                    ) == "P<B>N<A>Path<None>a:[12]")
+    assert (a == "P<B>N<A>a:[12]")
+    assert (Identifiable._create_hashable_string(
+        Identifiable(name="A", record_type="B", properties={'a': [12]})) == "P<B>N<A>a:[12]")
     assert (
-        _create_hashable_string(
+        Identifiable._create_hashable_string(
             Identifiable(name="A", record_type="B", properties={'a': [db.Record(id=12), 11]})
-        ) == "P<B>N<A>Path<None>a:[12, 11]")
+        ) == "P<B>N<A>a:[12, 11]")
     assert (
-        _create_hashable_string(
+        Identifiable._create_hashable_string(
             Identifiable(record_type="B", properties={'a': [db.Record()]})
-        ) != _create_hashable_string(
+        ) != Identifiable._create_hashable_string(
             Identifiable(record_type="B", properties={'a': [db.Record()]})))
 
 
@@ -72,25 +70,5 @@ def test_IdentifiedCache():
     cache.add(record=record, identifiable=ident)
     assert ident in cache
     assert cache[ident] is record
-
-
-def test_normal_hash_creation():
-    # Test the initial functionality:
-    # hash comprises only one parent, name and properties:
-
-    hash1 = create_hash_string(Identifiable(name="test", record_type="bla"))
-    assert len(hash1) > 0
-    hash2 = create_hash_string(Identifiable(name="test2", record_type="bla"))
-
-    assert hash1 != hash2
-
-    hash3 = create_hash_string(Identifiable(name="test", record_type="bla bla"))
-    assert hash1 != hash3
-    assert hash2 != hash3
-
-
-def test_file_hash_creation():
-    hash1 = create_hash_string(Identifiable(path="/bla/bla/test1.txt"))
-    hash2 = create_hash_string(Identifiable(path="/bla/bla/test2.txt"))
-
-    assert hash1 != hash2
+    assert Identifiable(name="A", record_type="C") != Identifiable(name="A", record_type="B")
+    assert Identifiable(name="A", record_type="C") not in cache
diff --git a/unittests/test_tool.py b/unittests/test_tool.py
index fa613721..f81ee2fa 100755
--- a/unittests/test_tool.py
+++ b/unittests/test_tool.py
@@ -352,6 +352,9 @@ def crawler_mocked_identifiable_retrieve(crawler):
     # There is only a single known Record with name A
     crawler.identifiableAdapter.retrieve_identified_record_for_record = Mock(side_effect=partial(
         basic_retrieve_by_name_mock_up, known={"A": db.Record(id=1111, name="A")}))
+    crawler.identifiableAdapter.retrieve_identified_record_for_identifiable = Mock(
+        side_effect=partial(
+            basic_retrieve_by_name_mock_up, known={"A": db.Record(id=1111, name="A")}))
     return crawler
 
 
@@ -382,7 +385,7 @@ def test_split_into_inserts_and_updates_single(crawler_mocked_identifiable_retri
     assert update[0].name == "A"
     # if this ever fails, the mock up may be removed
     crawler.identifiableAdapter.get_registered_identifiable.assert_called()
-    crawler.identifiableAdapter.retrieve_identified_record_for_record.assert_called()
+    crawler.identifiableAdapter.retrieve_identified_record_for_identifiable.assert_called()
 
 
 def test_split_into_inserts_and_updates_with_duplicate(crawler_mocked_identifiable_retrieve):
@@ -400,7 +403,7 @@ def test_split_into_inserts_and_updates_with_duplicate(crawler_mocked_identifiab
     assert update[0].name == "A"
     # if this ever fails, the mock up may be removed
     crawler.identifiableAdapter.get_registered_identifiable.assert_called()
-    crawler.identifiableAdapter.retrieve_identified_record_for_record.assert_called()
+    crawler.identifiableAdapter.retrieve_identified_record_for_identifiable.assert_called()
 
 
 def test_split_into_inserts_and_updates_with_ref(crawler_mocked_identifiable_retrieve):
@@ -416,8 +419,8 @@ def test_split_into_inserts_and_updates_with_ref(crawler_mocked_identifiable_ret
     assert len(update) == 1
     assert update[0].name == "A"
     # if this ever fails, the mock up may be removed
-    crawler.identifiableAdapter.retrieve_identified_record_for_record.assert_called()
     crawler.identifiableAdapter.get_registered_identifiable.assert_called()
+    crawler.identifiableAdapter.retrieve_identified_record_for_identifiable.assert_called()
 
 
 def test_split_into_inserts_and_updates_with_circ(crawler):
@@ -452,7 +455,7 @@ def test_split_into_inserts_and_updates_with_complex(crawler_mocked_identifiable
     assert update[0].name == "A"
     # if this ever fails, the mock up may be removed
     crawler.identifiableAdapter.get_registered_identifiable.assert_called()
-    crawler.identifiableAdapter.retrieve_identified_record_for_record.assert_called()
+    crawler.identifiableAdapter.retrieve_identified_record_for_identifiable.assert_called()
 
     # TODO write test where the unresoled entity is not part of the identifiable
 
@@ -471,7 +474,7 @@ def test_split_into_inserts_and_updates_with_copy_attr(crawler_mocked_identifiab
     assert update[0].get_property("foo").value == 1
     # if this ever fails, the mock up may be removed
     crawler.identifiableAdapter.get_registered_identifiable.assert_called()
-    crawler.identifiableAdapter.retrieve_identified_record_for_record.assert_called()
+    crawler.identifiableAdapter.retrieve_identified_record_for_identifiable.assert_called()
 
 
 @pytest.mark.xfail()
-- 
GitLab