From 6055192a1495907ed632c1e3cbc9350fda24df88 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Henrik=20tom=20W=C3=B6rden?= <h.tomwoerden@indiscale.com> Date: Mon, 14 Nov 2022 17:51:23 +0100 Subject: [PATCH] MAINT: include id and path in Identifiable --- src/caoscrawler/crawl.py | 61 +++++++++++++++++------ src/caoscrawler/identifiable.py | 75 ++++++++++++++++++++++++++--- src/caoscrawler/identified_cache.py | 62 ++++-------------------- src/doc/concepts.rst | 18 ++++--- unittests/test_identified_cache.py | 60 ++++++++--------------- unittests/test_tool.py | 13 +++-- 6 files changed, 163 insertions(+), 126 deletions(-) diff --git a/src/caoscrawler/crawl.py b/src/caoscrawler/crawl.py index e1cb2009..6e84b1bf 100644 --- a/src/caoscrawler/crawl.py +++ b/src/caoscrawler/crawl.py @@ -718,15 +718,45 @@ class Crawler(object): while resolved_references and len(flat) > 0: resolved_references = False + # For each element we try to find out whether we can find it in the server or whether + # it does not yet exist. Since a Record may reference other unkown Records it might not + # be possible to answer this right away. + # The following checks are done on each Record: + # 1. Can it be identified via an ID? + # 2. Can it be identified via a path? + # 3. Is it in the cache of already checked Records? + # 4. Can it be checked on the remote server? + # 5. Does it have to be new since a needed referenc is missing? for i in reversed(range(len(flat))): record = flat[i] identifiable = self.identifiableAdapter.get_identifiable(record) # TODO remove if the exception is never raised - if (record.id is not None or record in to_be_inserted): + if record in to_be_inserted: raise RuntimeError("This should not be reached since treated elements" "are removed from the list") - # Check whether this record is a duplicate that can be removed + # 1. Can it be identified via an ID? + elif record.id is not None: + to_be_updated.append(record) + self.add_to_remote_existing_cache(record) + del flat[i] + # 2. Can it be identified via a path? + elif record.path is not None: + exiting = self._get_entity_by_path(record.path) + if exiting is None: + to_be_inserted.append(record) + self.add_to_remote_missing_cache(record) + del flat[i] + else: + record.id = exiting.id + # TODO check the following copying of _size and _checksum + # Copy over checksum and size too if it is a file + record._size = exiting._size + record._checksum = exiting._checksum + to_be_updated.append(record) + self.add_to_remote_existing_cache(record) + del flat[i] + # 3. Is it in the cache of already checked Records? elif self.get_from_any_cache(identifiable) is not None: # We merge the two in order to prevent loss of information newrecord = self.get_from_any_cache(identifiable) @@ -737,12 +767,11 @@ class Crawler(object): del flat[i] resolved_references = True - # can we check whether the record(identifiable) exists on the remote server? + # 4. Can it be checked on the remote server? elif not self._has_reference_value_without_id(identifiable): - # TODO: remove deepcopy? identified_record = ( - self.identifiableAdapter.retrieve_identified_record_for_record( - deepcopy(record))) + self.identifiableAdapter.retrieve_identified_record_for_identifiable( + identifiable)) if identified_record is None: # identifiable does not exist remotely -> record needs to be inserted to_be_inserted.append(record) @@ -751,18 +780,14 @@ class Crawler(object): else: # side effect record.id = identified_record.id - # Copy over checksum and size too if it is a file - if isinstance(record, db.File): - record._size = identified_record._size - record._checksum = identified_record._checksum - to_be_updated.append(record) self.add_to_remote_existing_cache(record) del flat[i] resolved_references = True - # is it impossible to check this record because an identifiable references a - # missing record? + # 5. Does it have to be new since a needed referenc is missing? + # (Is it impossible to check this record because an identifiable references a + # missing record?) elif self._has_missing_object_in_references(identifiable): to_be_inserted.append(record) self.add_to_remote_missing_cache(record) @@ -901,6 +926,13 @@ class Crawler(object): def _get_entity_by_name(name): return db.Entity(name=name).retrieve() + @staticmethod + def _get_entity_by_path(path): + try: + return db.execute_query(f"FIND FILE WHICH IS STORED AT '{path}'", unique=True) + except db.exceptions.EmptyUniqueQueryError: + return None + @staticmethod def _get_entity_by_id(id): return db.Entity(id=id).retrieve() @@ -974,8 +1006,7 @@ class Crawler(object): self.replace_entities_with_ids(el) identified_records = [ - self.identifiableAdapter.retrieve_identified_record_for_record( - record) + self.identifiableAdapter.retrieve_identified_record_for_record(record) for record in to_be_updated] # Merge with existing data to prevent unwanted overwrites to_be_updated = self._merge_properties_from_remote(to_be_updated, diff --git a/src/caoscrawler/identifiable.py b/src/caoscrawler/identifiable.py index 3d2fce0e..78dfe08f 100644 --- a/src/caoscrawler/identifiable.py +++ b/src/caoscrawler/identifiable.py @@ -20,6 +20,9 @@ # from __future__ import annotations +import caosdb as db +from datetime import datetime +from hashlib import sha256 from typing import Union @@ -27,9 +30,10 @@ class Identifiable(): """ The fingerprint of a Record in CaosDB. - This class contains the information that is used by the CaosDB Crawler to identify Records . - For example, in order to check whether a Record exits in the CaosDB Server, a query is created - using the information contained in the Identifiable. + This class contains the information that is used by the CaosDB Crawler to identify Records. + On one hand, this can be the ID or a Record or the path of a File. + On the other hand, in order to check whether a Record exits in the CaosDB Server, a query can + be created using the information contained in the Identifiable. Parameters ---------- @@ -42,14 +46,73 @@ class Identifiable(): backrefs: list, TODO future """ - def __init__(self, record_type: str = None, name: str = None, properties: dict = None, - path: str = None, backrefs: list[Union[int, str]] = None): + def __init__(self, record_id: int = None, path: str = None, record_type: str = None, + name: str = None, properties: dict = None, + backrefs: list[Union[int, str]] = None): + self.record_id = record_id + self.path = path self.record_type = record_type self.name = name self.properties: dict = {} if properties is not None: self.properties = properties - self.path = path self.backrefs: list = [] if backrefs is not None: self.backrefs = backrefs + + def get_representation(self) -> str: + return sha256(Identifiable._create_hashable_string(self).encode('utf-8')).hexdigest() + + @staticmethod + def _value_representation(value) -> str: + """returns the string representation of property values to be used in the hash function """ + + if value is None: + return "None" + elif isinstance(value, db.File): + return str(value.path) + elif isinstance(value, db.Entity): + if value.id is not None: + return str(value.id) + else: + return "PyID=" + str(id(value)) + elif isinstance(value, list): + return "[" + ", ".join([Identifiable._value_representation(el) for el in value]) + "]" + elif (isinstance(value, str) or isinstance(value, int) or isinstance(value, float) + or isinstance(value, datetime)): + return str(value) + else: + raise ValueError(f"Unknown datatype of the value: {value}") + + @staticmethod + def _create_hashable_string(identifiable: Identifiable) -> str: + """ + creates a string from the attributes of an identifiable that can be hashed + """ + rec_string = "P<{}>N<{}>".format(identifiable.record_type, identifiable.name) + # TODO this structure neglects Properties if multiple exist for the same name + for pname in sorted(identifiable.properties.keys()): + rec_string += ("{}:".format(pname) + + Identifiable._value_representation(identifiable.properties[pname])) + return rec_string + + def __eq__(self, other) -> bool: + """ + Identifiables are equal if they belong to the same Record. Since ID and path are on their + own enough to identify the Record it is sufficient if those attributes are equal. + """ + if not isinstance(other, Identifiable): + return False + if self.record_id is not None and self.record_id == other.record_id: + return True + elif (self.record_id is not None and other.record_id is not None + and self.record_id == other.record_id): + return False + elif self.path is not None and self.path == other.path: + return True + elif (self.path is not None and other.path is not None and self.path == other.id): + return False + elif self.get_representation() == other.get_representation(): + return True + else: + return False diff --git a/src/caoscrawler/identified_cache.py b/src/caoscrawler/identified_cache.py index c9388798..878ae443 100644 --- a/src/caoscrawler/identified_cache.py +++ b/src/caoscrawler/identified_cache.py @@ -39,66 +39,22 @@ TODO: We need a general review: Checkout how this was done in the old crawler. """ -import caosdb as db - -from hashlib import sha256 -from datetime import datetime - from .identifiable import Identifiable - - -def _value_representation(value): - """returns the string representation of property values to be used in the hash function """ - - # TODO: (for review) - # This expansion of the hash function was introduced recently - # to allow the special case of Files as values of properties. - # We need to review the completeness of all the cases here, as the cache - # is crucial for correct identification of insertion and updates. - if value is None: - return "None" - elif isinstance(value, db.File): - return str(value.path) - elif isinstance(value, db.Entity): - if value.id is not None: - return str(value.id) - else: - return "PyID=" + str(id(value)) - elif isinstance(value, list): - return "[" + ", ".join([_value_representation(el) for el in value]) + "]" - elif (isinstance(value, str) or isinstance(value, int) or isinstance(value, float) - or isinstance(value, datetime)): - return str(value) - else: - raise ValueError(f"Unknown datatype of the value: {value}") - - -def _create_hashable_string(identifiable: Identifiable): - """ - creates a string from the attributes of an identifiable that can be hashed - """ - rec_string = "P<{}>N<{}>Path<{}>".format(identifiable.record_type, identifiable.name, - identifiable.path) - # TODO this structure neglects Properties if multiple exist for the same name - for pname in sorted(identifiable.properties.keys()): - rec_string += ("{}:".format(pname) + - _value_representation(identifiable.properties[pname])) - return rec_string - - -def _create_hash(identifiable: Identifiable) -> str: - return sha256(_create_hashable_string(identifiable).encode('utf-8')).hexdigest() +import caosdb as db class IdentifiedCache(object): def __init__(self): self._cache = {} + self._identifiables = [] - def __contains__(self, identifiable: db.Record): - return _create_hash(identifiable) in self._cache + def __contains__(self, identifiable: Identifiable): + return identifiable in self._identifiables def __getitem__(self, identifiable: db.Record): - return self._cache[_create_hash(identifiable)] + index = self._identifiables.index(identifiable) + return self._cache[id(self._identifiables[index])] - def add(self, record: db.Record, identifiable: db.Record): - self._cache[_create_hash(identifiable)] = record + def add(self, record: db.Record, identifiable: Identifiable): + self._cache[id(identifiable)] = record + self._identifiables.append(identifiable) diff --git a/src/doc/concepts.rst b/src/doc/concepts.rst index 0f602f7e..012f393b 100644 --- a/src/doc/concepts.rst +++ b/src/doc/concepts.rst @@ -42,30 +42,36 @@ In order to check whether a Record exits in the CaosDB Server, the CaosDB Crawle using the information contained in the Identifiable. For example, suppose a certain experiment is at most done once per day, then the identifiable could -consist of the RecordType "SomeExperiment" and the Property "date". +consist of the RecordType "SomeExperiment" (as a parent) and the Property "date". -You can think of the properties that are used by the identifiable as dictionary. For each property +You can think of the properties that are used by the identifiable as a dictionary. For each property name there can be one value. However, this value can be a list such that the created query can look like "FIND RECORD ParamenterSet WITH a=5 AND a=6". This is meaningful if there is a ParamenterSet -with two Properties with the name 'a' or if 'a' is a list containing at least the values 5 and 6. +with two Properties with the name 'a' (multi property) or if 'a' is a list containing at least the values 5 and 6. The path of a File object can serve as a Property that identifies files and similarly the name of Records can be used. -An identifiable can only use one RecordType even though the identified Records might have multiple +In the current implementation an identifiable can only use one RecordType even though the identified Records might have multiple Parents. -Relevant sources in: +Relevant sources in - ``src/identifiable_adapters.py`` - ``src/identifiable.py`` RegisteredIdentifiables +++++++++++++++++++++++ -A Registered Identifiable is the blue print for Identifiables. RegisteredIdentifiables are +A Registered Identifiable is the blue print for Identifiables. +You can think of registered identifiables as identifiables without concrete values for properties. +RegisteredIdentifiables are associated with RecordTypes and define of what information an identifiable for that RecordType exists. There can be multiple Registered Identifiables for one RecordType. +Identified Records +++++++++++++++++++ +TODO + The Crawler +++++++++++ diff --git a/unittests/test_identified_cache.py b/unittests/test_identified_cache.py index 4a9ee2a8..dfddbbd9 100644 --- a/unittests/test_identified_cache.py +++ b/unittests/test_identified_cache.py @@ -27,40 +27,38 @@ test identified_cache module """ -from caoscrawler.identified_cache import _create_hashable_string, IdentifiedCache import caosdb as db from caoscrawler.identifiable import Identifiable - -from caoscrawler.identified_cache import _create_hashable_string as create_hash_string +from caoscrawler.identified_cache import IdentifiedCache def test_create_hash(): - assert _create_hashable_string( - Identifiable(name="A", record_type="B")) == "P<B>N<A>Path<None>" - assert _create_hashable_string( - Identifiable(name="A", record_type="B", properties={'a': 5})) == "P<B>N<A>Path<None>a:5" - a = _create_hashable_string( + assert Identifiable._create_hashable_string( + Identifiable(name="A", record_type="B")) == "P<B>N<A>" + assert Identifiable._create_hashable_string( + Identifiable(name="A", record_type="B", properties={'a': 5})) == "P<B>N<A>a:5" + a = Identifiable._create_hashable_string( Identifiable(name="A", record_type="B", properties={'a': 4, 'b': 5})) - b = _create_hashable_string( + b = Identifiable._create_hashable_string( Identifiable(name="A", record_type="B", properties={'b': 5, 'a': 4})) assert a == b assert ( - _create_hashable_string( + Identifiable._create_hashable_string( Identifiable(name="A", record_type="B", properties={'a': db.Record(id=12)}) - ) == "P<B>N<A>Path<None>a:12") - a = _create_hashable_string( + ) == "P<B>N<A>a:12") + a = Identifiable._create_hashable_string( Identifiable(name="A", record_type="B", properties={'a': [db.Record(id=12)]})) - assert (a == "P<B>N<A>Path<None>a:[12]") - assert (_create_hashable_string(Identifiable(name="A", record_type="B", properties={'a': [12]}) - ) == "P<B>N<A>Path<None>a:[12]") + assert (a == "P<B>N<A>a:[12]") + assert (Identifiable._create_hashable_string( + Identifiable(name="A", record_type="B", properties={'a': [12]})) == "P<B>N<A>a:[12]") assert ( - _create_hashable_string( + Identifiable._create_hashable_string( Identifiable(name="A", record_type="B", properties={'a': [db.Record(id=12), 11]}) - ) == "P<B>N<A>Path<None>a:[12, 11]") + ) == "P<B>N<A>a:[12, 11]") assert ( - _create_hashable_string( + Identifiable._create_hashable_string( Identifiable(record_type="B", properties={'a': [db.Record()]}) - ) != _create_hashable_string( + ) != Identifiable._create_hashable_string( Identifiable(record_type="B", properties={'a': [db.Record()]}))) @@ -72,25 +70,5 @@ def test_IdentifiedCache(): cache.add(record=record, identifiable=ident) assert ident in cache assert cache[ident] is record - - -def test_normal_hash_creation(): - # Test the initial functionality: - # hash comprises only one parent, name and properties: - - hash1 = create_hash_string(Identifiable(name="test", record_type="bla")) - assert len(hash1) > 0 - hash2 = create_hash_string(Identifiable(name="test2", record_type="bla")) - - assert hash1 != hash2 - - hash3 = create_hash_string(Identifiable(name="test", record_type="bla bla")) - assert hash1 != hash3 - assert hash2 != hash3 - - -def test_file_hash_creation(): - hash1 = create_hash_string(Identifiable(path="/bla/bla/test1.txt")) - hash2 = create_hash_string(Identifiable(path="/bla/bla/test2.txt")) - - assert hash1 != hash2 + assert Identifiable(name="A", record_type="C") != Identifiable(name="A", record_type="B") + assert Identifiable(name="A", record_type="C") not in cache diff --git a/unittests/test_tool.py b/unittests/test_tool.py index fa613721..f81ee2fa 100755 --- a/unittests/test_tool.py +++ b/unittests/test_tool.py @@ -352,6 +352,9 @@ def crawler_mocked_identifiable_retrieve(crawler): # There is only a single known Record with name A crawler.identifiableAdapter.retrieve_identified_record_for_record = Mock(side_effect=partial( basic_retrieve_by_name_mock_up, known={"A": db.Record(id=1111, name="A")})) + crawler.identifiableAdapter.retrieve_identified_record_for_identifiable = Mock( + side_effect=partial( + basic_retrieve_by_name_mock_up, known={"A": db.Record(id=1111, name="A")})) return crawler @@ -382,7 +385,7 @@ def test_split_into_inserts_and_updates_single(crawler_mocked_identifiable_retri assert update[0].name == "A" # if this ever fails, the mock up may be removed crawler.identifiableAdapter.get_registered_identifiable.assert_called() - crawler.identifiableAdapter.retrieve_identified_record_for_record.assert_called() + crawler.identifiableAdapter.retrieve_identified_record_for_identifiable.assert_called() def test_split_into_inserts_and_updates_with_duplicate(crawler_mocked_identifiable_retrieve): @@ -400,7 +403,7 @@ def test_split_into_inserts_and_updates_with_duplicate(crawler_mocked_identifiab assert update[0].name == "A" # if this ever fails, the mock up may be removed crawler.identifiableAdapter.get_registered_identifiable.assert_called() - crawler.identifiableAdapter.retrieve_identified_record_for_record.assert_called() + crawler.identifiableAdapter.retrieve_identified_record_for_identifiable.assert_called() def test_split_into_inserts_and_updates_with_ref(crawler_mocked_identifiable_retrieve): @@ -416,8 +419,8 @@ def test_split_into_inserts_and_updates_with_ref(crawler_mocked_identifiable_ret assert len(update) == 1 assert update[0].name == "A" # if this ever fails, the mock up may be removed - crawler.identifiableAdapter.retrieve_identified_record_for_record.assert_called() crawler.identifiableAdapter.get_registered_identifiable.assert_called() + crawler.identifiableAdapter.retrieve_identified_record_for_identifiable.assert_called() def test_split_into_inserts_and_updates_with_circ(crawler): @@ -452,7 +455,7 @@ def test_split_into_inserts_and_updates_with_complex(crawler_mocked_identifiable assert update[0].name == "A" # if this ever fails, the mock up may be removed crawler.identifiableAdapter.get_registered_identifiable.assert_called() - crawler.identifiableAdapter.retrieve_identified_record_for_record.assert_called() + crawler.identifiableAdapter.retrieve_identified_record_for_identifiable.assert_called() # TODO write test where the unresoled entity is not part of the identifiable @@ -471,7 +474,7 @@ def test_split_into_inserts_and_updates_with_copy_attr(crawler_mocked_identifiab assert update[0].get_property("foo").value == 1 # if this ever fails, the mock up may be removed crawler.identifiableAdapter.get_registered_identifiable.assert_called() - crawler.identifiableAdapter.retrieve_identified_record_for_record.assert_called() + crawler.identifiableAdapter.retrieve_identified_record_for_identifiable.assert_called() @pytest.mark.xfail() -- GitLab