diff --git a/src/caoscrawler/crawl.py b/src/caoscrawler/crawl.py index 1c6968d76257109cc24458a048d6d41ea64d2e5f..876c6cdc247b24d2bb6b40dea24c2a386ce446f3 100644 --- a/src/caoscrawler/crawl.py +++ b/src/caoscrawler/crawl.py @@ -64,7 +64,6 @@ from .identifiable import Identifiable from .identifiable_adapters import (CaosDBIdentifiableAdapter, IdentifiableAdapter, LocalStorageIdentifiableAdapter) -from .identified_cache import IdentifiedCache from .logging import configure_server_side_logging from .macros import defmacro_constructor, macro_constructor from .scanner import (create_converter_registry, initialize_converters, @@ -220,6 +219,127 @@ class SecurityMode(Enum): UPDATE = 2 +class TreatedRecordLookUp(): + """tracks Records and Identifiables for which it was checked whether they exist in the remote + server + + For a given Record it can be checked, whether it exists in the remote sever if + - it has a (valid) ID + - it has a (valid) path (FILEs only) + - an identifiable can be created for the Record. + + Records are added by calling the `add` function and they are then added to the internal + existing or missing list depending on whether the Record has a valid ID. + Additionally, the Record is added to three look up dicts. The keys of those are paths, IDs and + the representation of the identifiables. + """ + + def __init__(self): + self._id_look_up: dict[int, db.Entity] = {} + self._path_look_up: dict[str, db.Entity] = {} + self._identifiable_look_up: dict[str, db.Entity] = {} + self.remote_missing_counter = -1 + self._missing: dict[int, db.Entity] = {} + self._existing: dict[int, db.Entity] = {} + + # TODO is this needed? + def by_path(self, path): + if path in self._path_look_up: + return self._path_look_up[path] + else: + return None + + # TODO is this needed? + def by_ID(self, eid): + if eid in self._id_look_up: + return self._id_look_up[path] + else: + return None + + # TODO is this needed? + def by_identifiable(self, identifiable): + if identifiable.get_representation() in self._id_look_up: + return self._id_look_up[identifiable.get_representation()] + else: + return None + + def add(self, record: db.Entity, identifiable: Optional[Identifiable] = None): + """ + Add a Record that was treated, such that it is contained in the internal look up dicts + """ + if record.id is None: + if record.path is None and identifiable is None: + raise RuntimeError("Record must have ID or path or an identifiable must be given." + f"Record is\n{record}") + record.id = self.remote_missing_counter + self.remote_missing_counter -= 1 + self._add_any(record, self._missing, identifiable) + else: + self._add_any(record, self._existing, identifiable) + + def get_any(self, record: db.Entity, identifiable: Optional[Identifiable] = None): + """ + Check whether this Record was already added. Identity is based on ID, path or Identifiable + represenation + """ + if record.id is not None and record.id in self._id_look_up: + return self._id_look_up[record.id] + if record.path is not None and record.path in self._path_look_up: + return self._path_look_up[record.path] + if (identifiable is not None and identifiable.get_representation() in + self._identifiable_look_up): + return self._identifiable_look_up[identifiable.get_representation()] + + def get_existing(self, record: db.Entity, identifiable: Optional[Identifiable] = None): + """ Check whether this Record exists on the remote server + + Returns: The stored Record + """ + rec = self.get_any(record, identifiable) + if id(rec) in self._existing: + return rec + else: + return None + + def get_missing(self, record: db.Entity, identifiable: Optional[Identifiable] = None): + """ Check whether this Record is missing on the remote server + + Returns: The stored Record + """ + rec = self.get_any(record, identifiable) + if id(rec) in self._missing: + return rec + else: + return None + + # TODO is this needed? + def is_missing(self, record: db.Entity): + """ Check whether this Record is missing on the remote server """ + return id(record) in self._missing + + # TODO is this needed? + def is_existing(self, record: db.Entity): + """ Check whether this Record is existing on the remote server """ + return id(record) in self._existing + + def get_missing_list(self): + """ Return all Records that are missing in the remote server """ + return list(self._missing.values()) + + def get_existing_list(self): + """ Return all Records that exist in the remote server """ + return list(self._existing.values()) + + def _add_any(self, record: db.Entity, lookup, identifiable: Optional[Identifiable] = None): + if record.id is not None: + self._id_look_up[record.id] = record + if record.path is not None: + self._path_look_up[record.path] = record + if identifiable is not None: + self._identifiable_look_up[identifiable.get_representation()] = record + lookup[id(record)] = record + + class Crawler(object): """ Crawler class that encapsulates crawling functions. @@ -256,8 +376,8 @@ class Crawler(object): # The following caches store records, where we checked whether they exist on the remote # server. Since, it is important to know whether they exist or not, we store them into two # different caches. - self.remote_existing_cache = IdentifiedCache() - self.remote_missing_cache = IdentifiedCache() + self.treated_records_lookup = TreatedRecordLookUp() + # TODO does it make sense to have this as member variable? self.securityMode = securityMode # TODO does it make sense to have this as member variable(run_id)? @@ -417,11 +537,15 @@ class Crawler(object): # Entity instead of ID and not cached locally if (isinstance(pvalue, list)): for el in pvalue: - if (isinstance(el, db.Entity) and self.get_from_remote_missing_cache( - self.identifiableAdapter.get_identifiable(el, referencing_entities)) is not None): + if (isinstance(el, db.Entity) and self.treated_records_lookup.get_missing( + el, + self.identifiableAdapter.get_identifiable( + el, referencing_entities)) is not None): return True - if (isinstance(pvalue, db.Entity) and self.get_from_remote_missing_cache( - self.identifiableAdapter.get_identifiable(pvalue, referencing_entities)) is not None): + if (isinstance(pvalue, db.Entity) and self.treated_records_lookup.get_missing( + pvalue, + self.identifiableAdapter.get_identifiable(pvalue, referencing_entities) + ) is not None): # might be checked when reference is resolved return True return False @@ -437,7 +561,8 @@ class Crawler(object): lst = [] for el in p.value: if (isinstance(el, db.Entity) and el.id is None): - cached = self.get_from_any_cache( + cached = self.treated_records_lookup.get_any( + el, self.identifiableAdapter.get_identifiable(el, referencing_entities)) if cached is None: lst.append(el) @@ -459,8 +584,8 @@ class Crawler(object): lst.append(el) p.value = lst if (isinstance(p.value, db.Entity) and p.value.id is None): - cached = self.get_from_any_cache( - self.identifiableAdapter.get_identifiable(p.value, referencing_entities)) + cached = self.treated_records_lookup.get_any(p.value, + self.identifiableAdapter.get_identifiable(p.value, referencing_entities)) if cached is None: continue if not check_identical(cached, p.value, True): @@ -477,64 +602,6 @@ class Crawler(object): ) p.value = cached - def get_from_remote_missing_cache(self, identifiable: Identifiable): - """ - returns the identified record if an identifiable with the same values already exists locally - (Each identifiable that is not found on the remote server, is 'cached' locally to prevent - that the same identifiable exists twice) - """ - if identifiable is None: - raise ValueError("Identifiable has to be given as argument") - - if identifiable in self.remote_missing_cache: - return self.remote_missing_cache[identifiable] - else: - return None - - def get_from_any_cache(self, identifiable: Identifiable): - """ - returns the identifiable if an identifiable with the same values already exists locally - (Each identifiable that is not found on the remote server, is 'cached' locally to prevent - that the same identifiable exists twice) - """ - if identifiable is None: - raise ValueError("Identifiable has to be given as argument") - - if identifiable in self.remote_existing_cache: - return self.remote_existing_cache[identifiable] - elif identifiable in self.remote_missing_cache: - return self.remote_missing_cache[identifiable] - else: - return None - - def add_to_remote_missing_cache(self, record: db.Record, identifiable: Identifiable): - """ - stores the given Record in the remote_missing_cache. - - If identifiable is None, the Record is NOT stored. - """ - self.add_to_cache(record=record, cache=self.remote_missing_cache, - identifiable=identifiable) - - def add_to_remote_existing_cache(self, record: db.Record, identifiable: Identifiable): - """ - stores the given Record in the remote_existing_cache. - - If identifiable is None, the Record is NOT stored. - """ - self.add_to_cache(record=record, cache=self.remote_existing_cache, - identifiable=identifiable) - - def add_to_cache(self, record: db.Record, cache: IdentifiedCache, - identifiable: Identifiable) -> None: - """ - stores the given Record in the given cache. - - If identifiable is None, the Record is NOT stored. - """ - if identifiable is not None: - cache.add(identifiable=identifiable, record=record) - @staticmethod def bend_references_to_new_object(old, new, entities): """ Bend references to the other object @@ -583,9 +650,8 @@ class Crawler(object): return references def split_into_inserts_and_updates(self, ent_list: list[db.Entity]): - to_be_inserted: list[db.Entity] = [] - to_be_updated: list[db.Entity] = [] flat = Crawler.create_flat_list(ent_list) + all_records = list(flat) # TODO: can the following be removed at some point for ent in flat: @@ -598,7 +664,7 @@ class Crawler(object): while resolved_references and len(flat) > 0: resolved_references = False referencing_entities = self.create_reference_mapping( - flat + to_be_updated + try_to_merge_later + to_be_inserted) + all_records) # For each element we try to find out whether we can find it in the server or whether # it does not yet exist. Since a Record may reference other unkown Records it might not @@ -615,14 +681,9 @@ class Crawler(object): record, referencing_entities=referencing_entities) - # TODO remove if the exception is never raised - if record in to_be_inserted: - raise RuntimeError("This should not be reached since treated elements" - "are removed from the list") # 1. Can it be identified via an ID? - elif record.id is not None: - to_be_updated.append(record) - self.add_to_remote_existing_cache(record, identifiable) + if record.id is not None: + self.treated_records_lookup.add(record, identifiable) del flat[i] # 2. Can it be identified via a path? elif record.path is not None: @@ -631,8 +692,7 @@ class Crawler(object): except EmptyUniqueQueryError: existing = None if existing is None: - to_be_inserted.append(record) - self.add_to_remote_missing_cache(record, identifiable) + self.treated_records_lookup.add(record, identifiable) del flat[i] else: record.id = existing.id @@ -640,12 +700,11 @@ class Crawler(object): # Copy over checksum and size too if it is a file record._size = existing._size record._checksum = existing._checksum - to_be_updated.append(record) - self.add_to_remote_existing_cache(record, identifiable) + self.treated_records_lookup.add(record, identifiable) del flat[i] # 3. Is it in the cache of already checked Records? - elif self.get_from_any_cache(identifiable) is not None: - newrecord = self.get_from_any_cache(identifiable) + elif self.treated_records_lookup.get_any(record, identifiable) is not None: + newrecord = self.treated_records_lookup.get_any(record, identifiable) # Since the identifiables are the same, newrecord and record actually describe # the same obejct. # We merge the two in order to prevent loss of information @@ -665,10 +724,9 @@ class Crawler(object): raise Crawler.bend_references_to_new_object( old=record, new=newrecord, - entities=flat + to_be_updated + to_be_inserted + try_to_merge_later + entities=all_records ) - referencing_entities = self.create_reference_mapping( - flat + to_be_updated + try_to_merge_later + to_be_inserted) + referencing_entities = self.create_reference_mapping(all_records) del flat[i] resolved_references = True @@ -680,23 +738,19 @@ class Crawler(object): identifiable)) if identified_record is None: # identifiable does not exist remotely -> record needs to be inserted - to_be_inserted.append(record) - self.add_to_remote_missing_cache(record, identifiable) - del flat[i] + self.treated_records_lookup.add(record, identifiable) else: # side effect record.id = identified_record.id - to_be_updated.append(record) - self.add_to_remote_existing_cache(record, identifiable) - del flat[i] + self.treated_records_lookup.add(record, identifiable) + del flat[i] resolved_references = True # 5. Does it have to be new since a needed reference is missing? # (Is it impossible to check this record because an identifiable references a # missing record?) elif self._has_missing_object_in_references(identifiable, referencing_entities): - to_be_inserted.append(record) - self.add_to_remote_missing_cache(record, identifiable) + self.treated_records_lookup.add(record, identifiable) del flat[i] resolved_references = True @@ -709,7 +763,7 @@ class Crawler(object): identifiable = self.identifiableAdapter.get_identifiable( record, referencing_entities=referencing_entities) - newrecord = self.get_from_any_cache(identifiable) + newrecord = self.treated_records_lookup.get_any(record, identifiable) merge_entities(newrecord, record, merge_id_with_resolved_entity=True) if len(flat) > 0: circle = self.detect_circular_dependency(flat) @@ -724,7 +778,16 @@ class Crawler(object): f"Could not finish split_into_inserts_and_updates. Circular dependency: " f"{circle is not None}") - return to_be_inserted, to_be_updated + # remove negative IDs + missing = self.treated_records_lookup.get_missing_list() + for el in missing: + if el.id is None: + raise RuntimeError("This should not happen") # TODO remove + if el.id >= 0: + raise RuntimeError("This should not happen") # TODO remove + el.id = None + + return (missing, self.treated_records_lookup.get_existing_list()) def replace_entities_with_ids(self, rec: db.Record): for el in rec.properties: @@ -1051,7 +1114,6 @@ class Crawler(object): crawled_data = self.crawled_data to_be_inserted, to_be_updated = self.split_into_inserts_and_updates(crawled_data) - referencing_entities = self.create_reference_mapping(to_be_updated + to_be_inserted) for el in to_be_updated: # all entity objects are replaced by their IDs except for the not yet inserted ones diff --git a/src/caoscrawler/identified_cache.py b/src/caoscrawler/identified_cache.py deleted file mode 100644 index aa2d82f8e66c738e737c62f3cc68eaf60127e28b..0000000000000000000000000000000000000000 --- a/src/caoscrawler/identified_cache.py +++ /dev/null @@ -1,65 +0,0 @@ -#!/usr/bin/env python3 -# encoding: utf-8 -# -# ** header v3.0 -# This file is a part of the CaosDB Project. -# -# Copyright (C) 2021 Indiscale GmbH <info@indiscale.com> -# Copyright (C) 2021 Henrik tom Wörden <h.tomwoerden@indiscale.com> -# -# This program is free software: you can redistribute it and/or modify -# it under the terms of the GNU Affero General Public License as -# published by the Free Software Foundation, either version 3 of the -# License, or (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU Affero General Public License for more details. -# -# You should have received a copy of the GNU Affero General Public License -# along with this program. If not, see <https://www.gnu.org/licenses/>. -# -# ** end header -# - - -""" -see class docstring -""" - -from .identifiable import Identifiable -import caosdb as db - - -class IdentifiedCache(object): - """ - This class is like a dictionary where the keys are Identifiables. When you check whether an - Identifiable exists as key this class returns True not only if that exact Python object is - used as a key, but if an Identifiable is used as key that is **equal** to the one being - considered (see __eq__ function of Identifiable). Similarly, if you do `cache[identifiable]` - you get the Record where the key is an Identifiable that is equal to the one in the rectangular - brackets. - - This class is used for Records where we checked the existence in a remote server using - identifiables. If the Record was found, this means that we identified the corresponding Record - in the remote server and the ID of the local object can be set. - To prevent querying the server again and again for the same objects, this cache allows storing - Records that were found on a remote server and those that were not (typically in separate - caches). - """ - - def __init__(self): - self._cache = {} - self._identifiables = [] - - def __contains__(self, identifiable: Identifiable): - return identifiable in self._identifiables - - def __getitem__(self, identifiable: db.Record): - index = self._identifiables.index(identifiable) - return self._cache[id(self._identifiables[index])] - - def add(self, record: db.Record, identifiable: Identifiable): - self._cache[id(identifiable)] = record - self._identifiables.append(identifiable) diff --git a/unittests/test_crawler.py b/unittests/test_crawler.py index fbf98346e59b0cbec88f17398eff41f26c423dee..7c62b004b207da42a37bf26dce295810ec9e3075 100644 --- a/unittests/test_crawler.py +++ b/unittests/test_crawler.py @@ -38,8 +38,9 @@ import caosdb as db import caosdb.common.models as dbmodels import pytest import yaml -from caoscrawler.crawl import (Crawler, SecurityMode, _treat_deprecated_prefix, - crawler_main, split_restricted_path) +from caoscrawler.crawl import (Crawler, SecurityMode, TreatedRecordLookUp, + _treat_deprecated_prefix, crawler_main, + split_restricted_path) from caoscrawler.debug_tree import DebugTree from caoscrawler.identifiable import Identifiable from caoscrawler.identifiable_adapters import (CaosDBIdentifiableAdapter, @@ -247,8 +248,8 @@ def test_split_into_inserts_and_updates_single(crawler_mocked_identifiable_retri entlist = [db.Record(name="A").add_parent( "C"), db.Record(name="B").add_parent("C")] - assert crawler.get_from_any_cache(identlist[0]) is None - assert crawler.get_from_any_cache(identlist[1]) is None + assert crawler.treated_records_lookup.get_any(entlist[0], identlist[0]) is None + assert crawler.treated_records_lookup.get_any(entlist[0], identlist[0]) is None assert not crawler._has_reference_value_without_id(identlist[0]) assert not crawler._has_reference_value_without_id(identlist[1]) assert crawler.identifiableAdapter.retrieve_identified_record_for_record( @@ -387,8 +388,8 @@ def test_has_missing_object_in_references(): assert not crawler._has_missing_object_in_references( Identifiable(name="D", record_type="RTD", properties={'d': 123, 'e': a}), []) - crawler.add_to_remote_missing_cache(a, Identifiable(name="C", record_type="RTC", - properties={'d': 12311})) + crawler.treated_records_lookup.add(a, Identifiable(name="C", record_type="RTC", + properties={'d': 12311})) # one ref with id one with Entity without id but in cache -> check assert crawler._has_missing_object_in_references( Identifiable(name="D", record_type="RTD", properties={'d': 123, 'e': a}), []) @@ -667,8 +668,8 @@ def test_split_into_inserts_and_updates_backref(crawler_mocked_for_backref_test) crawler.split_into_inserts_and_updates([db.Record(name="B").add_parent("C")]) # identifiables were not yet checked - assert crawler.get_from_any_cache(identlist[0]) is None - assert crawler.get_from_any_cache(identlist[1]) is None + assert crawler.treated_records_lookup.get_any(entlist[1], identlist[0]) is None + assert crawler.treated_records_lookup.get_any(entlist[0], identlist[1]) is None # one with reference, one without assert not crawler._has_reference_value_without_id(identlist[0]) assert crawler._has_reference_value_without_id(identlist[1]) @@ -964,3 +965,59 @@ def test_replace_name_with_referenced_entity(): assert isinstance(prop.value[2], int) assert prop.value[2] == test_id assert caoscrawler.crawl.cached_get_entity_by.call_count == 3 + + +def test_treated_record_lookup(): + trlu = TreatedRecordLookUp() + exist = db.Record(id=1) + trlu.add(exist) + assert len(trlu._existing) == 1 + # was added to existing + assert trlu._existing[id(exist)] is exist + # is in ID lookup + assert trlu._id_look_up[exist.id] is exist + # can be accessed via get_existing + assert trlu.get_existing(db.Record(id=1)) is exist + # assert trlu.is_existing(db.Record(id=1)) # TODO remove? + # assert not trlu.is_missing(db.Record(id=1)) # TODO remove? + + miss = db.Record() + # exception when identifiable is missing + with raises(RuntimeError): + trlu.add(miss) + ident = Identifiable(name='a') + trlu.add(miss, ident) + # was added to missing + assert trlu._missing[id(miss)] is miss + # is in ident lookup + assert trlu._identifiable_look_up[ident.get_representation()] is miss + # can be accessed via get_missing + assert trlu.get_missing(db.Record(), Identifiable(name='a')) is miss + # assert not trlu.is_existing(db.Record()) # TODO remove? + # assert trlu.is_missing(db.Record()) # TODO remove? + + fi = db.File(path='a', id=2) + trlu.add(fi) + assert len(trlu._existing) == 2 + # was added to existing + assert trlu._existing[id(fi)] is fi + # is in ID lookup + assert trlu._id_look_up[fi.id] is fi + # is in path lookup + assert trlu._path_look_up[fi.path] is fi + # can be accessed via get_existing + assert trlu.get_existing(fi) is fi + + all_exi = trlu.get_existing_list() + assert fi in all_exi + assert exist in all_exi + all_mi = trlu.get_missing_list() + assert miss in all_mi + + # If a Record was added using the ID, the ID must be used to identify it even though later an + # identifiable may be passed as well + assert trlu.get_any(exist, Identifiable(name='b')) is exist + + fi2 = db.File(path='b') + trlu.add(fi2) + assert trlu.get_any(db.File(path='b'), Identifiable(name='c')) is fi2 diff --git a/unittests/test_identifiable.py b/unittests/test_identifiable.py index 3f3c606b163df4dc238be9a669fd31eb630a582d..28bdb7a2ad75d5b9389b47ca3f0ec2b2e2a1404b 100644 --- a/unittests/test_identifiable.py +++ b/unittests/test_identifiable.py @@ -24,10 +24,9 @@ test identifiable module """ -import pytest import caosdb as db +import pytest from caoscrawler.identifiable import Identifiable -from caoscrawler.identified_cache import IdentifiedCache def test_create_hashable_string(): diff --git a/unittests/test_identified_cache.py b/unittests/test_identified_cache.py deleted file mode 100644 index 4ed7c55c7326415308917e20e9f391b17b07ad87..0000000000000000000000000000000000000000 --- a/unittests/test_identified_cache.py +++ /dev/null @@ -1,44 +0,0 @@ -#!/usr/bin/env python3 -# encoding: utf-8 -# -# ** header v3.0 -# This file is a part of the CaosDB Project. -# -# Copyright (C) 2021 Indiscale GmbH <info@indiscale.com> -# Copyright (C) 2021 Henrik tom Wörden <h.tomwoerden@indiscale.com> -# -# This program is free software: you can redistribute it and/or modify -# it under the terms of the GNU Affero General Public License as -# published by the Free Software Foundation, either version 3 of the -# License, or (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU Affero General Public License for more details. -# -# You should have received a copy of the GNU Affero General Public License -# along with this program. If not, see <https://www.gnu.org/licenses/>. -# -# ** end header -# - -""" -test identified_cache module -""" - -import caosdb as db -from caoscrawler.identifiable import Identifiable -from caoscrawler.identified_cache import IdentifiedCache - - -def test_IdentifiedCache(): - ident = Identifiable(name="A", record_type="B") - record = db.Record("A").add_parent("B").add_property('b', 5) - cache = IdentifiedCache() - assert ident not in cache - cache.add(record=record, identifiable=ident) - assert ident in cache - assert cache[ident] is record - assert Identifiable(name="A", record_type="C") != Identifiable(name="A", record_type="B") - assert Identifiable(name="A", record_type="C") not in cache