Skip to content
Snippets Groups Projects
Commit 02666380 authored by Henrik tom Wörden's avatar Henrik tom Wörden
Browse files

ENH: Add new class to track treated Records during sync

This replaces the old IdentifiableCahce(s)
parent 96ad981b
Branches
Tags
2 merge requests!160STY: styling,!151New class to track treated Records during sync
Pipeline #47309 passed
......@@ -64,7 +64,6 @@ from .identifiable import Identifiable
from .identifiable_adapters import (CaosDBIdentifiableAdapter,
IdentifiableAdapter,
LocalStorageIdentifiableAdapter)
from .identified_cache import IdentifiedCache
from .logging import configure_server_side_logging
from .macros import defmacro_constructor, macro_constructor
from .scanner import (create_converter_registry, initialize_converters,
......@@ -220,6 +219,127 @@ class SecurityMode(Enum):
UPDATE = 2
class TreatedRecordLookUp():
"""tracks Records and Identifiables for which it was checked whether they exist in the remote
server
For a given Record it can be checked, whether it exists in the remote sever if
- it has a (valid) ID
- it has a (valid) path (FILEs only)
- an identifiable can be created for the Record.
Records are added by calling the `add` function and they are then added to the internal
existing or missing list depending on whether the Record has a valid ID.
Additionally, the Record is added to three look up dicts. The keys of those are paths, IDs and
the representation of the identifiables.
"""
def __init__(self):
self._id_look_up: dict[int, db.Entity] = {}
self._path_look_up: dict[str, db.Entity] = {}
self._identifiable_look_up: dict[str, db.Entity] = {}
self.remote_missing_counter = -1
self._missing: dict[int, db.Entity] = {}
self._existing: dict[int, db.Entity] = {}
# TODO is this needed?
def by_path(self, path):
if path in self._path_look_up:
return self._path_look_up[path]
else:
return None
# TODO is this needed?
def by_ID(self, eid):
if eid in self._id_look_up:
return self._id_look_up[path]
else:
return None
# TODO is this needed?
def by_identifiable(self, identifiable):
if identifiable.get_representation() in self._id_look_up:
return self._id_look_up[identifiable.get_representation()]
else:
return None
def add(self, record: db.Entity, identifiable: Optional[Identifiable] = None):
"""
Add a Record that was treated, such that it is contained in the internal look up dicts
"""
if record.id is None:
if record.path is None and identifiable is None:
raise RuntimeError("Record must have ID or path or an identifiable must be given."
f"Record is\n{record}")
record.id = self.remote_missing_counter
self.remote_missing_counter -= 1
self._add_any(record, self._missing, identifiable)
else:
self._add_any(record, self._existing, identifiable)
def get_any(self, record: db.Entity, identifiable: Optional[Identifiable] = None):
"""
Check whether this Record was already added. Identity is based on ID, path or Identifiable
represenation
"""
if record.id is not None and record.id in self._id_look_up:
return self._id_look_up[record.id]
if record.path is not None and record.path in self._path_look_up:
return self._path_look_up[record.path]
if (identifiable is not None and identifiable.get_representation() in
self._identifiable_look_up):
return self._identifiable_look_up[identifiable.get_representation()]
def get_existing(self, record: db.Entity, identifiable: Optional[Identifiable] = None):
""" Check whether this Record exists on the remote server
Returns: The stored Record
"""
rec = self.get_any(record, identifiable)
if id(rec) in self._existing:
return rec
else:
return None
def get_missing(self, record: db.Entity, identifiable: Optional[Identifiable] = None):
""" Check whether this Record is missing on the remote server
Returns: The stored Record
"""
rec = self.get_any(record, identifiable)
if id(rec) in self._missing:
return rec
else:
return None
# TODO is this needed?
def is_missing(self, record: db.Entity):
""" Check whether this Record is missing on the remote server """
return id(record) in self._missing
# TODO is this needed?
def is_existing(self, record: db.Entity):
""" Check whether this Record is existing on the remote server """
return id(record) in self._existing
def get_missing_list(self):
""" Return all Records that are missing in the remote server """
return list(self._missing.values())
def get_existing_list(self):
""" Return all Records that exist in the remote server """
return list(self._existing.values())
def _add_any(self, record: db.Entity, lookup, identifiable: Optional[Identifiable] = None):
if record.id is not None:
self._id_look_up[record.id] = record
if record.path is not None:
self._path_look_up[record.path] = record
if identifiable is not None:
self._identifiable_look_up[identifiable.get_representation()] = record
lookup[id(record)] = record
class Crawler(object):
"""
Crawler class that encapsulates crawling functions.
......@@ -256,8 +376,8 @@ class Crawler(object):
# The following caches store records, where we checked whether they exist on the remote
# server. Since, it is important to know whether they exist or not, we store them into two
# different caches.
self.remote_existing_cache = IdentifiedCache()
self.remote_missing_cache = IdentifiedCache()
self.treated_records_lookup = TreatedRecordLookUp()
# TODO does it make sense to have this as member variable?
self.securityMode = securityMode
# TODO does it make sense to have this as member variable(run_id)?
......@@ -417,11 +537,15 @@ class Crawler(object):
# Entity instead of ID and not cached locally
if (isinstance(pvalue, list)):
for el in pvalue:
if (isinstance(el, db.Entity) and self.get_from_remote_missing_cache(
self.identifiableAdapter.get_identifiable(el, referencing_entities)) is not None):
if (isinstance(el, db.Entity) and self.treated_records_lookup.get_missing(
el,
self.identifiableAdapter.get_identifiable(
el, referencing_entities)) is not None):
return True
if (isinstance(pvalue, db.Entity) and self.get_from_remote_missing_cache(
self.identifiableAdapter.get_identifiable(pvalue, referencing_entities)) is not None):
if (isinstance(pvalue, db.Entity) and self.treated_records_lookup.get_missing(
pvalue,
self.identifiableAdapter.get_identifiable(pvalue, referencing_entities)
) is not None):
# might be checked when reference is resolved
return True
return False
......@@ -437,7 +561,8 @@ class Crawler(object):
lst = []
for el in p.value:
if (isinstance(el, db.Entity) and el.id is None):
cached = self.get_from_any_cache(
cached = self.treated_records_lookup.get_any(
el,
self.identifiableAdapter.get_identifiable(el, referencing_entities))
if cached is None:
lst.append(el)
......@@ -459,7 +584,7 @@ class Crawler(object):
lst.append(el)
p.value = lst
if (isinstance(p.value, db.Entity) and p.value.id is None):
cached = self.get_from_any_cache(
cached = self.treated_records_lookup.get_any(p.value,
self.identifiableAdapter.get_identifiable(p.value, referencing_entities))
if cached is None:
continue
......@@ -477,64 +602,6 @@ class Crawler(object):
)
p.value = cached
def get_from_remote_missing_cache(self, identifiable: Identifiable):
"""
returns the identified record if an identifiable with the same values already exists locally
(Each identifiable that is not found on the remote server, is 'cached' locally to prevent
that the same identifiable exists twice)
"""
if identifiable is None:
raise ValueError("Identifiable has to be given as argument")
if identifiable in self.remote_missing_cache:
return self.remote_missing_cache[identifiable]
else:
return None
def get_from_any_cache(self, identifiable: Identifiable):
"""
returns the identifiable if an identifiable with the same values already exists locally
(Each identifiable that is not found on the remote server, is 'cached' locally to prevent
that the same identifiable exists twice)
"""
if identifiable is None:
raise ValueError("Identifiable has to be given as argument")
if identifiable in self.remote_existing_cache:
return self.remote_existing_cache[identifiable]
elif identifiable in self.remote_missing_cache:
return self.remote_missing_cache[identifiable]
else:
return None
def add_to_remote_missing_cache(self, record: db.Record, identifiable: Identifiable):
"""
stores the given Record in the remote_missing_cache.
If identifiable is None, the Record is NOT stored.
"""
self.add_to_cache(record=record, cache=self.remote_missing_cache,
identifiable=identifiable)
def add_to_remote_existing_cache(self, record: db.Record, identifiable: Identifiable):
"""
stores the given Record in the remote_existing_cache.
If identifiable is None, the Record is NOT stored.
"""
self.add_to_cache(record=record, cache=self.remote_existing_cache,
identifiable=identifiable)
def add_to_cache(self, record: db.Record, cache: IdentifiedCache,
identifiable: Identifiable) -> None:
"""
stores the given Record in the given cache.
If identifiable is None, the Record is NOT stored.
"""
if identifiable is not None:
cache.add(identifiable=identifiable, record=record)
@staticmethod
def bend_references_to_new_object(old, new, entities):
""" Bend references to the other object
......@@ -583,9 +650,8 @@ class Crawler(object):
return references
def split_into_inserts_and_updates(self, ent_list: list[db.Entity]):
to_be_inserted: list[db.Entity] = []
to_be_updated: list[db.Entity] = []
flat = Crawler.create_flat_list(ent_list)
all_records = list(flat)
# TODO: can the following be removed at some point
for ent in flat:
......@@ -598,7 +664,7 @@ class Crawler(object):
while resolved_references and len(flat) > 0:
resolved_references = False
referencing_entities = self.create_reference_mapping(
flat + to_be_updated + try_to_merge_later + to_be_inserted)
all_records)
# For each element we try to find out whether we can find it in the server or whether
# it does not yet exist. Since a Record may reference other unkown Records it might not
......@@ -615,14 +681,9 @@ class Crawler(object):
record,
referencing_entities=referencing_entities)
# TODO remove if the exception is never raised
if record in to_be_inserted:
raise RuntimeError("This should not be reached since treated elements"
"are removed from the list")
# 1. Can it be identified via an ID?
elif record.id is not None:
to_be_updated.append(record)
self.add_to_remote_existing_cache(record, identifiable)
if record.id is not None:
self.treated_records_lookup.add(record, identifiable)
del flat[i]
# 2. Can it be identified via a path?
elif record.path is not None:
......@@ -631,8 +692,7 @@ class Crawler(object):
except EmptyUniqueQueryError:
existing = None
if existing is None:
to_be_inserted.append(record)
self.add_to_remote_missing_cache(record, identifiable)
self.treated_records_lookup.add(record, identifiable)
del flat[i]
else:
record.id = existing.id
......@@ -640,12 +700,11 @@ class Crawler(object):
# Copy over checksum and size too if it is a file
record._size = existing._size
record._checksum = existing._checksum
to_be_updated.append(record)
self.add_to_remote_existing_cache(record, identifiable)
self.treated_records_lookup.add(record, identifiable)
del flat[i]
# 3. Is it in the cache of already checked Records?
elif self.get_from_any_cache(identifiable) is not None:
newrecord = self.get_from_any_cache(identifiable)
elif self.treated_records_lookup.get_any(record, identifiable) is not None:
newrecord = self.treated_records_lookup.get_any(record, identifiable)
# Since the identifiables are the same, newrecord and record actually describe
# the same obejct.
# We merge the two in order to prevent loss of information
......@@ -665,10 +724,9 @@ class Crawler(object):
raise
Crawler.bend_references_to_new_object(
old=record, new=newrecord,
entities=flat + to_be_updated + to_be_inserted + try_to_merge_later
entities=all_records
)
referencing_entities = self.create_reference_mapping(
flat + to_be_updated + try_to_merge_later + to_be_inserted)
referencing_entities = self.create_reference_mapping(all_records)
del flat[i]
resolved_references = True
......@@ -680,14 +738,11 @@ class Crawler(object):
identifiable))
if identified_record is None:
# identifiable does not exist remotely -> record needs to be inserted
to_be_inserted.append(record)
self.add_to_remote_missing_cache(record, identifiable)
del flat[i]
self.treated_records_lookup.add(record, identifiable)
else:
# side effect
record.id = identified_record.id
to_be_updated.append(record)
self.add_to_remote_existing_cache(record, identifiable)
self.treated_records_lookup.add(record, identifiable)
del flat[i]
resolved_references = True
......@@ -695,8 +750,7 @@ class Crawler(object):
# (Is it impossible to check this record because an identifiable references a
# missing record?)
elif self._has_missing_object_in_references(identifiable, referencing_entities):
to_be_inserted.append(record)
self.add_to_remote_missing_cache(record, identifiable)
self.treated_records_lookup.add(record, identifiable)
del flat[i]
resolved_references = True
......@@ -709,7 +763,7 @@ class Crawler(object):
identifiable = self.identifiableAdapter.get_identifiable(
record,
referencing_entities=referencing_entities)
newrecord = self.get_from_any_cache(identifiable)
newrecord = self.treated_records_lookup.get_any(record, identifiable)
merge_entities(newrecord, record, merge_id_with_resolved_entity=True)
if len(flat) > 0:
circle = self.detect_circular_dependency(flat)
......@@ -724,7 +778,16 @@ class Crawler(object):
f"Could not finish split_into_inserts_and_updates. Circular dependency: "
f"{circle is not None}")
return to_be_inserted, to_be_updated
# remove negative IDs
missing = self.treated_records_lookup.get_missing_list()
for el in missing:
if el.id is None:
raise RuntimeError("This should not happen") # TODO remove
if el.id >= 0:
raise RuntimeError("This should not happen") # TODO remove
el.id = None
return (missing, self.treated_records_lookup.get_existing_list())
def replace_entities_with_ids(self, rec: db.Record):
for el in rec.properties:
......@@ -1051,7 +1114,6 @@ class Crawler(object):
crawled_data = self.crawled_data
to_be_inserted, to_be_updated = self.split_into_inserts_and_updates(crawled_data)
referencing_entities = self.create_reference_mapping(to_be_updated + to_be_inserted)
for el in to_be_updated:
# all entity objects are replaced by their IDs except for the not yet inserted ones
......
#!/usr/bin/env python3
# encoding: utf-8
#
# ** header v3.0
# This file is a part of the CaosDB Project.
#
# Copyright (C) 2021 Indiscale GmbH <info@indiscale.com>
# Copyright (C) 2021 Henrik tom Wörden <h.tomwoerden@indiscale.com>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
#
# ** end header
#
"""
see class docstring
"""
from .identifiable import Identifiable
import caosdb as db
class IdentifiedCache(object):
"""
This class is like a dictionary where the keys are Identifiables. When you check whether an
Identifiable exists as key this class returns True not only if that exact Python object is
used as a key, but if an Identifiable is used as key that is **equal** to the one being
considered (see __eq__ function of Identifiable). Similarly, if you do `cache[identifiable]`
you get the Record where the key is an Identifiable that is equal to the one in the rectangular
brackets.
This class is used for Records where we checked the existence in a remote server using
identifiables. If the Record was found, this means that we identified the corresponding Record
in the remote server and the ID of the local object can be set.
To prevent querying the server again and again for the same objects, this cache allows storing
Records that were found on a remote server and those that were not (typically in separate
caches).
"""
def __init__(self):
self._cache = {}
self._identifiables = []
def __contains__(self, identifiable: Identifiable):
return identifiable in self._identifiables
def __getitem__(self, identifiable: db.Record):
index = self._identifiables.index(identifiable)
return self._cache[id(self._identifiables[index])]
def add(self, record: db.Record, identifiable: Identifiable):
self._cache[id(identifiable)] = record
self._identifiables.append(identifiable)
......@@ -38,8 +38,9 @@ import caosdb as db
import caosdb.common.models as dbmodels
import pytest
import yaml
from caoscrawler.crawl import (Crawler, SecurityMode, _treat_deprecated_prefix,
crawler_main, split_restricted_path)
from caoscrawler.crawl import (Crawler, SecurityMode, TreatedRecordLookUp,
_treat_deprecated_prefix, crawler_main,
split_restricted_path)
from caoscrawler.debug_tree import DebugTree
from caoscrawler.identifiable import Identifiable
from caoscrawler.identifiable_adapters import (CaosDBIdentifiableAdapter,
......@@ -247,8 +248,8 @@ def test_split_into_inserts_and_updates_single(crawler_mocked_identifiable_retri
entlist = [db.Record(name="A").add_parent(
"C"), db.Record(name="B").add_parent("C")]
assert crawler.get_from_any_cache(identlist[0]) is None
assert crawler.get_from_any_cache(identlist[1]) is None
assert crawler.treated_records_lookup.get_any(entlist[0], identlist[0]) is None
assert crawler.treated_records_lookup.get_any(entlist[0], identlist[0]) is None
assert not crawler._has_reference_value_without_id(identlist[0])
assert not crawler._has_reference_value_without_id(identlist[1])
assert crawler.identifiableAdapter.retrieve_identified_record_for_record(
......@@ -387,7 +388,7 @@ def test_has_missing_object_in_references():
assert not crawler._has_missing_object_in_references(
Identifiable(name="D", record_type="RTD", properties={'d': 123, 'e': a}), [])
crawler.add_to_remote_missing_cache(a, Identifiable(name="C", record_type="RTC",
crawler.treated_records_lookup.add(a, Identifiable(name="C", record_type="RTC",
properties={'d': 12311}))
# one ref with id one with Entity without id but in cache -> check
assert crawler._has_missing_object_in_references(
......@@ -667,8 +668,8 @@ def test_split_into_inserts_and_updates_backref(crawler_mocked_for_backref_test)
crawler.split_into_inserts_and_updates([db.Record(name="B").add_parent("C")])
# identifiables were not yet checked
assert crawler.get_from_any_cache(identlist[0]) is None
assert crawler.get_from_any_cache(identlist[1]) is None
assert crawler.treated_records_lookup.get_any(entlist[1], identlist[0]) is None
assert crawler.treated_records_lookup.get_any(entlist[0], identlist[1]) is None
# one with reference, one without
assert not crawler._has_reference_value_without_id(identlist[0])
assert crawler._has_reference_value_without_id(identlist[1])
......@@ -964,3 +965,59 @@ def test_replace_name_with_referenced_entity():
assert isinstance(prop.value[2], int)
assert prop.value[2] == test_id
assert caoscrawler.crawl.cached_get_entity_by.call_count == 3
def test_treated_record_lookup():
trlu = TreatedRecordLookUp()
exist = db.Record(id=1)
trlu.add(exist)
assert len(trlu._existing) == 1
# was added to existing
assert trlu._existing[id(exist)] is exist
# is in ID lookup
assert trlu._id_look_up[exist.id] is exist
# can be accessed via get_existing
assert trlu.get_existing(db.Record(id=1)) is exist
# assert trlu.is_existing(db.Record(id=1)) # TODO remove?
# assert not trlu.is_missing(db.Record(id=1)) # TODO remove?
miss = db.Record()
# exception when identifiable is missing
with raises(RuntimeError):
trlu.add(miss)
ident = Identifiable(name='a')
trlu.add(miss, ident)
# was added to missing
assert trlu._missing[id(miss)] is miss
# is in ident lookup
assert trlu._identifiable_look_up[ident.get_representation()] is miss
# can be accessed via get_missing
assert trlu.get_missing(db.Record(), Identifiable(name='a')) is miss
# assert not trlu.is_existing(db.Record()) # TODO remove?
# assert trlu.is_missing(db.Record()) # TODO remove?
fi = db.File(path='a', id=2)
trlu.add(fi)
assert len(trlu._existing) == 2
# was added to existing
assert trlu._existing[id(fi)] is fi
# is in ID lookup
assert trlu._id_look_up[fi.id] is fi
# is in path lookup
assert trlu._path_look_up[fi.path] is fi
# can be accessed via get_existing
assert trlu.get_existing(fi) is fi
all_exi = trlu.get_existing_list()
assert fi in all_exi
assert exist in all_exi
all_mi = trlu.get_missing_list()
assert miss in all_mi
# If a Record was added using the ID, the ID must be used to identify it even though later an
# identifiable may be passed as well
assert trlu.get_any(exist, Identifiable(name='b')) is exist
fi2 = db.File(path='b')
trlu.add(fi2)
assert trlu.get_any(db.File(path='b'), Identifiable(name='c')) is fi2
......@@ -24,10 +24,9 @@
test identifiable module
"""
import pytest
import caosdb as db
import pytest
from caoscrawler.identifiable import Identifiable
from caoscrawler.identified_cache import IdentifiedCache
def test_create_hashable_string():
......
#!/usr/bin/env python3
# encoding: utf-8
#
# ** header v3.0
# This file is a part of the CaosDB Project.
#
# Copyright (C) 2021 Indiscale GmbH <info@indiscale.com>
# Copyright (C) 2021 Henrik tom Wörden <h.tomwoerden@indiscale.com>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
#
# ** end header
#
"""
test identified_cache module
"""
import caosdb as db
from caoscrawler.identifiable import Identifiable
from caoscrawler.identified_cache import IdentifiedCache
def test_IdentifiedCache():
ident = Identifiable(name="A", record_type="B")
record = db.Record("A").add_parent("B").add_property('b', 5)
cache = IdentifiedCache()
assert ident not in cache
cache.add(record=record, identifiable=ident)
assert ident in cache
assert cache[ident] is record
assert Identifiable(name="A", record_type="C") != Identifiable(name="A", record_type="B")
assert Identifiable(name="A", record_type="C") not in cache
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment