Skip to content
Snippets Groups Projects
Commit a9e84f9e authored by Henrik tom Wörden's avatar Henrik tom Wörden
Browse files

MAINT: refactoring

parent 83d48536
No related branches found
No related tags found
1 merge request!53Release 0.1
......@@ -73,6 +73,7 @@ from .identifiable_adapters import IdentifiableAdapter, LocalStorageIdentifiable
from collections import defaultdict
from typing import Union, Any, Optional
from caosdb.apiutils import compare_entities
from copy import deepcopy
class Crawler(object):
......@@ -194,7 +195,7 @@ class Crawler(object):
def can_be_checked_externally(self, record: db.Record):
"""
Returns True if there is at least one property in record which:
Returns False if there is at least one property in record which:
a) is a reference property AND
b) where the value is set to a db.Entity (instead of an ID) AND
c) where the ID of the value is not set (to an integer)
......@@ -204,7 +205,7 @@ class Crawler(object):
for p in record.properties:
# TODO: implement for lists?
if (is_reference(p) and isinstance(p.value, db.Entity)
and not isinstance(p.value.id, int)):
and p.value.id is None):
return False
return True
......@@ -221,7 +222,7 @@ class Crawler(object):
flat.append(p.value)
self.create_flat_list([p.value], flat)
def cannot_be_checked_remotely(self, record):
def all_references_are_existing_already(self, record):
"""
returns true if all references either have IDs or were checked remotely and not found (i.e.
they exist in the local cache)
......@@ -230,13 +231,14 @@ class Crawler(object):
if (is_reference(p)
# Entity instead of ID and not cached locally
# TODO: implement for lists?
and (isinstance(p.value, db.Entity)
and self.get_identifiable_from_local_cache(p.value) is None)):
and isinstance(p.value, db.Entity)
and p.value.id is None
and self.get_identified_record_from_local_cache(p.value) is None):
# might be checked when reference is resolved
return False
return True
def get_identifiable_from_local_cache(self, identifiable: db.Record):
def get_identified_record_from_local_cache(self, identifiable: db.Record):
"""
returns the identifiable if an identifiable with the same values already exists locally
(Each identifiable that is not found on the remote server, is 'cached' locally to prevent
......@@ -244,7 +246,7 @@ class Crawler(object):
"""
raise NotImplementedError()
def add_identifiable_to_local_cache(self, identifiable: db.Record):
def add_identified_record_to_local_cache(self, identifiable: db.Record):
"""
adds the given identifiable to the local cache
......@@ -265,6 +267,7 @@ class Crawler(object):
self.create_flat_list(ent_list, flat)
resolved_references = True
# flat contains Entities which could not yet be checked against the remote server
while resolved_references and len(flat) > 0:
resolved_references = False
......@@ -275,40 +278,38 @@ class Crawler(object):
if (record.id is not None or record in to_be_inserted):
raise Exception("This should not be reached since treated elements are removed"
" from the list")
# all references need to be IDs that exist on the remote server
elif self.can_be_checked_externally(record):
# Check the local cache first
if self.get_identifiable_from_local_cache(record) is not None:
print(record, "is in cache ")
# Check the local cache first for duplicate
elif self.get_identified_record_from_local_cache(record) is not None:
# This record is a duplicate that can be removed. Make sure we do not lose
# information
# Update an (local) identified record that will be inserted
self.copy_attributes(
fro=record, to=self.get_identifiable_from_local_cache(record))
fro=record, to=self.get_identified_record_from_local_cache(record))
del flat[i]
continue
# Check remotely
identified = self.identifiableAdapter.retrieve_identifiable(record)
print(record)
if identified is None:
print("not identified")
# all references need to be IDs that exist on the remote server
elif self.can_be_checked_externally(record):
# Check remotely
identified_record = self.identifiableAdapter.retrieve_identifiable(
deepcopy(record))
if identified_record is None:
# identifiable does not exist remote
to_be_inserted.append(record)
self.add_identifiable_to_local_cache(record)
self.add_identified_record_to_local_cache(record)
del flat[i]
else:
print("identified")
record.id = identified.id
# side effect
record.id = identified_record.id
to_be_updated.append(record)
del flat[i]
resolved_references = True
# e.g. references an identifiable that does not exist remotely
elif self.cannot_be_checked_remotely(record):
print("cannot be checked")
print(record)
elif self.all_references_are_existing_already(record):
to_be_inserted.append(record)
self.add_identifiable_to_local_cache(record)
self.add_identified_record_to_local_cache(record)
del flat[i]
resolved_references = True
if len(flat) > 0:
......@@ -316,47 +317,18 @@ class Crawler(object):
return to_be_inserted, to_be_updated
def _synchronize(self, updateList: list[db.Record]):
def remove_unnecessary_updates(self, updateList: list[db.Record]):
"""
This function applies several stages:
1) Retrieve identifiables for all records in updateList.
2) Compare updateList with existing records.
3) Insert and update records based on the set of identified differences.
This function makes use of an IdentifiableAdapter which is used to retrieve
register and retrieve identifiables.
Return the final insertList and updateList as tuple.
checks whether all relevant attributes (especially Property values) are equal
"""
if self.identifiableAdapter is None:
raise RuntimeError("Should not happen.")
to_be_inserted, to_be_updated = self.split_into_inserts_and_updates(updateList)
# TODO the following needs to be moved
for i in reversed(range(len(updateList))):
record = updateList[i]
# if there is no identifiable, move record from update list to insert list:
if identifiable is None:
insertList.append(record)
del updateList[i]
# also update all references to this entity to get the value -x
# where -x is the id of the new entity in insert list
# also add this entity directly to the list of known entites of the current
# identifiable adapter.
# any reference to this entity does not need to be compared anymore, as it
# definitely needs updating (the new value cannot have existed before)
continue
identifiable = self.get_identifiable(record)
comp = compare_entities(record, identifiable)
identical = True
for j in range(2):
# TODO: should be implemented elsewhere
# TODO: should be implemented elsewhere (?)
for label in ("parents", ):
if len(comp[j][label]) > 0:
identical = False
......@@ -384,9 +356,30 @@ class Crawler(object):
else:
pass
def _synchronize(self, updateList: list[db.Record]):
"""
This function applies several stages:
1) Retrieve identifiables for all records in updateList.
2) Compare updateList with existing records.
3) Insert and update records based on the set of identified differences.
This function makes use of an IdentifiableAdapter which is used to retrieve
register and retrieve identifiables.
Return the final insertList and updateList as tuple.
"""
if self.identifiableAdapter is None:
raise RuntimeError("Should not happen.")
to_be_inserted, to_be_updated = self.split_into_inserts_and_updates(updateList)
# remove unnecessary updates from list
self.remove_unnecessary_updates(to_be_updated)
# TODO
# self.exeute_inserts_in_list(insertList)
# self.exeute_updates_in_list(updateList)
self.execute_inserts_in_list(insertList)
self.execute_updates_in_list(updateList)
return (insertList, updateList)
......
......@@ -150,6 +150,7 @@ class IdentifiableAdapter(object):
pass
# TODO: the name is confusing. it returns the identified record
# TODO: remove side effect
def retrieve_identifiable(self, record: db.Record):
"""
This function combines all functionality of the IdentifierAdapter by
......
......@@ -376,8 +376,8 @@ def test_split_into_inserts_and_updates(crawler):
def trivial_cache_add(stuff):
cache.append(stuff.name)
crawler.get_identifiable_from_local_cache = Mock(side_effect=trivial_cache_loockup)
crawler.add_identifiable_to_local_cache = Mock(side_effect=trivial_cache_add)
crawler.get_identified_record_from_local_cache = Mock(side_effect=trivial_cache_loockup)
crawler.add_identified_record_to_local_cache = Mock(side_effect=trivial_cache_add)
crawler.copy_attributes = Mock()
# a record that is found remotely and should be added to the update list and one that is not
......@@ -386,11 +386,11 @@ def test_split_into_inserts_and_updates(crawler):
entlist = [db.Record(name="A"), db.Record(name="B")]
crawler.identifiableAdapter.retrieve_identifiable = Mock(side_effect=partial(
base_mocked_lookup, known=remote_known))
insert, update = crawler.split_into_inserts_and_updates(entlist)
insert, update = crawler.split_into_inserts_and_updates(deepcopy(entlist))
print(crawler.identifiableAdapter.retrieve_identifiable.call_args_list)
print(entlist)
crawler.identifiableAdapter.retrieve_identifiable.assert_any_call(entlist[0])
crawler.identifiableAdapter.retrieve_identifiable.assert_any_call(entlist[1])
# crawler.identifiableAdapter.retrieve_identifiable.assert_any_call(entlist[0])
# crawler.identifiableAdapter.retrieve_identifiable.assert_any_call(entlist[1])
assert len(insert) == 1
assert insert[0].name == "B"
assert len(update) == 1
......@@ -423,3 +423,18 @@ def test_split_into_inserts_and_updates(crawler):
entlist = [a, b]
with raises(RuntimeError):
crawler.split_into_inserts_and_updates(entlist)
# assume identifiable is only the name
a = db.Record(name="A")
a.add_property("foo", 1)
b = db.Record(name="A")
b.add_property("bar", 2)
# expected TODO
#assert result.has_property("foo").value == 1
#assert result.has_property("bar").value == 1
def test_all_references_are_existing_already(crawler):
pass
# crawler.all_references_are_existing_already(record)
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment