Skip to content
Snippets Groups Projects
Commit d3e03841 authored by Henrik tom Wörden's avatar Henrik tom Wörden
Browse files

MAINT: move checks that are not related to identifiables

parent 02666380
No related branches found
No related tags found
3 merge requests: !160 STY: styling, !152 MAINT: move checks that are not related to identifiables, !151 New class to track treated Records during sync
...@@ -232,6 +232,18 @@ class TreatedRecordLookUp(): ...@@ -232,6 +232,18 @@ class TreatedRecordLookUp():
existing or missing list depending on whether the Record has a valid ID. existing or missing list depending on whether the Record has a valid ID.
Additionally, the Record is added to three look up dicts. The keys of those are paths, IDs and Additionally, the Record is added to three look up dicts. The keys of those are paths, IDs and
the representation of the identifiables. the representation of the identifiables.
The extreme case that one could imagine would be that the same Record occurs three times as
different Python objects: one that only has an ID, one with only a path and one without ID and
path but with identifying properties. During `split_into_inserts_and_updates` all three
must be identified with each other (and must be merged). Since we require that treated
entities have a valid ID if they exist in the remote server, all three objects would be
identified with each other simply using the IDs.
In the case that the Record is not yet in the remote server, there cannot be a Python object
with an ID. Thus we might have one with a path and one with an identifiable. If that Record
does not yet exist, it is necessary that both Python objects have at least either the path or
the identifiable in common. Currently, this has to be ensured by the user.
""" """
def __init__(self): def __init__(self):
...@@ -266,6 +278,8 @@ class TreatedRecordLookUp(): ...@@ -266,6 +278,8 @@ class TreatedRecordLookUp():
def add(self, record: db.Entity, identifiable: Optional[Identifiable] = None): def add(self, record: db.Entity, identifiable: Optional[Identifiable] = None):
""" """
Add a Record that was treated, such that it is contained in the internal look up dicts Add a Record that was treated, such that it is contained in the internal look up dicts
This Record MUST have an ID if it was found in the remote server.
""" """
if record.id is None: if record.id is None:
if record.path is None and identifiable is None: if record.path is None and identifiable is None:
...@@ -658,6 +672,32 @@ class Crawler(object): ...@@ -658,6 +672,32 @@ class Crawler(object):
if ent.role == "Record" and len(ent.parents) == 0: if ent.role == "Record" and len(ent.parents) == 0:
raise RuntimeError(f"Records must have a parent.\n{ent}") raise RuntimeError(f"Records must have a parent.\n{ent}")
# Check whether Records can be identified without identifiable
for i in reversed(range(len(flat))):
record = flat[i]
# 1. Can it be identified via an ID?
if record.id is not None:
self.treated_records_lookup.add(record, None)
del flat[i]
# 2. Can it be identified via a path?
elif record.path is not None:
try:
existing = cached_get_entity_by(path=record.path)
except EmptyUniqueQueryError:
existing = None
if existing is None:
# TODO add identifiable if possible
self.treated_records_lookup.add(record, None)
del flat[i]
else:
record.id = existing.id
# TODO check the following copying of _size and _checksum
# Copy over checksum and size too if it is a file
record._size = existing._size
record._checksum = existing._checksum
self.treated_records_lookup.add(record, None)
del flat[i]
resolved_references = True resolved_references = True
# flat contains Entities which could not yet be checked against the remote server # flat contains Entities which could not yet be checked against the remote server
try_to_merge_later = [] try_to_merge_later = []
...@@ -670,40 +710,17 @@ class Crawler(object): ...@@ -670,40 +710,17 @@ class Crawler(object):
# it does not yet exist. Since a Record may reference other unknown Records it might not # it does not yet exist. Since a Record may reference other unknown Records it might not
# be possible to answer this right away. # be possible to answer this right away.
# The following checks are done on each Record: # The following checks are done on each Record:
# 1. Can it be identified via an ID? # 1. Is it in the cache of already checked Records?
# 2. Can it be identified via a path? # 2. Can it be checked on the remote server?
# 3. Is it in the cache of already checked Records? # 3. Does it have to be new since a needed reference is missing?
# 4. Can it be checked on the remote server?
# 5. Does it have to be new since a needed reference is missing?
for i in reversed(range(len(flat))): for i in reversed(range(len(flat))):
record = flat[i] record = flat[i]
identifiable = self.identifiableAdapter.get_identifiable( identifiable = self.identifiableAdapter.get_identifiable(
record, record,
referencing_entities=referencing_entities) referencing_entities=referencing_entities)
# 1. Can it be identified via an ID? # 1. Is it in the cache of already checked Records?
if record.id is not None: if self.treated_records_lookup.get_any(record, identifiable) is not None:
self.treated_records_lookup.add(record, identifiable)
del flat[i]
# 2. Can it be identified via a path?
elif record.path is not None:
try:
existing = cached_get_entity_by(path=record.path)
except EmptyUniqueQueryError:
existing = None
if existing is None:
self.treated_records_lookup.add(record, identifiable)
del flat[i]
else:
record.id = existing.id
# TODO check the following copying of _size and _checksum
# Copy over checksum and size too if it is a file
record._size = existing._size
record._checksum = existing._checksum
self.treated_records_lookup.add(record, identifiable)
del flat[i]
# 3. Is it in the cache of already checked Records?
elif self.treated_records_lookup.get_any(record, identifiable) is not None:
newrecord = self.treated_records_lookup.get_any(record, identifiable) newrecord = self.treated_records_lookup.get_any(record, identifiable)
# Since the identifiables are the same, newrecord and record actually describe # Since the identifiables are the same, newrecord and record actually describe
# the same object. # the same object.
...@@ -731,7 +748,7 @@ class Crawler(object): ...@@ -731,7 +748,7 @@ class Crawler(object):
del flat[i] del flat[i]
resolved_references = True resolved_references = True
# 4. Can it be checked on the remote server? # 2. Can it be checked on the remote server?
elif not self._has_reference_value_without_id(identifiable): elif not self._has_reference_value_without_id(identifiable):
identified_record = ( identified_record = (
self.identifiableAdapter.retrieve_identified_record_for_identifiable( self.identifiableAdapter.retrieve_identified_record_for_identifiable(
...@@ -742,11 +759,12 @@ class Crawler(object): ...@@ -742,11 +759,12 @@ class Crawler(object):
else: else:
# side effect # side effect
record.id = identified_record.id record.id = identified_record.id
record.path = existing.path
self.treated_records_lookup.add(record, identifiable) self.treated_records_lookup.add(record, identifiable)
del flat[i] del flat[i]
resolved_references = True resolved_references = True
# 5. Does it have to be new since a needed reference is missing? # 3. Does it have to be new since a needed reference is missing?
# (Is it impossible to check this record because an identifiable references a # (Is it impossible to check this record because an identifiable references a
# missing record?) # missing record?)
elif self._has_missing_object_in_references(identifiable, referencing_entities): elif self._has_missing_object_in_references(identifiable, referencing_entities):
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment