Skip to content
Snippets Groups Projects
Commit dc95a9ab authored by Henrik tom Wörden's avatar Henrik tom Wörden
Browse files

MAINT: refactor merge into new function

parent b0a0791b
No related branches found
No related tags found
4 merge requests: !160 STY: styling, !153 MAINT: refactor merge into new function, !152 MAINT: move checks that are not related to identifiables, !151 New class to track treated Records during sync
Pipeline #47350 passed
...@@ -632,6 +632,32 @@ class Crawler(object): ...@@ -632,6 +632,32 @@ class Crawler(object):
if p.value is old: if p.value is old:
p.value = new p.value = new
def _merge_identified(self, newrecord, record, try_to_merge_later, all_records):
    """Try to merge ``record`` into ``newrecord``.

    On a merge conflict, ``record`` is appended to ``try_to_merge_later``
    and, if ``newrecord`` already has an id, that id is copied onto
    ``record``.  Unless an exception propagates, references to ``record``
    within ``all_records`` are bent to ``newrecord`` afterwards.

    Parameters
    ----------
    newrecord
        The entity that receives the merged information.
    record
        The entity that is merged into ``newrecord``.
    try_to_merge_later : list
        Collects records whose merge failed with a conflict; mutated in place.
    all_records
        The entities whose references are redirected from ``record`` to
        ``newrecord``.

    Raises
    ------
    NotImplementedError
        Re-raised from ``merge_entities`` after printing both entities.
    """
    try:
        merge_entities(
            newrecord,
            record,
            merge_references_with_empty_diffs=False,
            merge_id_with_resolved_entity=True,
        )
    except EntityMergeConflictError:
        # NOTE(review): presumably `_treat_merge_error_of` raises for the
        # clear cases in which a merge is impossible -- confirm in its
        # definition.  If it returns, the conflict is not one of those
        # cases, so the merge is postponed and retried later.
        _treat_merge_error_of(newrecord, record)
        try_to_merge_later.append(record)
        known_id = newrecord.id
        if known_id is not None:
            record.id = known_id
    except NotImplementedError:
        # Dump both entities to ease debugging, then propagate the error.
        for entity in (newrecord, record):
            print(entity)
        raise

    # Reached after a successful merge and after a postponed one alike.
    Crawler.bend_references_to_new_object(
        old=record,
        new=newrecord,
        entities=all_records,
    )
@staticmethod @staticmethod
def create_reference_mapping(flat: list[db.Entity]): def create_reference_mapping(flat: list[db.Entity]):
""" """
...@@ -677,7 +703,12 @@ class Crawler(object): ...@@ -677,7 +703,12 @@ class Crawler(object):
record = flat[i] record = flat[i]
# 1. Can it be identified via an ID? # 1. Can it be identified via an ID?
if record.id is not None: if record.id is not None:
self.treated_records_lookup.add(record, None) treated_record = self.treated_records_lookup.get_existing(record)
if treated_record is not None:
self._merge_identified(treated_record, record, try_to_merge_later, all_records)
referencing_entities = self.create_reference_mapping(all_records)
else:
self.treated_records_lookup.add(record, None)
del flat[i] del flat[i]
# 2. Can it be identified via a path? # 2. Can it be identified via a path?
elif record.path is not None: elif record.path is not None:
...@@ -685,18 +716,20 @@ class Crawler(object): ...@@ -685,18 +716,20 @@ class Crawler(object):
existing = cached_get_entity_by(path=record.path) existing = cached_get_entity_by(path=record.path)
except EmptyUniqueQueryError: except EmptyUniqueQueryError:
existing = None existing = None
if existing is None: if existing is not None:
# TODO add identifiable if possible
self.treated_records_lookup.add(record, None)
del flat[i]
else:
record.id = existing.id record.id = existing.id
# TODO check the following copying of _size and _checksum # TODO check the following copying of _size and _checksum
# Copy over checksum and size too if it is a file # Copy over checksum and size too if it is a file
record._size = existing._size record._size = existing._size
record._checksum = existing._checksum record._checksum = existing._checksum
treated_record = self.treated_records_lookup.get_any(record)
if treated_record is not None:
self._merge_identified(treated_record, record, try_to_merge_later, all_records)
referencing_entities = self.create_reference_mapping(all_records)
else:
# TODO add identifiable if possible
self.treated_records_lookup.add(record, None) self.treated_records_lookup.add(record, None)
del flat[i] del flat[i]
resolved_references = True resolved_references = True
# flat contains Entities which could not yet be checked against the remote server # flat contains Entities which could not yet be checked against the remote server
...@@ -721,28 +754,11 @@ class Crawler(object): ...@@ -721,28 +754,11 @@ class Crawler(object):
# 1. Is it in the cache of already checked Records? # 1. Is it in the cache of already checked Records?
if self.treated_records_lookup.get_any(record, identifiable) is not None: if self.treated_records_lookup.get_any(record, identifiable) is not None:
newrecord = self.treated_records_lookup.get_any(record, identifiable) treated_record = self.treated_records_lookup.get_any(record, identifiable)
# Since the identifiables are the same, newrecord and record actually describe # Since the identifiables are the same, treated_record and record actually describe
# the same object. # the same object.
# We merge the two in order to prevent loss of information # We merge record into treated_record in order to prevent loss of information
try: self._merge_identified(treated_record, record, try_to_merge_later, all_records)
merge_entities(
newrecord, record, merge_references_with_empty_diffs=False, merge_id_with_resolved_entity=True)
except EntityMergeConflictError:
_treat_merge_error_of(newrecord, record)
# We cannot merge but it is none of the clear case where merge is
# impossible. Thus we try later
try_to_merge_later.append(record)
if newrecord.id is not None:
record.id = newrecord.id
except NotImplementedError:
print(newrecord)
print(record)
raise
Crawler.bend_references_to_new_object(
old=record, new=newrecord,
entities=all_records
)
referencing_entities = self.create_reference_mapping(all_records) referencing_entities = self.create_reference_mapping(all_records)
del flat[i] del flat[i]
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment