Skip to content
Snippets Groups Projects
Commit 67fd2eda authored by Henrik tom Wörden's avatar Henrik tom Wörden
Browse files

MAINT: refactor and comments

parent e49ba6d8
Branches
Tags
2 merge requests!160STY: styling,!140New f fix merge
Pipeline #46352 passed
......@@ -537,6 +537,42 @@ class Crawler(object):
return references
@staticmethod
def generate_merge_error_info(newrecord, record):
# Deal with two obvious cases where no merge will ever be possible:
# 1. Two Entities with differing IDs
# 2. Two non-Entity values which differ
for this_p in newrecord.properties:
that_p = record.get_property(this_p.name)
if (isinstance(this_p.value, db.Entity)
and isinstance(that_p.value, db.Entity)):
if this_p.value.id is not None and that_p.value.id is not None:
if this_p.value.id != that_p.value.id:
logger.error("The Crawler is trying to merge two entities "
"because they should be the same object (same"
" identifiables), but they reference "
"different Entities with the same Property."
f"Problematic Property: {this_p.name}\n"
f"Referenced Entities: {this_p.value.id} and "
f"{that_p.value.id}\n"
f"{record}\n{newrecord}")
raise RuntimeError("Cannot merge Entities")
elif (not isinstance(this_p.value, db.Entity)
and not isinstance(that_p.value, db.Entity)):
if ((this_p.value != that_p.value)
# TODO can we also compare lists?
and not isinstance(this_p.value, list)
and not isinstance(that_p.value, list)):
logger.error("The Crawler is trying to merge two entities "
"because they should be the same object (same"
" identifiables), but they have "
"different values for the same Property."
f"Problematic Property: {this_p.name}\n"
f"Values: {this_p.value} and "
f"{that_p.value}\n"
f"{record}\n{newrecord}")
raise RuntimeError("Cannot merge Entities")
def split_into_inserts_and_updates(self, ent_list: list[db.Entity]):
to_be_inserted: list[db.Entity] = []
to_be_updated: list[db.Entity] = []
......@@ -549,11 +585,11 @@ class Crawler(object):
resolved_references = True
# flat contains Entities which could not yet be checked against the remote server
pending = []
try_to_merge_later = []
while resolved_references and len(flat) > 0:
resolved_references = False
referencing_entities = self.create_reference_mapping(
flat + to_be_updated + pending+to_be_inserted)
flat + to_be_updated + try_to_merge_later+to_be_inserted)
# For each element we try to find out whether we can find it in the server or whether
# it does not yet exist. Since a Record may reference other unknown Records it might not
......@@ -600,55 +636,24 @@ class Crawler(object):
del flat[i]
# 3. Is it in the cache of already checked Records?
elif self.get_from_any_cache(identifiable) is not None:
# We merge the two in order to prevent loss of information
newrecord = self.get_from_any_cache(identifiable)
# Since the identifiables are the same, newrecord and record actually describe
# the same object.
# We merge the two in order to prevent loss of information
try:
merge_entities(newrecord, record, merge_references_with_empty_diffs=False)
except EntityMergeConflictError:
# Deal with two obvious cases where no merge will ever be possible:
# 1. Two Entities with differing IDs
# 2. Two non-Entity values which differ
for this_p in newrecord.properties:
that_p = record.get_property(this_p.name)
if (isinstance(this_p.value, db.Entity)
and isinstance(that_p.value, db.Entity)):
if this_p.value.id is not None and that_p.value.id is not None:
if this_p.value.id != that_p.value.id:
logger.error("The Crawler is trying to merge two entities "
"because they should be the same object (same"
" identifiables), but they reference "
"different Entities with the same Property."
f"Problematic Property: {this_p.name}\n"
f"Referenced Entities: {this_p.value.id} and "
f"{that_p.value.id}\n"
f"{record}\n{newrecord}")
raise RuntimeError("Cannot merge Entities")
elif (not isinstance(this_p.value, db.Entity)
and not isinstance(that_p.value, db.Entity)):
if ((this_p.value != that_p.value)
# TODO can we also compare lists?
and not isinstance(this_p.value, list)
and not isinstance(that_p.value, list)):
logger.error("The Crawler is trying to merge two entities "
"because they should be the same object (same"
" identifiables), but they have "
"different values for the same Property."
f"Problematic Property: {this_p.name}\n"
f"Values: {this_p.value} and "
f"{that_p.value}\n"
f"{record}\n{newrecord}")
raise RuntimeError("Cannot merge Entities")
pending.append(record)
generate_merge_error_info(newrecord, record)
# We cannot merge, but this is none of the clear-cut cases where a merge is
# impossible. Thus, we try again later.
try_to_merge_later.append(record)
if newrecord.id is not None:
record.id = newrecord.id
del flat[i]
# the continue prevents record from ever being resolved even if it would
# be possible
continue
Crawler.bend_references_to_new_object(
old=record, new=newrecord, entities=flat + to_be_updated + to_be_inserted+pending)
old=record, new=newrecord, entities=flat + to_be_updated +
to_be_inserted+try_to_merge_later)
referencing_entities = self.create_reference_mapping(
flat + to_be_updated + pending+to_be_inserted)
flat + to_be_updated + try_to_merge_later+to_be_inserted)
del flat[i]
resolved_references = True
......@@ -683,14 +688,14 @@ class Crawler(object):
for record in flat:
self.replace_references_with_cached(record, referencing_entities)
for record in pending:
# We postponed the merge for records where it failed previously and try it again now.
# This might only add properties of the postponed records to the already used ones.
for record in try_to_merge_later:
identifiable = self.identifiableAdapter.get_identifiable(
record,
referencing_entities=referencing_entities)
newrecord = self.get_from_any_cache(identifiable)
merge_entities(newrecord, record)
Crawler.bend_references_to_new_object(
old=record, new=newrecord, entities=flat + to_be_updated + to_be_inserted)
if len(flat) > 0:
circle = self.detect_circular_dependency(flat)
if circle is None:
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment