diff --git a/src/caoscrawler/crawl.py b/src/caoscrawler/crawl.py
index 32ad2a34e4f63c1cf05ea1760eb434a06fffa7de..e069a911934e4cc86cddf2351e7fa0f52c80022a 100644
--- a/src/caoscrawler/crawl.py
+++ b/src/caoscrawler/crawl.py
@@ -537,6 +537,56 @@ class Crawler(object):
 
         return references
 
+    @staticmethod
+    def _treat_merge_error_of(newrecord, record):
+        """
+        Raise an informative error if two entities can obviously never be merged.
+
+        The parameters are two entities that cannot be merged with the merge_entities function.
+
+        This function checks for two obvious cases where no merge will ever be possible:
+        1. The same Property references two Entities with differing IDs
+        2. The same Property has two differing values which are neither Entities nor lists
+
+        It creates a more informative logger message and raises a RuntimeError in those cases.
+        Properties of newrecord that record does not have are skipped. If none of the cases
+        applies, the function returns without raising.
+        """
+        for this_p in newrecord.properties:
+            that_p = record.get_property(this_p.name)
+            if that_p is None:
+                # get_property returns None if record has no Property of that name. A missing
+                # Property cannot be the conflict, and that_p.value would raise AttributeError.
+                continue
+            if (isinstance(this_p.value, db.Entity)
+                    and isinstance(that_p.value, db.Entity)):
+                if this_p.value.id is not None and that_p.value.id is not None:
+                    if this_p.value.id != that_p.value.id:
+                        logger.error("The Crawler is trying to merge two entities "
+                                     "because they should be the same object (same"
+                                     " identifiables), but they reference "
+                                     "different Entities with the same Property.\n"
+                                     f"Problematic Property: {this_p.name}\n"
+                                     f"Referenced Entities: {this_p.value.id} and "
+                                     f"{that_p.value.id}\n"
+                                     f"{record}\n{newrecord}")
+                        raise RuntimeError("Cannot merge Entities")
+            elif (not isinstance(this_p.value, db.Entity)
+                  and not isinstance(that_p.value, db.Entity)):
+                if ((this_p.value != that_p.value)
+                        # TODO can we also compare lists?
+                        and not isinstance(this_p.value, list)
+                        and not isinstance(that_p.value, list)):
+                    logger.error("The Crawler is trying to merge two entities "
+                                 "because they should be the same object (same"
+                                 " identifiables), but they have "
+                                 "different values for the same Property.\n"
+                                 f"Problematic Property: {this_p.name}\n"
+                                 f"Values: {this_p.value} and "
+                                 f"{that_p.value}\n"
+                                 f"{record}\n{newrecord}")
+                    raise RuntimeError("Cannot merge Entities")
+
     def split_into_inserts_and_updates(self, ent_list: list[db.Entity]):
         to_be_inserted: list[db.Entity] = []
         to_be_updated: list[db.Entity] = []
@@ -549,10 +599,11 @@ class Crawler(object):
 
         resolved_references = True
         # flat contains Entities which could not yet be checked against the remote server
+        try_to_merge_later = []
         while resolved_references and len(flat) > 0:
             resolved_references = False
             referencing_entities = self.create_reference_mapping(
-                flat + to_be_updated + to_be_inserted)
+                flat + to_be_updated + try_to_merge_later+to_be_inserted)
 
             # For each element we try to find out whether we can find it in the server or whether
             # it does not yet exist. Since a Record may reference other unkown Records it might not
@@ -599,14 +650,24 @@ class Crawler(object):
                     del flat[i]
                 # 3. Is it in the cache of already checked Records?
                 elif self.get_from_any_cache(identifiable) is not None:
-                    # We merge the two in order to prevent loss of information
                     newrecord = self.get_from_any_cache(identifiable)
+                    # Since the identifiables are the same, newrecord and record actually describe
+                    # the same object.
+                    # We merge the two in order to prevent loss of information
                     try:
-                        merge_entities(newrecord, record)
+                        merge_entities(newrecord, record, merge_references_with_empty_diffs=False)
                     except EntityMergeConflictError:
-                        continue
+                        Crawler._treat_merge_error_of(newrecord, record)
+                        # We cannot merge but it is none of the clear cases where merge is
+                        # impossible. Thus we try later
+                        try_to_merge_later.append(record)
+                        if newrecord.id is not None:
+                            record.id = newrecord.id
                     Crawler.bend_references_to_new_object(
-                        old=record, new=newrecord, entities=flat + to_be_updated + to_be_inserted)
+                        old=record, new=newrecord, entities=flat + to_be_updated
+                        + to_be_inserted+try_to_merge_later)
+                    referencing_entities = self.create_reference_mapping(
+                        flat + to_be_updated + try_to_merge_later+to_be_inserted)
 
                     del flat[i]
                     resolved_references = True
@@ -641,6 +702,14 @@ class Crawler(object):
             for record in flat:
                 self.replace_references_with_cached(record, referencing_entities)
 
+        # We postponed the merge for records where it failed previously and try it again now.
+        # This only might add properties of the postponed records to the already used ones.
+        for record in try_to_merge_later:
+            identifiable = self.identifiableAdapter.get_identifiable(
+                record,
+                referencing_entities=referencing_entities)
+            newrecord = self.get_from_any_cache(identifiable)
+            merge_entities(newrecord, record)
         if len(flat) > 0:
             circle = self.detect_circular_dependency(flat)
             if circle is None: