diff --git a/src/caoscrawler/crawl.py b/src/caoscrawler/crawl.py index 735d53a100b4eac18b8a33dcfa18e041b15f9972..d08eb790b3acf5c1345f59d7784061603561a772 100644 --- a/src/caoscrawler/crawl.py +++ b/src/caoscrawler/crawl.py @@ -633,12 +633,21 @@ class Crawler(object): identifiable = record self.identified_cache.add(identifiable=identifiable, record=record) - def copy_attributes(self, fro: db.Entity, to: db.Entity): - """ - Copy all attributes from one entity to another entity. + @staticmethod + def bend_references_to_new_object(old, new, entities): + """ Bend references to the other object + Iterate over all entities in `entities` and check the values of all properties for + occurrences of the old Entity and replace them with the new Entity """ - - merge_entities(to, fro) + for el in entities: + for p in el.properties: + if isinstance(p.value, list): + for index, val in enumerate(p.value): + if val is old: + p.value[index] = new + else: + if p.value is old: + p.value = new def split_into_inserts_and_updates(self, ent_list: List[db.Entity]): if self.identifiableAdapter is None: @@ -668,24 +677,12 @@ class Crawler(object): "are removed from the list") # Check the local cache first for duplicate elif self.get_identified_record_from_local_cache(record) is not None: - - # This record is a duplicate that can be removed. Make sure we do not lose - # information - # Update an (local) identified record that will be inserted - newrecord = self.get_identified_record_from_local_cache( - record) - self.copy_attributes(fro=record, to=newrecord) - # Bend references to the other object - # TODO refactor this - for el in flat + to_be_inserted + to_be_updated: - for p in el.properties: - if isinstance(p.value, list): - for index, val in enumerate(p.value): - if val is record: - p.value[index] = newrecord - else: - if p.value is record: - p.value = newrecord + # This record is a duplicate that can be removed. 
+ # We merge the two in order to prevent loss of information + newrecord = self.get_identified_record_from_local_cache(record) + merge_entities(newrecord, record) + Crawler.bend_references_to_new_object( + old=record, new=newrecord, entities=flat+to_be_updated+to_be_inserted) del flat[i]