diff --git a/src/caoscrawler/crawl.py b/src/caoscrawler/crawl.py index 860f71348b8c5ee09d134db8f64f41188ceae4b7..771e2a4183788e1757b4773db4cafb5e9d7baf6e 100644 --- a/src/caoscrawler/crawl.py +++ b/src/caoscrawler/crawl.py @@ -705,9 +705,11 @@ class Crawler(object): treated_record = self.treated_records_lookup.get_existing(record) if treated_record is not None: self._merge_identified(treated_record, record, try_to_merge_later, all_records) + all_records.remove(record) referencing_entities = self.create_reference_mapping(all_records) else: self.treated_records_lookup.add(record, None) + assert record.id del flat[i] # 2. Can it be identified via a path? elif record.path is not None: @@ -724,10 +726,12 @@ class Crawler(object): treated_record = self.treated_records_lookup.get_any(record) if treated_record is not None: self._merge_identified(treated_record, record, try_to_merge_later, all_records) + all_records.remove(record) referencing_entities = self.create_reference_mapping(all_records) else: # TODO add identifiable if possible self.treated_records_lookup.add(record, None) + assert record.id del flat[i] entity_was_treated = True @@ -762,6 +766,7 @@ class Crawler(object): # describe the same object. # We merge record into treated_record in order to prevent loss of information self._merge_identified(treated_record, record, try_to_merge_later, all_records) + all_records.remove(record) referencing_entities = self.create_reference_mapping(all_records) del flat[i] @@ -780,6 +785,7 @@ class Crawler(object): record.id = identified_record.id record.path = identified_record.path self.treated_records_lookup.add(record, identifiable) + assert record.id del flat[i] entity_was_treated = True @@ -788,6 +794,7 @@ class Crawler(object): # missing record?) elif self._has_missing_object_in_references(identifiable, referencing_entities): self.treated_records_lookup.add(record, identifiable) + assert record.id del flat[i] entity_was_treated = True