def _merge_identified(self, newrecord, record, try_to_merge_later, all_records):
    """Merge ``record`` into ``newrecord`` and redirect references to it.

    The merge is attempted with ``merge_entities``.  When that raises an
    ``EntityMergeConflictError`` and ``_treat_merge_error_of`` does not
    raise in turn, ``record`` is queued in ``try_to_merge_later`` and, if
    ``newrecord`` already has an id, that id is copied onto ``record``.

    Unless an exception propagates, every reference to ``record`` found in
    ``all_records`` is bent to ``newrecord`` afterwards, regardless of
    whether the merge itself succeeded.

    Parameters
    ----------
    newrecord
        The entity that receives the merged information.
    record
        The entity that is merged into ``newrecord``.
    try_to_merge_later : list
        Collects entities whose merge has to be retried; appended to in
        place.
    all_records
        The entities whose references are redirected from ``record`` to
        ``newrecord``.

    Raises
    ------
    NotImplementedError
        Re-raised from ``merge_entities`` after both entities have been
        printed.
    """
    merge_options = {
        "merge_references_with_empty_diffs": False,
        "merge_id_with_resolved_entity": True,
    }
    try:
        merge_entities(newrecord, record, **merge_options)
    except EntityMergeConflictError:
        # NOTE(review): presumably this raises for conflicts that can never
        # be resolved -- confirm against _treat_merge_error_of.
        _treat_merge_error_of(newrecord, record)
        # Reaching this point means the conflict is not one of the clear
        # cases in which merging is impossible, so the merge is postponed.
        try_to_merge_later.append(record)
        known_id = newrecord.id
        if known_id is not None:
            record.id = known_id
    except NotImplementedError:
        # Show both entities involved before handing the error upwards.
        for entity in (newrecord, record):
            print(entity)
        raise

    # From now on, newrecord stands in for record wherever it is referenced.
    Crawler.bend_references_to_new_object(
        old=record,
        new=newrecord,
        entities=all_records,
    )
elif record.path is not None: @@ -685,18 +716,20 @@ class Crawler(object): existing = cached_get_entity_by(path=record.path) except EmptyUniqueQueryError: existing = None - if existing is None: - # TODO add identifiable if possible - self.treated_records_lookup.add(record, None) - del flat[i] - else: + if existing is not None: record.id = existing.id # TODO check the following copying of _size and _checksum # Copy over checksum and size too if it is a file record._size = existing._size record._checksum = existing._checksum + treated_record = self.treated_records_lookup.get_any(record) + if treated_record is not None: + self._merge_identified(treated_record, record, try_to_merge_later, all_records) + referencing_entities = self.create_reference_mapping(all_records) + else: + # TODO add identifiable if possible self.treated_records_lookup.add(record, None) - del flat[i] + del flat[i] resolved_references = True # flat contains Entities which could not yet be checked against the remote server @@ -721,28 +754,11 @@ class Crawler(object): # 1. Is it in the cache of already checked Records? if self.treated_records_lookup.get_any(record, identifiable) is not None: - newrecord = self.treated_records_lookup.get_any(record, identifiable) - # Since the identifiables are the same, newrecord and record actually describe + treated_record = self.treated_records_lookup.get_any(record, identifiable) + # Since the identifiables are the same, treated_record and record actually describe # the same obejct. - # We merge the two in order to prevent loss of information - try: - merge_entities( - newrecord, record, merge_references_with_empty_diffs=False, merge_id_with_resolved_entity=True) - except EntityMergeConflictError: - _treat_merge_error_of(newrecord, record) - # We cannot merge but it is none of the clear case where merge is - # impossible. 
Thus we try later - try_to_merge_later.append(record) - if newrecord.id is not None: - record.id = newrecord.id - except NotImplementedError: - print(newrecord) - print(record) - raise - Crawler.bend_references_to_new_object( - old=record, new=newrecord, - entities=all_records - ) + # We merge record into treated_record in order to prevent loss of information + self._merge_identified(treated_record, record, try_to_merge_later, all_records) referencing_entities = self.create_reference_mapping(all_records) del flat[i]