diff --git a/src/caoscrawler/crawl.py b/src/caoscrawler/crawl.py index 876c6cdc247b24d2bb6b40dea24c2a386ce446f3..58b642642e2d2c29cac25c125da30ecee6a96a52 100644 --- a/src/caoscrawler/crawl.py +++ b/src/caoscrawler/crawl.py @@ -232,6 +232,18 @@ class TreatedRecordLookUp(): existing or missing list depending on whether the Record has a valid ID. Additionally, the Record is added to three look up dicts. The keys of those are paths, IDs and the representation of the identifiables. + + The most extreme case one could imagine would be that the same Record occurs three times as + different Python objects: one that only has an ID, one with only a path and one without ID and + path but with identifying properties. During `split_into_inserts_and_updates` all three + must be identified with each other (and must be merged). Since we require that treated + entities have a valid ID if they exist in the remote server, all three objects would be + identified with each other simply using the IDs. + + In the case that the Record is not yet in the remote server, there cannot be a Python object + with an ID. Thus we might have one with a path and one with an identifiable. If that Record + does not yet exist, it is necessary that both Python objects have at least either the path or + the identifiable in common. Currently, this has to be ensured by the user. """ def __init__(self): @@ -266,6 +278,8 @@ class TreatedRecordLookUp(): def add(self, record: db.Entity, identifiable: Optional[Identifiable] = None): """ Add a Record that was treated, such that it is contained in the internal look up dicts + + This Record MUST have an ID if it was found in the remote server. 
""" if record.id is None: if record.path is None and identifiable is None: @@ -658,6 +672,32 @@ class Crawler(object): if ent.role == "Record" and len(ent.parents) == 0: raise RuntimeError(f"Records must have a parent.\n{ent}") + # Check whether Records can be identified without identifiable + for i in reversed(range(len(flat))): + record = flat[i] + # 1. Can it be identified via an ID? + if record.id is not None: + self.treated_records_lookup.add(record, None) + del flat[i] + # 2. Can it be identified via a path? + elif record.path is not None: + try: + existing = cached_get_entity_by(path=record.path) + except EmptyUniqueQueryError: + existing = None + if existing is None: + # TODO add identifiable if possible + self.treated_records_lookup.add(record, None) + del flat[i] + else: + record.id = existing.id + # TODO check the following copying of _size and _checksum + # Copy over checksum and size too if it is a file + record._size = existing._size + record._checksum = existing._checksum + self.treated_records_lookup.add(record, None) + del flat[i] + resolved_references = True # flat contains Entities which could not yet be checked against the remote server try_to_merge_later = [] @@ -670,40 +710,17 @@ class Crawler(object): # it does not yet exist. Since a Record may reference other unkown Records it might not # be possible to answer this right away. # The following checks are done on each Record: - # 1. Can it be identified via an ID? - # 2. Can it be identified via a path? - # 3. Is it in the cache of already checked Records? - # 4. Can it be checked on the remote server? - # 5. Does it have to be new since a needed reference is missing? + # 1. Is it in the cache of already checked Records? + # 2. Can it be checked on the remote server? + # 3. Does it have to be new since a needed reference is missing? 
for i in reversed(range(len(flat))): record = flat[i] identifiable = self.identifiableAdapter.get_identifiable( record, referencing_entities=referencing_entities) - # 1. Can it be identified via an ID? - if record.id is not None: - self.treated_records_lookup.add(record, identifiable) - del flat[i] - # 2. Can it be identified via a path? - elif record.path is not None: - try: - existing = cached_get_entity_by(path=record.path) - except EmptyUniqueQueryError: - existing = None - if existing is None: - self.treated_records_lookup.add(record, identifiable) - del flat[i] - else: - record.id = existing.id - # TODO check the following copying of _size and _checksum - # Copy over checksum and size too if it is a file - record._size = existing._size - record._checksum = existing._checksum - self.treated_records_lookup.add(record, identifiable) - del flat[i] - # 3. Is it in the cache of already checked Records? - elif self.treated_records_lookup.get_any(record, identifiable) is not None: + # 1. Is it in the cache of already checked Records? + if self.treated_records_lookup.get_any(record, identifiable) is not None: newrecord = self.treated_records_lookup.get_any(record, identifiable) # Since the identifiables are the same, newrecord and record actually describe # the same obejct. @@ -731,7 +748,7 @@ class Crawler(object): del flat[i] resolved_references = True - # 4. Can it be checked on the remote server? + # 2. Can it be checked on the remote server? elif not self._has_reference_value_without_id(identifiable): identified_record = ( self.identifiableAdapter.retrieve_identified_record_for_identifiable( @@ -742,11 +759,12 @@ class Crawler(object): else: # side effect record.id = identified_record.id + record.path = identified_record.path self.treated_records_lookup.add(record, identifiable) del flat[i] resolved_references = True - # 5. Does it have to be new since a needed reference is missing? + # 3. Does it have to be new since a needed reference is missing? 
# (Is it impossible to check this record because an identifiable references a # missing record?) elif self._has_missing_object_in_references(identifiable, referencing_entities):