diff --git a/src/caoscrawler/crawl.py b/src/caoscrawler/crawl.py index 876c6cdc247b24d2bb6b40dea24c2a386ce446f3..0e652f00a693dab1a48ed6b8484b0c2b06bae60e 100644 --- a/src/caoscrawler/crawl.py +++ b/src/caoscrawler/crawl.py @@ -232,6 +232,18 @@ class TreatedRecordLookUp(): existing or missing list depending on whether the Record has a valid ID. Additionally, the Record is added to three look up dicts. The keys of those are paths, IDs and the representation of the identifiables. + + The most extreme case one could imagine would be that the same Record occurs three times as + different Python objects: one that only has an ID, one with only a path and one without ID and + path but with identifying properties. During `split_into_inserts_and_updates` all three + must be identified with each other (and must be merged). Since we require that treated + entities have a valid ID if they exist in the remote server, all three objects would be + identified with each other simply using the IDs. + + In the case that the Record is not yet in the remote server, there cannot be a Python object + with an ID. Thus we might have one with a path and one with an identifiable. If that Record + does not yet exist, it is necessary that both Python objects have at least either the path or + the identifiable in common. Currently, this has to be ensured by the user. """ def __init__(self): @@ -266,6 +278,8 @@ class TreatedRecordLookUp(): def add(self, record: db.Entity, identifiable: Optional[Identifiable] = None): """ Add a Record that was treated, such that it is contained in the internal look up dicts + + This Record MUST have an ID if it was found in the remote server. """ if record.id is None: if record.path is None and identifiable is None: @@ -742,6 +756,7 @@ class Crawler(object): else: # side effect record.id = identified_record.id + record.path = existing.path self.treated_records_lookup.add(record, identifiable) del flat[i] resolved_references = True