From 718787312545cb5ae40ca586c72a11ba830c2962 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Henrik=20tom=20W=C3=B6rden?= <h.tomwoerden@indiscale.com> Date: Wed, 14 Feb 2024 09:36:54 +0100 Subject: [PATCH] DOC/MAINT: set path for identified recs and add doc strings --- src/caoscrawler/crawl.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/src/caoscrawler/crawl.py b/src/caoscrawler/crawl.py index 876c6cdc..0e652f00 100644 --- a/src/caoscrawler/crawl.py +++ b/src/caoscrawler/crawl.py @@ -232,6 +232,18 @@ class TreatedRecordLookUp(): existing or missing list depending on whether the Record has a valid ID. Additionally, the Record is added to three look up dicts. The keys of those are paths, IDs and the representation of the identifiables. + + The most extreme case one could imagine is that the same Record occurs three times as + different Python objects: one that only has an ID, one with only a path and one without ID and + path but with identifying properties. During `split_into_inserts_and_updates` all three + must be identified with each other (and must be merged). Since we require that treated + entities have a valid ID if they exist in the remote server, all three objects would be + identified with each other simply by using the IDs. + + In the case that the Record is not yet in the remote server, there cannot be a Python object + with an ID. Thus we might have one with a path and one with an identifiable. If that Record + does not yet exist, it is necessary that both Python objects have at least the path or + the identifiable in common. Currently, this has to be ensured by the user. """ def __init__(self): @@ -266,6 +278,8 @@ class TreatedRecordLookUp(): def add(self, record: db.Entity, identifiable: Optional[Identifiable] = None): """ Add a Record that was treated, such that it is contained in the internal look up dicts + + This Record MUST have an ID if it was found in the remote server. 
""" if record.id is None: if record.path is None and identifiable is None: @@ -742,6 +756,7 @@ class Crawler(object): else: # side effect record.id = identified_record.id + record.path = existing.path self.treated_records_lookup.add(record, identifiable) del flat[i] resolved_references = True -- GitLab