MAINT: move checks that are not related to identifiables

d3e03841 · Henrik tom Wörden · 02666380 · d3e03841
Commit d3e03841 authored 1 year ago by Henrik tom Wörden
--- a/src/caoscrawler/crawl.py
+++ b/src/caoscrawler/crawl.py
@@ -232,6 +232,18 @@ class TreatedRecordLookUp():
    existing or missing list depending on whether the Record has a valid ID.
    Additionally, the Record is added to three look up dicts. The keys of those are paths, IDs and
    the representation of the identifiables.
+    The most extreme case one could imagine is that the same Record occurs three times as
+    different Python objects: one that only has an ID, one with only a path, and one without ID
+    and path but with identifying properties. During `split_into_inserts_and_updates` all three
+    must be identified with each other (and must be merged). Since we require that treated
+    entities have a valid ID if they exist in the remote server, all three objects would be
+    identified with each other simply using the IDs.
+    In the case that the Record is not yet in the remote server, there cannot be a Python object
+    with an ID. Thus we might have one with a path and one with an identifiable. If that Record
+    does not yet exist, it is necessary that both Python objects have at least either the path or
+    the identifiable in common. Currently, this has to be ensured by the user.
    """
    def __init__(self):
@@ -266,6 +278,8 @@ class TreatedRecordLookUp():
    def add(self, record: db.Entity, identifiable: Optional[Identifiable] = None):
        """
        Add a Record that was treated, such that it is contained in the internal look up dicts.
+        This Record MUST have an ID if it was found in the remote server.
        """
        if record.id is None:
            if record.path is None and identifiable is None:
@@ -658,6 +672,32 @@ class Crawler(object):
            if ent.role == "Record" and len(ent.parents) == 0:
                raise RuntimeError(f"Records must have a parent.\n{ent}")
+        # Check whether Records can be identified without an identifiable (via ID or path)
+        for i in reversed(range(len(flat))):
+            record = flat[i]
+            # 1. Can it be identified via an ID?
+            if record.id is not None:
+                self.treated_records_lookup.add(record, None)
+                del flat[i]
+            # 2. Can it be identified via a path?
+            elif record.path is not None:
+                try:
+                    existing = cached_get_entity_by(path=record.path)
+                except EmptyUniqueQueryError:
+                    existing = None
+                if existing is None:
+                    # TODO add identifiable if possible
+                    self.treated_records_lookup.add(record, None)
+                    del flat[i]
+                else:
+                    record.id = existing.id
+                    # TODO check the following copying of _size and _checksum
+                    # Copy over checksum and size too if it is a file
+                    record._size = existing._size
+                    record._checksum = existing._checksum
+                    self.treated_records_lookup.add(record, None)
+                    del flat[i]
        resolved_references = True
        # flat contains Entities which could not yet be checked against the remote server
        try_to_merge_later = []
@@ -670,40 +710,17 @@ class Crawler(object):
            # it does not yet exist. Since a Record may reference other unknown Records it might not
            # be possible to answer this right away.
            # The following checks are done on each Record:
-            # 1. Can it be identified via an ID?
+            # 1. Is it in the cache of already checked Records?
-            # 2. Can it be identified via a path?
+            # 2. Can it be checked on the remote server?
-            # 3. Is it in the cache of already checked Records?
+            # 3. Does it have to be new since a needed reference is missing?
-            # 4. Can it be checked on the remote server?
-            # 5. Does it have to be new since a needed reference is missing?
            for i in reversed(range(len(flat))):
                record = flat[i]
                identifiable = self.identifiableAdapter.get_identifiable(
                    record,
                    referencing_entities=referencing_entities)
-                # 1. Can it be identified via an ID?
+                # 1. Is it in the cache of already checked Records?
-                if record.id is not None:
+                if self.treated_records_lookup.get_any(record, identifiable) is not None:
-                    self.treated_records_lookup.add(record, identifiable)
-                    del flat[i]
-                # 2. Can it be identified via a path?
-                elif record.path is not None:
-                    try:
-                        existing = cached_get_entity_by(path=record.path)
-                    except EmptyUniqueQueryError:
-                        existing = None
-                    if existing is None:
-                        self.treated_records_lookup.add(record, identifiable)
-                        del flat[i]
-                    else:
-                        record.id = existing.id
-                        # TODO check the following copying of _size and _checksum
-                        # Copy over checksum and size too if it is a file
-                        record._size = existing._size
-                        record._checksum = existing._checksum
-                        self.treated_records_lookup.add(record, identifiable)
-                        del flat[i]
-                # 3. Is it in the cache of already checked Records?
-                elif self.treated_records_lookup.get_any(record, identifiable) is not None:
                    newrecord = self.treated_records_lookup.get_any(record, identifiable)
                    # Since the identifiables are the same, newrecord and record actually describe
                    # the same object.
@@ -731,7 +748,7 @@ class Crawler(object):
                    del flat[i]
                    resolved_references = True
-                # 4. Can it be checked on the remote server?
+                # 2. Can it be checked on the remote server?
                elif not self._has_reference_value_without_id(identifiable):
                    identified_record = (
                        self.identifiableAdapter.retrieve_identified_record_for_identifiable(
@@ -742,11 +759,12 @@ class Crawler(object):
                    else:
                        # side effect
                        record.id = identified_record.id
+                        record.path = existing.path
                        self.treated_records_lookup.add(record, identifiable)
                    del flat[i]
                    resolved_references = True
-                # 5. Does it have to be new since a needed reference is missing?
+                # 3. Does it have to be new since a needed reference is missing?
                # (Is it impossible to check this record because an identifiable references a
                # missing record?)
                elif self._has_missing_object_in_references(identifiable, referencing_entities):