From e087df2711fe4994f9c852f726118adfbd343482 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Henrik=20tom=20W=C3=B6rden?= <h.tomwoerden@indiscale.com>
Date: Sun, 16 Oct 2022 20:41:40 +0200
Subject: [PATCH] Remove debug prints and check identifiables for missing references

---
 src/caoscrawler/crawl.py | 36 ++++++++++--------------------------
 unittests/test_tool.py   | 16 ++++++++++------
 2 files changed, 20 insertions(+), 32 deletions(-)

diff --git a/src/caoscrawler/crawl.py b/src/caoscrawler/crawl.py
index 055cd130..13c6a7be 100644
--- a/src/caoscrawler/crawl.py
+++ b/src/caoscrawler/crawl.py
@@ -535,10 +535,10 @@ class Crawler(object):
             # Entity instead of ID and not cached locally
             if (isinstance(p.value, list)):
                 for el in p.value:
-                    if (isinstance(el, db.Entity) and el.id is None
+                    if (isinstance(el, db.Entity)
                             and self.get_from_remote_missing_cache(el) is not None):
                         return True
-            if (isinstance(p.value, db.Entity) and p.value.id is None
+            if (isinstance(p.value, db.Entity)
                     and self.get_from_remote_missing_cache(p.value) is not None):
                 # might be checked when reference is resolved
                 return True
@@ -706,13 +706,9 @@ class Crawler(object):
         # flat contains Entities which could not yet be checked against the remote server
         while resolved_references and len(flat) > 0:
             resolved_references = False
-            print("LSIT")
-            for ii, el in enumerate(flat):
-                print(ii, el.id, el.parents[0].name if len(el.parents) > 0 else "")
 
             for i in reversed(range(len(flat))):
                 record = flat[i]
-                print(i, record.id, record.parents[0].name if len(record.parents) > 0 else "")
 
                 # TODO remove if the exception is never raised
                 if (record.id is not None or record in to_be_inserted):
@@ -720,7 +716,6 @@ class Crawler(object):
                                        "are removed from the list")
                 # Check whether this record is a duplicate that can be removed
                 elif self.get_from_any_cache(record) is not None:
-                    print("duplicate")
                     # We merge the two in order to prevent loss of information
                     newrecord = self.get_from_any_cache(record)
                     merge_entities(newrecord, record)
@@ -728,23 +723,21 @@ class Crawler(object):
                         old=record, new=newrecord, entities=flat+to_be_updated+to_be_inserted)
 
                     del flat[i]
+                    resolved_references = True
 
-                # can we check whether the record(identifiable) exists on the remote server
+                # can we check whether the record(identifiable) exists on the remote server?
                 elif not self.references_entity_without_id(
                         self.identifiableAdapter.get_identifiable(record)):
-                    print("checked")
                     # TODO: remove deepcopy?
                     identified_record = (
                         self.identifiableAdapter.retrieve_identified_record_for_record(
                             deepcopy(record)))
                     if identified_record is None:
-                        print("not found")
                         # identifiable does not exist remotely -> record needs to be inserted
                         to_be_inserted.append(record)
                         self.add_to_remote_missing_cache(record)
                         del flat[i]
                     else:
-                        print("found")
                         # side effect
                         record.id = identified_record.id
                         # Copy over checksum and size too if it is a file
@@ -756,26 +749,17 @@ class Crawler(object):
                         self.add_to_remote_existing_cache(record)
                         del flat[i]
                     resolved_references = True
-                elif self.has_missing_object_in_references(record):
+
+                # Is it impossible to check this record because its identifiable references a
+                # missing record?
+                elif self.has_missing_object_in_references(
+                        self.identifiableAdapter.get_identifiable(record)):
                     to_be_inserted.append(record)
                     self.add_to_remote_missing_cache(record)
                     del flat[i]
+                    resolved_references = True
 
-                else:
-                    print("nothing")
-                    print(record)
             for record in flat:
-                # TODO: (for review)
-                # This was the old version, but also for this case the
-                # check for identifiables has to be done.
-                # to_be_inserted.append(record)
-                # self.add_to_remote_existing_cache(record)
-                # del flat[i]
-
-                # TODO: (for review)
-                # If the following replacement is not done, the cache will
-                # be invalid as soon as references are resolved.
-                # replace references by versions from cache:
                 self.replace_references_with_cached(record)
 
         if len(flat) > 0:
diff --git a/unittests/test_tool.py b/unittests/test_tool.py
index dea1429c..b77d2a90 100755
--- a/unittests/test_tool.py
+++ b/unittests/test_tool.py
@@ -509,15 +509,18 @@ def test_no_uncached_entity_object_in_references(crawler):
                                                }))
 
     # one reference with id -> check
-    assert not crawler.has_missing_object_in_references(
-        db.Record(name="C").add_parent("C").add_property('d', 123))
+    assert not crawler.has_missing_object_in_references(db.Record(name="C")
+                                                        .add_parent("C").add_property('d', 123))
     # one ref with Entity with id -> check
     assert not crawler.has_missing_object_in_references(db.Record(name="C")
-                                                        .add_property('d', db.Record(id=123)))
+                                                        .add_parent("C")
+                                                        .add_property('d', db.Record(id=123)
+                                                                      .add_parent("C")))
     # one ref with id one with Entity with id (mixed) -> check
     assert not crawler.has_missing_object_in_references(db.Record(name="C").add_parent("D")
                                                         .add_property('d', 123)
-                                                        .add_property('b', db.Record(id=123)))
+                                                        .add_property('b', db.Record(id=123)
+                                                                      .add_parent("C")))
     # entity to be referenced in the following
     a = db.Record(name="C").add_parent("C").add_property("d", 12311)
     # one ref with id one with Entity without id (but not identifying) -> fail
@@ -539,8 +542,9 @@ def test_no_uncached_entity_object_in_references(crawler):
 
 
 def test_references_entities_without_ids(crawler, ident):
-    assert not crawler.references_entity_without_id(
-        db.Record().add_parent("Person").add_property('last_name', 123).add_property('first_name', 123))
+    assert not crawler.references_entity_without_id(db.Record().add_parent("Person")
+                                                    .add_property('last_name', 123)
+                                                    .add_property('first_name', 123))
     # id and rec with id
     assert not crawler.references_entity_without_id(db.Record().add_parent("Person")
                                                     .add_property('first_name', 123)
-- 
GitLab