From e087df2711fe4994f9c852f726118adfbd343482 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Henrik=20tom=20W=C3=B6rden?= <h.tomwoerden@indiscale.com> Date: Sun, 16 Oct 2022 20:41:40 +0200 Subject: [PATCH] working --- src/caoscrawler/crawl.py | 36 ++++++++++-------------------------- unittests/test_tool.py | 16 ++++++++++------ 2 files changed, 20 insertions(+), 32 deletions(-) diff --git a/src/caoscrawler/crawl.py b/src/caoscrawler/crawl.py index 055cd130..13c6a7be 100644 --- a/src/caoscrawler/crawl.py +++ b/src/caoscrawler/crawl.py @@ -535,10 +535,10 @@ class Crawler(object): # Entity instead of ID and not cached locally if (isinstance(p.value, list)): for el in p.value: - if (isinstance(el, db.Entity) and el.id is None + if (isinstance(el, db.Entity) and self.get_from_remote_missing_cache(el) is not None): return True - if (isinstance(p.value, db.Entity) and p.value.id is None + if (isinstance(p.value, db.Entity) and self.get_from_remote_missing_cache(p.value) is not None): # might be checked when reference is resolved return True @@ -706,13 +706,9 @@ class Crawler(object): # flat contains Entities which could not yet be checked against the remote server while resolved_references and len(flat) > 0: resolved_references = False - print("LSIT") - for ii, el in enumerate(flat): - print(ii, el.id, el.parents[0].name if len(el.parents) > 0 else "") for i in reversed(range(len(flat))): record = flat[i] - print(i, record.id, record.parents[0].name if len(record.parents) > 0 else "") # TODO remove if the exception is never raised if (record.id is not None or record in to_be_inserted): @@ -720,7 +716,6 @@ class Crawler(object): "are removed from the list") # Check whether this record is a duplicate that can be removed elif self.get_from_any_cache(record) is not None: - print("duplicate") # We merge the two in order to prevent loss of information newrecord = self.get_from_any_cache(record) merge_entities(newrecord, record) @@ -728,23 +723,21 @@ class Crawler(object): old=record, new=newrecord, entities=flat+to_be_updated+to_be_inserted) del flat[i] + resolved_references = True - # can we check whether the record(identifiable) exists on the remote server + # can we check whether the record(identifiable) exists on the remote server? elif not self.references_entity_without_id( self.identifiableAdapter.get_identifiable(record)): - print("checked") # TODO: remove deepcopy? identified_record = ( self.identifiableAdapter.retrieve_identified_record_for_record( deepcopy(record))) if identified_record is None: - print("not found") # identifiable does not exist remotely -> record needs to be inserted to_be_inserted.append(record) self.add_to_remote_missing_cache(record) del flat[i] else: - print("found") # side effect record.id = identified_record.id # Copy over checksum and size too if it is a file @@ -756,26 +749,17 @@ class Crawler(object): self.add_to_remote_existing_cache(record) del flat[i] resolved_references = True - elif self.has_missing_object_in_references(record): + + # is it impossible to check this record because an identifiable references a + # missing record? + elif self.has_missing_object_in_references( + self.identifiableAdapter.get_identifiable(record)): to_be_inserted.append(record) self.add_to_remote_missing_cache(record) del flat[i] + resolved_references = True - else: - print("nothing") - print(record) for record in flat: - # TODO: (for review) - # This was the old version, but also for this case the - # check for identifiables has to be done. - # to_be_inserted.append(record) - # self.add_to_remote_existing_cache(record) - # del flat[i] - - # TODO: (for review) - # If the following replacement is not done, the cache will - # be invalid as soon as references are resolved. - # replace references by versions from cache: self.replace_references_with_cached(record) if len(flat) > 0: diff --git a/unittests/test_tool.py b/unittests/test_tool.py index dea1429c..b77d2a90 100755 --- a/unittests/test_tool.py +++ b/unittests/test_tool.py @@ -509,15 +509,18 @@ def test_no_uncached_entity_object_in_references(crawler): })) # one reference with id -> check - assert not crawler.has_missing_object_in_references( - db.Record(name="C").add_parent("C").add_property('d', 123)) + assert not crawler.has_missing_object_in_references(db.Record(name="C") + .add_parent("C").add_property('d', 123)) # one ref with Entity with id -> check assert not crawler.has_missing_object_in_references(db.Record(name="C") - .add_property('d', db.Record(id=123))) + .add_parent("C") + .add_property('d', db.Record(id=123) + .add_parent("C"))) # one ref with id one with Entity with id (mixed) -> check assert not crawler.has_missing_object_in_references(db.Record(name="C").add_parent("D") .add_property('d', 123) - .add_property('b', db.Record(id=123))) + .add_property('b', db.Record(id=123) + .add_parent("C"))) # entity to be referenced in the following a = db.Record(name="C").add_parent("C").add_property("d", 12311) # one ref with id one with Entity without id (but not identifying) -> fail @@ -539,8 +542,9 @@ def test_no_uncached_entity_object_in_references(crawler): def test_references_entities_without_ids(crawler, ident): - assert not crawler.references_entity_without_id( - db.Record().add_parent("Person").add_property('last_name', 123).add_property('first_name', 123)) + assert not crawler.references_entity_without_id(db.Record().add_parent("Person") + .add_property('last_name', 123) + .add_property('first_name', 123)) # id and rec with id assert not crawler.references_entity_without_id(db.Record().add_parent("Person") .add_property('first_name', 123) -- GitLab