diff --git a/src/caoscrawler/crawl.py b/src/caoscrawler/crawl.py
index 7b9119caa1cd4dd4623a9141de4a70abb4da5946..6d1c9e951e5589d6695319dc07c19fb5c6cdc77c 100644
--- a/src/caoscrawler/crawl.py
+++ b/src/caoscrawler/crawl.py
@@ -90,6 +90,10 @@ yaml.SafeLoader.add_constructor("!defmacro", defmacro_constructor)
 yaml.SafeLoader.add_constructor("!macro", macro_constructor)
 
 
+class ForbiddenTransaction(Exception):
+    pass
+
+
 def check_identical(record1: db.Entity, record2: db.Entity, ignore_id=False):
     """Check whether two entities are identical.
 
@@ -815,10 +819,22 @@ class Crawler(object):
         update_cache = UpdateCache()
         update_cache.insert(to_be_updated, run_id)
 
+    @staticmethod
+    def check_whether_parent_exists(records: list[db.Entity], parents: list[str]):
+        """Return a list of all records in ``records`` that have a parent whose name is in ``parents``."""
+        problems = []
+        for rec in records:
+            for parent in rec.parents:
+                if parent.name in parents:
+                    problems.append(rec)
+        return problems
+
     def synchronize(self,
                     commit_changes: bool = True,
                     unique_names: bool = True,
                     crawled_data: Optional[list[db.Record]] = None,
+                    no_insert_RTs: Optional[list[str]] = None,
+                    no_update_RTs: Optional[list[str]] = None,
                     ):
         """
         This function applies several stages:
@@ -832,6 +848,11 @@
             if commit_changes is True, the changes are synchronized to the CaosDB server.
             For debugging in can be useful to set this to False.
 
+        no_insert_RTs : list[str], optional
+            Records with a parent of one of these RecordTypes are not inserted.
+        no_update_RTs : list[str], optional
+            Records with a parent of one of these RecordTypes are not updated.
+
         Return the final to_be_inserted and to_be_updated as tuple.
         """
         if crawled_data is None:
@@ -858,6 +879,15 @@
         # to the existing ones
         to_be_updated = self.remove_unnecessary_updates(to_be_updated, identified_records)
 
+        ins_problems = self.check_whether_parent_exists(to_be_inserted, no_insert_RTs or [])
+        upd_problems = self.check_whether_parent_exists(to_be_updated, no_update_RTs or [])
+        if len(ins_problems) > 0 or len(upd_problems) > 0:
+            raise ForbiddenTransaction(
+                "One or more Records have a parent which is excluded from inserts or updates.\n"
+                f"The following Records cannot be inserted due to a parent:\n{ins_problems}\n"
+                f"The following Records cannot be updated due to a parent:\n{upd_problems}"
+            )
+
         logger.info(f"Going to insert {len(to_be_inserted)} Entities and update "
                     f"{len(to_be_updated)} Entities.")
         if commit_changes:
@@ -1206,6 +1236,10 @@ def crawler_main(crawled_directory_path: str,
                             crawler.run_id)
         _update_status_record(crawler.run_id, len(inserts), len(updates), status="OK")
         return 0
+    except ForbiddenTransaction as err:
+        logger.error(err)
+        _update_status_record(crawler.run_id, 0, 0, status="FAILED")
+        return 1
     except ConverterValidationError as err:
         logger.error(err)
         _update_status_record(crawler.run_id, 0, 0, status="FAILED")
diff --git a/unittests/test_crawler.py b/unittests/test_crawler.py
index f3ad73c5d75acea5fd3e92954e3899983ea73a2a..d8baa623184608b4ddc863e7c1250cfc0c5894ab 100644
--- a/unittests/test_crawler.py
+++ b/unittests/test_crawler.py
@@ -74,3 +74,25 @@ def test_deprecated_functions():
         cr.crawled_data
     assert issubclass(w[-1].category, DeprecationWarning)
     assert "The use of self.crawled_data is depricated" in str(w[-1].message)
+
+
+def test_check_whether_parent_exists():
+    trivial_result = Crawler.check_whether_parent_exists([], [])
+    assert len(trivial_result) == 0
+    assert isinstance(trivial_result, list)
+
+    trivial_result2 = Crawler.check_whether_parent_exists([db.Record(), db.Record()], [])
+    assert len(trivial_result2) == 0
+    assert isinstance(trivial_result2, list)
+
+    # make sure that records with a matching parent are collected
+    a_recs = Crawler.check_whether_parent_exists(
+        [
+            db.Record(id=1).add_parent("A"),
+            db.Record(id=2).add_parent("B"),
+            db.Record(id=3).add_parent("B"),
+            db.Record(id=4).add_parent("A"),
+        ], ["A"])
+    a_recs_ids = [el.id for el in a_recs]
+    assert 1 in a_recs_ids
+    assert 4 in a_recs_ids
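A minimal usage sketch of the new arguments (not part of the patch; the RecordType names "Person" and "Sample", the default-constructed Crawler, and the assumption of a reachable CaosDB instance are illustrative, not taken from the diff):

    import caosdb as db
    from caoscrawler.crawl import Crawler, ForbiddenTransaction

    # The static check needs no server connection: only Records whose parent
    # name appears in the exclusion list are returned.
    recs = [db.Record(id=1).add_parent("Person"),
            db.Record(id=2).add_parent("Sample")]
    assert [r.id for r in Crawler.check_whether_parent_exists(recs, ["Person"])] == [1]

    # During synchronization the same check guards the transaction; against a
    # running CaosDB instance this would abort before any insert or update.
    crawler = Crawler()
    try:
        crawler.synchronize(crawled_data=recs,
                            no_insert_RTs=["Person"],
                            no_update_RTs=["Sample"])
    except ForbiddenTransaction as err:
        print(err)  # the message lists the offending Records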