diff --git a/CHANGELOG.md b/CHANGELOG.md index 7df6439d5ee38b236a0731cf5ca09b82c7fcf002..6aa83aca68ca2c2daa97c88784aec4987a817605 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,6 +10,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Added ### - DateElementConverter: allows to interpret text as a date object - the restricted_path argument allows to crawl only a subtree +- logging that provides a summary of what is inserted and updated ### Changed ### diff --git a/src/caoscrawler/crawl.py b/src/caoscrawler/crawl.py index fe6c63ff7af86a76fd6649323c96b8977b9fcc2f..6911e2bf0ba90e85d6bfdbd70effdcda01e79426 100644 --- a/src/caoscrawler/crawl.py +++ b/src/caoscrawler/crawl.py @@ -1017,20 +1017,25 @@ class Crawler(object): referencing_entities) for record in to_be_updated] # Merge with existing data to prevent unwanted overwrites - to_be_updated = self._merge_properties_from_remote(to_be_updated, - identified_records) + to_be_updated = self._merge_properties_from_remote(to_be_updated, identified_records) # remove unnecessary updates from list by comparing the target records # to the existing ones - to_be_updated = self.remove_unnecessary_updates( - to_be_updated, identified_records) + to_be_updated = self.remove_unnecessary_updates(to_be_updated, identified_records) + logger.info(f"Going to insert {len(to_be_inserted)} Entities:\n" + + self.create_entity_summary(to_be_inserted)) + logger.info(f"Going to update {len(to_be_inserted)} Entities:\n" + + self.create_entity_summary(to_be_updated)) if commit_changes: self.execute_parent_updates_in_list(to_be_updated, securityMode=self.securityMode, run_id=self.run_id, unique_names=unique_names) + logger.info(f"Added parent RecordTypes where necessary.") self.execute_inserts_in_list( to_be_inserted, self.securityMode, self.run_id, unique_names=unique_names) + logger.info(f"Executed inserts.") self.execute_updates_in_list( to_be_updated, self.securityMode, self.run_id, 
@staticmethod
def create_entity_summary(entities: list[db.Entity]):
    """Create a summary string representation of a list of entities.

    The entities are grouped by the names of their parents. An entity with
    several parents is listed once under each of them; an entity without
    any parent does not appear in the summary.

    Parameters
    ----------
    entities : list[db.Entity]
        The entities to summarize. Only the ``parents`` attribute (iterated,
        reading ``name`` of each parent) and the ``id`` attribute of each
        entity are used.

    Returns
    -------
    str
        One section per parent name, in order of first occurrence: the name
        followed by a colon on its own line, then the comma-separated entity
        links on the next line. An empty string if no entity has a parent.
    """
    ids_by_parent = {}
    for el in entities:
        for pp in el.parents:
            # setdefault creates the list on first sight of a parent AND
            # still records the current entity; creating the list in one
            # branch and appending only in the other loses the first entity
            # of every parent.
            ids_by_parent.setdefault(pp.name, []).append(el.id)

    sections = []
    for parent_name, entity_ids in ids_by_parent.items():
        # NOTE(review): the entity id (not the entity) is handed to
        # create_entity_link, as before -- confirm this matches its signature.
        links = ", ".join(create_entity_link(entity_id) for entity_id in entity_ids)
        sections.append(f"{parent_name}:\n{links}\n")
    return "".join(sections)