diff --git a/CHANGELOG.md b/CHANGELOG.md index af63b9a1a82e0d12b9148ac979684f6e2ff546ff..0c68188c6cc140fa49c6cb4b8f1f58189b45f8c9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,6 +10,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Added ### - DateElementConverter: allows to interpret text as a date object - the restricted_path argument allows to crawl only a subtree +- logging that provides a summary of what is inserted and updated - You can now access the file system path of a structure element (if it has one) using the variable name ``<converter name>.path`` diff --git a/integrationtests/test_realworld_example.py b/integrationtests/test_realworld_example.py index 4158ed22278ef5c871a22d45885e58fbfa84ea3b..3e4bec3f5465d176b03792253b2e776a48acf1e7 100644 --- a/integrationtests/test_realworld_example.py +++ b/integrationtests/test_realworld_example.py @@ -25,6 +25,7 @@ an integration test module that runs a test against a (close to) real world example """ from caosdb.utils.register_tests import clear_database, set_test_key +import logging import json import os @@ -85,7 +86,8 @@ def create_identifiable_adapter(): return ident -def test_dataset(clear_database, usemodel): +def test_dataset(clear_database, usemodel, caplog): + caplog.set_level(logging.DEBUG, logger="caoscrawler") ident = create_identifiable_adapter() crawler = Crawler(identifiableAdapter=ident) crawler_definition = crawler.load_definition( @@ -119,6 +121,11 @@ def test_dataset(clear_database, usemodel): "start_datetime='2022-02-10T16:36:48+01:00'") == 1 assert db.execute_query(f"FIND Event WITH latitude=53", unique=True) + # test logging + assert "Executed inserts" in caplog.text + assert "Going to insert" in caplog.text + assert "Executed updates" in caplog.text + def test_event_update(clear_database, usemodel): diff --git a/src/caoscrawler/crawl.py b/src/caoscrawler/crawl.py index 
fe6c63ff7af86a76fd6649323c96b8977b9fcc2f..18d2fcb368935abd6cd51acdc9f05fbc27fd46e0 100644 --- a/src/caoscrawler/crawl.py +++ b/src/caoscrawler/crawl.py @@ -49,6 +49,7 @@ from typing import Any, Optional, Type, Union import caosdb as db +from caosadvancedtools.utils import create_entity_link from caosadvancedtools.cache import UpdateCache, Cache from caosadvancedtools.crawler import Crawler as OldCrawler from caosdb.apiutils import (compare_entities, EntityMergeConflictError, @@ -1017,20 +1018,25 @@ class Crawler(object): referencing_entities) for record in to_be_updated] # Merge with existing data to prevent unwanted overwrites - to_be_updated = self._merge_properties_from_remote(to_be_updated, - identified_records) + to_be_updated = self._merge_properties_from_remote(to_be_updated, identified_records) # remove unnecessary updates from list by comparing the target records # to the existing ones - to_be_updated = self.remove_unnecessary_updates( - to_be_updated, identified_records) + to_be_updated = self.remove_unnecessary_updates(to_be_updated, identified_records) + logger.info(f"Going to insert {len(to_be_inserted)} Entities and update " + f"{len(to_be_updated)} Entities.") if commit_changes: self.execute_parent_updates_in_list(to_be_updated, securityMode=self.securityMode, run_id=self.run_id, unique_names=unique_names) + logger.info(f"Added parent RecordTypes where necessary.") self.execute_inserts_in_list( to_be_inserted, self.securityMode, self.run_id, unique_names=unique_names) + logger.info(f"Executed inserts:\n" + + self.create_entity_summary(to_be_inserted)) self.execute_updates_in_list( to_be_updated, self.securityMode, self.run_id, unique_names=unique_names) + logger.info(f"Executed updates:\n" + + self.create_entity_summary(to_be_updated)) update_cache = UpdateCache() pending_inserts = update_cache.get_inserts(self.run_id) @@ -1045,6 +1051,25 @@ class Crawler(object): return (to_be_inserted, to_be_updated) + @staticmethod + def 
create_entity_summary(entities: list[db.Entity]): + """ Creates a summary string representation of a list of entities.""" + parents = {} + for el in entities: + for pp in el.parents: + if pp.name not in parents: + parents[pp.name] = [el] + else: + parents[pp.name].append(el) + output = "" + for key, value in parents.items(): + output += f"{key}:\n" + for el in value: + output += create_entity_link(el) + ", " + + output = output[:-2] + "\n" + return output + @staticmethod def inform_about_pending_changes(pending_changes, run_id, path, inserts=False): # Sending an Email with a link to a form to authorize updates is diff --git a/unittests/test_tool.py b/unittests/test_tool.py index 4ac2b4577fbeea6f4bdf291c48ddaf0fa418b2a5..ca9d074e4a65b83d02347c2f7773e8a9b963886c 100755 --- a/unittests/test_tool.py +++ b/unittests/test_tool.py @@ -967,3 +967,20 @@ def test_split_restricted_path(): assert ["el"] == split_restricted_path("/el") assert ["el"] == split_restricted_path("/el/") assert ["el", "el"] == split_restricted_path("/el/el") + + +def test_create_entity_summary(): + assert "" == Crawler.create_entity_summary([]).strip() + + entities = [ + db.Record(id=1).add_parent("A"), + db.Record(id=4, name='a').add_parent("B"), + db.Record(id=5).add_parent("A"), + db.Record(id=6, name='b').add_parent("B"), + ] + text = Crawler.create_entity_summary(entities).strip() + assert 'a' in text + assert 'b' in text + assert 'A:' in text + assert 'B:' in text + assert "<a href='/Entity/4'>a</a>, <a href='/Entity/6'>b</a>" in text