diff --git a/CHANGELOG.md b/CHANGELOG.md
index 5abcafd4cfead88ffb82fa019af8b29a85b56ea6..95189b50054033f54054a21388a67ca47c8356ea 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -11,6 +11,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - DateElementConverter: allows to interpret text as a date object
 - the restricted_path argument allows to crawl only a subtree
+- logging that provides a summary of what is inserted and updated
 - You can now access the file system path of a structure element (if it has
   one) using the variable name ``<converter name>.path``
 - ``add_prefix`` and ``remove_prefix`` arguments for the command line interface
diff --git a/integrationtests/test_realworld_example.py b/integrationtests/test_realworld_example.py
index 9dccc2fff7c09a2c9db178a371790c61acdf1eeb..cf8f0416e4c6e93605475740abfd8863d2469801 100644
--- a/integrationtests/test_realworld_example.py
+++ b/integrationtests/test_realworld_example.py
@@ -25,6 +25,7 @@
 an integration test module that runs a test against a (close to) real world
 example
 """
 from caosdb.utils.register_tests import clear_database, set_test_key
+import logging
 import json
 import os
@@ -98,6 +99,7 @@ def create_identifiable_adapter():
 
 
-def test_dataset(clear_database, usemodel, addfiles):
+def test_dataset(clear_database, usemodel, addfiles, caplog):
+    caplog.set_level(logging.DEBUG, logger="caoscrawler")
     identifiable_path = os.path.join(DATADIR, "identifiables.yml")
     crawler_definition_path = os.path.join(DATADIR, "dataset_cfoods.yml")
     crawler_main(
@@ -129,6 +131,11 @@ def test_dataset(clear_database, usemodel, addfiles):
         "start_datetime='2022-02-10T16:36:48+01:00'") == 1
     assert db.execute_query(f"FIND Event WITH latitude=53", unique=True)
 
+    # test logging
+    assert "Executed inserts" in caplog.text
+    assert "Going to insert" in caplog.text
+    assert "Executed updates" in caplog.text
+
 
 def test_event_update(clear_database, usemodel, addfiles):
diff --git a/src/caoscrawler/crawl.py b/src/caoscrawler/crawl.py
index
9a0e6ce560a4ccdc2caf80f4a5cc70736cde22f8..03a1c314a528af4b3802fda1269a36656d995624 100644
--- a/src/caoscrawler/crawl.py
+++ b/src/caoscrawler/crawl.py
@@ -49,6 +49,7 @@ from typing import Any, Optional, Type, Union
 
 import caosdb as db
 
+from caosadvancedtools.utils import create_entity_link
 from caosadvancedtools.cache import UpdateCache, Cache
 from caosadvancedtools.crawler import Crawler as OldCrawler
 from caosdb.apiutils import (compare_entities, EntityMergeConflictError,
@@ -1017,20 +1018,25 @@ class Crawler(object):
                              referencing_entities)
             for record in to_be_updated]
 
         # Merge with existing data to prevent unwanted overwrites
-        to_be_updated = self._merge_properties_from_remote(to_be_updated,
-                                                           identified_records)
+        to_be_updated = self._merge_properties_from_remote(to_be_updated, identified_records)
         # remove unnecessary updates from list by comparing the target records
         # to the existing ones
-        to_be_updated = self.remove_unnecessary_updates(
-            to_be_updated, identified_records)
+        to_be_updated = self.remove_unnecessary_updates(to_be_updated, identified_records)
+        logger.info(f"Going to insert {len(to_be_inserted)} Entities and update "
+                    f"{len(to_be_updated)} Entities.")
         if commit_changes:
             self.execute_parent_updates_in_list(to_be_updated, securityMode=self.securityMode,
                                                 run_id=self.run_id, unique_names=unique_names)
+            logger.info("Added parent RecordTypes where necessary.")
             self.execute_inserts_in_list(
                 to_be_inserted, self.securityMode, self.run_id, unique_names=unique_names)
+            logger.info("Executed inserts:\n"
+                        + self.create_entity_summary(to_be_inserted))
             self.execute_updates_in_list(
                 to_be_updated, self.securityMode, self.run_id, unique_names=unique_names)
+            logger.info("Executed updates:\n"
+                        + self.create_entity_summary(to_be_updated))
 
         update_cache = UpdateCache()
         pending_inserts = update_cache.get_inserts(self.run_id)
@@ -1045,6 +1051,25 @@ class Crawler(object):
 
         return (to_be_inserted, to_be_updated)
 
+    @staticmethod
+    def create_entity_summary(entities: list[db.Entity]) -> str:
+        """Create a summary string representation of a list of entities."""
+        parents = {}
+        for el in entities:
+            for pp in el.parents:
+                if pp.name not in parents:
+                    parents[pp.name] = [el]
+                else:
+                    parents[pp.name].append(el)
+        output = ""
+        for key, value in parents.items():
+            output += f"{key}:\n"
+            for el in value:
+                output += create_entity_link(el) + ", "
+
+            output = output[:-2] + "\n"
+        return output
+
     @staticmethod
     def inform_about_pending_changes(pending_changes, run_id, path, inserts=False):
         # Sending an Email with a link to a form to authorize updates is
diff --git a/unittests/test_tool.py b/unittests/test_tool.py
index af84f8fdbeb7f792a2648b6baab3c0266f803474..e15d7cb777ced4b92566df2b25b375e90be39295 100755
--- a/unittests/test_tool.py
+++ b/unittests/test_tool.py
@@ -982,3 +982,20 @@ def test_deprecated_prefix_option():
                      remove_prefix="to/be/removed")
 
     assert "(deprecated) `prefix` and the `remove_prefix`" in str(ve.value)
+
+
+def test_create_entity_summary():
+    assert "" == Crawler.create_entity_summary([]).strip()
+
+    entities = [
+        db.Record(id=1).add_parent("A"),
+        db.Record(id=4, name='a').add_parent("B"),
+        db.Record(id=5).add_parent("A"),
+        db.Record(id=6, name='b').add_parent("B"),
+    ]
+    text = Crawler.create_entity_summary(entities).strip()
+    assert 'a' in text
+    assert 'b' in text
+    assert 'A:' in text
+    assert 'B:' in text
+    assert "<a href='/Entity/4'>a</a>, <a href='/Entity/6'>b</a>" in text