Skip to content
Snippets Groups Projects
Commit 707788bc authored by Florian Spreckelsen's avatar Florian Spreckelsen
Browse files

Merge branch 'f-check-parents' into 'dev'

ENH: Allow to prevent inserts and updates for Records that have certain parents.

See merge request !116
parents 6d9f75d6 295e49fc
Branches
Tags
2 merge requests!123REL: Release v0.6.0,!116ENH: Allow to prevent inserts and updates for Records that have certain parents.
Pipeline #37025 passed
......@@ -15,6 +15,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- Creation of CrawlerRun Records that contain status information about data
integration of the crawler if the `pycaosdb.ini` contains a `[caoscrawler]`
with `create_crawler_status_records=True`.
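- The Crawler `synchronize` function now takes two optional lists of
RecordType names (`no_insert_RTs` and `no_update_RTs`). If a Record that has
one of the given RecordTypes as parent would be inserted or updated,
respectively, a `ForbiddenTransaction` is raised instead.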
### Changed ###
......@@ -23,8 +26,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
### Removed ###
### Fixed ###
- usage of ID when looking for identified records
- Query generation when there are only backrefs or backrefs and a name
- usage of ID when looking for identified records
### Security ###
......
......@@ -92,6 +92,10 @@ yaml.SafeLoader.add_constructor("!defmacro", defmacro_constructor)
yaml.SafeLoader.add_constructor("!macro", macro_constructor)
class ForbiddenTransaction(Exception):
    """Signal that a synchronization would insert or update a forbidden Record.

    Raised by ``Crawler.synchronize`` when a Record scheduled for insertion or
    update has a parent that is listed in ``no_insert_RTs`` or
    ``no_update_RTs``, respectively.
    """
def check_identical(record1: db.Entity, record2: db.Entity, ignore_id=False):
"""Check whether two entities are identical.
......@@ -805,10 +809,22 @@ class Crawler(object):
update_cache = UpdateCache()
update_cache.insert(to_be_updated, run_id)
@staticmethod
def check_whether_parent_exists(records: list[db.Entity], parents: list[str]):
""" returns a list of all records in `records` that have a parent that is in `parents`"""
problems = []
for rec in records:
for parent in rec.parents:
if parent.name in parents:
problems.append(rec)
return problems
def synchronize(self,
commit_changes: bool = True,
unique_names: bool = True,
crawled_data: Optional[list[db.Record]] = None,
no_insert_RTs: Optional[list[str]] = None,
no_update_RTs: Optional[list[str]] = None,
):
"""
This function applies several stages:
......@@ -822,7 +838,19 @@ class Crawler(object):
if commit_changes is True, the changes are synchronized to the CaosDB server.
For debugging in can be useful to set this to False.
Return the final to_be_inserted and to_be_updated as tuple.
Parameters
----------
no_insert_RTs : list[str], optional
list of RecordType names. Records that have one of those RecordTypes
as parent will not be inserted
no_update_RTs : list[str], optional
list of RecordType names. Records that have one of those RecordTypes
as parent will not be updated
Returns
-------
inserts and updates
the final to_be_inserted and to_be_updated as tuple.
"""
if crawled_data is None:
warnings.warn(DeprecationWarning(
......@@ -837,16 +865,35 @@ class Crawler(object):
# all entity objects are replaced by their IDs except for the not yet inserted ones
self.replace_entities_with_ids(el)
identified_records = [
self.identifiableAdapter.retrieve_identified_record_for_record(record,
referencing_entities)
for record in to_be_updated]
identified_records = []
for record in to_be_updated:
if record.id is not None:
# TODO: use cache here?
identified_records.append(cached_get_entity_by(eid=record.id))
else:
raise Exception("Please report a bug: At this stage all records to be updated"
" should have an ID")
# Merge with existing data to prevent unwanted overwrites
to_be_updated = self._merge_properties_from_remote(to_be_updated, identified_records)
# remove unnecessary updates from list by comparing the target records
# to the existing ones
to_be_updated = self.remove_unnecessary_updates(to_be_updated, identified_records)
if no_insert_RTs:
ins_problems = self.check_whether_parent_exists(to_be_inserted, no_insert_RTs)
else:
ins_problems = []
if no_update_RTs:
upd_problems = self.check_whether_parent_exists(to_be_updated, no_update_RTs)
else:
upd_problems = []
if len(ins_problems) > 0 or len(upd_problems) > 0:
raise ForbiddenTransaction(
"One or more Records that have a parent which is excluded from inserts or updates."
f"\nRecords excluded from inserts have the following RecordTypes:\n{[el.parents[0].name for el in ins_problems]}"
f"\nRecords excluded from updates have the following RecordTypes:\n{[el.parents[0].name for el in upd_problems]}"
)
logger.info(f"Going to insert {len(to_be_inserted)} Entities and update "
f"{len(to_be_updated)} Entities.")
if commit_changes:
......@@ -1196,6 +1243,10 @@ def crawler_main(crawled_directory_path: str,
crawler.run_id)
_update_status_record(crawler.run_id, len(inserts), len(updates), status="OK")
return 0
except ForbiddenTransaction as err:
logger.error(err)
_update_status_record(crawler.run_id, 0, 0, status="FAILED")
return 1
except ConverterValidationError as err:
logger.error(err)
_update_status_record(crawler.run_id, 0, 0, status="FAILED")
......
......@@ -150,6 +150,28 @@ def test_deprecated_functions():
assert "The use of self.crawled_data is depricated" in str(w[-1].message)
def test_check_whether_parent_exists():
    """Check that ``Crawler.check_whether_parent_exists`` only collects
    records that have one of the given parents."""
    # no records and no parent names: empty list
    trivial_result = Crawler.check_whether_parent_exists([], [])
    assert len(trivial_result) == 0
    assert isinstance(trivial_result, list)
    # records without parents and no parent names: still an empty list
    trivial_result2 = Crawler.check_whether_parent_exists([db.Record(), db.Record()], [])
    assert len(trivial_result2) == 0
    assert isinstance(trivial_result2, list)
    # make sure records with parent is collected
    a_recs = Crawler.check_whether_parent_exists(
        [
            db.Record(id=1).add_parent("A"),
            db.Record(id=2).add_parent("B"),
            db.Record(id=3).add_parent("B"),
            db.Record(id=4).add_parent("A"),
        ], ["A"])
    a_recs_ids = [el.id for el in a_recs]
    assert 1 in a_recs_ids
    assert 4 in a_recs_ids
    # records whose only parent is not in the list must not be collected
    assert 2 not in a_recs_ids
    assert 3 not in a_recs_ids
def test_remove_unnecessary_updates():
# test trvial case
upl = [db.Record().add_parent("A")]
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment