Commit dec8858e authored by Henrik tom Wörden

Merge branch 'f-fix-crawler-overwrite' into 'dev'

F fix crawler overwrite

See merge request !61
parents 8b1b03ae f109a398
2 merge requests: !71 REL: Release v0.2.0, !61 F fix crawler overwrite
Pipeline #30076 passed
@@ -18,8 +18,12 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 ### Removed ###
 ### Fixed ###
-* FIX: https://gitlab.com/caosdb/caosdb-crawler/-/issues/31 Identified cache: Hash is the same for Records without IDs
-* FIX: https://gitlab.com/caosdb/caosdb-crawler/-/issues/30
+* [#31](https://gitlab.com/caosdb/caosdb-crawler/-/issues/31) Identified cache:
+  Hash is the same for Records without IDs
+* [#30](https://gitlab.com/caosdb/caosdb-crawler/-/issues/30)
+* [#23](https://gitlab.com/caosdb/caosdb-crawler/-/issues/23) Crawler may
+  overwrite and delete existing data in case of manually added properties
 ### Security ###
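The #23 entry is the substantive fix in this merge: previously, an update built from crawled records could overwrite the server state wholesale and drop properties that had been added manually on the server. A minimal sketch of the merge semantics the fix relies on, using `merge_entities` from caosdb-pylib as in the diff below; the record names and values are hypothetical:

```python
from copy import deepcopy

import caosdb as db
from caosdb.apiutils import merge_entities

# State on the server: a record with a manually added "comment" property.
remote = db.Record(name="measurement_2020-01-03").add_parent(name="Measurement")
remote.add_property(name="temperature", value=25)
remote.add_property(name="comment", value="manually added note")

# State reconstructed by the crawler; it knows nothing about "comment".
crawled = db.Record(name="measurement_2020-01-03").add_parent(name="Measurement")
crawled.add_property(name="temperature", value=26)

# Merge the crawled state into a copy of the remote state: properties present
# in both take the crawled value, server-only properties survive.
merged = deepcopy(remote)
merge_entities(merged, crawled, force=True)

assert merged.get_property("temperature").value == 26
assert merged.get_property("comment") is not None  # no longer dropped
```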
@@ -108,7 +108,7 @@ def crawler_extended(ident):
     cr = Crawler(debug=True, identifiableAdapter=ident)
     crawl_standard_test_directory(cr, cfood="scifolder_extended.yml")
     # correct paths for current working directory
-    file_list = [r for r in cr.target_data if r.role == "File"]
+    file_list = [r for r in cr.crawled_data if r.role == "File"]
     for f in file_list:
         f.file = rfp("..", "..", "unittests", "test_directories", f.file)
     return cr
@@ -160,7 +160,7 @@ def test_insertion(clear_database, usemodel, ident, crawler):
     # Do a second run on the same data, there should be a new insert:
     cr = Crawler(debug=True, identifiableAdapter=ident)
     crawl_standard_test_directory(cr, "example_insert")
-    assert len(cr.target_data) == 3
+    assert len(cr.crawled_data) == 3
     ins, ups = cr.synchronize()
     assert len(ins) == 1
     assert len(ups) == 0
@@ -168,7 +168,7 @@ def test_insertion(clear_database, usemodel, ident, crawler):
     # Do it again to check whether nothing is changed:
     cr = Crawler(debug=True, identifiableAdapter=ident)
     crawl_standard_test_directory(cr, "example_insert")
-    assert len(cr.target_data) == 3
+    assert len(cr.crawled_data) == 3
     ins, ups = cr.synchronize()
     assert len(ins) == 0
     assert len(ups) == 0
@@ -180,7 +180,7 @@ def test_insert_auth(clear_database, usemodel, ident, crawler):
     # Do a second run on the same data, there should be a new insert:
     cr = Crawler(debug=True, identifiableAdapter=ident, securityMode=SecurityMode.RETRIEVE)
     crawl_standard_test_directory(cr, "example_insert")
-    assert len(cr.target_data) == 3
+    assert len(cr.crawled_data) == 3
     ins, ups = cr.synchronize()
     assert len(ins) == 1
     assert not ins[0].is_valid()
@@ -190,7 +190,7 @@ def test_insert_auth(clear_database, usemodel, ident, crawler):
     # Do it again to check whether nothing is changed:
     cr = Crawler(debug=True, identifiableAdapter=ident)
     crawl_standard_test_directory(cr, "example_insert")
-    assert len(cr.target_data) == 3
+    assert len(cr.crawled_data) == 3
     ins, ups = cr.synchronize()
     assert len(ins) == 0
     assert len(ups) == 0
@@ -205,9 +205,9 @@ def test_insertion_and_update(clear_database, usemodel, ident, crawler):
     cr = Crawler(debug=True, identifiableAdapter=ident)
     crawl_standard_test_directory(cr, "example_overwrite_1")
-    # print(cr.target_data)
+    # print(cr.crawled_data)
     # cr.save_debug_data(rfp("provenance.yml"))
-    assert len(cr.target_data) == 3
+    assert len(cr.crawled_data) == 3
     ins, ups = cr.synchronize()
     assert len(ins) == 0
     assert len(ups) == 1
@@ -222,7 +222,7 @@ def test_identifiable_update(clear_database, usemodel, ident, crawler):
     crawl_standard_test_directory(cr)
     # Test the addition of a single property:
-    l = cr.target_data
+    l = cr.crawled_data
     for record in l:
         if (record.parents[0].name == "Measurement" and
                 record.get_property("date").value == "2020-01-03"):
@@ -238,7 +238,7 @@ def test_identifiable_update(clear_database, usemodel, ident, crawler):
     # Test the change within one property:
     cr = Crawler(debug=True, identifiableAdapter=ident)
     crawl_standard_test_directory(cr)
-    l = cr.target_data
+    l = cr.crawled_data
     for record in l:
         if (record.parents[0].name == "Measurement" and
                 record.get_property("date").value == "2020-01-03"):
@@ -252,7 +252,7 @@ def test_identifiable_update(clear_database, usemodel, ident, crawler):
     # Changing the date should result in a new insertion:
     cr = Crawler(debug=True, identifiableAdapter=ident)
     crawl_standard_test_directory(cr)
-    l = cr.target_data
+    l = cr.crawled_data
     for record in l:
         if (record.parents[0].name == "Measurement" and
                 record.get_property("date").value == "2020-01-03"):
@@ -269,7 +269,7 @@ def test_file_insertion_dry(clear_database, usemodel, ident):
     crawler_extended = Crawler(debug=True, identifiableAdapter=ident)
     crawl_standard_test_directory(
         crawler_extended, cfood="scifolder_extended.yml")
-    file_list = [r for r in crawler_extended.target_data if r.role == "File"]
+    file_list = [r for r in crawler_extended.crawled_data if r.role == "File"]
     assert len(file_list) == 11
     for f in file_list:
@@ -305,7 +305,7 @@ def test_file_update(clear_database, usemodel, ident, crawler_extended):
     cr = Crawler(debug=True, identifiableAdapter=ident)
     crawl_standard_test_directory(cr, cfood="scifolder_extended.yml")
-    file_list = [r for r in cr.target_data if r.role == "File"]
+    file_list = [r for r in cr.crawled_data if r.role == "File"]
     for f in file_list:
         f.file = rfp("..", "..", "unittests", "test_directories", f.file)
     ins2, ups2 = cr.synchronize(commit_changes=True)
@@ -320,7 +320,7 @@ def test_file_update(clear_database, usemodel, ident, crawler_extended):
     cr2 = Crawler(debug=True, identifiableAdapter=ident)
     crawl_standard_test_directory(cr2, cfood="scifolder_extended2.yml")
-    file_list = [r for r in cr2.target_data if r.role == "File"]
+    file_list = [r for r in cr2.crawled_data if r.role == "File"]
     for f in file_list:
         f.file = rfp("..", "..", "unittests", "test_directories", f.file)
     ins3, ups3 = cr2.synchronize(commit_changes=True)
@@ -30,11 +30,12 @@ def clear_database():
     db.execute_query("FIND ENTITY").delete(raise_exception_on_error=False)

-@mark.xfail(reason="See issue https://gitlab.com/caosdb/caosdb-crawler/-/issues/23")
 def test_issue_23(clear_database):
     """Test that an update leaves existing properties that were not found by
     the crawler unchanged.

     See issue https://gitlab.com/caosdb/caosdb-crawler/-/issues/23
     """
     # insert a simplistic model and a record of type TestType with identifying
@@ -20,7 +20,7 @@ packages = find:
 python_requires = >=3.8
 install_requires =
     importlib-resources
-    caosdb
+    caosdb > 0.9.0
     caosadvancedtools >= 0.6.0
     yaml-header-tools >= 0.2.1
     pyyaml
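The new version floor on caosdb is presumably tied to the reworked update logic (e.g. the `merge_entities(..., force=True)` call in the diff below). An illustrative way to verify the installed client, not part of this repository:

```python
from pkg_resources import get_distribution, parse_version

# Check the installed caosdb-pylib against the new minimum version.
installed = parse_version(get_distribution("caosdb").version)
assert installed > parse_version("0.9.0"), \
    "caosdb-crawler's update logic requires caosdb > 0.9.0"
```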
@@ -113,7 +113,7 @@ def check_identical(record1: db.Entity, record2: db.Entity, ignore_id=False):
             return False
     for attribute in ("datatype", "importance", "unit"):
         # only make an update for those attributes if there is a value difference and
-        # the value in the target_data is not None
+        # the value in the crawled_data is not None
         if attribute in comp[0]["properties"][key]:
             attr_val = comp[0]["properties"][key][attribute]
             other_attr_val = (comp[1]["properties"][key][attribute]
@@ -447,7 +447,7 @@ class Crawler(object):
         Returns
         -------
-        target_data : list
+        crawled_data : list
             the final list with the target state of Records.
         """
@@ -463,14 +463,14 @@ class Crawler(object):
         local_converters = Crawler.initialize_converters(
             crawler_definition, converter_registry)

         # This recursive crawling procedure generates the update list:
-        self.target_data: List[db.Record] = []
+        self.crawled_data: List[db.Record] = []
         self._crawl(items, local_converters, self.generalStore,
                     self.recordStore, [], [])

         if self.debug:
             self.debug_converters = local_converters

-        return self.target_data
+        return self.crawled_data

     def synchronize(self, commit_changes: bool = True, unique_names=True):
         """
@@ -480,7 +480,7 @@ class Crawler(object):
         # After the crawling, the actual synchronization with the database, based on the
         # update list is carried out:
-        return self._synchronize(self.target_data, commit_changes, unique_names=unique_names)
+        return self._synchronize(self.crawled_data, commit_changes, unique_names=unique_names)

     def has_reference_value_without_id(self, record: db.Record):
         """
@@ -780,28 +780,70 @@ class Crawler(object):
                     el.value[index] = val.id

     @staticmethod
-    def remove_unnecessary_updates(target_data: List[db.Record],
-                                   identified_records: List[db.Record]):
+    def _merge_properties_from_remote(
+            crawled_data: List[db.Record],
+            identified_records: List[db.Record]
+    ):
+        """Merge the entity representation that was created by crawling the data with the
+        remotely found identified records, such that new properties and property values are
+        updated correctly but additional properties are not overwritten.
+
+        Parameters
+        ----------
+        crawled_data : list[db.Record]
+            List of the Entities created by the crawler
+        identified_records : list[db.Record]
+            List of identified remote Records
+
+        Returns
+        -------
+        to_be_updated : list[db.Record]
+            List of merged records
         """
-        checks whether all relevant attributes (especially Property values) are equal
+        to_be_updated = []
+        for target, identified in zip(crawled_data, identified_records):
+            # Special treatment for name and description in case they have been
+            # set in the server independently from the crawler
+            for attr in ["name", "description"]:
+                if getattr(target, attr) is None:
+                    # The crawler didn't find any name or description, i.e., not
+                    # even an empty one. In this case (and only in this case),
+                    # keep any existing name or description.
+                    setattr(target, attr, getattr(identified, attr))
+            # Create a temporary copy since the merge will be conducted in place
+            tmp = deepcopy(identified)
+            # A force merge will overwrite any properties that both the
+            # identified and the crawled record have with the values of the
+            # crawled record while keeping existing properties intact.
+            merge_entities(tmp, target, force=True)
+            to_be_updated.append(tmp)
+
+        return to_be_updated

-        Returns (in future)
+    @staticmethod
+    def remove_unnecessary_updates(
+            crawled_data: List[db.Record],
+            identified_records: List[db.Record]
+    ):
+        """Compare the Records to be updated with their remote
+        counterparts. Only update if there are actual differences.
+
+        Returns
         -------
         update list without unnecessary updates
         """
-        if len(target_data) != len(identified_records):
+        if len(crawled_data) != len(identified_records):
             raise RuntimeError("The lists of updates and of identified records need to be of the "
                                "same length!")
-        # TODO this can now easily be changed to a function without side effect
-        for i in reversed(range(len(target_data))):
-            identical = check_identical(target_data[i], identified_records[i])
-
-            if identical:
-                del target_data[i]
-                continue
-            else:
-                pass
+        actual_updates = []
+        for i in reversed(range(len(crawled_data))):
+            if not check_identical(crawled_data[i], identified_records[i]):
+                actual_updates.append(crawled_data[i])
+
+        return actual_updates

     @staticmethod
     def execute_parent_updates_in_list(to_be_updated, securityMode, run_id, unique_names):
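Taken together, the two static methods give the update path merge-then-diff semantics. A small sketch of the intended behavior; calling the private `_merge_properties_from_remote` directly is for illustration only, and the record contents are hypothetical:

```python
import caosdb as db
from caoscrawler.crawl import Crawler  # import path assumed

# One crawled record and its identified remote counterpart.
crawled = [db.Record(name="r1").add_parent(name="A")
           .add_property(name="temperature", value=26)]
remote = [db.Record(name="r1").add_parent(name="A")
          .add_property(name="temperature", value=25)
          .add_property(name="comment", value="added by hand")]

# Step 1: pull remote-only properties into the update candidates.
to_be_updated = Crawler._merge_properties_from_remote(crawled, remote)
assert to_be_updated[0].get_property("comment") is not None

# Step 2: keep only updates that actually change something on the server.
to_be_updated = Crawler.remove_unnecessary_updates(to_be_updated, remote)
assert len(to_be_updated) == 1  # temperature 25 -> 26 is a real change
```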
@@ -894,12 +936,12 @@ class Crawler(object):
             update_cache = UpdateCache()
             update_cache.insert(to_be_updated, run_id)

-    def _synchronize(self, target_data: List[db.Record], commit_changes: bool = True,
+    def _synchronize(self, crawled_data: List[db.Record], commit_changes: bool = True,
                      unique_names=True):
         """
         This function applies several stages:
-        1) Retrieve identifiables for all records in target_data.
-        2) Compare target_data with existing records.
+        1) Retrieve identifiables for all records in crawled_data.
+        2) Compare crawled_data with existing records.
         3) Insert and update records based on the set of identified differences.

         This function makes use of an IdentifiableAdapter which is used to retrieve
@@ -914,8 +956,7 @@ class Crawler(object):
         if self.identifiableAdapter is None:
             raise RuntimeError("Should not happen.")

-        to_be_inserted, to_be_updated = self.split_into_inserts_and_updates(
-            target_data)
+        to_be_inserted, to_be_updated = self.split_into_inserts_and_updates(crawled_data)

         # TODO: refactoring of typo
         for el in to_be_updated:
@@ -926,8 +967,13 @@ class Crawler(object):
             self.identifiableAdapter.retrieve_identified_record_for_record(
                 record)
             for record in to_be_updated]
-        # remove unnecessary updates from list by comparing the target records to the existing ones
-        self.remove_unnecessary_updates(to_be_updated, identified_records)
+        # Merge with existing data to prevent unwanted overwrites
+        to_be_updated = self._merge_properties_from_remote(to_be_updated,
+                                                           identified_records)
+        # remove unnecessary updates from list by comparing the target records
+        # to the existing ones
+        to_be_updated = self.remove_unnecessary_updates(
+            to_be_updated, identified_records)

         if commit_changes:
             self.execute_parent_updates_in_list(to_be_updated, securityMode=self.securityMode,
@@ -1086,7 +1132,7 @@ ____________________\n""".format(i + 1, len(pending_changes)) + str(el[3]))
             # to the general update container.
             scoped_records = recordStore.get_records_current_scope()
             for record in scoped_records:
-                self.target_data.append(record)
+                self.crawled_data.append(record)

             # TODO: the scoped variables should be cleaned up as soon as the variables
             # are no longer in the current scope. This can be implemented as follows,
@@ -1099,7 +1145,7 @@ ____________________\n""".format(i + 1, len(pending_changes)) + str(el[3]))
             # del recordStore[name]
             # del generalStore[name]

-        return self.target_data
+        return self.crawled_data


 def crawler_main(crawled_directory_path: str,
@@ -1161,7 +1207,7 @@ def crawler_main(crawled_directory_path: str,
                              "update": updates}))
         else:
             rtsfinder = dict()
-            for elem in crawler.target_data:
+            for elem in crawler.crawled_data:
                 if isinstance(elem, db.File):
                     # correct the file path:
                     # elem.file = os.path.join(args.path, elem.file)
@@ -184,7 +184,7 @@ def test_record_structure_generation(crawler):
 def test_ambigious_records(crawler, ident):
     ident.get_records().clear()
-    ident.get_records().extend(crawler.target_data)
+    ident.get_records().extend(crawler.crawled_data)
     r = ident.get_records()
     id_r0 = ident.get_identifiable(r[0])
     with raises(RuntimeError, match=".*unambigiously.*"):
@@ -206,7 +206,7 @@ def test_crawler_update_list(crawler, ident):
     ) == 2

     # The crawler contains lots of duplicates, because identifiables have not been resolved yet:
-    assert len(ident.get_records()) != len(crawler.target_data)
+    assert len(ident.get_records()) != len(crawler.crawled_data)

     # Check consistency:
     # Check whether identifiables retrieved from current identifiable store return
@@ -296,8 +296,8 @@ def test_remove_unnecessary_updates():
     # test trivial case
     upl = [db.Record().add_parent("A")]
     irs = [db.Record().add_parent("A")]
-    Crawler.remove_unnecessary_updates(upl, irs)
-    assert len(upl) == 0
+    updates = Crawler.remove_unnecessary_updates(upl, irs)
+    assert len(updates) == 0

     # test property difference case
     # TODO this should work right?
@@ -309,24 +309,24 @@ def test_remove_unnecessary_updates():
     # test value difference case
     upl = [db.Record().add_parent("A").add_property("a", 5)]
     irs = [db.Record().add_parent("A").add_property("a")]
-    Crawler.remove_unnecessary_updates(upl, irs)
-    assert len(upl) == 1
+    updates = Crawler.remove_unnecessary_updates(upl, irs)
+    assert len(updates) == 1
     upl = [db.Record().add_parent("A").add_property("a", 5)]
     irs = [db.Record().add_parent("A").add_property("a", 5)]
-    Crawler.remove_unnecessary_updates(upl, irs)
-    assert len(upl) == 0
+    updates = Crawler.remove_unnecessary_updates(upl, irs)
+    assert len(updates) == 0

     # test unit difference case
     upl = [db.Record().add_parent("A").add_property("a", unit='cm')]
     irs = [db.Record().add_parent("A").add_property("a")]
-    Crawler.remove_unnecessary_updates(upl, irs)
-    assert len(upl) == 1
+    updates = Crawler.remove_unnecessary_updates(upl, irs)
+    assert len(updates) == 1

     # test None difference case
     upl = [db.Record().add_parent("A").add_property("a")]
     irs = [db.Record().add_parent("A").add_property("a", 5)]
-    Crawler.remove_unnecessary_updates(upl, irs)
-    assert len(upl) == 1
+    updates = Crawler.remove_unnecessary_updates(upl, irs)
+    assert len(updates) == 1

     # Current status:
@@ -339,7 +339,7 @@ def test_identifiable_adapter_no_identifiable(crawler, ident):
     insl, updl = crawler.synchronize()
     assert len(updl) == 0

-    pers = [r for r in crawler.target_data if r.parents[0].name == "Person"]
+    pers = [r for r in crawler.crawled_data if r.parents[0].name == "Person"]
     # All persons are inserted, because they are not identifiable:
     assert len(insl) == len(pers)