Commit d88ac4ec authored by Henrik tom Wörden

Merge branch 'f-fix-merge' into 'dev'

New f fix merge

See merge request !140
parents 9affdce8 61556a34
@@ -24,9 +24,17 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
### Removed ###
### Fixed ###
* Empty Records can now be created (https://gitlab.com/caosdb/caosdb-crawler/-/issues/27)
* [#58](https://gitlab.com/caosdb/caosdb-crawler/-/issues/58) Documentation builds API docs in pipeline now.
* [#117](https://gitlab.indiscale.com/caosdb/src/caosdb-crawler/-/issues/117)
`replace_variable` no longer unnecessarily changes the type. Values stored
in variables in a CFood can now have other types.
* [indiscale#113](https://gitlab.indiscale.com/caosdb/src/caosdb-crawler/-/issues/113)
Resolving referenced entities failed in some corner cases. The crawler now
correctly handles cases in which entities retrieved from the server have to be
merged with local entities that both reference another, already existing
entity (see the sketch below).
### Security ###
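To illustrate the indiscale#113 corner case, here is a condensed sketch. It is not part of the diff; the records and names are taken from the new integration test `test_indiscale_113` added further down.

import caosdb as db

# "Basis" and "Campaign" records already exist on the server; the crawler
# produces fresh local copies of them plus a new "Event" that references both.
basis = db.Record(name="Poseidon").add_parent(name="Basis")
campaign = db.Record(name="POS386").add_parent(
    name="Campaign").add_property(name="Basis", value=basis)
event = db.Record(name="GeoB13952").add_parent(name="Event")
event.add_property(name="Basis", value=basis)
event.add_property(name="Campaign", value=campaign)
# Previously, synchronizing [event, basis, campaign] could fail while resolving
# these references: the "Campaign" retrieved from the server had to be merged
# with the local one while both still reference the already existing "Basis".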
@@ -20,6 +20,7 @@ from pytest import fixture, mark
import caosdb as db
from caosdb.cached import cache_clear
from caosadvancedtools.models.parser import parse_model_from_string
from caoscrawler.crawl import Crawler
from caoscrawler.identifiable_adapters import CaosDBIdentifiableAdapter
@@ -208,3 +209,60 @@ def test_issue_83(clear_database):
assert len(retrieved_referencing3.get_property(referenced_type.name).value) == 2
assert retrieved_target1.id in retrieved_referencing3.get_property(referenced_type.name).value
assert retrieved_target2.id in retrieved_referencing3.get_property(referenced_type.name).value
def test_indiscale_113(clear_database):
"""Somewhat mysterious failures to resolve references in
split_into_inserts_and_updates, see
https://gitlab.indiscale.com/caosdb/src/caosdb-crawler/-/issues/113
"""
# Create and insert minimal datamodel
datamodel_str = """
Event:
  recommended_properties:
    Basis:
    Campaign:
Basis:
Campaign:
  recommended_properties:
    Basis:
"""
model = parse_model_from_string(datamodel_str)
model.sync_data_model(noquestion=True)
# Register identifiables, everything is identified by name
ident = CaosDBIdentifiableAdapter()
ident.register_identifiable("Event", db.RecordType().add_parent(
name="Event").add_property(name="name"))
ident.register_identifiable("Basis", db.RecordType().add_parent(
name="Basis").add_property(name="name"))
ident.register_identifiable("Campaign", db.RecordType().add_parent(
name="Campaign").add_property(name="name"))
crawler = Crawler(identifiableAdapter=ident)
# Add records: event references basis and campaign, campaign references
# basis.
basis = db.Record(name="Poseidon").add_parent(name="Basis")
campaign = db.Record(name="POS386").add_parent(
name="Campaign").add_property(name="Basis", value=basis)
event = db.Record(name="GeoB13952").add_parent(name="Event")
event.add_property(name="Basis", value=basis)
event.add_property(name="Campaign", value=campaign)
# basis and campaign already exist in the db
db.Container().extend([basis, campaign]).insert()
# Redefine basis and campaign locally so that their references have to be resolved
basis = db.Record(name="Poseidon").add_parent(name="Basis")
campaign = db.Record(name="POS386").add_parent(
name="Campaign").add_property(name="Basis", value=basis)
recs = [event, basis, campaign]
ins, ups = crawler.synchronize(crawled_data=recs, unique_names=False)
# There is only one event to be inserted
assert len(ins) == 1
# Nothing to do for the existing ents
assert len(ups) == 0
assert ins[0].name == event.name
@@ -21,7 +21,7 @@ python_requires = >=3.7
install_requires =
importlib-resources
caosadvancedtools >= 0.7.0
linkahead >= 0.13.1
linkahead > 0.13.2
yaml-header-tools >= 0.2.1
pyyaml
odfpy #make optional
@@ -170,6 +170,48 @@ def _resolve_datatype(prop: db.Property, remote_entity: db.Entity):
return prop
def _treat_merge_error_of(newrecord, record):
"""
The parameters are two entities that cannot be merged with the merge_entities function.
# This function checks for two obvious cases where no merge will ever be possible:
# 1. Two Entities with differing IDs
# 2. Two non-Entity values which differ
It creates a more informative logger message and raises an Exception in those cases.
"""
for this_p in newrecord.properties:
that_p = record.get_property(this_p.name)
if (isinstance(this_p.value, db.Entity)
and isinstance(that_p.value, db.Entity)):
if this_p.value.id is not None and that_p.value.id is not None:
if this_p.value.id != that_p.value.id:
logger.error("The Crawler is trying to merge two entities "
"because they should be the same object (same"
" identifiables), but they reference "
"different Entities with the same Property."
f"Problematic Property: {this_p.name}\n"
f"Referenced Entities: {this_p.value.id} and "
f"{that_p.value.id}\n"
f"{record}\n{newrecord}")
raise RuntimeError("Cannot merge Entities")
elif (not isinstance(this_p.value, db.Entity)
and not isinstance(that_p.value, db.Entity)):
if ((this_p.value != that_p.value)
# TODO can we also compare lists?
and not isinstance(this_p.value, list)
and not isinstance(that_p.value, list)):
logger.error("The Crawler is trying to merge two entities "
"because they should be the same object (same"
" identifiables), but they have "
"different values for the same Property."
f"Problematic Property: {this_p.name}\n"
f"Values: {this_p.value} and "
f"{that_p.value}\n"
f"{record}\n{newrecord}")
raise RuntimeError("Cannot merge Entities")
class SecurityMode(Enum):
RETRIEVE = 0
INSERT = 1
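As an aside, a minimal sketch of the second hard-conflict case that `_treat_merge_error_of` rejects. This is only an illustration with hypothetical records and a hypothetical property name, assuming the same caosdb Record API used elsewhere in this diff.

import caosdb as db

# Two records that describe the same identifiable object but carry conflicting
# scalar values for the same (hypothetical) property. merge_entities() would
# raise EntityMergeConflictError for them, and _treat_merge_error_of() would
# then log the conflict and raise RuntimeError instead of postponing the merge.
newrecord = db.Record(name="GeoB13952").add_parent(name="Event")
newrecord.add_property(name="water_depth", value=2500)  # hypothetical property

record = db.Record(name="GeoB13952").add_parent(name="Event")
record.add_property(name="water_depth", value=3000)  # conflicting value

# _treat_merge_error_of(newrecord, record)  # would log the conflict and raise RuntimeError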
@@ -549,10 +591,11 @@ class Crawler(object):
resolved_references = True
# flat contains Entities which could not yet be checked against the remote server
try_to_merge_later = []
while resolved_references and len(flat) > 0:
resolved_references = False
referencing_entities = self.create_reference_mapping(
flat + to_be_updated + to_be_inserted)
flat + to_be_updated + try_to_merge_later + to_be_inserted)
# For each element we try to find out whether we can find it in the server or whether
# it does not yet exist. Since a Record may reference other unknown Records it might not
@@ -599,14 +642,26 @@
del flat[i]
# 3. Is it in the cache of already checked Records?
elif self.get_from_any_cache(identifiable) is not None:
# We merge the two in order to prevent loss of information
newrecord = self.get_from_any_cache(identifiable)
# Since the identifiables are the same, newrecord and record actually describe
# the same object.
# We merge the two in order to prevent loss of information
try:
merge_entities(newrecord, record)
merge_entities(
newrecord, record, merge_references_with_empty_diffs=False, merge_id_with_resolved_entity=True)
except EntityMergeConflictError:
continue
_treat_merge_error_of(newrecord, record)
# We cannot merge, but it is none of the clear cases where a merge is
# impossible. Thus we try again later.
try_to_merge_later.append(record)
if newrecord.id is not None:
record.id = newrecord.id
Crawler.bend_references_to_new_object(
old=record, new=newrecord, entities=flat + to_be_updated + to_be_inserted)
old=record, new=newrecord,
entities=flat+to_be_updated+to_be_inserted+try_to_merge_later
)
referencing_entities = self.create_reference_mapping(
flat + to_be_updated + try_to_merge_later + to_be_inserted)
del flat[i]
resolved_references = True
@@ -641,6 +696,14 @@
for record in flat:
self.replace_references_with_cached(record, referencing_entities)
# We postponed the merge for records where it failed previously and try it again now.
# This might only add properties of the postponed records to the ones already in use.
for record in try_to_merge_later:
identifiable = self.identifiableAdapter.get_identifiable(
record,
referencing_entities=referencing_entities)
newrecord = self.get_from_any_cache(identifiable)
merge_entities(newrecord, record, merge_id_with_resolved_entity=True)
if len(flat) > 0:
circle = self.detect_circular_dependency(flat)
if circle is None: