diff --git a/CHANGELOG.md b/CHANGELOG.md
index cf324dd1b18cb6a839df3a9010ddde7b080b0c7e..a3a80183c7b6a6980db3abfeb0abc9187fb9ea3e 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -8,11 +8,13 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 ## [Unreleased] ##
 
 ### Added ###
+
 - Identifiable class to represent the information used to identify Records.
 - Added some StructureElements: BooleanElement, FloatElement, IntegerElement,
   ListElement, DictElement
 
 ### Changed ###
+
 - Some StructureElements changed (see "How to upgrade" in the docs):
   - Dict, DictElement and DictDictElement were merged into DictElement.
   - DictTextElement and TextElement were merged into TextElement. The "match"
@@ -27,6 +29,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ### Fixed ###
 
+- [#39](https://gitlab.com/caosdb/caosdb-crawler/-/issues/39) Merge conflicts in
+  `split_into_inserts_and_updates` when a cached entity references a record
+  without an id
+
 ### Security ###
 
 ### Documentation ###
diff --git a/setup.cfg b/setup.cfg
index 5d76c9244bb21e2fd28a33e637be02dd7ef9fb6a..77c546f110c67e3c3a8f44cb617a7f1b187813e1 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -19,9 +19,9 @@ package_dir =
 packages = find:
 python_requires = >=3.8
 install_requires =
-    importlib-resources
-    caosdb > 0.9.0
-    caosadvancedtools >= 0.6.0
+    importlib-resources
+    caosdb > 0.10.0
+    caosadvancedtools >= 0.6.0
     yaml-header-tools >= 0.2.1
     pyyaml
     odfpy #make optional
diff --git a/src/caoscrawler/crawl.py b/src/caoscrawler/crawl.py
index 00b69aacb6aeff690ec860d3cc610138dc8026cd..e6b5f65cde7bd9159ab104f763597c3ded1c3903 100644
--- a/src/caoscrawler/crawl.py
+++ b/src/caoscrawler/crawl.py
@@ -29,36 +29,41 @@ the acuired data with CaosDB.
 """
 
 from __future__ import annotations
+
+import argparse
 import importlib
-from caosadvancedtools.cache import UpdateCache, Cache
-import uuid
-import sys
+import logging
 import os
+import sys
+import uuid
+import warnings
 import yaml
+
+from argparse import RawTextHelpFormatter
+from collections import defaultdict
+from copy import deepcopy
 from enum import Enum
-import logging
 from importlib_resources import files
-import argparse
-from argparse import RawTextHelpFormatter
+from jsonschema import validate
+from typing import Any, Optional, Type, Union
+
 import caosdb as db
+
+from caosadvancedtools.cache import UpdateCache, Cache
 from caosadvancedtools.crawler import Crawler as OldCrawler
-import warnings
+from caosdb.apiutils import (compare_entities, EntityMergeConflictError,
+                             merge_entities)
 from caosdb.common.datatype import is_reference
-from .stores import GeneralStore, RecordStore
-from .identified_cache import IdentifiedCache
-from .identifiable import Identifiable
-from .structure_elements import StructureElement, Directory
+
 from .converters import Converter, DirectoryConverter
+from .identifiable import Identifiable
 from .identifiable_adapters import (IdentifiableAdapter,
                                     LocalStorageIdentifiableAdapter,
                                     CaosDBIdentifiableAdapter)
-from collections import defaultdict
-from typing import Any, Optional, Type, Union
-from caosdb.apiutils import compare_entities, merge_entities
-from copy import deepcopy
-from jsonschema import validate
-
+from .identified_cache import IdentifiedCache
 from .macros import defmacro_constructor, macro_constructor
+from .stores import GeneralStore, RecordStore
+from .structure_elements import StructureElement, Directory
 
 
 logger = logging.getLogger(__name__)
@@ -759,7 +764,10 @@ class Crawler(object):
            elif self.get_from_any_cache(identifiable) is not None:
                # We merge the two in order to prevent loss of information
                newrecord = self.get_from_any_cache(identifiable)
-               merge_entities(newrecord, record)
+               try:
+                   merge_entities(newrecord, record)
+               except EntityMergeConflictError:
+                   continue
                Crawler.bend_references_to_new_object(
                    old=record, new=newrecord, entities=flat + to_be_updated + to_be_inserted)
 
@@ -997,7 +1005,8 @@ class Crawler(object):
        if self.identifiableAdapter is None:
            raise RuntimeError("Should not happen.")
 
-       to_be_inserted, to_be_updated = self.split_into_inserts_and_updates(crawled_data)
+       to_be_inserted, to_be_updated = self.split_into_inserts_and_updates(
+           crawled_data)
 
        # TODO: refactoring of typo
        for el in to_be_updated:
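Note (editorial, not part of the patch): the crawl.py change above catches the `EntityMergeConflictError` that `merge_entities` raises when two crawled versions of the same identifiable record carry conflicting property values; instead of aborting the whole crawl, the conflicting record is skipped via `continue` and dealt with in a later pass of the resolution loop. A minimal sketch of such a conflict, assuming caosdb-pylib > 0.10.0 as pinned in setup.cfg above (record names and the id value are illustrative only):

    import caosdb as db
    from caosdb.apiutils import EntityMergeConflictError, merge_entities

    # Two crawled versions of the "same" record "A": one references an
    # RT_B record without an id, the other one that already has an id.
    cached = db.Record(name="A").add_parent(name="RT_A")
    cached.add_property(name="RT_B", value=db.Record(name="B"))

    new = db.Record(name="A").add_parent(name="RT_A")
    new.add_property(name="RT_B", value=db.Record(name="B", id=102))

    try:
        merge_entities(cached, new)  # the "RT_B" property values conflict
    except EntityMergeConflictError:
        pass  # the crawler now skips the record instead of crashing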
diff --git a/unittests/test_issues.py b/unittests/test_issues.py
index ad66aa1413303a16b96ed877f3279a061a0a4bc5..ed30f7f619a6c054f09d35cc57a1746f01c664a6 100644
--- a/unittests/test_issues.py
+++ b/unittests/test_issues.py
@@ -22,15 +22,14 @@
 
 from pytest import mark
 
+import caosdb as db
+
 from caoscrawler.crawl import Crawler
+from caoscrawler.identifiable_adapters import CaosDBIdentifiableAdapter
 from caoscrawler.structure_elements import DictElement
 from test_tool import rfp
 
 
-@mark.xfail(
-    reason="Wait until value conversion in dicts is fixed, see "
-    "https://gitlab.com/caosdb/caosdb-crawler/-/issues/10."
-)
 def test_issue_10():
     """Test integer-to-float conversion in dictionaries"""
     crawler_definition = {
@@ -68,3 +67,46 @@ def test_issue_10():
     assert records[0].parents[0].name == "TestRec"
     assert records[0].get_property("float_prop") is not None
     assert float(records[0].get_property("float_prop").value) == 4.0
+
+
+def test_issue_39():
+    """Test for merge conflicts in
+    `crawl.Crawler.split_into_inserts_and_updates` (see
+    https://gitlab.com/caosdb/caosdb-crawler/-/issues/39).
+
+    """
+
+    crawler = Crawler(debug=True)
+
+    # For trying and failing to retrieve remotely identified records
+    def _fake_retrieve(*args, **kwargs):
+        return None
+
+    ident = CaosDBIdentifiableAdapter()
+    # the identifiable property is just the name for both RecordTypes
+    ident.register_identifiable("RT_A", db.RecordType().add_parent(
+        name="RT_A").add_property(name="name"))
+    ident.register_identifiable("RT_B", db.RecordType().add_parent(
+        name="RT_B").add_property(name="name"))
+    # overwrite retrieve
+    ident.retrieve_identified_record_for_identifiable = _fake_retrieve
+    crawler.identifiableAdapter = ident
+
+    # a1 (has id) references b1 (has no id)
+    a1 = db.Record(name="A", id=101).add_parent(name="RT_A")
+    b1 = db.Record(name="B").add_parent(name="RT_B")
+    a1.add_property(name="RT_B", value=b1)
+
+    # a2 (no id) references b2 (has id)
+    a2 = db.Record(name="A").add_parent(name="RT_A")
+    b2 = db.Record(name="B", id=102).add_parent(name="RT_B")
+    a2.add_property(name="RT_B", value=b2)
+
+    flat_list = [b1, a1, a2, b2]
+
+    # the two records with ids exist remotely
+    crawler.add_to_remote_existing_cache(a1)
+    crawler.add_to_remote_existing_cache(b2)
+
+    # this would previously result in a merge conflict
+    ins, ups = crawler.split_into_inserts_and_updates(flat_list)
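Note (editorial, not part of the patch): the new regression test can be run with `pytest unittests/test_issues.py -k test_issue_39`; it contains no assertions on `ins` and `ups` and passes as long as `split_into_inserts_and_updates` survives the mixed id/no-id reference pattern without raising a merge conflict.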