diff --git a/CHANGELOG.md b/CHANGELOG.md index cf324dd1b18cb6a839df3a9010ddde7b080b0c7e..02dcb340a1055881a12c31d90e079c2a5859a041 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,11 +8,14 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] ## ### Added ### + - Identifiable class to represent the information used to identify Records. - Added some StructureElements: BooleanElement, FloatElement, IntegerElement, ListElement, DictElement +- String representation for Identifiables ### Changed ### + - Some StructureElements changed (see "How to upgrade" in the docs): - Dict, DictElement and DictDictElement were merged into DictElement. - DictTextElement and TextElement were merged into TextElement. The "match" @@ -27,6 +30,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Fixed ### +- [#39](https://gitlab.com/caosdb/caosdb-crawler/-/issues/39) Merge conflicts in + `split_into_inserts_and_updates` when cached entity references a record + without id + ### Security ### ### Documentation ### diff --git a/setup.cfg b/setup.cfg index 5d76c9244bb21e2fd28a33e637be02dd7ef9fb6a..77c546f110c67e3c3a8f44cb617a7f1b187813e1 100644 --- a/setup.cfg +++ b/setup.cfg @@ -19,9 +19,9 @@ package_dir = packages = find: python_requires = >=3.8 install_requires = - importlib-resources - caosdb > 0.9.0 - caosadvancedtools >= 0.6.0 + importlib-resources + caosdb > 0.10.0 + caosadvancedtools >= 0.6.0 yaml-header-tools >= 0.2.1 pyyaml odfpy #make optional diff --git a/src/caoscrawler/crawl.py b/src/caoscrawler/crawl.py index 0d8cfab31e9a3456b383a380443c399421acccd0..cccbbdb040c556da8f904f27dd9d03aafb6d4872 100644 --- a/src/caoscrawler/crawl.py +++ b/src/caoscrawler/crawl.py @@ -29,36 +29,41 @@ the acuired data with CaosDB. """ from __future__ import annotations + +import argparse import importlib -from caosadvancedtools.cache import UpdateCache, Cache -import uuid -import sys +import logging import os +import sys +import uuid +import warnings import yaml + +from argparse import RawTextHelpFormatter +from collections import defaultdict +from copy import deepcopy from enum import Enum -import logging from importlib_resources import files -import argparse -from argparse import RawTextHelpFormatter +from jsonschema import validate +from typing import Any, Optional, Type, Union + import caosdb as db + +from caosadvancedtools.cache import UpdateCache, Cache from caosadvancedtools.crawler import Crawler as OldCrawler -import warnings +from caosdb.apiutils import (compare_entities, EntityMergeConflictError, + merge_entities) from caosdb.common.datatype import is_reference -from .stores import GeneralStore, RecordStore -from .identified_cache import IdentifiedCache -from .identifiable import Identifiable -from .structure_elements import StructureElement, Directory + from .converters import Converter, DirectoryConverter +from .identifiable import Identifiable from .identifiable_adapters import (IdentifiableAdapter, LocalStorageIdentifiableAdapter, CaosDBIdentifiableAdapter) -from collections import defaultdict -from typing import Any, Optional, Type, Union -from caosdb.apiutils import compare_entities, merge_entities -from copy import deepcopy -from jsonschema import validate - +from .identified_cache import IdentifiedCache from .macros import defmacro_constructor, macro_constructor +from .stores import GeneralStore, RecordStore +from .structure_elements import StructureElement, Directory logger = logging.getLogger(__name__) @@ -531,7 +536,7 @@ class Crawler(object): """ if ident is None: raise ValueError("Identifiable has to be given as argument") - for pvalue in list(ident.properties.values())+ident.backrefs: + for pvalue in list(ident.properties.values()) + ident.backrefs: if isinstance(pvalue, list): for el in pvalue: if isinstance(el, db.Entity) and el.id is None: @@ -572,7 +577,7 @@ class Crawler(object): """ if ident is None: raise ValueError("Identifiable has to be given as argument") - for pvalue in list(ident.properties.values())+ident.backrefs: + for pvalue in list(ident.properties.values()) + ident.backrefs: # Entity instead of ID and not cached locally if (isinstance(pvalue, list)): for el in pvalue: @@ -791,7 +796,10 @@ class Crawler(object): elif self.get_from_any_cache(identifiable) is not None: # We merge the two in order to prevent loss of information newrecord = self.get_from_any_cache(identifiable) - merge_entities(newrecord, record) + try: + merge_entities(newrecord, record) + except EntityMergeConflictError: + continue Crawler.bend_references_to_new_object( old=record, new=newrecord, entities=flat + to_be_updated + to_be_inserted) @@ -1026,9 +1034,7 @@ class Crawler(object): Return the final to_be_inserted and to_be_updated as tuple. """ - to_be_inserted, to_be_updated = self.split_into_inserts_and_updates( - crawled_data - ) + to_be_inserted, to_be_updated = self.split_into_inserts_and_updates(crawled_data) referencing_entities = self.create_reference_mapping(to_be_updated + to_be_inserted) # TODO: refactoring of typo diff --git a/src/caoscrawler/identifiable.py b/src/caoscrawler/identifiable.py index 814c9507fb1851dec68f06de26467a8a90844209..9a80a77478e87dfdf52a322503b73612e613cfd6 100644 --- a/src/caoscrawler/identifiable.py +++ b/src/caoscrawler/identifiable.py @@ -22,6 +22,7 @@ from __future__ import annotations import caosdb as db from datetime import datetime +import json from hashlib import sha256 from typing import Union @@ -131,3 +132,10 @@ class Identifiable(): return True else: return False + + def __repr__(self): + pstring = json.dumps(self.properties) + return (f"{self.__class__.__name__} for RT {self.record_type}: id={self.record_id}; " + f"name={self.name}\n\tpath={self.path}\n" + f"\tproperties:\n{pstring}\n" + f"\tbackrefs:\n{self.backrefs}") diff --git a/unittests/test_identifiable.py b/unittests/test_identifiable.py index e360624fa33ee7bbb1cc1097d6290105874c2cc5..3f3c606b163df4dc238be9a669fd31eb630a582d 100644 --- a/unittests/test_identifiable.py +++ b/unittests/test_identifiable.py @@ -42,7 +42,8 @@ def test_create_hashable_string(): assert a == b assert ( Identifiable._create_hashable_string( - Identifiable(name="A", record_type="B", properties={'a': db.Record(id=12)}) + Identifiable(name="A", record_type="B", + properties={'a': db.Record(id=12)}) ) == "P<B>N<A>R<[]>a:12") a = Identifiable._create_hashable_string( Identifiable(name="A", record_type="B", properties={'a': [db.Record(id=12)]})) @@ -51,7 +52,8 @@ def test_create_hashable_string(): Identifiable(name="A", record_type="B", properties={'a': [12]})) == "P<B>N<A>R<[]>a:[12]") assert ( Identifiable._create_hashable_string( - Identifiable(name="A", record_type="B", properties={'a': [db.Record(id=12), 11]}) + Identifiable(name="A", record_type="B", properties={ + 'a': [db.Record(id=12), 11]}) ) == "P<B>N<A>R<[]>a:[12, 11]") assert ( Identifiable._create_hashable_string( @@ -68,6 +70,17 @@ def test_name(): Identifiable(properties={"Name": 'li'}) +def test_repr(): + # only test that something meaningful is returned + assert 'properties' in str(Identifiable(name="A", record_type="B")) + assert str(Identifiable(name="A", record_type="B", properties={'a': 0})).split( + "properties:\n")[1].split('\n')[0] == '{"a": 0}' + assert str(Identifiable(name="A", record_type="B", properties={'a': 0, 'b': "test"})).split( + "properties:\n")[1].split('\n')[0] == '{"a": 0, "b": "test"}' + + # TODO(henrik): Add a test using backrefs once that's implemented. + + def test_equality(): assert Identifiable( record_id=12, properties={"a": 0}) == Identifiable(record_id=12, properties={"a": 1}) @@ -81,5 +94,7 @@ def test_equality(): path="a", properties={"a": 0}) == Identifiable(path="a", properties={"a": 1}) assert Identifiable( path="a", properties={"a": 0}) == Identifiable(properties={"a": 0}) - assert Identifiable(properties={"a": 0}) == Identifiable(properties={"a": 0}) - assert Identifiable(properties={"a": 0}) != Identifiable(properties={"a": 1}) + assert Identifiable(properties={"a": 0}) == Identifiable( + properties={"a": 0}) + assert Identifiable(properties={"a": 0}) != Identifiable( + properties={"a": 1}) diff --git a/unittests/test_issues.py b/unittests/test_issues.py index ad66aa1413303a16b96ed877f3279a061a0a4bc5..0e024429400b00441ca113a5d020adb9b2b77d12 100644 --- a/unittests/test_issues.py +++ b/unittests/test_issues.py @@ -22,15 +22,14 @@ from pytest import mark +import caosdb as db + from caoscrawler.crawl import Crawler +from caoscrawler.identifiable_adapters import CaosDBIdentifiableAdapter from caoscrawler.structure_elements import DictElement from test_tool import rfp -@mark.xfail( - reason="Wait until value conversion in dicts is fixed, see " - "https://gitlab.com/caosdb/caosdb-crawler/-/issues/10." -) def test_issue_10(): """Test integer-to-float conversion in dictionaries""" crawler_definition = { @@ -68,3 +67,88 @@ def test_issue_10(): assert records[0].parents[0].name == "TestRec" assert records[0].get_property("float_prop") is not None assert float(records[0].get_property("float_prop").value) == 4.0 + + +def test_issue_39(): + """Test for merge conflicts in + `crawl.Crawler.split_into_inserts_and_updates` (see + https://gitlab.com/caosdb/caosdb-crawler/-/issues/39). + + """ + + crawler = Crawler(debug=True) + + # For trying and failing to retrieve remotely identified records + def _fake_retrieve(*args, **kwargs): + return None + + ident = CaosDBIdentifiableAdapter() + # identifiable property is just name for both Record Types + ident.register_identifiable("RT_A", db.RecordType().add_parent( + name="RT_A").add_property(name="name")) + ident.register_identifiable("RT_B", db.RecordType().add_parent( + name="RT_B").add_property(name="name")) + # overwrite retrieve + ident.retrieve_identified_record_for_identifiable = _fake_retrieve + crawler.identifiableadapter = ident + + # a1 (has id) references b1 (has no id) + a1 = db.Record(name="A", id=101).add_parent(name="RT_A") + b1 = db.Record(name="B").add_parent(name="RT_B") + a1.add_property(name="RT_B", value=b1) + + # a2 (no id) references b2 (has id) + a2 = db.Record(name="A").add_parent(name="RT_A") + b2 = db.Record(name="B", id=102).add_parent(name="RT_B") + a2.add_property(name="RT_B", value=b2) + + flat_list = [b1, a1, a2, b2] + + # the two records with ids exist remotely + crawler.add_to_remote_existing_cache(a1) + crawler.add_to_remote_existing_cache(b2) + + # this would result in a merge conflict before + ins, ups = crawler.split_into_inserts_and_updates(flat_list) + + +@mark.xfail(reason="FIX: https://gitlab.com/caosdb/caosdb-crawler/-/issues/47") +def test_list_datatypes(): + crawler_definition = { + "DictTest": { + "type": "DictElement", + "match": "(.*)", + "records": { + "Dataset": {} + }, + "subtree": { + "int_element": { + "type": "IntegerElement", + "match_name": ".*", + "match_value": "(?P<int_value>.*)", + "records": { + "Dataset": { + "Subject": "+$int_value" + } + } + } + } + } + } + + crawler = Crawler(debug=True) + converter_registry = crawler.load_converters(crawler_definition) + + test_dict = { + "v1": 1233, + "v2": 1234 + } + + records = crawler.start_crawling( + DictElement("TestDict", test_dict), crawler_definition, converter_registry) + assert len(records) == 1 + assert records[0].parents[0].name == "Dataset" + assert records[0].get_property("Subject") is not None + assert isinstance(records[0].get_property("Subject").value, list) + assert records[0].get_property("Subject").datatype is not None + assert records[0].get_property("Subject").datatype.startswith("LIST")