diff --git a/CHANGELOG.md b/CHANGELOG.md
index 109956118239e4ec417081f7061d60947ecf90a0..6192e353b2762ef739fdcee4e4aba533b5ea9f73 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -69,6 +69,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
   identified by the same LinkAhead id
 * [#87](https://gitlab.com/linkahead/linkahead-crawler/-/issues/87) Handle long strings more gracefully.
   The crawler sometimes runs into [linkahead-server#101](https://gitlab.com/linkahead/linkahead-server/-/issues/101), this is now mitigated.
+* [indiscale#128](https://gitlab.indiscale.com/caosdb/src/caosdb-crawler/-/issues/128) Yet another corner case of referencing resolution resolved.

 ### Security ###
diff --git a/src/caoscrawler/crawl.py b/src/caoscrawler/crawl.py
index 698a154f49d84e15dbe41f91da67469184151316..e532f20e339de243ae4565a12724261452a0eda6 100644
--- a/src/caoscrawler/crawl.py
+++ b/src/caoscrawler/crawl.py
@@ -3,10 +3,11 @@
 #
 # This file is a part of the CaosDB Project.
 #
-# Copyright (C) 2021 Henrik tom Wörden <h.tomwoerden@indiscale.com>
-#               2021-2023 Research Group Biomedical Physics,
-#               Max-Planck-Institute for Dynamics and Self-Organization Göttingen
-#               Alexander Schlemmer <alexander.schlemmer@ds.mpg.de>
+# Copyright (C) 2021-2024 Henrik tom Wörden <h.tomwoerden@indiscale.com>
+# Copyright (C) 2021-2023 Research Group Biomedical Physics, MPI-DS Göttingen
+# Copyright (C) 2021-2023 Alexander Schlemmer <alexander.schlemmer@ds.mpg.de>
+# Copyright (C) 2021-2024 Indiscale GmbH <info@indiscale.com>
+# Copyright (C) 2024 Daniel Hornung <d.hornung@indiscale.com>
 #
 # This program is free software: you can redistribute it and/or modify
 # it under the terms of the GNU Affero General Public License as
@@ -42,7 +43,7 @@ from argparse import RawTextHelpFormatter
 from copy import deepcopy
 from datetime import datetime
 from enum import Enum
-from typing import Any, Optional, Union
+from typing import Any, List, Optional, Union

 import linkahead as db
 import yaml
@@ -560,12 +561,12 @@ class Crawler(object):
                         if isinstance(p.value, db.File):
                             if p.value.path != cached.path:
                                 raise RuntimeError(
-                                    "The cached and the refernced entity are not identical.\n"
+                                    "The cached and the referenced entity are not identical.\n"
                                     f"Cached:\n{cached}\nReferenced:\n{el}"
                                 )
                         else:
                             raise RuntimeError(
-                                "The cached and the refernced entity are not identical.\n"
+                                "The cached and the referenced entity are not identical.\n"
                                 f"Cached:\n{cached}\nReferenced:\n{el}"
                             )
                         lst.append(cached)
@@ -582,12 +583,12 @@ class Crawler(object):
                     if isinstance(p.value, db.File):
                         if p.value.path != cached.path:
                             raise RuntimeError(
-                                "The cached and the refernced entity are not identical.\n"
+                                "The cached and the referenced entity are not identical.\n"
                                 f"Cached:\n{cached}\nReferenced:\n{p.value}"
                             )
                     else:
                         raise RuntimeError(
-                            "The cached and the refernced entity are not identical.\n"
+                            "The cached and the referenced entity are not identical.\n"
                             f"Cached:\n{cached}\nReferenced:\n{p.value}"
                         )
                     p.value = cached
@@ -646,7 +647,7 @@ class Crawler(object):
             return False
         refs = self.identifiableAdapter.get_identifying_referencing_entities(referencing_entities,
                                                                              registered_identifiable)
-        if any([el is None for el in refs]):
+        if any(el is None for el in refs):
             return True

         refs = self.identifiableAdapter.get_identifying_referenced_entities(
@@ -706,9 +707,11 @@ class Crawler(object):
                 treated_record = self.treated_records_lookup.get_existing(record)
                 if treated_record is not None:
                     self._merge_identified(treated_record, record, try_to_merge_later, all_records)
+                    all_records.remove(record)
                     referencing_entities = self.create_reference_mapping(all_records)
                 else:
                     self.treated_records_lookup.add(record, None)
+                    assert record.id
                 del flat[i]
             # 2. Can it be identified via a path?
             elif record.path is not None:
@@ -725,19 +728,20 @@ class Crawler(object):
                 treated_record = self.treated_records_lookup.get_any(record)
                 if treated_record is not None:
                     self._merge_identified(treated_record, record, try_to_merge_later, all_records)
+                    all_records.remove(record)
                     referencing_entities = self.create_reference_mapping(all_records)
                 else:
                     # TODO add identifiable if possible
                     self.treated_records_lookup.add(record, None)
+                    assert record.id
                 del flat[i]
-                resolved_references = True
+                entity_was_treated = True

         # flat contains Entities which could not yet be checked against the remote server
         try_to_merge_later = []
-        while resolved_references and len(flat) > 0:
-            resolved_references = False
-            referencing_entities = self.create_reference_mapping(
-                all_records)
+        while entity_was_treated and len(flat) > 0:
+            entity_was_treated = False
+            referencing_entities = self.create_reference_mapping(all_records)

             # For each element we try to find out whether we can find it in the server or whether
             # it does not yet exist. Since a Record may reference other unkown Records it might not
@@ -764,10 +768,11 @@ class Crawler(object):
                     # describe the same object.
                     # We merge record into treated_record in order to prevent loss of information
                     self._merge_identified(treated_record, record, try_to_merge_later, all_records)
+                    all_records.remove(record)
                     referencing_entities = self.create_reference_mapping(all_records)

                     del flat[i]
-                    resolved_references = True
+                    entity_was_treated = True

                 # 2. Can it be checked on the remote server?
                 elif not self._has_reference_value_without_id(identifiable):
@@ -782,16 +787,18 @@ class Crawler(object):
                         record.id = identified_record.id
                         record.path = identified_record.path
                     self.treated_records_lookup.add(record, identifiable)
+                    assert record.id
                     del flat[i]
-                    resolved_references = True
+                    entity_was_treated = True

                 # 3. Does it have to be new since a needed reference is missing?
                 # (Is it impossible to check this record because an identifiable references a
                 # missing record?)
                 elif self._has_missing_object_in_references(identifiable, referencing_entities):
                     self.treated_records_lookup.add(record, identifiable)
+                    assert record.id
                     del flat[i]
-                    resolved_references = True
+                    entity_was_treated = True

         for record in flat:
             self.replace_references_with_cached(record, referencing_entities)
@@ -808,11 +815,14 @@ class Crawler(object):
             circle = self.detect_circular_dependency(flat)
             if circle is None:
                 logger.error("Failed, but found NO circular dependency. The data is as follows:"
-                             + str(self.compact_entity_list_representation(flat)))
+                             + str(self.compact_entity_list_representation(flat,
+                                                                           referencing_entities)))
             else:
                 logger.error("Found circular dependency (Note that this might include references "
                              "that are not identifying properties): "
-                             + self.compact_entity_list_representation(circle))
+                             + self.compact_entity_list_representation(circle,
+                                                                       referencing_entities))
+
             raise RuntimeError(
                 f"Could not finish split_into_inserts_and_updates. Circular dependency: "
                 f"{circle is not None}")
@@ -840,18 +850,34 @@ class Crawler(object):
                         el.value[index] = val.id

     @ staticmethod
-    def compact_entity_list_representation(circle):
+    def compact_entity_list_representation(entities, referencing_entities: List) -> str:
         """ a more readable representation than the standard xml representation

         TODO this can be removed once the yaml format representation is in pylib
         """
         text = "\n--------\n"
-        for el in circle:
-            if el.name is not None:
-                text += f"{el.name}\n"
-            text += f"{[el.name for el in el.parents]}\n"
-            props = {p.name: p.value for p in el.properties}
-            text += f"{props}\n"
+
+        grouped = {"": []}
+        for ent in entities:
+            if not ent.parents:
+                grouped[""].append(ent)
+            for parent in ent.parents:
+                if parent.name not in grouped:
+                    grouped[parent.name] = []
+                grouped[parent.name].append(ent)
+        if not grouped[""]:
+            del grouped[""]
+        for parent, group in grouped.items():
+            text += f"\n> Parent: {parent}\n"
+            for ent in group:
+                if ent.name is not None:
+                    text += f"\n>> Name: {ent.name}\n"
+                else:
+                    text += "\n>> name: # No name"
+                text += f"{[ent.name for ent in ent.parents]}\n"
+                props = {p.name: p.value for p in ent.properties}
+                text += f"{props}\n"
+                text += f"is_referenced_by:\n{referencing_entities[id(ent)]}\n"

         return text + "--------\n"
diff --git a/unittests/test_crawler.py b/unittests/test_crawler.py
index 920728c418479cd594e894e990d5f15693be589d..e026899201ae0dfa937053d315854e5fbb8a351a 100644
--- a/unittests/test_crawler.py
+++ b/unittests/test_crawler.py
@@ -38,6 +38,7 @@ import linkahead as db
 import linkahead.common.models as dbmodels
 import pytest
 import yaml
+from caosadvancedtools.models.parser import parse_model_from_string
 from caoscrawler.crawl import (Crawler, SecurityMode, TreatedRecordLookUp,
                                _treat_deprecated_prefix, crawler_main,
                                split_restricted_path)
@@ -52,7 +53,6 @@ from caoscrawler.stores import GeneralStore, RecordStore
 from caoscrawler.structure_elements import (DictElement, DictListElement,
                                             DictTextElement, File)
 from linkahead.apiutils import compare_entities
-from caosadvancedtools.models.parser import parse_model_from_string
 from linkahead.cached import cache_clear
 from linkahead.exceptions import EmptyUniqueQueryError
 from pytest import raises
@@ -1014,7 +1014,7 @@ def test_detect_circular_dependency(crawler_mocked_identifiable_retrieve, caplog
     _, _ = crawler.split_into_inserts_and_updates(flat)
     caplog.set_level(logging.ERROR, logger="caoscrawler.converters")
     assert "Found circular dependency" in caplog.text
-    assert "-------\na\n['C" in caplog.text
+    assert "\n--------\n\n> Parent: C\n\n>> Name: a\n[\'C\']" in caplog.text
     caplog.clear()

@@ -1144,3 +1144,20 @@ def test_treated_record_lookup():
     fi2 = db.File(path='b')
     trlu.add(fi2)
     assert trlu.get_any(db.File(path='b'), Identifiable(name='c')) is fi2
+
+
+def test_merge_entity_with_identifying_reference(crawler_mocked_identifiable_retrieve):
+    # When one python object representing a record is merged into another python object
+    # representing the same record, the former object can be forgotten and references from it to
+    # other records must not play a role
+    crawler = crawler_mocked_identifiable_retrieve
+    crawler.identifiableAdapter.get_registered_identifiable = Mock(
+        side_effect=lambda x: db.Record().add_parent('C').add_property(name='name') if
+        x.parents[0].name == "C" else
+        db.Record().add_parent('D').add_property(name='is_referenced_by', value="*")
+    )
+    a = db.Record(name='a').add_parent("D")
+    b = db.Record(name='b').add_parent("C")
+    c = db.Record(name='b').add_parent("C").add_property(name="C", value=a)
+    flat = [a, c, b]
+    _, _ = crawler.split_into_inserts_and_updates(flat)
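The new regression test encodes the rule this patch enforces: when two Python objects describe the same record, one is merged into the other, and the merged-away object must also be removed from `all_records` so that its references no longer feed into `create_reference_mapping`. A minimal sketch of that entity constellation, together with a simplified stand-in for the parent-grouped debug output now produced by `compact_entity_list_representation`, could look as follows. The helper `group_by_parent` is hypothetical and only mirrors the grouping idea; the `linkahead` constructors and chained `add_parent`/`add_property` calls are the same ones the test itself uses.

```python
# Minimal sketch, assuming only the linkahead client library is installed.
# group_by_parent is a hypothetical helper that mirrors the grouping idea of
# compact_entity_list_representation; the real method additionally lists each
# entity's parents, properties, and the is_referenced_by backreferences
# collected via create_reference_mapping.
import linkahead as db


def group_by_parent(entities):
    """Group entities by parent name; entities without parents go under ""."""
    grouped = {"": []}
    for ent in entities:
        if not ent.parents:
            grouped[""].append(ent)
        for parent in ent.parents:
            grouped.setdefault(parent.name, []).append(ent)
    if not grouped[""]:
        del grouped[""]
    return grouped


# Entity constellation from test_merge_entity_with_identifying_reference:
# "b" and "c" are two Python objects describing the same record (parent "C",
# name "b"); "c" additionally carries an identifying reference to "a".
a = db.Record(name="a").add_parent("D")
b = db.Record(name="b").add_parent("C")
c = db.Record(name="b").add_parent("C").add_property(name="C", value=a)

for parent, group in group_by_parent([a, c, b]).items():
    print(f"> Parent: {parent}")
    for ent in group:
        print(f">> Name: {ent.name}")
```

In `split_into_inserts_and_updates`, once `c` has been identified as the same record as `b` and merged, the added `all_records.remove(record)` is what keeps the forgotten object's reference to `a` from playing any further role in identification.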