diff --git a/src/caoscrawler/crawl.py b/src/caoscrawler/crawl.py index 0345363bc31072ce4120bfed831a7ef88d494291..3fd67e4bf2915d2755133ef8d4b8387b073a26a2 100644 --- a/src/caoscrawler/crawl.py +++ b/src/caoscrawler/crawl.py @@ -32,55 +32,48 @@ the acuired data with CaosDB. from __future__ import annotations import argparse -from datetime import datetime import importlib import logging import os import sys -from caosdb.exceptions import EmptyUniqueQueryError import uuid -from caosdb.cached import cached_get_entity_by, cache_clear import warnings -import yaml - from argparse import RawTextHelpFormatter from collections import defaultdict from copy import deepcopy - +from datetime import datetime from enum import Enum -from importlib_resources import files -from jsonschema import validate from typing import Any, Optional, Type, Union import caosdb as db - -from caosadvancedtools.utils import create_entity_link -from caosadvancedtools.cache import UpdateCache, Cache +import yaml +from caosadvancedtools.cache import Cache, UpdateCache from caosadvancedtools.crawler import Crawler as OldCrawler from caosadvancedtools.serverside.helper import send_mail -from caosdb.apiutils import (compare_entities, EntityMergeConflictError, +from caosadvancedtools.utils import create_entity_link +from caosdb.apiutils import (EntityMergeConflictError, compare_entities, merge_entities) +from caosdb.cached import cache_clear, cached_get_entity_by from caosdb.common.datatype import is_reference +from caosdb.exceptions import EmptyUniqueQueryError +from importlib_resources import files +from jsonschema import validate -from .converters import Converter, DirectoryConverter, ConverterValidationError +from .config import get_config_setting +from .converters import Converter, ConverterValidationError, DirectoryConverter +from .debug_tree import DebugTree from .identifiable import Identifiable -from .identifiable_adapters import (IdentifiableAdapter, - LocalStorageIdentifiableAdapter, - 
CaosDBIdentifiableAdapter) +from .identifiable_adapters import (CaosDBIdentifiableAdapter, + IdentifiableAdapter, + LocalStorageIdentifiableAdapter) from .identified_cache import IdentifiedCache +from .logging import configure_server_side_logging from .macros import defmacro_constructor, macro_constructor +from .scanner import (create_converter_registry, initialize_converters, + load_definition, scan_directory, scan_structure_elements) from .stores import GeneralStore, RecordStore -from .structure_elements import StructureElement, Directory, NoneElement +from .structure_elements import Directory, NoneElement, StructureElement from .version import check_cfood_version -from .config import get_config_setting -from .logging import configure_server_side_logging - -from .scanner import (scan_directory, - load_definition, - create_converter_registry, - initialize_converters, - scan_structure_elements) -from .debug_tree import DebugTree logger = logging.getLogger(__name__) @@ -407,7 +400,8 @@ class Crawler(object): cached = self.get_from_any_cache( self.identifiableAdapter.get_identifiable(el, referencing_entities)) if cached is None: - raise RuntimeError("Not in cache.") + lst.append(el) + continue if not check_identical(cached, el, True): if isinstance(p.value, db.File): if p.value.path != cached.path: @@ -422,7 +416,7 @@ class Crawler(object): cached = self.get_from_any_cache( self.identifiableAdapter.get_identifiable(p.value, referencing_entities)) if cached is None: - raise RuntimeError("Not in cache.") + continue if not check_identical(cached, p.value, True): if isinstance(p.value, db.File): if p.value.path != cached.path: @@ -641,8 +635,17 @@ class Crawler(object): self.replace_references_with_cached(record, referencing_entities) if len(flat) > 0: + circle = self.detect_circular_dependency(flat) + if circle is None: + logger.error("Failed, but found NO circular dependency. 
@staticmethod
def compact_entity_list_representation(circle):
    """Return a compact, human-readable text representation of entities.

    This is a more readable alternative to the standard xml representation.
    For each entity the name (only if it is not None), the list of parent
    names and a ``{property name: value}`` dict are written on separate
    lines.  The whole text is framed by ``--------`` separator lines.

    TODO this can be removed once the yaml format representation is in pylib

    Parameters
    ----------
    circle : iterable
        The entities to be represented.  Each element must provide ``name``,
        ``parents`` (elements with a ``name``) and ``properties`` (elements
        with ``name`` and ``value``).

    Returns
    -------
    str
        The textual representation.
    """
    chunks = ["\n--------\n"]
    for ent in circle:
        if ent.name is not None:
            chunks.append(f"{ent.name}\n")
        parent_names = [par.name for par in ent.parents]
        chunks.append(f"{parent_names}\n")
        props = {p.name: p.value for p in ent.properties}
        chunks.append(f"{props}\n")
    chunks.append("--------\n")

    # join once instead of repeatedly concatenating strings
    return "".join(chunks)


@staticmethod
def detect_circular_dependency(flat: list[db.Entity]):
    """Detect circular references among the entities in ``flat``.

    The references (property values, including elements of list values) that
    point to other entities of ``flat`` are followed depth-first, starting
    from every entity of ``flat`` in turn.  Thus, a circle is found
    regardless of the position of its members in ``flat``.

    Entities are compared by identity (``is``), not by equality.

    TODO: for the sake of detecting problems for split_into_inserts_and_updates we should only
    consider references that are identifying properties.

    Parameters
    ----------
    flat : list[db.Entity]
        The entities among which a circular dependency is searched.  May be
        empty.

    Returns
    -------
    list or None
        A list where the entities are ordered according to the chain of
        references; only the entities contained in the circle are included
        and the first entity is repeated at the end (e.g. ``[a, c, b, a]``).
        None if no circular dependency is found.
    """
    # id -> entity; the stored entities are alive, so equal ids mean identity.
    candidates = {id(ent): ent for ent in flat}
    exhausted = object()  # sentinel: iterator of references is used up

    def referenced(ent):
        # Yield the entities of ``flat`` referenced by ``ent`` in property order.
        for prop in ent.properties:
            values = prop.value if isinstance(prop.value, list) else [prop.value]
            for val in values:
                if id(val) in candidates:
                    yield val

    # ids of entities that have been explored completely without finding a
    # circle; they never need to be visited again.
    finished = set()
    for start in flat:
        if id(start) in finished:
            continue
        path = [start]            # current chain of references
        on_path = {id(start)}     # ids of the entities in ``path``
        pending = [referenced(start)]
        while pending:
            nxt = next(pending[-1], exhausted)
            if nxt is exhausted:
                # dead end: backtrack
                done = path.pop()
                on_path.discard(id(done))
                finished.add(id(done))
                pending.pop()
            elif id(nxt) in on_path:
                # The chain is closed. Drop the lead-in that is not part of
                # the circle itself.
                first = next(i for i, ent in enumerate(path) if ent is nxt)
                return path[first:] + [nxt]
            elif id(nxt) not in finished:
                path.append(nxt)
                on_path.add(id(nxt))
                pending.append(referenced(nxt))

    return None