diff --git a/src/caoscrawler/crawl.py b/src/caoscrawler/crawl.py index 0345363bc31072ce4120bfed831a7ef88d494291..d57d54665ce30a6ebe2258b47e2b4aca0c7d8912 100644 --- a/src/caoscrawler/crawl.py +++ b/src/caoscrawler/crawl.py @@ -32,55 +32,48 @@ the acuired data with CaosDB. from __future__ import annotations import argparse -from datetime import datetime import importlib import logging import os import sys -from caosdb.exceptions import EmptyUniqueQueryError import uuid -from caosdb.cached import cached_get_entity_by, cache_clear import warnings -import yaml - from argparse import RawTextHelpFormatter from collections import defaultdict from copy import deepcopy - +from datetime import datetime from enum import Enum -from importlib_resources import files -from jsonschema import validate from typing import Any, Optional, Type, Union import caosdb as db - -from caosadvancedtools.utils import create_entity_link -from caosadvancedtools.cache import UpdateCache, Cache +import yaml +from caosadvancedtools.cache import Cache, UpdateCache from caosadvancedtools.crawler import Crawler as OldCrawler from caosadvancedtools.serverside.helper import send_mail -from caosdb.apiutils import (compare_entities, EntityMergeConflictError, +from caosadvancedtools.utils import create_entity_link +from caosdb.apiutils import (EntityMergeConflictError, compare_entities, merge_entities) +from caosdb.cached import cache_clear, cached_get_entity_by from caosdb.common.datatype import is_reference +from caosdb.exceptions import EmptyUniqueQueryError +from importlib_resources import files +from jsonschema import validate -from .converters import Converter, DirectoryConverter, ConverterValidationError +from .config import get_config_setting +from .converters import Converter, ConverterValidationError, DirectoryConverter +from .debug_tree import DebugTree from .identifiable import Identifiable -from .identifiable_adapters import (IdentifiableAdapter, - LocalStorageIdentifiableAdapter, - 
CaosDBIdentifiableAdapter) +from .identifiable_adapters import (CaosDBIdentifiableAdapter, + IdentifiableAdapter, + LocalStorageIdentifiableAdapter) from .identified_cache import IdentifiedCache +from .logging import configure_server_side_logging from .macros import defmacro_constructor, macro_constructor +from .scanner import (create_converter_registry, initialize_converters, + load_definition, scan_directory, scan_structure_elements) from .stores import GeneralStore, RecordStore -from .structure_elements import StructureElement, Directory, NoneElement +from .structure_elements import Directory, NoneElement, StructureElement from .version import check_cfood_version -from .config import get_config_setting -from .logging import configure_server_side_logging - -from .scanner import (scan_directory, - load_definition, - create_converter_registry, - initialize_converters, - scan_structure_elements) -from .debug_tree import DebugTree logger = logging.getLogger(__name__) @@ -407,7 +400,8 @@ class Crawler(object): cached = self.get_from_any_cache( self.identifiableAdapter.get_identifiable(el, referencing_entities)) if cached is None: - raise RuntimeError("Not in cache.") + lst.append(el) + continue if not check_identical(cached, el, True): if isinstance(p.value, db.File): if p.value.path != cached.path: @@ -422,7 +416,7 @@ class Crawler(object): cached = self.get_from_any_cache( self.identifiableAdapter.get_identifiable(p.value, referencing_entities)) if cached is None: - raise RuntimeError("Not in cache.") + continue if not check_identical(cached, p.value, True): if isinstance(p.value, db.File): if p.value.path != cached.path: @@ -641,6 +635,12 @@ class Crawler(object): self.replace_references_with_cached(record, referencing_entities) if len(flat) > 0: + circle = self.detect_circular_dependency(flat) + if circle is None: + logger.error("Failed, but found NO circular dependency. 
@staticmethod
def represent_circle(circle):
    """Return a human-readable description of the given entities.

    For each entity the name (if present), the names of its parents and
    its properties (name -> value) are listed, so that a user can
    identify the Records involved in a circular dependency.
    """
    text = "The Records with circular dependencies have the following parents:\n--------\n"
    for el in circle:
        if el.name is not None:
            text += f"{el.name}\n"
        # Use a distinct loop variable; shadowing ``el`` here is legal in
        # Python 3 but needlessly confusing.
        text += f"{[parent.name for parent in el.parents]}\n"
        props = {p.name: p.value for p in el.properties}
        text += f"{props}\n"

    return text + "--------\n"

@staticmethod
def detect_circular_dependency(flat: list[db.Entity]):
    """Detect whether the entities in ``flat`` reference each other in a circle.

    Starting from the first element, reference properties are followed as
    long as they point to entities contained in ``flat``.  When an entity
    is reached a second time, a circle exists and the visited chain
    (ending with the repeated entity) is returned.  When no further
    reference inside ``flat`` can be followed, ``None`` is returned.

    Parameters
    ----------
    flat : list[db.Entity]
        The entities among which a circular dependency is searched.

    Returns
    -------
    list[db.Entity] or None
        The chain of entities forming the circle (first entity of the
        circle repeated at the end), or ``None`` if no circle was found.
    """
    # Guard: the previous implementation raised an IndexError for an
    # empty input; an empty list trivially contains no circle.
    if not flat:
        return None
    circle = [flat[0]]
    closed = False
    while not closed:
        current = circle[-1]
        added_to_circle = False
        for p in current.properties:
            if isinstance(p.value, list):
                for pval in p.value:
                    if pval in flat:
                        if pval in circle:
                            closed = True
                        circle.append(pval)
                        added_to_circle = True
            else:
                if p.value in flat:
                    if p.value in circle:
                        closed = True
                    circle.append(p.value)
                    added_to_circle = True
        if not added_to_circle:
            # Dead end: the current entity references nothing (more)
            # inside ``flat``, so no circle is reachable from here.
            return None
    return circle
def test_detect_circular_dependency(crawler_mocked_identifiable_retrieve, caplog):
    """Check circle detection and the error output of split_into_inserts_and_updates.

    a -> c -> b -> a form a circular dependency; ``d`` references nothing.
    """
    crawler = crawler_mocked_identifiable_retrieve
    crawler.identifiableAdapter.get_registered_identifiable = Mock(
        side_effect=lambda x: db.Record().add_parent('C').add_property(name='C'))
    a = db.Record(name='a').add_parent("C")
    b = db.Record(name='b').add_parent("C").add_property(name="C", value=a)
    c = db.Record(name='c').add_parent("C").add_property(name='D', value='e'
                                                         ).add_property(name="C", value=b)
    # NOTE: previously this record was (confusingly) also named 'c'.
    d = db.Record(name='d').add_parent("C")
    a.add_property(name="C", value=c)
    flat = [a, b, c]
    circle = Crawler.detect_circular_dependency(flat)
    assert [id(el) for el in circle] == [id(el) for el in [a, c, b, a]]

    # A record that references nothing inside the list is not part of a
    # circle.  (The old test wrapped an *unqualified* call in
    # raises(Exception), which passed vacuously via NameError.)
    assert Crawler.detect_circular_dependency([d]) is None

    # The circle must make split_into_inserts_and_updates fail and the
    # circle description must be logged by the crawl module's logger.
    caplog.set_level(logging.ERROR, logger="caoscrawler.crawl")
    with raises(Exception):
        _, _ = crawler.split_into_inserts_and_updates(flat)
    assert "-------\na\n['C" in caplog.text
    caplog.clear()