diff --git a/src/caoscrawler/crawl.py b/src/caoscrawler/crawl.py index 0a8c156f603587e894a19b1f356cce2e1dcc5774..4e0490111fdaa4a6580cf17058a168b4a0301496 100644 --- a/src/caoscrawler/crawl.py +++ b/src/caoscrawler/crawl.py @@ -61,6 +61,7 @@ from linkahead.utils.escape import escape_squoted_text from .config import get_config_setting from .converters import Converter, ConverterValidationError from .debug_tree import DebugTree +from .exceptions import MissingIdentifyingProperty from .identifiable import Identifiable from .identifiable_adapters import (CaosDBIdentifiableAdapter, IdentifiableAdapter) @@ -232,6 +233,9 @@ class Crawler(object): warnings.warn(DeprecationWarning( "The generalStore argument of the Crawler class is deprecated and has no effect.")) + self.problem_logger = logging.getLogger(__name__ + "probs") + self.problem_logger.addHandler(logging.FileHandler("problems.txt", encoding='utf-8')) + def load_converters(self, definition: dict): warnings.warn(DeprecationWarning( "The function load_converters in the crawl module is deprecated. " @@ -367,12 +371,26 @@ class Crawler(object): for se in list(st.unchecked): if se not in st.unchecked: continue + if st.identity_relies_on_unchecked_entity(se): print(st.nodes.index(se), "relies on unchecked") continue print(se.identifiable) if se.identifiable is None: + try: + self.identifiableAdapter.check_identifying_props(se) + except MissingIdentifyingProperty as exc: + self.problem_logger.error(f"ERROR: an identifying Property is missing.\n" + f"{exc}\n{se}\n") + st.problems.add(("MissingProperty", se.parents[0].name, exc.prop)) + if st.raise_problems: + raise + else: + remove, rmrefs = st.remove_failed(se) + entity_was_treated = True + continue + st.set_identifiable_of_node(se, st.identifiableAdapter.get_identifiable( se, st.backward_id_referenced_by[se.uuid])) # entity was merged with another due to the new identifiable diff --git a/src/caoscrawler/identifiable_adapters.py b/src/caoscrawler/identifiable_adapters.py index 2405e454f7a17f8a74617349cd95e5b61fe66541..25069874cf7d9be3cb1f12f050ae88081aaf8658 100644 --- a/src/caoscrawler/identifiable_adapters.py +++ b/src/caoscrawler/identifiable_adapters.py @@ -37,6 +37,8 @@ import yaml from linkahead.cached import cached_get_entity_by, cached_query from linkahead.utils.escape import escape_squoted_text +from .exceptions import (MissingIdentifyingProperty, + MissingReferencingEntityError) from .identifiable import Identifiable from .sync_node import SyncNode from .utils import has_parent @@ -133,7 +135,12 @@ identifiabel, identifiable and identified record) for a Record. query_string += " WITH " if ident.name is not None: - query_string += "name='{}'".format(escape_squoted_text(ident.name)) + if len(ident.name) < 200: + query_string += "name='{}'".format(escape_squoted_text(ident.name)) + else: + query_string += (f"name like '{escape_squoted_text(ident.name[:40])}*' AND " + f"name like '*{escape_squoted_text(ident.name[-40:])}'") + if len(ident.properties) > 0: query_string += " AND " @@ -146,6 +153,24 @@ identifiabel, identifiable and identified record) for a Record. query_string = query_string[:-len(" AND ")] return query_string + def check_identifying_props(self, node): + if node.registered_identifiable is None: + raise RuntimeError("no registered_identifiable") + for prop in node.registered_identifiable.properties: + if prop.name.lower() == "is_referenced_by": + continue + if prop.name.lower() == "name": + if node.name is None: + i = MissingIdentifyingProperty(f"The node has no name.") + i.prop = "name" + raise i + else: + continue + + if (len([el for el in node.properties if el.name.lower() == prop.name.lower()]) == 0): + i = MissingIdentifyingProperty(f"The property {prop.name} is missing.") + i.prop = prop.name + @staticmethod def __create_pov_snippet(pname: str, pvalue, startswith: bool = False): """Return something like ``'name'='some value'`` or ``'name' LIKE 'some*'``. @@ -274,7 +299,7 @@ startswith: bool, optional for el in identifiable_backrefs: assert isinstance(el, SyncNode) if len(identifiable_backrefs) == 0: - raise RuntimeError( + raise MissingReferencingEntityError( f"Could not find referencing entities of type(s): {prop.value}\n" f"for registered identifiable:\n{registered_identifiable}\n" f"There were {len(identifiable_backrefs)} referencing entities to choose from.\n" diff --git a/src/caoscrawler/scanner.py b/src/caoscrawler/scanner.py index 9d1f538732858ff2fbf949d45c359ebb16fe3480..f6fc4d8f81d68277d6ad405ab04f50999d85525d 100644 --- a/src/caoscrawler/scanner.py +++ b/src/caoscrawler/scanner.py @@ -362,16 +362,17 @@ def scanner(items: list[StructureElement], debug_tree.debug_metadata["usage"][str(element)].add( "/".join(converters_path + [converter.name])) mod_info = debug_tree.debug_metadata["provenance"] - for record_name, prop_name in keys_modified: - # TODO: check - internal_id = record_store_copy.get_internal_id( - record_name) - record_identifier = record_name + \ - "_" + str(internal_id) - converter.metadata["usage"].add(record_identifier) - mod_info[record_identifier][prop_name] = ( - structure_elements_path + [element.get_name()], - converters_path + [converter.name]) + if keys_modified is not None: + for record_name, prop_name in keys_modified: + # TODO: check + internal_id = record_store_copy.get_internal_id( + record_name) + record_identifier = record_name + \ + "_" + str(internal_id) + converter.metadata["usage"].add(record_identifier) + mod_info[record_identifier][prop_name] = ( + structure_elements_path + [element.get_name()], + converters_path + [converter.name]) scanner(children, converter.converters, general_store_copy, record_store_copy, diff --git a/src/caoscrawler/sync_graph.py b/src/caoscrawler/sync_graph.py index ff9b7c6fa05eb4fc8b026bb160e81bc2151bc99e..fd0f07f1aca65cbd8c615ea120657b03ec90709f 100644 --- a/src/caoscrawler/sync_graph.py +++ b/src/caoscrawler/sync_graph.py @@ -25,6 +25,7 @@ A data model class for the semantic data that shall be created by synchronizatio from __future__ import annotations +import logging from typing import Any, Dict, List, Optional, Union import linkahead as db @@ -33,9 +34,12 @@ from linkahead.apiutils import (EntityMergeConflictError, compare_entities, from linkahead.cached import cache_clear, cached_get_entity_by from linkahead.exceptions import EmptyUniqueQueryError +from .exceptions import MissingReferencingEntityError from .identifiable_adapters import IdentifiableAdapter from .sync_node import SyncNode +logger = logging.getLogger(__name__) + class SyncGraph(): """ combines nodes in the graph based on their identity in order to create a graph of objects @@ -62,7 +66,7 @@ class SyncGraph(): - set_missing: declares that a SyncNode is NOT existing on the remote server """ - def __init__(self, entities: List[db.Entity], identifiableAdapter): + def __init__(self, entities: List[db.Entity], identifiableAdapter, raise_problems=True): self.identifiableAdapter = identifiableAdapter self._id_look_up: Dict[int, SyncNode] = {} self._path_look_up: Dict[str, SyncNode] = {} @@ -71,6 +75,8 @@ class SyncGraph(): self._existing: Dict[int, SyncNode] = {} # entities that are missing get negative IDs to allow identifiable creation self._remote_missing_counter = -1 + self.problems = set() + self.raise_problems = raise_problems self.nodes: List[SyncNode] = [] self._initialize_nodes(entities) # list of all SemanticEntities @@ -88,6 +94,24 @@ class SyncGraph(): ) = self._create_reference_mapping(self.nodes) self._mark_entities_with_path_or_id() + # at this point (after _mark_entities_with_path_or_id) everything has to be removed from + # unchecked (path or ID) or has to have sufficient properties for an identifiable + + for node in list(self.unchecked): + try: + # simply check whether sufficient references exist + self.identity_relies_on_unchecked_entity(node) + except MissingReferencingEntityError as exc: + self.problem_logger.error(f"ERROR: a reference is missing.\nThe Record " + f"{simple_print(node)}\nshould be referenced by one of the " + f"following RecordTypes\n{exc.rts}\nNo such reference was found.") + self.problems.add(("MissingReference", node.parents[0].name, exc.rts)) + if self.raise_problems: + raise + else: + remove, rmrefs = self.remove_failed(node) + logger.error(exc) + logger.error(f"{len(remove)}, {len(rmrefs)}") for node in list(self.nodes): try: identifiable = self.identifiableAdapter.get_identifiable( @@ -273,52 +297,49 @@ class SyncGraph(): SyncGraph._create_flat_list([p.value], flat) return flat - @staticmethod - def _treat_merge_error_of(newrecord, record): - """ - The parameters are two entities that cannot be merged with the merge_entities function. - - # This function checks for two obvious cases where no merge will ever be possible: - # 1. Two Entities with differing IDs - # 2. Two non-Entity values which differ + def remove_failed(self, node): + self.nodes.remove(node) + if node in self.unchecked: + self.unchecked.remove(node) + refs2 = [] + # remove reference property or value from referencing nodes + for referencing in self.backward_references[node.uuid]: + for p in el.properties: + v = p.value + if not isinstance(p.value, list): + v = [v] + for vv in v: + if vv == record: + if not isinstance(p.value, list): + el.properties.remove(p) + else: + p.value.remove(record) + refs2.append(el) - It creates a more informative logger message and raises an Exception in those cases. - """ - for this_p in newrecord.properties: - that_p = record.get_property(this_p.name) - - if that_p is None: - logger.debug(f"Property {this_p.name} does not exist in the second entity. Note that " - "this should not be the reason for the merge conflict.") - continue - - if (isinstance(this_p.value, db.Entity) - and isinstance(that_p.value, db.Entity)): - if this_p.value.id is not None and that_p.value.id is not None: - if this_p.value.id != that_p.value.id: - logger.error("The Crawler is trying to merge two entities " - "because they should be the same object (same" - " identifiables), but they reference " - "different Entities with the same Property." - f"Problematic Property: {this_p.name}\n" - f"Referenced Entities: {this_p.value.id} and " - f"{that_p.value.id}\n" - f"{record}\n{newrecord}") - raise RuntimeError("Cannot merge Entities") - elif (not isinstance(this_p.value, db.Entity) - and not isinstance(that_p.value, db.Entity)): - if ((this_p.value != that_p.value) - # TODO can we also compare lists? - and not isinstance(this_p.value, list) - and not isinstance(that_p.value, list)): - logger.error( - "The Crawler is trying to merge two entities because they should be the same " - "object (same identifiables), but they have different values for the same " - "Property.\n" - f"Problematic Property: {this_p.name}\n" - f"Values: {this_p.value} and {that_p.value}\n" - f"{record}\n{newrecord}") - raise RuntimeError("Cannot merge Entities") + # update reference mappings + for other in self.forward_references.pop(node.uuid): + self.backward_references[other.uuid].remove(node) + for other in self.backward_references.pop(node.uuid): + self.forward_references[other.uuid].remove(node) + + for other in self.forward_id_references.pop(node.uuid): + self.backward_id_references[other.uuid].remove(node) + for other in self.backward_id_references.pop(node.uuid): + self.forward_id_references[other.uuid].remove(node) + + for other in self.forward_id_referenced_by.pop(node.uuid): + self.backward_id_referenced_by[other.uuid].remove(node) + for other in self.backward_id_referenced_by.pop(node.uuid): + self.forward_id_referenced_by[other.uuid].remove(node) + + remove = [] + remove.extend(self.forward_id_referenced_by[node.uuid]) + remove.extend(self.backward_id_references[node.uuid]) + for el in remove: + rm, rf = self.remove_failed(el) + remove.extend(rm) + refs2.extend(rf) + return remove, refs2 @staticmethod def _create_reference_mapping(flat: List[SyncNode]): diff --git a/src/caoscrawler/sync_node.py b/src/caoscrawler/sync_node.py index 4dfc61ceeb29d157ae36cccb23d3273b4868861c..7ca33c670bb5a30fda91e8f0f76afc9b7dd1b502 100644 --- a/src/caoscrawler/sync_node.py +++ b/src/caoscrawler/sync_node.py @@ -22,12 +22,18 @@ from __future__ import annotations +import logging from typing import Any, Dict, List, Optional, Union from uuid import uuid4 as uuid import linkahead as db +import yaml from linkahead.common.models import _ParentList, _Properties +from .exceptions import ImpossibleMergeError + +logger = logging.getLogger(__name__) + class SyncNode(): """ represents the information related to an Entity as it shall be created in LinkAhead @@ -102,12 +108,46 @@ class SyncNode(): pval = pval.id if entval != pval: - raise db.apiutils.EntityMergeConflictError(f"Differing values were set for Property {p.name}:\n" - f"{ent.get_property(p).value}\n{p.value}") + logger.error("The Crawler is trying to create an entity," + " but there are have conflicting property values." + f"Problematic Property: {p.name}\n" + f"First value:\n{ent.get_property(p).value}\n" + f"Second value:\n{p.value}\n" + ) + ime = ImpossibleMergeError("Cannot merge Entities") + ime.pname = p.name + ime.values = (ent.get_property(p).value, p.value) + raise ime else: ent.add_property(id=p.id, name=p.name, value=p.value) return ent + def __repr__(self): + res = f"\n=====================================================\n{self.role}\n" + if hasattr(self, "_metadata"): + res += f"user: {self._metadata['user']}\n" + res += f"json: {self._metadata['json']}\n" + res += "---------------------------------------------------\n" + res += yaml.dump({"id": self.id, "name": self.name, + "parents": [el.name for el in self.parents]}, allow_unicode=True) + res += "---------------------------------------------------\n" + res += "properties:\n" + d = {} + for p in self.properties: + v = p.value + d[p.name] = [] + if not isinstance(p.value, list): + v = [v] + for el in v: + if isinstance(el, SyncNode): + d[p.name].append({"id": el.id, "name": el.name, "parents": [e.name for e in + el.parents]}) + else: + d[p.name].append(el) + + return (res + yaml.dump(d, allow_unicode=True) + + "=====================================================\n") + def parent_in_list(parent, plist): missing = False diff --git a/unittests/test_crawler.py b/unittests/test_crawler.py index bc6b5d43e79aff5974c9745a489716b7d9b4763b..90726f7756333abd8c71ae90c91018cf25fc00f1 100644 --- a/unittests/test_crawler.py +++ b/unittests/test_crawler.py @@ -42,6 +42,9 @@ from caosadvancedtools.models.parser import parse_model_from_string from caoscrawler.crawl import (Crawler, SecurityMode, _treat_deprecated_prefix, crawler_main, split_restricted_path) from caoscrawler.debug_tree import DebugTree +from caoscrawler.exceptions import (ImpossibleMergeError, + MissingIdentifyingProperty, + MissingReferencingEntityError) from caoscrawler.identifiable import Identifiable from caoscrawler.identifiable_adapters import (CaosDBIdentifiableAdapter, IdentifiableAdapter, @@ -281,9 +284,9 @@ def test_split_into_inserts_and_updates_unidentified(crawler_mocked_identifiable crawler = crawler_mocked_identifiable_retrieve st = SyncGraph([db.Record().add_parent("someparent")], crawler.identifiableAdapter) - with raises(ValueError) as err: + with raises(MissingIdentifyingProperty) as err: crawler.split_into_inserts_and_updates(st) - assert str(err.value).startswith("There is no identifying information.") + assert str(err.value).startswith("The node has no name.") def basic_retrieve_by_name_mock_up(rec, referencing_entities=None, known=None): @@ -493,7 +496,7 @@ a: ([b1, b2]) # The Cs cannot be merged due to different identifying properties # The Bs cannot be merged due to differeng references to Cs - with raises(db.apiutils.EntityMergeConflictError) as rte: + with raises(ImpossibleMergeError) as rte: crawler.split_into_inserts_and_updates(st) # assert not isinstance(rte.value, NotImplementedError), \ # "Exception must not be NotImplementedError, but plain RuntimeError." @@ -704,7 +707,7 @@ def test_split_into_inserts_and_updates_backref(crawler_mocked_for_backref_test) st = SyncGraph([db.Record(name="B").add_parent("C")], crawler.identifiableAdapter) # Test without referencing object # currently a RuntimeError is raised if necessary properties are missing. - with raises(RuntimeError): + with raises(MissingReferencingEntityError): crawler.split_into_inserts_and_updates(st) # identifiables were not yet checked diff --git a/unittests/test_sync_graph.py b/unittests/test_sync_graph.py index 2b39d42e225d39d7bcbecbd8d04cba07bddef976..49ba396047ce8adf6bf34e4476df45e3517a12d4 100644 --- a/unittests/test_sync_graph.py +++ b/unittests/test_sync_graph.py @@ -543,6 +543,7 @@ def test_sync_node(): .add_property(name="a", value='a') .add_property(id=103, value='b')) sn = SyncNode(rec) + assert "Record" in str(sn) assert sn.id == rec.id assert sn.name == rec.name assert sn.parents == rec.parents