diff --git a/src/caoscrawler/exceptions.py b/src/caoscrawler/exceptions.py index 8066c7f4d2197ac53280fe47d3adb82def310b62..6d08cf76fc177407154e38f0eb6aaa47bc863866 100644 --- a/src/caoscrawler/exceptions.py +++ b/src/caoscrawler/exceptions.py @@ -21,16 +21,26 @@ # class ForbiddenTransaction(Exception): + """Thrown if an transactions is needed that is not allowed. + For example an update of an entity if the security level is INSERT + """ pass class MissingReferencingEntityError(Exception): + """Thrown if the identifiable requires that some entity references the given entity but there + is no such reference """ + def __init__(self, *args, rts=None, **kwargs): self.rts = rts super().__init__(self, *args, **kwargs) class ImpossibleMergeError(Exception): + """Thrown if due to identifying information, two SyncNodes or two Properties of SyncNodes + should be merged, but there is conflicting information that prevents this. + """ + def __init__(self, *args, pname, values, **kwargs): self.pname = pname self.values = values @@ -38,4 +48,7 @@ class ImpossibleMergeError(Exception): class MissingIdentifyingProperty(Exception): + """Thrown if a SyncNode does not have the properties required by the corresponding registered + identifiable + """ pass diff --git a/src/caoscrawler/identifiable.py b/src/caoscrawler/identifiable.py index aead37694e86fa8f837585ab37ba44d0c64e61cd..c7312e12addb89c74d406bdc0e63e1e21e07e12a 100644 --- a/src/caoscrawler/identifiable.py +++ b/src/caoscrawler/identifiable.py @@ -1,7 +1,7 @@ #!/usr/bin/env python3 # encoding: utf-8 # -# This file is a part of the CaosDB Project. +# This file is a part of the LinkAhead Project. # # Copyright (C) 2022 Henrik tom Wörden # @@ -37,10 +37,10 @@ logger = logging.getLogger(__name__) class Identifiable(): """ - The fingerprint of a Record in CaosDB. + The fingerprint of a Record in LinkAhead. - This class contains the information that is used by the CaosDB Crawler to identify Records. - In order to check whether a Record exits in the CaosDB Server, a query can + This class contains the information that is used by the LinkAhead Crawler to identify Records. + In order to check whether a Record exits in the LinkAhead Server, a query can be created using the information contained in the Identifiable. Parameters @@ -84,9 +84,8 @@ class Identifiable(): def _value_representation(value) -> str: """returns the string representation of property values to be used in the hash function - The string is the CaosDB ID of other Entities - (Python Id only if there is no CaosDB ID) and the string representation of bool, float, int - and str. + The string is the LinkAhead ID in case of SyncNode objects (SyncNode objects must have an ID) + and the string representation of None, bool, float, int, datetime and str. """ if value is None: @@ -95,7 +94,7 @@ class Identifiable(): if value.id is not None: return str(value.id) else: - raise RuntimeError("Python Entity without id not allowed") + raise RuntimeError("Python Entity (SyncNode) without ID not allowed") elif isinstance(value, list): return "[" + ", ".join([Identifiable._value_representation(el) for el in value]) + "]" elif (isinstance(value, str) or isinstance(value, int) or isinstance(value, float) @@ -104,7 +103,7 @@ class Identifiable(): else: raise ValueError(f"Unknown datatype of the value: {value}") - @ staticmethod + @staticmethod def _create_hashable_string(identifiable: Identifiable) -> str: """ creates a string from the attributes of an identifiable that can be hashed @@ -121,13 +120,10 @@ class Identifiable(): return rec_string def __eq__(self, other) -> bool: - """ - Identifiables are equal if they belong to the same Record. - equal if attribute representations are equal - """ + """ Identifiables are equal if they share the same ID or if the representation is equal """ if not isinstance(other, Identifiable): raise ValueError("Identifiable can only be compared to other Identifiable objects.") - elif self.record_id is not None and other.record_id is not None: + if self.record_id is not None and other.record_id is not None: return self.record_id == other.record_id elif self.get_representation() == other.get_representation(): return True @@ -135,6 +131,7 @@ class Identifiable(): return False def __repr__(self): + """ deterministic text representation of the identifiable """ pstring = json.dumps({k: str(v) for k, v in self.properties.items()}) return (f"{self.__class__.__name__} for RT {self.record_type}: id={self.record_id}; " f"name={self.name}\n" diff --git a/src/caoscrawler/identifiable_adapters.py b/src/caoscrawler/identifiable_adapters.py index 9fa297ccc8dd7cb32071b01188d167c28f6a94fd..2b0cd294b5304552e0a80f9da29e1737ad8f9c58 100644 --- a/src/caoscrawler/identifiable_adapters.py +++ b/src/caoscrawler/identifiable_adapters.py @@ -2,7 +2,7 @@ # encoding: utf-8 # # ** header v3.0 -# This file is a part of the CaosDB Project. +# This file is a part of the LinkAhead Project. # # Copyright (C) 2021-2022 Henrik tom Wörden # 2021-2022 Alexander Schlemmer @@ -148,7 +148,7 @@ identifiabel, identifiable and identified record) for a Record. query_string = query_string[:-len(" AND ")] return query_string - def check_identifying_props(self, node, raise_exception=True): + def all_identifying_properties_exist(self, node: SyncNode, raise_exception: bool = True): """checks whether all identifying properties exist and raises an error if that's not the case. It furthermore raises an error if "name" is part of the identifiable, but the node does not have a name. @@ -180,6 +180,8 @@ identifiabel, identifiable and identified record) for a Record. else: continue + # multiple occurances are ok here. We deal with that when actually creating an + # identifiable (IDs of referenced Entities might need to get resolved first). if (len([el for el in node.properties if el.name.lower() == prop.name.lower()]) == 0): if raise_exception: i = MissingIdentifyingProperty(f"The property {prop.name} is missing.") @@ -298,7 +300,7 @@ startswith: bool, optional name = None if se.registered_identifiable is None: - raise ValueError("no register_identifiable") + raise ValueError("no registered_identifiable") # fill the values: for prop in se.registered_identifiable.properties: diff --git a/src/caoscrawler/sync_graph.py b/src/caoscrawler/sync_graph.py index 3fd0683353ac7a6decaa1b9dacd57eb673f48075..919131b858ee7530c98c13e207c954c112becd72 100644 --- a/src/caoscrawler/sync_graph.py +++ b/src/caoscrawler/sync_graph.py @@ -43,25 +43,21 @@ from .sync_node import SyncNode, TempID logger = logging.getLogger(__name__) -def _for_each_scalar_value(node: SyncNode, condition: Callable[[Any], bool], kind: str, - value: Callable[[Any], Any] = None): - """ helper function that performs an action on each value element of each property of a node +def _set_each_scalar_value(node: SyncNode, condition: Callable[[Any], bool], value: Any): + """ helper function that conditionally replaces each value element of each property of a node - The argument "kind" determines which action is performed on each property value: - The action (remove or set) is performed on each property value of each property: in case of - lists, it is performed on each list element. The action is only performed if the condition that + If the property value is a list, the replacement is done for each list entry. + The replacement is only performed if the condition that is provided is fulfilled, i.e. the callable ``condition`` returns True. The callable ``condition`` must take the property value (or list element) as the sole argument. - Thus, with "remove" you can conditionally remove values and with "set" you can conditionally - replace values. - Args: node (SyncNode): The node which provides the properties (and their values) to operate on. - condition (Callable): A function with one argument which is interpreted as a condition: Only if - it returns True for the property value, the action is executed. - kind (str): Either "remove" or "set" depending on whether you want to remove the property or replace the property. - value (Callable): A function returning a new value that is set as the property value. This function receives the old value as the single argument. + condition (Callable): A function with one argument which is interpreted as a condition: + Only if it returns True for the property value, the action is + executed. + value (Callable): A function returning a new value that is set as the property value. This + function receives the old value as the single argument. Last review by Alexander Schlemmer on 2024-05-24. """ @@ -69,25 +65,9 @@ def _for_each_scalar_value(node: SyncNode, condition: Callable[[Any], bool], kin if isinstance(p.value, list): for ii, el in enumerate(p.value): if condition(el): - if kind == "remove": - p.value.remove(el) - elif kind == "set": - p.value[ii] = value(el) + p.value[ii] = value(el) elif condition(p.value): - if kind == "remove": - node.properties.remove(p) - elif kind == "set": - p.value = value(p.value) - - -def _remove_each_scalar_value(node: SyncNode, condition: Callable[[Any], bool]): - """ "remove" version of _for_each_scalar_value """ - _for_each_scalar_value(node, condition, "remove") - - -def _set_each_scalar_value(node: SyncNode, condition: Callable[[Any], bool], value: Any): - """ "set" version of _for_each_scalar_value """ - _for_each_scalar_value(node, condition, "set", value=value) + p.value = value(p.value) class SyncGraph(): @@ -138,20 +118,36 @@ class SyncGraph(): Last review by Alexander Schlemmer on 2024-05-24. """ + # General implementation remark: + # There are three cases where an update of one SyncNode can affect other nodes: + # - mark existing (add identifiables) + # - mark missing (add identifiables and add (negative) IDs) + # - merge (add identifiables) + # + # We cannot get an infinite recursion where one update triggers another update and so on + # because updates are conditional: + # Setting an ID removes the node (immediately) from the unchecked list and it is only tried to + # set an ID in _mark_missing if a node is in the uncheck list. Thus, setting the ID once + # prevents future attempts to set the ID of the same node. + # Also, setting an identifiable is only done when needed, i.e. there is no identifiable. + # Note, that when ever one node is changed, we check all dependend nodes (see usage of + # `_get_nodes_whose_identity_relies_on`) whether something should be updated. Thus, we cannot + # miss a necessary update. def __init__(self, entities: list[db.Entity], identifiableAdapter: IdentifiableAdapter): self.identifiableAdapter = identifiableAdapter # A dictionary allowing for quick lookup of sync nodes using their (possibly negative) IDs. # This dictionary is initially set using _mark_entities_with_path_or_id and later updated - # using set_id_of_node. + # using set_id_of_node or during merges of nodes. self._id_look_up: dict[Union[int, TempID, str], SyncNode] = {} - # Similar as above for looking up nodes using paths: + # Similar as above for looking up nodes using paths self._path_look_up: dict[str, SyncNode] = {} - # Similar as above for looking up nodes using identifiables. This dictionary uses - # the get_representation method of Identifiable as keys. + # Similar as above for looking up nodes using identifiables. This dictionary uses the text + # representation generated by get_representation method of Identifiable as keys. self._identifiable_look_up: dict[str, SyncNode] = {} + # look up for the nodes that were marked as being missing (on the remote server) self._missing: dict[int, SyncNode] = {} + # same for existing self._existing: dict[int, SyncNode] = {} - self._nonidentifiable: dict[int, SyncNode] = {} # entities that are missing get negative IDs to allow identifiable creation self._remote_missing_counter = -1 @@ -182,7 +178,7 @@ class SyncGraph(): # Thus, it must be possible to create an # identifiable which is checked using the following function: for node in self.unchecked: - self.identifiableAdapter.check_identifying_props(node) + self.identifiableAdapter.all_identifying_properties_exist(node) def set_id_of_node(self, node: SyncNode, node_id: Optional[str] = None): """sets the ID attribute of the given SyncNode to node_id. @@ -290,6 +286,12 @@ class SyncGraph(): Return an equivalent SyncNode. Equivalent means that ID, path or identifiable are the same. + If a new information was added to the given SyncNode (e.g. the ID), it might be possible + then to identify an equivalent node (i.e. one with the same ID in this example). + There might be more than one equivalent nodes in the graph. However, simply the first that + is found is being returned. (When an equivalent node is found this, the given node is + typically merged, into the one that was found and after the merge the graph is again + checked for equivalent nodes) Returns None if no equivalent node is found. """ @@ -324,7 +326,7 @@ class SyncGraph(): if no identifiable is given, the identifiable is retrieved from the identifiable adapter """ if identifiable is None: - self.identifiableAdapter.check_identifying_props(node) + self.identifiableAdapter.all_identifying_properties_exist(node) identifiable = self.identifiableAdapter.get_identifiable( node, self.backward_id_referenced_by[id(node)]) node.identifiable = identifiable @@ -347,7 +349,11 @@ class SyncGraph(): """ for ent in entities: if ent.role == "Record" and len(ent.parents) == 0: - raise RuntimeError(f"Records must have a parent.\n{ent}") + raise ValueError(f"Records must have a parent.\n{ent}") + if isinstance(ent.id, int) and ent.id < 0: + raise ValueError(f"Records must not have negative integers as IDs.\n{ent}") + if isinstance(ent.id, str) and re.match(r"^-\d+$", ent.id): + raise ValueError(f"Records must not have negative integers as IDs.\n{ent}") def _get_nodes_whose_identity_relies_on(self, node: SyncNode): """returns a set of nodes that reference the given node as identifying property or are @@ -472,17 +478,6 @@ class SyncGraph(): self._path_look_up[node.path] = node self.set_id_of_node(node, remote_id) - def _remove_non_identifiables(self): - """ A path or an ID is sufficiently identifying. Thus, those entities can be marked as - checked - - Last review by Alexander Schlemmer on 2024-05-24. - """ - for node in list(self.nodes[::-1]): - if "nonidentifiable" in [p.name for p in node.registered_identifiable.properties]: - self.unchecked.remove(node) - self._nonidentifiable[id(node)] = node - def _merge_into(self, source: SyncNode, target: SyncNode): """ tries to merge source into target and performs the necessary updates: - update the membervariables of target using source (``target.update(source)``). @@ -574,10 +569,10 @@ class SyncGraph(): # - mark existing # - mark missing # - merge - # For each dependent node, we check whether this allows to create an identifiable - for other_node in self._get_nodes_whose_identity_relies_on(target): - if self._identifiable_is_needed(other_node): - self._set_identifiable_of_node(other_node) + self._add_identifiables_to_dependend_nodes(target) + + if self.get_equivalent(target) is not None: + self._merge_into(target, self.get_equivalent(target)) def _identifiable_is_needed(self, node: SyncNode): """ @@ -590,7 +585,8 @@ class SyncGraph(): Last review by Alexander Schlemmer on 2024-05-24. """ return (node.identifiable is None and not self._identity_relies_on_unchecked_entity(node) - and self.identifiableAdapter.check_identifying_props(node, raise_exception=False)) + and self.identifiableAdapter.all_identifying_properties_exist( + node, raise_exception=False)) def _initialize_nodes(self, entities: list[db.Entity]): """ create initial set of SyncNodes from provided Entity list""" @@ -612,6 +608,12 @@ class SyncGraph(): condition=lambda val: id(val) in se_lookup, value=lambda val: se_lookup[id(val)]) + def _add_identifiables_to_dependend_nodes(self, node): + """ For each dependent node, we check whether this allows to create an identifiable """ + for other_node in self._get_nodes_whose_identity_relies_on(node): + if self._identifiable_is_needed(other_node): + self._set_identifiable_of_node(other_node) + def _mark_missing(self, node: SyncNode): """Mark a sync node as missing and remove it from the dictionary of unchecked nodes. @@ -624,15 +626,11 @@ class SyncGraph(): # - mark existing # - mark missing # - merge - # For each dependent node, we check whether this allows to create an identifiable - for other_node in self._get_nodes_whose_identity_relies_on(node): - if self._identifiable_is_needed(other_node): - self._set_identifiable_of_node(other_node) + self._add_identifiables_to_dependend_nodes(node) # For each dependent node, we set the ID to None (missing) # (None is the default second argument of set_id_of_node.) for other_node in self._get_nodes_whose_identity_relies_on(node): if other_node in self.unchecked: - print(f"set\n{other_node}\n to missing due to missing\n{node}") self.set_id_of_node(other_node) def _mark_existing(self, node: SyncNode): @@ -648,10 +646,4 @@ class SyncGraph(): # - mark existing # - mark missing # - merge - # For each dependent node, we check whether this allows to create an identifiable - for other_node in self._get_nodes_whose_identity_relies_on(node): - if self._identifiable_is_needed(other_node): - self._set_identifiable_of_node(other_node) - - def __repr__(self): - return f"SyncNode with ID={self.id}" + self._add_identifiables_to_dependend_nodes(node) diff --git a/src/caoscrawler/sync_node.py b/src/caoscrawler/sync_node.py index 26dbb41123f01c0a1774c5aeb106e6d973381ff2..80a7e4a183a0fe3685c0c1873935ea43766125c7 100644 --- a/src/caoscrawler/sync_node.py +++ b/src/caoscrawler/sync_node.py @@ -167,7 +167,7 @@ class SyncNode: if unequal: logger.error( "The Crawler is trying to create an entity," - " but there are have conflicting property values." + " but there are conflicting property values." f"Problematic Property: {p.name}\n" f"First value:\n{entval}\n" f"Second value:\n{pval}\n" @@ -180,11 +180,8 @@ class SyncNode: return ent def __repr__(self) -> str: + """ somewhat concise text representation of the SyncNode """ res = f"\n=====================================================\n{self.role}\n" - if hasattr(self, "_metadata"): - res += f"user: {self._metadata['user']}\n" - res += f"json: {self._metadata['json']}\n" - res += "---------------------------------------------------\n" res += yaml.dump( { "id": self.id, @@ -219,13 +216,6 @@ class SyncNode: + "=====================================================\n" ) - def is_unidentifiable(self) -> bool: - """returns whether this is an unidentifiable Node""" - return ( - self.registered_identifiable is not None - and self.registered_identifiable.get_property("no-ident") is not None - ) - def parent_in_list(parent: Parent, plist: _ParentList) -> bool: """helper function that checks whether a parent with the same name or ID is in the plist""" diff --git a/unittests/test_sync_graph.py b/unittests/test_sync_graph.py index fba3dd531972461caf7968cc78a4c4c053a58b42..ab0a46bc1cb98cd9349037ce5cdbe7ed2ace7dab 100644 --- a/unittests/test_sync_graph.py +++ b/unittests/test_sync_graph.py @@ -37,8 +37,7 @@ from itertools import product @pytest.fixture def simple_adapter(): - # We use the reference as identifying reference in both directions. Thus the map is the same - # for all three categories: references, id_references and id_referenced_by + # different RTs with different registered identifiables to allow to test various behavior ident_adapter = CaosDBIdentifiableAdapter() ident_adapter.register_identifiable( "RT1", @@ -166,9 +165,12 @@ def test_SyncGraph_init(): for el in st.nodes: if el.identifiable is not None: assert st._identifiable_look_up[el.identifiable.get_representation()] is el - # node without ID but with identifiable was merged with other node with ID - assert len([el for el in st.nodes if len(el.properties) > 0 - and el.properties[0].value == "MERGEME"]) == 1 + # The node, which has no ID but has an identifiable, was merged with another node with ID (due + # to the shared identifiable) + new_one = [el for el in st.nodes if len(el.properties) > 0 + and el.properties[0].value == "MERGEME"] + assert len(new_one) == 1 + assert new_one[0].id == 104 # every node that does not rely on something unchecked has an identifiable or an ID for el in st.nodes: if not st._identity_relies_on_unchecked_entity(el): diff --git a/unittests/test_sync_node.py b/unittests/test_sync_node.py index 1fc2be6833df9774adf99b40987b1c944fbd5029..31f54a3b2f144c91351fdabe93589363af4486b1 100644 --- a/unittests/test_sync_node.py +++ b/unittests/test_sync_node.py @@ -129,10 +129,6 @@ def test_sync_node(): assert export.name == rec_a.name for p in rec_a.parents + rec_b.parents: assert parent_in_list(p, export.parents) - # if p.name is not None: - # assert p.name in [el.name for el in export.parents] - # if p.id is not None: - # assert p.id in [el.id for el in export.parents] for p in rec_a.properties + rec_b.properties: if p.name is not None: assert p.name in [el.name for el in export.properties] @@ -170,33 +166,32 @@ def test_sync_node(): # merge with conflicting information # ---------------------------------- + # ID mismatch sn_a = SyncNode(db.Record(id=102)) with pytest.raises(ValueError, match="Trying to update"): sn_a.update(SyncNode(db.Record(id=101))) + # name mismatch sn_a = SyncNode(db.Record(name='102')) with pytest.raises(ValueError, match="Trying to update"): sn_a.update(SyncNode(db.Record(name='101'))) + # type mismatch sn_a = SyncNode(db.Record(name='102')) with pytest.raises(ValueError, match="Trying to update"): sn_a.update(SyncNode(db.File(name='102'))) + # description mismatch sn_a = SyncNode(db.Record(description='102')) with pytest.raises(ValueError, match="Trying to update"): sn_a.update(SyncNode(db.Record(description='101'))) + # path mismatch sn_a = SyncNode(db.File(path='102')) with pytest.raises(ValueError, match="Trying to update"): sn_a.update(SyncNode(db.File(path='101'))) - sn_a = SyncNode(db.File(path='102')) - sn_a.identifiable = Identifiable(name='a') - # sn_b.identifiable = Identifiable(name='b') - sn_b = SyncNode(db.File(path='101')) - with pytest.raises(ValueError, match="Trying to update"): - sn_a.update(sn_b) - + # identifiable mismatch sn_a = SyncNode(db.File(path='102')) sn_a.identifiable = Identifiable(name='a') sn_b = SyncNode(db.File(path='101'))