From 8179f69c1c56e2ca393d2e0c0190ea33d9bd21cc Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Henrik=20tom=20W=C3=B6rden?= <h.tomwoerden@indiscale.com>
Date: Wed, 17 Apr 2024 12:14:49 +0200
Subject: [PATCH] wip

---
 src/caoscrawler/crawl.py                 |  71 ++----
 src/caoscrawler/identifiable.py          |   8 +-
 src/caoscrawler/identifiable_adapters.py |   8 +-
 src/caoscrawler/semantic_target.py       | 287 ++++++++++++++---------
 unittests/test_crawler.py                |  20 +-
 unittests/test_identifiable_adapters.py  |  43 +++-
 unittests/test_semantic_target.py        |  76 +++++-
 7 files changed, 332 insertions(+), 181 deletions(-)

diff --git a/src/caoscrawler/crawl.py b/src/caoscrawler/crawl.py
index 17171e22..23707a6c 100644
--- a/src/caoscrawler/crawl.py
+++ b/src/caoscrawler/crawl.py
@@ -366,38 +366,32 @@ class Crawler(object):
             # 1. Is it in the cache of already checked Records?
             # 2. Does it have to be new since a needed reference is missing?
             # 3. Can it be checked on the remote server?
-            for se in st.unchecked:
+            for se in list(st.unchecked):
                 if st.identity_relies_on_unchecked_entity(se):
+                    logger.debug("%s relies on unchecked", st.se.index(se))
                     continue
 
-                if se.identifiable is None:
-                    se.identifiable = self.identifiableAdapter.get_identifiable(
-                        se, st.backward_id_referenced_by[se.uuid])
-
-                equivalent_se = st.get_equivalent(se)
-
-                # 1. Is it in the cache of already checked Records?
-                if equivalent_se is not None:
-                    # We merge record into treated_record in order to prevent loss of information
-                    st.merge_into(se, equivalent_se)
+                st.make_identifiable(se)
+                logger.debug("%s is now identifiable", st.se.index(se))
+                if st.merge_with_equivalent(se):
+                    logger.debug("entity was merged into an equivalent one")
+                    entity_was_treated = True
+                    continue
 
                 # 2. Does it have to be new since a needed reference is missing?
                 # (Is it impossible to check this record because an identifiable references a
                 # missing record?)
-                elif st.identity_relies_on_missing_entity(se):
-                    st.add_to_missing(se)
+                if st.identity_relies_on_missing_entity(se):
+                    st.set_missing(se)
 
                 # 3. check on the remote server
                 else:
-                    identified_record = (
-                        self.identifiableAdapter.retrieve_identified_record_for_identifiable(
-                            se.identifiable))
-                    if identified_record is None:
-                        st.add_to_missing(se)
+                    st.check_remote_server(se)
+                    if se.id is None:
+                        st.set_missing(se)
                     else:
-                        se.identify_with(identified_record)
-                        st.add_to_existing(se)
-                entity_was_treated = True
+                        st.set_existing(se)
+                entity_was_treated = True
 
             # TODO
             # for record in st.entities:
@@ -406,7 +400,7 @@ class Crawler(object):
         # We postponed the merge for records where it failed previously and try it again now.
         # This only might add properties of the postponed records to the already used ones.
         if len(st.unchecked) > 0:
-            circle = self.detect_circular_dependency(st.entities)
+            circle = st.detect_circular_dependency()
             if circle is None:
                 logger.error("Failed, but found NO circular dependency. The data is as follows:"
                              # + str(self.compact_entity_list_representation(st.entities,
@@ -468,39 +462,6 @@ class Crawler(object):
 
         return text + "--------\n"
 
-    @staticmethod
-    def detect_circular_dependency(flat: list[db.Entity]):
-        """
-        Detects whether there are circular references in the given entity list and returns a list
-        where the entities are ordered according to the chain of references (and only the entities
-        contained in the circle are included. Returns None if no circular dependency is found.
-
-        TODO: for the sake of detecting problems for split_into_inserts_and_updates we should only
-        consider references that are identifying properties.
-        """
-        circle = [flat[0]]
-        closed = False
-        while not closed:
-            current = circle[-1]
-            added_to_circle = False
-            for p in current.properties:
-                if isinstance(p.value, list):
-                    for pval in p.value:
-                        if pval in flat:
-                            if pval in circle:
-                                closed = True
-                            circle.append(pval)
-                            added_to_circle = True
-                else:
-                    if p.value in flat:
-                        if p.value in circle:
-                            closed = True
-                        circle.append(p.value)
-                        added_to_circle = True
-            if not added_to_circle:
-                return None
-        return circle
-
     @staticmethod
     def _merge_properties_from_remote(
             crawled_data: list[db.Record],
diff --git a/src/caoscrawler/identifiable.py b/src/caoscrawler/identifiable.py
index cefdf4a0..3df5bfa7 100644
--- a/src/caoscrawler/identifiable.py
+++ b/src/caoscrawler/identifiable.py
@@ -20,12 +20,14 @@
 #
 
 from __future__ import annotations
-import linkahead as db
-from datetime import datetime
+
 import json
+import logging
+from datetime import datetime
 from hashlib import sha256
 from typing import Union
-import logging
+
+import linkahead as db
 
 logger = logging.getLogger(__name__)
 
diff --git a/src/caoscrawler/identifiable_adapters.py b/src/caoscrawler/identifiable_adapters.py
index d9e1080e..85f57449 100644
--- a/src/caoscrawler/identifiable_adapters.py
+++ b/src/caoscrawler/identifiable_adapters.py
@@ -264,8 +264,9 @@ startswith: bool, optional
                 if prop.name == "name":
                     name_options = [f.name for f in se.fragments if f.name is not None]
                     if len(name_options) == 0:
-                        raise RuntimeError("name")
-                    assert all([f == name_options[0] for f in name_options])
+                        raise RuntimeError("name is missing!")
+                    if not all([f == name_options[0] for f in name_options]):
+                        raise RuntimeError("differing names in fragments")
                     name = name_options[0]
                     continue
                 # problem: what happens with multi properties?
@@ -290,7 +291,8 @@ startswith: bool, optional
                         f"The following record is missing an identifying property:\n"
                         f"RECORD\n{se.fragments[0]}\nIdentifying PROPERTY\n{prop.name}"
                     )
-                assert all([f.value == options[0].value for f in options])
+                if not all([f.value == options[0].value for f in options]):
+                    raise RuntimeError("differing prop values in fragments")
                 record_prop = options[0]
 
                 identifiable_props[record_prop.name] = record_prop.value
diff --git a/src/caoscrawler/semantic_target.py b/src/caoscrawler/semantic_target.py
index 3154f81f..89e65c20 100644
--- a/src/caoscrawler/semantic_target.py
+++ b/src/caoscrawler/semantic_target.py
@@ -29,6 +29,8 @@ from uuid import uuid4 as uuid
 import linkahead as db
 from linkahead.apiutils import (EntityMergeConflictError, compare_entities,
                                 merge_entities)
+from linkahead.cached import cache_clear, cached_get_entity_by
+from linkahead.exceptions import EmptyUniqueQueryError
 
 from .identifiable_adapters import IdentifiableAdapter
 
@@ -49,6 +51,14 @@ class SemanticEntity():
         self.uuid = uuid()
 
     def identify_with(self, remote_entity):
+        """ the given remote_entity is considered to be the target entity.
+
+        ID and path are copied to this objects attributes and to its fragments
+        """
+
+        self.id = remote_entity.id
+        self.path = remote_entity.path
+
         for f in self.fragments:
             # side effect
             f.id = remote_entity.id
@@ -58,32 +68,59 @@ class SemanticEntity():
             f._size = remote_entity._size
             f._checksum = remote_entity._checksum
 
+    def include(self, source):
+        """ adds fragments, ID and path of `source`; raises RuntimeError on conflicting values """
+        self.fragments.extend(source.fragments)
+        if source.id is not None:
+            if self.id is None:
+                self.id = source.id
+            elif self.id != source.id:
+                raise RuntimeError(f"conflicting IDs: {self.id} != {source.id}")
+
+        if source.path is not None:
+            if self.path is None:
+                self.path = source.path
+            elif self.path != source.path:
+                raise RuntimeError(f"conflicting paths: {self.path} != {source.path}")
+
 
 class SemanticTarget():
     """ models the target structure of Entities as it shall be created by the Crawler
 
+    This model should only be manipulated via the following functions:
+    - make_identifiable: adds an identifiable to a SemanticEntity, which may allow merging it
+                         with another SemanticEntity
+    - merge_with_equivalent: checks whether there is an equivalent SemanticEntity and merges into
+                             that one if it is found
+    - check_remote_server: uses the identifiable to check the remote server and adds ID and path
+                           if an object was found
+    - set_existing: declares that a SemanticEntity exists on the remote server
+    - set_missing: declares that a SemanticEntity does NOT exist on the remote server
     """
 
     def __init__(self, entities: list[db.Entity], identifiableAdapter):
         self.identifiableAdapter = identifiableAdapter
-        self.entities = self._create_flat_list(entities)
         self._id_look_up: dict[int, SemanticEntity] = {}
         self._path_look_up: dict[str, SemanticEntity] = {}
         self._identifiable_look_up: dict[str, SemanticEntity] = {}
         self._missing: dict[int, SemanticEntity] = {}
         self._existing: dict[int, SemanticEntity] = {}
+        self._remote_missing_counter = -1  # TODO: I guess we can now get rid of this...
 
         # create initial set of SemanticEntities from provided Entity list
         self.se: list[SemanticEntity] = []  # list of all SemanticEntities
-        self.se_lookup: dict[str, SemanticEntity] = {}  # get a SemanticEntity by its UUID
-        for el in self.entities:
+        # TODO do we only need this for creating the initial reference map? Remove it?
+        self.se_lookup: dict[int, SemanticEntity] = {}  # lookup: id(Entity) -> SemanticEntity
+        entities = self._create_flat_list(entities)
+        self._sanity_check(entities)
+        for el in entities:
             self.se.append(SemanticEntity(
                 el,
                 self.identifiableAdapter.get_registered_identifiable(el)))
             self.se_lookup[id(el)] = self.se[-1]
         self.unchecked = list(self.se)  # list all SemanticEntities that have not yet been checked
-        self._remote_missing_counter = -1
-        self.sanity_check(self.entities)
+
+        # initialize reference mappings
         (
             self.forward_references,
             self.backward_references,
@@ -95,12 +132,61 @@ class SemanticTarget():
 
         self._mark_entities_with_path_or_id()
 
-    def get_equivalent(self, entity: SemanticEntity = None) -> Optional[SemanticEntity]:
+    def make_identifiable(self, se: SemanticEntity):
+        """ creates an identifiable for the given SemanticEntity; no merging is done here (see
+        merge_with_equivalent). Raises a RuntimeError if `se` already has an identifiable.
+        """
+        if se.identifiable is not None:
+            raise RuntimeError("Already has identifiable")
+        se.identifiable = self.identifiableAdapter.get_identifiable(
+            se, self.backward_id_referenced_by[se.uuid])
+
+    def merge_with_equivalent(self, se: SemanticEntity):
+        """ merges `se` into its checked equivalent, if any; returns whether a merge was done """
+        checked_twin = self.get_checked_equivalent(se)
+        if checked_twin is None:
+            return False
+        self._merge_into(se, checked_twin)
+        return True
+
+    def check_remote_server(self, se: SemanticEntity):
+        """ queries the remote server for se.identifiable and identifies `se` with a hit """
+        remote_record = self.identifiableAdapter.retrieve_identified_record_for_identifiable(
+            se.identifiable)
+        if remote_record is not None:
+            se.identify_with(remote_record)
+
+    def set_missing(self, se: SemanticEntity):
+        """ add the given SemanticEntity to the collection of missing entities
+
+        This removes the SemanticEntity from the unchecked list and implies that it does NOT exist
+        on the remote server. A RuntimeError is raised if it has neither path nor identifiable.
+        """
+        assert se.id is None
+        if se.path is None and se.identifiable is None:
+            raise RuntimeError("no identifying information")
+        se.id = self._remote_missing_counter  # placeholder ID taken from a decreasing counter
+        self._remote_missing_counter -= 1
+        self._add_any(se, self._missing)
+        self.unchecked.remove(se)
+
+    def set_existing(self, se: SemanticEntity):
+        """ add the given SemanticEntity to the collection of existing entities
+
+        This removes the SemanticEntity from the unchecked list and implies that the entity exists
+        on the remote server. The ID of the SemanticEntity must already be set.
+        """
+        assert se.id is not None
+        self._add_any(se, self._existing)
+        self.unchecked.remove(se)
+
+    def get_checked_equivalent(self, entity: SemanticEntity) -> Optional[SemanticEntity]:
         """
         Return an equivalent SemanticEntity from the list of missing or existing entities.
 
         Equivalent means that ID, path or identifiable are the same.
         """
+        # TODO shall we also provide a variant that returns equivalent objects that are unchecked?
         if entity.id is not None and entity.id in self._id_look_up:
             return self._id_look_up[entity.id]
         if entity.path is not None and entity.path in self._path_look_up:
@@ -149,93 +235,6 @@ class SemanticTarget():
 
         return (missing, [el.fragments[0] for el in self._existing.values()])
 
-    def add_to_missing(self, se: SemanticEntity):
-        assert se.id is None
-        if se.path is None and se.identifiable is None:
-            raise RuntimeError("no identifying information")
-        se.id = self._remote_missing_counter
-        self._remote_missing_counter -= 1
-        self._add_any(se, self._missing)
-        self.unchecked.remove(se)
-
-    def add_to_existing(self, se: SemanticEntity):
-        """ add a SemanticEntity to the lookup of treated entities and remove id from the unchecked
-        list
-        Add a Record that was treated, such that it is contained in the internal look up dicts
-
-        This Record MUST have an ID if it was found in the remote server.
-"""
-        self._add_any(se, self._existing)
-        self.unchecked.remove(se)
-
-    def merge_into(self, source: SemanticEntity, target: SemanticEntity):
-        """ tries to merge record into newrecord
-
-        If it fails, record is added to the try_to_merge_later list.
-        In any case, references are bent to the newrecord object.
-
-        """
-        for frag in source.fragments:
-            try:
-                merge_entities(
-                    target.fragments[0], frag, merge_references_with_empty_diffs=False,
-                    merge_id_with_resolved_entity=True)
-            except EntityMergeConflictError:
-                self._treat_merge_error_of(target.fragments[0], frag)
-                # We cannot merge but it is none of the clear case where merge is
-                # impossible. Thus we try later
-                target.fragments.append(frag)
-                if target.fragments[0].id is not None:
-                    frag.id = target.fragments[0].id
-            except NotImplementedError:
-                print(target)
-                print(source)
-                raise
-        if source.id is not None:
-            if target.id is None:
-                target.id = source.id
-            else:
-                assert target.id == source.id
-
-        if source.path is not None:
-            if target.path is None:
-                target.path = source.path
-            else:
-                assert target.path == source.path
-
-        # update reference mappings
-        for se in self.forward_references.pop(source.uuid):
-            self.forward_references[target.uuid].add(se)
-            self.backward_references[se.uuid].remove(source)
-            self.backward_references[se.uuid].add(target)
-        for se in self.backward_references.pop(source.uuid):
-            self.backward_references[target.uuid].add(se)
-            self.forward_references[se.uuid].remove(source)
-            self.forward_references[se.uuid].add(target)
-
-        for se in self.forward_id_references.pop(source.uuid):
-            self.forward_id_references[target.uuid].add(se)
-            self.backward_id_references[se.uuid].remove(source)
-            self.backward_id_references[se.uuid].add(target)
-        for se in self.backward_id_references.pop(source.uuid):
-            self.backward_id_references[target.uuid].add(se)
-            self.forward_id_references[se.uuid].remove(source)
-            self.forward_id_references[se.uuid].add(target)
-
-        for se in self.forward_id_referenced_by.pop(source.uuid):
-            self.forward_id_referenced_by[target.uuid].add(se)
-            self.backward_id_referenced_by[se.uuid].remove(source)
-            self.backward_id_referenced_by[se.uuid].add(target)
-        for se in self.backward_id_referenced_by.pop(source.uuid):
-            self.backward_id_referenced_by[target.uuid].add(se)
-            self.forward_id_referenced_by[se.uuid].remove(source)
-            self.forward_id_referenced_by[se.uuid].add(target)
-
-        # remove empyt SemanticEntity
-        self.se.remove(source)
-        if source in self.unchecked:
-            self.unchecked.remove(source)
-
     def identity_relies_on_unchecked_entity(self, se: SemanticEntity):
         """
         If a record for which it could not yet be verified whether it exists in LA or not is part
@@ -257,8 +256,8 @@ class SemanticTarget():
         return any([id(ent) in self._missing for ent in self.forward_id_references[se.uuid]]
                    + [id(ent) in self._missing for ent in self.backward_id_referenced_by[se.uuid]])
 
-    @staticmethod
-    def sanity_check(entities: list[db.Entity]):
+    @ staticmethod
+    def _sanity_check(entities: list[db.Entity]):
         for ent in entities:
             if ent.role == "Record" and len(ent.parents) == 0:
                 raise RuntimeError(f"Records must have a parent.\n{ent}")
@@ -271,7 +270,7 @@ class SemanticTarget():
                 merge_entities(se.fragments[0], ent, merge_id_with_resolved_entity=True)
             se.fragments = [se.fragments[0]]
 
-    @staticmethod
+    @ staticmethod
     def _create_flat_list(ent_list: list[db.Entity], flat: Optional[list[db.Entity]] = None):
         """
         Recursively adds entities and all their properties contained in ent_list to
@@ -301,7 +300,7 @@ class SemanticTarget():
                         SemanticTarget._create_flat_list([p.value], flat)
         return flat
 
-    @staticmethod
+    @ staticmethod
     def _treat_merge_error_of(newrecord, record):
         """
         The parameters are two entities that cannot be merged with the merge_entities function.
@@ -348,7 +347,7 @@ class SemanticTarget():
                         f"{record}\n{newrecord}")
                     raise RuntimeError("Cannot merge Entities")
 
-    @staticmethod
+    @ staticmethod
     def _create_reference_mapping(flat: list[SemanticEntity], se_lookup):
         """
         TODO update docstring
@@ -407,7 +406,7 @@ class SemanticTarget():
     def _mark_entities_with_path_or_id(self):
         """ A path or an ID is sufficiently identifying. Thus, those entities can be marked as
         checked """
-        for semantic_entity in self.se:
+        for semantic_entity in list(self.se[::-1]):
             assert len(semantic_entity.fragments) == 1
             entity = semantic_entity.fragments[0]
             if entity.id is None and entity.path is None:
@@ -420,17 +419,41 @@ class SemanticTarget():
                 if existing is not None:
                     semantic_entity.identify_with(existing)
 
-            treated_before = self.get_equivalent(semantic_entity)
+            treated_before = self.get_checked_equivalent(semantic_entity)
             if treated_before is None:
                 if semantic_entity.id is None:
-                    self.add_to_missing(semantic_entity)
+                    self.set_missing(semantic_entity)
                 else:
-                    self.add_to_existing(semantic_entity)
+                    self.set_existing(semantic_entity)
             else:
-                self.merge_into(semantic_entity, self.se_lookup[id(treated_before)])
+                self._merge_into(semantic_entity, treated_before)
+
+    def detect_circular_dependency(self):
+        """
+        Detects whether there are circular references among the unchecked SemanticEntities. Starts
+        at the first unchecked entity and follows forward references to other unchecked entities.
+        Returns the list of visited entities or None if no circular dependency is found.
 
-    @staticmethod
-    def bend_references_to_new_object(old, new, entities):
+        TODO: for the sake of detecting problems for split_into_inserts_and_updates we should only
+        consider references that are identifying properties.
+        """
+        if not self.unchecked:
+            return None
+        circle, closed = [self.unchecked[0]], False
+        while not closed:
+            added_to_circle = False
+            for referenced in self.forward_references[circle[-1].uuid]:
+                if referenced in self.unchecked:
+                    closed = closed or referenced in circle
+                    circle.append(referenced)
+                    added_to_circle = True
+            if not added_to_circle:
+                return None
+        return circle
+
+    @ staticmethod
+    def _bend_references_to_new_object(old, new, entities):
+        # TODO still needed???
         """ Bend references to the other object
         Iterate over all entities in `entities` and check the values of all properties of
         occurances of old Entity and replace them with new Entity
@@ -453,3 +476,57 @@ class SemanticTarget():
         if entity.identifiable is not None:
             self._identifiable_look_up[entity.identifiable.get_representation()] = entity
         lookup[id(entity)] = entity
+
+    def _merge_into(self, source: SemanticEntity, target: SemanticEntity):
+        """ merges the SemanticEntity `source` into `target`
+
+        The fragments, ID and path of `source` are taken over by `target`, every reference
+        mapping that involves `source` is bent to `target` and `source` is removed from this
+        SemanticTarget. Finally, an identifiable is created for `target` if it has none yet and
+        its identity does not rely on unchecked entities; failing to create it is tolerated.
+
+        Parameters
+        ----------
+        source : SemanticEntity
+            The entity that is dissolved; it must be in neither the missing nor existing lookup.
+        target : SemanticEntity
+            The entity that remains and holds the fragments of both entities afterwards.
+        """
+        for f in source.fragments:
+            self.se_lookup[id(f)] = target
+        target.include(source)
+
+        # update reference mappings; each (forward, backward) pair is treated the same way
+        for forward, backward in (
+                (self.forward_references, self.backward_references),
+                (self.forward_id_references, self.backward_id_references),
+                (self.forward_id_referenced_by, self.backward_id_referenced_by)):
+            for se in forward.pop(source.uuid):
+                forward[target.uuid].add(se)
+                backward[se.uuid].remove(source)
+                backward[se.uuid].add(target)
+            for se in backward.pop(source.uuid):
+                backward[target.uuid].add(se)
+                forward[se.uuid].remove(source)
+                forward[se.uuid].add(target)
+
+        # remove unneeded SemanticEntity
+        self.se.remove(source)
+        if source in self.unchecked:
+            self.unchecked.remove(source)
+        assert id(source) not in self._missing
+        assert id(source) not in self._existing
+
+        # The identifiable is created last on purpose: make_identifiable reads
+        # backward_id_referenced_by[target.uuid], which includes the references taken over from
+        # `source` only after the mappings above have been updated.
+        if target.identifiable is None and not self.identity_relies_on_unchecked_entity(target):
+            try:
+                self.make_identifiable(target)
+                if target not in self.unchecked:
+                    self._identifiable_look_up[target.identifiable.get_representation()] = target
+            except Exception as err:
+                # best effort: `target` stays without identifiable; report instead of print()
+                import logging  # function-scope import keeps this method self-contained
+                logging.getLogger(__name__).debug(
+                    "Could not create an identifiable for the merge target: %s", err)
diff --git a/unittests/test_crawler.py b/unittests/test_crawler.py
index 2399c2fe..b18f6062 100644
--- a/unittests/test_crawler.py
+++ b/unittests/test_crawler.py
@@ -301,8 +301,8 @@ def test_split_into_inserts_and_updates_single(crawler_mocked_identifiable_retri
                db.Record(name="B").add_parent("C")]
 
     st = SemanticTarget(entlist, crawler.identifiableAdapter)
-    assert st.get_equivalent(st.se[0]) is None
-    assert st.get_equivalent(st.se[0]) is None
+    assert st.get_checked_equivalent(st.se[0]) is None
+    assert st.get_checked_equivalent(st.se[0]) is None
     assert not st.identity_relies_on_unchecked_entity(st.se[0])
     assert not st.identity_relies_on_unchecked_entity(st.se[1])
     assert crawler.identifiableAdapter.retrieve_identified_record_for_record(
@@ -487,7 +487,7 @@ a: ([b1, b2])
     assert st.identity_relies_on_unchecked_entity(st.se[3])
     assert st.identity_relies_on_unchecked_entity(st.se[4])
     st.se[0].identifiable = Identifiable(path='a')  # dummy identifiable
-    st.add_to_missing(st.se[0])
+    st.set_missing(st.se[0])
     assert st.identity_relies_on_unchecked_entity(st.se[1]) is False
 
     with raises(db.apiutils.EntityMergeConflictError) as rte:
@@ -708,8 +708,8 @@ def test_split_into_inserts_and_updates_backref(crawler_mocked_for_backref_test)
 
     # identifiables were not yet checked
     st = SemanticTarget(entlist, crawler.identifiableAdapter)
-    assert st.get_equivalent(st.se[1]) is None
-    assert st.get_equivalent(st.se[0]) is None
+    assert st.get_checked_equivalent(st.se[1]) is None
+    assert st.get_checked_equivalent(st.se[0]) is None
     # one can be found remotely, one not
     assert crawler.identifiableAdapter.retrieve_identified_record_for_record(
         identlist[0]).id == 1111
@@ -1020,7 +1020,7 @@ def test_treated_record_lookup():
     miss = trlu.se[1]
     fi = trlu.se[2]
     exist.id = 1
-    trlu.add_to_existing(exist)
+    trlu.set_existing(exist)
     assert len(trlu._existing) == 1
     # was added to existing
     assert trlu._existing[id(exist)] is exist
@@ -1029,9 +1029,9 @@ def test_treated_record_lookup():
 
     # exception when identifiable is missing
     with raises(RuntimeError):
-        trlu.add_to_missing(miss)
+        trlu.set_missing(miss)
     miss.identifiable = Identifiable(name='a')
-    trlu.add_to_missing(miss)
+    trlu.set_missing(miss)
     # was added to missing
     assert trlu._missing[id(miss)] is miss
     # is in ident lookup
@@ -1039,7 +1039,7 @@ def test_treated_record_lookup():
 
     fi.path = 'a'
     fi.id = 2
-    trlu.add_to_existing(fi)
+    trlu.set_existing(fi)
     assert len(trlu._existing) == 2
     # was added to existing
     assert trlu._existing[id(fi)] is fi
@@ -1056,7 +1056,7 @@ def test_treated_record_lookup():
     # If a Record was added using the ID, the ID must be used to identify it even though later an
     # identifiable may be passed as well
     exist.identifiable = Identifiable(name='b')
-    assert trlu.get_equivalent(exist) is exist
+    assert trlu.get_checked_equivalent(exist) is exist
 
 
 def test_merge_entity_with_identifying_reference(crawler_mocked_identifiable_retrieve):
diff --git a/unittests/test_identifiable_adapters.py b/unittests/test_identifiable_adapters.py
index ee5374a9..dd6f40af 100644
--- a/unittests/test_identifiable_adapters.py
+++ b/unittests/test_identifiable_adapters.py
@@ -156,9 +156,6 @@ def test_convert_value():
 
 
 def test_get_identifiable():
-    # TODO modify this such that it becomes a test that acutally tests (sufficiently) the
-    # get_identifable function
-
     ident = CaosDBIdentifiableAdapter()
     ident.load_from_yaml_definition(UNITTESTDIR / "example_identifiables.yml")
     rec = (db.Record(id=5)
@@ -175,6 +172,46 @@ def test_get_identifiable():
     assert len(r_cur.properties) == 2
     assert len(id_r0.properties) == 1
 
+    ident = CaosDBIdentifiableAdapter()
+    ident_a = db.RecordType(name="A").add_parent("A").add_property("name").add_property("a")
+    ident.register_identifiable("A", ident_a)
+    rec = (db.Record(id=5)
+           .add_parent(name="A", id=3)
+           .add_property(name="a", value="2022-02-01")
+           .add_property(name="result", value="FAIL"))
+    se = SemanticEntity(rec, ident.get_registered_identifiable(rec))
+    se.fragments.extend([
+        db.Record()
+        .add_parent(name="A", id=3)
+        .add_property(name="a", value="2022-02-01")
+        .add_property(name="result", value="FAIL"),
+        db.Record(name='a')
+        .add_parent(name="A", id=3)
+        .add_property(name="a", value="2022-02-01")
+        .add_property(name="result", value="FAIL"),
+    ])
+
+    id_r0 = ident.get_identifiable(se, [])
+    r_cur = se.fragments[0]
+    assert r_cur.parents[0].name == id_r0.record_type
+    assert r_cur.get_property("a").value == id_r0.properties["a"]
+    assert 'a' == id_r0.name
+    assert len(id_r0.properties) == 1
+
+    rec = (db.Record(name='a')
+           .add_parent(name="A")
+           .add_property(name="a", value="2")
+           )
+    se = SemanticEntity(rec, ident.get_registered_identifiable(rec))
+    se.fragments.extend([
+        db.Record(name='a')
+        .add_parent(name="A")
+        .add_property(name="a", value="3")
+    ])
+
+    with pytest.raises(RuntimeError):
+        id_r0 = ident.get_identifiable(se, [])
+
 
 @ pytest.mark.xfail
 def test_retrieve_identified_record_for_identifiable():
diff --git a/unittests/test_semantic_target.py b/unittests/test_semantic_target.py
index 7c5b8a81..dc6bf0d7 100644
--- a/unittests/test_semantic_target.py
+++ b/unittests/test_semantic_target.py
@@ -18,10 +18,15 @@
 # You should have received a copy of the GNU Affero General Public License
 # along with this program. If not, see <https://www.gnu.org/licenses/>.
 #
+from functools import partial
+from unittest.mock import MagicMock, Mock, patch
+
 import linkahead as db
 from caoscrawler.identifiable_adapters import CaosDBIdentifiableAdapter
 from caoscrawler.semantic_target import SemanticEntity, SemanticTarget
 
+from test_crawler import basic_retrieve_by_name_mock_up, mock_get_entity_by
+
 
 def test_create_flat_list():
     a = db.Record()
@@ -147,7 +152,7 @@ def test_merge_into():
     assert len(st.backward_id_referenced_by[se_c.uuid]) == 1
     se_a in st.backward_id_referenced_by[se_c.uuid]
 
-    st.merge_into(se_a, se_b)
+    st._merge_into(se_a, se_b)
 
     # CHECK REFERENCE MAP (after merge):
     # c is now referenced by b
@@ -223,7 +228,7 @@ def test_merge_into():
     se_a in st.backward_id_referenced_by[se_c.uuid]
     se_b in st.backward_id_referenced_by[se_c.uuid]
 
-    st.merge_into(se_a, se_b)
+    st._merge_into(se_a, se_b)
 
     # CHECK REFERENCE MAP (after merge):
     # c is now referenced by b
@@ -270,3 +275,70 @@ def test_backward_id_referenced_by():
 
     st = SemanticTarget(entlist, ident_adapter)
     assert st.se[1] in st.backward_id_referenced_by[st.se[0].uuid]
+
+
+@patch("caoscrawler.semantic_target.cached_get_entity_by",
+       new=Mock(side_effect=mock_get_entity_by))
+def test_merging():
+    # identifying information can be given at various locations in the hierarchical tree
+    # test whether an object is correctly combined for all cases
+    ident_adapter = CaosDBIdentifiableAdapter()
+    ident_a = db.RecordType().add_parent("A").add_property("name").add_property("a")
+    ident_adapter.register_identifiable("A", ident_a)
+    ident_adapter.retrieve_identified_record_for_identifiable = Mock(
+        side_effect=partial(
+            basic_retrieve_by_name_mock_up, known={"A": db.Record(id=1111, name="A")}))
+
+    # merging based on id: both Records carry the same ID and are merged on construction
+    entlist = [
+        db.Record(id=101).add_parent("A"),
+        db.Record(id=101).add_parent("A")]
+    st = SemanticTarget(entlist, ident_adapter)
+    assert len(st.se) == 1
+    assert len(st.unchecked) == 0
+    assert entlist[0] in st.se[0].fragments
+    assert entlist[1] in st.se[0].fragments
+
+    # merging based on path: both Files carry the same path and are merged on construction
+    entlist = [
+        db.File(path='101').add_parent("A"),
+        db.File(path='101').add_parent("A")]
+    st = SemanticTarget(entlist, ident_adapter)
+    assert len(st.se) == 1
+    assert len(st.unchecked) == 0
+    assert entlist[0] in st.se[0].fragments
+    assert entlist[1] in st.se[0].fragments
+
+    # merging based on identifiable: needs explicit make_identifiable / merge_with_equivalent
+    entlist = [
+        db.File(name='101').add_parent("A").add_property('a', value=1),
+        db.File(name='101').add_parent("A").add_property('a', value=1)]
+    st = SemanticTarget(entlist, ident_adapter)
+    st.make_identifiable(st.se[0])
+    st.check_remote_server(st.se[0])
+    st.set_missing(st.se[0])
+    assert len(st.unchecked) == 1
+    st.make_identifiable(st.se[1])
+    assert st.merge_with_equivalent(st.se[1])
+    assert len(st.se) == 1
+    assert len(st.unchecked) == 0
+    assert entlist[0] in st.se[0].fragments
+    assert entlist[1] in st.se[0].fragments
+
+    # Merging a mix. One Record needs the identifiable to be merged. But the identifying
+    # information is scattered in the other case.
+    entlist = [
+        db.Record(id=101).add_parent("A"),
+        db.Record(id=101, name='a').add_parent("A"),
+        db.Record(id=101).add_parent("A").add_property('a', value=1),
+        db.Record(name='a').add_parent("A").add_property('a', value=1)]
+
+    st = SemanticTarget(entlist, ident_adapter)
+    assert len(st.se) == 2
+    assert len(st.unchecked) == 1
+    st.make_identifiable(st.se[1])
+    assert st.merge_with_equivalent(st.se[1])
+    assert len(st.se) == 1
+    assert len(st.unchecked) == 0
+    for ii in range(4):
+        assert entlist[ii] in st.se[0].fragments
-- 
GitLab