From 8179f69c1c56e2ca393d2e0c0190ea33d9bd21cc Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Henrik=20tom=20W=C3=B6rden?= <h.tomwoerden@indiscale.com>
Date: Wed, 17 Apr 2024 12:14:49 +0200
Subject: [PATCH] wip

---
 src/caoscrawler/crawl.py                 |  71 ++----
 src/caoscrawler/identifiable.py          |   8 +-
 src/caoscrawler/identifiable_adapters.py |   8 +-
 src/caoscrawler/semantic_target.py       | 287 ++++++++++++++---------
 unittests/test_crawler.py                |  20 +-
 unittests/test_identifiable_adapters.py  |  43 +++-
 unittests/test_semantic_target.py        |  76 +++++-
 7 files changed, 332 insertions(+), 181 deletions(-)

diff --git a/src/caoscrawler/crawl.py b/src/caoscrawler/crawl.py
index 17171e22..23707a6c 100644
--- a/src/caoscrawler/crawl.py
+++ b/src/caoscrawler/crawl.py
@@ -366,38 +366,32 @@ class Crawler(object):
             # 1. Is it in the cache of already checked Records?
             # 2. Does it have to be new since a needed reference is missing?
             # 3. Can it be checked on the remote server?
-        for se in st.unchecked:
+        for se in list(st.unchecked):
             if st.identity_relies_on_unchecked_entity(se):
+                logger.debug("%s relies on unchecked entities; postponed", st.se.index(se))
                 continue
-            if se.identifiable is None:
-                se.identifiable = self.identifiableAdapter.get_identifiable(
-                    se, st.backward_id_referenced_by[se.uuid])
-
-            equivalent_se = st.get_equivalent(se)
-
-            # 1. Is it in the cache of already checked Records?
-            if equivalent_se is not None:
-                # We merge record into treated_record in order to prevent loss of information
-                st.merge_into(se, equivalent_se)
+            st.make_identifiable(se)
+            logger.debug("%s is now identifiable", st.se.index(se))
+            if st.merge_with_equivalent(se):
+                logger.debug("merged into an equivalent entity")
+                entity_was_treated = True
+                continue

             # 2. Does it have to be new since a needed reference is missing?
             # (Is it impossible to check this record because an identifiable references a
             #  missing record?)
-            elif st.identity_relies_on_missing_entity(se):
-                st.add_to_missing(se)
+            if st.identity_relies_on_missing_entity(se):
+                st.set_missing(se)

             # 3. check on the remote server
             else:
-                identified_record = (
-                    self.identifiableAdapter.retrieve_identified_record_for_identifiable(
-                        se.identifiable))
-                if identified_record is None:
-                    st.add_to_missing(se)
+                st.check_remote_server(se)
+                if se.id is None:
+                    st.set_missing(se)
                 else:
-                    se.identify_with(identified_record)
-                    st.add_to_existing(se)
-                entity_was_treated = True
+                    st.set_existing(se)
+                entity_was_treated = True

         # TODO
         # for record in st.entities:
         #     del record._metadata
@@ -406,7 +400,7 @@ class Crawler(object):
         # We postponed the merge for records where it failed previously and try it again now.
         # This only might add properties of the postponed records to the already used ones.
         if len(st.unchecked) > 0:
-            circle = self.detect_circular_dependency(st.entities)
+            circle = st.detect_circular_dependency()
             if circle is None:
                 logger.error("Failed, but found NO circular dependency. The data is as follows:"
                              # + str(self.compact_entity_list_representation(st.entities,
@@ -468,39 +462,6 @@ class Crawler(object):

         return text + "--------\n"

-    @staticmethod
-    def detect_circular_dependency(flat: list[db.Entity]):
-        """
-        Detects whether there are circular references in the given entity list and returns a list
-        where the entities are ordered according to the chain of references (and only the entities
-        contained in the circle are included. Returns None if no circular dependency is found.
-
-        TODO: for the sake of detecting problems for split_into_inserts_and_updates we should only
-        consider references that are identifying properties.
- """ - circle = [flat[0]] - closed = False - while not closed: - current = circle[-1] - added_to_circle = False - for p in current.properties: - if isinstance(p.value, list): - for pval in p.value: - if pval in flat: - if pval in circle: - closed = True - circle.append(pval) - added_to_circle = True - else: - if p.value in flat: - if p.value in circle: - closed = True - circle.append(p.value) - added_to_circle = True - if not added_to_circle: - return None - return circle - @staticmethod def _merge_properties_from_remote( crawled_data: list[db.Record], diff --git a/src/caoscrawler/identifiable.py b/src/caoscrawler/identifiable.py index cefdf4a0..3df5bfa7 100644 --- a/src/caoscrawler/identifiable.py +++ b/src/caoscrawler/identifiable.py @@ -20,12 +20,14 @@ # from __future__ import annotations -import linkahead as db -from datetime import datetime + import json +import logging +from datetime import datetime from hashlib import sha256 from typing import Union -import logging + +import linkahead as db logger = logging.getLogger(__name__) diff --git a/src/caoscrawler/identifiable_adapters.py b/src/caoscrawler/identifiable_adapters.py index d9e1080e..85f57449 100644 --- a/src/caoscrawler/identifiable_adapters.py +++ b/src/caoscrawler/identifiable_adapters.py @@ -264,8 +264,9 @@ startswith: bool, optional if prop.name == "name": name_options = [f.name for f in se.fragments if f.name is not None] if len(name_options) == 0: - raise RuntimeError("name") - assert all([f == name_options[0] for f in name_options]) + raise RuntimeError("name is missing!") + if not all([f == name_options[0] for f in name_options]): + raise RuntimeError("differing names in fragments") name = name_options[0] continue # problem: what happens with multi properties? @@ -290,7 +291,8 @@ startswith: bool, optional f"The following record is missing an identifying property:\n" f"RECORD\n{se.fragments[0]}\nIdentifying PROPERTY\n{prop.name}" ) - assert all([f.value == options[0].value for f in options]) + if not all([f.value == options[0].value for f in options]): + raise RuntimeError("differing prop values in fragments") record_prop = options[0] identifiable_props[record_prop.name] = record_prop.value diff --git a/src/caoscrawler/semantic_target.py b/src/caoscrawler/semantic_target.py index 3154f81f..89e65c20 100644 --- a/src/caoscrawler/semantic_target.py +++ b/src/caoscrawler/semantic_target.py @@ -29,6 +29,8 @@ from uuid import uuid4 as uuid import linkahead as db from linkahead.apiutils import (EntityMergeConflictError, compare_entities, merge_entities) +from linkahead.cached import cache_clear, cached_get_entity_by +from linkahead.exceptions import EmptyUniqueQueryError from .identifiable_adapters import IdentifiableAdapter @@ -49,6 +51,14 @@ class SemanticEntity(): self.uuid = uuid() def identify_with(self, remote_entity): + """ the given remote_entity is considered to be the target entity. 
+
+        ID and path are copied to this object's attributes and to its fragments
+        """
+
+        self.id = remote_entity.id
+        self.path = remote_entity.path
+
         for f in self.fragments:
             # side effect
             f.id = remote_entity.id
@@ -58,32 +68,59 @@ class SemanticEntity():
             f._size = remote_entity._size
             f._checksum = remote_entity._checksum

+    def include(self, source):
+        self.fragments.extend(source.fragments)
+
+        if source.id is not None:
+            if self.id is None:
+                self.id = source.id
+            else:
+                assert self.id == source.id
+
+        if source.path is not None:
+            if self.path is None:
+                self.path = source.path
+            else:
+                assert self.path == source.path
+

 class SemanticTarget():
     """ models the target structure of Entities as it shall be created by the Crawler
+
+    This model should only be manipulated via the following functions:
+    - make_identifiable: adds an identifiable to a SemanticEntity, which possibly allows merging
+      it with another SemanticEntity
+    - merge_with_equivalent: checks whether there is an equivalent SemanticEntity and merges into
+      that one if it is found
+    - check_remote_server: uses the identifiable to check the remote server and adds ID and path
+      if an object was found
+    - set_existing: declares that a SemanticEntity exists on the remote server
+    - set_missing: declares that a SemanticEntity does NOT exist on the remote server
     """

     def __init__(self, entities: list[db.Entity], identifiableAdapter):
         self.identifiableAdapter = identifiableAdapter
-        self.entities = self._create_flat_list(entities)
         self._id_look_up: dict[int, SemanticEntity] = {}
         self._path_look_up: dict[str, SemanticEntity] = {}
         self._identifiable_look_up: dict[str, SemanticEntity] = {}
         self._missing: dict[int, SemanticEntity] = {}
         self._existing: dict[int, SemanticEntity] = {}
+        self._remote_missing_counter = -1  # TODO: I guess we can now get rid of this...

         # create initial set of SemanticEntities from provided Entity list
         self.se: list[SemanticEntity] = []  # list of all SemanticEntities
-        self.se_lookup: dict[str, SemanticEntity] = {}  # get a SemanticEntity by its UUID
-        for el in self.entities:
+        # TODO do we only need this for creating the initial reference map? Remove it?
+ self.se_lookup: dict[str, SemanticEntity] = {} # lookup: UUID -> SemanticEntity + entities = self._create_flat_list(entities) + self._sanity_check(entities) + for el in entities: self.se.append(SemanticEntity( el, self.identifiableAdapter.get_registered_identifiable(el))) self.se_lookup[id(el)] = self.se[-1] self.unchecked = list(self.se) # list all SemanticEntities that have not yet been checked - self._remote_missing_counter = -1 - self.sanity_check(self.entities) + + # initialize reference mappings ( self.forward_references, self.backward_references, @@ -95,12 +132,61 @@ class SemanticTarget(): self._mark_entities_with_path_or_id() - def get_equivalent(self, entity: SemanticEntity = None) -> Optional[SemanticEntity]: + def make_identifiable(self, se: SemanticEntity): + """ creates an identifiable for the given SemanticEntity and possibly merges it into an + equivalent SemanticEntity + """ + if se.identifiable is not None: + raise RuntimeError("Already has identifiable") + se.identifiable = self.identifiableAdapter.get_identifiable( + se, self.backward_id_referenced_by[se.uuid]) + + def merge_with_equivalent(self, se: SemanticEntity): + equivalent_se = self.get_checked_equivalent(se) + if equivalent_se is None: + return False + else: + self._merge_into(se, equivalent_se) + return True + + def check_remote_server(self, se: SemanticEntity): + identified_record = ( + self.identifiableAdapter.retrieve_identified_record_for_identifiable( + se.identifiable)) + if identified_record is not None: + se.identify_with(identified_record) + + def set_missing(self, se: SemanticEntity): + """ add the given SemanticEntity to the list of missing entities + + This removes the SemanticEntity from the unchecked list and implies that the entity does + NOT exist on the remote server. + """ + assert se.id is None + if se.path is None and se.identifiable is None: + raise RuntimeError("no identifying information") + se.id = self._remote_missing_counter + self._remote_missing_counter -= 1 + self._add_any(se, self._missing) + self.unchecked.remove(se) + + def set_existing(self, se: SemanticEntity): + """ add the given SemanticEntity to the list of existing entities + + This removes the SemanticEntity from the unchecked list and implies that the entity exists + on the remote server. + """ + assert se.id is not None + self._add_any(se, self._existing) + self.unchecked.remove(se) + + def get_checked_equivalent(self, entity: SemanticEntity) -> Optional[SemanticEntity]: """ Return an equivalent SemanticEntity from the list of missing or existing entities. Equivalent means that ID, path or identifiable are the same. """ + # TODO shall we also provide a variant that returns equivalent objects that are unchecked? 
if entity.id is not None and entity.id in self._id_look_up: return self._id_look_up[entity.id] if entity.path is not None and entity.path in self._path_look_up: @@ -149,93 +235,6 @@ class SemanticTarget(): return (missing, [el.fragments[0] for el in self._existing.values()]) - def add_to_missing(self, se: SemanticEntity): - assert se.id is None - if se.path is None and se.identifiable is None: - raise RuntimeError("no identifying information") - se.id = self._remote_missing_counter - self._remote_missing_counter -= 1 - self._add_any(se, self._missing) - self.unchecked.remove(se) - - def add_to_existing(self, se: SemanticEntity): - """ add a SemanticEntity to the lookup of treated entities and remove id from the unchecked - list - Add a Record that was treated, such that it is contained in the internal look up dicts - - This Record MUST have an ID if it was found in the remote server. -""" - self._add_any(se, self._existing) - self.unchecked.remove(se) - - def merge_into(self, source: SemanticEntity, target: SemanticEntity): - """ tries to merge record into newrecord - - If it fails, record is added to the try_to_merge_later list. - In any case, references are bent to the newrecord object. - - """ - for frag in source.fragments: - try: - merge_entities( - target.fragments[0], frag, merge_references_with_empty_diffs=False, - merge_id_with_resolved_entity=True) - except EntityMergeConflictError: - self._treat_merge_error_of(target.fragments[0], frag) - # We cannot merge but it is none of the clear case where merge is - # impossible. Thus we try later - target.fragments.append(frag) - if target.fragments[0].id is not None: - frag.id = target.fragments[0].id - except NotImplementedError: - print(target) - print(source) - raise - if source.id is not None: - if target.id is None: - target.id = source.id - else: - assert target.id == source.id - - if source.path is not None: - if target.path is None: - target.path = source.path - else: - assert target.path == source.path - - # update reference mappings - for se in self.forward_references.pop(source.uuid): - self.forward_references[target.uuid].add(se) - self.backward_references[se.uuid].remove(source) - self.backward_references[se.uuid].add(target) - for se in self.backward_references.pop(source.uuid): - self.backward_references[target.uuid].add(se) - self.forward_references[se.uuid].remove(source) - self.forward_references[se.uuid].add(target) - - for se in self.forward_id_references.pop(source.uuid): - self.forward_id_references[target.uuid].add(se) - self.backward_id_references[se.uuid].remove(source) - self.backward_id_references[se.uuid].add(target) - for se in self.backward_id_references.pop(source.uuid): - self.backward_id_references[target.uuid].add(se) - self.forward_id_references[se.uuid].remove(source) - self.forward_id_references[se.uuid].add(target) - - for se in self.forward_id_referenced_by.pop(source.uuid): - self.forward_id_referenced_by[target.uuid].add(se) - self.backward_id_referenced_by[se.uuid].remove(source) - self.backward_id_referenced_by[se.uuid].add(target) - for se in self.backward_id_referenced_by.pop(source.uuid): - self.backward_id_referenced_by[target.uuid].add(se) - self.forward_id_referenced_by[se.uuid].remove(source) - self.forward_id_referenced_by[se.uuid].add(target) - - # remove empyt SemanticEntity - self.se.remove(source) - if source in self.unchecked: - self.unchecked.remove(source) - def identity_relies_on_unchecked_entity(self, se: SemanticEntity): """ If a record for which it could not yet be verified 
        whether it exists in LA or not is part
@@ -257,8 +256,8 @@
         return any([id(ent) in self._missing for ent in self.forward_id_references[se.uuid]]
                    + [id(ent) in self._missing for ent in self.backward_id_referenced_by[se.uuid]])

-    @staticmethod
-    def sanity_check(entities: list[db.Entity]):
+    @ staticmethod
+    def _sanity_check(entities: list[db.Entity]):
         for ent in entities:
             if ent.role == "Record" and len(ent.parents) == 0:
                 raise RuntimeError(f"Records must have a parent.\n{ent}")
@@ -271,7 +270,7 @@
                 merge_entities(se.fragments[0], ent, merge_id_with_resolved_entity=True)
             se.fragments = [se.fragments[0]]

-    @staticmethod
+    @ staticmethod
     def _create_flat_list(ent_list: list[db.Entity], flat: Optional[list[db.Entity]] = None):
         """
         Recursively adds entities and all their properties contained in ent_list to
@@ -301,7 +300,7 @@
                     SemanticTarget._create_flat_list([p.value], flat)
         return flat

-    @staticmethod
+    @ staticmethod
     def _treat_merge_error_of(newrecord, record):
         """
         The parameters are two entities that cannot be merged with the merge_entities function.
@@ -348,7 +347,7 @@
                              f"{record}\n{newrecord}")
             raise RuntimeError("Cannot merge Entities")

-    @staticmethod
+    @ staticmethod
     def _create_reference_mapping(flat: list[SemanticEntity], se_lookup):
         """
         TODO update docstring
@@ -407,7 +406,7 @@
     def _mark_entities_with_path_or_id(self):
         """ A path or an ID is sufficiently identifying. Thus, those entities can be marked as
         checked """
-        for semantic_entity in self.se:
+        for semantic_entity in list(self.se[::-1]):
             assert len(semantic_entity.fragments) == 1
             entity = semantic_entity.fragments[0]
             if entity.id is None and entity.path is None:
@@ -420,17 +419,41 @@
             if existing is not None:
                 semantic_entity.identify_with(existing)

-            treated_before = self.get_equivalent(semantic_entity)
+            treated_before = self.get_checked_equivalent(semantic_entity)
             if treated_before is None:
                 if semantic_entity.id is None:
-                    self.add_to_missing(semantic_entity)
+                    self.set_missing(semantic_entity)
                 else:
-                    self.add_to_existing(semantic_entity)
+                    self.set_existing(semantic_entity)
             else:
-                self.merge_into(semantic_entity, self.se_lookup[id(treated_before)])
+                self._merge_into(semantic_entity, treated_before)
+
+    def detect_circular_dependency(self):
+        """
+        Detects whether there are circular references in the given entity list and returns a list
+        where the entities are ordered according to the chain of references (and only the entities
+        contained in the circle are included). Returns None if no circular dependency is found.
+
+        TODO: for the sake of detecting problems for split_into_inserts_and_updates we should only
+        consider references that are identifying properties.
+        """
+        circle = [self.unchecked[0]]
+        closed = False
+        while not closed:
+            added_to_circle = False
+            for referenced in self.forward_references[circle[-1].uuid]:
+                if referenced in self.unchecked:
+                    if referenced in circle:
+                        closed = True
+                    circle.append(referenced)
+                    added_to_circle = True
+            if not added_to_circle:
+                return None
+        return circle

-    @staticmethod
-    def bend_references_to_new_object(old, new, entities):
+    @ staticmethod
+    def _bend_references_to_new_object(old, new, entities):
+        # TODO still needed???
""" Bend references to the other object Iterate over all entities in `entities` and check the values of all properties of occurances of old Entity and replace them with new Entity @@ -453,3 +476,57 @@ class SemanticTarget(): if entity.identifiable is not None: self._identifiable_look_up[entity.identifiable.get_representation()] = entity lookup[id(entity)] = entity + + def _merge_into(self, source: SemanticEntity, target: SemanticEntity): + """ tries to merge record into newrecord + + If it fails, record is added to the try_to_merge_later list. + In any case, references are bent to the newrecord object. + + """ + for f in source.fragments: + self.se_lookup[id(f)] = target + target.include(source) + if target.identifiable is None and not self.identity_relies_on_unchecked_entity(target): + try: + self.make_identifiable(target) + if target not in self.unchecked: + self._identifiable_look_up[target.identifiable.get_representation()] = target + except Exception as es: + print(es) + pass + + # update reference mappings + for se in self.forward_references.pop(source.uuid): + self.forward_references[target.uuid].add(se) + self.backward_references[se.uuid].remove(source) + self.backward_references[se.uuid].add(target) + for se in self.backward_references.pop(source.uuid): + self.backward_references[target.uuid].add(se) + self.forward_references[se.uuid].remove(source) + self.forward_references[se.uuid].add(target) + + for se in self.forward_id_references.pop(source.uuid): + self.forward_id_references[target.uuid].add(se) + self.backward_id_references[se.uuid].remove(source) + self.backward_id_references[se.uuid].add(target) + for se in self.backward_id_references.pop(source.uuid): + self.backward_id_references[target.uuid].add(se) + self.forward_id_references[se.uuid].remove(source) + self.forward_id_references[se.uuid].add(target) + + for se in self.forward_id_referenced_by.pop(source.uuid): + self.forward_id_referenced_by[target.uuid].add(se) + self.backward_id_referenced_by[se.uuid].remove(source) + self.backward_id_referenced_by[se.uuid].add(target) + for se in self.backward_id_referenced_by.pop(source.uuid): + self.backward_id_referenced_by[target.uuid].add(se) + self.forward_id_referenced_by[se.uuid].remove(source) + self.forward_id_referenced_by[se.uuid].add(target) + + # remove unneeded SemanticEntity + self.se.remove(source) + if source in self.unchecked: + self.unchecked.remove(source) + assert id(source) not in self._missing + assert id(source) not in self._existing diff --git a/unittests/test_crawler.py b/unittests/test_crawler.py index 2399c2fe..b18f6062 100644 --- a/unittests/test_crawler.py +++ b/unittests/test_crawler.py @@ -301,8 +301,8 @@ def test_split_into_inserts_and_updates_single(crawler_mocked_identifiable_retri db.Record(name="B").add_parent("C")] st = SemanticTarget(entlist, crawler.identifiableAdapter) - assert st.get_equivalent(st.se[0]) is None - assert st.get_equivalent(st.se[0]) is None + assert st.get_checked_equivalent(st.se[0]) is None + assert st.get_checked_equivalent(st.se[0]) is None assert not st.identity_relies_on_unchecked_entity(st.se[0]) assert not st.identity_relies_on_unchecked_entity(st.se[1]) assert crawler.identifiableAdapter.retrieve_identified_record_for_record( @@ -487,7 +487,7 @@ a: ([b1, b2]) assert st.identity_relies_on_unchecked_entity(st.se[3]) assert st.identity_relies_on_unchecked_entity(st.se[4]) st.se[0].identifiable = Identifiable(path='a') # dummy identifiable - st.add_to_missing(st.se[0]) + st.set_missing(st.se[0]) assert 
st.identity_relies_on_unchecked_entity(st.se[1]) is False with raises(db.apiutils.EntityMergeConflictError) as rte: @@ -708,8 +708,8 @@ def test_split_into_inserts_and_updates_backref(crawler_mocked_for_backref_test) # identifiables were not yet checked st = SemanticTarget(entlist, crawler.identifiableAdapter) - assert st.get_equivalent(st.se[1]) is None - assert st.get_equivalent(st.se[0]) is None + assert st.get_checked_equivalent(st.se[1]) is None + assert st.get_checked_equivalent(st.se[0]) is None # one can be found remotely, one not assert crawler.identifiableAdapter.retrieve_identified_record_for_record( identlist[0]).id == 1111 @@ -1020,7 +1020,7 @@ def test_treated_record_lookup(): miss = trlu.se[1] fi = trlu.se[2] exist.id = 1 - trlu.add_to_existing(exist) + trlu.set_existing(exist) assert len(trlu._existing) == 1 # was added to existing assert trlu._existing[id(exist)] is exist @@ -1029,9 +1029,9 @@ def test_treated_record_lookup(): # exception when identifiable is missing with raises(RuntimeError): - trlu.add_to_missing(miss) + trlu.set_missing(miss) miss.identifiable = Identifiable(name='a') - trlu.add_to_missing(miss) + trlu.set_missing(miss) # was added to missing assert trlu._missing[id(miss)] is miss # is in ident lookup @@ -1039,7 +1039,7 @@ def test_treated_record_lookup(): fi.path = 'a' fi.id = 2 - trlu.add_to_existing(fi) + trlu.set_existing(fi) assert len(trlu._existing) == 2 # was added to existing assert trlu._existing[id(fi)] is fi @@ -1056,7 +1056,7 @@ def test_treated_record_lookup(): # If a Record was added using the ID, the ID must be used to identify it even though later an # identifiable may be passed as well exist.identifiable = Identifiable(name='b') - assert trlu.get_equivalent(exist) is exist + assert trlu.get_checked_equivalent(exist) is exist def test_merge_entity_with_identifying_reference(crawler_mocked_identifiable_retrieve): diff --git a/unittests/test_identifiable_adapters.py b/unittests/test_identifiable_adapters.py index ee5374a9..dd6f40af 100644 --- a/unittests/test_identifiable_adapters.py +++ b/unittests/test_identifiable_adapters.py @@ -156,9 +156,6 @@ def test_convert_value(): def test_get_identifiable(): - # TODO modify this such that it becomes a test that acutally tests (sufficiently) the - # get_identifable function - ident = CaosDBIdentifiableAdapter() ident.load_from_yaml_definition(UNITTESTDIR / "example_identifiables.yml") rec = (db.Record(id=5) @@ -175,6 +172,46 @@ def test_get_identifiable(): assert len(r_cur.properties) == 2 assert len(id_r0.properties) == 1 + ident = CaosDBIdentifiableAdapter() + ident_a = db.RecordType(name="A").add_parent("A").add_property("name").add_property("a") + ident.register_identifiable("A", ident_a) + rec = (db.Record(id=5) + .add_parent(name="A", id=3) + .add_property(name="a", value="2022-02-01") + .add_property(name="result", value="FAIL")) + se = SemanticEntity(rec, ident.get_registered_identifiable(rec)) + se.fragments.extend([ + db.Record() + .add_parent(name="A", id=3) + .add_property(name="a", value="2022-02-01") + .add_property(name="result", value="FAIL"), + db.Record(name='a') + .add_parent(name="A", id=3) + .add_property(name="a", value="2022-02-01") + .add_property(name="result", value="FAIL"), + ]) + + id_r0 = ident.get_identifiable(se, []) + r_cur = se.fragments[0] + assert r_cur.parents[0].name == id_r0.record_type + assert r_cur.get_property("a").value == id_r0.properties["a"] + assert 'a' == id_r0.name + assert len(id_r0.properties) == 1 + + rec = (db.Record(name='a') + 
.add_parent(name="A") + .add_property(name="a", value="2") + ) + se = SemanticEntity(rec, ident.get_registered_identifiable(rec)) + se.fragments.extend([ + db.Record(name='a') + .add_parent(name="A") + .add_property(name="a", value="3") + ]) + + with pytest.raises(RuntimeError): + id_r0 = ident.get_identifiable(se, []) + @ pytest.mark.xfail def test_retrieve_identified_record_for_identifiable(): diff --git a/unittests/test_semantic_target.py b/unittests/test_semantic_target.py index 7c5b8a81..dc6bf0d7 100644 --- a/unittests/test_semantic_target.py +++ b/unittests/test_semantic_target.py @@ -18,10 +18,15 @@ # You should have received a copy of the GNU Affero General Public License # along with this program. If not, see <https://www.gnu.org/licenses/>. # +from functools import partial +from unittest.mock import MagicMock, Mock, patch + import linkahead as db from caoscrawler.identifiable_adapters import CaosDBIdentifiableAdapter from caoscrawler.semantic_target import SemanticEntity, SemanticTarget +from test_crawler import basic_retrieve_by_name_mock_up, mock_get_entity_by + def test_create_flat_list(): a = db.Record() @@ -147,7 +152,7 @@ def test_merge_into(): assert len(st.backward_id_referenced_by[se_c.uuid]) == 1 se_a in st.backward_id_referenced_by[se_c.uuid] - st.merge_into(se_a, se_b) + st._merge_into(se_a, se_b) # CHECK REFERENCE MAP (after merge): # c is now referenced by b @@ -223,7 +228,7 @@ def test_merge_into(): se_a in st.backward_id_referenced_by[se_c.uuid] se_b in st.backward_id_referenced_by[se_c.uuid] - st.merge_into(se_a, se_b) + st._merge_into(se_a, se_b) # CHECK REFERENCE MAP (after merge): # c is now referenced by b @@ -270,3 +275,70 @@ def test_backward_id_referenced_by(): st = SemanticTarget(entlist, ident_adapter) assert st.se[1] in st.backward_id_referenced_by[st.se[0].uuid] + + +@patch("caoscrawler.semantic_target.cached_get_entity_by", + new=Mock(side_effect=mock_get_entity_by)) +def test_merging(): + # identifying information can be given at various locations in the hierachical tree + # test whether an object is correctly combined for all cases + ident_adapter = CaosDBIdentifiableAdapter() + ident_a = db.RecordType().add_parent("A").add_property("name").add_property("a") + ident_adapter.register_identifiable("A", ident_a) + ident_adapter.retrieve_identified_record_for_identifiable = Mock( + side_effect=partial( + basic_retrieve_by_name_mock_up, known={"A": db.Record(id=1111, name="A")})) + + # merging based on id + entlist = [ + db.Record(id=101).add_parent("A"), + db.Record(id=101).add_parent("A")] + st = SemanticTarget(entlist, ident_adapter) + assert len(st.se) == 1 + assert len(st.unchecked) == 0 + assert entlist[0] in st.se[0].fragments + assert entlist[1] in st.se[0].fragments + + # merging based on path + entlist = [ + db.File(path='101').add_parent("A"), + db.File(path='101').add_parent("A")] + st = SemanticTarget(entlist, ident_adapter) + assert len(st.se) == 1 + assert len(st.unchecked) == 0 + assert entlist[0] in st.se[0].fragments + assert entlist[1] in st.se[0].fragments + + # merging based on identifiable + entlist = [ + db.File(name='101').add_parent("A").add_property('a', value=1), + db.File(name='101').add_parent("A").add_property('a', value=1)] + st = SemanticTarget(entlist, ident_adapter) + st.make_identifiable(st.se[0]) + st.check_remote_server(st.se[0]) + st.set_missing(st.se[0]) + assert len(st.unchecked) == 1 + st.make_identifiable(st.se[1]) + assert st.merge_with_equivalent(st.se[1]) + assert len(st.se) == 1 + assert len(st.unchecked) == 
0 + assert entlist[0] in st.se[0].fragments + assert entlist[1] in st.se[0].fragments + + # Merging a mix. One Record needs the identifiable to be merged. But the identifying + # information is scattered in the other case. + entlist = [ + db.Record(id=101).add_parent("A"), + db.Record(id=101, name='a').add_parent("A"), + db.Record(id=101).add_parent("A").add_property('a', value=1), + db.Record(name='a').add_parent("A").add_property('a', value=1)] + + st = SemanticTarget(entlist, ident_adapter) + assert len(st.se) == 2 + assert len(st.unchecked) == 1 + st.make_identifiable(st.se[1]) + assert st.merge_with_equivalent(st.se[1]) + assert len(st.se) == 1 + assert len(st.unchecked) == 0 + for ii in range(4): + assert entlist[ii] in st.se[0].fragments -- GitLab
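
Note for reviewers (not part of the patch): the sketch below illustrates how one pass of the refactored check loop in crawl.py is meant to drive the new SemanticTarget API. The identifiable definition and the record list are made up for the example, and a configured LinkAhead connection is assumed for check_remote_server() to actually query anything.

import linkahead as db
from caoscrawler.identifiable_adapters import CaosDBIdentifiableAdapter
from caoscrawler.semantic_target import SemanticTarget

# hypothetical setup: one registered identifiable and one crawled record
ident = CaosDBIdentifiableAdapter()
ident.register_identifiable("A", db.RecordType().add_parent("A").add_property("a"))
records = [db.Record().add_parent("A").add_property("a", value=1)]

st = SemanticTarget(records, ident)
for se in list(st.unchecked):          # one pass of the check loop from crawl.py
    if st.identity_relies_on_unchecked_entity(se):
        continue                       # postponed until its references are checked
    st.make_identifiable(se)
    if st.merge_with_equivalent(se):
        continue                       # an equivalent entity was already treated
    if st.identity_relies_on_missing_entity(se):
        st.set_missing(se)             # a needed reference is missing -> cannot exist remotely
    else:
        st.check_remote_server(se)     # sets id/path if an identified record is found
        if se.id is None:
            st.set_missing(se)
        else:
            st.set_existing(se)

crawl.py repeats this pass until no entity is treated anymore and then uses detect_circular_dependency() to report why the remaining entities cannot be checked.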