diff --git a/src/caoscrawler/crawl.py b/src/caoscrawler/crawl.py index 546f5c2e2ed09d507738903ba7b2b7805f6cb12a..ed8493fd2429ed42a4277c0bfca7644e93b08da6 100644 --- a/src/caoscrawler/crawl.py +++ b/src/caoscrawler/crawl.py @@ -69,9 +69,9 @@ from .logging import configure_server_side_logging from .macros import defmacro_constructor, macro_constructor from .scanner import (create_converter_registry, initialize_converters, load_definition, scan_directory, scan_structure_elements) -from .sync_graph import SyncGraph from .stores import GeneralStore from .structure_elements import StructureElement +from .sync_graph import SyncGraph logger = logging.getLogger(__name__) @@ -368,33 +368,37 @@ class Crawler(object): # 3. Can it be checked on the remote server? for se in list(st.unchecked): if st.identity_relies_on_unchecked_entity(se): - print(st.se.index(se), "relies on unchecked") - continue - - st.make_identifiable(se) - print(st.se.index(se), "is now identifiable") - if st.merge_with_equivalent(se): - print('see above', "was merged") - entity_was_treated = True + print(st.nodes.index(se), "relies on unchecked") continue - # 2. Does it have to be new since a needed reference is missing? - # (Is it impossible to check this record because an identifiable references a - # missing record?) - if st.identity_relies_on_missing_entity(se): - st.set_missing(se) - - # 3. 
check on the remote server - else: - st.check_remote_server(se) - print("checked", se.id) - if se.id is None: - st.set_missing(se) - print("missing") - else: - st.set_existing(se) - print("exisitng") - entity_was_treated = True + if se.identifiable is None: + st.set_identifiable_of_node(se, st.identifiableAdapter.get_identifiable( + se, st.backward_id_referenced_by[se.uuid])) + # entity was merged with another due to the new identifiable + if se not in st.unchecked: + continue + + # if (equivalent_se.identifiable is None and not + # self.identity_relies_on_unchecked_entity(equivalent_se)): + # try: + # equivalent_se.identifiable = self.identifiableAdapter.get_identifiable( + # equivalent_se, self.backward_id_referenced_by[equivalent_se.uuid]) + # if equivalent_se not in self.unchecked: + # self._identifiable_look_up[ + # equivalent_se.identifiable.get_representation() + # ] = equivalent_se + # except Exception as es: + # print(es) + # pass + + identified_record = ( + st.identifiableAdapter.retrieve_identified_record_for_identifiable( + se.identifiable)) + remote_id = None + if identified_record is not None: + remote_id = identified_record.id + st.set_id_of_node(se, remote_id) + entity_was_treated = True # TODO # for record in st.entities: @@ -403,7 +407,7 @@ class Crawler(object): # We postponed the merge for records where it failed previously and try it again now. # This only might add properties of the postponed records to the already used ones. if len(st.unchecked) > 0: - circle = st.detect_circular_dependency() + circle = st.unchecked_contains_circular_dependency() if circle is None: logger.error("Failed, but found NO circular dependency. 
The data is as follows:" # + str(self.compact_entity_list_representation(st.entities, diff --git a/src/caoscrawler/identifiable_adapters.py b/src/caoscrawler/identifiable_adapters.py index 903327424e8cbf0c93ec9c6fc1bbba2fb6612f48..cfe27c3ffe82173af0283dcde7d4139dbb4571f5 100644 --- a/src/caoscrawler/identifiable_adapters.py +++ b/src/caoscrawler/identifiable_adapters.py @@ -233,7 +233,7 @@ startswith: bool, optional refs.append(val) return refs - def get_identifiable(self, se: SemanticEntity, identifiable_backrefs): + def get_identifiable(self, se: SyncNode, identifiable_backrefs): """ Retrieve the registered identifiable and fill the property values to create an identifiable. diff --git a/src/caoscrawler/sync_graph.py b/src/caoscrawler/sync_graph.py index 5993ed09505e7d434a66164e5895b50f44519ace..3bf704c37455bb8a711c1bd45b7a82f67c65fb35 100644 --- a/src/caoscrawler/sync_graph.py +++ b/src/caoscrawler/sync_graph.py @@ -23,6 +23,8 @@ A data model class for the semantic data that shall be created by synchronization of the crawler. """ +from __future__ import annotations + from typing import Any, Dict, List, Optional, Union from uuid import uuid4 as uuid @@ -35,53 +37,75 @@ from linkahead.exceptions import EmptyUniqueQueryError from .identifiable_adapters import IdentifiableAdapter -class SemanticEntity(): +class SyncNode(): """ represents the information related to an Entity as it shall be created in LinkAhead - Parents and Properties are given by fragments: db.Record objects that may have been created by - the scanner. + The following information is taken from db.Entity object during initialization or when the + object is updated using `update(entity)`: + - id + - role + - parents + - path + - name + - description + - properties + + Typically, this class is used in the following way: + 1. A SyncNode is initialized with a db.Entity object + 2. The SyncNode object is possibly updated one or more times with further db.Entity objects + 3. 
A db.Entity object is created (`export_entity`) that contains the combined information of + the previous db.Entity objects. """ - def __init__(self, entity: db.Entity, registered_identifiable): - self.fragments = [entity] + def __init__(self, entity: db.Entity, registered_identifiable: Optional[db.RecordType] = + None) -> None: self.id = entity.id + self.role = entity.role + self.parents = entity.parents self.path = entity.path + self.name = entity.name + self.description = entity.description + self.properties = list(entity.properties) + self.uuid = uuid() self.identifiable = None self.registered_identifiable = registered_identifiable - self.uuid = uuid() - - def identify_with(self, remote_entity): - """ the given remote_entity is considered to be the target entity. - - ID and path are copied to this objects attributes and to its fragments - """ - - self.id = remote_entity.id - self.path = remote_entity.path - - for f in self.fragments: - # side effect - f.id = remote_entity.id - f.path = remote_entity.path - # TODO check the following copying of _size and _checksum - # Copy over checksum and size too if it is a file - f._size = remote_entity._size - f._checksum = remote_entity._checksum - def include(self, source): - self.fragments.extend(source.fragments) - - if source.id is not None: - if self.id is None: - self.id = source.id - else: - assert self.id == source.id - - if source.path is not None: - if self.path is None: - self.path = source.path + def update(self, other: SyncNode) -> None: + if other.identifiable is not None and self.identifiable is not None: + assert (other.identifiable.get_representation() == + self.identifiable.get_representation()) + for attr in ["id", "path", "role", "path", "name", "description"]: + if other.__getattribute__(attr) is not None: + if self.__getattribute__(attr) is None: + self.__setattr__(attr, other.__getattribute__(attr)) + else: + assert self.__getattribute__(attr) == other.__getattribute__(attr) + for p in other.parents: 
+ if p not in self.parents: + self.parents.append(p) + for p in other.properties: + if p not in self.properties: + self.properties.append(p) + + def export_entity(self) -> db.Entity: + ent = None + if self.role == "Record": + ent = db.Record() + elif self.role == "File": + ent = db.File() + else: + raise RuntimeError("Invalid role") + for attr in ["id", "path", "role", "path", "name", "description"]: + ent.__setattr__(attr, self.__getattribute__(attr)) + for p in self.parents: + ent.add_parent(p) + for p in self.properties: + if ent.get_property(p) is not None: + if ent.get_property(p).value != p.value: + raise Exception() else: - assert self.path == source.path + ent.add_property(p) + return ent class SyncGraph(): @@ -93,41 +117,28 @@ class SyncGraph(): other db.Entity object. These references are scanned initially and used to create multiple reference maps that track how SemanticEntities reference each other. These maps are kept up to date when SemanticEntities are merged because they are identified with each other. When - creating the final list of db.Entity objects (``create_record_lists``) the Python references + creating the final list of db.Entity objects (``export_record_lists``) the Python references are updated according to the reference map. 
This model should only be manipulated via three functions: - - make_identifiable: adds an identifiable to a SemanticEntity - - merge_with_equivalent: check whether there is an equivalent SemanticEntity and merge into - that if one is found - - check_remote_server: uses the identifiable to check the remote server and add ID and path if - an object was found - - set_existing: declares that a SemanticEntity is existing on the remote server - - set_missing: declares that a SemanticEntity is NOT existing on the remote server + - set_existing: declares that a SyncNode is existing on the remote server + - set_missing: declares that a SyncNode is NOT existing on the remote server """ def __init__(self, entities: List[db.Entity], identifiableAdapter): self.identifiableAdapter = identifiableAdapter - self._id_look_up: Dict[int, SemanticEntity] = {} - self._path_look_up: Dict[str, SemanticEntity] = {} - self._identifiable_look_up: Dict[str, SemanticEntity] = {} - self._missing: Dict[int, SemanticEntity] = {} - self._existing: Dict[int, SemanticEntity] = {} + self._id_look_up: Dict[int, SyncNode] = {} + self._path_look_up: Dict[str, SyncNode] = {} + self._identifiable_look_up: Dict[str, SyncNode] = {} + self._missing: Dict[int, SyncNode] = {} + self._existing: Dict[int, SyncNode] = {} # entities that are missing get negative IDs to allow identifiable creation self._remote_missing_counter = -1 - # create initial set of SemanticEntities from provided Entity list - self.se: List[SemanticEntity] = [] # list of all SemanticEntities - # TODO do we only need this for creating the initial reference map? Remove it? 
- self.se_lookup: Dict[str, SemanticEntity] = {} # lookup: UUID -> SemanticEntity - entities = self._create_flat_list(entities) - self._sanity_check(entities) - for el in entities: - self.se.append(SemanticEntity( - el, - self.identifiableAdapter.get_registered_identifiable(el))) - self.se_lookup[id(el)] = self.se[-1] - self.unchecked = list(self.se) # list all SemanticEntities that have not yet been checked + self.nodes: List[SyncNode] = [] + self._initialize_nodes(entities) # list of all SemanticEntities + # list all SemanticEntities that have not yet been checked + self.unchecked = list(self.nodes) # initialize reference mappings ( @@ -137,67 +148,95 @@ class SyncGraph(): self.backward_id_references, self.forward_id_referenced_by, self.backward_id_referenced_by, - ) = self._create_reference_mapping(self.se, self.se_lookup) - - self._mark_entities_with_path_or_id() + ) = self._create_reference_mapping(self.nodes) + + # self._mark_entities_with_path_or_id() + + def set_id_of_node(self, se: SyncNode, node_id: Optional[str]): + """sets the ID attribute of the given SyncNode. If node_id is None, a negative Id will be + given indicating that the entity does not exist on the remote server""" + if se.id is not None: + raise RuntimeError('cannot update id') + if node_id is None: + if se.path is None and se.identifiable is None: + raise RuntimeError("no identifying information") + se.id = self._remote_missing_counter + self._remote_missing_counter -= 1 + self._add_any(se, self._missing) + self.unchecked.remove(se) + + for other_missing in (self.backward_id_references[se.uuid] + + self.forward_id_referenced_by[se.uuid]): + self.set_id_of_node(other_missing) - def make_identifiable(self, se: SemanticEntity): - """Create an identifiable for the given SemanticEntity and store it there. 
- """ - if se.identifiable is not None: - raise RuntimeError("Already has identifiable") - se.identifiable = self.identifiableAdapter.get_identifiable( - se, self.backward_id_referenced_by[se.uuid]) - - def merge_with_equivalent(self, se: SemanticEntity) -> bool: - equivalent_se = self.get_checked_equivalent(se) - if equivalent_se is None: - return False else: + assert node_id > 0 + se.id = node_id + self._add_any(se, self._existing) + self.unchecked.remove(se) + + def set_identifiable_of_node(self, se: SyncNode, identifiable: Identifiable): + se.identifiable = identifiable + equivalent_se = self.get_equivalent(se) + if equivalent_se is not None: self._merge_into(se, equivalent_se) - return True + assert equivalent_se.identifiable is not None - def check_remote_server(self, se: SemanticEntity) -> None: - identified_record = ( - self.identifiableAdapter.retrieve_identified_record_for_identifiable( - se.identifiable)) - if identified_record is not None: - se.identify_with(identified_record) + def export_record_lists(self): + self._update_reference_values() - def set_missing(self, se: SemanticEntity) -> None: - """ add the given SemanticEntity to the list of missing entities + # TODO + missing = [el.fragments[0] for el in self._missing.values()] + # remove negative IDs + for el in missing: + if el.id is None: + raise RuntimeError("This should not happen") # TODO remove + if el.id >= 0: + raise RuntimeError("This should not happen") # TODO remove + el.id = None - This removes the SemanticEntity from the unchecked list and implies that the entity does - NOT exist on the remote server. 
+ return (missing, [el.fragments[0] for el in self._existing.values()]) + + def identity_relies_on_unchecked_entity(self, se: SyncNode): """ - assert se.id is None - if se.path is None and se.identifiable is None: - raise RuntimeError("no identifying information") - se.id = self._remote_missing_counter - for f in se.fragments: - f.id = self._remote_missing_counter - self._remote_missing_counter -= 1 - self._add_any(se, self._missing) - self.unchecked.remove(se) - - def set_existing(self, se: SemanticEntity): - """ add the given SemanticEntity to the list of existing entities - - This removes the SemanticEntity from the unchecked list and implies that the entity exists - on the remote server. + If a record for which it could not yet be verified whether it exists in LA or not is part + of the identifying properties, this returns True, otherwise False """ - assert se.id is not None - assert se.id > 0 - self._add_any(se, self._existing) - self.unchecked.remove(se) - def get_checked_equivalent(self, entity: SemanticEntity) -> Optional[SemanticEntity]: + return any([id(ent) not in self._missing and id(ent) not in self._existing + for ent in self.forward_id_references[se.uuid]] + + [id(ent) not in self._missing and id(ent) not in self._existing + for ent in self.backward_id_referenced_by[se.uuid]]) + + def unchecked_contains_circular_dependency(self): """ - Return an equivalent SemanticEntity from the list of missing or existing entities. + Detects whether there are circular references in the given entity list and returns a list + where the entities are ordered according to the chain of references (and only the entities + contained in the circle are included. Returns None if no circular dependency is found. + + TODO: for the sake of detecting problems for split_into_inserts_and_updates we should only + consider references that are identifying properties. 
+ """ + circle = [self.unchecked[0]] + closed = False + while not closed: + added_to_circle = False + for referenced in self.forward_references[circle[-1].uuid]: + if referenced in self.unchecked: + if referenced in circle: + closed = True + circle.append(pval) # FIXME + added_to_circle = True + if not added_to_circle: + return None + return circle + + def get_equivalent(self, entity: SyncNode) -> Optional[SyncNode]: + """ + Return an equivalent SyncNode. Equivalent means that ID, path or identifiable are the same. """ - # TODO shall we also provide a variant that returns equivalent objects that are unchecked? if entity.id is not None and entity.id in self._id_look_up: return self._id_look_up[entity.id] if entity.path is not None and entity.path in self._path_look_up: @@ -228,73 +267,13 @@ class SyncGraph(): # else: # return None - def combine_fragments(self): - for se in self.se: - if len(se.fragments) < 2: - continue - for ent in se.fragments[1:]: - merge_entities(se.fragments[0], ent, merge_id_with_resolved_entity=True) - - def create_record_lists(self): - for se in self.se: - for f in se.fragments: - f.id = se.id - f.path = se.path - self._update_reference_values() - self.combine_fragments() - # TODO assure that there is only one fragment each - - missing = [el.fragments[0] for el in self._missing.values()] - # remove negative IDs - for el in missing: - if el.id is None: - raise RuntimeError("This should not happen") # TODO remove - if el.id >= 0: - raise RuntimeError("This should not happen") # TODO remove - el.id = None - - return (missing, [el.fragments[0] for el in self._existing.values()]) - - def _update_reference_values(self): - for se in self.se: - for f in se.fragments: - for p in f.properties: - if isinstance(p.value, list): - for index, val in enumerate(p.value): - if id(val) in self.se_lookup: - p.value[index] = self.se_lookup[id(val)].fragments[0] - else: - if id(p.value) in self.se_lookup: - p.value = self.se_lookup[id(p.value)].fragments[0] - - 
def identity_relies_on_unchecked_entity(self, se: SemanticEntity): - """ - If a record for which it could not yet be verified whether it exists in LA or not is part - of the identifying properties, this returns True, otherwise False - """ - - return any([id(ent) not in self._missing and id(ent) not in self._existing - for ent in self.forward_id_references[se.uuid]] - + [id(ent) not in self._missing and id(ent) not in self._existing - for ent in self.backward_id_referenced_by[se.uuid]]) - - def identity_relies_on_missing_entity(self, se: SemanticEntity): - """ - returns False if any value in the properties attribute is a db.Entity object that - is contained in the `remote_missing_cache`. If ident has such an object in - properties, it means that it references another Entity, where we checked - whether it exists remotely and it was not found. - """ - return any([id(ent) in self._missing for ent in self.forward_id_references[se.uuid]] - + [id(ent) in self._missing for ent in self.backward_id_referenced_by[se.uuid]]) - - @ staticmethod + @staticmethod def _sanity_check(entities: List[db.Entity]): for ent in entities: if ent.role == "Record" and len(ent.parents) == 0: raise RuntimeError(f"Records must have a parent.\n{ent}") - @ staticmethod + @staticmethod def _create_flat_list(ent_list: List[db.Entity], flat: Optional[List[db.Entity]] = None): """ Recursively adds entities and all their properties contained in ent_list to @@ -324,7 +303,7 @@ class SyncGraph(): SyncGraph._create_flat_list([p.value], flat) return flat - @ staticmethod + @staticmethod def _treat_merge_error_of(newrecord, record): """ The parameters are two entities that cannot be merged with the merge_entities function. 
@@ -371,8 +350,8 @@ class SyncGraph(): f"{record}\n{newrecord}") raise RuntimeError("Cannot merge Entities") - @ staticmethod - def _create_reference_mapping(flat: List[SemanticEntity], se_lookup): + @staticmethod + def _create_reference_mapping(flat: List[SyncNode]): """ TODO update docstring Create a dictionary of dictionaries of the form: @@ -387,42 +366,39 @@ class SyncGraph(): to them. """ # TODO we need to treat children of RecordTypes somehow. - forward_references: Dict[str, set[SemanticEntity]] = {} - backward_references: Dict[str, set[SemanticEntity]] = {} - forward_id_references: Dict[str, set[SemanticEntity]] = {} - backward_id_references: Dict[str, set[SemanticEntity]] = {} - forward_id_referenced_by: Dict[str, set[SemanticEntity]] = {} - backward_id_referenced_by: Dict[str, set[SemanticEntity]] = {} + forward_references: Dict[str, set[SyncNode]] = {} + backward_references: Dict[str, set[SyncNode]] = {} + forward_id_references: Dict[str, set[SyncNode]] = {} + backward_id_references: Dict[str, set[SyncNode]] = {} + forward_id_referenced_by: Dict[str, set[SyncNode]] = {} + backward_id_referenced_by: Dict[str, set[SyncNode]] = {} # initialize with empty lists/dict for se in flat: - for ent in se.fragments: - forward_references[se.uuid] = set() - backward_references[se.uuid] = set() - forward_id_references[se.uuid] = set() - backward_id_references[se.uuid] = set() - forward_id_referenced_by[se.uuid] = set() - backward_id_referenced_by[se.uuid] = set() + forward_references[se.uuid] = set() + backward_references[se.uuid] = set() + forward_id_references[se.uuid] = set() + backward_id_references[se.uuid] = set() + forward_id_referenced_by[se.uuid] = set() + backward_id_referenced_by[se.uuid] = set() for se in flat: - for ent in se.fragments: - for p in ent.properties: - val = p.value - if not isinstance(val, list): - val = [val] - for v in val: - if isinstance(v, db.Entity): - vse = se_lookup[id(v)] - forward_references[se.uuid].add(vse) - 
backward_references[vse.uuid].add(se) - if len([el.name for el in se.registered_identifiable.properties if - el.name == p.name]) > 0: - forward_id_references[se.uuid].add(vse) - backward_id_references[vse.uuid].add(se) - if (vse.registered_identifiable is not None and - IdentifiableAdapter.referencing_entity_has_appropriate_type( - ent.parents, vse.registered_identifiable)): - forward_id_referenced_by[se.uuid].add(vse) - backward_id_referenced_by[vse.uuid].add(se) + for p in se.properties: + val = p.value + if not isinstance(val, list): + val = [val] + for v in val: + if isinstance(v, SyncNode): + forward_references[se.uuid].add(v) + backward_references[v.uuid].add(se) + if len([el.name for el in se.registered_identifiable.properties if + el.name == p.name]) > 0: + forward_id_references[se.uuid].add(v) + backward_id_references[v.uuid].add(se) + if (v.registered_identifiable is not None and + IdentifiableAdapter.referencing_entity_has_appropriate_type( + se.parents, v.registered_identifiable)): + forward_id_referenced_by[se.uuid].add(v) + backward_id_referenced_by[v.uuid].add(se) return (forward_references, backward_references, forward_id_references, backward_id_references, forward_id_referenced_by, backward_id_referenced_by, @@ -431,7 +407,7 @@ class SyncGraph(): def _mark_entities_with_path_or_id(self): """ A path or an ID is sufficiently identifying. 
Thus, those entities can be marked as checked """ - for semantic_entity in list(self.se[::-1]): + for semantic_entity in list(self.nodes[::-1]): assert len(semantic_entity.fragments) == 1 entity = semantic_entity.fragments[0] if entity.id is None and entity.path is None: @@ -445,7 +421,7 @@ class SyncGraph(): semantic_entity.identify_with(existing) # at this point, semantic_entity has an ID if it is existing - treated_before = self.get_checked_equivalent(semantic_entity) + treated_before = self.get_equivalent(semantic_entity) if treated_before is None: if semantic_entity.id is None or semantic_entity.id < 0: self.set_missing(semantic_entity) @@ -457,39 +433,16 @@ class SyncGraph(): def _remove_non_identifiables(self): """ A path or an ID is sufficiently identifying. Thus, those entities can be marked as checked """ - for semantic_entity in list(self.se[::-1]): + for semantic_entity in list(self.nodes[::-1]): if "nonidentifiable" in [p.name for p in semantic_entity.registered_identifiable.properties]: self.unchecked.remove(semantic_entity) - def detect_circular_dependency(self): - """ - Detects whether there are circular references in the given entity list and returns a list - where the entities are ordered according to the chain of references (and only the entities - contained in the circle are included. Returns None if no circular dependency is found. - - TODO: for the sake of detecting problems for split_into_inserts_and_updates we should only - consider references that are identifying properties. 
- """ - circle = [self.unchecked[0]] - closed = False - while not closed: - added_to_circle = False - for referenced in self.forward_references[circle[-1].uuid]: - if referenced in self.unchecked: - if referenced in circle: - closed = True - circle.append(pval) # FIXME - added_to_circle = True - if not added_to_circle: - return None - return circle - - def _add_any(self, entity: SemanticEntity, lookup): + def _add_any(self, entity: SyncNode, lookup): """Add ``entity`` to this SemanticTarget and store in ``lookup`` cache. -The entity is stored in the SemanticEntity's ``id``, ``path`` and ``identifiable`` lookup tables, if +The entity is stored in the SyncNode's ``id``, ``path`` and ``identifiable`` lookup tables, if the respective attributes exist. """ @@ -501,26 +454,14 @@ the respective attributes exist. self._identifiable_look_up[entity.identifiable.get_representation()] = entity lookup[id(entity)] = entity - def _merge_into(self, source: SemanticEntity, target: SemanticEntity): + def _merge_into(self, source: SyncNode, target: SyncNode): """ FIXME tries to merge record into newrecord If it fails, record is added to the try_to_merge_later list. In any case, references are bent to the newrecord object. """ - for f in source.fragments: - self.se_lookup[id(f)] = target - f.id = target.id - f.path = target.path - target.include(source) - if target.identifiable is None and not self.identity_relies_on_unchecked_entity(target): - try: - self.make_identifiable(target) - if target not in self.unchecked: - self._identifiable_look_up[target.identifiable.get_representation()] = target - except Exception as es: - print(es) - pass + target.update(source) # update reference mappings for se in self.forward_references.pop(source.uuid): @@ -550,9 +491,29 @@ the respective attributes exist. 
self.forward_id_referenced_by[se.uuid].remove(source) self.forward_id_referenced_by[se.uuid].add(target) - # remove unneeded SemanticEntity - self.se.remove(source) + # remove unneeded SyncNode + self.nodes.remove(source) if source in self.unchecked: self.unchecked.remove(source) assert id(source) not in self._missing assert id(source) not in self._existing + + def _initialize_nodes(self, entities): + """ create initial set of SemanticEntities from provided Entity list""" + entities = self._create_flat_list(entities) + self._sanity_check(entities) + se_lookup: Dict[str, SyncNode] = {} # lookup: UUID -> SyncNode + for el in entities: + self.nodes.append(SyncNode( + el, + self.identifiableAdapter.get_registered_identifiable(el))) + se_lookup[id(el)] = self.nodes[-1] + for se in self.nodes: + for p in se.properties: + if isinstance(p.value, list): + for index, val in enumerate(p.value): + if id(val) in se_lookup: + p.value[index] = se_lookup[id(val)] + else: + if id(p.value) in se_lookup: + p.value = se_lookup[id(p.value)] diff --git a/unittests/test_file_identifiables.py b/unittests/test_file_identifiables.py index 29ad1c62f3e1eafc8ade737af7e8d38fff45109f..93dc8cb5d274b0875b9414d191f0379761af8dd8 100644 --- a/unittests/test_file_identifiables.py +++ b/unittests/test_file_identifiables.py @@ -8,7 +8,7 @@ import caosdb as db import pytest from caoscrawler.identifiable import Identifiable from caoscrawler.identifiable_adapters import LocalStorageIdentifiableAdapter -from caoscrawler.sync_graph import SemanticEntity +from caoscrawler.sync_graph import SyncNode from caosdb.cached import cache_clear from caosdb.exceptions import EmptyUniqueQueryError from pytest import raises @@ -30,11 +30,11 @@ def test_file_identifiable(): # Without a path there is no identifying information with raises(ValueError): - ident.get_identifiable(SemanticEntity(db.File(), None), []) + ident.get_identifiable(SyncNode(db.File(), None), []) fp = "/test/bla/bla.txt" file_obj = db.File(path=fp) - 
identifiable = ident.get_identifiable(SemanticEntity(file_obj, None), []) + identifiable = ident.get_identifiable(SyncNode(file_obj, None), []) # the path is copied to the identifiable assert fp == identifiable.path diff --git a/unittests/test_identifiable_adapters.py b/unittests/test_identifiable_adapters.py index 6f10189f3e86e50dedace3d75b15deed20d40719..167c84e1cf4e11e1b19d5cee9fa633dc9e934dee 100644 --- a/unittests/test_identifiable_adapters.py +++ b/unittests/test_identifiable_adapters.py @@ -37,7 +37,7 @@ from caoscrawler.identifiable import Identifiable from caoscrawler.identifiable_adapters import (CaosDBIdentifiableAdapter, IdentifiableAdapter, convert_value) -from caoscrawler.sync_graph import SemanticEntity +from caoscrawler.sync_graph import SyncNode UNITTESTDIR = Path(__file__).parent @@ -123,7 +123,7 @@ def test_load_from_yaml_file(): def test_non_default_name(): ident = CaosDBIdentifiableAdapter() - identifiable = ident.get_identifiable(SemanticEntity(db.Record(name="don't touch it") + identifiable = ident.get_identifiable(SyncNode(db.Record(name="don't touch it") .add_parent("Person") .add_property(name="last_name", value='Tom'), db.RecordType() .add_parent(name="Person") @@ -135,9 +135,9 @@ def test_wildcard_ref(): ident = CaosDBIdentifiableAdapter() rec = (db.Record(name="don't touch it").add_parent("Person") .add_property(name="last_name", value='Tom')) - dummy = SemanticEntity(db.Record(), None) + dummy = SyncNode(db.Record(), None) dummy.id = 1 - identifiable = ident.get_identifiable(SemanticEntity(rec, db.RecordType() + identifiable = ident.get_identifiable(SyncNode(rec, db.RecordType() .add_parent(name="Person") .add_property(name="is_referenced_by", value=["*"])), @@ -162,7 +162,7 @@ def test_get_identifiable(): .add_parent(name="Experiment", id=3) .add_property(name="date", value="2022-02-01") .add_property(name="result", value="FAIL")) - se = SemanticEntity(rec, + se = SyncNode(rec, ident.get_registered_identifiable(rec)) id_r0 = 
ident.get_identifiable(se, []) r_cur = se.fragments[0] @@ -179,7 +179,7 @@ def test_get_identifiable(): .add_parent(name="A", id=3) .add_property(name="a", value="2022-02-01") .add_property(name="result", value="FAIL")) - se = SemanticEntity(rec, ident.get_registered_identifiable(rec)) + se = SyncNode(rec, ident.get_registered_identifiable(rec)) se.fragments.extend([ db.Record() .add_parent(name="A", id=3) @@ -202,7 +202,7 @@ def test_get_identifiable(): .add_parent(name="A") .add_property(name="a", value="2") ) - se = SemanticEntity(rec, ident.get_registered_identifiable(rec)) + se = SyncNode(rec, ident.get_registered_identifiable(rec)) se.fragments.extend([ db.Record(name='a') .add_parent(name="A") diff --git a/unittests/test_semantic_target.py b/unittests/test_sync_graph.py similarity index 71% rename from unittests/test_semantic_target.py rename to unittests/test_sync_graph.py index e2d1509a157d0ba10a4ae1da2998b38ca71bed69..e2b8fc0622c6f0bec0e4d4f1b151696d431a18e1 100644 --- a/unittests/test_semantic_target.py +++ b/unittests/test_sync_graph.py @@ -22,8 +22,10 @@ from functools import partial from unittest.mock import MagicMock, Mock, patch import linkahead as db +import pytest +from caoscrawler.identifiable import Identifiable from caoscrawler.identifiable_adapters import CaosDBIdentifiableAdapter -from caoscrawler.sync_graph import SemanticEntity, SyncGraph +from caoscrawler.sync_graph import SyncGraph, SyncNode from test_crawler import basic_retrieve_by_name_mock_up, mock_get_entity_by @@ -51,8 +53,8 @@ def test_create_flat_list(): def test_create_reference_mapping(): a = db.Record().add_parent("A") b = db.Record(id=132).add_parent("B").add_property('a', a) - ses = [SemanticEntity(a, db.RecordType().add_property("is_referenced_by", ["B"])), - SemanticEntity(b, db.RecordType().add_property("a"))] + ses = [SyncNode(a, db.RecordType().add_property("is_referenced_by", ["B"])), + SyncNode(b, db.RecordType().add_property("a"))] (forward_references, 
backward_references, forward_id_references, backward_id_references, forward_id_referenced_by, @@ -119,9 +121,6 @@ def test_merge_into(): ident_adapter.register_identifiable("RT2", ident_b) st = SyncGraph([a, b], ident_adapter) - se_a = st.se_lookup[id(a)] - se_b = st.se_lookup[id(b)] - se_c = st.se_lookup[id(c)] # CHECK REFERENCE MAP: # c is referenced by a @@ -274,7 +273,7 @@ def test_backward_id_referenced_by(): entlist = [referenced, db.Record(name="A").add_parent("BR").add_property("ref", referenced), ] st = SyncGraph(entlist, ident_adapter) - assert st.se[1] in st.backward_id_referenced_by[st.se[0].uuid] + assert st.nodes[1] in st.backward_id_referenced_by[st.nodes[0].uuid] @patch("caoscrawler.sync_graph.cached_get_entity_by", @@ -294,20 +293,20 @@ def test_merging(): db.Record(id=101).add_parent("A"), db.Record(id=101).add_parent("A")] st = SyncGraph(entlist, ident_adapter) - assert len(st.se) == 1 + assert len(st.nodes) == 1 assert len(st.unchecked) == 0 - assert entlist[0] in st.se[0].fragments - assert entlist[1] in st.se[0].fragments + assert 101 == st.nodes[0].id + assert "A" == st.nodes[0].parents[0].name # merging based on path entlist = [ db.File(path='101').add_parent("A"), db.File(path='101').add_parent("A")] st = SyncGraph(entlist, ident_adapter) - assert len(st.se) == 1 + assert len(st.nodes) == 1 assert len(st.unchecked) == 0 - assert entlist[0] in st.se[0].fragments - assert entlist[1] in st.se[0].fragments + assert '101' == st.nodes[0].path + assert "A" == st.nodes[0].parents[0].name # merging based on identifiable entlist = [ @@ -315,17 +314,18 @@ def test_merging(): db.File(name='101').add_parent("A").add_property('a', value=1)] st = SyncGraph(entlist, ident_adapter) assert len(st.unchecked) == 2 - st.make_identifiable(st.se[0]) - st.check_remote_server(st.se[0]) - st.set_missing(st.se[0]) + st.set_identifiable_of_node(st.nodes[0], + Identifiable(recordtype="A", name='101', properties={'a': 1})) + assert len(st.unchecked) == 2 + 
st.set_identifiable_of_node(st.nodes[1],
+                                Identifiable(recordtype="A", name='101', properties={'a': 1}))
     assert len(st.unchecked) == 1
-    st.make_identifiable(st.se[1])
-    assert st.se[1].id is None
-    assert st.merge_with_equivalent(st.se[1])
-    assert len(st.se) == 1
-    assert len(st.unchecked) == 0
-    assert entlist[0] in st.se[0].fragments
-    assert entlist[1] in st.se[0].fragments
+    assert len(st.nodes) == 1
+    assert st.nodes[0].id is None
+    assert '101' == st.nodes[0].name
+    assert "A" == st.nodes[0].parents[0].name
+    assert 1 == st.nodes[0].properties[0].value
+    assert "a" == st.nodes[0].properties[0].name
 
     # Merging a mix. One Record needs the identifiable to be merged. But the identifying
     # information is scattered in the other case.
@@ -336,11 +336,133 @@ def test_merging():
         db.Record(name='a').add_parent("A").add_property('a', value=1)]
     st = SyncGraph(entlist, ident_adapter)
-    assert len(st.se) == 2
+    assert len(st.nodes) == 2
     assert len(st.unchecked) == 1
-    st.make_identifiable(st.se[1])
-    assert st.merge_with_equivalent(st.se[1])
-    assert len(st.se) == 1
+    st.make_identifiable(st.nodes[1])  # TODO(review): make_identifiable/merge_with_equivalent/fragments are removed elsewhere in this patch — port this section to the new API
+    assert st.merge_with_equivalent(st.nodes[1])
+    assert len(st.nodes) == 1
     assert len(st.unchecked) == 0
     for ii in range(4):
-        assert entlist[ii] in st.se[0].fragments
+        assert entlist[ii] in st.nodes[0].fragments
+
+
+def test_sync_node():
+    # initialization
+    rec = (db.Record(id=101, name='101')
+           .add_parent("A")
+           .add_parent(id=102)
+           .add_property(name="a", value='a')
+           .add_property(id=103, value='b'))
+    sn = SyncNode(rec)
+    assert sn.id == rec.id
+    assert sn.name == rec.name
+    assert sn.parents == rec.parents
+    assert sn.properties == rec.properties
+    assert sn.description == rec.description
+    assert sn.role == rec.role
+    fi = db.File(id=101, name='101', path='/a/')
+    sn = SyncNode(fi)
+    assert sn.role == fi.role
+    assert sn.name == fi.name
+    assert sn.id == fi.id
+    assert sn.path == fi.path
+
+    export = sn.export_entity()
+    export == rec  # TODO(review): no-op comparison — should this be an assert (and compare against fi, not rec)?
+
+    # merge no common
information + rec_a = (db.Record(name='101') + .add_parent("A") + .add_parent(id=102) + .add_property(name="a", value='a') + .add_property(id=103, value='b')) + + rec_b = (db.Record(id=101) + .add_parent("B") + .add_parent(id=103) + .add_property(name="a", value='a') + .add_property(id=103, value='b')) + rec_b.description = "tja" + + sn_a = SyncNode(rec_a) + sn_b = SyncNode(rec_b) + sn_a.update(sn_b) + assert sn_a.id == rec_b.id + assert sn_a.name == rec_a.name + for p in rec_a.parents+rec_b.parents: + assert p in sn_a.parents + for p in rec_a.properties+rec_b.properties: + assert p in sn_a.properties + assert sn_a.description == rec_b.description + assert sn_a.role == rec_a.role + + export = sn_a.export_entity() + assert export.id == rec_b.id + assert export.name == rec_a.name + for p in rec_a.parents+rec_b.parents: + assert p in export.parents + # if p.name is not None: + # assert p.name in [el.name for el in export.parents] + # if p.id is not None: + # assert p.id in [el.id for el in export.parents] + for p in rec_a.properties+rec_b.properties: + if p.name is not None: + assert p.name in [el.name for el in export.properties] + if p.id is not None: + assert p.id in [el.id for el in export.properties] + assert len(export.properties) == 2 + assert export.get_property('a').value == 'a' + assert export.get_property(103).value == 'b' + assert export.description == rec_b.description + assert export.role == rec_a.role + + # merge with common information + rec_a = (db.Record(id=101, name='101') + .add_parent("A") + .add_parent(id=102) + .add_property(name="a", value='a')) + + rec_b = (db.Record(id=101, name='101') + .add_parent("A") + .add_parent(id=102) + .add_property(name="a", value='a')) + + sn_a = SyncNode(rec_a) + sn_b = SyncNode(rec_b) + sn_a.update(sn_b) + assert sn_a.id == rec_b.id + assert sn_a.name == rec_a.name + for p in rec_a.parents+rec_b.parents: + assert p in sn_a.parents + for p in rec_a.properties+rec_b.properties: + assert p in sn_a.properties + 
assert sn_a.description == rec_b.description
+    assert sn_a.role == rec_a.role
+
+    # merge with conflicting information
+    sn_a = SyncNode(db.Record(id=102))
+    with pytest.raises(AssertionError):
+        sn_a.update(SyncNode(db.Record(id=101)))
+
+    sn_a = SyncNode(db.Record(name='102'))
+    with pytest.raises(AssertionError):
+        sn_a.update(SyncNode(db.Record(name='101')))
+
+    sn_a = SyncNode(db.Record(name='102'))
+    with pytest.raises(AssertionError):
+        sn_a.update(SyncNode(db.File(name='102')))
+
+    sn_a = SyncNode(db.Record(description='102'))
+    with pytest.raises(AssertionError):
+        sn_a.update(SyncNode(db.Record(description='101')))
+
+    sn_a = SyncNode(db.File(path='102'))
+    with pytest.raises(AssertionError):
+        sn_a.update(SyncNode(db.File(path='101')))
+
+    sn_a = SyncNode(db.File(path='102'))
+    sn_b = SyncNode(db.File(path='101'))
+    sn_a.identifiable = Identifiable(name='a')
+    sn_b.identifiable = Identifiable(name='b')
+    with pytest.raises(AssertionError):
+        sn_a.update(sn_b)