diff --git a/src/caoscrawler/crawl.py b/src/caoscrawler/crawl.py index 6322310b62798c3ca92ae406cb83d627a4bee101..610bf7ca0651eed4f2a423d48d48c1d4d49c18ab 100644 --- a/src/caoscrawler/crawl.py +++ b/src/caoscrawler/crawl.py @@ -298,59 +298,59 @@ class Crawler(object): self.crawled_data = data return data - def replace_references_with_cached(self, record: db.Record, referencing_entities: dict): - """ - Replace all references with the versions stored in the cache. - - If the cache version is not identical, raise an error. - """ - for p in record.properties: - if (isinstance(p.value, list)): - lst = [] - for el in p.value: - if (isinstance(el, db.Entity) and el.id is None): - cached = self.treated_records_lookup.get_any( - el, - self.identifiableAdapter.get_identifiable( - el, referencing_entities[id(el)])) - if cached is None: - lst.append(el) - continue - if not check_identical(cached, el, True): - if isinstance(p.value, db.File): - if p.value.path != cached.path: - raise RuntimeError( - "The cached and the referenced entity are not identical.\n" - f"Cached:\n{cached}\nReferenced:\n{el}" - ) - else: - raise RuntimeError( - "The cached and the referenced entity are not identical.\n" - f"Cached:\n{cached}\nReferenced:\n{el}" - ) - lst.append(cached) - else: - lst.append(el) - p.value = lst - if (isinstance(p.value, db.Entity) and p.value.id is None): - cached = self.treated_records_lookup.get_any( - p.value, self.identifiableAdapter.get_identifiable( - p.value, referencing_entities[id(p.value)])) - if cached is None: - continue - if not check_identical(cached, p.value, True): - if isinstance(p.value, db.File): - if p.value.path != cached.path: - raise RuntimeError( - "The cached and the referenced entity are not identical.\n" - f"Cached:\n{cached}\nReferenced:\n{p.value}" - ) - else: - raise RuntimeError( - "The cached and the referenced entity are not identical.\n" - f"Cached:\n{cached}\nReferenced:\n{p.value}" - ) - p.value = cached +# def 
replace_references_with_cached(self, record: db.Record, referencing_entities: dict): +# """ +# Replace all references with the versions stored in the cache. +# +# If the cache version is not identical, raise an error. +# """ +# for p in record.properties: +# if (isinstance(p.value, list)): +# lst = [] +# for el in p.value: +# if (isinstance(el, db.Entity) and el.id is None): +# cached = self.treated_records_lookup.get_any( +# el, +# self.identifiableAdapter.get_identifiable( +# el, referencing_entities[id(el)])) +# if cached is None: +# lst.append(el) +# continue +# if not check_identical(cached, el, True): +# if isinstance(p.value, db.File): +# if p.value.path != cached.path: +# raise RuntimeError( +# "The cached and the referenced entity are not identical.\n" +# f"Cached:\n{cached}\nReferenced:\n{el}" +# ) +# else: +# raise RuntimeError( +# "The cached and the referenced entity are not identical.\n" +# f"Cached:\n{cached}\nReferenced:\n{el}" +# ) +# lst.append(cached) +# else: +# lst.append(el) +# p.value = lst +# if (isinstance(p.value, db.Entity) and p.value.id is None): +# cached = self.treated_records_lookup.get_any( +# p.value, self.identifiableAdapter.get_identifiable( +# p.value, referencing_entities[id(p.value)])) +# if cached is None: +# continue +# if not check_identical(cached, p.value, True): +# if isinstance(p.value, db.File): +# if p.value.path != cached.path: +# raise RuntimeError( +# "The cached and the referenced entity are not identical.\n" +# f"Cached:\n{cached}\nReferenced:\n{p.value}" +# ) +# else: +# raise RuntimeError( +# "The cached and the referenced entity are not identical.\n" +# f"Cached:\n{cached}\nReferenced:\n{p.value}" +# ) +# p.value = cached def split_into_inserts_and_updates(self, st: SemanticTarget): @@ -374,7 +374,7 @@ class Crawler(object): se.identifiable = self.identifiableAdapter.new_get_identifiable( se, st.backward_id_referenced_by[se.uuid]) - equivalent_se = st.get_equivalent_se(se) + equivalent_se = 
st.get_equivalent(se) # 1. Is it in the cache of already checked Records? if equivalent_se is not None: @@ -385,7 +385,7 @@ class Crawler(object): # (Is it impossible to check this record because an identifiable references a # missing record?) elif st.identity_relies_on_missing_entity(se): - st.set_new(se) + st.add_to_missing(se) # 3. check on the remote server else: @@ -393,9 +393,10 @@ class Crawler(object): self.identifiableAdapter.retrieve_identified_record_for_identifiable( se.identifiable)) if identified_record is None: - st.set_new(se) + st.add_to_missing(se) else: - st.set_existing(se, identified_record) + se.identify_with(identified_record) + st.add_to_existing(se) entity_was_treated = True # TODO @@ -404,8 +405,7 @@ class Crawler(object): # We postponed the merge for records where it failed previously and try it again now. # This only might add properties of the postponed records to the already used ones. - st.combine_fragments() - if len(st.entities) > 0: + if len(st.unchecked) > 0: circle = self.detect_circular_dependency(st.entities) if circle is None: logger.error("Failed, but found NO circular dependency. The data is as follows:" @@ -423,16 +423,7 @@ class Crawler(object): f"Could not finish split_into_inserts_and_updates. 
Circular dependency: " f"{circle is not None}") - # remove negative IDs - missing = st.treated_records_lookup.get_missing_list() - for el in missing: - if el.id is None: - raise RuntimeError("This should not happen") # TODO remove - if el.id >= 0: - raise RuntimeError("This should not happen") # TODO remove - el.id = None - - return (missing, st.treated_records_lookup.get_existing_list()) + return st.create_record_lists() def replace_entities_with_ids(self, rec: db.Record): for el in rec.properties: @@ -445,7 +436,7 @@ class Crawler(object): if val.id is not None: el.value[index] = val.id - @ staticmethod + @staticmethod def compact_entity_list_representation(entities, referencing_entities: List) -> str: """ a more readable representation than the standard xml representation @@ -477,7 +468,7 @@ class Crawler(object): return text + "--------\n" - @ staticmethod + @staticmethod def detect_circular_dependency(flat: list[db.Entity]): """ Detects whether there are circular references in the given entity list and returns a list @@ -510,7 +501,7 @@ class Crawler(object): return None return circle - @ staticmethod + @staticmethod def _merge_properties_from_remote( crawled_data: list[db.Record], identified_records: list[db.Record] @@ -552,7 +543,7 @@ class Crawler(object): return to_be_updated - @ staticmethod + @staticmethod def remove_unnecessary_updates( crawled_data: list[db.Record], identified_records: list[db.Record] @@ -578,7 +569,7 @@ class Crawler(object): return actual_updates - @ staticmethod + @staticmethod def execute_parent_updates_in_list(to_be_updated, securityMode, run_id, unique_names): """ Execute the updates of changed parents. @@ -621,13 +612,13 @@ class Crawler(object): "mode. 
This might lead to a failure of inserts that follow.") logger.info(parent_updates) - @ staticmethod + @staticmethod def _get_property_id_for_datatype(rtname: str, name: str): return cached_get_entity_by( query=f"FIND Entity '{escape_squoted_text(rtname)}' " f"with name='{escape_squoted_text(name)}'").id - @ staticmethod + @staticmethod def replace_name_with_referenced_entity_id(prop: db.Property): """changes the given property in place if it is a reference property that has a name as value @@ -672,7 +663,7 @@ class Crawler(object): propval.append(el) prop.value = propval - @ staticmethod + @staticmethod def execute_inserts_in_list(to_be_inserted, securityMode, run_id: Optional[uuid.UUID] = None, unique_names=True): @@ -692,7 +683,7 @@ class Crawler(object): update_cache = UpdateCache() update_cache.insert(to_be_inserted, run_id, insert=True) - @ staticmethod + @staticmethod def set_ids_and_datatype_of_parents_and_properties(rec_list): for record in rec_list: for parent in record.parents: @@ -704,7 +695,7 @@ class Crawler(object): prop.id = entity.id _resolve_datatype(prop, entity) - @ staticmethod + @staticmethod def execute_updates_in_list(to_be_updated, securityMode, run_id: Optional[uuid.UUID] = None, unique_names=True): @@ -718,7 +709,7 @@ class Crawler(object): update_cache = UpdateCache() update_cache.insert(to_be_updated, run_id) - @ staticmethod + @staticmethod def check_whether_parent_exists(records: list[db.Entity], parents: list[str]): """ returns a list of all records in `records` that have a parent that is in `parents`""" problems = [] @@ -839,7 +830,7 @@ class Crawler(object): return (to_be_inserted, to_be_updated) - @ staticmethod + @staticmethod def create_entity_summary(entities: list[db.Entity]): """ Creates a summary string reprensentation of a list of entities.""" parents = {} @@ -858,7 +849,7 @@ class Crawler(object): output = output[:-2] + "\n" return output - @ staticmethod + @staticmethod def inform_about_pending_changes(pending_changes, 
run_id, path, inserts=False): # Sending an Email with a link to a form to authorize updates is if get_config_setting("send_crawler_notifications"): @@ -879,7 +870,7 @@ ____________________\n""".format(i + 1, len(pending_changes)) + str(el[3])) + " by invoking the crawler" " with the run id: {rid}\n".format(rid=run_id)) - @ staticmethod + @staticmethod def debug_build_usage_tree(converter: Converter): res: dict[str, dict[str, Any]] = { converter.name: { diff --git a/src/caoscrawler/identifiable_adapters.py b/src/caoscrawler/identifiable_adapters.py index 473597bfaa566cf211818d9061a4495224e33dfd..da80f77366c6e48ed32fe2978e7e8395513885c4 100644 --- a/src/caoscrawler/identifiable_adapters.py +++ b/src/caoscrawler/identifiable_adapters.py @@ -282,7 +282,7 @@ startswith: bool, optional if len(options) == 0: raise NotImplementedError( f"The following record is missing an identifying property:\n" - f"RECORD\n{record}\nIdentifying PROPERTY\n{prop.name}" + f"RECORD\n{se}\nIdentifying PROPERTY\n{prop.name}" ) assert all([f.value == options[0].value for f in options]) record_prop = options[0] @@ -303,8 +303,8 @@ startswith: bool, optional # We do not use parents of Record because it might have multiple try: return Identifiable( - record_id=se.remote_id, - path=se.remote_path, + record_id=se.id, + path=se.path, record_type=(registered_identifiable.parents[0].name if registered_identifiable else None), name=name, diff --git a/src/caoscrawler/semantic_target.py b/src/caoscrawler/semantic_target.py index 443024eee4ff855e1c8d3d2160f6a9617f72e638..cfba71fc6059762b356c341730f3c3255d0eb719 100644 --- a/src/caoscrawler/semantic_target.py +++ b/src/caoscrawler/semantic_target.py @@ -31,14 +31,19 @@ from linkahead.apiutils import (EntityMergeConflictError, compare_entities, merge_entities) from .identifiable_adapters import IdentifiableAdapter -from .treated_record_lookup import TreatedRecordLookUp class SemanticEntity(): + """ represents the information related to an Entity as it shall 
be created in LinkAhead
+
+    Parents and Properties are given by fragments: db.Record objects that may have been created by
+    the scanner.
+    """
+
     def __init__(self, entity: db.Entity, registered_identifiable):
         self.fragments = [entity]
-        self.remote_id = entity.id
-        self.remote_path = entity.path
+        self.id = entity.id
+        self.path = entity.path
         self.identifiable = None
         self.registered_identifiable = registered_identifiable
         self.uuid = uuid()
@@ -49,21 +54,45 @@ class SemanticEntity():
     def is_missing():
         return len([el.id for el in self.fragments if el.id < 0]) > 0
 
+    def identify_with(self, remote_entity):
+        # Adopt the identity on the SemanticEntity itself as well: get_equivalent(),
+        # add_to_missing() and create_record_lists() read se.id/se.path.
+        self.id = remote_entity.id
+        self.path = remote_entity.path
+        for f in self.fragments:
+            # side effect
+            f.id = remote_entity.id
+            f.path = remote_entity.path
+            # TODO check the following copying of _size and _checksum
+            # Copy over checksum and size too if it is a file
+            f._size = remote_entity._size
+            f._checksum = remote_entity._checksum
+
 
 class SemanticTarget():
+    """ models the target structure of Entities as it shall be created by the Crawler
+
+    """
+
     def __init__(self, entities: list[db.Entity], identifiableAdapter):
         self.identifiableAdapter = identifiableAdapter
-        self.entities = self.create_flat_list(entities)
-        self.treated_records_lookup = TreatedRecordLookUp()
-        self.se = []
-        self.se_lookup = {}
+        self.entities = self._create_flat_list(entities)
+        self._id_look_up: dict[int, SemanticEntity] = {}
+        self._path_look_up: dict[str, SemanticEntity] = {}
+        self._identifiable_look_up: dict[str, SemanticEntity] = {}
+        self._missing: dict[int, SemanticEntity] = {}
+        self._existing: dict[int, SemanticEntity] = {}
+
+        # create initial set of SemanticEntities from provided Entity list
+        self.se: list[SemanticEntity] = []  # list of all SemanticEntities
+        self.se_lookup: dict[str, SemanticEntity] = {}  # get a SemanticEntity by its UUID
         for el in self.entities:
             self.se.append(SemanticEntity(
                 el,
                 self.identifiableAdapter.get_registered_identifiable(el)))
             self.se_lookup[id(el)] = self.se[-1]
-        self.unchecked = list(self.se)
-
+
self.unchecked = list(self.se) # list all SemanticEntities that have not yet been checked + self._remote_missing_counter = -1 self.sanity_check(self.entities) ( self.forward_references, @@ -72,80 +97,80 @@ class SemanticTarget(): self.backward_id_references, self.forward_id_referenced_by, self.backward_id_referenced_by, - ) = self.new_create_reference_mapping(self.se, self.se_lookup) - self.treat_trivial_id() + ) = self._create_reference_mapping(self.se, self.se_lookup) - def treat_trivial_id(self): - for semantic_entity in self.se: - assert len(semantic_entity.fragments) == 1 - entity = semantic_entity.fragments[0] - # 1. can it be checked via ID - if entity.id is not None: - treated_record = self.treated_records_lookup.get_existing(entity) - if treated_record is None: - self.treated_records_lookup.add(entity, None) - if semantic_entity.remote_id is None: - semantic_entity.remote_id = entity.id - else: - assert semantic_entity.remote_id == entity.id - else: - self.merge_into(semantic_entity, self.se_lookup[id(treated_record)]) - self.unchecked.remove(semantic_entity) - # 2. Can it be identified via a path? 
- elif entity.path is not None: - try: - existing = cached_get_entity_by(path=entity.path) - except EmptyUniqueQueryError: - existing = None - if existing is not None: - entity.id = existing.id - # TODO check the following copying of _size and _checksum - # Copy over checksum and size too if it is a file - entity._size = existing._size - entity._checksum = existing._checksum - treated_record = self.treated_records_lookup.get_any(entity) - if treated_record is None: - self.treated_records_lookup.add(entity, None) - if semantic_entity.remote_path is None: - semantic_entity.remote_path = entity.path - else: - assert semantic_entity.remote_path == entity.path - else: - self.merge_into(semantic_entity, self.se_lookup[id(treated_record)]) - self.unchecked.remove(semantic_entity) + self._mark_entities_with_path_or_id() - def get_equivalent_se(self, se: SemanticEntity) -> Optional[SemanticEntity]: - return st.se_lookup[id(self.treated_records_lookup.get_any(None, se.identifiable))] - - def set_new(self, se: SemanticEntity) -> Optional[SemanticEntity]: - self.treated_records_lookup.add(se.fragments[0], se.identifiable) - self.unchecked.remove(se) - - def set_existing(self, se, identified_record): - for f in se.fragments: - # side effect - f.id = identified_record.id - f.path = identified_record.path - st.treated_records_lookup.add(se.fragments[0], se.identifiable) + def get_equivalent(self, entity: SemanticEntity = None) -> Optional[SemanticEntity]: + """ + Return an equivalent SemanticEntity from the list of missing or existing entities. - def identity_relies_on_unchecked_entity(self, se: SemanticEntity): + Equivalent means that ID, path or identifiable are the same. 
""" - If a record for which it could not yet be verified whether it exists in LA or not is part - of the identifying properties, this returns True, otherwise False + if entity.id is not None and entity.id in self._id_look_up: + return self._id_look_up[entity.id] + if entity.path is not None and entity.path in self._path_look_up: + return self._path_look_up[entity.path] + if (entity.identifiable is not None and entity.identifiable.get_representation() in + self._identifiable_look_up): + return self._identifiable_look_up[entity.identifiable.get_representation()] + + def get_equivalent_existing(self, se): + """ Check whether this Record exists on the remote server + + Returns: The stored Record """ + treated = self.get_any(se) + if id(treated) in self._existing: + return rec + else: + return None - return any([ent.is_unchecked() for ent in self.forward_id_references[se.uuid]] - + [ent.is_unchecked() for ent in self.backward_id_referenced_by[se.uuid]]) + def get_equivalent_missing(self, se): + """ Check whether this Record is missing on the remote server - def identity_relies_on_missing_entity(self, se: SemanticEntity): - """ - returns False if any value in the properties attribute is a db.Entity object that - is contained in the `remote_missing_cache`. If ident has such an object in - properties, it means that it references another Entity, where we checked - whether it exists remotely and it was not found. 
+ Returns: The stored Record """ - return any([ent.is_missing() for ent in self.forward_id_references[se.uuid]] - + [ent.is_missing() for ent in self.backward_id_referenced_by[se.uuid]]) + treated = self.get_any(record, identifiable) + if id(treated) in self._missing: + return rec + else: + return None + + def create_record_lists(self): + for se in self.se: + for f in se.fragments: + f.id = se.id + f.path = se.path + self.combine_fragments() + + missing = [el.fragments[0] for el in self._missing.values()] + # remove negative IDs + for el in missing: + if el.id is None: + raise RuntimeError("This should not happen") # TODO remove + if el.id >= 0: + raise RuntimeError("This should not happen") # TODO remove + el.id = None + + return (missing, [el.fragments[0] for el in self._existing.values()]) + + def add_to_missing(self, se: SemanticEntity): + assert se.id is None + se.id = self._remote_missing_counter + self._remote_missing_counter -= 1 + self._add_any(se, self._missing) + self.unchecked.remove(se) + + def add_to_existing(self, se: SemanticEntity): + """ add a SemanticEntity to the lookup of treated entities and remove id from the unchecked + list + Add a Record that was treated, such that it is contained in the internal look up dicts + + This Record MUST have an ID if it was found in the remote server. 
+""" + self._add_any(se, self._existing) + self.unchecked.remove(se) def merge_into(self, source: SemanticEntity, target: SemanticEntity): """ tries to merge record into newrecord @@ -170,17 +195,17 @@ class SemanticTarget(): print(target) print(source) raise - if source.remote_id is not None: - if target.remote_id is None: - target.remote_id = source.remote_id + if source.id is not None: + if target.id is None: + target.id = source.id else: - assert target.remote_id == source.remote_id + assert target.id == source.id - if source.remote_path is not None: - if target.remote_path is None: - target.remote_path = source.remote_path + if source.path is not None: + if target.path is None: + target.path = source.path else: - assert target.remote_path == source.remote_path + assert target.path == source.path # update reference mappings for se in self.forward_references.pop(source.uuid): @@ -215,140 +240,41 @@ class SemanticTarget(): if source in self.unchecked: self.unchecked.remove(source) - @ staticmethod - def new_create_reference_mapping(flat: list[SemanticEntity], se_lookup): - """ - TODO update docstring - Create a dictionary of dictionaries of the form: - dict[int, dict[str, list[Union[int,None]]]] - - - The integer index is the Python id of the value object. - - The string is the name of the first parent of the referencing object. - - Each value objects is taken from the values of all properties from the list flat. - - So the returned mapping maps ids of entities to the ids of objects which are referring - to them. + def identity_relies_on_unchecked_entity(self, se: SemanticEntity): """ - # TODO we need to treat children of RecordTypes somehow. 
- forward_references: dict[str, set[SemanticEntity]] = {} - backward_references: dict[str, set[SemanticEntity]] = {} - forward_id_references: dict[str, set[SemanticEntity]] = {} - backward_id_references: dict[str, set[SemanticEntity]] = {} - forward_id_referenced_by: dict[str, set[SemanticEntity]] = {} - backward_id_referenced_by: dict[str, set[SemanticEntity]] = {} - - # initialize with empty lists/dict - for se in flat: - for ent in se.fragments: - forward_references[se.uuid] = set() - backward_references[se.uuid] = set() - forward_id_references[se.uuid] = set() - backward_id_references[se.uuid] = set() - forward_id_referenced_by[se.uuid] = set() - backward_id_referenced_by[se.uuid] = set() - for se in flat: - for ent in se.fragments: - for p in ent.properties: - val = p.value - if not isinstance(val, list): - val = [val] - for v in val: - vse = se_lookup[id(v)] - if isinstance(v, db.Entity): - forward_references[se.uuid].add(vse) - backward_references[vse.uuid].add(se) - if len([el.name for el in se.registered_identifiable.properties if - el.name == p.name]) > 0: - forward_id_references[se.uuid].add(vse) - backward_id_references[vse.uuid].add(se) - if IdentifiableAdapter.referencing_entity_has_appropriate_type( - ent.parents, vse.registered_identifiable): - forward_id_referenced_by[se.uuid].add(vse) - backward_id_referenced_by[vse.uuid].add(se) - - return (forward_references, backward_references, forward_id_references, - backward_id_references, forward_id_referenced_by, backward_id_referenced_by, - ) - - @ staticmethod - def create_reference_mapping(flat: list[SemanticEntity]): + If a record for which it could not yet be verified whether it exists in LA or not is part + of the identifying properties, this returns True, otherwise False """ - TODO update docstring - Create a dictionary of dictionaries of the form: - dict[int, dict[str, list[Union[int,None]]]] - - - The integer index is the Python id of the value object. 
- - The string is the name of the first parent of the referencing object. - Each value objects is taken from the values of all properties from the list flat. + return any([ent.is_unchecked() for ent in self.forward_id_references[se.uuid]] + + [ent.is_unchecked() for ent in self.backward_id_referenced_by[se.uuid]]) - So the returned mapping maps ids of entities to the ids of objects which are referring - to them. + def identity_relies_on_missing_entity(self, se: SemanticEntity): """ - # TODO we need to treat children of RecordTypes somehow. - forward_references: dict[int, list[SemanticEntity]] = {} - backward_references: dict[int, list[SemanticEntity]] = {} - forward_id_references: dict[int, list[SemanticEntity]] = {} - backward_id_references: dict[int, list[SemanticEntity]] = {} - # initialize with empty lists/dict - for se in flat: - for ent in se.fragments: - forward_references[id(ent)] = [] - backward_references[id(ent)] = [] - forward_id_references[id(ent)] = [] - backward_id_references[id(ent)] = [] - forward_id_referenced_by[id(ent)] = [] - backward_id_referenced_by[id(ent)] = [] - for se in flat: - for ent in se.fragments: - for p in ent.properties: - val = p.value - if not isinstance(val, list): - val = [val] - for v in val: - if isinstance(v, db.Entity): - forward_references[id(ent)].append(v) - backward_references[id(v)].append(ent) - if (hasattr(ent, "registered") - and p.name in ent.registered.properties): - forward_id_references[id(ent)].append(v) - backward_id_references[id(v)].append(ent) - if (hasattr(v, "registered") and - v.registered.get_property("is_referenced_by") is not None and - (ent.parents[0].name in v.registered.get_property( - "is_referenced_by").value or - "*" in v.registered.get_property("is_referenced_by").value)): - forward_id_referenced_by[id(ent)].append(v) - backward_id_referenced_by[id(v)].append(ent) - - return (forward_references, backward_references, forward_id_references, - backward_id_references, forward_id_referenced_by, 
backward_id_referenced_by) - - @ staticmethod - def bend_references_to_new_object(old, new, entities): - """ Bend references to the other object - Iterate over all entities in `entities` and check the values of all properties of - occurances of old Entity and replace them with new Entity + returns False if any value in the properties attribute is a db.Entity object that + is contained in the `remote_missing_cache`. If ident has such an object in + properties, it means that it references another Entity, where we checked + whether it exists remotely and it was not found. """ - for el in entities: - for p in el.properties: - if isinstance(p.value, list): - for index, val in enumerate(p.value): - if val is old: - p.value[index] = new - else: - if p.value is old: - p.value = new + return any([ent.is_missing() for ent in self.forward_id_references[se.uuid]] + + [ent.is_missing() for ent in self.backward_id_referenced_by[se.uuid]]) - @ staticmethod + @staticmethod def sanity_check(entities: list[db.Entity]): for ent in entities: if ent.role == "Record" and len(ent.parents) == 0: raise RuntimeError(f"Records must have a parent.\n{ent}") - @ staticmethod - def create_flat_list(ent_list: list[db.Entity], flat: Optional[list[db.Entity]] = None): + def combine_fragments(self): + for se in self.se: + if len(se.fragments) < 2: + continue + for ent in se.fragments[1:]: + merge_entities(se.fragments[0], ent, merge_id_with_resolved_entity=True) + se.fragments = [se.fragments[0]] + + @staticmethod + def _create_flat_list(ent_list: list[db.Entity], flat: Optional[list[db.Entity]] = None): """ Recursively adds entities and all their properties contained in ent_list to the output list flat. 
@@ -370,14 +296,14 @@ class SemanticTarget(): if isinstance(el, db.Entity): if el not in flat: flat.append(el) - SemanticTarget.create_flat_list([el], flat) + SemanticTarget._create_flat_list([el], flat) elif isinstance(p.value, db.Entity): if p.value not in flat: flat.append(p.value) - SemanticTarget.create_flat_list([p.value], flat) + SemanticTarget._create_flat_list([p.value], flat) return flat - @ staticmethod + @staticmethod def _treat_merge_error_of(newrecord, record): """ The parameters are two entities that cannot be merged with the merge_entities function. @@ -424,9 +350,108 @@ class SemanticTarget(): f"{record}\n{newrecord}") raise RuntimeError("Cannot merge Entities") - def combine_fragments(self): - for se in self.se: + @staticmethod + def _create_reference_mapping(flat: list[SemanticEntity], se_lookup): + """ + TODO update docstring + Create a dictionary of dictionaries of the form: + dict[int, dict[str, list[Union[int,None]]]] + + - The integer index is the Python id of the value object. + - The string is the name of the first parent of the referencing object. + + Each value objects is taken from the values of all properties from the list flat. + + So the returned mapping maps ids of entities to the ids of objects which are referring + to them. + """ + # TODO we need to treat children of RecordTypes somehow. 
+ forward_references: dict[str, set[SemanticEntity]] = {} + backward_references: dict[str, set[SemanticEntity]] = {} + forward_id_references: dict[str, set[SemanticEntity]] = {} + backward_id_references: dict[str, set[SemanticEntity]] = {} + forward_id_referenced_by: dict[str, set[SemanticEntity]] = {} + backward_id_referenced_by: dict[str, set[SemanticEntity]] = {} + + # initialize with empty lists/dict + for se in flat: for ent in se.fragments: - # TODO - newrecord = self.treated_records_lookup.get_any(record, identifiable) - merge_entities(newrecord, record, merge_id_with_resolved_entity=True) + forward_references[se.uuid] = set() + backward_references[se.uuid] = set() + forward_id_references[se.uuid] = set() + backward_id_references[se.uuid] = set() + forward_id_referenced_by[se.uuid] = set() + backward_id_referenced_by[se.uuid] = set() + for se in flat: + for ent in se.fragments: + for p in ent.properties: + val = p.value + if not isinstance(val, list): + val = [val] + for v in val: + if isinstance(v, db.Entity): + vse = se_lookup[id(v)] + forward_references[se.uuid].add(vse) + backward_references[vse.uuid].add(se) + if len([el.name for el in se.registered_identifiable.properties if + el.name == p.name]) > 0: + forward_id_references[se.uuid].add(vse) + backward_id_references[vse.uuid].add(se) + if IdentifiableAdapter.referencing_entity_has_appropriate_type( + ent.parents, vse.registered_identifiable): + forward_id_referenced_by[se.uuid].add(vse) + backward_id_referenced_by[vse.uuid].add(se) + + return (forward_references, backward_references, forward_id_references, + backward_id_references, forward_id_referenced_by, backward_id_referenced_by, + ) + + def _mark_entities_with_path_or_id(self): + """ A path or an ID is sufficiently identifying. 
Thus, those entities can be marked as
+        checked """
+        for semantic_entity in self.se:
+            assert len(semantic_entity.fragments) == 1
+            entity = semantic_entity.fragments[0]
+            if entity.id is None and entity.path is None:
+                continue
+            if entity.path is not None:
+                try:
+                    existing = cached_get_entity_by(path=entity.path)
+                except EmptyUniqueQueryError:
+                    existing = None
+                if existing is not None:
+                    semantic_entity.identify_with(existing)
+
+            treated_before = self.get_equivalent(semantic_entity)
+            if treated_before is None:
+                if semantic_entity.id is None:
+                    self.add_to_missing(semantic_entity)
+                else:
+                    self.add_to_existing(semantic_entity)
+            else:
+                self.merge_into(semantic_entity, treated_before)
+
+    @staticmethod
+    def bend_references_to_new_object(old, new, entities):
+        """ Bend references to the other object
+        Iterate over all entities in `entities` and check the values of all properties of
+        occurances of old Entity and replace them with new Entity
+        """
+        for el in entities:
+            for p in el.properties:
+                if isinstance(p.value, list):
+                    for index, val in enumerate(p.value):
+                        if val is old:
+                            p.value[index] = new
+                else:
+                    if p.value is old:
+                        p.value = new
+
+    def _add_any(self, entity: SemanticEntity, lookup):
+        if entity.id is not None:
+            self._id_look_up[entity.id] = entity
+        if entity.path is not None:
+            self._path_look_up[entity.path] = entity
+        if entity.identifiable is not None:
+            self._identifiable_look_up[entity.identifiable.get_representation()] = entity
+        lookup[id(entity)] = entity
diff --git a/src/caoscrawler/treated_record_lookup.py b/src/caoscrawler/treated_record_lookup.py
deleted file mode 100644
index 59bbd2df97919dffd0087f9efb7ce8ac9654a3b9..0000000000000000000000000000000000000000
--- a/src/caoscrawler/treated_record_lookup.py
+++ /dev/null
@@ -1,130 +0,0 @@
-#!/usr/bin/env python3
-# encoding: utf-8
-#
-# This file is a part of the LinkAhead Project.
-# -# Copyright (C) 2024 Henrik tom Wörden -# -# This program is free software: you can redistribute it and/or modify -# it under the terms of the GNU Affero General Public License as -# published by the Free Software Foundation, either version 3 of the -# License, or (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU Affero General Public License for more details. -# -# You should have received a copy of the GNU Affero General Public License -# along with this program. If not, see <https://www.gnu.org/licenses/>. -# - -from typing import Any, List, Optional, Union - -import linkahead as db - -from .identifiable import Identifiable - - -class TreatedRecordLookUp(): - """tracks Records and Identifiables for which it was checked whether they exist in the remote - server - - For a given Record it can be checked, whether it exists in the remote sever if - - it has a (valid) ID - - it has a (valid) path (FILEs only) - - an identifiable can be created for the Record. - - Records are added by calling the `add` function and they are then added to the internal - existing or missing list depending on whether the Record has a valid ID. - Additionally, the Record is added to three look up dicts. The keys of those are paths, IDs and - the representation of the identifiables. - - The extreme case, that one could imagine, would be that the same Record occurs three times as - different Python objects: one that only has an ID, one with only a path and one without ID and - path but with identifying properties. During `split_into_inserts_and_updates` all three - must be identified with each other (and must be merged). Since we require, that treated - entities have a valid ID if they exist in the remote server, all three objects would be - identified with each other simply using the IDs. 
- - In the case that the Record is not yet in the remote server, there cannot be a Python object - with an ID. Thus we might have one with a path and one with an identifiable. If that Record - does not yet exist, it is necessary that both Python objects have at least either the path or - the identifiable in common. - """ - - def __init__(self): - self._id_look_up: dict[int, db.Entity] = {} - self._path_look_up: dict[str, db.Entity] = {} - self._identifiable_look_up: dict[str, db.Entity] = {} - self.remote_missing_counter = -1 - self._missing: dict[int, db.Entity] = {} - self._existing: dict[int, db.Entity] = {} - - def add(self, record: db.Entity, identifiable: Optional[Identifiable] = None): - """ - Add a Record that was treated, such that it is contained in the internal look up dicts - - This Record MUST have an ID if it was found in the remote server. - """ - if record.id is None: - if record.path is None and identifiable is None: - raise RuntimeError("Record must have ID or path or an identifiable must be given." - f"Record is\n{record}") - record.id = self.remote_missing_counter - self.remote_missing_counter -= 1 - self._add_any(record, self._missing, identifiable) - else: - self._add_any(record, self._existing, identifiable) - - def get_any(self, record: db.Entity = None, identifiable: Optional[Identifiable] = None): - """ - Check whether this Record was already added. 
Identity is based on ID, path or Identifiable - represenation - """ - if record is not None and record.id is not None and record.id in self._id_look_up: - return self._id_look_up[record.id] - if record is not None and record.path is not None and record.path in self._path_look_up: - return self._path_look_up[record.path] - if (identifiable is not None and identifiable.get_representation() in - self._identifiable_look_up): - return self._identifiable_look_up[identifiable.get_representation()] - - def get_existing(self, record: db.Entity, identifiable: Optional[Identifiable] = None): - """ Check whether this Record exists on the remote server - - Returns: The stored Record - """ - rec = self.get_any(record, identifiable) - if id(rec) in self._existing: - return rec - else: - return None - - def get_missing(self, record: db.Entity, identifiable: Optional[Identifiable] = None): - """ Check whether this Record is missing on the remote server - - Returns: The stored Record - """ - rec = self.get_any(record, identifiable) - if id(rec) in self._missing: - return rec - else: - return None - - def get_missing_list(self): - """ Return all Records that are missing in the remote server """ - return list(self._missing.values()) - - def get_existing_list(self): - """ Return all Records that exist in the remote server """ - return list(self._existing.values()) - - def _add_any(self, record: db.Entity, lookup, identifiable: Optional[Identifiable] = None): - if record.id is not None: - self._id_look_up[record.id] = record - if record.path is not None: - self._path_look_up[record.path] = record - if identifiable is not None: - self._identifiable_look_up[identifiable.get_representation()] = record - lookup[id(record)] = record diff --git a/unittests/test_crawler.py b/unittests/test_crawler.py index caaa4cce2e5704d796166773a3e04310187c6e7b..2afc1c76bde0d1d61959aaed12b6deabe5ace86a 100644 --- a/unittests/test_crawler.py +++ b/unittests/test_crawler.py @@ -52,7 +52,6 @@ from 
caoscrawler.semantic_target import SemanticTarget from caoscrawler.stores import GeneralStore, RecordStore from caoscrawler.structure_elements import (DictElement, DictListElement, DictTextElement, File) -from caoscrawler.treated_record_lookup import TreatedRecordLookUp from linkahead.apiutils import compare_entities from linkahead.cached import cache_clear from linkahead.exceptions import EmptyUniqueQueryError @@ -256,13 +255,16 @@ def test_remove_unnecessary_updates(): def test_split_into_inserts_and_updates_trivial(): crawler = Crawler() - crawler.split_into_inserts_and_updates([]) + st = SemanticTarget([], crawler.identifiableAdapter) + crawler.split_into_inserts_and_updates(st) def test_split_into_inserts_and_updates_unidentified(): crawler = Crawler() + st = SemanticTarget([db.Record(name="recname").add_parent("someparent")], + crawler.identifiableAdapter) with raises(ValueError) as err: - crawler.split_into_inserts_and_updates([db.Record(name="recname").add_parent("someparent")]) + crawler.split_into_inserts_and_updates(st) assert str(err.value).startswith("There is no identifying information.") @@ -299,8 +301,8 @@ def test_split_into_inserts_and_updates_single(crawler_mocked_identifiable_retri db.Record(name="B").add_parent("C")] st = SemanticTarget(entlist, crawler.identifiableAdapter) - assert st.treated_records_lookup.get_any(entlist[0], identlist[0]) is None - assert st.treated_records_lookup.get_any(entlist[0], identlist[0]) is None + assert st.get_equivalent(st.se[0]) is None + assert st.get_equivalent(st.se[0]) is None assert not st.identity_relies_on_unchecked_entity(st.se[0]) assert not st.identity_relies_on_unchecked_entity(st.se[1]) assert crawler.identifiableAdapter.retrieve_identified_record_for_record( @@ -326,7 +328,8 @@ def test_split_into_inserts_and_updates_with_duplicate(crawler_mocked_identifiab # This is identical to a and should be removed c = db.Record(name="A").add_parent("C") entlist = [a, b, c] - insert, update = 
crawler.split_into_inserts_and_updates(deepcopy(entlist)) + st = SemanticTarget(entlist, crawler.identifiableAdapter) + insert, update = crawler.split_into_inserts_and_updates(st) assert len(insert) == 1 assert insert[0].name == "B" assert len(update) == 1 @@ -343,7 +346,8 @@ def test_split_into_inserts_and_updates_with_ref(crawler_mocked_identifiable_ret b = db.Record(name="B").add_parent("C") b.add_property("A", a) entlist = [a, b] - insert, update = crawler.split_into_inserts_and_updates(entlist) + st = SemanticTarget(entlist, crawler.identifiableAdapter) + insert, update = crawler.split_into_inserts_and_updates(st) assert len(insert) == 1 assert insert[0].name == "B" assert len(update) == 1 @@ -378,7 +382,8 @@ def test_split_into_inserts_and_updates_with_complex(crawler_mocked_identifiable b.add_property("A", f) b.add_property("A", a) entlist = [a, b, g] - insert, update = crawler.split_into_inserts_and_updates(entlist) + st = SemanticTarget(entlist, crawler.identifiableAdapter) + insert, update = crawler.split_into_inserts_and_updates(st) assert len(insert) == 3 assert "B" in [el.name for el in insert] assert len(update) == 1 @@ -398,7 +403,8 @@ def test_split_into_inserts_and_updates_with_copy_attr(crawler_mocked_identifiab b = db.Record(name="A").add_parent("C") b.add_property("bar", 2) entlist = [a, b] - insert, update = crawler.split_into_inserts_and_updates(entlist) + st = SemanticTarget(entlist, crawler.identifiableAdapter) + insert, update = crawler.split_into_inserts_and_updates(st) assert update[0].get_property("bar").value == 2 assert update[0].get_property("foo").value == 1 @@ -1064,7 +1070,7 @@ def test_replace_name_with_referenced_entity(): def test_treated_record_lookup(): - trlu = TreatedRecordLookUp() + trlu = SemanticTarget() exist = db.Record(id=1) trlu.add(exist) assert len(trlu._existing) == 1