diff --git a/CHANGELOG.md b/CHANGELOG.md
index 65547e6b8017e53d711955f9f9fbee00dc739b86..33cd6e06a51a1dcec534e3817a99b4f1a509ce2f 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -8,26 +8,47 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 ## [Unreleased] ##
 
 ### Added ###
-* 'transform' sections can be added to a CFood to apply functions to values stored in variables.
+
+* `transform` sections can be added to a CFood to apply functions to values stored in variables.
 * default transform functions: submatch, split and replace.
 * `*` can now be used as a wildcard in the identifiables parameter file to denote that any Record may reference the identified one.
+* `crawl.TreatedRecordLookUp` class replacing the old (and slow)
+  `identified_cache` module. The new class now handles all records identified by
+  id, path, or identifiable simultaneously. See API docs for more info on how to
+  add to and get from the new lookup class.
+* `identifiable_adapters.IdentifiableAdapter.get_identifying_referencing_entities`
+  and
+  `identifiable_adapters.IdentifiableAdapter.get_identifying_referenced_entities`
+  static methods to return the referencing or referenced entities belonging to a
+  registered identifiable, respectively.
 
 ### Changed ###
-- If the `parents` key is used in a cfood at a lower level for a Record that
+
+* If the `parents` key is used in a cfood at a lower level for a Record that
   already has a Parent (because it was explicitly given or the default Parent),
   the old Parent(s) are now overwritten with the value belonging to the
   `parents` key.
-- If a registered identifiable states, that a reference by a Record with parent
+* If a registered identifiable states that a reference by a Record with parent
   RT1 is needed, then now also references from Records that have a child of RT1
   as parent are accepted.
-- More aggressive caching.
+* More aggressive caching.
+* The `identifiable_adapters.IdentifiableAdapter` now creates (possibly empty)
+  reference lists for all records in `create_reference_mapping`. This allows
+  functions like `get_identifiable` to be called with only the subset of the
+  referencing entities that belongs to a specific Record.
+* The `identifiable_adapters.IdentifiableAdapter` uses entity ids (negative for
+  entities that don't exist remotely) instead of entity objects for keeping
+  track of references.
 
 ### Deprecated ###
-- `IdentifiableAdapter.get_file`
+
+* `IdentifiableAdapter.get_file`
 
 ### Removed ###
+* `identified_cache` module, which was replaced by the `crawl.TreatedRecordLookUp` class.
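The `Added` and `Removed` entries above describe the new lookup class only briefly. The following minimal usage sketch is distilled from the `test_treated_record_lookup` unit test further down in this diff; the record IDs, the identifiable name, and the file path are purely illustrative.

```python
import caosdb as db

from caoscrawler.crawl import TreatedRecordLookUp
from caoscrawler.identifiable import Identifiable

lookup = TreatedRecordLookUp()

# A record with a valid server ID goes onto the "existing" list and can be
# retrieved again via its ID.
exist = db.Record(id=1)
lookup.add(exist)
assert lookup.get_existing(db.Record(id=1)) is exist

# A record without ID or path must be accompanied by an identifiable; it goes
# onto the "missing" list and receives a negative placeholder ID internally.
miss = db.Record()
lookup.add(miss, Identifiable(name="a"))
assert lookup.get_missing(db.Record(), Identifiable(name="a")) is miss

# Files can additionally be looked up by their path.
fi = db.File(path="some/path", id=2)
lookup.add(fi)
assert lookup.get_any(db.File(path="some/path")) is fi

# get_missing_list / get_existing_list return everything treated so far.
assert miss in lookup.get_missing_list()
assert exist in lookup.get_existing_list() and fi in lookup.get_existing_list()
```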
+
 ### Fixed ###
 * Empty Records can now be created (https://gitlab.com/caosdb/caosdb-crawler/-/issues/27)
@@ -40,6 +61,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
   handles cases correctly in which entities retrieved from the server have to
   be merged with local entities that both reference another, already existing
   entity
+* A corner case in `split_into_inserts_and_updates` whereby two records created
+  in different places in the cfood definition would not be merged if both were
+  identified by the same LinkAhead id
 
 ### Security ###
 
diff --git a/src/caoscrawler/crawl.py b/src/caoscrawler/crawl.py
index 1c6968d76257109cc24458a048d6d41ea64d2e5f..d30472c9aa9745fb985358f86015f01286a41680 100644
--- a/src/caoscrawler/crawl.py
+++ b/src/caoscrawler/crawl.py
@@ -64,7 +64,6 @@ from .identifiable import Identifiable
 from .identifiable_adapters import (CaosDBIdentifiableAdapter,
                                     IdentifiableAdapter,
                                     LocalStorageIdentifiableAdapter)
-from .identified_cache import IdentifiedCache
 from .logging import configure_server_side_logging
 from .macros import defmacro_constructor, macro_constructor
 from .scanner import (create_converter_registry, initialize_converters,
@@ -220,6 +219,110 @@ class SecurityMode(Enum):
     UPDATE = 2
 
 
+class TreatedRecordLookUp():
+    """Tracks Records and Identifiables for which it was checked whether they exist in the
+    remote server.
+
+    For a given Record it can be checked whether it exists in the remote server if
+    - it has a (valid) ID
+    - it has a (valid) path (FILEs only)
+    - an identifiable can be created for the Record.
+
+    Records are added by calling the `add` function; they are then put on the internal
+    existing or missing list, depending on whether the Record has a valid ID.
+    Additionally, the Record is added to three look-up dicts whose keys are paths, IDs and
+    the representations of the identifiables.
+
+    The extreme case one could imagine is that the same Record occurs three times as
+    different Python objects: one that only has an ID, one with only a path and one without
+    ID and path but with identifying properties. During `split_into_inserts_and_updates` all
+    three must be identified with each other (and must be merged). Since we require that
+    treated entities have a valid ID if they exist in the remote server, all three objects
+    would be identified with each other simply by their IDs.
+
+    If the Record is not yet in the remote server, there cannot be a Python object with an
+    ID. Thus we might have one object with a path and one with an identifiable. Since that
+    Record does not yet exist, the two Python objects must have at least either the path or
+    the identifiable in common.
+    """
+
+    def __init__(self):
+        self._id_look_up: dict[int, db.Entity] = {}
+        self._path_look_up: dict[str, db.Entity] = {}
+        self._identifiable_look_up: dict[str, db.Entity] = {}
+        self.remote_missing_counter = -1
+        self._missing: dict[int, db.Entity] = {}
+        self._existing: dict[int, db.Entity] = {}
+
+    def add(self, record: db.Entity, identifiable: Optional[Identifiable] = None):
+        """
+        Add a Record that was treated, such that it is contained in the internal look-up dicts.
+
+        This Record MUST have an ID if it was found in the remote server.
+        """
+        if record.id is None:
+            if record.path is None and identifiable is None:
+                raise RuntimeError("Record must have ID or path or an identifiable must be given."
+ f"Record is\n{record}") + record.id = self.remote_missing_counter + self.remote_missing_counter -= 1 + self._add_any(record, self._missing, identifiable) + else: + self._add_any(record, self._existing, identifiable) + + def get_any(self, record: db.Entity, identifiable: Optional[Identifiable] = None): + """ + Check whether this Record was already added. Identity is based on ID, path or Identifiable + represenation + """ + if record.id is not None and record.id in self._id_look_up: + return self._id_look_up[record.id] + if record.path is not None and record.path in self._path_look_up: + return self._path_look_up[record.path] + if (identifiable is not None and identifiable.get_representation() in + self._identifiable_look_up): + return self._identifiable_look_up[identifiable.get_representation()] + + def get_existing(self, record: db.Entity, identifiable: Optional[Identifiable] = None): + """ Check whether this Record exists on the remote server + + Returns: The stored Record + """ + rec = self.get_any(record, identifiable) + if id(rec) in self._existing: + return rec + else: + return None + + def get_missing(self, record: db.Entity, identifiable: Optional[Identifiable] = None): + """ Check whether this Record is missing on the remote server + + Returns: The stored Record + """ + rec = self.get_any(record, identifiable) + if id(rec) in self._missing: + return rec + else: + return None + + def get_missing_list(self): + """ Return all Records that are missing in the remote server """ + return list(self._missing.values()) + + def get_existing_list(self): + """ Return all Records that exist in the remote server """ + return list(self._existing.values()) + + def _add_any(self, record: db.Entity, lookup, identifiable: Optional[Identifiable] = None): + if record.id is not None: + self._id_look_up[record.id] = record + if record.path is not None: + self._path_look_up[record.path] = record + if identifiable is not None: + self._identifiable_look_up[identifiable.get_representation()] = record + lookup[id(record)] = record + + class Crawler(object): """ Crawler class that encapsulates crawling functions. @@ -256,8 +359,8 @@ class Crawler(object): # The following caches store records, where we checked whether they exist on the remote # server. Since, it is important to know whether they exist or not, we store them into two # different caches. - self.remote_existing_cache = IdentifiedCache() - self.remote_missing_cache = IdentifiedCache() + self.treated_records_lookup = TreatedRecordLookUp() + # TODO does it make sense to have this as member variable? self.securityMode = securityMode # TODO does it make sense to have this as member variable(run_id)? @@ -404,7 +507,7 @@ class Crawler(object): Crawler.create_flat_list([p.value], flat) return flat - def _has_missing_object_in_references(self, ident: Identifiable, referencing_entities: list): + def _has_missing_object_in_references(self, ident: Identifiable, referencing_entities: dict): """ returns False if any value in the properties attribute is a db.Entity object that is contained in the `remote_missing_cache`. 
If ident has such an object in @@ -417,16 +520,21 @@ class Crawler(object): # Entity instead of ID and not cached locally if (isinstance(pvalue, list)): for el in pvalue: - if (isinstance(el, db.Entity) and self.get_from_remote_missing_cache( - self.identifiableAdapter.get_identifiable(el, referencing_entities)) is not None): + elident = self.identifiableAdapter.get_identifiable( + el, referencing_entities[id(el)]) + if (isinstance(el, db.Entity) + and self.treated_records_lookup.get_missing(el, elident) is not None): return True - if (isinstance(pvalue, db.Entity) and self.get_from_remote_missing_cache( - self.identifiableAdapter.get_identifiable(pvalue, referencing_entities)) is not None): + if (isinstance(pvalue, db.Entity) and self.treated_records_lookup.get_missing( + pvalue, + self.identifiableAdapter.get_identifiable(pvalue, + referencing_entities[id(pvalue)]) + ) is not None): # might be checked when reference is resolved return True return False - def replace_references_with_cached(self, record: db.Record, referencing_entities: list): + def replace_references_with_cached(self, record: db.Record, referencing_entities: dict): """ Replace all references with the versions stored in the cache. @@ -437,8 +545,10 @@ class Crawler(object): lst = [] for el in p.value: if (isinstance(el, db.Entity) and el.id is None): - cached = self.get_from_any_cache( - self.identifiableAdapter.get_identifiable(el, referencing_entities)) + cached = self.treated_records_lookup.get_any( + el, + self.identifiableAdapter.get_identifiable( + el, referencing_entities[id(el)])) if cached is None: lst.append(el) continue @@ -459,8 +569,9 @@ class Crawler(object): lst.append(el) p.value = lst if (isinstance(p.value, db.Entity) and p.value.id is None): - cached = self.get_from_any_cache( - self.identifiableAdapter.get_identifiable(p.value, referencing_entities)) + cached = self.treated_records_lookup.get_any( + p.value, self.identifiableAdapter.get_identifiable( + p.value, referencing_entities[id(p.value)])) if cached is None: continue if not check_identical(cached, p.value, True): @@ -477,64 +588,6 @@ class Crawler(object): ) p.value = cached - def get_from_remote_missing_cache(self, identifiable: Identifiable): - """ - returns the identified record if an identifiable with the same values already exists locally - (Each identifiable that is not found on the remote server, is 'cached' locally to prevent - that the same identifiable exists twice) - """ - if identifiable is None: - raise ValueError("Identifiable has to be given as argument") - - if identifiable in self.remote_missing_cache: - return self.remote_missing_cache[identifiable] - else: - return None - - def get_from_any_cache(self, identifiable: Identifiable): - """ - returns the identifiable if an identifiable with the same values already exists locally - (Each identifiable that is not found on the remote server, is 'cached' locally to prevent - that the same identifiable exists twice) - """ - if identifiable is None: - raise ValueError("Identifiable has to be given as argument") - - if identifiable in self.remote_existing_cache: - return self.remote_existing_cache[identifiable] - elif identifiable in self.remote_missing_cache: - return self.remote_missing_cache[identifiable] - else: - return None - - def add_to_remote_missing_cache(self, record: db.Record, identifiable: Identifiable): - """ - stores the given Record in the remote_missing_cache. - - If identifiable is None, the Record is NOT stored. 
- """ - self.add_to_cache(record=record, cache=self.remote_missing_cache, - identifiable=identifiable) - - def add_to_remote_existing_cache(self, record: db.Record, identifiable: Identifiable): - """ - stores the given Record in the remote_existing_cache. - - If identifiable is None, the Record is NOT stored. - """ - self.add_to_cache(record=record, cache=self.remote_existing_cache, - identifiable=identifiable) - - def add_to_cache(self, record: db.Record, cache: IdentifiedCache, - identifiable: Identifiable) -> None: - """ - stores the given Record in the given cache. - - If identifiable is None, the Record is NOT stored. - """ - if identifiable is not None: - cache.add(identifiable=identifiable, record=record) - @staticmethod def bend_references_to_new_object(old, new, entities): """ Bend references to the other object @@ -551,23 +604,70 @@ class Crawler(object): if p.value is old: p.value = new + def _merge_identified(self, newrecord, record, try_to_merge_later, all_records): + """ tries to merge record into newrecord + + If it fails, record is added to the try_to_merge_later list. + In any case, references are bent to the newrecord object. + + """ + try: + merge_entities( + newrecord, record, merge_references_with_empty_diffs=False, merge_id_with_resolved_entity=True) + except EntityMergeConflictError: + _treat_merge_error_of(newrecord, record) + # We cannot merge but it is none of the clear case where merge is + # impossible. Thus we try later + try_to_merge_later.append(record) + if newrecord.id is not None: + record.id = newrecord.id + except NotImplementedError: + print(newrecord) + print(record) + raise + Crawler.bend_references_to_new_object( + old=record, new=newrecord, + entities=all_records + ) + + def _identity_relies_on_unchecked_entities(self, record: db.Record, referencing_entities): + """ + If a record for which it could not yet be verified whether it exists in LA or not is part + of the identifying properties, this returns True, otherwise False + """ + + registered_identifiable = self.identifiableAdapter.get_registered_identifiable(record) + refs = self.identifiableAdapter.get_identifying_referencing_entities(referencing_entities, + registered_identifiable) + if any([el is None for el in refs]): + return True + + refs = self.identifiableAdapter.get_identifying_referenced_entities( + record, registered_identifiable) + if any([self.treated_records_lookup.get_any(el) is None for el in refs]): + return True + + return False + @staticmethod def create_reference_mapping(flat: list[db.Entity]): """ Create a dictionary of dictionaries of the form: - dict[int, dict[str, list[db.Entity]]] + dict[int, dict[str, list[Union[int,None]]]] - The integer index is the Python id of the value object. - The string is the name of the first parent of the referencing object. Each value objects is taken from the values of all properties from the list flat. - So the returned mapping maps ids of entities to the objects which are referring + So the returned mapping maps ids of entities to the ids of objects which are referring to them. """ # TODO we need to treat children of RecordTypes somehow. 
- references: dict[int, dict[str, list[db.Entity]]] = {} + references: dict[int, dict[str, list[Union[int, None]]]] = {} for ent in flat: + if id(ent) not in references: + references[id(ent)] = {} for p in ent.properties: val = p.value if not isinstance(val, list): @@ -578,125 +678,111 @@ class Crawler(object): references[id(v)] = {} if ent.parents[0].name not in references[id(v)]: references[id(v)][ent.parents[0].name] = [] - references[id(v)][ent.parents[0].name].append(ent) + references[id(v)][ent.parents[0].name].append(ent.id) return references def split_into_inserts_and_updates(self, ent_list: list[db.Entity]): - to_be_inserted: list[db.Entity] = [] - to_be_updated: list[db.Entity] = [] flat = Crawler.create_flat_list(ent_list) + all_records = list(flat) # TODO: can the following be removed at some point for ent in flat: if ent.role == "Record" and len(ent.parents) == 0: raise RuntimeError(f"Records must have a parent.\n{ent}") + # Check whether Records can be identified without identifiable + for i in reversed(range(len(flat))): + record = flat[i] + # 1. Can it be identified via an ID? + if record.id is not None: + treated_record = self.treated_records_lookup.get_existing(record) + if treated_record is not None: + self._merge_identified(treated_record, record, try_to_merge_later, all_records) + referencing_entities = self.create_reference_mapping(all_records) + else: + self.treated_records_lookup.add(record, None) + del flat[i] + # 2. Can it be identified via a path? + elif record.path is not None: + try: + existing = cached_get_entity_by(path=record.path) + except EmptyUniqueQueryError: + existing = None + if existing is not None: + record.id = existing.id + # TODO check the following copying of _size and _checksum + # Copy over checksum and size too if it is a file + record._size = existing._size + record._checksum = existing._checksum + treated_record = self.treated_records_lookup.get_any(record) + if treated_record is not None: + self._merge_identified(treated_record, record, try_to_merge_later, all_records) + referencing_entities = self.create_reference_mapping(all_records) + else: + # TODO add identifiable if possible + self.treated_records_lookup.add(record, None) + del flat[i] + resolved_references = True # flat contains Entities which could not yet be checked against the remote server try_to_merge_later = [] while resolved_references and len(flat) > 0: resolved_references = False referencing_entities = self.create_reference_mapping( - flat + to_be_updated + try_to_merge_later + to_be_inserted) + all_records) # For each element we try to find out whether we can find it in the server or whether # it does not yet exist. Since a Record may reference other unkown Records it might not # be possible to answer this right away. # The following checks are done on each Record: - # 1. Can it be identified via an ID? - # 2. Can it be identified via a path? - # 3. Is it in the cache of already checked Records? - # 4. Can it be checked on the remote server? - # 5. Does it have to be new since a needed reference is missing? + # 1. Is it in the cache of already checked Records? + # 2. Can it be checked on the remote server? + # 3. Does it have to be new since a needed reference is missing? 
for i in reversed(range(len(flat))): record = flat[i] + + if self._identity_relies_on_unchecked_entities(record, + referencing_entities[id(record)]): + continue + identifiable = self.identifiableAdapter.get_identifiable( record, - referencing_entities=referencing_entities) - - # TODO remove if the exception is never raised - if record in to_be_inserted: - raise RuntimeError("This should not be reached since treated elements" - "are removed from the list") - # 1. Can it be identified via an ID? - elif record.id is not None: - to_be_updated.append(record) - self.add_to_remote_existing_cache(record, identifiable) - del flat[i] - # 2. Can it be identified via a path? - elif record.path is not None: - try: - existing = cached_get_entity_by(path=record.path) - except EmptyUniqueQueryError: - existing = None - if existing is None: - to_be_inserted.append(record) - self.add_to_remote_missing_cache(record, identifiable) - del flat[i] - else: - record.id = existing.id - # TODO check the following copying of _size and _checksum - # Copy over checksum and size too if it is a file - record._size = existing._size - record._checksum = existing._checksum - to_be_updated.append(record) - self.add_to_remote_existing_cache(record, identifiable) - del flat[i] - # 3. Is it in the cache of already checked Records? - elif self.get_from_any_cache(identifiable) is not None: - newrecord = self.get_from_any_cache(identifiable) - # Since the identifiables are the same, newrecord and record actually describe + referencing_entities=referencing_entities[id(record)]) + + # 1. Is it in the cache of already checked Records? + if self.treated_records_lookup.get_any(record, identifiable) is not None: + treated_record = self.treated_records_lookup.get_any(record, identifiable) + # Since the identifiables are the same, treated_record and record actually describe # the same obejct. - # We merge the two in order to prevent loss of information - try: - merge_entities( - newrecord, record, merge_references_with_empty_diffs=False, merge_id_with_resolved_entity=True) - except EntityMergeConflictError: - _treat_merge_error_of(newrecord, record) - # We cannot merge but it is none of the clear case where merge is - # impossible. Thus we try later - try_to_merge_later.append(record) - if newrecord.id is not None: - record.id = newrecord.id - except NotImplementedError: - print(newrecord) - print(record) - raise - Crawler.bend_references_to_new_object( - old=record, new=newrecord, - entities=flat + to_be_updated + to_be_inserted + try_to_merge_later - ) - referencing_entities = self.create_reference_mapping( - flat + to_be_updated + try_to_merge_later + to_be_inserted) + # We merge record into treated_record in order to prevent loss of information + self._merge_identified(treated_record, record, try_to_merge_later, all_records) + referencing_entities = self.create_reference_mapping(all_records) del flat[i] resolved_references = True - # 4. Can it be checked on the remote server? + # 2. Can it be checked on the remote server? 
elif not self._has_reference_value_without_id(identifiable): identified_record = ( self.identifiableAdapter.retrieve_identified_record_for_identifiable( identifiable)) if identified_record is None: # identifiable does not exist remotely -> record needs to be inserted - to_be_inserted.append(record) - self.add_to_remote_missing_cache(record, identifiable) - del flat[i] + self.treated_records_lookup.add(record, identifiable) else: # side effect record.id = identified_record.id - to_be_updated.append(record) - self.add_to_remote_existing_cache(record, identifiable) - del flat[i] + record.path = identified_record.path + self.treated_records_lookup.add(record, identifiable) + del flat[i] resolved_references = True - # 5. Does it have to be new since a needed reference is missing? + # 3. Does it have to be new since a needed reference is missing? # (Is it impossible to check this record because an identifiable references a # missing record?) elif self._has_missing_object_in_references(identifiable, referencing_entities): - to_be_inserted.append(record) - self.add_to_remote_missing_cache(record, identifiable) + self.treated_records_lookup.add(record, identifiable) del flat[i] resolved_references = True @@ -708,8 +794,8 @@ class Crawler(object): for record in try_to_merge_later: identifiable = self.identifiableAdapter.get_identifiable( record, - referencing_entities=referencing_entities) - newrecord = self.get_from_any_cache(identifiable) + referencing_entities=referencing_entities[id(record)]) + newrecord = self.treated_records_lookup.get_any(record, identifiable) merge_entities(newrecord, record, merge_id_with_resolved_entity=True) if len(flat) > 0: circle = self.detect_circular_dependency(flat) @@ -724,7 +810,16 @@ class Crawler(object): f"Could not finish split_into_inserts_and_updates. Circular dependency: " f"{circle is not None}") - return to_be_inserted, to_be_updated + # remove negative IDs + missing = self.treated_records_lookup.get_missing_list() + for el in missing: + if el.id is None: + raise RuntimeError("This should not happen") # TODO remove + if el.id >= 0: + raise RuntimeError("This should not happen") # TODO remove + el.id = None + + return (missing, self.treated_records_lookup.get_existing_list()) def replace_entities_with_ids(self, rec: db.Record): for el in rec.properties: @@ -737,7 +832,7 @@ class Crawler(object): if val.id is not None: el.value[index] = val.id - @staticmethod + @ staticmethod def compact_entity_list_representation(circle): """ a more readable representation than the standard xml representation @@ -753,7 +848,7 @@ class Crawler(object): return text + "--------\n" - @staticmethod + @ staticmethod def detect_circular_dependency(flat: list[db.Entity]): """ Detects whether there are circular references in the given entity list and returns a list @@ -786,7 +881,7 @@ class Crawler(object): return None return circle - @staticmethod + @ staticmethod def _merge_properties_from_remote( crawled_data: list[db.Record], identified_records: list[db.Record] @@ -828,7 +923,7 @@ class Crawler(object): return to_be_updated - @staticmethod + @ staticmethod def remove_unnecessary_updates( crawled_data: list[db.Record], identified_records: list[db.Record] @@ -854,7 +949,7 @@ class Crawler(object): return actual_updates - @staticmethod + @ staticmethod def execute_parent_updates_in_list(to_be_updated, securityMode, run_id, unique_names): """ Execute the updates of changed parents. @@ -897,13 +992,13 @@ class Crawler(object): "mode. 
This might lead to a failure of inserts that follow.") logger.info(parent_updates) - @staticmethod + @ staticmethod def _get_property_id_for_datatype(rtname: str, name: str): return cached_get_entity_by( query=f"FIND Entity '{escape_squoted_text(rtname)}' " f"with name='{escape_squoted_text(name)}'").id - @staticmethod + @ staticmethod def replace_name_with_referenced_entity_id(prop: db.Property): """changes the given property in place if it is a reference property that has a name as value @@ -948,7 +1043,7 @@ class Crawler(object): propval.append(el) prop.value = propval - @staticmethod + @ staticmethod def execute_inserts_in_list(to_be_inserted, securityMode, run_id: Optional[uuid.UUID] = None, unique_names=True): @@ -968,7 +1063,7 @@ class Crawler(object): update_cache = UpdateCache() update_cache.insert(to_be_inserted, run_id, insert=True) - @staticmethod + @ staticmethod def set_ids_and_datatype_of_parents_and_properties(rec_list): for record in rec_list: for parent in record.parents: @@ -980,7 +1075,7 @@ class Crawler(object): prop.id = entity.id _resolve_datatype(prop, entity) - @staticmethod + @ staticmethod def execute_updates_in_list(to_be_updated, securityMode, run_id: Optional[uuid.UUID] = None, unique_names=True): @@ -994,7 +1089,7 @@ class Crawler(object): update_cache = UpdateCache() update_cache.insert(to_be_updated, run_id) - @staticmethod + @ staticmethod def check_whether_parent_exists(records: list[db.Entity], parents: list[str]): """ returns a list of all records in `records` that have a parent that is in `parents`""" problems = [] @@ -1051,7 +1146,6 @@ class Crawler(object): crawled_data = self.crawled_data to_be_inserted, to_be_updated = self.split_into_inserts_and_updates(crawled_data) - referencing_entities = self.create_reference_mapping(to_be_updated + to_be_inserted) for el in to_be_updated: # all entity objects are replaced by their IDs except for the not yet inserted ones @@ -1115,7 +1209,7 @@ class Crawler(object): return (to_be_inserted, to_be_updated) - @staticmethod + @ staticmethod def create_entity_summary(entities: list[db.Entity]): """ Creates a summary string reprensentation of a list of entities.""" parents = {} @@ -1134,7 +1228,7 @@ class Crawler(object): output = output[:-2] + "\n" return output - @staticmethod + @ staticmethod def inform_about_pending_changes(pending_changes, run_id, path, inserts=False): # Sending an Email with a link to a form to authorize updates is if get_config_setting("send_crawler_notifications"): @@ -1155,7 +1249,7 @@ ____________________\n""".format(i + 1, len(pending_changes)) + str(el[3])) + " by invoking the crawler" " with the run id: {rid}\n".format(rid=run_id)) - @staticmethod + @ staticmethod def debug_build_usage_tree(converter: Converter): res: dict[str, dict[str, Any]] = { converter.name: { diff --git a/src/caoscrawler/identifiable_adapters.py b/src/caoscrawler/identifiable_adapters.py index dd8c032041a74fa05b16d93abb06186e7e6fa569..069afc668561ad6ba56bd48d00c6eb2a3c0e70d6 100644 --- a/src/caoscrawler/identifiable_adapters.py +++ b/src/caoscrawler/identifiable_adapters.py @@ -186,6 +186,49 @@ identifiabel, identifiable and identified record) for a Record. 
""" pass + @staticmethod + def get_identifying_referencing_entities(referencing_entities, registered_identifiable): + refs = [] + for prop in registered_identifiable.properties: + if prop.name.lower() != "is_referenced_by": + continue + for givenrt in prop.value: + found = False + if givenrt == "*": + for val in referencing_entities.values(): + if len(val) > 0: + found = True + refs.extend(val) + else: + rt_and_children = get_children_of_rt(givenrt) + for rtname in rt_and_children: + if (rtname in referencing_entities): + refs.extend(referencing_entities[rtname]) + found = True + if not found: + raise NotImplementedError( + f"An identifying property:\n" + f"\nIdentifying PROPERTY\n{prop.name}" + ) + return refs + + @staticmethod + def get_identifying_referenced_entities(record, registered_identifiable): + refs = [] + for prop in registered_identifiable.properties: + pname = prop.name.lower() + if pname == "name" or pname == "is_referenced_by": + continue + if record.get_property(prop.name) is None: + raise RuntimeError("Missing identifying Property") + pval = record.get_property(prop.name).value + if not isinstance(prop.value, list): + pval = [prop.value] + for val in pval: + if isinstance(val, db.Entity): + refs.append(val) + return refs + def get_identifiable(self, record: db.Record, referencing_entities=None): """ retrieve the registered identifiable and fill the property values to create an @@ -193,7 +236,7 @@ identifiabel, identifiable and identified record) for a Record. Args: record: the record for which the Identifiable shall be created. - referencing_entities: a dictionary (Type: dict[int, dict[str, list[db.Entity]]]), that + referencing_entities: a dictionary (Type: dict[str, list[db.Entity]]), that allows to look up entities with a certain RecordType, that reference ``record`` Returns: @@ -212,6 +255,8 @@ identifiabel, identifiable and identified record) for a Record. name_is_identifying_property = False if registered_identifiable is not None: + identifiable_backrefs = self.get_identifying_referencing_entities( + referencing_entities, registered_identifiable) # fill the values: for prop in registered_identifiable.properties: if prop.name == "name": @@ -222,31 +267,8 @@ identifiabel, identifiable and identified record) for a Record. # case A: in the registered identifiable # case B: in the identifiable - # TODO: similar to the Identifiable class, Registered Identifiable should be a - # separate class too + # treated above if prop.name.lower() == "is_referenced_by": - for givenrt in prop.value: - found = False - if givenrt == "*": - if id(record) not in referencing_entities: - continue - for rt, rec in referencing_entities[id(record)].items(): - identifiable_backrefs.extend(rec) - found = True - else: - rt_and_children = get_children_of_rt(givenrt) - for rtname in rt_and_children: - if (id(record) in referencing_entities - and (rtname in referencing_entities[id(record)])): - identifiable_backrefs.extend( - referencing_entities[id(record)][rtname]) - found = True - if not found: - # TODO: is this the appropriate error? 
- raise NotImplementedError( - f"The following record is missing an identifying property:\n" - f"RECORD\n{record}\nIdentifying PROPERTY\n{prop.name}" - ) continue record_prop = record.get_property(prop.name) diff --git a/src/caoscrawler/identified_cache.py b/src/caoscrawler/identified_cache.py deleted file mode 100644 index aa2d82f8e66c738e737c62f3cc68eaf60127e28b..0000000000000000000000000000000000000000 --- a/src/caoscrawler/identified_cache.py +++ /dev/null @@ -1,65 +0,0 @@ -#!/usr/bin/env python3 -# encoding: utf-8 -# -# ** header v3.0 -# This file is a part of the CaosDB Project. -# -# Copyright (C) 2021 Indiscale GmbH <info@indiscale.com> -# Copyright (C) 2021 Henrik tom Wörden <h.tomwoerden@indiscale.com> -# -# This program is free software: you can redistribute it and/or modify -# it under the terms of the GNU Affero General Public License as -# published by the Free Software Foundation, either version 3 of the -# License, or (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU Affero General Public License for more details. -# -# You should have received a copy of the GNU Affero General Public License -# along with this program. If not, see <https://www.gnu.org/licenses/>. -# -# ** end header -# - - -""" -see class docstring -""" - -from .identifiable import Identifiable -import caosdb as db - - -class IdentifiedCache(object): - """ - This class is like a dictionary where the keys are Identifiables. When you check whether an - Identifiable exists as key this class returns True not only if that exact Python object is - used as a key, but if an Identifiable is used as key that is **equal** to the one being - considered (see __eq__ function of Identifiable). Similarly, if you do `cache[identifiable]` - you get the Record where the key is an Identifiable that is equal to the one in the rectangular - brackets. - - This class is used for Records where we checked the existence in a remote server using - identifiables. If the Record was found, this means that we identified the corresponding Record - in the remote server and the ID of the local object can be set. - To prevent querying the server again and again for the same objects, this cache allows storing - Records that were found on a remote server and those that were not (typically in separate - caches). 
- """ - - def __init__(self): - self._cache = {} - self._identifiables = [] - - def __contains__(self, identifiable: Identifiable): - return identifiable in self._identifiables - - def __getitem__(self, identifiable: db.Record): - index = self._identifiables.index(identifiable) - return self._cache[id(self._identifiables[index])] - - def add(self, record: db.Record, identifiable: Identifiable): - self._cache[id(identifiable)] = record - self._identifiables.append(identifiable) diff --git a/unittests/test_crawler.py b/unittests/test_crawler.py index fbf98346e59b0cbec88f17398eff41f26c423dee..5eecb10630bb478ff9070ad97eb5acca9a963fab 100644 --- a/unittests/test_crawler.py +++ b/unittests/test_crawler.py @@ -38,8 +38,9 @@ import caosdb as db import caosdb.common.models as dbmodels import pytest import yaml -from caoscrawler.crawl import (Crawler, SecurityMode, _treat_deprecated_prefix, - crawler_main, split_restricted_path) +from caoscrawler.crawl import (Crawler, SecurityMode, TreatedRecordLookUp, + _treat_deprecated_prefix, crawler_main, + split_restricted_path) from caoscrawler.debug_tree import DebugTree from caoscrawler.identifiable import Identifiable from caoscrawler.identifiable_adapters import (CaosDBIdentifiableAdapter, @@ -247,8 +248,8 @@ def test_split_into_inserts_and_updates_single(crawler_mocked_identifiable_retri entlist = [db.Record(name="A").add_parent( "C"), db.Record(name="B").add_parent("C")] - assert crawler.get_from_any_cache(identlist[0]) is None - assert crawler.get_from_any_cache(identlist[1]) is None + assert crawler.treated_records_lookup.get_any(entlist[0], identlist[0]) is None + assert crawler.treated_records_lookup.get_any(entlist[0], identlist[0]) is None assert not crawler._has_reference_value_without_id(identlist[0]) assert not crawler._has_reference_value_without_id(identlist[1]) assert crawler.identifiableAdapter.retrieve_identified_record_for_record( @@ -368,36 +369,40 @@ def test_has_missing_object_in_references(): # one reference with id -> check assert not crawler._has_missing_object_in_references( - Identifiable(name="C", record_type="RTC", properties={'d': 123}), []) + Identifiable(name="C", record_type="RTC", properties={'d': 123}), {}) # one ref with Entity with id -> check + rec = db.Record(id=123).add_parent("C") assert not crawler._has_missing_object_in_references( - Identifiable(name="C", record_type="RTC", properties={'d': db.Record(id=123) - .add_parent("C")}), []) + Identifiable(name="C", record_type="RTC", properties={'d': rec}), {id(rec): {'C': [None]}}) # one ref with id one with Entity with id (mixed) -> check + rec = db.Record(id=123).add_parent("RTC") assert not crawler._has_missing_object_in_references( Identifiable(name="C", record_type="RTD", - properties={'d': 123, 'b': db.Record(id=123).add_parent("RTC")}), []) + properties={'d': 123, 'b': rec}), {id(rec): {'C': [None]}}) # entity to be referenced in the following a = db.Record(name="C").add_parent("C").add_property("d", 12311) # one ref with id one with Entity without id (but not identifying) -> fail assert not crawler._has_missing_object_in_references( - Identifiable(name="C", record_type="RTC", properties={'d': 123, 'e': a}), []) + Identifiable(name="C", record_type="RTC", properties={'d': 123, 'e': a}), + {id(a): {'C': [None]}}) # one ref with id one with Entity without id (mixed) -> fail assert not crawler._has_missing_object_in_references( - Identifiable(name="D", record_type="RTD", properties={'d': 123, 'e': a}), []) + Identifiable(name="D", record_type="RTD", 
properties={'d': 123, 'e': a}), + {id(a): {'C': [None]}}) - crawler.add_to_remote_missing_cache(a, Identifiable(name="C", record_type="RTC", - properties={'d': 12311})) + crawler.treated_records_lookup.add(a, Identifiable(name="C", record_type="RTC", + properties={'d': 12311})) # one ref with id one with Entity without id but in cache -> check assert crawler._has_missing_object_in_references( - Identifiable(name="D", record_type="RTD", properties={'d': 123, 'e': a}), []) + Identifiable(name="D", record_type="RTD", properties={'d': 123, 'e': a}), + {id(a): {'C': [None]}}) # if this ever fails, the mock up may be removed crawler.identifiableAdapter.get_registered_identifiable.assert_called() -@pytest.mark.xfail() +@ pytest.mark.xfail() def test_references_entities_without_ids(): crawler = Crawler() assert not crawler._has_reference_value_without_id(db.Record().add_parent("Person") @@ -447,15 +452,15 @@ def mock_retrieve_record(identifiable: Identifiable): return None -@patch("caoscrawler.crawl.cached_get_entity_by", - new=Mock(side_effect=mock_get_entity_by)) -@patch("caoscrawler.identifiable_adapters.cached_get_entity_by", - new=Mock(side_effect=mock_get_entity_by)) -@patch("caoscrawler.identifiable_adapters.CaosDBIdentifiableAdapter." - "retrieve_identified_record_for_identifiable", - new=Mock(side_effect=mock_retrieve_record)) -@patch("caoscrawler.crawl.db.Container.insert") -@patch("caoscrawler.crawl.db.Container.update") +@ patch("caoscrawler.crawl.cached_get_entity_by", + new=Mock(side_effect=mock_get_entity_by)) +@ patch("caoscrawler.identifiable_adapters.cached_get_entity_by", + new=Mock(side_effect=mock_get_entity_by)) +@ patch("caoscrawler.identifiable_adapters.CaosDBIdentifiableAdapter." + "retrieve_identified_record_for_identifiable", + new=Mock(side_effect=mock_retrieve_record)) +@ patch("caoscrawler.crawl.db.Container.insert") +@ patch("caoscrawler.crawl.db.Container.update") def test_synchronization_no_commit(upmock, insmock): crawled_data = [r.copy() for r in EXAMPLE_SERVER_STATE if r.role == "Record"] # change one; add one @@ -472,16 +477,16 @@ def test_synchronization_no_commit(upmock, insmock): assert len(ups) == 1 -@patch("caoscrawler.crawl.cached_get_entity_by", - new=Mock(side_effect=mock_get_entity_by)) -@patch("caoscrawler.identifiable_adapters.cached_get_entity_by", - new=Mock(side_effect=mock_get_entity_by)) -@patch("caoscrawler.identifiable_adapters.CaosDBIdentifiableAdapter." - "retrieve_identified_record_for_identifiable", - new=Mock(side_effect=mock_retrieve_record)) -@patch("caoscrawler.crawl.db.Container.insert") -@patch("caoscrawler.crawl.db.Container.update") -@patch("caoscrawler.crawl.UpdateCache.insert") +@ patch("caoscrawler.crawl.cached_get_entity_by", + new=Mock(side_effect=mock_get_entity_by)) +@ patch("caoscrawler.identifiable_adapters.cached_get_entity_by", + new=Mock(side_effect=mock_get_entity_by)) +@ patch("caoscrawler.identifiable_adapters.CaosDBIdentifiableAdapter." 
+ "retrieve_identified_record_for_identifiable", + new=Mock(side_effect=mock_retrieve_record)) +@ patch("caoscrawler.crawl.db.Container.insert") +@ patch("caoscrawler.crawl.db.Container.update") +@ patch("caoscrawler.crawl.UpdateCache.insert") def test_security_mode(updateCacheMock, upmock, insmock): # trivial case: nothing to do crawled_data = [r.copy() for r in EXAMPLE_SERVER_STATE if r.role == "Record"] @@ -580,12 +585,13 @@ def test_security_mode(updateCacheMock, upmock, insmock): def test_create_reference_mapping(): a = db.Record().add_parent("A") - b = db.Record().add_parent("B").add_property('a', a) + b = db.Record(id=132).add_parent("B").add_property('a', a) ref = Crawler.create_reference_mapping([a, b]) assert id(a) in ref - assert id(b) not in ref + assert id(b) in ref assert "B" in ref[id(a)] - assert ref[id(a)]["B"] == [b] + assert {} == ref[id(b)] + assert ref[id(a)]["B"] == [132] def test_create_flat_list(): @@ -608,7 +614,7 @@ def test_create_flat_list(): assert c in flat -@pytest.fixture +@ pytest.fixture def crawler_mocked_for_backref_test(): crawler = Crawler() # mock retrieval of registered identifiabls: return Record with just a parent @@ -652,8 +658,8 @@ def test_validation_error_print(caplog): caplog.clear() -@patch("caoscrawler.identifiable_adapters.get_children_of_rt", - new=Mock(side_effect=lambda x: [x])) +@ patch("caoscrawler.identifiable_adapters.get_children_of_rt", + new=Mock(side_effect=lambda x: [x])) def test_split_into_inserts_and_updates_backref(crawler_mocked_for_backref_test): crawler = crawler_mocked_for_backref_test identlist = [Identifiable(name="A", record_type="BR"), @@ -667,8 +673,8 @@ def test_split_into_inserts_and_updates_backref(crawler_mocked_for_backref_test) crawler.split_into_inserts_and_updates([db.Record(name="B").add_parent("C")]) # identifiables were not yet checked - assert crawler.get_from_any_cache(identlist[0]) is None - assert crawler.get_from_any_cache(identlist[1]) is None + assert crawler.treated_records_lookup.get_any(entlist[1], identlist[0]) is None + assert crawler.treated_records_lookup.get_any(entlist[0], identlist[1]) is None # one with reference, one without assert not crawler._has_reference_value_without_id(identlist[0]) assert crawler._has_reference_value_without_id(identlist[1]) @@ -688,8 +694,8 @@ def test_split_into_inserts_and_updates_backref(crawler_mocked_for_backref_test) assert insert[0].name == "B" -@patch("caoscrawler.identifiable_adapters.get_children_of_rt", - new=Mock(side_effect=lambda x: [x])) +@ patch("caoscrawler.identifiable_adapters.get_children_of_rt", + new=Mock(side_effect=lambda x: [x])) def test_split_into_inserts_and_updates_mult_backref(crawler_mocked_for_backref_test): # test whether multiple references of the same record type are correctly used crawler = crawler_mocked_for_backref_test @@ -701,7 +707,9 @@ def test_split_into_inserts_and_updates_mult_backref(crawler_mocked_for_backref_ # test whether both entities are listed in the backref attribute of the identifiable referencing_entities = crawler.create_reference_mapping(entlist) - identifiable = crawler.identifiableAdapter.get_identifiable(referenced, referencing_entities) + identifiable = crawler.identifiableAdapter.get_identifiable( + referenced, + referencing_entities[id(referenced)]) assert len(identifiable.backrefs) == 2 # check the split... 
@@ -710,8 +718,8 @@ def test_split_into_inserts_and_updates_mult_backref(crawler_mocked_for_backref_ assert len(insert) == 2 -@patch("caoscrawler.identifiable_adapters.get_children_of_rt", - new=Mock(side_effect=lambda x: [x])) +@ patch("caoscrawler.identifiable_adapters.get_children_of_rt", + new=Mock(side_effect=lambda x: [x])) def test_split_into_inserts_and_updates_diff_backref(crawler_mocked_for_backref_test): # test whether multiple references of the different record types are correctly used crawler = crawler_mocked_for_backref_test @@ -723,7 +731,10 @@ def test_split_into_inserts_and_updates_diff_backref(crawler_mocked_for_backref_ # test whether both entities are listed in the backref attribute of the identifiable referencing_entities = crawler.create_reference_mapping(entlist) - identifiable = crawler.identifiableAdapter.get_identifiable(referenced, referencing_entities) + identifiable = crawler.identifiableAdapter.get_identifiable( + referenced, + referencing_entities[id(referenced)]) + assert len(identifiable.backrefs) == 2 # check the split... @@ -736,7 +747,7 @@ def mock_create_values(values, element): pass -@patch("caoscrawler.converters.IntegerElementConverter.create_values") +@ patch("caoscrawler.converters.IntegerElementConverter.create_values") def test_restricted_path(create_mock): """ The restricted_path argument allows to ignroe part of the crawled data structure. Here, we make @@ -829,7 +840,7 @@ def test_split_restricted_path(): # Filter the warning because we want to have it here and this way it does not hinder running # tests with -Werror. -@pytest.mark.filterwarnings("ignore:The prefix:DeprecationWarning") +@ pytest.mark.filterwarnings("ignore:The prefix:DeprecationWarning") def test_deprecated_prefix_option(): """Test that calling the crawler's main function with the deprecated `prefix` option raises the correct errors and warnings. 
@@ -895,8 +906,8 @@ def mock_get_entity_by_query(query=None): return db.Record(id=1111, name='rec_name').add_parent('RT') -@patch("caoscrawler.crawl.cached_get_entity_by", - new=Mock(side_effect=mock_get_entity_by_query)) +@ patch("caoscrawler.crawl.cached_get_entity_by", + new=Mock(side_effect=mock_get_entity_by_query)) def test_replace_name_with_referenced_entity(): test_text = 'lkajsdf' test_int = 134343 @@ -964,3 +975,55 @@ def test_replace_name_with_referenced_entity(): assert isinstance(prop.value[2], int) assert prop.value[2] == test_id assert caoscrawler.crawl.cached_get_entity_by.call_count == 3 + + +def test_treated_record_lookup(): + trlu = TreatedRecordLookUp() + exist = db.Record(id=1) + trlu.add(exist) + assert len(trlu._existing) == 1 + # was added to existing + assert trlu._existing[id(exist)] is exist + # is in ID lookup + assert trlu._id_look_up[exist.id] is exist + # can be accessed via get_existing + assert trlu.get_existing(db.Record(id=1)) is exist + + miss = db.Record() + # exception when identifiable is missing + with raises(RuntimeError): + trlu.add(miss) + ident = Identifiable(name='a') + trlu.add(miss, ident) + # was added to missing + assert trlu._missing[id(miss)] is miss + # is in ident lookup + assert trlu._identifiable_look_up[ident.get_representation()] is miss + # can be accessed via get_missing + assert trlu.get_missing(db.Record(), Identifiable(name='a')) is miss + + fi = db.File(path='a', id=2) + trlu.add(fi) + assert len(trlu._existing) == 2 + # was added to existing + assert trlu._existing[id(fi)] is fi + # is in ID lookup + assert trlu._id_look_up[fi.id] is fi + # is in path lookup + assert trlu._path_look_up[fi.path] is fi + # can be accessed via get_existing + assert trlu.get_existing(fi) is fi + + all_exi = trlu.get_existing_list() + assert fi in all_exi + assert exist in all_exi + all_mi = trlu.get_missing_list() + assert miss in all_mi + + # If a Record was added using the ID, the ID must be used to identify it even though later an + # identifiable may be passed as well + assert trlu.get_any(exist, Identifiable(name='b')) is exist + + fi2 = db.File(path='b') + trlu.add(fi2) + assert trlu.get_any(db.File(path='b'), Identifiable(name='c')) is fi2 diff --git a/unittests/test_identifiable.py b/unittests/test_identifiable.py index 3f3c606b163df4dc238be9a669fd31eb630a582d..28bdb7a2ad75d5b9389b47ca3f0ec2b2e2a1404b 100644 --- a/unittests/test_identifiable.py +++ b/unittests/test_identifiable.py @@ -24,10 +24,9 @@ test identifiable module """ -import pytest import caosdb as db +import pytest from caoscrawler.identifiable import Identifiable -from caoscrawler.identified_cache import IdentifiedCache def test_create_hashable_string(): diff --git a/unittests/test_identifiable_adapters.py b/unittests/test_identifiable_adapters.py index 7759fa55ce37e1c30ff9092eac3260ca80348bca..ee0e0d6cd7c791f78e7cd2307dc6f34698326b4a 100644 --- a/unittests/test_identifiable_adapters.py +++ b/unittests/test_identifiable_adapters.py @@ -143,10 +143,9 @@ def test_wildcard_ref(): .add_property(name="last_name", value='Tom')) identifiable = ident.get_identifiable(rec, referencing_entities={ - id(rec): - {'A': [db.Record(id=1).add_parent("A")]}} + 'A': [1]} ) - assert identifiable.backrefs[0].id == 1 + assert identifiable.backrefs[0] == 1 def test_convert_value(): diff --git a/unittests/test_identified_cache.py b/unittests/test_identified_cache.py deleted file mode 100644 index 4ed7c55c7326415308917e20e9f391b17b07ad87..0000000000000000000000000000000000000000 --- 
a/unittests/test_identified_cache.py +++ /dev/null @@ -1,44 +0,0 @@ -#!/usr/bin/env python3 -# encoding: utf-8 -# -# ** header v3.0 -# This file is a part of the CaosDB Project. -# -# Copyright (C) 2021 Indiscale GmbH <info@indiscale.com> -# Copyright (C) 2021 Henrik tom Wörden <h.tomwoerden@indiscale.com> -# -# This program is free software: you can redistribute it and/or modify -# it under the terms of the GNU Affero General Public License as -# published by the Free Software Foundation, either version 3 of the -# License, or (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU Affero General Public License for more details. -# -# You should have received a copy of the GNU Affero General Public License -# along with this program. If not, see <https://www.gnu.org/licenses/>. -# -# ** end header -# - -""" -test identified_cache module -""" - -import caosdb as db -from caoscrawler.identifiable import Identifiable -from caoscrawler.identified_cache import IdentifiedCache - - -def test_IdentifiedCache(): - ident = Identifiable(name="A", record_type="B") - record = db.Record("A").add_parent("B").add_property('b', 5) - cache = IdentifiedCache() - assert ident not in cache - cache.add(record=record, identifiable=ident) - assert ident in cache - assert cache[ident] is record - assert Identifiable(name="A", record_type="C") != Identifiable(name="A", record_type="B") - assert Identifiable(name="A", record_type="C") not in cache
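To make the reshaped reference mapping concrete, here is a small sketch mirroring the updated `test_create_reference_mapping` above. The parent names `A`/`B` and the id `132` are taken from that test; `identifiable_adapter` in the final comment stands for whichever configured `IdentifiableAdapter` instance is in use and is therefore left commented out.

```python
import caosdb as db

from caoscrawler.crawl import Crawler

a = db.Record().add_parent("A")
b = db.Record(id=132).add_parent("B").add_property("a", a)

ref = Crawler.create_reference_mapping([a, b])

# Every entity now gets an entry, even if nothing references it ...
assert ref[id(b)] == {}
# ... and referencing entities are stored by their entity id (None if the
# referencing entity has no id yet), keyed by the name of its first parent.
assert ref[id(a)] == {"B": [132]}

# Identifiable creation now receives only the sub-dict of the record in question:
# identifiable = identifiable_adapter.get_identifiable(a, referencing_entities=ref[id(a)])
```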