diff --git a/CHANGELOG.md b/CHANGELOG.md index 69d0c4e2b125803ccec7225c604c07f17fc68645..badcf0044bc9e182a69f8a200794eea181958efd 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -75,6 +75,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 * The `identifiable_adapters.IdentifiableAdapter` uses entity ids (negative for entities that don't exist remotely) instead of entity objects for keeping track of references. +* Log output is either written to $SHARED_DIR/ (when this variable is set) or just to the terminal. ### Deprecated ### diff --git a/integrationtests/test_issues.py b/integrationtests/test_issues.py index 814e82ad75512ec8fe217294e1a9e86c6aa01ab3..76392f3a4ce20d7ed6b6ccc30c79f1ce400001f7 100644 --- a/integrationtests/test_issues.py +++ b/integrationtests/test_issues.py @@ -16,20 +16,18 @@ # You should have received a copy of the GNU Affero General Public License # along with this program. If not, see <https://www.gnu.org/licenses/>. # -from pytest import fixture, mark, raises - import linkahead as db -from linkahead.cached import cache_clear from caosadvancedtools.models.parser import parse_model_from_string - from caoscrawler.crawl import Crawler from caoscrawler.identifiable import Identifiable from caoscrawler.identifiable_adapters import CaosDBIdentifiableAdapter +from caoscrawler.scanner import (create_converter_registry, + scan_structure_elements) from caoscrawler.structure_elements import DictElement - -from caoscrawler.scanner import create_converter_registry, scan_structure_elements - +from linkahead.cached import cache_clear from linkahead.utils.register_tests import clear_database, set_test_key +from pytest import fixture, mark, raises + set_test_key("10b128cf8a1372f30aa3697466bb55e76974e0c16a599bb44ace88f19c8f61e2") @@ -171,8 +169,9 @@ def test_issue_83(clear_database): name=referencing_type.name).add_property(name=referenced_type.name, value=[ref_target1]) referencing2 = db.Record(name="Referencing2").add_parent( name=referencing_type.name).add_property(name=referenced_type.name, value=[ref_target2]) - referencing3 = db.Record(name="Referencing3").add_parent(name=referencing_type.name).add_property( - name=referenced_type.name, value=[ref_target1, ref_target2]) + referencing3 = db.Record(name="Referencing3").add_parent( + name=referencing_type.name).add_property(name=referenced_type.name, value=[ref_target1, + ref_target2]) records = db.Container().extend( [ref_target1, ref_target2, referencing1, referencing2, referencing3]) diff --git a/src/caoscrawler/converters.py b/src/caoscrawler/converters.py index 63879ba0928f7b58ded36a95b612fdf0419e1960..8945732776d51da5c924f19e938a68007c668704 100644 --- a/src/caoscrawler/converters.py +++ b/src/caoscrawler/converters.py @@ -470,6 +470,7 @@ class Converter(object, metaclass=ABCMeta): def create_records(self, values: GeneralStore, records: RecordStore, element: StructureElement): # TODO why is element passed but not used??? + # ANSWER: because it might be used by overriding child classes. 
if "records" not in self.definition: return [] @@ -716,7 +717,9 @@ class MarkdownFileConverter(SimpleFileConverter): def convert_basic_element(element: Union[list, dict, bool, int, float, str, None], name=None, msg_prefix=""): """Convert basic Python objects to the corresponding StructureElements""" - if isinstance(element, list): + if isinstance(element, StructureElement): + return element + elif isinstance(element, list): return ListElement(name, element) elif isinstance(element, dict): return DictElement(name, element) @@ -817,7 +820,7 @@ class DictElementConverter(Converter): class DictConverter(DictElementConverter): def __init__(self, *args, **kwargs): warnings.warn(DeprecationWarning( - "This class is deprecated. Please use DictElementConverter.")) + "This class is deprecated. Please use DictElemnentConverter.")) super().__init__(*args, **kwargs) @@ -1079,7 +1082,7 @@ class ListElementConverter(Converter): # TODO: See comment on types and inheritance if not isinstance(element, ListElement): raise RuntimeError( - "This converter can only process DictListElements.") + "This converter can only process ListElements.") children: list[StructureElement] = [] for index, list_element in enumerate(element.value): children.append( diff --git a/src/caoscrawler/crawl.py b/src/caoscrawler/crawl.py index 6fc90e300c422efe637c0c4049443a94c34145d7..fc9832552d3fcac8459084ee2765b8731c169f1a 100644 --- a/src/caoscrawler/crawl.py +++ b/src/caoscrawler/crawl.py @@ -51,26 +51,24 @@ from caosadvancedtools.cache import UpdateCache from caosadvancedtools.crawler import Crawler as OldCrawler from caosadvancedtools.serverside.helper import send_mail from caosadvancedtools.utils import create_entity_link -from linkahead.apiutils import (EntityMergeConflictError, compare_entities, +from linkahead.apiutils import (compare_entities, merge_entities) from linkahead.cached import cache_clear, cached_get_entity_by from linkahead.common.datatype import get_list_datatype, is_reference -from linkahead.exceptions import EmptyUniqueQueryError from linkahead.utils.escape import escape_squoted_text from .config import get_config_setting from .converters import Converter, ConverterValidationError from .debug_tree import DebugTree -from .identifiable import Identifiable from .identifiable_adapters import (CaosDBIdentifiableAdapter, - IdentifiableAdapter, - LocalStorageIdentifiableAdapter) + IdentifiableAdapter) from .logging import configure_server_side_logging from .macros import defmacro_constructor, macro_constructor from .scanner import (create_converter_registry, initialize_converters, load_definition, scan_directory, scan_structure_elements) from .stores import GeneralStore from .structure_elements import StructureElement +from .sync_graph import SyncGraph logger = logging.getLogger(__name__) @@ -172,163 +170,12 @@ def _resolve_datatype(prop: db.Property, remote_entity: db.Entity): return prop -def _treat_merge_error_of(newrecord, record): - """ - The parameters are two entities that cannot be merged with the merge_entities function. - - # This function checks for two obvious cases where no merge will ever be possible: - # 1. Two Entities with differing IDs - # 2. Two non-Entity values which differ - - It creates a more informative logger message and raises an Exception in those cases. - """ - for this_p in newrecord.properties: - that_p = record.get_property(this_p.name) - - if that_p is None: - logger.debug(f"Property {this_p.name} does not exist in the second entity. 
Note that " - "this should not be the reason for the merge conflict.") - continue - - if (isinstance(this_p.value, db.Entity) - and isinstance(that_p.value, db.Entity)): - if this_p.value.id is not None and that_p.value.id is not None: - if this_p.value.id != that_p.value.id: - logger.error("The Crawler is trying to merge two entities " - "because they should be the same object (same" - " identifiables), but they reference " - "different Entities with the same Property." - f"Problematic Property: {this_p.name}\n" - f"Referenced Entities: {this_p.value.id} and " - f"{that_p.value.id}\n" - f"{record}\n{newrecord}") - raise RuntimeError("Cannot merge Entities") - elif (not isinstance(this_p.value, db.Entity) - and not isinstance(that_p.value, db.Entity)): - if ((this_p.value != that_p.value) - # TODO can we also compare lists? - and not isinstance(this_p.value, list) - and not isinstance(that_p.value, list)): - logger.error( - "The Crawler is trying to merge two entities because they should be the same " - "object (same identifiables), but they have different values for the same " - "Property.\n" - f"Problematic Property: {this_p.name}\n" - f"Values: {this_p.value} and {that_p.value}\n" - f"{record}\n{newrecord}") - raise RuntimeError("Cannot merge Entities") - - class SecurityMode(Enum): RETRIEVE = 0 INSERT = 1 UPDATE = 2 -class TreatedRecordLookUp(): - """tracks Records and Identifiables for which it was checked whether they exist in the remote - server - - For a given Record it can be checked, whether it exists in the remote sever if - - it has a (valid) ID - - it has a (valid) path (FILEs only) - - an identifiable can be created for the Record. - - Records are added by calling the `add` function and they are then added to the internal - existing or missing list depending on whether the Record has a valid ID. - Additionally, the Record is added to three look up dicts. The keys of those are paths, IDs and - the representation of the identifiables. - - The extreme case, that one could imagine, would be that the same Record occurs three times as - different Python objects: one that only has an ID, one with only a path and one without ID and - path but with identifying properties. During `split_into_inserts_and_updates` all three - must be identified with each other (and must be merged). Since we require, that treated - entities have a valid ID if they exist in the remote server, all three objects would be - identified with each other simply using the IDs. - - In the case that the Record is not yet in the remote server, there cannot be a Python object - with an ID. Thus we might have one with a path and one with an identifiable. If that Record - does not yet exist, it is necessary that both Python objects have at least either the path or - the identifiable in common. - """ - - def __init__(self): - self._id_look_up: dict[int, db.Entity] = {} - self._path_look_up: dict[str, db.Entity] = {} - self._identifiable_look_up: dict[str, db.Entity] = {} - self.remote_missing_counter = -1 - self._missing: dict[int, db.Entity] = {} - self._existing: dict[int, db.Entity] = {} - - def add(self, record: db.Entity, identifiable: Optional[Identifiable] = None): - """ - Add a Record that was treated, such that it is contained in the internal look up dicts - - This Record MUST have an ID if it was found in the remote server. - """ - if record.id is None: - if record.path is None and identifiable is None: - raise RuntimeError("Record must have ID or path or an identifiable must be given." 
- f"Record is\n{record}") - record.id = self.remote_missing_counter - self.remote_missing_counter -= 1 - self._add_any(record, self._missing, identifiable) - else: - self._add_any(record, self._existing, identifiable) - - def get_any(self, record: db.Entity, identifiable: Optional[Identifiable] = None): - """ - Check whether this Record was already added. Identity is based on ID, path or Identifiable - represenation - """ - if record.id is not None and record.id in self._id_look_up: - return self._id_look_up[record.id] - if record.path is not None and record.path in self._path_look_up: - return self._path_look_up[record.path] - if (identifiable is not None and identifiable.get_representation() in - self._identifiable_look_up): - return self._identifiable_look_up[identifiable.get_representation()] - - def get_existing(self, record: db.Entity, identifiable: Optional[Identifiable] = None): - """ Check whether this Record exists on the remote server - - Returns: The stored Record - """ - rec = self.get_any(record, identifiable) - if id(rec) in self._existing: - return rec - else: - return None - - def get_missing(self, record: db.Entity, identifiable: Optional[Identifiable] = None): - """ Check whether this Record is missing on the remote server - - Returns: The stored Record - """ - rec = self.get_any(record, identifiable) - if id(rec) in self._missing: - return rec - else: - return None - - def get_missing_list(self): - """ Return all Records that are missing in the remote server """ - return list(self._missing.values()) - - def get_existing_list(self): - """ Return all Records that exist in the remote server """ - return list(self._existing.values()) - - def _add_any(self, record: db.Entity, lookup, identifiable: Optional[Identifiable] = None): - if record.id is not None: - self._id_look_up[record.id] = record - if record.path is not None: - self._path_look_up[record.path] = record - if identifiable is not None: - self._identifiable_look_up[identifiable.get_representation()] = record - lookup[id(record)] = record - - class Crawler(object): """ Crawler class that encapsulates crawling functions. @@ -365,14 +212,13 @@ class Crawler(object): # The following caches store records, where we checked whether they exist on the remote # server. Since, it is important to know whether they exist or not, we store them into two # different caches. - self.treated_records_lookup = TreatedRecordLookUp() # TODO does it make sense to have this as member variable? self.securityMode = securityMode # TODO does it make sense to have this as member variable(run_id)? 
self.generate_run_id() - self.identifiableAdapter: IdentifiableAdapter = LocalStorageIdentifiableAdapter() + self.identifiableAdapter: IdentifiableAdapter = CaosDBIdentifiableAdapter() if identifiableAdapter is not None: self.identifiableAdapter = identifiableAdapter @@ -449,401 +295,53 @@ class Crawler(object): self.crawled_data = data return data - def _has_reference_value_without_id(self, ident: Identifiable) -> bool: - """ - Returns True if there is at least one value in the properties and backrefs attributes of - ``ident`` which: - - a) is a reference property AND - b) where the value is set to a - :external+caosdb-pylib:py:class:`db.Entity <caosdb.common.models.Entity>` - (instead of an ID) AND - c) where the ID of the value (the - :external+caosdb-pylib:py:class:`db.Entity <caosdb.common.models.Entity>` object in b)) - is not set (to an integer) - - Returns - ------- - bool - True if there is a value without id (see above) - - Raises - ------ - ValueError - If no Identifiable is given. - """ - if ident is None: - raise ValueError("Identifiable has to be given as argument") - for pvalue in list(ident.properties.values()) + ident.backrefs: - if isinstance(pvalue, list): - for el in pvalue: - if isinstance(el, db.Entity) and el.id is None: - return True - elif isinstance(pvalue, db.Entity) and pvalue.id is None: - return True - return False - - @staticmethod - def create_flat_list(ent_list: list[db.Entity], flat: Optional[list[db.Entity]] = None): - """ - Recursively adds entities and all their properties contained in ent_list to - the output list flat. - - TODO: This function will be moved to pylib as it is also needed by the - high level API. - """ - # Note: A set would be useful here, but we do not want a random order. - if flat is None: - flat = list() - for el in ent_list: - if el not in flat: - flat.append(el) - for ent in ent_list: - for p in ent.properties: - # For lists append each element that is of type Entity to flat: - if isinstance(p.value, list): - for el in p.value: - if isinstance(el, db.Entity): - if el not in flat: - flat.append(el) - Crawler.create_flat_list([el], flat) - elif isinstance(p.value, db.Entity): - if p.value not in flat: - flat.append(p.value) - Crawler.create_flat_list([p.value], flat) - return flat - - def _has_missing_object_in_references(self, ident: Identifiable, referencing_entities: dict): - """ - returns False if any value in the properties attribute is a db.Entity object that - is contained in the `remote_missing_cache`. If ident has such an object in - properties, it means that it references another Entity, where we checked - whether it exists remotely and it was not found. 
- """ - if ident is None: - raise ValueError("Identifiable has to be given as argument") - for pvalue in list(ident.properties.values()) + ident.backrefs: - # Entity instead of ID and not cached locally - if (isinstance(pvalue, list)): - for el in pvalue: - elident = self.identifiableAdapter.get_identifiable( - el, referencing_entities[id(el)]) - if (isinstance(el, db.Entity) - and self.treated_records_lookup.get_missing(el, elident) is not None): - return True - if (isinstance(pvalue, db.Entity) and self.treated_records_lookup.get_missing( - pvalue, - self.identifiableAdapter.get_identifiable(pvalue, - referencing_entities[id(pvalue)]) - ) is not None): - # might be checked when reference is resolved - return True - return False - - def replace_references_with_cached(self, record: db.Record, referencing_entities: dict): - """ - Replace all references with the versions stored in the cache. - - If the cache version is not identical, raise an error. - """ - for p in record.properties: - if (isinstance(p.value, list)): - lst = [] - for el in p.value: - if (isinstance(el, db.Entity) and el.id is None): - cached = self.treated_records_lookup.get_any( - el, - self.identifiableAdapter.get_identifiable( - el, referencing_entities[id(el)])) - if cached is None: - lst.append(el) - continue - if not check_identical(cached, el, True): - if isinstance(p.value, db.File): - if p.value.path != cached.path: - raise RuntimeError( - "The cached and the referenced entity are not identical.\n" - f"Cached:\n{cached}\nReferenced:\n{el}" - ) - else: - raise RuntimeError( - "The cached and the referenced entity are not identical.\n" - f"Cached:\n{cached}\nReferenced:\n{el}" - ) - lst.append(cached) - else: - lst.append(el) - p.value = lst - if (isinstance(p.value, db.Entity) and p.value.id is None): - cached = self.treated_records_lookup.get_any( - p.value, self.identifiableAdapter.get_identifiable( - p.value, referencing_entities[id(p.value)])) - if cached is None: - continue - if not check_identical(cached, p.value, True): - if isinstance(p.value, db.File): - if p.value.path != cached.path: - raise RuntimeError( - "The cached and the referenced entity are not identical.\n" - f"Cached:\n{cached}\nReferenced:\n{p.value}" - ) - else: - raise RuntimeError( - "The cached and the referenced entity are not identical.\n" - f"Cached:\n{cached}\nReferenced:\n{p.value}" - ) - p.value = cached - - @staticmethod - def bend_references_to_new_object(old, new, entities): - """ Bend references to the other object - Iterate over all entities in `entities` and check the values of all properties of - occurances of old Entity and replace them with new Entity - """ - for el in entities: - for p in el.properties: - if isinstance(p.value, list): - for index, val in enumerate(p.value): - if val is old: - p.value[index] = new - else: - if p.value is old: - p.value = new - - def _merge_identified(self, newrecord, record, try_to_merge_later, all_records): - """ tries to merge record into newrecord - - If it fails, record is added to the try_to_merge_later list. - In any case, references are bent to the newrecord object. + def split_into_inserts_and_updates(self, st: SyncGraph): + """Classify nodes in the SyncGraph ``st`` with respect to their state on the server. +This method iteratively checks whether those nodes exist on the remote server and creates two lists, +one with the entities that need to be updated and the other with entities to be inserted. 
""" - try: - merge_entities( - newrecord, record, merge_references_with_empty_diffs=False, - merge_id_with_resolved_entity=True) - except EntityMergeConflictError: - _treat_merge_error_of(newrecord, record) - # We cannot merge but it is none of the clear case where merge is - # impossible. Thus we try later - try_to_merge_later.append(record) - if newrecord.id is not None: - record.id = newrecord.id - except NotImplementedError: - print(newrecord) - print(record) - raise - Crawler.bend_references_to_new_object( - old=record, new=newrecord, - entities=all_records - ) - - def _identity_relies_on_unchecked_entities(self, record: db.Record, referencing_entities): - """ - If a record for which it could not yet be verified whether it exists in LA or not is part - of the identifying properties, this returns True, otherwise False - """ - - registered_identifiable = self.identifiableAdapter.get_registered_identifiable(record) - if registered_identifiable is None: - return False - refs = self.identifiableAdapter.get_identifying_referencing_entities( - referencing_entities, registered_identifiable) - if any(el is None for el in refs): - return True - - refs = self.identifiableAdapter.get_identifying_referenced_entities( - record, registered_identifiable) - if any([self.treated_records_lookup.get_any(el) is None for el in refs]): - return True - - return False - - @staticmethod - def create_reference_mapping(flat: list[db.Entity]): - """ - Create a dictionary of dictionaries of the form: - dict[int, dict[str, list[Union[int,None]]]] - - - The integer index is the Python id of the value object. - - The string is the name of the first parent of the referencing object. - - Each value objects is taken from the values of all properties from the list flat. - - So the returned mapping maps ids of entities to the ids of objects which are referring - to them. - - .. todo:: - - This method takes about 2/3 of the time of synchronize(), it might be a good - optimization candidate. - """ - # TODO we need to treat children of RecordTypes somehow. - references: dict[int, dict[str, list[Union[int, None]]]] = {} - for ent in flat: - if id(ent) not in references: - references[id(ent)] = {} - for p in ent.properties: - val = p.value - if not isinstance(val, list): - val = [val] - for v in val: - if isinstance(v, db.Entity): - if id(v) not in references: - references[id(v)] = {} - if ent.parents[0].name not in references[id(v)]: - references[id(v)][ent.parents[0].name] = [] - references[id(v)][ent.parents[0].name].append(ent.id) - - return references - - def split_into_inserts_and_updates(self, ent_list: list[db.Entity]): - flat = Crawler.create_flat_list(ent_list) - all_records = list(flat) - - # TODO: can the following be removed at some point - for ent in flat: - if ent.role == "Record" and len(ent.parents) == 0: - raise RuntimeError(f"Records must have a parent.\n{ent}") - - try_to_merge_later = [] - - # Check whether Records can be identified without identifiable - for i in reversed(range(len(flat))): - record = flat[i] - # 1. Can it be identified via an ID? - if record.id is not None: - treated_record = self.treated_records_lookup.get_existing(record) - if treated_record is not None: - self._merge_identified(treated_record, record, try_to_merge_later, all_records) - all_records.remove(record) - referencing_entities = self.create_reference_mapping(all_records) - else: - self.treated_records_lookup.add(record, None) - assert record.id - del flat[i] - # 2. Can it be identified via a path? 
- elif record.path is not None: - try: - existing = cached_get_entity_by(path=record.path) - except EmptyUniqueQueryError: - existing = None - if existing is not None: - record.id = existing.id - # TODO check the following copying of _size and _checksum - # Copy over checksum and size too if it is a file - record._size = existing._size - record._checksum = existing._checksum - treated_record = self.treated_records_lookup.get_any(record) - if treated_record is not None: - self._merge_identified(treated_record, record, try_to_merge_later, all_records) - all_records.remove(record) - referencing_entities = self.create_reference_mapping(all_records) - else: - # TODO add identifiable if possible - self.treated_records_lookup.add(record, None) - assert record.id - del flat[i] - entity_was_treated = True - # flat contains Entities which could not yet be checked against the remote server - while entity_was_treated and len(flat) > 0: + # st.unchecked contains Entities which could not yet be checked against the remote server + while entity_was_treated and len(st.unchecked) > 0: entity_was_treated = False - referencing_entities = self.create_reference_mapping(all_records) - - # For each element we try to find out whether we can find it in the server or whether - # it does not yet exist. Since a Record may reference other unkown Records it might not - # be possible to answer this right away. - # The following checks are done on each Record: - # 1. Is it in the cache of already checked Records? - # 2. Can it be checked on the remote server? - # 3. Does it have to be new since a needed reference is missing? - for i in reversed(range(len(flat))): - record = flat[i] - - if self._identity_relies_on_unchecked_entities(record, - referencing_entities[id(record)]): + + for se in st.unchecked: + if se.identifiable is None: # we cannot yet identify this node continue - identifiable = self.identifiableAdapter.get_identifiable( - record, - referencing_entities=referencing_entities[id(record)]) - - # 1. Is it in the cache of already checked Records? - if self.treated_records_lookup.get_any(record, identifiable) is not None: - treated_record = self.treated_records_lookup.get_any(record, identifiable) - # Since the identifiables are the same, treated_record and record actually - # describe the same object. - # We merge record into treated_record in order to prevent loss of information - self._merge_identified(treated_record, record, try_to_merge_later, all_records) - all_records.remove(record) - referencing_entities = self.create_reference_mapping(all_records) - - del flat[i] - entity_was_treated = True - - # 2. Can it be checked on the remote server? - elif not self._has_reference_value_without_id(identifiable): - identified_record = ( - self.identifiableAdapter.retrieve_identified_record_for_identifiable( - identifiable)) - if identified_record is None: - # identifiable does not exist remotely -> record needs to be inserted - self.treated_records_lookup.add(record, identifiable) - else: - # side effect - record.id = identified_record.id - record.path = identified_record.path - self.treated_records_lookup.add(record, identifiable) - assert record.id - del flat[i] - entity_was_treated = True - - # 3. Does it have to be new since a needed reference is missing? - # (Is it impossible to check this record because an identifiable references a - # missing record?) 
- elif self._has_missing_object_in_references(identifiable, referencing_entities): - self.treated_records_lookup.add(record, identifiable) - assert record.id - del flat[i] - entity_was_treated = True - - for record in flat: - self.replace_references_with_cached(record, referencing_entities) - - # We postponed the merge for records where it failed previously and try it again now. + # check remote server + identified_record = ( + st.identifiableAdapter.retrieve_identified_record_for_identifiable( + se.identifiable)) + remote_id = None + if identified_record is not None: + remote_id = identified_record.id + # set id of node. if node is missing, remote_id is None and the SyncGraph marks it + # as missing + st.set_id_of_node(se, remote_id) + entity_was_treated = True + break # one or more nodes were just removed from st.unchecked -> back to start + # This only might add properties of the postponed records to the already used ones. - for record in try_to_merge_later: - identifiable = self.identifiableAdapter.get_identifiable( - record, - referencing_entities=referencing_entities[id(record)]) - newrecord = self.treated_records_lookup.get_any(record, identifiable) - merge_entities(newrecord, record, merge_id_with_resolved_entity=True) - if len(flat) > 0: - circle = self.detect_circular_dependency(flat) - if circle is None: - logger.error("Failed, but found NO circular dependency. The data is as follows:" - + str(self.compact_entity_list_representation(flat, - referencing_entities))) - else: - logger.error("Found circular dependency (Note that this might include references " - "that are not identifying properties): " - + self.compact_entity_list_representation(circle, - referencing_entities)) + if len(st.unchecked) > 0: + # circle = st.unchecked_contains_circular_dependency() + # if circle is None: + # logger.error("Failed, but found NO circular dependency. The data is as follows:" + # + "\n".join([str(el) for el in st.unchecked]) + + # ) + # else: + # logger.error("Found circular dependency (Note that this might include references " + # "that are not identifying properties): " + # + "\n".join([str(el) for el in st.unchecked]) + # ) raise RuntimeError( - f"Could not finish split_into_inserts_and_updates. Circular dependency: " - f"{circle is not None}") - - # remove negative IDs - missing = self.treated_records_lookup.get_missing_list() - for el in missing: - if el.id is None: - raise RuntimeError("This should not happen") # TODO remove - if el.id >= 0: - raise RuntimeError("This should not happen") # TODO remove - el.id = None + "Could not finish split_into_inserts_and_updates. " + "It might be due to a circular dependency") - return (missing, self.treated_records_lookup.get_existing_list()) + return st.export_record_lists() def replace_entities_with_ids(self, rec: db.Record): for el in rec.properties: @@ -856,7 +354,7 @@ class Crawler(object): if val.id is not None: el.value[index] = val.id - @ staticmethod + @staticmethod def compact_entity_list_representation(entities, referencing_entities: List) -> str: """ a more readable representation than the standard xml representation @@ -888,40 +386,7 @@ class Crawler(object): return text + "--------\n" - @ staticmethod - def detect_circular_dependency(flat: list[db.Entity]): - """ - Detects whether there are circular references in the given entity list and returns a list - where the entities are ordered according to the chain of references (and only the entities - contained in the circle are included. Returns None if no circular dependency is found. 
- - TODO: for the sake of detecting problems for split_into_inserts_and_updates we should only - consider references that are identifying properties. - """ - circle = [flat[0]] - closed = False - while not closed: - current = circle[-1] - added_to_circle = False - for p in current.properties: - if isinstance(p.value, list): - for pval in p.value: - if pval in flat: - if pval in circle: - closed = True - circle.append(pval) - added_to_circle = True - else: - if p.value in flat: - if p.value in circle: - closed = True - circle.append(p.value) - added_to_circle = True - if not added_to_circle: - return None - return circle - - @ staticmethod + @staticmethod def _merge_properties_from_remote( crawled_data: list[db.Record], identified_records: list[db.Record] @@ -963,7 +428,7 @@ class Crawler(object): return to_be_updated - @ staticmethod + @staticmethod def remove_unnecessary_updates( crawled_data: list[db.Record], identified_records: list[db.Record] @@ -989,7 +454,7 @@ class Crawler(object): return actual_updates - @ staticmethod + @staticmethod def execute_parent_updates_in_list(to_be_updated, securityMode, run_id, unique_names): """ Execute the updates of changed parents. @@ -1032,13 +497,13 @@ class Crawler(object): "mode. This might lead to a failure of inserts that follow.") logger.info(parent_updates) - @ staticmethod + @staticmethod def _get_property_id_for_datatype(rtname: str, name: str): return cached_get_entity_by( query=f"FIND Entity '{escape_squoted_text(rtname)}' " - f"with name='{escape_squoted_text(name)}'").id + f"with name='{escape_squoted_text(name)}'").id - @ staticmethod + @staticmethod def replace_name_with_referenced_entity_id(prop: db.Property): """changes the given property in place if it is a reference property that has a name as value @@ -1083,7 +548,7 @@ class Crawler(object): propval.append(el) prop.value = propval - @ staticmethod + @staticmethod def execute_inserts_in_list(to_be_inserted, securityMode, run_id: Optional[uuid.UUID] = None, unique_names=True): @@ -1103,7 +568,7 @@ class Crawler(object): update_cache = UpdateCache() update_cache.insert(to_be_inserted, run_id, insert=True) - @ staticmethod + @staticmethod def set_ids_and_datatype_of_parents_and_properties(rec_list): for record in rec_list: for parent in record.parents: @@ -1115,7 +580,7 @@ class Crawler(object): prop.id = entity.id _resolve_datatype(prop, entity) - @ staticmethod + @staticmethod def execute_updates_in_list(to_be_updated, securityMode, run_id: Optional[uuid.UUID] = None, unique_names=True): @@ -1129,7 +594,7 @@ class Crawler(object): update_cache = UpdateCache() update_cache.insert(to_be_updated, run_id) - @ staticmethod + @staticmethod def check_whether_parent_exists(records: list[db.Entity], parents: list[str]): """ returns a list of all records in `records` that have a parent that is in `parents`""" problems = [] @@ -1185,7 +650,8 @@ class Crawler(object): "use for example the Scanner to create this data.")) crawled_data = self.crawled_data - to_be_inserted, to_be_updated = self.split_into_inserts_and_updates(crawled_data) + to_be_inserted, to_be_updated = self.split_into_inserts_and_updates( + SyncGraph(crawled_data, self.identifiableAdapter)) for el in to_be_updated: # all entity objects are replaced by their IDs except for the not yet inserted ones @@ -1216,8 +682,10 @@ class Crawler(object): if len(ins_problems) > 0 or len(upd_problems) > 0: raise ForbiddenTransaction( "One or more Records that have a parent which is excluded from inserts or updates." 
- f"\nRecords excluded from inserts have the following RecordTypes:\n{[el.parents[0].name for el in ins_problems]}" - f"\nRecords excluded from updates have the following RecordTypes:\n{[el.parents[0].name for el in upd_problems]}" + f"\nRecords excluded from inserts have the following RecordTypes:\n" + f"{[el.parents[0].name for el in ins_problems]}" + f"\nRecords excluded from updates have the following RecordTypes:\n" + f"{[el.parents[0].name for el in upd_problems]}" ) logger.info(f"Going to insert {len(to_be_inserted)} Entities and update " @@ -1226,14 +694,14 @@ class Crawler(object): cache_clear() self.execute_parent_updates_in_list(to_be_updated, securityMode=self.securityMode, run_id=self.run_id, unique_names=unique_names) - logger.info(f"Added parent RecordTypes where necessary.") + logger.info("Added parent RecordTypes where necessary.") self.execute_inserts_in_list( to_be_inserted, self.securityMode, self.run_id, unique_names=unique_names) - logger.info(f"Executed inserts:\n" + logger.info("Executed inserts:\n" + self.create_entity_summary(to_be_inserted)) self.execute_updates_in_list( to_be_updated, self.securityMode, self.run_id, unique_names=unique_names) - logger.info(f"Executed updates:\n" + logger.info("Executed updates:\n" + self.create_entity_summary(to_be_updated)) update_cache = UpdateCache() @@ -1249,7 +717,7 @@ class Crawler(object): return (to_be_inserted, to_be_updated) - @ staticmethod + @staticmethod def create_entity_summary(entities: list[db.Entity]): """ Creates a summary string reprensentation of a list of entities.""" parents = {} @@ -1268,7 +736,7 @@ class Crawler(object): output = output[:-2] + "\n" return output - @ staticmethod + @staticmethod def inform_about_pending_changes(pending_changes, run_id, path, inserts=False): # Sending an Email with a link to a form to authorize updates is if get_config_setting("send_crawler_notifications"): @@ -1289,7 +757,7 @@ ____________________\n""".format(i + 1, len(pending_changes)) + str(el[3])) + " by invoking the crawler" " with the run id: {rid}\n".format(rid=run_id)) - @ staticmethod + @staticmethod def debug_build_usage_tree(converter: Converter): res: dict[str, dict[str, Any]] = { converter.name: { @@ -1366,13 +834,13 @@ def _update_status_record(run_id, n_inserts, n_updates, status): cr_rec.get_property('status').value = status (cr_rec .add_property(db.execute_query( - f"FIND Property with name='number_of_inserted_entities'", unique=True).id, + "FIND Property with name='number_of_inserted_entities'", unique=True).id, n_inserts) .add_property( - db.execute_query(f"FIND Property with name='number_of_updated_entities'", + db.execute_query("FIND Property with name='number_of_updated_entities'", unique=True).id, n_updates) .add_property( - db.execute_query(f"FIND Property with name='finished'", + db.execute_query("FIND Property with name='finished'", unique=True).id, datetime.now().isoformat())) cr_rec.update() @@ -1555,11 +1023,19 @@ def crawler_main(crawled_directory_path: str, try: crawler = Crawler(securityMode=securityMode) - # setup logging and reporting if serverside execution - if "SHARED_DIR" in os.environ: + if "SHARED_DIR" in os.environ: # setup logging and reporting if serverside execution userlog_public, htmluserlog_public, debuglog_public = configure_server_side_logging() + # TODO make this optional _create_status_record( - get_config_setting("public_host_url") + "/Shared/" + htmluserlog_public, crawler.run_id) + get_config_setting("public_host_url") + "/Shared/" + htmluserlog_public, + 
crawler.run_id) + else: # setup stdout logging for other cases + root_logger = logging.getLogger() + root_logger.setLevel(level=(logging.DEBUG if debug else logging.INFO)) + handler = logging.StreamHandler(stream=sys.stdout) + handler.setLevel(logging.DEBUG if debug else logging.INFO) + root_logger.addHandler(handler) + logger.handlers.clear() debug_tree = DebugTree() crawled_data = scan_directory( diff --git a/src/caoscrawler/exceptions.py b/src/caoscrawler/exceptions.py new file mode 100644 index 0000000000000000000000000000000000000000..6d08cf76fc177407154e38f0eb6aaa47bc863866 --- /dev/null +++ b/src/caoscrawler/exceptions.py @@ -0,0 +1,54 @@ +#!/usr/bin/env python3 +# encoding: utf-8 +# +# This file is a part of the LinkAhead Project. +# +# Copyright (C) 2024 Indiscale GmbH <info@indiscale.com> +# Copyright (C) 2024 Henrik tom Wörden <h.tomwoerden@indiscale.com> +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <https://www.gnu.org/licenses/>. +# + +class ForbiddenTransaction(Exception): + """Thrown if an transactions is needed that is not allowed. + For example an update of an entity if the security level is INSERT + """ + pass + + +class MissingReferencingEntityError(Exception): + """Thrown if the identifiable requires that some entity references the given entity but there + is no such reference """ + + def __init__(self, *args, rts=None, **kwargs): + self.rts = rts + super().__init__(self, *args, **kwargs) + + +class ImpossibleMergeError(Exception): + """Thrown if due to identifying information, two SyncNodes or two Properties of SyncNodes + should be merged, but there is conflicting information that prevents this. + """ + + def __init__(self, *args, pname, values, **kwargs): + self.pname = pname + self.values = values + super().__init__(self, *args, **kwargs) + + +class MissingIdentifyingProperty(Exception): + """Thrown if a SyncNode does not have the properties required by the corresponding registered + identifiable + """ + pass diff --git a/src/caoscrawler/identifiable.py b/src/caoscrawler/identifiable.py index e69e1092950f5d24e5be31f2f74cf3a6302512c5..d7bccac7a483c2e2e6feca4c2bf9a9bfc8685ff9 100644 --- a/src/caoscrawler/identifiable.py +++ b/src/caoscrawler/identifiable.py @@ -1,7 +1,7 @@ #!/usr/bin/env python3 # encoding: utf-8 # -# This file is a part of the CaosDB Project. +# This file is a part of the LinkAhead Project. # # Copyright (C) 2022 Henrik tom Wörden # @@ -20,23 +20,27 @@ # from __future__ import annotations -import linkahead as db -from datetime import datetime + import json +import logging +from datetime import datetime from hashlib import sha256 from typing import Optional, Union -import logging + +import linkahead as db + +from .exceptions import MissingIdentifyingProperty +from .sync_node import SyncNode logger = logging.getLogger(__name__) class Identifiable(): """ - The fingerprint of a Record in CaosDB. + The fingerprint of a Record in LinkAhead. 
- This class contains the information that is used by the CaosDB Crawler to identify Records. - On one hand, this can be the ID or a Record or the path of a File. - On the other hand, in order to check whether a Record exits in the CaosDB Server, a query can + This class contains the information that is used by the LinkAhead Crawler to identify Records. + In order to check whether a Record exits in the LinkAhead Server, a query can be created using the information contained in the Identifiable. Parameters @@ -46,56 +50,60 @@ class Identifiable(): properties: dict, keys are names of Properties; values are Property values Note, that lists are not checked for equality but are interpreted as multiple conditions for a single Property. - path: str, In case of files: The path where the file is stored. backrefs: list, TODO future """ + +<< << << < HEAD def __init__(self, record_id: Optional[int] = None, path: Optional[str] = None, record_type: Optional[str] = None, name: Optional[str] = None, properties: Optional[dict] = None, backrefs: Optional[list[Union[int, str]]] = None): if (record_id is None and path is None and name is None +== == === + def __init__(self, record_id: int=None, record_type: str=None, + name: str=None, properties: dict=None, + backrefs: list[Union[int, str]]=None): + if (record_id is None and name is None +>> >>>> > dev and (backrefs is None or len(backrefs) == 0) and (properties is None or len(properties) == 0)): - raise ValueError("There is no identifying information. You need to add a path or " - "properties or other identifying attributes.") + raise ValueError( + "There is no identifying information. You need to add " + "properties or other identifying attributes.") if properties is not None and 'name' in [k.lower() for k in properties.keys()]: raise ValueError("Please use the separete 'name' keyword instead of the properties " "dict for name") - self.record_id = record_id - self.path = path - self.record_type = record_type - self.name = name + self.record_id=record_id + self.record_type=record_type + self.name=name if name == "": - self.name = None - self.properties: dict = {} + self.name=None + self.properties: dict={} if properties is not None: - self.properties = properties - self.backrefs: list[Union[int, db.Entity]] = [] + self.properties=properties + self.backrefs: list[Union[int, db.Entity]]=[] if backrefs is not None: - self.backrefs = backrefs + self.backrefs=backrefs def get_representation(self) -> str: return sha256(Identifiable._create_hashable_string(self).encode('utf-8')).hexdigest() - @staticmethod + @ staticmethod def _value_representation(value) -> str: """returns the string representation of property values to be used in the hash function - The string is the path of a File Entity, the CaosDB ID or Python ID of other Entities - (Python Id only if there is no CaosDB ID) and the string representation of bool, float, int - and str. + The string is the LinkAhead ID in case of SyncNode objects (SyncNode objects must have an ID) + and the string representation of None, bool, float, int, datetime and str. 
""" if value is None: return "None" - elif isinstance(value, db.File): - return str(value.path) - elif isinstance(value, db.Entity): + elif isinstance(value, SyncNode): if value.id is not None: return str(value.id) else: - return "PyID=" + str(id(value)) + raise RuntimeError("Python Entity (SyncNode) without ID not allowed") elif isinstance(value, list): return "[" + ", ".join([Identifiable._value_representation(el) for el in value]) + "]" elif (isinstance(value, str) or isinstance(value, int) or isinstance(value, float) @@ -104,13 +112,13 @@ class Identifiable(): else: raise ValueError(f"Unknown datatype of the value: {value}") - @staticmethod + @ staticmethod def _create_hashable_string(identifiable: Identifiable) -> str: """ creates a string from the attributes of an identifiable that can be hashed String has the form "P<parent>N<name>R<reference-ids>a:5b:10" """ - rec_string = "P<{}>N<{}>R<{}>".format( + rec_string="P<{}>N<{}>R<{}>".format( identifiable.record_type, identifiable.name, [Identifiable._value_representation(el) for el in identifiable.backrefs]) @@ -121,27 +129,20 @@ class Identifiable(): return rec_string def __eq__(self, other) -> bool: - """ - Identifiables are equal if they belong to the same Record. Since ID and path are on their - own enough to identify the Record it is sufficient if those attributes are equal. - 1. both IDs are set (not None) -> equal if IDs are equal - 2. both paths are set (not None) -> equal if paths are equal - 3. equal if attribute representations are equal - """ + """ Identifiables are equal if they share the same ID or if the representation is equal """ if not isinstance(other, Identifiable): raise ValueError("Identifiable can only be compared to other Identifiable objects.") - elif self.record_id is not None and other.record_id is not None: + if self.record_id is not None and other.record_id is not None: return self.record_id == other.record_id - elif self.path is not None and other.path is not None: - return self.path == other.path elif self.get_representation() == other.get_representation(): return True else: return False def __repr__(self): - pstring = json.dumps(self.properties) + """ deterministic text representation of the identifiable """ + pstring=json.dumps({k: str(v) for k, v in self.properties.items()}) return (f"{self.__class__.__name__} for RT {self.record_type}: id={self.record_id}; " - f"name={self.name}\n\tpath={self.path}\n" + f"name={self.name}\n" f"\tproperties:\n{pstring}\n" f"\tbackrefs:\n{self.backrefs}") diff --git a/src/caoscrawler/identifiable_adapters.py b/src/caoscrawler/identifiable_adapters.py index ab0416edd36a7a307495f76541668517eb5ac460..e903190b00c83d32af28e878422d27dc0e8e7583 100644 --- a/src/caoscrawler/identifiable_adapters.py +++ b/src/caoscrawler/identifiable_adapters.py @@ -2,7 +2,7 @@ # encoding: utf-8 # # ** header v3.0 -# This file is a part of the CaosDB Project. +# This file is a part of the LinkAhead Project. 
# # Copyright (C) 2021-2022 Henrik tom Wörden # 2021-2022 Alexander Schlemmer @@ -29,7 +29,6 @@ import logging import warnings from abc import ABCMeta, abstractmethod from datetime import datetime -from functools import lru_cache from typing import Any import linkahead as db @@ -37,7 +36,9 @@ import yaml from linkahead.cached import cached_get_entity_by, cached_query from linkahead.utils.escape import escape_squoted_text +from .exceptions import MissingIdentifyingProperty, MissingReferencingEntityError from .identifiable import Identifiable +from .sync_node import SyncNode from .utils import has_parent logger = logging.getLogger(__name__) @@ -51,7 +52,7 @@ def get_children_of_rt(rtname): def convert_value(value: Any) -> str: - """ Return a string representation of the value suitable for the search query. + """Return a string representation of the value suitable for the search query. This is for search queries looking for the identified record. @@ -82,27 +83,27 @@ def convert_value(value: Any) -> str: class IdentifiableAdapter(metaclass=ABCMeta): """Base class for identifiable adapters. -Some terms: + Some terms: -- A *registered identifiable* defines an identifiable template, for example by specifying: - - Parent record types - - Properties - - ``is_referenced_by`` statements -- An *identifiable* belongs to a concrete record. It consists of identifying attributes which "fill - in" the *registered identifiable*. In code, it can be represented as a Record based on the - *registered identifiable* with all the values filled in. -- An *identified record* is the result of retrieving a record from the database, based on the - *identifiable* (and its values). + - A *registered identifiable* defines an identifiable template, for example by specifying: + - Parent record types + - Properties + - ``is_referenced_by`` statements + - An *identifiable* belongs to a concrete record. It consists of identifying attributes which "fill + in" the *registered identifiable*. In code, it can be represented as a Record based on the + *registered identifiable* with all the values filled in. + - An *identified record* is the result of retrieving a record from the database, based on the + *identifiable* (and its values). -General question to clarify: + General question to clarify: -- Do we want to support multiple identifiables per RecordType? -- Current implementation supports only one identifiable per RecordType. + - Do we want to support multiple identifiables per RecordType? + - Current implementation supports only one identifiable per RecordType. -The list of referenced by statements is currently not implemented. + The list of referenced by statements is currently not implemented. -The IdentifiableAdapter can be used to retrieve the three above mentioned objects (registered -identifiabel, identifiable and identified record) for a Record. + The IdentifiableAdapter can be used to retrieve the three above mentioned objects (registered + identifiabel, identifiable and identified record) for a Record. """ @@ -127,7 +128,7 @@ identifiabel, identifiable and identified record) for a Record. eid = ref if isinstance(ref, db.Entity): eid = ref.id - query_string += (" WHICH IS REFERENCED BY " + str(eid) + " AND") + query_string += " WHICH IS REFERENCED BY " + str(eid) + " AND" query_string += " WITH " @@ -136,22 +137,81 @@ identifiabel, identifiable and identified record) for a Record. 
if len(ident.properties) > 0: query_string += " AND " - query_string += IdentifiableAdapter.create_property_query(ident, startswith=startswith) + query_string += IdentifiableAdapter.create_property_query( + ident, startswith=startswith + ) # TODO Can these cases happen at all with the current code? if query_string.endswith(" AND WITH "): - query_string = query_string[:-len(" AND WITH ")] + query_string = query_string[: -len(" AND WITH ")] if query_string.endswith(" AND "): - query_string = query_string[:-len(" AND ")] + query_string = query_string[: -len(" AND ")] return query_string + def all_identifying_properties_exist( + self, node: SyncNode, raise_exception: bool = True + ): + """checks whether all identifying properties exist and raises an error if + that's not the case. It furthermore raises an error if "name" is part of + the identifiable, but the node does not have a name. + + If raise_exception is False, the function returns False instead of raising an error. + + Backreferences are not checked. + + Returns True if all identifying properties exist. + + Last review by Alexander Schlemmer on 2024-05-24. + """ + if node.registered_identifiable is None: + if raise_exception: + raise RuntimeError("no registered_identifiable") + else: + return False + for prop in node.registered_identifiable.properties: + if prop.name.lower() == "is_referenced_by": + continue + if prop.name.lower() == "name": + if node.name is None: + if raise_exception: + i = MissingIdentifyingProperty("The node has no name.") + i.prop = "name" + raise i + else: + return False + else: + continue + + # multiple occurances are ok here. We deal with that when actually creating an + # identifiable (IDs of referenced Entities might need to get resolved first). + if ( + len( + [ + el + for el in node.properties + if el.name.lower() == prop.name.lower() + ] + ) + == 0 + ): + if raise_exception: + i = MissingIdentifyingProperty( + f"The property {prop.name} is missing." + ) + i.prop = prop.name + raise i + else: + return False + + return True + @staticmethod def __create_pov_snippet(pname: str, pvalue, startswith: bool = False): """Return something like ``'name'='some value'`` or ``'name' LIKE 'some*'``. -If ``startswith`` is True, the value of strings will be cut off at 200 characters and a ``LIKE`` -operator will be used to find entities matching at the beginning. -""" + If ``startswith`` is True, the value of strings will be cut off at 200 characters and a ``LIKE`` + operator will be used to find entities matching at the beginning. + """ if startswith and isinstance(pvalue, str) and len(pvalue) > 200: operator_value_str = f" LIKE '{escape_squoted_text(pvalue[:200])}*'" else: @@ -163,14 +223,14 @@ operator will be used to find entities matching at the beginning. def create_property_query(entity: Identifiable, startswith: bool = False): """Create a POV query part with the entity's properties. -Parameters ----------- + Parameters + ---------- -entity: Identifiable - The Identifiable whose properties shall be used. + entity: Identifiable + The Identifiable whose properties shall be used. -startswith: bool, optional - If True, check string typed properties against the first 200 characters only. Default is False. + startswith: bool, optional + If True, check string typed properties against the first 200 characters only. Default is False. 
""" query_string = "" pov = IdentifiableAdapter.__create_pov_snippet # Shortcut @@ -197,7 +257,7 @@ startswith: bool, optional return query_string[:-4] @abstractmethod - def get_registered_identifiable(self, record: db.Record): + def get_registered_identifiable(self, record: db.Entity): """ Check whether an identifiable is registered for this record and return its definition. If there is no identifiable registered, return None. @@ -210,42 +270,21 @@ startswith: bool, optional @abstractmethod def get_file(self, identifiable: db.File): - warnings.warn(DeprecationWarning("This function is deprecated. Please do not use it.")) + warnings.warn( + DeprecationWarning("This function is deprecated. Please do not use it.") + ) """ Retrieve the file object for a (File) identifiable. """ pass - @staticmethod - def get_identifying_referencing_entities(referencing_entities, registered_identifiable): - refs = [] - for prop in registered_identifiable.properties: - if prop.name.lower() != "is_referenced_by": - continue - for looking_for_rt in prop.value: - found = False - if looking_for_rt == "*": - for val in referencing_entities.values(): - if len(val) > 0: - found = True - refs.extend(val) - else: - rt_and_children = get_children_of_rt(looking_for_rt) - for rtname in rt_and_children: - if (rtname in referencing_entities): - refs.extend(referencing_entities[rtname]) - found = True - if not found: - raise RuntimeError( - f"Could not find referencing entities of type(s): {prop.value}\n" - f"for registered identifiable:\n{registered_identifiable}\n" - f"There were {len(referencing_entities)} referencing entities to choose from.\n" - f"This error can also occur in case of merge conflicts in the referencing entities." - ) - return refs - @staticmethod def get_identifying_referenced_entities(record, registered_identifiable): + """Create a list of all entities that are referenced by record + and that are used as identying properties of the identifiable. + + Last review by Alexander Schlemmer on 2024-05-29. + """ refs = [] for prop in registered_identifiable.properties: pname = prop.name.lower() @@ -263,83 +302,101 @@ startswith: bool, optional refs.append(val) return refs - def get_identifiable(self, record: db.Record, referencing_entities=None): + def get_identifiable(self, se: SyncNode, identifiable_backrefs: set[SyncNode]) -> Identifiable: """ - Retrieve the registered identifiable and fill the property values to create an - identifiable. + Take the registered identifiable of given SyncNode ``se`` and fill the property values to + create an identifiable. Args: - record: the record for which the Identifiable shall be created. - referencing_entities: a dictionary (Type: dict[str, list[db.Entity]]), that - allows to look up entities with a certain RecordType, that reference ``record`` + se: the SyncNode for which the Identifiable shall be created. + identifiable_backrefs: a set (Type: set[SyncNode]), that contains SyncNodes + with a certain RecordType, that reference ``se`` Returns: Identifiable, the identifiable for record. - """ - - registered_identifiable = self.get_registered_identifiable(record) - if referencing_entities is None: - referencing_entities = {} + Last review by Alexander Schlemmer on 2024-05-29. 
+ """ property_name_list_A = [] - property_name_list_B = [] identifiable_props = {} - identifiable_backrefs = [] - name_is_identifying_property = False - - if registered_identifiable is not None: - identifiable_backrefs = self.get_identifying_referencing_entities( - referencing_entities, registered_identifiable) - # fill the values: - for prop in registered_identifiable.properties: - if prop.name == "name": - # The name can be an identifiable, but it isn't a property - name_is_identifying_property = True - continue - # problem: what happens with multi properties? - # case A: in the registered identifiable - # case B: in the identifiable - - # treated above - if prop.name.lower() == "is_referenced_by": - continue + name = None + + if se.registered_identifiable is None: + raise ValueError("no registered_identifiable") + + # fill the values: + for prop in se.registered_identifiable.properties: + # TDOO: + # If there are multiproperties in the registered_identifiable, then only the LAST is + # taken into account (later properties overwrite previous one in the dict below). + if prop.name == "name": + name = se.name + continue - record_prop = record.get_property(prop.name) - if record_prop is None: - # TODO: how to handle missing values in identifiables - # raise an exception? - # TODO: is this the appropriate error? - raise NotImplementedError( - f"The following record is missing an identifying property:\n" - f"RECORD\n{record}\nIdentifying PROPERTY\n{prop.name}" + if prop.name.lower() == "is_referenced_by": + for el in identifiable_backrefs: + if not isinstance(el, SyncNode): + raise ValueError("Elements of `identifiable_backrefs` must be SyncNodes") + if len(identifiable_backrefs) == 0: + raise MissingReferencingEntityError( + f"Could not find referencing entities of type(s): {prop.value}\n" + f"for registered identifiable:\n{se.registered_identifiable}\n" + f"There were {len(identifiable_backrefs)} referencing entities to " + "choose from.\n" + f"This error can also occur in case of merge conflicts in the referencing" + " entities." ) - identifiable_props[record_prop.name] = record_prop.value - property_name_list_A.append(prop.name) - - # check for multi properties in the record: - for prop in property_name_list_A: - property_name_list_B.append(prop) - if (len(set(property_name_list_B)) != len(property_name_list_B) or len( - set(property_name_list_A)) != len(property_name_list_A)): - raise RuntimeError( - "Multi properties used in identifiables could cause unpredictable results and " - "are not allowed. You might want to consider a Property with a list as value.") + elif len([e.id for e in identifiable_backrefs if el.id is None]) > 0: + raise RuntimeError("Referencing entity has no id") + # At this point we know that there is at least one referencing SyncNode + # with an ID. 
We do not need to set any property value (the reference will be used + # in the backrefs argument below) and can thus continue with the next identifying + # property + continue + + options = [p.value for p in se.properties if p.name.lower() == prop.name.lower()] + if len(options) == 0: + raise MissingIdentifyingProperty( + f"The following record is missing an identifying property:\n" + f"RECORD\n{se}\nIdentifying PROPERTY\n{prop.name}" + ) + for ii, el in enumerate(options): + if isinstance(el, SyncNode): + options[ii] = el.id + if el.id is None: + raise RuntimeError( + "Reference to unchecked in identifiable:\n" + f"{prop.name}:\n{el}" + ) + else: + options[ii] = el + if not all([f == options[0] for f in options]): + raise RuntimeError("differing prop values ") + + identifiable_props[prop.name] = options[0] + property_name_list_A.append(prop.name) + + # check for multi properties in the record: + if len(set(property_name_list_A)) != len(property_name_list_A): + raise RuntimeError( + "Multi properties used in identifiables could cause unpredictable results and " + "are not allowed. You might want to consider a Property with a list as value." + ) # use the RecordType of the registered Identifiable if it exists # We do not use parents of Record because it might have multiple try: return Identifiable( - record_id=record.id, - record_type=(registered_identifiable.parents[0].name - if registered_identifiable else None), - name=record.name if name_is_identifying_property else None, + record_id=se.id, + record_type=se.registered_identifiable.parents[0].name, + name=name, properties=identifiable_props, - path=record.path, - backrefs=identifiable_backrefs + backrefs=[e.id for e in identifiable_backrefs], ) - except Exception: - logger.error(f"Error while creating identifiable for this record:\n{record}") + except Exception as exc: + logger.error(exc) + logger.error(f"Error while creating identifiable for this record:\n{se}") raise @abstractmethod @@ -354,7 +411,9 @@ startswith: bool, optional """ pass - def retrieve_identified_record_for_record(self, record: db.Record, referencing_entities=None): + def retrieve_identified_record_for_record( + self, record: db.Record, referencing_entities=None + ): """ This function combines all functionality of the IdentifierAdapter by returning the identifiable after having checked for an appropriate @@ -368,10 +427,36 @@ startswith: bool, optional if record.id is not None: return cached_get_entity_by(eid=record.id) - identifiable = self.get_identifiable(record, referencing_entities=referencing_entities) + identifiable = self.get_identifiable( + record, referencing_entities=referencing_entities + ) return self.retrieve_identified_record_for_identifiable(identifiable) + @staticmethod + def referencing_entity_has_appropriate_type(parents, register_identifiable): + """returns true if one of the parents is listed by the 'is_referenced_by' property + + This function also returns True if 'is_referenced_by' contains the wildcard '*'. + + Last review by Alexander Schlemmer on 2024-05-29. 
+ """ + if register_identifiable.get_property("is_referenced_by") is None: + return False + if register_identifiable.get_property("is_referenced_by").value is None: + return False + + appropriate_types = [] + for rt in register_identifiable.get_property("is_referenced_by").value: + appropriate_types.extend(get_children_of_rt(rt)) + appropriate_types = [el.lower() for el in appropriate_types] + if "*" in appropriate_types: + return True + for parent in parents: + if parent.name.lower() in appropriate_types: + return True + return False + class LocalStorageIdentifiableAdapter(IdentifiableAdapter): """ @@ -379,8 +464,11 @@ class LocalStorageIdentifiableAdapter(IdentifiableAdapter): """ def __init__(self): - warnings.warn(DeprecationWarning( - "This class is deprecated. Please use the CaosDBIdentifiableAdapter.")) + warnings.warn( + DeprecationWarning( + "This class is deprecated. Please use the CaosDBIdentifiableAdapter." + ) + ) self._registered_identifiables = dict() self._records = [] @@ -395,7 +483,9 @@ class LocalStorageIdentifiableAdapter(IdentifiableAdapter): Just look in records for a file with the same path. """ candidates = [] - warnings.warn(DeprecationWarning("This function is deprecated. Please do not use it.")) + warnings.warn( + DeprecationWarning("This function is deprecated. Please do not use it.") + ) for record in self._records: if record.role == "File" and record.path == identifiable.path: candidates.append(record) @@ -407,15 +497,18 @@ class LocalStorageIdentifiableAdapter(IdentifiableAdapter): def store_state(self, filename): with open(filename, "w") as f: - f.write(db.common.utils.xml2str( - db.Container().extend(self._records).to_xml())) + f.write( + db.common.utils.xml2str(db.Container().extend(self._records).to_xml()) + ) def restore_state(self, filename): with open(filename, "r") as f: self._records = db.Container().from_xml(f.read()) # TODO: move to super class? - def is_identifiable_for_record(self, registered_identifiable: db.RecordType, record: db.Record): + def is_identifiable_for_record( + self, registered_identifiable: db.RecordType, record: db.Record + ): """ Check whether this registered_identifiable is an identifiable for the record. @@ -426,8 +519,7 @@ class LocalStorageIdentifiableAdapter(IdentifiableAdapter): Return True in that case and False otherwise. """ if len(registered_identifiable.parents) != 1: - raise RuntimeError( - "Multiple parents for identifiables not supported.") + raise RuntimeError("Multiple parents for identifiables not supported.") if not has_parent(record, registered_identifiable.parents[0].name): return False @@ -437,14 +529,13 @@ class LocalStorageIdentifiableAdapter(IdentifiableAdapter): return False return True - def get_registered_identifiable(self, record: db.Record): + def get_registered_identifiable(self, record: db.Entity): identifiable_candidates = [] for _, definition in self._registered_identifiables.items(): if self.is_identifiable_for_record(definition, record): identifiable_candidates.append(definition) if len(identifiable_candidates) > 1: - raise RuntimeError( - "Multiple candidates for an identifiable found.") + raise RuntimeError("Multiple candidates for an identifiable found.") if len(identifiable_candidates) == 0: return None return identifiable_candidates[0] @@ -459,8 +550,9 @@ class LocalStorageIdentifiableAdapter(IdentifiableAdapter): record is the record from the local database to check against. identifiable is the record that was created during the crawler run. 
""" - if (identifiable.record_type is not None - and not has_parent(record, identifiable.record_type)): + if identifiable.record_type is not None and not has_parent( + record, identifiable.record_type + ): return False for propname, propvalue in identifiable.properties.items(): prop_record = record.get_property(propname) @@ -489,21 +581,26 @@ class LocalStorageIdentifiableAdapter(IdentifiableAdapter): candidates.append(record) if len(candidates) > 1: raise RuntimeError( - f"Identifiable was not defined unambigiously. Possible candidates are {candidates}") + f"Identifiable was not defined unambigiously. Possible candidates are {candidates}" + ) if len(candidates) == 0: return None return candidates[0] def resolve_reference(self, value: db.Record): if self.get_registered_identifiable(value) is None: - raise NotImplementedError("Non-identifiable references cannot" - " be used as properties in identifiables.") + raise NotImplementedError( + "Non-identifiable references cannot" + " be used as properties in identifiables." + ) # TODO: just resolve the entity value_identifiable = self.retrieve_identified_record_for_record(value) if value_identifiable is None: - raise RuntimeError("The identifiable which is used as property" - " here has to be inserted first.") + raise RuntimeError( + "The identifiable which is used as property" + " here has to be inserted first." + ) if value_identifiable.id is None: raise RuntimeError("The entity has not been assigned an ID.") @@ -523,7 +620,7 @@ class CaosDBIdentifiableAdapter(IdentifiableAdapter): def load_from_yaml_definition(self, path: str): """Load identifiables defined in a yaml file""" - with open(path, 'r', encoding="utf-8") as yaml_f: + with open(path, "r", encoding="utf-8") as yaml_f: identifiable_data = yaml.safe_load(yaml_f) for key, value in identifiable_data.items(): @@ -543,7 +640,9 @@ class CaosDBIdentifiableAdapter(IdentifiableAdapter): self._registered_identifiables[name] = definition def get_file(self, identifiable: Identifiable): - warnings.warn(DeprecationWarning("This function is deprecated. Please do not use it.")) + warnings.warn( + DeprecationWarning("This function is deprecated. Please do not use it.") + ) # TODO is this needed for Identifiable? # or can we get rid of this function? if isinstance(identifiable, db.Entity): @@ -557,7 +656,7 @@ class CaosDBIdentifiableAdapter(IdentifiableAdapter): return None return candidates[0] - def get_registered_identifiable(self, record: db.Record): + def get_registered_identifiable(self, record: db.Entity): """ returns the registered identifiable for the given Record @@ -585,9 +684,13 @@ class CaosDBIdentifiableAdapter(IdentifiableAdapter): query_string = self.create_query_for_identifiable(identifiable) try: candidates = cached_query(query_string) - except db.exceptions.HTTPServerError as err: - query_string = self.create_query_for_identifiable(identifiable, startswith=True) - candidates = cached_query(query_string).copy() # Copy against cache poisoning + except db.exceptions.HTTPServerError: + query_string = self.create_query_for_identifiable( + identifiable, startswith=True + ) + candidates = cached_query( + query_string + ).copy() # Copy against cache poisoning # Test if the candidates really match all properties for pname, pvalue in identifiable.properties.items(): @@ -606,7 +709,8 @@ class CaosDBIdentifiableAdapter(IdentifiableAdapter): raise RuntimeError( f"Identifiable was not defined unambiguously.\n{query_string}\nReturned the " f"following {candidates}." 
- f"Identifiable:\n{identifiable.record_type}{identifiable.properties}") + f"Identifiable:\n{identifiable.record_type}{identifiable.properties}" + ) if len(candidates) == 0: return None return candidates[0] diff --git a/src/caoscrawler/scanner.py b/src/caoscrawler/scanner.py index 248cb185c4da99903dba81a3e0a4edfeab17127d..9f8f5e40beb729d73151bad38f3e390a4a8cecb4 100644 --- a/src/caoscrawler/scanner.py +++ b/src/caoscrawler/scanner.py @@ -361,16 +361,19 @@ def scanner(items: list[StructureElement], debug_tree.debug_metadata["usage"][str(element)].add( "/".join(converters_path + [converter.name])) mod_info = debug_tree.debug_metadata["provenance"] - for record_name, prop_name in keys_modified: - # TODO: check - internal_id = record_store_copy.get_internal_id( - record_name) - record_identifier = record_name + \ - "_" + str(internal_id) - converter.metadata["usage"].add(record_identifier) - mod_info[record_identifier][prop_name] = ( - structure_elements_path + [element.get_name()], - converters_path + [converter.name]) + # TODO: actually keys_modified must not be None. create_records should + # always return a list. + if keys_modified is not None: + for record_name, prop_name in keys_modified: + # TODO: check + internal_id = record_store_copy.get_internal_id( + record_name) + record_identifier = record_name + \ + "_" + str(internal_id) + converter.metadata["usage"].add(record_identifier) + mod_info[record_identifier][prop_name] = ( + structure_elements_path + [element.get_name()], + converters_path + [converter.name]) scanner(children, converter.converters, general_store_copy, record_store_copy, diff --git a/src/caoscrawler/sync_graph.py b/src/caoscrawler/sync_graph.py new file mode 100644 index 0000000000000000000000000000000000000000..9c021a10f35e95ca56d45151b8d064ec905993ec --- /dev/null +++ b/src/caoscrawler/sync_graph.py @@ -0,0 +1,719 @@ +#!/usr/bin/env python3 +# encoding: utf-8 +# +# This file is a part of the LinkAhead Project. +# +# Copyright (C) 2024 Henrik tom Wörden +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <https://www.gnu.org/licenses/>. +# + +""" +A data model class for the graph of entities that shall be created during synchronization of the +crawler. +""" + +from __future__ import annotations + +import logging +from typing import Any, Optional, Union, Callable + +import linkahead as db +from linkahead.cached import cached_get_entity_by +from linkahead.exceptions import EmptyUniqueQueryError + +from .identifiable_adapters import IdentifiableAdapter +from .identifiable import Identifiable +from .sync_node import SyncNode, TempID + +import re + +logger = logging.getLogger(__name__) + + +def _set_each_scalar_value( + node: SyncNode, condition: Callable[[Any], bool], value: Any +): + """helper function that conditionally replaces each value element of each property of a node + + If the property value is a list, the replacement is done for each list entry. 
+ The replacement is only performed if the condition that + is provided is fulfilled, i.e. the callable ``condition`` returns True. The callable + ``condition`` must take the property value (or list element) as the sole argument. + + Args: + node (SyncNode): The node which provides the properties (and their values) to operate on. + condition (Callable): A function with one argument which is interpreted as a condition: + Only if it returns True for the property value, the action is + executed. + value (Callable): A function returning a new value that is set as the property value. This + function receives the old value as the single argument. + + Last review by Alexander Schlemmer on 2024-05-24. + """ + for p in node.properties: + if isinstance(p.value, list): + for ii, el in enumerate(p.value): + if condition(el): + p.value[ii] = value(el) + elif condition(p.value): + p.value = value(p.value) + + +class SyncGraph: + """ + A data model class for the graph of entities that shall be created during synchronization of + the crawler. + + The SyncGraph combines nodes in the graph based on their identity in order to create a graph of + objects that can either be inserted or updated in(to) the remote server. This combination of + SyncNodes happens during initialization and later on when the ID of SyncNodes is set. + + When the SyncGraph is initialized, the properties of given entities are scanned and used to + create multiple reference maps that track how SyncNodes reference each other. + These maps are kept up to date when SyncNodes are merged because they are identified with each + other. During initialization, SyncNodes are first merged based on their ID, path or + identifiable. + + When additional information is added to the graph by setting the ID of a node + (via `set_id_of_node`) then the graph is updated accordingly: + - if this information implies that the node is equivalent to another node (e.g. has same ID), + then they are merged + - if knowing that one node does not exist in the remote server, then this might imply that some + other node also does not exist if its identity relies on the latter. + - The new ID might make it possible to create the identifiables of connected nodes and thus + might trigger further merging of nodes based on the new identifiables. + + A SyncGraph should only be manipulated via one function: + - set_id_of_node: a positive integer means the Entity exists, None means it is missing + TODO what about String IDs + + The SyncGraph can be converted back to lists of entities which allow to perform the desired + inserts and updates. + + Usage: + - Initialize the Graph with a list of entities. Those will be converted to the SyncNodes of the + graph. + - SyncNodes that can be merged are automatically merged and SyncNodes where the existence can + be determined are automatically removed from the list of unchecked SyncNodes: + graph.unchecked. + - You manipulate the graph by setting the ID of a SyncNode (either to a valid ID or to None). + For example, you can check whether a SyncNode has an identifiable and then query the remote + server and use the result to set the ID. + - After each manipulation, the graph updates accordingly (see above) + - Ideally, the unchecked list is empty after some manipulation. + - You can export a list of entities to be inserted and one of entities to be updated with + export_record_lists. + + Last review by Alexander Schlemmer on 2024-05-24. 
+ """ + + # General implementation remark: + # There are three cases where an update of one SyncNode can affect other nodes: + # - mark existing (add identifiables) + # - mark missing (add identifiables and add (negative) IDs) + # - merge (add identifiables) + # + # We cannot get an infinite recursion where one update triggers another update and so on + # because updates are conditional: + # Setting an ID removes the node (immediately) from the unchecked list and it is only tried to + # set an ID in _mark_missing if a node is in the uncheck list. Thus, setting the ID once + # prevents future attempts to set the ID of the same node. + # Also, setting an identifiable is only done when needed, i.e. there is no identifiable. + # Note, that when ever one node is changed, we check all dependent nodes (see usage of + # `_get_nodes_whose_identity_relies_on`) whether something should be updated. Thus, we cannot + # miss a necessary update. + def __init__( + self, entities: list[db.Entity], identifiableAdapter: IdentifiableAdapter + ): + self.identifiableAdapter = identifiableAdapter + # A dictionary allowing for quick lookup of sync nodes using their (possibly negative) IDs. + # This dictionary is initially set using _mark_entities_with_path_or_id and later updated + # using set_id_of_node or during merges of nodes. + self._id_look_up: dict[Union[int, TempID, str], SyncNode] = {} + # Similar as above for looking up nodes using paths + self._path_look_up: dict[str, SyncNode] = {} + # Similar as above for looking up nodes using identifiables. This dictionary uses the text + # representation generated by get_representation method of Identifiable as keys. + self._identifiable_look_up: dict[str, SyncNode] = {} + # look up for the nodes that were marked as being missing (on the remote server) + self._missing: dict[int, SyncNode] = {} + # same for existing + self._existing: dict[int, SyncNode] = {} + # entities that are missing get negative IDs to allow identifiable creation + self._remote_missing_counter = -1 + + self.nodes: list[SyncNode] = [] + self._initialize_nodes(entities) # list of all SemanticEntities + # list all SemanticEntities that have not yet been checked + self.unchecked = list(self.nodes) + + # initialize reference mappings (see _create_reference_mapping) + ( + self.forward_references, # id(node) -> full set of nodes referenced by the given node + self.backward_references, # id(node) -> full set of nodes referencing the given node + # as above, subset where the reference properties are part of identifiables + self.forward_references_id_props, + self.backward_references_id_props, + # as above, subset where references are part of identifiables due to "referenced_by" + self.forward_references_backref, + self.backward_references_backref, + ) = self._create_reference_mapping(self.nodes) + + # remove entities with path or ID from unchecked list + self._mark_entities_with_path_or_id() + + # add identifiables where possible + for node in list(self.nodes): + if self._identifiable_is_needed(node): + self._set_identifiable_of_node(node) + + # everything in unchecked neither does have an ID nor a path. + # Thus, it must be possible to create an + # identifiable which is checked using the following function: + for node in self.unchecked: + self.identifiableAdapter.all_identifying_properties_exist(node) + + def set_id_of_node(self, node: SyncNode, node_id: Optional[str] = None): + """sets the ID attribute of the given SyncNode to node_id. 
+ + If node_id is None, a negative ID will be + given indicating that the node does not exist on the remote server. + Furthermore it will be marked as missing using _mark_missing. + + Last review by Alexander Schlemmer on 2024-05-24. + """ + if node.id is not None: + raise RuntimeError( + "Cannot update ID.\n" + f"It already is {node.id} and shall be set to {node_id}." + ) + if node_id is None: + node_id = TempID(self._get_new_id()) + node.id = node_id + if node_id in self._id_look_up: + self._merge_into(node, self._id_look_up[node.id]) + else: + self._id_look_up[node.id] = node + if isinstance(node.id, TempID): + self._mark_missing(node) + else: + self._mark_existing(node) + + def export_record_lists(self): + """exports the SyncGraph in form of db.Entities + + All nodes are converted to db.Entity objects and reference values that are SyncNodes are + replaced by their corresponding (newly created) db.Entity objects. + + Since the result is returned in form of two lists, one with Entities that have a valid ID + one with those that haven't, an error is raised if there are any SyncNodes without an + (possibly negative) ID. + + Last review by Alexander Schlemmer on 2024-05-24. + """ + # TODO reactivate once the implementation is appropriate + # if len(self.unchecked) > 1: + # self.unchecked_contains_circular_dependency() + + for el in self.nodes: + if el.id is None: + raise RuntimeError("Exporting unchecked entities is not supported") + + entities = [] + node_map = {} + for el in self.nodes: + entities.append(el.export_entity()) + node_map[id(el)] = entities[-1] + + for ent in entities: + _set_each_scalar_value( + ent, + condition=lambda val: isinstance(val, SyncNode), + value=lambda val: node_map[id(val)], + ) + + missing = [el for el in entities if el.id < 0] + existing = [el for el in entities if el.id > 0] + # remove negative IDs + for el in missing: + el.id = None + + return (missing, existing) + + def _identity_relies_on_unchecked_entity(self, node: SyncNode): + """ + If a record for which it could not yet be verified whether it exists in LA or not is part + of the identifying properties, this returns True, otherwise False + + Last review by Alexander Schlemmer on 2024-05-27. + """ + + return any( + [ + id(ent) not in self._missing and id(ent) not in self._existing + for ent in self.forward_references_id_props[id(node)] + ] + + [ + id(ent) not in self._missing and id(ent) not in self._existing + for ent in self.backward_references_backref[id(node)] + ] + ) + + def unchecked_contains_circular_dependency(self): + """ + Detects whether there are circular references in the given entity list and returns a list + where the entities are ordered according to the chain of references (and only the entities + contained in the circle are included. Returns None if no circular dependency is found. + + TODO: for the sake of detecting problems for split_into_inserts_and_updates we should only + consider references that are identifying properties. 
+ """ + raise NotImplementedError("This function is not yet properly implemented") + # TODO if the first element is not part of the circle, then + # this will not work + # We must created a better implementation (see also TODO in docstring) + circle = [self.unchecked[0]] + closed = False + while not closed: + added_to_circle = False + for referenced in self.forward_references[id(circle[-1])]: + if referenced in self.unchecked: + if referenced in circle: + closed = True + circle.append(referenced) + added_to_circle = True + if not added_to_circle: + return None + return circle + + def get_equivalent(self, entity: SyncNode) -> Optional[SyncNode]: + """ + Return an equivalent SyncNode. + + Equivalent means that ID, path or identifiable are the same. + If a new information was added to the given SyncNode (e.g. the ID), it might be possible + then to identify an equivalent node (i.e. one with the same ID in this example). + There might be more than one equivalent node in the graph. However, simply the first that + is found is being returned. (When an equivalent node is found, the given node is + typically merged, into the one that was found and after the merge the graph is again + checked for equivalent nodes.) + + Returns None if no equivalent node is found. + + Last review by Alexander Schlemmer on 2024-05-28. + """ + if entity.id is not None and entity.id in self._id_look_up: + candidate = self._id_look_up[entity.id] + if candidate is not entity: + return candidate + if entity.path is not None and entity.path in self._path_look_up: + candidate = self._path_look_up[entity.path] + if candidate is not entity: + return candidate + if ( + entity.identifiable is not None + and entity.identifiable.get_representation() in self._identifiable_look_up + ): + candidate = self._identifiable_look_up[ + entity.identifiable.get_representation() + ] + if candidate is not entity: + return candidate + return None + + def _get_new_id(self): + """returns the next unused temporary ID + + Last review by Alexander Schlemmer on 2024-05-24. + """ + self._remote_missing_counter -= 1 + return self._remote_missing_counter + + def _set_identifiable_of_node( + self, node: SyncNode, identifiable: Optional[Identifiable] = None + ): + """sets the identifiable and checks whether an equivalent node can be found with that new + information. If an equivalent node is found, 'node' is merged into that node. + + if no identifiable is given, the identifiable is retrieved from the identifiable adapter + + Raises a ValueError if the equivalent node found does not have an identifiable. + Raises a RuntimeError if there is no equivalent node found and + the (unique) string representation of the identifiable of node is already contained in + the identifiable_look_up. + + Last review by Alexander Schlemmer on 2024-05-29. + """ + if identifiable is None: + self.identifiableAdapter.all_identifying_properties_exist(node) + identifiable = self.identifiableAdapter.get_identifiable( + node, self.backward_references_backref[id(node)] + ) + node.identifiable = identifiable + equivalent_se = self.get_equivalent(node) + if equivalent_se is not None: + self._merge_into(node, equivalent_se) + else: + if node.identifiable.get_representation() in self._identifiable_look_up: + raise RuntimeError("Identifiable is already in the look up") + self._identifiable_look_up[node.identifiable.get_representation()] = node + + @staticmethod + def _sanity_check(entities: list[db.Entity]): + """ + Checks whether each record in entities has at least one parent. 
+ + Last review by Alexander Schlemmer on 2024-05-24. + """ + for ent in entities: + if ent.role == "Record" and len(ent.parents) == 0: + raise ValueError(f"Records must have a parent.\n{ent}") + if isinstance(ent.id, int) and ent.id < 0: + raise ValueError( + f"Records must not have negative integers as IDs.\n{ent}" + ) + if isinstance(ent.id, str) and re.match(r"^-\d+$", ent.id): + raise ValueError( + f"Records must not have negative integers as IDs.\n{ent}" + ) + + def _get_nodes_whose_identity_relies_on(self, node: SyncNode): + """returns a set of nodes that reference the given node as identifying property or are + referenced by the given node and the parent of the given node is listed as + "is_referenced_by" + + Last review by Alexander Schlemmer on 2024-05-24. + """ + return self.backward_references_id_props[id(node)].union( + self.forward_references_backref[id(node)] + ) + + @staticmethod + def _create_flat_list( + ent_list: list[db.Entity], flat: Optional[list[db.Entity]] = None + ): + """ + Recursively adds entities and all their properties contained in ent_list to + the output list flat. + + TODO: This function will be moved to pylib as it is also needed by the + high level API. + + Last review by Alexander Schlemmer on 2024-05-29. + """ + # Note: A set would be useful here, but we do not want a random order. + if flat is None: + flat = list() + for el in ent_list: + if el not in flat: + flat.append(el) + for ent in ent_list: + for p in ent.properties: + # For lists append each element that is of type Entity to flat: + if isinstance(p.value, list): + for el in p.value: + if isinstance(el, db.Entity): + if el not in flat: + flat.append(el) + SyncGraph._create_flat_list([el], flat) + elif isinstance(p.value, db.Entity): + if p.value not in flat: + flat.append(p.value) + SyncGraph._create_flat_list([p.value], flat) + return flat + + @staticmethod + def _create_reference_mapping(flat: list[SyncNode]): + """ + Create six dictionaries that describe references among SyncNodes. All dictionaries use the + Python ID of SyncNodes as keys. + There is always one dictionary to describe the direction of the reference, i.e. + map[id(node)] -> other where other is a set of SyncNodes that are being referenced by node. + And then there is always one dictionary for the inverse direction. The two dictionaries are + named "forward_" and "backward_", respectively. + + Then there are three kinds of maps being generated: One includes all references + ("_references"), one includes references that are values of identifying properties + ("_references_id_props") and one includes references that are relevant for identifying + backreferences/"is_referenced_by" ("_references_backref"). I.e. the two latter are subesets + of the former reference map. + + Arguments: + ---------- + flat: list[SyncNode] + all SyncNodes that span the graph for which the reference map shall be created + + Last review by Alexander Schlemmer on 2024-05-29. + """ + # TODO we need to treat children of RecordTypes somehow. 
+ forward_references: dict[int, set[SyncNode]] = {} + backward_references: dict[int, set[SyncNode]] = {} + forward_references_id_props: dict[int, set[SyncNode]] = {} + backward_references_id_props: dict[int, set[SyncNode]] = {} + forward_references_backref: dict[int, set[SyncNode]] = {} + backward_references_backref: dict[int, set[SyncNode]] = {} + + # initialize with empty lists/dict + for node in flat: + forward_references[id(node)] = set() + backward_references[id(node)] = set() + forward_references_id_props[id(node)] = set() + backward_references_id_props[id(node)] = set() + forward_references_backref[id(node)] = set() + backward_references_backref[id(node)] = set() + for node in flat: + for p in node.properties: + val = p.value + if not isinstance(val, list): + val = [val] + for v in val: + if isinstance(v, SyncNode): + forward_references[id(node)].add(v) + backward_references[id(v)].add(node) + if ( + node.registered_identifiable is not None + and len( + [ + el.name + for el in node.registered_identifiable.properties + if el.name == p.name + ] + ) + > 0 + ): + forward_references_id_props[id(node)].add(v) + backward_references_id_props[id(v)].add(node) + if ( + v.registered_identifiable is not None + and IdentifiableAdapter.referencing_entity_has_appropriate_type( + node.parents, v.registered_identifiable + ) + ): + forward_references_backref[id(node)].add(v) + backward_references_backref[id(v)].add(node) + + return ( + forward_references, + backward_references, + forward_references_id_props, + backward_references_id_props, + forward_references_backref, + backward_references_backref, + ) + + def _mark_entities_with_path_or_id(self): + """A path or an ID is sufficiently identifying. Thus, those entities can be marked as + checked + + When this function returns, there is only one node for each ID (i.e. no two nodes with the + same ID). The same is true for paths. + + This function also updates _id_look_up and _path_look_up + + Last review by Alexander Schlemmer on 2024-05-29. 
+ """ + for node in list(self.nodes): + if node.id is not None: + eq_node = self.get_equivalent(node) + if eq_node is not None: + self._basic_merge_into(node, eq_node) + else: + self._id_look_up[node.id] = node + self._mark_existing(node) + + for node in list(self.nodes): + if node.path is not None: + eq_node = self.get_equivalent(node) + if eq_node is not None: + self._basic_merge_into(node, eq_node) + else: + self._path_look_up[node.path] = node + try: + existing = cached_get_entity_by(path=node.path) + except EmptyUniqueQueryError: + existing = None + remote_id = None + if existing is not None: + remote_id = existing.id + self.set_id_of_node(node, remote_id) + + def _basic_merge_into(self, source: SyncNode, target: SyncNode): + """tries to merge source into target and updates member variables + + - reference maps are updated + - self.nodes is updated + - self.unchecked is updated + - lookups are being updated + """ + # sanity checks + if source is target: + raise ValueError("source must not be target") + + target.update(source) + + # replace actual reference property values + for node in self.backward_references[id(source)]: + _set_each_scalar_value( + node, condition=lambda val: val is source, value=lambda val: target + ) + + # update reference mappings + for setA, setB in ( + (self.forward_references, self.backward_references), # ref: source -> other + (self.backward_references, self.forward_references), # ref: other -> source + (self.forward_references_id_props, self.backward_references_id_props), + (self.backward_references_id_props, self.forward_references_id_props), + (self.forward_references_backref, self.backward_references_backref), + (self.backward_references_backref, self.forward_references_backref), + ): + for node in setA.pop(id(source)): + setA[id(target)].add(node) + setB[id(node)].remove(source) + setB[id(node)].add(target) + + # remove unneeded SyncNode + self.nodes.remove(source) + if source in self.unchecked: + self.unchecked.remove(source) + # update look ups + if target.id is not None: + self._id_look_up[target.id] = target + if target.path is not None: + self._path_look_up[target.path] = target + if target.identifiable is not None: + self._identifiable_look_up[target.identifiable.get_representation()] = target + + def _merge_into(self, source: SyncNode, target: SyncNode): + """tries to merge source into target and performs the necessary updates: + - update the member variables of target using source (``target.update(source)``). + - replaces reference values to source by target + - updates the reference map + - updates lookup tables + - removes source from node lists + - marks target as missing/existing if source was marked that way + - adds an identifiable if now possible (e.g. merging based on ID might allow create an + identifiable when none of the two nodes had the sufficient properties on its own before) + - check whether dependent nodes can now get an identifiable (the merge might have set the + ID such that dependent nodes can now create an identifiable) + + Last review by Alexander Schlemmer on 2024-05-29. 
+ """ + self._basic_merge_into(source, target) + + if (id(source) in self._existing and id(target) in self._missing) or ( + id(target) in self._existing and id(source) in self._missing + ): + raise RuntimeError("Trying to merge missing and existing") + + if id(source) in self._missing and id(target) not in self._missing: + self._mark_missing(target) + elif id(source) in self._existing and id(target) not in self._existing: + self._mark_existing(target) + + # due to the merge it might now be possible to create an identifiable + if self._identifiable_is_needed(target): + self._set_identifiable_of_node(target) + # This is one of three cases that affect other nodes: + # - mark existing + # - mark missing + # - merge + self._add_identifiables_to_dependent_nodes(target) + + eq_node = self.get_equivalent(target) + if eq_node is not None: + self._merge_into(target, eq_node) + + def _identifiable_is_needed(self, node: SyncNode): + """ + This function checks: + - the identifiable of node is None + - the node has all properties that are needed for the identifiable + - there are no unchecked entities that are needed for the identifiable of the node, + neither as forward or as backward references + + Last review by Alexander Schlemmer on 2024-05-24. + """ + return ( + node.identifiable is None + and not self._identity_relies_on_unchecked_entity(node) + and self.identifiableAdapter.all_identifying_properties_exist( + node, raise_exception=False + ) + ) + + def _initialize_nodes(self, entities: list[db.Entity]): + """create initial set of SyncNodes from provided Entity list""" + self._sanity_check(entities) + entities = self._create_flat_list(entities) + se_lookup: dict[int, SyncNode] = {} # lookup: python id -> SyncNode + + # Create new sync nodes from the list of entities, their registered identifiables + # are set from the identifiable adapter. + for el in entities: + self.nodes.append( + SyncNode(el, self.identifiableAdapter.get_registered_identifiable(el)) + ) + se_lookup[id(el)] = self.nodes[-1] + + # replace db.Entity objects with SyncNodes in references: + for node in self.nodes: + _set_each_scalar_value( + node, + condition=lambda val: id(val) in se_lookup, + value=lambda val: se_lookup[id(val)], + ) + + def _add_identifiables_to_dependent_nodes(self, node): + """For each dependent node, we check whether this allows to create an identifiable + + Last review by Alexander Schlemmer on 2024-05-29. + """ + for other_node in self._get_nodes_whose_identity_relies_on(node): + if self._identifiable_is_needed(other_node): + self._set_identifiable_of_node(other_node) + + def _mark_missing(self, node: SyncNode): + """Mark a sync node as missing and remove it from the dictionary of unchecked nodes. + + Last review by Alexander Schlemmer on 2024-05-24. + """ + self._missing[id(node)] = node + self.unchecked.remove(node) + + # This is one of three cases that affect other nodes: + # - mark existing + # - mark missing + # - merge + self._add_identifiables_to_dependent_nodes(node) + # For each dependent node, we set the ID to None (missing) + # (None is the default second argument of set_id_of_node.) + for other_node in self._get_nodes_whose_identity_relies_on(node): + if other_node in self.unchecked: + self.set_id_of_node(other_node) + + def _mark_existing(self, node: SyncNode): + """Mark a sync node as existing and remove it from the dictionary of unchecked nodes. + + Last review by Alexander Schlemmer on 2024-05-24. 
+ """ + if isinstance(node.id, TempID): + raise ValueError("ID must valid existing entities, not TempID") + self._existing[id(node)] = node + self.unchecked.remove(node) + # This is one of three cases that affect other nodes: + # - mark existing + # - mark missing + # - merge + self._add_identifiables_to_dependent_nodes(node) diff --git a/src/caoscrawler/sync_node.py b/src/caoscrawler/sync_node.py new file mode 100644 index 0000000000000000000000000000000000000000..141e743bffa09f0caf661bcd1939a4233cb7249c --- /dev/null +++ b/src/caoscrawler/sync_node.py @@ -0,0 +1,267 @@ +#!/usr/bin/env python3 +# encoding: utf-8 +# +# This file is a part of the LinkAhead Project. +# +# Copyright (C) 2024 Henrik tom Wörden +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <https://www.gnu.org/licenses/>. +# + + +from __future__ import annotations + +import logging +from typing import TYPE_CHECKING, Any, Optional, Union + +import linkahead as db +import yaml +from linkahead.common.models import Parent, _ParentList, _Properties +from warnings import warn + +from .exceptions import ImpossibleMergeError + +if TYPE_CHECKING: + from .identifiable import Identifiable + +logger = logging.getLogger(__name__) + + +class TempID(int): + """A special kind of int for negative temporary IDs. + + This allows to identify TempIDs in the presence of String IDs. + A string ID might look like a negative integer. + """ + pass + + +class SyncNode(db.Entity): + """represents the information of an Entity as it shall be created in LinkAhead + + The following information is taken from an db.Entity object during initialization or when the + object is updated using the `update` member function: + - id + - role + - path + - file + - name + - description + - parents + - properties + + Typically, this class is used in the following way: + 1. A SyncNode is initialized with a db.Entity object. + 2. The SyncNode object is possibly updated one or more times with other SyncNode objects. + 3. A db.Entity object is created (`export_entity`) that contains the combined information. + """ + + def __init__( + self, entity: db.Entity, registered_identifiable: Optional[db.RecordType] = None, + **kwargs + ): + super().__init__(name=entity.name, + id=entity.id, + description=entity.description, + **kwargs) + # db.Entity properties + self.role = entity.role + self.path = entity.path + self.file = entity.file + self.parents = _ParentList().extend(entity.parents) + self.properties = _Properties().extend(entity.properties) + self._check_for_multiproperties() + # other members + self.identifiable: Optional[Identifiable] = None + self.registered_identifiable = registered_identifiable + + def update(self, other: SyncNode) -> None: + """update this node with information of given ``other`` SyncNode. + + parents are added if they are not yet in the list + properties are added in any case. This may lead to duplication of properties. 
+ We allow this duplication here and remove it when we create a db.Entity (export_entity + function) because if property values are SyncNode objects, they might not be comparable (no + ID, no identifiable) yet. + """ + + if other.identifiable is not None and self.identifiable is not None: + if ( + other.identifiable.get_representation() + != self.identifiable.get_representation() + ): + raise ValueError( + "The SyncNode that is used with update must have an equivalent" + f" identifiable. I.e. you cannot merge entities with differing identifiables" + "The identifiables where:\n" + f"{self.identifiable._create_hashable_string(self.identifiable)}\n" + f"and\n{other.identifiable._create_hashable_string(other.identifiable)}." + ) + + if other.identifiable: + self.identifiable = other.identifiable + for attr in ["id", "role", "path", "file", "name", "description"]: + if other.__getattribute__(attr) is not None: + if self.__getattribute__(attr) is None: + self.__setattr__(attr, other.__getattribute__(attr)) + else: + if self.__getattribute__(attr) != other.__getattribute__(attr): + raise ImpossibleMergeError( + f"Trying to update {attr} but this would lead to an " + f"override of the value '{self.__getattribute__(attr)}' " + f"by the value '{other.__getattribute__(attr)}'", + pname=attr, values=(self.__getattribute__(attr), + other.__getattribute__(attr)) + ) + for p in other.parents: + if not parent_in_list(p, self.parents): + self.parents.append(p) + for p in other.properties: + self.properties.append(p) + + def export_entity(self) -> db.Entity: + """create a db.Entity object from this SyncNode + + Properties are only added once (based on id or name). If values do not match, an Error is + raised. If values are SyncNode objects with IDs, they are considered equal if their IDs are + equal. + """ + ent = None + if self.role == "Record": + ent = db.Record() + elif self.role == "File": + ent = db.File() + else: + raise RuntimeError("Invalid role") + for attr in ["id", "role", "path", "file", "name", "description"]: + ent.__setattr__(attr, self.__getattribute__(attr)) + for p in self.parents: + ent.add_parent(p) + for p in self.properties: + entval: Any = ent.get_property(p) + if entval is None: + ent.add_property(id=p.id, name=p.name, value=p.value, description=p.description, + datatype=p.datatype, unit=p.unit) + else: + entval = entval.value + unequal = False + pval = p.value + if isinstance(entval, list) != isinstance(pval, list): + unequal = True + if not isinstance(entval, list): + entval = [entval] + if not isinstance(pval, list): + pval = [pval] + if len(entval) != len(pval): + unequal = True + else: + for e_el, p_el in zip(entval, pval): + if isinstance(e_el, SyncNode) and e_el.id is not None: + e_el = e_el.id + if isinstance(p_el, SyncNode) and p_el.id is not None: + p_el = p_el.id + if e_el != p_el: + unequal = True + + if unequal: + logger.error( + "The Crawler is trying to create an entity," + " but there are conflicting property values." 
+ f"Problematic Property: {p.name}\n" + f"First value:\n{entval}\n" + f"Second value:\n{pval}\n" + f"{self}" + ) + ime = ImpossibleMergeError( + "Cannot merge Entities", pname=p.name, values=(entval, pval) + ) + raise ime + return ent + + def __repr__(self) -> str: + """ somewhat concise text representation of the SyncNode """ + res = f"\n=====================================================\n{self.role}\n" + res += yaml.dump( + { + "id": self.id, + "name": self.name, + "path": self.path, + "parents": [el.name for el in self.parents], + }, + allow_unicode=True, + ) + res += "---------------------------------------------------\n" + res += "properties:\n" + d: dict[str, Any] = {} + for p in self.properties: + v = p.value + d[p.name] = [] + if not isinstance(p.value, list): + v = [v] + for el in v: + if isinstance(el, SyncNode): + d[p.name].append( + { + "id": el.id, + "name": el.name, + "path": el.path, + "parents": [e.name for e in el.parents], + } + ) + else: + d[p.name].append(el) + + return ( + res + + yaml.dump(d, allow_unicode=True) + + "=====================================================\n" + ) + + def _check_for_multiproperties(self): + """ warns if multiproperties are present """ + ids = set() + names = set() + for p in self.properties: + if p.name is not None: + if p.name in names: + warn("Multiproperties are not supported by the crawler.") + names.add(p.name) + if p.id is not None: + if p.id in ids: + warn("Multiproperties are not supported by the crawler.") + ids.add(p.id) + + +def parent_in_list(parent: Parent, plist: _ParentList) -> bool: + """helper function that checks whether a parent with the same name or ID is in the plist""" + missing = False + if parent.name is not None: + if parent.name not in plist._element_by_name: + missing = True + if parent.id is not None: + if str(parent.id) not in plist._element_by_id: + missing = True + return not missing + + +def property_in_list(prop: db.Property, plist: _Properties) -> bool: + """helper function that checks whether a property with the same name or ID is in the plist""" + missing = False + if prop.name is not None: + if prop.name not in plist._element_by_name: + missing = True + if prop.id is not None: + if str(prop.id) not in plist._element_by_id: + missing = True + return not missing diff --git a/src/doc/getting_started/furtherreading.rst b/src/doc/getting_started/furtherreading.rst index eb600416c1fce3857d28fc2e856ceabebb3a8bb7..8d8d3ecc4b5575f71e90e9e5a17b060a63403a07 100644 --- a/src/doc/getting_started/furtherreading.rst +++ b/src/doc/getting_started/furtherreading.rst @@ -6,3 +6,4 @@ Further reading - Some useful examples can be found in the `integration tests <https://gitlab.com/caosdb/caosdb-crawler/-/tree/main/integrationtests>`_ (and to a certain extent in the unit tests). 
+- TODO: Information on caching diff --git a/tox.ini b/tox.ini index c8a29dc38a19011d4c47e2665aa90d5163591d8f..7b2b7460ac37dfb4766c28cddc65965f0d8f4691 100644 --- a/tox.ini +++ b/tox.ini @@ -15,6 +15,9 @@ commands = caosdb-crawler --help [flake8] max-line-length = 100 +[pycodestyle] +max-line-length = 100 + [pytest] testpaths = unittests -xfail_strict = True \ No newline at end of file +xfail_strict = True diff --git a/unittests/test_crawler.py b/unittests/test_crawler.py index a48b5e16ad1a71beeb4a5bf1c2ac52f67bbd7afe..4e8b057e382e6353698b8b63bbcc4e648284d711 100644 --- a/unittests/test_crawler.py +++ b/unittests/test_crawler.py @@ -39,10 +39,12 @@ import linkahead.common.models as dbmodels import pytest import yaml from caosadvancedtools.models.parser import parse_model_from_string -from caoscrawler.crawl import (Crawler, SecurityMode, TreatedRecordLookUp, - _treat_deprecated_prefix, crawler_main, - split_restricted_path) +from caoscrawler.crawl import (Crawler, SecurityMode, _treat_deprecated_prefix, + crawler_main, split_restricted_path) from caoscrawler.debug_tree import DebugTree +from caoscrawler.exceptions import (ImpossibleMergeError, + MissingIdentifyingProperty, + MissingReferencingEntityError) from caoscrawler.identifiable import Identifiable from caoscrawler.identifiable_adapters import (CaosDBIdentifiableAdapter, IdentifiableAdapter, @@ -52,6 +54,7 @@ from caoscrawler.scanner import (create_converter_registry, scan_directory, from caoscrawler.stores import GeneralStore, RecordStore from caoscrawler.structure_elements import (DictElement, DictListElement, DictTextElement, File) +from caoscrawler.sync_graph import SyncGraph from linkahead.apiutils import compare_entities from linkahead.cached import cache_clear from linkahead.exceptions import EmptyUniqueQueryError @@ -87,6 +90,20 @@ NEW_ELEMENT = (db.Record() .add_property(name="result", value="homogeneous")) +def reset_mocks(mocks): + for mock in mocks: + mock.reset_mock() + + +def mock_create_values(values, element): + pass + + +def mock_get_entity_by_query(query=None): + if query is not None: + return db.Record(id=1111, name='rec_name').add_parent('RT') + + def mock_get_entity_by(eid=None, name=None, path=None): if eid is not None: candidates = [el for el in EXAMPLE_SERVER_STATE if el.id == eid] @@ -110,6 +127,14 @@ def mock_get_entity_by(eid=None, name=None, path=None): raise EmptyUniqueQueryError("") +def basic_retrieve_by_name_mock_up(rec, referencing_entities=None, known=None): + """ returns a stored Record if rec.name is an existing key, None otherwise """ + if rec.name in known: + return known[rec.name] + else: + return None + + def mock_retrieve_record(identifiable: Identifiable): """ assumes that the identifiable is always only the date""" @@ -156,8 +181,56 @@ def clear_cache(): cache_clear() +@pytest.fixture +def crawler_mocked_identifiable_retrieve(): + crawler = Crawler() + # TODO use minimal setup + # mock retrieval of registered identifiabls: return Record with just a parent + crawler.identifiableAdapter.get_registered_identifiable = Mock( + side_effect=lambda x: db.Record().add_parent(x.parents[0].name).add_property(name='name')) + + # Simulate remote server content by using the names to identify records + # There is only a single known Record with name A + crawler.identifiableAdapter.retrieve_identified_record_for_record = Mock(side_effect=partial( + basic_retrieve_by_name_mock_up, known={"A": db.Record(id=1111, name="A")})) + crawler.identifiableAdapter.retrieve_identified_record_for_identifiable = Mock( + 
side_effect=partial( + basic_retrieve_by_name_mock_up, known={"A": db.Record(id=1111, name="A")})) + return crawler + + +@pytest.fixture +def crawler_mocked_for_backref_test(): + crawler = Crawler() + # mock retrieval of registered identifiabls: return Record with just a parent + + def get_reg_ident(x): + if x.parents[0].name == "C": + return db.Record().add_parent(x.parents[0].name).add_property( + "is_referenced_by", value=["BR"]).add_property("name") + elif x.parents[0].name == "D": + return db.Record().add_parent(x.parents[0].name).add_property( + "is_referenced_by", value=["BR", "BR2"]).add_property("name") + else: + return db.Record().add_parent(x.parents[0].name).add_property("name") + crawler.identifiableAdapter.get_registered_identifiable = Mock(side_effect=get_reg_ident) + + # Simulate remote server content by using the names to identify records + # There is only a single known Record with name A + crawler.identifiableAdapter.retrieve_identified_record_for_record = Mock(side_effect=partial( + basic_retrieve_by_name_mock_up, known={"A": + db.Record(id=1111, name="A").add_parent("BR")})) + crawler.identifiableAdapter.retrieve_identified_record_for_identifiable = Mock( + side_effect=partial( + basic_retrieve_by_name_mock_up, known={"A": + db.Record(id=1111, name="A").add_parent("BR")})) + return crawler + + @pytest.mark.filterwarnings("ignore::DeprecationWarning") def test_constructor(): + # tests that appropriate DeprecationWarnings are triggered by the constructor when deprecated + # arguments are being passed. with warnings.catch_warnings(record=True) as w: # Cause all warnings to always be triggered. warnings.filterwarnings("ignore") @@ -174,6 +247,7 @@ def test_constructor(): @pytest.mark.filterwarnings("ignore::DeprecationWarning") def test_deprecated_functions(): + # tests that appropriate DeprecationWarnings are triggered by deprecated methods with warnings.catch_warnings(record=True) as w: # Cause all warnings to always be triggered. warnings.filterwarnings("ignore") @@ -218,95 +292,62 @@ def test_check_whether_parent_exists(): def test_remove_unnecessary_updates(): # test trvial case - upl = [db.Record().add_parent("A")] - irs = [db.Record().add_parent("A")] - updates = Crawler.remove_unnecessary_updates(upl, irs) + crawled_data = [db.Record().add_parent("A")] + identified_records = [db.Record().add_parent("A")] + updates = Crawler.remove_unnecessary_updates(crawled_data, identified_records) assert len(updates) == 0 # test property difference case - # TODO this should work right? 
- # upl = [db.Record().add_parent("A").add_property("a", 3)] - # irs = [db.Record().add_parent("A")] # ID should be s - # Crawler.remove_unnecessary_updates(upl, irs) - # assert len(upl) == 1 + crawled_data = [db.Record().add_parent("A").add_property("a", 3)] + identified_records = [db.Record().add_parent("A")] # ID should be s + Crawler.remove_unnecessary_updates(crawled_data, identified_records) + assert len(crawled_data) == 1 # test value difference case - upl = [db.Record().add_parent("A").add_property("a", 5)] - irs = [db.Record().add_parent("A").add_property("a")] - updates = Crawler.remove_unnecessary_updates(upl, irs) + crawled_data = [db.Record().add_parent("A").add_property("a", 5)] + identified_records = [db.Record().add_parent("A").add_property("a")] + updates = Crawler.remove_unnecessary_updates(crawled_data, identified_records) assert len(updates) == 1 - upl = [db.Record().add_parent("A").add_property("a", 5)] - irs = [db.Record().add_parent("A").add_property("a", 5)] - updates = Crawler.remove_unnecessary_updates(upl, irs) + crawled_data = [db.Record().add_parent("A").add_property("a", 5)] + identified_records = [db.Record().add_parent("A").add_property("a", 5)] + updates = Crawler.remove_unnecessary_updates(crawled_data, identified_records) assert len(updates) == 0 # test unit difference case - upl = [db.Record().add_parent("A").add_property("a", unit='cm')] - irs = [db.Record().add_parent("A").add_property("a")] - updates = Crawler.remove_unnecessary_updates(upl, irs) + crawled_data = [db.Record().add_parent("A").add_property("a", unit='cm')] + identified_records = [db.Record().add_parent("A").add_property("a")] + updates = Crawler.remove_unnecessary_updates(crawled_data, identified_records) assert len(updates) == 1 # test None difference case - upl = [db.Record().add_parent("A").add_property("a")] - irs = [db.Record().add_parent("A").add_property("a", 5)] - updates = Crawler.remove_unnecessary_updates(upl, irs) + crawled_data = [db.Record().add_parent("A").add_property("a")] + identified_records = [db.Record().add_parent("A").add_property("a", 5)] + updates = Crawler.remove_unnecessary_updates(crawled_data, identified_records) assert len(updates) == 1 def test_split_into_inserts_and_updates_trivial(): crawler = Crawler() - crawler.split_into_inserts_and_updates([]) - + st = SyncGraph([], crawler.identifiableAdapter) + crawler.split_into_inserts_and_updates(st) -def test_split_into_inserts_and_updates_unidentified(): - crawler = Crawler() - with raises(ValueError) as err: - crawler.split_into_inserts_and_updates([db.Record(name="recname").add_parent("someparent")]) - assert str(err.value).startswith("There is no identifying information.") - -def basic_retrieve_by_name_mock_up(rec, referencing_entities=None, known=None): - """ returns a stored Record if rec.name is an existing key, None otherwise """ - if rec.name in known: - return known[rec.name] - else: - return None - - -@pytest.fixture -def crawler_mocked_identifiable_retrieve(): - crawler = Crawler() - # TODO use minimal setup - # mock retrieval of registered identifiabls: return Record with just a parent - crawler.identifiableAdapter.get_registered_identifiable = Mock( - side_effect=lambda x: db.Record().add_parent(x.parents[0].name).add_property(name='name')) - - # Simulate remote server content by using the names to identify records - # There is only a single known Record with name A - crawler.identifiableAdapter.retrieve_identified_record_for_record = Mock(side_effect=partial( - basic_retrieve_by_name_mock_up, 
known={"A": db.Record(id=1111, name="A")})) - crawler.identifiableAdapter.retrieve_identified_record_for_identifiable = Mock( - side_effect=partial( - basic_retrieve_by_name_mock_up, known={"A": db.Record(id=1111, name="A")})) - return crawler - - -def test_split_into_inserts_and_updates_single(crawler_mocked_identifiable_retrieve): +def test_split_into_inserts_and_updates_simple(crawler_mocked_identifiable_retrieve): + # basic test that checks whether two records are correctly sorted to update and insert based on + # whether an entity can be found using the identifiable crawler = crawler_mocked_identifiable_retrieve identlist = [Identifiable(name="A", record_type="C"), Identifiable(name="B", record_type="C")] - entlist = [db.Record(name="A").add_parent( - "C"), db.Record(name="B").add_parent("C")] + entlist = [db.Record(name="A").add_parent("C"), + db.Record(name="B").add_parent("C")] - assert crawler.treated_records_lookup.get_any(entlist[0], identlist[0]) is None - assert crawler.treated_records_lookup.get_any(entlist[0], identlist[0]) is None - assert not crawler._has_reference_value_without_id(identlist[0]) - assert not crawler._has_reference_value_without_id(identlist[1]) + st = SyncGraph(entlist, crawler.identifiableAdapter) + # check setup assert crawler.identifiableAdapter.retrieve_identified_record_for_record( identlist[0]).id == 1111 assert crawler.identifiableAdapter.retrieve_identified_record_for_record( identlist[1]) is None - insert, update = crawler.split_into_inserts_and_updates(deepcopy(entlist)) + insert, update = crawler.split_into_inserts_and_updates(st) assert len(insert) == 1 assert insert[0].name == "B" assert len(update) == 1 @@ -316,49 +357,20 @@ def test_split_into_inserts_and_updates_single(crawler_mocked_identifiable_retri crawler.identifiableAdapter.retrieve_identified_record_for_identifiable.assert_called() -def test_split_into_inserts_and_updates_with_duplicate(crawler_mocked_identifiable_retrieve): +def test_split_into_inserts_and_updates_with_circ(crawler_mocked_identifiable_retrieve): + # test trying to split circular dependency crawler = crawler_mocked_identifiable_retrieve - a = db.Record(name="A").add_parent("C") - b = db.Record(name="B").add_parent("C") - b.add_property("A", a) - # This is identical to a and should be removed - c = db.Record(name="A").add_parent("C") - entlist = [a, b, c] - insert, update = crawler.split_into_inserts_and_updates(deepcopy(entlist)) - assert len(insert) == 1 - assert insert[0].name == "B" - assert len(update) == 1 - assert update[0].name == "A" - # if this ever fails, the mock up may be removed - crawler.identifiableAdapter.get_registered_identifiable.assert_called() - crawler.identifiableAdapter.retrieve_identified_record_for_identifiable.assert_called() - - -def test_split_into_inserts_and_updates_with_ref(crawler_mocked_identifiable_retrieve): - crawler = crawler_mocked_identifiable_retrieve - # try it with a reference - a = db.Record(name="A").add_parent("C") - b = db.Record(name="B").add_parent("C") - b.add_property("A", a) - entlist = [a, b] - insert, update = crawler.split_into_inserts_and_updates(entlist) - assert len(insert) == 1 - assert insert[0].name == "B" - assert len(update) == 1 - assert update[0].name == "A" - # if this ever fails, the mock up may be removed - crawler.identifiableAdapter.get_registered_identifiable.assert_called() - crawler.identifiableAdapter.retrieve_identified_record_for_identifiable.assert_called() - + crawler.identifiableAdapter.get_registered_identifiable = Mock( + 
side_effect=lambda x: db.Record().add_parent('C').add_property(name='a') + ) + # two records that reference each other via identifying properties + a = db.Record().add_parent("C") + b = db.Record().add_parent("C").add_property(name='a', value=a) + a.add_property(name='a', value=b) -def test_split_into_inserts_and_updates_with_circ(): - # try circular - a = db.Record(name="A").add_parent("C") - b = db.Record(name="B").add_parent("C") - b.add_property("A", a) - a.add_property("B", b) - entlist = [a, b] - # TODO this does not seem to be complete! + st = SyncGraph([a, b], crawler.identifiableAdapter) + with pytest.raises(RuntimeError): + crawler.split_into_inserts_and_updates(st) def test_split_into_inserts_and_updates_with_complex(crawler_mocked_identifiable_retrieve): @@ -372,11 +384,12 @@ def test_split_into_inserts_and_updates_with_complex(crawler_mocked_identifiable b = db.Record(name="B").add_parent("C") g = db.Record(name="G").add_parent("C") f = db.Record(name="F").add_parent("C") - g.add_property("A", a) - b.add_property("A", f) + g.add_property("C", b) b.add_property("A", a) + b.add_property("C", f) entlist = [a, b, g] - insert, update = crawler.split_into_inserts_and_updates(entlist) + st = SyncGraph(entlist, crawler.identifiableAdapter) + insert, update = crawler.split_into_inserts_and_updates(st) assert len(insert) == 3 assert "B" in [el.name for el in insert] assert len(update) == 1 @@ -388,23 +401,8 @@ def test_split_into_inserts_and_updates_with_complex(crawler_mocked_identifiable # TODO write test where the unresoled entity is not part of the identifiable -def test_split_into_inserts_and_updates_with_copy_attr(crawler_mocked_identifiable_retrieve): - crawler = crawler_mocked_identifiable_retrieve - # assume identifiable is only the name - a = db.Record(name="A").add_parent("C") - a.add_property("foo", 1) - b = db.Record(name="A").add_parent("C") - b.add_property("bar", 2) - entlist = [a, b] - insert, update = crawler.split_into_inserts_and_updates(entlist) - - assert update[0].get_property("bar").value == 2 - assert update[0].get_property("foo").value == 1 - # if this ever fails, the mock up may be removed - crawler.identifiableAdapter.get_registered_identifiable.assert_called() - crawler.identifiableAdapter.retrieve_identified_record_for_identifiable.assert_called() - - +@patch("caoscrawler.crawl.cached_get_entity_by", + new=Mock(side_effect=mock_get_entity_by)) @patch("caoscrawler.identifiable_adapters.cached_query", new=Mock(side_effect=mock_cached_only_rt)) def test_split_iiau_with_unmergeable_list_items(): @@ -440,6 +438,12 @@ b1: ("same", c1) b2: ("same", c2) a: ([b1, b2]) + + + +- a can be identified. +- bs can be identified with each other once a is identified +- cs depend on b(s), but cannot be put in one Entity because they have conflicting properties """ prop_ident = db.Property("prop_ident", datatype=db.INTEGER) prop_other = db.Property("prop_ident", datatype=db.INTEGER) @@ -472,82 +476,108 @@ a: ([b1, b2]) crawler = Crawler(identifiableAdapter=ident_adapter) - with raises(RuntimeError) as rte: - crawler.synchronize(commit_changes=False, - crawled_data=[rec_a, *rec_b, *rec_c]) - assert not isinstance(rte.value, NotImplementedError), \ - "Exception must not be NotImplementedError, but plain RuntimeError." 
- assert "Could not find referencing entities" in rte.value.args[0] - assert "merge conflicts in the referencing" in rte.value.args[0] + st = SyncGraph(deepcopy([rec_a, *rec_b, *rec_c]), crawler.identifiableAdapter) + assert st._identity_relies_on_unchecked_entity(st.nodes[0]) is False + assert st._identity_relies_on_unchecked_entity(st.nodes[1]) + assert st._identity_relies_on_unchecked_entity(st.nodes[2]) + assert st._identity_relies_on_unchecked_entity(st.nodes[3]) + assert st._identity_relies_on_unchecked_entity(st.nodes[4]) + assert len(st.unchecked) == 5 + + # The Cs cannot be merged due to different identifying properties + # The Bs cannot be merged due to different references to Cs + with raises(ImpossibleMergeError) as rte: + crawler.split_into_inserts_and_updates(st) + # TODO + # assert not isinstance(rte.value, NotImplementedError), \ + # "Exception must not be NotImplementedError, but plain RuntimeError." + # assert "Could not find referencing entities" in rte.value.args[0] + # assert "merge conflicts in the referencing" in rte.value.args[0] + + +@patch("caoscrawler.identifiable_adapters.get_children_of_rt", + new=Mock(side_effect=lambda x: [x])) +def test_split_into_inserts_and_updates_backref(crawler_mocked_for_backref_test): + # test that backrefs are appropriately considered in the identifiable + crawler = crawler_mocked_for_backref_test + identlist = [Identifiable(name="A", record_type="BR"), + Identifiable(name="B", record_type="C", backrefs=[db.Entity()])] + referenced = db.Record(name="B").add_parent("C") + entlist = [referenced, db.Record(name="A").add_parent("BR").add_property("ref", referenced), ] + # Test without referencing object + # currently a RuntimeError is raised if necessary properties are missing. + with raises(MissingReferencingEntityError): + st = SyncGraph([db.Record(name="B").add_parent("C")], crawler.identifiableAdapter) -def test_has_missing_object_in_references(): - crawler = Crawler() - # Simulate remote server content by using the names to identify records - # There are only two known Records with name A and B - crawler.identifiableAdapter.get_registered_identifiable = Mock(side_effect=partial( - basic_retrieve_by_name_mock_up, known={"C": db.Record(name="C").add_parent("RTC") - .add_property("d").add_property("name"), - "D": db.Record(name="D").add_parent("RTD") - .add_property("d").add_property("e").add_property("name"), - })) - - # one reference with id -> check - assert not crawler._has_missing_object_in_references( - Identifiable(name="C", record_type="RTC", properties={'d': 123}), {}) - # one ref with Entity with id -> check - rec = db.Record(id=123).add_parent("C") - assert not crawler._has_missing_object_in_references( - Identifiable(name="C", record_type="RTC", properties={'d': rec}), {id(rec): {'C': [None]}}) - # one ref with id one with Entity with id (mixed) -> check - rec = db.Record(id=123).add_parent("RTC") - assert not crawler._has_missing_object_in_references( - Identifiable(name="C", record_type="RTD", - properties={'d': 123, 'b': rec}), {id(rec): {'C': [None]}}) - # entity to be referenced in the following - a = db.Record(name="C").add_parent("C").add_property("d", 12311) - # one ref with id one with Entity without id (but not identifying) -> fail - assert not crawler._has_missing_object_in_references( - Identifiable(name="C", record_type="RTC", properties={'d': 123, 'e': a}), - {id(a): {'C': [None]}}) - - # one ref with id one with Entity without id (mixed) -> fail - assert not crawler._has_missing_object_in_references( - 
Identifiable(name="D", record_type="RTD", properties={'d': 123, 'e': a}), - {id(a): {'C': [None]}}) - - crawler.treated_records_lookup.add(a, Identifiable(name="C", record_type="RTC", - properties={'d': 12311})) - # one ref with id one with Entity without id but in cache -> check - assert crawler._has_missing_object_in_references( - Identifiable(name="D", record_type="RTD", properties={'d': 123, 'e': a}), - {id(a): {'C': [None]}}) + # identifiables were not yet checked + st = SyncGraph(entlist, crawler.identifiableAdapter) + assert st.get_equivalent(st.nodes[1]) is None + assert st.get_equivalent(st.nodes[0]) is None + # one can be found remotely, one not + assert crawler.identifiableAdapter.retrieve_identified_record_for_record( + identlist[0]).id == 1111 + assert crawler.identifiableAdapter.retrieve_identified_record_for_record( + identlist[1]) is None - # if this ever fails, the mock up may be removed - crawler.identifiableAdapter.get_registered_identifiable.assert_called() + # check the split... + insert, update = crawler.split_into_inserts_and_updates(st) + # A was found remotely and is therefore in the update list + assert len(update) == 1 + assert update[0].name == "A" + # B does not exist on the (simulated) remote server + assert len(insert) == 1 + assert insert[0].name == "B" -@ pytest.mark.xfail() -def test_references_entities_without_ids(): - crawler = Crawler() - assert not crawler._has_reference_value_without_id(db.Record().add_parent("Person") - .add_property('last_name', 123) - .add_property('first_name', 123)) - # id and rec with id - assert not crawler._has_reference_value_without_id(db.Record().add_parent("Person") - .add_property('first_name', 123) - .add_property('last_name', - db.Record(id=123))) - # id and rec with id and one unneeded prop - assert crawler._has_reference_value_without_id(db.Record().add_parent("Person") - .add_property('first_name', 123) - .add_property('stuff', db.Record()) - .add_property('last_name', db.Record(id=123))) - - # one identifying prop is missing - assert crawler._has_reference_value_without_id(db.Record().add_parent("Person") - .add_property('first_name', 123) - .add_property('last_name', db.Record())) +@patch("caoscrawler.identifiable_adapters.get_children_of_rt", + new=Mock(side_effect=lambda x: [x])) +def test_split_into_inserts_and_updates_mult_backref(crawler_mocked_for_backref_test): + # test whether multiple references of the same record type are correctly used + crawler = crawler_mocked_for_backref_test + referenced = db.Record(name="B").add_parent("C") + entlist = [referenced, + db.Record(id=1, name="A").add_parent("BR").add_property("ref", referenced), + db.Record(id=2, name="C").add_parent("BR").add_property("ref", referenced), + ] + + # test whether both entities are listed in the backref attribute of the identifiable + st = SyncGraph(entlist, crawler.identifiableAdapter) + + identifiable = crawler.identifiableAdapter.get_identifiable( + st.nodes[0], + st.backward_references_backref[id(st.nodes[0])]) + assert len(identifiable.backrefs) == 2 + + # check the split... 
+ insert, update = crawler.split_into_inserts_and_updates(st) + assert len(update) == 2 + assert len(insert) == 1 + + +@patch("caoscrawler.identifiable_adapters.get_children_of_rt", + new=Mock(side_effect=lambda x: [x])) +def test_split_into_inserts_and_updates_diff_backref(crawler_mocked_for_backref_test): + # test whether multiple references of the different record types are correctly used + crawler = crawler_mocked_for_backref_test + referenced = db.Record(name="B").add_parent("D") + entlist = [referenced, + db.Record(id=1, name="A").add_parent("BR").add_property("ref", referenced), + db.Record(id=2, name="A").add_parent("BR2").add_property("ref", referenced), + ] + + # test whether both entities are listed in the backref attribute of the identifiable + st = SyncGraph(entlist, crawler.identifiableAdapter) + identifiable = crawler.identifiableAdapter.get_identifiable( + st.nodes[0], + st.backward_references_backref[id(st.nodes[0])]) + + assert len(identifiable.backrefs) == 2 + + # check the split... + insert, update = crawler.split_into_inserts_and_updates(st) + assert len(update) == 2 + assert len(insert) == 1 def test_replace_entities_with_ids(): @@ -562,20 +592,15 @@ def test_replace_entities_with_ids(): assert a.get_property("C").value == [12345, 233324] -def reset_mocks(mocks): - for mock in mocks: - mock.reset_mock() - - -@ patch("caoscrawler.crawl.cached_get_entity_by", - new=Mock(side_effect=mock_get_entity_by)) -@ patch("caoscrawler.identifiable_adapters.cached_get_entity_by", - new=Mock(side_effect=mock_get_entity_by)) -@ patch("caoscrawler.identifiable_adapters.CaosDBIdentifiableAdapter." - "retrieve_identified_record_for_identifiable", - new=Mock(side_effect=mock_retrieve_record)) -@ patch("caoscrawler.crawl.db.Container.insert") -@ patch("caoscrawler.crawl.db.Container.update") +@patch("caoscrawler.crawl.cached_get_entity_by", + new=Mock(side_effect=mock_get_entity_by)) +@patch("caoscrawler.identifiable_adapters.cached_get_entity_by", + new=Mock(side_effect=mock_get_entity_by)) +@patch("caoscrawler.identifiable_adapters.CaosDBIdentifiableAdapter." + "retrieve_identified_record_for_identifiable", + new=Mock(side_effect=mock_retrieve_record)) +@patch("caoscrawler.crawl.db.Container.insert") +@patch("caoscrawler.crawl.db.Container.update") def test_synchronization_no_commit(upmock, insmock): crawled_data = [r.copy() for r in EXAMPLE_SERVER_STATE if r.role == "Record"] # change one; add one @@ -592,20 +617,19 @@ def test_synchronization_no_commit(upmock, insmock): assert len(ups) == 1 -@ patch("caoscrawler.crawl.cached_get_entity_by", - new=Mock(side_effect=mock_get_entity_by)) -@ patch("caoscrawler.identifiable_adapters.cached_get_entity_by", - new=Mock(side_effect=mock_get_entity_by)) -@ patch("caoscrawler.identifiable_adapters.CaosDBIdentifiableAdapter." - "retrieve_identified_record_for_identifiable", - new=Mock(side_effect=mock_retrieve_record)) -@ patch("caoscrawler.crawl.db.Container.insert") -@ patch("caoscrawler.crawl.db.Container.update") -@ patch("caoscrawler.crawl.UpdateCache.insert") +@patch("caoscrawler.crawl.cached_get_entity_by", + new=Mock(side_effect=mock_get_entity_by)) +@patch("caoscrawler.identifiable_adapters.cached_get_entity_by", + new=Mock(side_effect=mock_get_entity_by)) +@patch("caoscrawler.identifiable_adapters.CaosDBIdentifiableAdapter." 
+ "retrieve_identified_record_for_identifiable", + new=Mock(side_effect=mock_retrieve_record)) +@patch("caoscrawler.crawl.db.Container.insert") +@patch("caoscrawler.crawl.db.Container.update") +@patch("caoscrawler.crawl.UpdateCache.insert") def test_security_mode(updateCacheMock, upmock, insmock): # trivial case: nothing to do crawled_data = [r.copy() for r in EXAMPLE_SERVER_STATE if r.role == "Record"] - print(crawled_data) crawler = Crawler(securityMode=SecurityMode.RETRIEVE) crawler.synchronize(commit_changes=True, crawled_data=crawled_data) assert crawler.run_id is not None @@ -640,9 +664,6 @@ def test_security_mode(updateCacheMock, upmock, insmock): assert crawler.run_id is not None insmock.assert_not_called() upmock.assert_not_called() - # import IPython - # IPython.embed() - # print(updateCacheMock.call_args_list) assert updateCacheMock.call_count == 1 # reset counts reset_mocks([updateCacheMock, insmock, upmock]) @@ -698,65 +719,6 @@ def test_security_mode(updateCacheMock, upmock, insmock): crawled_data[-1] = EXAMPLE_SERVER_STATE[-1].copy() -def test_create_reference_mapping(): - a = db.Record().add_parent("A") - b = db.Record(id=132).add_parent("B").add_property('a', a) - ref = Crawler.create_reference_mapping([a, b]) - assert id(a) in ref - assert id(b) in ref - assert "B" in ref[id(a)] - assert {} == ref[id(b)] - assert ref[id(a)]["B"] == [132] - - -def test_create_flat_list(): - a = db.Record() - b = db.Record() - a.add_property(name="a", value=a) - a.add_property(name="b", value=b) - flat = Crawler.create_flat_list([a]) - assert len(flat) == 2 - assert a in flat - assert b in flat - c = db.Record() - c.add_property(name="a", value=a) - # This would caus recursion if it is not dealt with properly. - a.add_property(name="c", value=c) - flat = Crawler.create_flat_list([c]) - assert len(flat) == 3 - assert a in flat - assert b in flat - assert c in flat - - -@ pytest.fixture -def crawler_mocked_for_backref_test(): - crawler = Crawler() - # mock retrieval of registered identifiabls: return Record with just a parent - - def get_reg_ident(x): - if x.parents[0].name == "C": - return db.Record().add_parent(x.parents[0].name).add_property( - "is_referenced_by", value=["BR"]).add_property("name") - elif x.parents[0].name == "D": - return db.Record().add_parent(x.parents[0].name).add_property( - "is_referenced_by", value=["BR", "BR2"]).add_property("name") - else: - return db.Record().add_parent(x.parents[0].name).add_property("name") - crawler.identifiableAdapter.get_registered_identifiable = Mock(side_effect=get_reg_ident) - - # Simulate remote server content by using the names to identify records - # There is only a single known Record with name A - crawler.identifiableAdapter.retrieve_identified_record_for_record = Mock(side_effect=partial( - basic_retrieve_by_name_mock_up, known={"A": - db.Record(id=1111, name="A").add_parent("BR")})) - crawler.identifiableAdapter.retrieve_identified_record_for_identifiable = Mock( - side_effect=partial( - basic_retrieve_by_name_mock_up, known={"A": - db.Record(id=1111, name="A").add_parent("BR")})) - return crawler - - def test_validation_error_print(caplog): caplog.set_level(logging.DEBUG, logger="caoscrawler.converters") # there should be no server interaction since we only test the behavior if a validation error @@ -773,96 +735,7 @@ def test_validation_error_print(caplog): caplog.clear() -@ patch("caoscrawler.identifiable_adapters.get_children_of_rt", - new=Mock(side_effect=lambda x: [x])) -def 
test_split_into_inserts_and_updates_backref(crawler_mocked_for_backref_test): - crawler = crawler_mocked_for_backref_test - identlist = [Identifiable(name="A", record_type="BR"), - Identifiable(name="B", record_type="C", backrefs=[db.Entity()])] - referenced = db.Record(name="B").add_parent("C") - entlist = [referenced, db.Record(name="A").add_parent("BR").add_property("ref", referenced), ] - - # Test without referencing object - # currently a RuntimeError is raised if necessary properties are missing. - with raises(RuntimeError): - crawler.split_into_inserts_and_updates([db.Record(name="B").add_parent("C")]) - - # identifiables were not yet checked - assert crawler.treated_records_lookup.get_any(entlist[1], identlist[0]) is None - assert crawler.treated_records_lookup.get_any(entlist[0], identlist[1]) is None - # one with reference, one without - assert not crawler._has_reference_value_without_id(identlist[0]) - assert crawler._has_reference_value_without_id(identlist[1]) - # one can be found remotely, one not - assert crawler.identifiableAdapter.retrieve_identified_record_for_record( - identlist[0]).id == 1111 - assert crawler.identifiableAdapter.retrieve_identified_record_for_record( - identlist[1]) is None - - # check the split... - insert, update = crawler.split_into_inserts_and_updates(deepcopy(entlist)) - # A was found remotely and is therefore in the update list - assert len(update) == 1 - assert update[0].name == "A" - # B does not exist on the (simulated) remote server - assert len(insert) == 1 - assert insert[0].name == "B" - - -@ patch("caoscrawler.identifiable_adapters.get_children_of_rt", - new=Mock(side_effect=lambda x: [x])) -def test_split_into_inserts_and_updates_mult_backref(crawler_mocked_for_backref_test): - # test whether multiple references of the same record type are correctly used - crawler = crawler_mocked_for_backref_test - referenced = db.Record(name="B").add_parent("C") - entlist = [referenced, - db.Record(name="A").add_parent("BR").add_property("ref", referenced), - db.Record(name="C").add_parent("BR").add_property("ref", referenced), - ] - - # test whether both entities are listed in the backref attribute of the identifiable - referencing_entities = crawler.create_reference_mapping(entlist) - identifiable = crawler.identifiableAdapter.get_identifiable( - referenced, - referencing_entities[id(referenced)]) - assert len(identifiable.backrefs) == 2 - - # check the split... - insert, update = crawler.split_into_inserts_and_updates(deepcopy(entlist)) - assert len(update) == 1 - assert len(insert) == 2 - - -@ patch("caoscrawler.identifiable_adapters.get_children_of_rt", - new=Mock(side_effect=lambda x: [x])) -def test_split_into_inserts_and_updates_diff_backref(crawler_mocked_for_backref_test): - # test whether multiple references of the different record types are correctly used - crawler = crawler_mocked_for_backref_test - referenced = db.Record(name="B").add_parent("D") - entlist = [referenced, - db.Record(name="A").add_parent("BR").add_property("ref", referenced), - db.Record(name="A").add_parent("BR2").add_property("ref", referenced), - ] - - # test whether both entities are listed in the backref attribute of the identifiable - referencing_entities = crawler.create_reference_mapping(entlist) - identifiable = crawler.identifiableAdapter.get_identifiable( - referenced, - referencing_entities[id(referenced)]) - - assert len(identifiable.backrefs) == 2 - - # check the split... 
- insert, update = crawler.split_into_inserts_and_updates(deepcopy(entlist)) - assert len(update) == 2 - assert len(insert) == 1 - - -def mock_create_values(values, element): - pass - - -@ patch("caoscrawler.converters.IntegerElementConverter.create_values") +@patch("caoscrawler.converters.IntegerElementConverter.create_values") def test_restricted_path(create_mock): """ The restricted_path argument allows to ignroe part of the crawled data structure. Here, we make @@ -955,7 +828,7 @@ def test_split_restricted_path(): # Filter the warning because we want to have it here and this way it does not hinder running # tests with -Werror. -@ pytest.mark.filterwarnings("ignore:The prefix:DeprecationWarning") +@pytest.mark.filterwarnings("ignore:The prefix:DeprecationWarning") def test_deprecated_prefix_option(): """Test that calling the crawler's main function with the deprecated `prefix` option raises the correct errors and warnings. @@ -993,36 +866,8 @@ def test_create_entity_summary(): assert "<a href='/Entity/4'>a</a>, <a href='/Entity/6'>b</a>" in text -def test_detect_circular_dependency(crawler_mocked_identifiable_retrieve, caplog): - crawler = crawler_mocked_identifiable_retrieve - crawler.identifiableAdapter.get_registered_identifiable = Mock( - side_effect=lambda x: db.Record().add_parent('C').add_property(name='C')) - a = db.Record(name='a').add_parent("C") - b = db.Record(name='b').add_parent("C").add_property(name="C", value=a) - c = db.Record(name='c').add_parent("C").add_property(name='D', value='e' - ).add_property(name="C", value=b) - d = db.Record(name='c').add_parent("C") - a.add_property(name="C", value=c) - flat = [a, b, c] - circle = Crawler.detect_circular_dependency(flat) - assert [id(el) for el in circle] == [id(el) for el in [a, c, b, a]] - - assert Crawler.detect_circular_dependency([d]) is None - with raises(RuntimeError): - _, _ = crawler.split_into_inserts_and_updates(flat) - caplog.set_level(logging.ERROR, logger="caoscrawler.converters") - assert "Found circular dependency" in caplog.text - assert "\n--------\n\n> Parent: C\n\n>> Name: a\n[\'C\']" in caplog.text - caplog.clear() - - -def mock_get_entity_by_query(query=None): - if query is not None: - return db.Record(id=1111, name='rec_name').add_parent('RT') - - -@ patch("caoscrawler.crawl.cached_get_entity_by", - new=Mock(side_effect=mock_get_entity_by_query)) +@patch("caoscrawler.crawl.cached_get_entity_by", + new=Mock(side_effect=mock_get_entity_by_query)) def test_replace_name_with_referenced_entity(): test_text = 'lkajsdf' test_int = 134343 @@ -1090,72 +935,3 @@ def test_replace_name_with_referenced_entity(): assert isinstance(prop.value[2], int) assert prop.value[2] == test_id assert caoscrawler.crawl.cached_get_entity_by.call_count == 3 - - -def test_treated_record_lookup(): - trlu = TreatedRecordLookUp() - exist = db.Record(id=1) - trlu.add(exist) - assert len(trlu._existing) == 1 - # was added to existing - assert trlu._existing[id(exist)] is exist - # is in ID lookup - assert trlu._id_look_up[exist.id] is exist - # can be accessed via get_existing - assert trlu.get_existing(db.Record(id=1)) is exist - - miss = db.Record() - # exception when identifiable is missing - with raises(RuntimeError): - trlu.add(miss) - ident = Identifiable(name='a') - trlu.add(miss, ident) - # was added to missing - assert trlu._missing[id(miss)] is miss - # is in ident lookup - assert trlu._identifiable_look_up[ident.get_representation()] is miss - # can be accessed via get_missing - assert trlu.get_missing(db.Record(), 
Identifiable(name='a')) is miss - - fi = db.File(path='a', id=2) - trlu.add(fi) - assert len(trlu._existing) == 2 - # was added to existing - assert trlu._existing[id(fi)] is fi - # is in ID lookup - assert trlu._id_look_up[fi.id] is fi - # is in path lookup - assert trlu._path_look_up[fi.path] is fi - # can be accessed via get_existing - assert trlu.get_existing(fi) is fi - - all_exi = trlu.get_existing_list() - assert fi in all_exi - assert exist in all_exi - all_mi = trlu.get_missing_list() - assert miss in all_mi - - # If a Record was added using the ID, the ID must be used to identify it even though later an - # identifiable may be passed as well - assert trlu.get_any(exist, Identifiable(name='b')) is exist - - fi2 = db.File(path='b') - trlu.add(fi2) - assert trlu.get_any(db.File(path='b'), Identifiable(name='c')) is fi2 - - -def test_merge_entity_with_identifying_reference(crawler_mocked_identifiable_retrieve): - # When one python object representing a record is merged into another python object - # representing the same record, the former object can be forgotten and references from it to - # other records must not play a role - crawler = crawler_mocked_identifiable_retrieve - crawler.identifiableAdapter.get_registered_identifiable = Mock( - side_effect=lambda x: db.Record().add_parent('C').add_property(name='name') if - x.parents[0].name == "C" else - db.Record().add_parent('D').add_property(name='is_referenced_by', value="*") - ) - a = db.Record(name='a').add_parent("D") - b = db.Record(name='b').add_parent("C") - c = db.Record(name='b').add_parent("C").add_property(name="C", value=a) - flat = [a, c, b] - _, _ = crawler.split_into_inserts_and_updates(flat) diff --git a/unittests/test_file_identifiables.py b/unittests/test_file_identifiables.py deleted file mode 100644 index 4ec02aa3fc497f8dc35adc709533ef5b35066f3a..0000000000000000000000000000000000000000 --- a/unittests/test_file_identifiables.py +++ /dev/null @@ -1,69 +0,0 @@ -#!/bin/python -# Tests for file identifiables -# A. 
Schlemmer, 06/2021 - -from unittest.mock import Mock, patch - -import caosdb as db -import pytest -from caoscrawler.identifiable import Identifiable -from caoscrawler.identifiable_adapters import LocalStorageIdentifiableAdapter -from caosdb.cached import cache_clear -from caosdb.exceptions import EmptyUniqueQueryError -from pytest import raises - -from test_crawler import mock_get_entity_by - - -@pytest.fixture(autouse=True) -def clear_cache(): - cache_clear() - - -@patch("caoscrawler.identifiable_adapters.get_children_of_rt", - new=Mock(side_effect=id)) -@patch("caoscrawler.identifiable_adapters.cached_get_entity_by", - new=Mock(side_effect=mock_get_entity_by)) -def test_file_identifiable(): - ident = LocalStorageIdentifiableAdapter() - - # Without a path there is no identifying information - with raises(ValueError): - ident.get_identifiable(db.File(), []) - - fp = "/test/bla/bla.txt" - file_obj = db.File(path=fp) - identifiable = ident.get_identifiable(file_obj) - - # the path is copied to the identifiable - assert fp == identifiable.path - assert isinstance(identifiable, Identifiable) - - # __eq__ function is only defined for Identifiable objects - with raises(ValueError): - file_obj != identifiable - - # since the path does not exist in the data in ident, the follwoing functions return None - with raises(EmptyUniqueQueryError): - ident.retrieve_identified_record_for_record(file_obj) - assert ident.get_file(identifiable) is None - - # Try again with actual files in the store: - records = ident.get_records() - test_record_wrong_path = db.File(path="/bla/bla/test.txt") - test_record_correct_path = db.File(path="/test/bla/bla.txt") - test_record_alsocorrect_path = db.File(path="/test/bla/bla.txt") - records.append(test_record_wrong_path) - # Now, there is a file, but still wrong path -> result is still None - identified_file = ident.get_file(file_obj) - assert identified_file is None - - records.append(test_record_correct_path) - # now there is a match - identified_file = ident.get_file(file_obj) - assert identified_file is not None - assert identified_file.path == file_obj.path - - with raises(RuntimeError, match=".*unambigiously.*"): - records.append(test_record_alsocorrect_path) - identified_file = ident.get_file(file_obj) diff --git a/unittests/test_identifiable.py b/unittests/test_identifiable.py index 28bdb7a2ad75d5b9389b47ca3f0ec2b2e2a1404b..074c3843e351b20d17813a661974fdc59ca0442a 100644 --- a/unittests/test_identifiable.py +++ b/unittests/test_identifiable.py @@ -27,6 +27,7 @@ test identifiable module import caosdb as db import pytest from caoscrawler.identifiable import Identifiable +from caoscrawler.sync_node import SyncNode def test_create_hashable_string(): @@ -42,25 +43,20 @@ def test_create_hashable_string(): assert ( Identifiable._create_hashable_string( Identifiable(name="A", record_type="B", - properties={'a': db.Record(id=12)}) + properties={'a': SyncNode(db.Record(id=12))}) ) == "P<B>N<A>R<[]>a:12") a = Identifiable._create_hashable_string( - Identifiable(name="A", record_type="B", properties={'a': [db.Record(id=12)]})) + Identifiable(name="A", record_type="B", properties={'a': [SyncNode(db.Record(id=12))]})) assert (a == "P<B>N<A>R<[]>a:[12]") assert (Identifiable._create_hashable_string( Identifiable(name="A", record_type="B", properties={'a': [12]})) == "P<B>N<A>R<[]>a:[12]") assert ( Identifiable._create_hashable_string( Identifiable(name="A", record_type="B", properties={ - 'a': [db.Record(id=12), 11]}) + 'a': [SyncNode(db.Record(id=12)), 11]}) ) == 
"P<B>N<A>R<[]>a:[12, 11]") - assert ( - Identifiable._create_hashable_string( - Identifiable(record_type="B", properties={'a': [db.Record()]}) - ) != Identifiable._create_hashable_string( - Identifiable(record_type="B", properties={'a': [db.Record()]}))) assert Identifiable._create_hashable_string( - Identifiable(name="A", record_type="B", backrefs=[123, db.Entity(id=124)], + Identifiable(name="A", record_type="B", backrefs=[123, SyncNode(db.Record(id=124))], properties={'a': 5})) == "P<B>N<A>R<['123', '124']>a:5" @@ -73,9 +69,9 @@ def test_repr(): # only test that something meaningful is returned assert 'properties' in str(Identifiable(name="A", record_type="B")) assert str(Identifiable(name="A", record_type="B", properties={'a': 0})).split( - "properties:\n")[1].split('\n')[0] == '{"a": 0}' + "properties:\n")[1].split('\n')[0] == '{"a": "0"}' assert str(Identifiable(name="A", record_type="B", properties={'a': 0, 'b': "test"})).split( - "properties:\n")[1].split('\n')[0] == '{"a": 0, "b": "test"}' + "properties:\n")[1].split('\n')[0] == '{"a": "0", "b": "test"}' # TODO(henrik): Add a test using backrefs once that's implemented. @@ -87,13 +83,5 @@ def test_equality(): record_id=12, properties={"a": 0}) != Identifiable(record_id=13, properties={"a": 0}) assert Identifiable( record_id=12, properties={"a": 0}) == Identifiable(properties={"a": 0}) - assert Identifiable( - path="a", properties={"a": 0}) != Identifiable(path="b", properties={"a": 0}) - assert Identifiable( - path="a", properties={"a": 0}) == Identifiable(path="a", properties={"a": 1}) - assert Identifiable( - path="a", properties={"a": 0}) == Identifiable(properties={"a": 0}) - assert Identifiable(properties={"a": 0}) == Identifiable( - properties={"a": 0}) - assert Identifiable(properties={"a": 0}) != Identifiable( - properties={"a": 1}) + assert Identifiable(properties={"a": 0}) == Identifiable(properties={"a": 0}) + assert Identifiable(properties={"a": 0}) != Identifiable(properties={"a": 1}) diff --git a/unittests/test_identifiable_adapters.py b/unittests/test_identifiable_adapters.py index ee0e0d6cd7c791f78e7cd2307dc6f34698326b4a..bdcfeacb6dea514ad689156bf2f61e712c665a4e 100644 --- a/unittests/test_identifiable_adapters.py +++ b/unittests/test_identifiable_adapters.py @@ -29,6 +29,7 @@ test identifiable_adapters module import os from datetime import datetime +from unittest.mock import MagicMock, Mock, patch from pathlib import Path import caosdb as db @@ -37,6 +38,7 @@ from caoscrawler.identifiable import Identifiable from caoscrawler.identifiable_adapters import (CaosDBIdentifiableAdapter, IdentifiableAdapter, convert_value) +from caoscrawler.sync_graph import SyncNode UNITTESTDIR = Path(__file__).parent @@ -122,28 +124,25 @@ def test_load_from_yaml_file(): def test_non_default_name(): ident = CaosDBIdentifiableAdapter() - ident.register_identifiable( - "Person", db.RecordType() - .add_parent(name="Person") - .add_property(name="last_name")) - identifiable = ident.get_identifiable(db.Record(name="don't touch it") + identifiable = ident.get_identifiable(SyncNode(db.Record(name="don't touch it") .add_parent("Person") - .add_property(name="last_name", value='Tom') - ) + .add_property(name="last_name", value='Tom'), db.RecordType() + .add_parent(name="Person") + .add_property(name="last_name")), []) assert identifiable.name is None def test_wildcard_ref(): ident = CaosDBIdentifiableAdapter() - ident.register_identifiable( - "Person", db.RecordType() - .add_parent(name="Person") - .add_property(name="is_referenced_by", 
value=["*"])) rec = (db.Record(name="don't touch it").add_parent("Person") .add_property(name="last_name", value='Tom')) - identifiable = ident.get_identifiable(rec, - referencing_entities={ - 'A': [1]} + dummy = SyncNode(db.Record(), None) + dummy.id = 1 + identifiable = ident.get_identifiable(SyncNode(rec, db.RecordType() + .add_parent(name="Person") + .add_property(name="is_referenced_by", value=["*"])), + + [dummy] ) assert identifiable.backrefs[0] == 1 @@ -158,25 +157,63 @@ def test_convert_value(): def test_get_identifiable(): - # TODO modify this such that it becomes a test that acutally tests (sufficiently) the - # get_identifable function - ident = CaosDBIdentifiableAdapter() ident.load_from_yaml_definition(UNITTESTDIR / "example_identifiables.yml") - r_cur = (db.Record(id=5) - .add_parent(name="Experiment", id=3) - .add_property(name="date", value="2022-02-01") - .add_property(name="result", value="FAIL")) - id_r0 = ident.get_identifiable(r_cur) - assert r_cur.parents[0].name == id_r0.record_type - assert r_cur.get_property( - "date").value == id_r0.properties["date"] - assert len(r_cur.parents) == 1 - assert len(r_cur.properties) == 2 + rec = (db.Record(id=5) + .add_parent(name="Experiment", id=3) + .add_property(name="date", value="2022-02-01") + .add_property(name="result", value="FAIL")) + se = SyncNode(rec, + ident.get_registered_identifiable(rec)) + id_r0 = ident.get_identifiable(se, []) + assert rec.parents[0].name == id_r0.record_type + assert rec.get_property("date").value == id_r0.properties["date"] + assert len(rec.parents) == 1 + assert len(rec.properties) == 2 assert len(id_r0.properties) == 1 + ident = CaosDBIdentifiableAdapter() + ident_a = db.RecordType(name="A").add_parent("A").add_property("name").add_property("a") + ident.register_identifiable("A", ident_a) + rec = (db.Record(id=5) + .add_parent(name="A", id=3) + .add_property(name="a", value="2022-02-01") + .add_property(name="result", value="FAIL")) + se = SyncNode(rec, ident.get_registered_identifiable(rec)) + for el in [ + db.Record() + .add_parent(name="A", id=3) + .add_property(name="a", value="2022-02-01") + .add_property(name="result", value="FAIL"), + db.Record(name='a') + .add_parent(name="A", id=3) + .add_property(name="a", value="2022-02-01") + .add_property(name="result", value="FAIL"), + ]: + se.update(SyncNode(el)) + + id_r0 = ident.get_identifiable(se, []) + assert "A" == id_r0.record_type + assert "2022-02-01" == id_r0.properties["a"] + assert 'a' == id_r0.name + assert len(id_r0.properties) == 1 + + rec = (db.Record(name='a') + .add_parent(name="A") + .add_property(name="a", value="2") + ) + se = SyncNode(rec, ident.get_registered_identifiable(rec)) + se.update(SyncNode( + db.Record(name='a') + .add_parent(name="A") + .add_property(name="a", value="3") + )) + + with pytest.raises(RuntimeError): + id_r0 = ident.get_identifiable(se, []) -@pytest.mark.xfail + +@ pytest.mark.xfail def test_retrieve_identified_record_for_identifiable(): # TODO modify this such that it becomes a test that acutally tests (sufficiently) the # retrieve_identified_record_for_identifiable function @@ -190,7 +227,7 @@ def test_retrieve_identified_record_for_identifiable(): r_cur = r break - id_r1 = ident.get_identifiable(r_cur) + id_r1 = ident.get_identifiable(r_cur, []) assert r_cur.parents[0].name == id_r1.record_type assert r_cur.get_property( "identifier").value == id_r1.properties["identifier"] @@ -211,3 +248,19 @@ def test_retrieve_identified_record_for_identifiable(): assert r_cur.get_property( "responsible").value 
== idr_r1.get_property("responsible").value assert r_cur.description == idr_r1.description + + +@patch("caoscrawler.identifiable_adapters.get_children_of_rt", + new=Mock(side_effect=lambda x: [x])) +def test_referencing_entity_has_appropriate_type(): + dummy = db.Record().add_parent("A") + registered_identifiable = db.RecordType() + rft = IdentifiableAdapter.referencing_entity_has_appropriate_type + assert not rft([], registered_identifiable) + assert not rft(dummy.parents, registered_identifiable) + registered_identifiable.add_property("is_referenced_by", "B") + assert not rft(dummy.parents, registered_identifiable) + registered_identifiable.properties[0].value = ["B", "A"] + assert rft(dummy.parents, registered_identifiable) + registered_identifiable.properties[0].value = ["B", "*"] + assert rft(dummy.parents, registered_identifiable) diff --git a/unittests/test_sync_graph.py b/unittests/test_sync_graph.py new file mode 100644 index 0000000000000000000000000000000000000000..2c63cb54aceeaef98df36630ba0873cd62ebf7e3 --- /dev/null +++ b/unittests/test_sync_graph.py @@ -0,0 +1,651 @@ +#!/usr/bin/env python3 +# encoding: utf-8 +# +# This file is a part of the LinkAhead Project. +# +# Copyright (C) 2024 Indiscale GmbH <info@indiscale.com> +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <https://www.gnu.org/licenses/>. 
+# +from functools import partial +from unittest.mock import MagicMock, Mock, patch + +import linkahead as db +import pytest +from test_crawler import basic_retrieve_by_name_mock_up, mock_get_entity_by + +from caoscrawler.exceptions import (ImpossibleMergeError, + MissingIdentifyingProperty) +from caoscrawler.identifiable import Identifiable +from caoscrawler.identifiable_adapters import CaosDBIdentifiableAdapter +from caoscrawler.sync_graph import SyncGraph, _set_each_scalar_value +from caoscrawler.sync_node import SyncNode, parent_in_list, property_in_list + +from itertools import product + + +@pytest.fixture +def simple_adapter(): + # different RTs with different registered identifiables to allow to test various behavior + ident_adapter = CaosDBIdentifiableAdapter() + ident_adapter.register_identifiable( + "RT1", + db.RecordType().add_parent("RT1").add_property("RT2")) + ident_adapter.register_identifiable( + "RT2", + db.RecordType().add_parent("RT2").add_property("is_referenced_by", ["RT1", "RT3"])) + ident_adapter.register_identifiable( + "RT3", + db.RecordType().add_parent("RT3").add_property("a")) + ident_adapter.register_identifiable( + "RT4", + db.RecordType().add_parent("RT4").add_property("RT3")) + ident_adapter.register_identifiable( + "RT5", + db.RecordType().add_parent("RT5").add_property("name")) + return ident_adapter + + +def test_create_flat_list(): + a = db.Record() + b = db.Record() + a.add_property(name="a", value=a) + a.add_property(name="b", value=b) + flat = SyncGraph._create_flat_list([a]) + assert len(flat) == 2 + assert a in flat + assert b in flat + c = db.Record() + c.add_property(name="a", value=a) + # This would cause a recursion error if it is not dealt with properly. + a.add_property(name="c", value=c) + flat = SyncGraph._create_flat_list([c]) + assert len(flat) == 3 + assert a in flat + assert b in flat + assert c in flat + + # Test for lists: + a = db.Record() + b = db.Record() + d = db.Record() + a.add_property(name="a", value=a) + a.add_property(name="list", value=[b, d]) + flat = SyncGraph._create_flat_list([a]) + assert len(flat) == 3 + assert a in flat + assert b in flat + assert d in flat + + c = db.Record() + c.add_property(name="a", value=a) + # This would cause a recursion error if it is not dealt with properly. 
+ a.add_property(name="second_list", value=[b, d, c]) + flat = SyncGraph._create_flat_list([c]) + assert len(flat) == 4 + assert a in flat + assert b in flat + assert c in flat + assert d in flat + + +@patch("caoscrawler.identifiable_adapters.get_children_of_rt", + new=Mock(side_effect=lambda x: [x])) +def test_create_reference_mapping(): + a = SyncNode(db.Record().add_parent("RT1"), + db.RecordType().add_property("is_referenced_by", ["RT2"])) + b = SyncNode(db.Record(id=132).add_parent("RT2").add_property('a', a), + db.RecordType().add_property("a")) + ses = [a, b] + + mappings = SyncGraph._create_reference_mapping(ses) + # test initialization + for index, mapping in product((0, 1), mappings): + assert id(ses[index]) in mapping + + (forward_references, backward_references, forward_references_id_props, + backward_references_id_props, forward_references_backref, + backward_references_backref) = mappings + + # a has no ref + assert len(forward_references[id(a)]) == 0 + assert backward_references[id(a)] == set([b]) + # b does + assert forward_references[id(b)] == set([a]) + assert backward_references[id(b)] == set() + # a has no identifying reference + assert forward_references_id_props[id(a)] == set() + assert backward_references_id_props[id(a)] == set([b]) + # b has an identifying reference + assert forward_references_id_props[id(b)] == set([a]) + assert backward_references_id_props[id(b)] == set() + # a has an identifying back reference + assert forward_references_backref[id(a)] == set() + assert backward_references_backref[id(a)] == set([b]) + # b does not + assert forward_references_backref[id(b)] == set([a]) + assert backward_references_backref[id(b)] == set() + + +@patch("caoscrawler.sync_graph.cached_get_entity_by", + new=Mock(side_effect=mock_get_entity_by)) +def test_SyncGraph_init(): + # trivial case + a = db.Record(id=101).add_parent("A") + ident_a = db.RecordType().add_parent("A").add_property("prop_ident") + ident_adapter = CaosDBIdentifiableAdapter() + ident_adapter.register_identifiable("A", ident_a) + SyncGraph([a], ident_adapter) + SyncGraph([], ident_adapter) # should not fail either... 
+ # test whether missing identifying properties cause an exception + with pytest.raises(MissingIdentifyingProperty): + SyncGraph([db.Record().add_parent("A")], ident_adapter) + + entlist = [ + db.Record(id=101).add_parent("A"), + db.Record(id=102).add_parent("A"), + db.File(path='a').add_parent("A"), + db.File(path='b').add_parent("A"), + db.Record(id=103).add_parent("A"), + db.Record(id=104).add_parent("A").add_property(name='prop_ident', value="MERGEME"), + db.Record().add_parent("A").add_property(name='prop_ident', value="MERGEME"), + db.File(path='a', file='b').add_parent("A"), + db.Record(id=101).add_parent("A"), + db.Record().add_parent("A").add_property(name='prop_ident', value="other"), + db.Record().add_parent("A").add_property(name='prop_ident', + value=db.Record().add_parent("A") + .add_property(name='prop_ident', value="other")), + db.File(path='a', file='b').add_parent("A"), + db.Record(id=101).add_parent("A"), + ] + st = SyncGraph(entlist, ident_adapter) + # all nodes with ID=101 have been merged + assert len([el for el in st.nodes if el.id == 101]) == 1 + # all nodes with path='a' have been merged + assert len([el for el in st.nodes if el.path == 'a']) == 1 + # all nodes with ID or path were removed from unchecked + for el in st.nodes: + if el.id is not None or el.path is not None: + assert el not in st.unchecked + # all nodes with ID are in the ID lookup + for el in st.nodes: + if el.id is not None: + assert st._id_look_up[el.id] is el + # all nodes with path are in the path lookup + for el in st.nodes: + if el.path is not None: + assert st._path_look_up[el.path] is el + # all nodes with identifiable are in the identifiable lookup + for el in st.nodes: + if el.identifiable is not None: + assert st._identifiable_look_up[el.identifiable.get_representation()] is el + # The node, which has no ID but has an identifiable, was merged with another node with ID (due + # to the shared identifiable) + new_one = [el for el in st.nodes if len(el.properties) > 0 + and el.properties[0].value == "MERGEME"] + assert len(new_one) == 1 + assert new_one[0].id == 104 + # every node that does not rely on something unchecked has an identifiable or an ID + for el in st.nodes: + if not st._identity_relies_on_unchecked_entity(el): + assert el.identifiable is not None or el.id is not None + + +@patch("caoscrawler.identifiable_adapters.get_children_of_rt", + new=Mock(side_effect=lambda x: [x])) +def test_merge_into_trivial(simple_adapter): + # simplest case: a -> c + # b + # (a reference c; b does not reference anything; a & b have the same target + # record) + c = db.Record(name='c').add_parent("RT2") + a = db.Record(name='a').add_parent("RT1").add_property('RT2', c) + b = db.Record(id=101).add_parent("RT1") + + st = SyncGraph([a, b], simple_adapter) + se_a, se_b, se_c = st.nodes + assert se_a.name == 'a' + assert se_b.id == 101 + assert se_c.name == 'c' + + # CHECK REFERENCE MAP (before merge): + # c is referenced by a + assert len(st.forward_references[id(se_a)]) == 1 + assert se_c in st.forward_references[id(se_a)] + assert len(st.forward_references[id(se_b)]) == 0 + assert len(st.forward_references[id(se_c)]) == 0 + assert len(st.backward_references[id(se_a)]) == 0 + assert len(st.backward_references[id(se_b)]) == 0 + assert len(st.backward_references[id(se_c)]) == 1 + assert se_a in st.backward_references[id(se_c)] + + assert len(st.forward_references_id_props[id(se_a)]) == 1 + assert se_c in st.forward_references_id_props[id(se_a)] + assert len(st.forward_references_id_props[id(se_b)]) == 0 + 
assert len(st.forward_references_id_props[id(se_c)]) == 0 + assert len(st.backward_references_id_props[id(se_a)]) == 0 + assert len(st.backward_references_id_props[id(se_b)]) == 0 + assert len(st.backward_references_id_props[id(se_c)]) == 1 + assert se_a in st.backward_references_id_props[id(se_c)] + + assert len(st.forward_references_backref[id(se_a)]) == 1 + assert se_c in st.forward_references_backref[id(se_a)] + assert len(st.forward_references_backref[id(se_b)]) == 0 + assert len(st.forward_references_backref[id(se_c)]) == 0 + assert len(st.backward_references_backref[id(se_a)]) == 0 + assert len(st.backward_references_backref[id(se_b)]) == 0 + assert len(st.backward_references_backref[id(se_c)]) == 1 + assert se_a in st.backward_references_backref[id(se_c)] + + st.set_id_of_node(se_a, 101) + + # CHECK REFERENCE MAP (after merge): + # c is now referenced by b + assert id(se_a) not in st.forward_references + assert len(st.forward_references[id(se_b)]) == 1 + assert se_c in st.forward_references[id(se_b)] + assert len(st.forward_references[id(se_c)]) == 0 + assert id(se_a) not in st.backward_references + assert len(st.backward_references[id(se_b)]) == 0 + assert len(st.backward_references[id(se_c)]) == 1 + assert se_b in st.backward_references[id(se_c)] + + assert id(se_a) not in st.forward_references_id_props + assert len(st.forward_references_id_props[id(se_b)]) == 1 + assert se_c in st.forward_references_id_props[id(se_b)] + assert len(st.forward_references_id_props[id(se_c)]) == 0 + assert id(se_a) not in st.backward_references_id_props + assert len(st.backward_references_id_props[id(se_b)]) == 0 + assert len(st.backward_references_id_props[id(se_c)]) == 1 + assert se_b in st.backward_references_id_props[id(se_c)] + + assert id(se_a) not in st.forward_references_backref + assert len(st.forward_references_backref[id(se_b)]) == 1 + assert se_c in st.forward_references_backref[id(se_b)] + assert len(st.forward_references_backref[id(se_c)]) == 0 + assert id(se_a) not in st.backward_references_backref + assert len(st.backward_references_backref[id(se_b)]) == 0 + assert len(st.backward_references_backref[id(se_c)]) == 1 + assert se_b in st.backward_references_backref[id(se_c)] + + +@patch("caoscrawler.identifiable_adapters.get_children_of_rt", + new=Mock(side_effect=lambda x: [x])) +def test_merge_into_simple(simple_adapter): + # simple case: a -> c <- b (a & b reference c; a & b have the same target record) + c = db.Record(name='c').add_parent("RT2") + a = db.Record().add_parent("RT1").add_property('RT2', c) + b = db.Record().add_parent("RT1").add_property('RT2', c) + + st = SyncGraph([a, b], simple_adapter) + se_a = st.nodes[0] + se_b = st.nodes[1] + se_c = st.nodes[2] + + # CHECK REFERENCE MAP: + # c is referenced by a & b + assert len(st.forward_references[id(se_a)]) == 1 + se_c in st.forward_references[id(se_a)] + assert len(st.forward_references[id(se_b)]) == 1 + se_c in st.forward_references[id(se_b)] + assert len(st.forward_references[id(se_c)]) == 0 + assert len(st.backward_references[id(se_a)]) == 0 + assert len(st.backward_references[id(se_b)]) == 0 + assert len(st.backward_references[id(se_c)]) == 2 + se_a in st.backward_references[id(se_c)] + se_b in st.backward_references[id(se_c)] + + assert len(st.forward_references_id_props[id(se_a)]) == 1 + se_c in st.forward_references_id_props[id(se_a)] + assert len(st.forward_references_id_props[id(se_b)]) == 1 + se_c in st.forward_references_id_props[id(se_b)] + assert len(st.forward_references_id_props[id(se_c)]) == 0 + assert 
len(st.backward_references_id_props[id(se_a)]) == 0 + assert len(st.backward_references_id_props[id(se_b)]) == 0 + assert len(st.backward_references_id_props[id(se_c)]) == 2 + se_a in st.backward_references_id_props[id(se_c)] + se_b in st.backward_references_id_props[id(se_c)] + + assert len(st.forward_references_backref[id(se_a)]) == 1 + se_c in st.forward_references_backref[id(se_a)] + assert len(st.forward_references_backref[id(se_b)]) == 1 + se_c in st.forward_references_backref[id(se_b)] + assert len(st.forward_references_backref[id(se_c)]) == 0 + assert len(st.backward_references_backref[id(se_a)]) == 0 + assert len(st.backward_references_backref[id(se_b)]) == 0 + assert len(st.backward_references_backref[id(se_c)]) == 2 + se_a in st.backward_references_backref[id(se_c)] + se_b in st.backward_references_backref[id(se_c)] + + st._merge_into(se_a, se_b) + + # CHECK REFERENCE MAP (after merge): + # c is now referenced by b + # (same situation as above) + assert id(se_a) not in st.forward_references + assert len(st.forward_references[id(se_b)]) == 1 + se_c in st.forward_references[id(se_b)] + assert len(st.forward_references[id(se_c)]) == 0 + assert id(se_a) not in st.backward_references + assert len(st.backward_references[id(se_b)]) == 0 + assert len(st.backward_references[id(se_c)]) == 1 + se_b in st.backward_references[id(se_c)] + + assert id(se_a) not in st.forward_references_id_props + assert len(st.forward_references_id_props[id(se_b)]) == 1 + se_c in st.forward_references_id_props[id(se_b)] + assert len(st.forward_references_id_props[id(se_c)]) == 0 + assert id(se_a) not in st.backward_references_id_props + assert len(st.backward_references_id_props[id(se_b)]) == 0 + assert len(st.backward_references_id_props[id(se_c)]) == 1 + se_b in st.backward_references_id_props[id(se_c)] + + assert id(se_a) not in st.forward_references_backref + assert len(st.forward_references_backref[id(se_b)]) == 1 + se_c in st.forward_references_backref[id(se_b)] + assert len(st.forward_references_backref[id(se_c)]) == 0 + assert id(se_a) not in st.backward_references_backref + assert len(st.backward_references_backref[id(se_b)]) == 0 + assert len(st.backward_references_backref[id(se_c)]) == 1 + se_b in st.backward_references_backref[id(se_c)] + + +@patch("caoscrawler.identifiable_adapters.get_children_of_rt", + new=Mock(side_effect=lambda x: [x])) +def test_backward_references_backref(): + # We use the reference as identifying reference in both directions. 
Thus the map is the same + # for all three categories: references, id_references and id_referenced_by + ident_a = db.RecordType().add_parent("BR").add_property("name") + ident_b = db.RecordType().add_parent("C").add_property("is_referenced_by", ["BR"]) + ident_adapter = CaosDBIdentifiableAdapter() + ident_adapter.register_identifiable("BR", ident_a) + ident_adapter.register_identifiable("C", ident_b) + + referenced = db.Record(name="B").add_parent("C") + ent_list = [referenced, db.Record(name="A").add_parent("BR").add_property("ref", referenced), ] + + st = SyncGraph(ent_list, ident_adapter) + assert st.nodes[1] in st.backward_references_backref[id(st.nodes[0])] + + +@patch("caoscrawler.identifiable_adapters.get_children_of_rt", + new=Mock(side_effect=lambda x: [x])) +def test_set_id_of_node(simple_adapter): + # setting the id should lead to the node being marked as existing + ent_list = [db.Record(name='a').add_parent("RT5")] + st = SyncGraph(ent_list, simple_adapter) + assert len(st.nodes) == 1 + assert len(st.unchecked) == 1 + st.set_id_of_node(st.unchecked[0], 101) + assert len(st.nodes) == 1 + assert len(st.unchecked) == 0 + assert id(st.nodes[0]) in st._existing + + # setting the id with None should lead to the node being marked as missing + ent_list = [db.Record().add_parent("RT1").add_property(name="RT2", value=1)] + st = SyncGraph(ent_list, simple_adapter) + assert len(st.nodes) == 1 + assert len(st.unchecked) == 1 + # is automatically set in during initialization of graph + assert st.nodes[0].identifiable is not None + st.set_id_of_node(st.unchecked[0]) + assert len(st.nodes) == 1 + assert len(st.unchecked) == 0 + assert id(st.nodes[0]) in st._missing + + # setting the id to one that already exists should lead to a merge + ent_list = [ + db.Record(id=101).add_parent("RT5"), + db.Record(name='a').add_parent("RT5").add_property(name="RT2", value=1)] + st = SyncGraph(ent_list, simple_adapter) + assert len(st.nodes) == 2 + assert len(st.unchecked) == 1 + st.set_id_of_node(st.unchecked[0], 101) + assert len(st.nodes) == 1 + assert len(st.unchecked) == 0 + assert st.nodes[0].properties[0].name == "RT2" + + # setting the id to None should lead to depending nodes marked as missing + ent_list = [ + db.Record().add_parent("RT3").add_property(name="a", value=1).add_property( + name="RT2", value=db.Record().add_parent("RT2")), + ] + st = SyncGraph(ent_list, simple_adapter) + assert len(st.nodes) == 2 + assert len(st.unchecked) == 2 + st.set_id_of_node(st.unchecked[0]) + assert len(st.nodes) == 2 + assert len(st.unchecked) == 0 + assert id(st.nodes[0]) in st._missing + assert id(st.nodes[1]) in st._missing + + # same as above but with backref + ent_list = [ + db.Record() + .add_parent("RT4") + .add_property(name="RT3", + value=db.Record().add_parent("RT3").add_property(name="a", value=1)), + ] + st = SyncGraph(ent_list, simple_adapter) + assert len(st.nodes) == 2 + assert len(st.unchecked) == 2 + assert st.unchecked[1].identifiable is not None + st.set_id_of_node(st.unchecked[1]) + assert len(st.nodes) == 2 + assert len(st.unchecked) == 0 + assert id(st.nodes[0]) in st._missing + assert id(st.nodes[1]) in st._missing + + # setting an id might allow to check another node that depends on the former + ent_list = [ + db.Record() + .add_parent("RT4") + .add_property(name="RT3", + value=db.Record().add_parent("RT3").add_property(name="a", value=1)), + ] + st = SyncGraph(ent_list, simple_adapter) + assert st.nodes[0].identifiable is None + assert st.nodes[1].identifiable is not None + 
+    st.set_id_of_node(st.unchecked[1], 111)
+    assert st.nodes[0].identifiable is not None
+    assert st.nodes[1].identifiable is not None
+
+    # same as above but going one step further: the new identifiable allows that node to be merged
+    ent_list = [
+        (db.Record()
+         .add_parent("RT4")
+         .add_property(name="RT3",
+                       value=db.Record().add_parent("RT3").add_property(name="a", value=1))),
+
+        (db.Record()
+         .add_parent("RT4")
+         .add_property(name="RT3", value=111))
+    ]
+    st = SyncGraph(ent_list, simple_adapter)
+    assert st.nodes[0].identifiable is None
+    assert st.nodes[1].identifiable is not None
+    assert st.nodes[2].identifiable is not None
+    assert len(st.nodes) == 3
+    st.set_id_of_node(st.unchecked[2], 111)
+    assert st.nodes[0].identifiable is not None
+    assert len(st.nodes) == 2
+
+
+@patch("caoscrawler.sync_graph.cached_get_entity_by",
+       new=Mock(side_effect=mock_get_entity_by))
+def test_merging(simple_adapter):
+    # identifying information can be given at various locations in the hierarchical tree;
+    # test whether an object is correctly combined for all cases
+    ident_adapter = CaosDBIdentifiableAdapter()
+    ident_a = db.RecordType().add_parent("A").add_property("name").add_property("a")
+    ident_adapter.register_identifiable("A", ident_a)
+    ident_adapter.retrieve_identified_record_for_identifiable = Mock(
+        side_effect=partial(
+            basic_retrieve_by_name_mock_up, known={"A": db.Record(id=1111, name="A")}))
+
+    # merging based on id
+    ent_list = [
+        db.Record(id=101).add_parent("A"),
+        db.Record(id=101).add_parent("A")]
+    st = SyncGraph(ent_list, ident_adapter)
+    assert len(st.nodes) == 1
+    assert len(st.unchecked) == 0
+    assert 101 == st.nodes[0].id
+    assert "A" == st.nodes[0].parents[0].name
+
+    # merging based on path
+    ent_list = [
+        db.File(path='101').add_parent("A"),
+        db.File(path='101').add_parent("A")]
+    st = SyncGraph(ent_list, ident_adapter)
+    assert len(st.nodes) == 1
+    assert len(st.unchecked) == 0
+    assert '101' == st.nodes[0].path
+    assert "A" == st.nodes[0].parents[0].name
+
+    # merging based on the identifiable (non-identifying properties are ignored)
+    ent_list = [
+        db.File(name='101').add_parent("A").add_property('a', value=1).add_property('b', value=1),
+        db.File(name='101').add_parent("A").add_property('a', value=1).add_property('b', value=2)]
+    st = SyncGraph(ent_list, ident_adapter)
+    assert len(st.nodes) == 1
+    assert st.nodes[0].id is None
+    assert '101' == st.nodes[0].name
+    assert "A" == st.nodes[0].parents[0].name
+    assert 1 == st.nodes[0].properties[0].value
+    assert "a" == st.nodes[0].properties[0].name
+
+    # Merging a mix: one Record needs the identifiable to be merged, but the identifying
+    # information is scattered across the other Records.
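+    # each of the four Records below carries only part of the information (id, name, or the
+    # identifying property 'a'); they must nevertheless be combined into a single node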
+    ent_list = [
+        db.Record(id=101).add_parent("A"),
+        db.Record(id=101, name='a').add_parent("A"),
+        db.Record(id=101).add_parent("A").add_property('a', value=1),
+        db.Record(name='a').add_parent("A").add_property('a', value=1)]
+
+    st = SyncGraph(ent_list, ident_adapter)
+    assert len(st.nodes) == 1
+    assert len(st.unchecked) == 0
+    assert 'a' == st.nodes[0].name
+    assert "A" == st.nodes[0].parents[0].name
+    assert 1 == st.nodes[0].properties[0].value
+    assert "a" == st.nodes[0].properties[0].name
+    assert 101 == st.nodes[0].id
+
+    # test that adding an ID can lead to a cascade of merges
+    # This also tests whether marking a node as missing allows an identifiable to be created
+    # and thus enables a merge.
+    subtree = db.Record(name='a').add_parent("A").add_property('a', value=db.Record(
+        name='b').add_parent("A").add_property('a', value=db.Record(
+            name='c').add_parent("A").add_property('a', value="missing")))
+    ent_list = [
+        db.Record(id=101).add_parent("A"),
+        db.Record(id=101, name='z').add_parent("A"),
+        db.Record(id=101).add_parent("A").add_property('a', value=subtree),
+        db.Record(name='z').add_parent("A").add_property('a', value=subtree),
+    ]
+
+    st = SyncGraph(ent_list, ident_adapter)
+    assert len(st.nodes) == 5
+    assert len(st.unchecked) == 4
+    missing_one = [el for el in st.nodes if el.name == 'c'][0]
+    st.set_id_of_node(missing_one)
+    # setting c to missing means that b cannot exist, which means that a cannot exist; this
+    # allows the two z nodes to be merged
+    assert len(st.nodes) == 4
+    assert len(st.unchecked) == 0
+
+
+def test_update_of_reference_values(simple_adapter):
+    # Multiple nodes are merged, including one that is referenced. Ensure that the value of the
+    # referencing node's property is still updated when the id is set (the value object is
+    # replaced appropriately).
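+    # all identical RT3 records below collapse into a single node; the RT4 record references
+    # that node, so setting the merged node's id must also be propagated into the value of the
+    # referencing property: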
+    a = db.Record().add_parent("RT3").add_property('a', value=1)
+    ent_list = [
+        a,
+        db.Record().add_parent("RT3").add_property('a', value=1),
+        db.Record().add_parent("RT3").add_property('a', value=1),
+        db.Record().add_parent("RT3").add_property('a', value=1),
+        db.Record().add_parent("RT3").add_property('a', value=1),
+        db.Record().add_parent("RT4").add_property('RT3', value=a),
+        db.Record().add_parent("RT3").add_property('a', value=1),
+        db.Record().add_parent("RT3").add_property('a', value=1)]
+    st = SyncGraph(ent_list, simple_adapter)
+    assert len(st.nodes) == 2
+    assert len(st.unchecked) == 2
+    assert 'RT4' == st.nodes[1].parents[0].name
+    st.set_id_of_node(st.nodes[0], 101)
+    b_prop = st.nodes[1].properties[0].value
+    assert b_prop.id == 101
+
+
+def test_ignoring_irrelevant_references(simple_adapter):
+    # make sure that a circle of references is no problem if one reference is not identifying
+    b = db.Record(name='b').add_parent("RT5")
+    a = db.Record().add_parent("RT3").add_property('a', value=b)
+    b.add_property('a', value=a)
+    ent_list = [a, b]
+    st = SyncGraph(ent_list, simple_adapter)
+    assert len(st.nodes) == 2
+    assert len(st.unchecked) == 2
+    assert st.nodes[1].name == 'b'
+
+    # a relies on b
+    assert st._identity_relies_on_unchecked_entity(st.nodes[0])
+    # b relies on nothing
+    assert not st._identity_relies_on_unchecked_entity(st.nodes[1])
+    # set ID of b
+    st.set_id_of_node(st.nodes[1], 101)
+    assert len(st.unchecked) == 1
+    # now a no longer relies on an unchecked entity
+    assert not st._identity_relies_on_unchecked_entity(st.nodes[0])
+
+
+# xfail: implementation insufficient
+@pytest.mark.xfail()
+def test_detect_circular_dependency(crawler_mocked_identifiable_retrieve, caplog):
+    crawler = crawler_mocked_identifiable_retrieve
+    crawler.identifiableAdapter.get_registered_identifiable = Mock(
+        side_effect=lambda x: db.Record().add_parent('C').add_property(name='C'))
+    a = db.Record(name='a').add_parent("C")
+    b = db.Record(name='b').add_parent("C").add_property(name="C", value=a)
+    c = db.Record(name='c').add_parent("C").add_property(name='D', value='e'
+                                                         ).add_property(name="C", value=b)
+    d = db.Record(name='c').add_parent("C")
+    a.add_property(name="C", value=c)
+    flat = [a, b, c]
+    circle = Crawler.detect_circular_dependency(flat)
+    assert [id(el) for el in circle] == [id(el) for el in [a, c, b, a]]
+
+    assert Crawler.detect_circular_dependency([d]) is None
+    st = SyncGraph(flat, crawler.identifiableAdapter)
+    with raises(RuntimeError):
+        _, _ = crawler.split_into_inserts_and_updates(st)
+    caplog.set_level(logging.ERROR, logger="caoscrawler.converters")
+    assert "Found circular dependency" in caplog.text
+    assert "\n--------\n\n> Parent: C\n\n>> Name: a\n['C']" in caplog.text
+    caplog.clear()
+
+
+def test_set_each_scalar_value():
+    """Test whether properties with None as value are treated appropriately."""
+    a = SyncNode(db.Record().add_parent("RT1").add_property(name="bla"),
+                 db.RecordType().add_property("is_referenced_by", ["RT2"]))
+    _set_each_scalar_value(a, lambda x: False, None)
+    _set_each_scalar_value(a, lambda x: isinstance(x, SyncNode), None)
+    _set_each_scalar_value(a, lambda x: x is None, lambda x: 42)
+    assert a.properties[0].value == 42
+    _set_each_scalar_value(a, lambda x: x == 42, lambda x: None)
+    assert a.properties[0].value is None
diff --git a/unittests/test_sync_node.py b/unittests/test_sync_node.py
new file mode 100644
index 0000000000000000000000000000000000000000..668a53470d028dfcfce7bb5785d68b685b034595
--- /dev/null
+++ b/unittests/test_sync_node.py
@@ -0,0 +1,347 @@
+#!/usr/bin/env python3
+# encoding: utf-8
+#
+# This file is a part of the LinkAhead Project.
+#
+# Copyright (C) 2024 Indiscale GmbH <info@indiscale.com>
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <https://www.gnu.org/licenses/>.
+#
+from functools import partial
+from unittest.mock import MagicMock, Mock, patch
+
+import linkahead as db
+import pytest
+from caoscrawler.exceptions import ImpossibleMergeError
+from caoscrawler.identifiable import Identifiable
+from caoscrawler.identifiable_adapters import CaosDBIdentifiableAdapter
+from caoscrawler.sync_graph import SyncGraph
+from caoscrawler.sync_node import SyncNode, parent_in_list, property_in_list
+
+from test_crawler import basic_retrieve_by_name_mock_up, mock_get_entity_by
+
+
+def assert_parents_equal(p1, p2):
+    """Special assertion for comparing parents."""
+    for a, b in zip(p1, p2):
+        assert a.id == b.id
+        assert a.name == b.name
+
+
+def assert_properties_equal(p1, p2):
+    """Special assertion for comparing properties."""
+    for a, b in zip(p1, p2):
+        assert a.id == b.id
+        assert a.name == b.name
+        assert a.value == b.value
+        assert a.datatype == b.datatype
+
+
+def test_sync_node():
+    # initialization
+    rec = (db.Record(id=101, name='101')
+           .add_parent("A")
+           .add_parent("B")
+           .add_parent(id=102)
+           .add_property(name="a", value='a')
+           .add_property(id=103, value='b'))
+    rec.description = "hallo"
+    sna = SyncNode(rec)
+    # check information stored in initialized SyncNode
+    assert "Record" in str(sna)
+    assert sna.id == rec.id
+    assert sna.role == rec.role
+    assert sna.name == rec.name
+    assert sna.description == rec.description
+    assert_parents_equal(sna.parents, rec.parents)
+    assert_properties_equal(sna.properties, rec.properties)
+    # ... special case File (path and file attributes)
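+    # (a File additionally carries 'path' and 'file'; both must survive the round trip
+    # through SyncNode and export_entity, which is checked below)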
+    fi = db.File(id=101, name='101', path='/a/')
+    snb = SyncNode(fi)
+    assert snb.role == fi.role
+    assert snb.name == fi.name
+    assert snb.id == fi.id
+    assert snb.path == fi.path
+    assert snb.file == fi.file
+
+    # check information in exported db.Entity
+    export = sna.export_entity()
+    assert export.id == rec.id
+    assert export.role == rec.role
+    assert export.name == rec.name
+    assert export.description == rec.description
+    assert_parents_equal(export.parents, rec.parents)
+    assert_properties_equal(export.properties, rec.properties)
+    export = snb.export_entity()
+    assert export.role == fi.role
+    assert export.name == fi.name
+    assert export.id == fi.id
+    assert export.path == fi.path
+    assert export.file == fi.file
+
+    # merge without common information
+    # --------------------------------
+    rec_a = (db.Record(name='101')
+             .add_parent("A")
+             .add_parent(id=102)
+             .add_property(name="a", value='a')
+             .add_property(id=103, value='b'))
+
+    rec_b = (db.Record(id=101)
+             .add_parent("B")
+             .add_parent(id=103)
+             .add_property(name="a", value='a')
+             .add_property(id=103, value='b'))
+    rec_b.description = "tja"
+
+    sn_a = SyncNode(rec_a)
+    sn_b = SyncNode(rec_b)
+    sn_a.update(sn_b)
+    # test information in updated node
+    assert sn_a.id == rec_b.id
+    assert sn_a.role == rec_a.role
+    assert sn_a.name == rec_a.name
+    assert sn_a.description == rec_b.description
+    for p in rec_a.parents + rec_b.parents:
+        assert p in sn_a.parents
+    for p in rec_a.properties + rec_b.properties:
+        assert p in sn_a.properties
+    # Check for duplicated property:
+    ps = [p for p in sn_a.properties if p.name == "a"]
+    assert len(ps) == 2
+    assert ps[0].value == "a"
+    assert ps[1].value == "a"
+
+    # test information in exported entity
+    export = sn_a.export_entity()
+    assert export.id == rec_b.id
+    assert export.name == rec_a.name
+    for p in rec_a.parents + rec_b.parents:
+        assert parent_in_list(p, export.parents)
+    for p in rec_a.properties + rec_b.properties:
+        if p.name is not None:
+            assert p.name in [el.name for el in export.properties]
+        if p.id is not None:
+            assert p.id in [el.id for el in export.properties]
+    assert len(export.properties) == 2
+    assert export.get_property('a').value == 'a'
+    assert export.get_property(103).value == 'b'
+    assert export.description == rec_b.description
+    assert export.role == rec_a.role
+
+    # merge with common information
+    # -----------------------------
+    rec_a = (db.Record(id=101, name='101')
+             .add_parent("A")
+             .add_parent(id=102)
+             .add_property(name="a", value='a'))
+
+    rec_b = (db.Record(id=101, name='101')
+             .add_parent("A")
+             .add_parent(id=102)
+             .add_property(name="a", value='a'))
+
+    sn_a = SyncNode(rec_a)
+    sn_b = SyncNode(rec_b)
+    sn_a.update(sn_b)
+    assert sn_a.id == rec_b.id
+    assert sn_a.name == rec_a.name
+    for p in rec_a.parents + rec_b.parents:
+        assert parent_in_list(p, sn_a.parents)
+    for p in rec_a.properties + rec_b.properties:
+        assert property_in_list(p, sn_a.properties)
+    assert sn_a.description == rec_b.description
+    assert sn_a.role == rec_a.role
+
+    # merge with conflicting information
+    # ----------------------------------
+    # ID mismatch
+    sn_a = SyncNode(db.Record(id=102))
+    with pytest.raises(ImpossibleMergeError, match="Trying to update"):
+        sn_a.update(SyncNode(db.Record(id=101)))
+
+    # name mismatch
+    sn_a = SyncNode(db.Record(name='102'))
+    with pytest.raises(ImpossibleMergeError, match="Trying to update"):
+        sn_a.update(SyncNode(db.Record(name='101')))
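+    # the remaining conflicts (role, description, path) are rejected in the same way; only a
+    # mismatching identifiable raises a ValueError instead: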
+    # type mismatch
+    sn_a = SyncNode(db.Record(name='102'))
+    with pytest.raises(ImpossibleMergeError, match="Trying to update"):
+        sn_a.update(SyncNode(db.File(name='102')))
+
+    # description mismatch
+    sn_a = SyncNode(db.Record(description='102'))
+    with pytest.raises(ImpossibleMergeError, match="Trying to update"):
+        sn_a.update(SyncNode(db.Record(description='101')))
+
+    # path mismatch
+    sn_a = SyncNode(db.File(path='102'))
+    with pytest.raises(ImpossibleMergeError, match="Trying to update"):
+        sn_a.update(SyncNode(db.File(path='101')))
+
+    # identifiable mismatch
+    sn_a = SyncNode(db.File(path='102'))
+    sn_a.identifiable = Identifiable(name='a')
+    sn_b = SyncNode(db.File(path='101'))
+    sn_b.identifiable = Identifiable(name='b')
+    with pytest.raises(ValueError, match="identifiable"):
+        sn_a.update(sn_b)
+
+
+def test_export_node():
+    rec_a = (db.Record(id=101)
+             .add_parent("B")
+             .add_parent(id=103)
+             .add_property(name="a", value=[SyncNode(db.Record())])
+             .add_property(name='b', id=103, value='b'))
+
+    sn_a = SyncNode(rec_a)
+    exp = sn_a.export_entity()
+    assert exp.id == rec_a.id
+    assert exp.name == rec_a.name
+    for p in rec_a.parents:
+        assert len([el for el in exp.parents if p.name == el.name]) == 1
+    for p in rec_a.properties:
+        assert p.value == exp.get_property(p.name).value
+        if isinstance(p.value, list):
+            assert len(p.value) == len(exp.get_property(p.name).value)
+    assert len(exp.properties) == len(rec_a.properties)
+    assert len(exp.parents) == len(rec_a.parents)
+
+    # ---------------------------------------------------------------------------------------------
+    # NOTE: in the following we create a SyncNode object with the same Property twice as a
+    # shorthand for a SyncNode that was created from one Entity with such a Property and was then
+    # updated with another SyncNode that also has that Property
+    # ---------------------------------------------------------------------------------------------
+
+    # same property name, different values
+    rec_a = (db.Record(id=101)
+             .add_parent("B")
+             .add_property(name="a", value='b')
+             .add_property(name="a", value='a'))
+
+    # there should be a warning when multiproperties are used
+    with pytest.warns(UserWarning) as caught:
+        SyncNode(rec_a)
+    messages = {str(w.message) for w in caught}
+    assert "Multiproperties are not supported by the crawler." in messages
+
+    with pytest.raises(ImpossibleMergeError):
+        exp = SyncNode(rec_a).export_entity()
+
+    # SyncNodes with the same ID are considered equal
+    rec_a = (db.Record(id=101)
+             .add_parent("B")
+             .add_property(name="a", value=SyncNode(db.Record(id=1)))
+             .add_property(name="a", value=SyncNode(db.Record(id=1))))
+
+    exp = SyncNode(rec_a).export_entity()
+    assert exp.get_property('a').value.id == 1
+    # SyncNodes convert multi properties into single properties
+    assert len([p for p in exp.properties if p.name == "a"]) == 1
+
+    # the same SyncNode object is obviously equal
+    sn = SyncNode(db.Record(id=1))
+    rec_a = (db.Record(id=101)
+             .add_parent("B")
+             .add_property(name="a", value=sn)
+             .add_property(name="a", value=sn))
+
+    exp = SyncNode(rec_a).export_entity()
+    assert exp.get_property('a').value.id == 1
+    assert len([p for p in exp.properties if p.name == "a"]) == 1
+
+    # different SyncNode objects (without an ID) are not equal
+    rec_a = (db.Record(id=101)
+             .add_parent("B")
+             .add_property(name="a", value=SyncNode(db.Record()))
+             .add_property(name="a", value=SyncNode(db.Record())))
+
+    with pytest.raises(ImpossibleMergeError):
+        exp = SyncNode(rec_a).export_entity()
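+    # equality of SyncNode-valued properties is decided by object identity or by matching ids;
+    # nodes without an id or with differing ids cannot be proven equal, so exporting fails: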
+    # different SyncNode objects with differing IDs are not equal
+    rec_a = (db.Record(id=101)
+             .add_parent("B")
+             .add_property(name="a", value=SyncNode(db.Record(id=1)))
+             .add_property(name="a", value=SyncNode(db.Record(id=2))))
+
+    with pytest.raises(ImpossibleMergeError):
+        exp = SyncNode(rec_a).export_entity()
+
+    # SyncNodes with the same IDs are considered equal (list)
+    rec_a = (db.Record(id=101)
+             .add_parent("B")
+             .add_property(name="a", value=[SyncNode(db.Record(id=1)), SyncNode(db.Record(id=2))])
+             .add_property(name="a", value=[SyncNode(db.Record(id=1)), SyncNode(db.Record(id=2))]))
+
+    exp = SyncNode(rec_a).export_entity()
+    assert exp.get_property('a').value[0].id == 1
+    assert len([p for p in exp.properties if p.name == "a"]) == 1
+
+    # SyncNodes with the same IDs are not equal when in a different order (list)
+    rec_a = (db.Record(id=101)
+             .add_parent("B")
+             .add_property(name="a", value=[SyncNode(db.Record(id=1)), SyncNode(db.Record(id=2))])
+             .add_property(name="a", value=[SyncNode(db.Record(id=2)), SyncNode(db.Record(id=1))]))
+
+    with pytest.raises(ImpossibleMergeError):
+        exp = SyncNode(rec_a).export_entity()
+
+    # the same SyncNode object is obviously equal (list)
+    sn = SyncNode(db.Record(id=1))
+    rec_a = (db.Record(id=101)
+             .add_parent("B")
+             .add_property(name="a", value=[sn])
+             .add_property(name="a", value=[sn]))
+
+    exp = SyncNode(rec_a).export_entity()
+    assert exp.get_property('a').value[0].id == 1
+
+    # different SyncNode objects are not equal (list)
+    rec_a = (db.Record(id=101)
+             .add_parent("B")
+             .add_property(name="a", value=[SyncNode(db.Record())])
+             .add_property(name="a", value=[SyncNode(db.Record())]))
+
+    with pytest.raises(ImpossibleMergeError):
+        exp = SyncNode(rec_a).export_entity()
+
+    # different SyncNode objects with differing IDs are not equal (list)
+    rec_a = (db.Record(id=101)
+             .add_parent("B")
+             .add_property(name="a", value=[SyncNode(db.Record(id=1))])
+             .add_property(name="a", value=[SyncNode(db.Record(id=2))]))
+
+    with pytest.raises(ImpossibleMergeError):
+        exp = SyncNode(rec_a).export_entity()
+
+    # a list value does not match a scalar value
+    rec_a = (db.Record(id=101)
+             .add_parent("B")
+             .add_property(name="a", value=SyncNode(db.Record(id=1)))
+             .add_property(name="a", value=[SyncNode(db.Record(id=1))]))
+
+    with pytest.raises(ImpossibleMergeError):
+        exp = SyncNode(rec_a).export_entity()
+
+    # lists of different lengths do not match
+    rec_a = (db.Record(id=101)
+             .add_parent("B")
+             .add_property(name="a", value=[SyncNode(db.Record(id=1))])
+             .add_property(name="a", value=[SyncNode(db.Record(id=1)), SyncNode(db.Record(id=1))]))
+
+    with pytest.raises(ImpossibleMergeError):
+        exp = SyncNode(rec_a).export_entity()