From 6d799f95e589ec95cc478f69c18e58036a21d46a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Henrik=20tom=20W=C3=B6rden?= <h.tomwoerden@indiscale.com> Date: Tue, 22 Nov 2022 13:26:56 +0100 Subject: [PATCH] WIP: lots of changes Among others: - pass referencing_entities to other functions - remove old class - move query test to the right file - prevent identifiableAdapter from being None - add backrefs to checks - type hints - add backrefs to representation --- identifiable_dm.yml | 10 -- src/caoscrawler/crawl.py | 90 +++++++-------- src/caoscrawler/identifiable.py | 10 +- src/caoscrawler/identifiable_adapters.py | 48 ++++---- src/doc/concepts.rst | 34 ++++-- synchronize.md | 20 ---- unittests/test_file_identifiables.py | 2 +- unittests/test_identifiable_adapters.py | 12 ++ unittests/test_tool.py | 139 ++++++++++++++++++----- 9 files changed, 221 insertions(+), 144 deletions(-) delete mode 100644 identifiable_dm.yml diff --git a/identifiable_dm.yml b/identifiable_dm.yml deleted file mode 100644 index 8eda82ce..00000000 --- a/identifiable_dm.yml +++ /dev/null @@ -1,10 +0,0 @@ -Identifiable: - obligatory_properties: - identifying: - datatype: REFERENCE - description: The RecordType that is governed by this registered identifiable. - suggested_properties: - is_referenced_by: - datatype: REFERENCE - description: If a reference pointing to the identifiable is part of the registered - identifiable. diff --git a/src/caoscrawler/crawl.py b/src/caoscrawler/crawl.py index ec12e896..0d8cfab3 100644 --- a/src/caoscrawler/crawl.py +++ b/src/caoscrawler/crawl.py @@ -57,7 +57,6 @@ from typing import Any, Optional, Type, Union from caosdb.apiutils import compare_entities, merge_entities from copy import deepcopy from jsonschema import validate -import importlib from .macros import defmacro_constructor, macro_constructor @@ -209,9 +208,9 @@ class Crawler(object): if generalStore is None: self.generalStore = GeneralStore() - self.identifiableAdapter = identifiableAdapter - if identifiableAdapter is None: - self.identifiableAdapter = LocalStorageIdentifiableAdapter() + self.identifiableAdapter: IdentifiableAdapter = LocalStorageIdentifiableAdapter() + if identifiableAdapter is not None: + self.identifiableAdapter = identifiableAdapter # If a directory is crawled this may hold the path to that directory self.crawled_directory: Optional[str] = None self.debug = debug @@ -532,7 +531,7 @@ class Crawler(object): """ if ident is None: raise ValueError("Identifiable has to be given as argument") - for pname, pvalue in ident.properties.items(): + for pvalue in list(ident.properties.values())+ident.backrefs: if isinstance(pvalue, list): for el in pvalue: if isinstance(el, db.Entity) and el.id is None: @@ -564,7 +563,7 @@ class Crawler(object): flat.append(p.value) Crawler.create_flat_list([p.value], flat) - def _has_missing_object_in_references(self, ident: Identifiable): + def _has_missing_object_in_references(self, ident: Identifiable, referencing_entities: list): """ returns False if any value in the properties attribute is a db.Entity object that is contained in the `remote_missing_cache`. If ident has such an object in @@ -573,21 +572,20 @@ class Crawler(object): """ if ident is None: raise ValueError("Identifiable has to be given as argument") - for pname, pvalue in ident.properties.items(): - # if (is_reference(p) + for pvalue in list(ident.properties.values())+ident.backrefs: # Entity instead of ID and not cached locally if (isinstance(pvalue, list)): for el in pvalue: if (isinstance(el, db.Entity) and self.get_from_remote_missing_cache( - self.identifiableAdapter.get_identifiable(el)) is not None): + self.identifiableAdapter.get_identifiable(el, referencing_entities)) is not None): return True if (isinstance(pvalue, db.Entity) and self.get_from_remote_missing_cache( - self.identifiableAdapter.get_identifiable(pvalue)) is not None): + self.identifiableAdapter.get_identifiable(pvalue, referencing_entities)) is not None): # might be checked when reference is resolved return True return False - def replace_references_with_cached(self, record: db.Record): + def replace_references_with_cached(self, record: db.Record, referencing_entities: list): """ Replace all references with the versions stored in the cache. @@ -599,7 +597,7 @@ class Crawler(object): for el in p.value: if (isinstance(el, db.Entity) and el.id is None): cached = self.get_from_any_cache( - self.identifiableAdapter.get_identifiable(el)) + self.identifiableAdapter.get_identifiable(el, referencing_entities)) if cached is None: raise RuntimeError("Not in cache.") if not check_identical(cached, el, True): @@ -614,7 +612,7 @@ class Crawler(object): p.value = lst if (isinstance(p.value, db.Entity) and p.value.id is None): cached = self.get_from_any_cache( - self.identifiableAdapter.get_identifiable(p.value)) + self.identifiableAdapter.get_identifiable(p.value, referencing_entities)) if cached is None: raise RuntimeError("Not in cache.") if not check_identical(cached, p.value, True): @@ -655,31 +653,31 @@ class Crawler(object): else: return None - def add_to_remote_missing_cache(self, record: db.Record): + def add_to_remote_missing_cache(self, record: db.Record, identifiable: Identifiable): """ stores the given Record in the remote_missing_cache. - If no identifiable can be created for the given Record, the Record is NOT stored. + If identifiable is None, the Record is NOT stored. """ - self.add_to_cache(record=record, cache=self.remote_missing_cache) + self.add_to_cache(record=record, cache=self.remote_missing_cache, + identifiable=identifiable) - def add_to_remote_existing_cache(self, record: db.Record): + def add_to_remote_existing_cache(self, record: db.Record, identifiable: Identifiable): """ stores the given Record in the remote_existing_cache. - If no identifiable can be created for the given Record, the Record is NOT stored. + If identifiable is None, the Record is NOT stored. """ - self.add_to_cache(record=record, cache=self.remote_existing_cache) + self.add_to_cache(record=record, cache=self.remote_existing_cache, + identifiable=identifiable) - def add_to_cache(self, record: db.Record, cache) -> Union[Identifiable, None]: + def add_to_cache(self, record: db.Record, cache: IdentifiedCache, + identifiable: Identifiable) -> None: """ stores the given Record in the given cache. - If no identifiable can be created for the given Record, the Record is NOT stored. + If identifiable is None, the Record is NOT stored. """ - if self.identifiableAdapter is None: - raise RuntimeError("Should not happen.") - identifiable = self.identifiableAdapter.get_identifiable(record) if identifiable is not None: cache.add(identifiable=identifiable, record=record) @@ -703,7 +701,7 @@ class Crawler(object): def create_reference_mapping(flat: list[db.Entity]): """ Create a dictionary of dictionaries of the form: - dict[int, dict[str, db.Entity]] + dict[int, dict[str, list[db.Entity]]] - The integer index is the Python id of the value object. - The string is the name of the first parent of the referencing object. @@ -713,7 +711,8 @@ class Crawler(object): So the returned mapping maps ids of entities to the objects which are referring to them. """ - references = {} + # TODO we need to treat children of RecordTypes somehow. + references: dict[int, dict[str, list[db.Entity]]] = {} for ent in flat: for p in ent.properties: val = p.value @@ -723,12 +722,13 @@ class Crawler(object): if isinstance(v, db.Entity): if id(v) not in references: references[id(v)] = {} - references[id(v)][ent.parents[0].name] = ent + if ent.parents[0].name not in references[id(v)]: + references[id(v)][ent.parents[0].name] = [] + references[id(v)][ent.parents[0].name].append(ent) + return references def split_into_inserts_and_updates(self, ent_list: list[db.Entity]): - if self.identifiableAdapter is None: - raise RuntimeError("Should not happen.") to_be_inserted: list[db.Entity] = [] to_be_updated: list[db.Entity] = [] flat = list(ent_list) @@ -740,12 +740,12 @@ class Crawler(object): if ent.role == "Record" and len(ent.parents) == 0: raise RuntimeError("Records must have a parent.") - referencing_entities = self.create_reference_mapping(flat) - resolved_references = True # flat contains Entities which could not yet be checked against the remote server while resolved_references and len(flat) > 0: resolved_references = False + referencing_entities = self.create_reference_mapping( + flat + to_be_updated + to_be_inserted) # For each element we try to find out whether we can find it in the server or whether # it does not yet exist. Since a Record may reference other unkown Records it might not @@ -758,7 +758,9 @@ class Crawler(object): # 5. Does it have to be new since a needed reference is missing? for i in reversed(range(len(flat))): record = flat[i] - identifiable = self.identifiableAdapter.get_identifiable(record) + identifiable = self.identifiableAdapter.get_identifiable( + record, + referencing_entities=referencing_entities) # TODO remove if the exception is never raised if record in to_be_inserted: @@ -767,14 +769,14 @@ class Crawler(object): # 1. Can it be identified via an ID? elif record.id is not None: to_be_updated.append(record) - self.add_to_remote_existing_cache(record) + self.add_to_remote_existing_cache(record, identifiable) del flat[i] # 2. Can it be identified via a path? elif record.path is not None: existing = self._get_entity_by_path(record.path) if existing is None: to_be_inserted.append(record) - self.add_to_remote_missing_cache(record) + self.add_to_remote_missing_cache(record, identifiable) del flat[i] else: record.id = existing.id @@ -783,7 +785,7 @@ class Crawler(object): record._size = existing._size record._checksum = existing._checksum to_be_updated.append(record) - self.add_to_remote_existing_cache(record) + self.add_to_remote_existing_cache(record, identifiable) del flat[i] # 3. Is it in the cache of already checked Records? elif self.get_from_any_cache(identifiable) is not None: @@ -804,33 +806,33 @@ class Crawler(object): if identified_record is None: # identifiable does not exist remotely -> record needs to be inserted to_be_inserted.append(record) - self.add_to_remote_missing_cache(record) + self.add_to_remote_missing_cache(record, identifiable) del flat[i] else: # side effect record.id = identified_record.id to_be_updated.append(record) - self.add_to_remote_existing_cache(record) + self.add_to_remote_existing_cache(record, identifiable) del flat[i] resolved_references = True # 5. Does it have to be new since a needed reference is missing? # (Is it impossible to check this record because an identifiable references a # missing record?) - elif self._has_missing_object_in_references(identifiable): + elif self._has_missing_object_in_references(identifiable, referencing_entities): to_be_inserted.append(record) - self.add_to_remote_missing_cache(record) + self.add_to_remote_missing_cache(record, identifiable) del flat[i] resolved_references = True for record in flat: - self.replace_references_with_cached(record) + self.replace_references_with_cached(record, referencing_entities) if len(flat) > 0: raise RuntimeError( "Could not resolve all Entity references. Circular Dependency?") - return to_be_inserted, to_be_updated, referencing_entities + return to_be_inserted, to_be_updated def replace_entities_with_ids(self, rec: db.Record): for el in rec.properties: @@ -1024,12 +1026,10 @@ class Crawler(object): Return the final to_be_inserted and to_be_updated as tuple. """ - if self.identifiableAdapter is None: - raise RuntimeError("Should not happen.") - - to_be_inserted, to_be_updated, referencing_entities = self.split_into_inserts_and_updates( + to_be_inserted, to_be_updated = self.split_into_inserts_and_updates( crawled_data ) + referencing_entities = self.create_reference_mapping(to_be_updated + to_be_inserted) # TODO: refactoring of typo for el in to_be_updated: diff --git a/src/caoscrawler/identifiable.py b/src/caoscrawler/identifiable.py index 5b2b2152..85366531 100644 --- a/src/caoscrawler/identifiable.py +++ b/src/caoscrawler/identifiable.py @@ -49,8 +49,9 @@ class Identifiable(): def __init__(self, record_id: int = None, path: str = None, record_type: str = None, name: str = None, properties: dict = None, backrefs: list[Union[int, str]] = None): - if (record_id is None and path is None and name is None and backrefs is None and ( - properties is None or len(properties) == 0)): + if (record_id is None and path is None and name is None + and (backrefs is None or len(backrefs) == 0) + and (properties is None or len(properties) == 0)): raise ValueError("There is no identifying information. You need to add a path or " "properties or other identifying attributes.") if properties is not None and 'name' in [k.lower() for k in properties.keys()]: @@ -63,7 +64,7 @@ class Identifiable(): self.properties: dict = {} if properties is not None: self.properties = properties - self.backrefs: list = [] + self.backrefs: list[Union[int, db.Entity]] = [] if backrefs is not None: self.backrefs = backrefs @@ -102,7 +103,8 @@ class Identifiable(): creates a string from the attributes of an identifiable that can be hashed String has the form "P<parent>N<name>a:5b:10" """ - rec_string = "P<{}>N<{}>".format(identifiable.record_type, identifiable.name) + rec_string = "P<{}>N<{}>R<{}>".format(identifiable.record_type, identifiable.name, + identifiable.backrefs) # TODO this structure neglects Properties if multiple exist for the same name for pname in sorted(identifiable.properties.keys()): rec_string += ("{}:".format(pname) + diff --git a/src/caoscrawler/identifiable_adapters.py b/src/caoscrawler/identifiable_adapters.py index 0cfaaa57..27332e76 100644 --- a/src/caoscrawler/identifiable_adapters.py +++ b/src/caoscrawler/identifiable_adapters.py @@ -23,6 +23,7 @@ # ** end header # +from __future__ import annotations import yaml from datetime import datetime @@ -34,19 +35,6 @@ from .utils import has_parent logger = logging.getLogger(__name__) -class Identifiable(): - def __init__(self, record_type: str, name: str = None, properties: dict = None, - backrefs: list = None): - self.record_type = record_type - self.name = name - self.properties: dict = {} - if properties is not None: - self.properties = properties - self.backrefs: list = [] - if backrefs is not None: - self.backrefs = backrefs - - def convert_value(value): """ Returns a string representation of the value that is suitable to be used in the query @@ -113,7 +101,7 @@ class IdentifiableAdapter(metaclass=ABCMeta): if ident.record_type is not None: query_string += ident.record_type for ref in ident.backrefs: - query_string += (" WHICH IS REFERENCED BY " + ref + " AND") + query_string += (" WHICH IS REFERENCED BY " + str(ref) + " AND") query_string += " WITH " @@ -164,13 +152,6 @@ class IdentifiableAdapter(metaclass=ABCMeta): def resolve_reference(self, record: db.Record): pass - # ------------------------- - # TODO: this uses an active database connection which is discouraged here! - @staticmethod - def get_recordtype_name(rt_id): - return db.RecordType(rt_id).retrieve().name - # ------------------------- - @abstractmethod def get_file(self, identifiable: db.File): """ @@ -188,6 +169,7 @@ class IdentifiableAdapter(metaclass=ABCMeta): if referencing_entities is None: referencing_entities = {} + property_name_list_A = [] property_name_list_B = [] identifiable_props = {} @@ -203,18 +185,26 @@ class IdentifiableAdapter(metaclass=ABCMeta): # case A: in the registered identifiable # case B: in the identifiable - # TODO: refactoring for backref - # change this to checkt the custom data structure from above instead of the record - if prop.name.lower() == "is_referenced_by": - rtname = self.get_recordtype_name(prop.value) - if (id(record) in referencing_entities - and rtname in referencing_entities[id(record)]): - identifiable_backrefs.append(referencing_entities[rtname]) + # TODO: similar to the Identifiable class, Registred Identifiable should be a + # separate class too + if prop.name.lower() == "is_referenced_by": + for rtname in prop.value: + if (id(record) in referencing_entities + and rtname in referencing_entities[id(record)]): + identifiable_backrefs.extend(referencing_entities[id(record)][rtname]) + else: + # TODO: is this the appropriate error? + raise NotImplementedError( + f"The following record is missing an identifying property:" + f"RECORD\n{record}\nIdentifying PROPERTY\n{prop.name}" + ) continue + record_prop = record.get_property(prop.name) if record_prop is None: # TODO: how to handle missing values in identifiables # raise an exception? + # TODO: is this the appropriate error? raise NotImplementedError( f"The following record is missing an identifying property:" f"RECORD\n{record}\nIdentifying PROPERTY\n{prop.name}" @@ -239,7 +229,7 @@ class IdentifiableAdapter(metaclass=ABCMeta): if registered_identifiable else None), name=record.name, properties=identifiable_props, - path=record.path + path=record.path, backrefs=identifiable_backrefs ) diff --git a/src/doc/concepts.rst b/src/doc/concepts.rst index 012f393b..89757f21 100644 --- a/src/doc/concepts.rst +++ b/src/doc/concepts.rst @@ -38,36 +38,52 @@ Identifiables An Identifiable of a Record is like the fingerprint of a Record. The identifiable contains the information that is used by the CaosDB Crawler to identify Records. -In order to check whether a Record exits in the CaosDB Server, the CaosDB Crawler creates a query +For example, in order to check whether a Record exits in the CaosDB Server, the CaosDB Crawler creates a query using the information contained in the Identifiable. -For example, suppose a certain experiment is at most done once per day, then the identifiable could -consist of the RecordType "SomeExperiment" (as a parent) and the Property "date". +Suppose a certain experiment is at most done once per day, then the identifiable could +consist of the RecordType "SomeExperiment" (as a parent) and the Property "date" with the respective value. You can think of the properties that are used by the identifiable as a dictionary. For each property name there can be one value. However, this value can be a list such that the created query can look like "FIND RECORD ParamenterSet WITH a=5 AND a=6". This is meaningful if there is a ParamenterSet with two Properties with the name 'a' (multi property) or if 'a' is a list containing at least the values 5 and 6. -The path of a File object can serve as a Property that identifies files and similarly the name of +When we use a reference Property in the identifiable, we effectively use the reference from the object to +be identified pointing to some other object as an identifying attribute. We can also use references that point +in the other direction, i.e. towards the object to be identified. An identifiable may denote one or more +Entities that are referencing the object to be identified. + +The path of a File object can serve as a Property that identifies files and similarly the name of Records can be used. -In the current implementation an identifiable can only use one RecordType even though the identified Records might have multiple -Parents. +In the current implementation an identifiable can only use one RecordType even though the identified Records might have multiple Parents. Relevant sources in - ``src/identifiable_adapters.py`` - ``src/identifiable.py`` -RegisteredIdentifiables -+++++++++++++++++++++++ -A Registered Identifiable is the blue print for Identifiables. +Registered Identifiables +++++++++++++++++++++++++ +A Registered Identifiable is the blue print for Identifiables. You can think of registered identifiables as identifiables without concrete values for properties. RegisteredIdentifiables are associated with RecordTypes and define of what information an identifiable for that RecordType exists. There can be multiple Registered Identifiables for one RecordType. +If identifiables shall contain references to the object to be identified, the Registered +Identifiable must list the RecordTypes of the Entities that have those references. +For example, the Registered Identifiable for the "Experiment" RecordType may contain +the "date" Property and "Project" as the RecordType of an Entity that is referencing +the object to be identified. Then if we have a structure of some Records at hand, +we can check whether a Record with the parent "Project" is referencing the "Experiment" +Record. If that is the case, this reference is part of the identifiable for the "Experiment" +Record. Note, that if there are multiple Records with the appropriate parent (e.g. +multiple "Project" Records in the above example) it will be required that all of them +reference the object to be identified. + + Identified Records ++++++++++++++++++ TODO diff --git a/synchronize.md b/synchronize.md index f89b6350..b178e647 100644 --- a/synchronize.md +++ b/synchronize.md @@ -31,23 +31,3 @@ Maybe keep another dict that tracks what Record objects are in the to_be_updated After treating leaf Records, Records that could not be checked before can be checked: Either referenced Records now have an ID or they are in the to_be_inserted dict such that it is clear that the identifiable at hand does not exist in the server. This way, the whole structure can be resolved except if there are circular dependencies: Those can be added fully to the to_be_inserted dict. (???) - - - - -## Back References - -What are the implications if an identifiable shall allow an attribute like "is referenced by XYZ"? -1. During creation of the Record structure, it is unlikely, that something like -`db.Record(id=xyz) -> A ` is created, where A is the identifiable. More likely is that another identifiable -references A. This must be checked in order to check A (and "not existing" means A is not existing). -2. Normal Properties are not sufficient. One possibility is to use an auxiliary Property (e.g. -"is_referenced_by") -3. Before, it was checked whether all entities that are referenced are checked against the server. -Now, it also needs to be checked whether there are unchecked referencing entities. But only if the -identifiable has a "is_referenced_by" Property. -4. There might be multiple Entities referencing A. What is the value of "is_referenced_by" in the identifiable? -One possibility is to have "is_referenced_by=RT1" in the registred identifiable and -"is_referenced_by=<idxyz>" in the identifiable. Still there might be multiple candidates, how ever this -might be ignored (NotImplementedError) for now. - diff --git a/unittests/test_file_identifiables.py b/unittests/test_file_identifiables.py index c7821f39..aff174d0 100644 --- a/unittests/test_file_identifiables.py +++ b/unittests/test_file_identifiables.py @@ -16,7 +16,7 @@ def test_file_identifiable(): # Without a path there is no identifying information with raises(ValueError): - ident.get_identifiable(db.File()) + ident.get_identifiable(db.File(), []) fp = "/test/bla/bla.txt" file_obj = db.File(path=fp) diff --git a/unittests/test_identifiable_adapters.py b/unittests/test_identifiable_adapters.py index 743f64c1..6817b9e6 100644 --- a/unittests/test_identifiable_adapters.py +++ b/unittests/test_identifiable_adapters.py @@ -59,6 +59,18 @@ def test_create_query_for_identifiable(): Identifiable(name="TestRecord", record_type="TestType")) assert query.lower() == "find record testtype with name='testrecord'" + # With referencing entity (backref) + query = IdentifiableAdapter.create_query_for_identifiable( + Identifiable(record_type="Person", backrefs=[14433], properties={'last_name': "B"})) + assert query.lower() == ("find record person which is referenced by 14433 and with " + "'last_name'='b' ") + + # With two referencing entities (backref) + query = IdentifiableAdapter.create_query_for_identifiable( + Identifiable(record_type="Person", backrefs=[14433, 333], properties={'last_name': "B"})) + assert query.lower() == ("find record person which is referenced by 14433 and which is " + "referenced by 333 and with 'last_name'='b' ") + def test_load_from_yaml_file(): ident = CaosDBIdentifiableAdapter() diff --git a/unittests/test_tool.py b/unittests/test_tool.py index 2f68126e..5c78eaf4 100755 --- a/unittests/test_tool.py +++ b/unittests/test_tool.py @@ -259,20 +259,6 @@ def test_synchronization(crawler, ident): assert len(updl) == 0 -def test_identifiable_adapter(): - query = IdentifiableAdapter.create_query_for_identifiable( - Identifiable(record_type="Person", - properties={"first_name": "A", - "last_name": "B"})) - assert query.lower() == "find record person with 'first_name'='a' and 'last_name'='b' " - query = IdentifiableAdapter.create_query_for_identifiable( - db.Record().add_parent("Person") - .add_property("is_referenced_by", value="14433") - .add_property("last_name", value="B")) - assert query.lower() == ("find record person which is referenced by 14433 and with " - "'last_name'='b' ") - - def test_remove_unnecessary_updates(): # test trvial case upl = [db.Record().add_parent("A")] @@ -383,7 +369,7 @@ def test_split_into_inserts_and_updates_single(crawler_mocked_identifiable_retri assert crawler.identifiableAdapter.retrieve_identified_record_for_record( identlist[1]) is None - insert, update, _ = crawler.split_into_inserts_and_updates(deepcopy(entlist)) + insert, update = crawler.split_into_inserts_and_updates(deepcopy(entlist)) assert len(insert) == 1 assert insert[0].name == "B" assert len(update) == 1 @@ -401,7 +387,7 @@ def test_split_into_inserts_and_updates_with_duplicate(crawler_mocked_identifiab # This is identical to a and should be removed c = db.Record(name="A").add_parent("C") entlist = [a, b, c] - insert, update, _ = crawler.split_into_inserts_and_updates(deepcopy(entlist)) + insert, update = crawler.split_into_inserts_and_updates(deepcopy(entlist)) assert len(insert) == 1 assert insert[0].name == "B" assert len(update) == 1 @@ -418,7 +404,7 @@ def test_split_into_inserts_and_updates_with_ref(crawler_mocked_identifiable_ret b = db.Record(name="B").add_parent("C") b.add_property("A", a) entlist = [a, b] - insert, update, _ = crawler.split_into_inserts_and_updates(entlist) + insert, update = crawler.split_into_inserts_and_updates(entlist) assert len(insert) == 1 assert insert[0].name == "B" assert len(update) == 1 @@ -453,7 +439,7 @@ def test_split_into_inserts_and_updates_with_complex(crawler_mocked_identifiable b.add_property("A", f) b.add_property("A", a) entlist = [a, b, g] - insert, update, _ = crawler.split_into_inserts_and_updates(entlist) + insert, update = crawler.split_into_inserts_and_updates(entlist) assert len(insert) == 3 assert "B" in [el.name for el in insert] assert len(update) == 1 @@ -473,7 +459,7 @@ def test_split_into_inserts_and_updates_with_copy_attr(crawler_mocked_identifiab b = db.Record(name="A").add_parent("C") b.add_property("bar", 2) entlist = [a, b] - insert, update, _ = crawler.split_into_inserts_and_updates(entlist) + insert, update = crawler.split_into_inserts_and_updates(entlist) assert update[0].get_property("bar").value == 2 assert update[0].get_property("foo").value == 1 @@ -494,29 +480,30 @@ def test_has_missing_object_in_references(crawler): # one reference with id -> check assert not crawler._has_missing_object_in_references( - Identifiable(name="C", record_type="RTC", properties={'d': 123})) + Identifiable(name="C", record_type="RTC", properties={'d': 123}), []) # one ref with Entity with id -> check assert not crawler._has_missing_object_in_references( Identifiable(name="C", record_type="RTC", properties={'d': db.Record(id=123) - .add_parent("C")})) + .add_parent("C")}), []) # one ref with id one with Entity with id (mixed) -> check assert not crawler._has_missing_object_in_references( Identifiable(name="C", record_type="RTD", - properties={'d': 123, 'b': db.Record(id=123).add_parent("RTC")})) + properties={'d': 123, 'b': db.Record(id=123).add_parent("RTC")}), []) # entity to be referenced in the following a = db.Record(name="C").add_parent("C").add_property("d", 12311) # one ref with id one with Entity without id (but not identifying) -> fail assert not crawler._has_missing_object_in_references( - Identifiable(name="C", record_type="RTC", properties={'d': 123, 'e': a})) + Identifiable(name="C", record_type="RTC", properties={'d': 123, 'e': a}), []) # one ref with id one with Entity without id (mixed) -> fail assert not crawler._has_missing_object_in_references( - Identifiable(name="D", record_type="RTD", properties={'d': 123, 'e': a})) + Identifiable(name="D", record_type="RTD", properties={'d': 123, 'e': a}), []) - crawler.add_to_remote_missing_cache(a) + crawler.add_to_remote_missing_cache(a, Identifiable(name="C", record_type="RTC", + properties={'d': 12311})) # one ref with id one with Entity without id but in cache -> check assert crawler._has_missing_object_in_references( - Identifiable(name="D", record_type="RTD", properties={'d': 123, 'e': a})) + Identifiable(name="D", record_type="RTD", properties={'d': 123, 'e': a}), []) # if this ever fails, the mock up may be removed crawler.identifiableAdapter.get_registered_identifiable.assert_called() @@ -725,3 +712,103 @@ def test_create_flat_list(): a = db.Record() a.add_property(name="a", value=a) Crawler.create_flat_list([a], []) + + +@pytest.fixture +def crawler_mocked_for_backref_test(crawler): + # mock retrieval of registered identifiabls: return Record with just a parent + def get_reg_ident(x): + if x.parents[0].name == "C": + return db.Record().add_parent(x.parents[0].name).add_property( + "is_referenced_by", value=["BR"]) + elif x.parents[0].name == "D": + return db.Record().add_parent(x.parents[0].name).add_property( + "is_referenced_by", value=["BR", "BR2"]) + else: + return db.Record().add_parent(x.parents[0].name) + crawler.identifiableAdapter.get_registered_identifiable = Mock(side_effect=get_reg_ident) + + # Simulate remote server content by using the names to identify records + # There is only a single known Record with name A + crawler.identifiableAdapter.retrieve_identified_record_for_record = Mock(side_effect=partial( + basic_retrieve_by_name_mock_up, known={"A": + db.Record(id=1111, name="A").add_parent("BR")})) + crawler.identifiableAdapter.retrieve_identified_record_for_identifiable = Mock( + side_effect=partial( + basic_retrieve_by_name_mock_up, known={"A": + db.Record(id=1111, name="A").add_parent("BR")})) + return crawler + + +def test_split_into_inserts_and_updates_backref(crawler_mocked_for_backref_test): + crawler = crawler_mocked_for_backref_test + identlist = [Identifiable(name="A", record_type="BR"), + Identifiable(name="B", record_type="C", backrefs=[db.Entity()])] + referenced = db.Record(name="B").add_parent("C") + entlist = [referenced, db.Record(name="A").add_parent("BR").add_property("ref", referenced), ] + + # Test without referencing object + # currently a NotImplementedError is raised if necessary properties are missing. + with raises(NotImplementedError): + crawler.split_into_inserts_and_updates([db.Record(name="B").add_parent("C")]) + + # identifiables were not yet checked + assert crawler.get_from_any_cache(identlist[0]) is None + assert crawler.get_from_any_cache(identlist[1]) is None + # one with reference, one without + assert not crawler._has_reference_value_without_id(identlist[0]) + assert crawler._has_reference_value_without_id(identlist[1]) + # one can be found remotely, one not + assert crawler.identifiableAdapter.retrieve_identified_record_for_record( + identlist[0]).id == 1111 + assert crawler.identifiableAdapter.retrieve_identified_record_for_record( + identlist[1]) is None + + # check the split... + insert, update = crawler.split_into_inserts_and_updates(deepcopy(entlist)) + # A was found remotely and is therefore in the update list + assert len(update) == 1 + assert update[0].name == "A" + # B does not exist on the (simulated) remote server + assert len(insert) == 1 + assert insert[0].name == "B" + + +def test_split_into_inserts_and_updates_mult_backref(crawler_mocked_for_backref_test): + # test whether multiple references of the same record type are correctly used + crawler = crawler_mocked_for_backref_test + referenced = db.Record(name="B").add_parent("C") + entlist = [referenced, + db.Record(name="A").add_parent("BR").add_property("ref", referenced), + db.Record(name="C").add_parent("BR").add_property("ref", referenced), + ] + + # test whether both entities are listed in the backref attribute of the identifiable + referencing_entities = crawler.create_reference_mapping(entlist) + identifiable = crawler.identifiableAdapter.get_identifiable(referenced, referencing_entities) + assert len(identifiable.backrefs) == 2 + + # check the split... + insert, update = crawler.split_into_inserts_and_updates(deepcopy(entlist)) + assert len(update) == 1 + assert len(insert) == 2 + + +def test_split_into_inserts_and_updates_diff_backref(crawler_mocked_for_backref_test): + # test whether multiple references of the different record types are correctly used + crawler = crawler_mocked_for_backref_test + referenced = db.Record(name="B").add_parent("D") + entlist = [referenced, + db.Record(name="A").add_parent("BR").add_property("ref", referenced), + db.Record(name="A").add_parent("BR2").add_property("ref", referenced), + ] + + # test whether both entities are listed in the backref attribute of the identifiable + referencing_entities = crawler.create_reference_mapping(entlist) + identifiable = crawler.identifiableAdapter.get_identifiable(referenced, referencing_entities) + assert len(identifiable.backrefs) == 2 + + # check the split... + insert, update = crawler.split_into_inserts_and_updates(deepcopy(entlist)) + assert len(update) == 2 + assert len(insert) == 1 -- GitLab