diff --git a/src/caoscrawler/crawl.py b/src/caoscrawler/crawl.py index e6b5f65cde7bd9159ab104f763597c3ded1c3903..cccbbdb040c556da8f904f27dd9d03aafb6d4872 100644 --- a/src/caoscrawler/crawl.py +++ b/src/caoscrawler/crawl.py @@ -213,9 +213,9 @@ class Crawler(object): if generalStore is None: self.generalStore = GeneralStore() - self.identifiableAdapter = identifiableAdapter - if identifiableAdapter is None: - self.identifiableAdapter = LocalStorageIdentifiableAdapter() + self.identifiableAdapter: IdentifiableAdapter = LocalStorageIdentifiableAdapter() + if identifiableAdapter is not None: + self.identifiableAdapter = identifiableAdapter # If a directory is crawled this may hold the path to that directory self.crawled_directory: Optional[str] = None self.debug = debug @@ -536,7 +536,7 @@ class Crawler(object): """ if ident is None: raise ValueError("Identifiable has to be given as argument") - for pname, pvalue in ident.properties.items(): + for pvalue in list(ident.properties.values()) + ident.backrefs: if isinstance(pvalue, list): for el in pvalue: if isinstance(el, db.Entity) and el.id is None: @@ -568,7 +568,7 @@ class Crawler(object): flat.append(p.value) Crawler.create_flat_list([p.value], flat) - def _has_missing_object_in_references(self, ident: Identifiable): + def _has_missing_object_in_references(self, ident: Identifiable, referencing_entities: list): """ returns False if any value in the properties attribute is a db.Entity object that is contained in the `remote_missing_cache`. If ident has such an object in @@ -577,21 +577,20 @@ class Crawler(object): """ if ident is None: raise ValueError("Identifiable has to be given as argument") - for pname, pvalue in ident.properties.items(): - # if (is_reference(p) + for pvalue in list(ident.properties.values()) + ident.backrefs: # Entity instead of ID and not cached locally if (isinstance(pvalue, list)): for el in pvalue: if (isinstance(el, db.Entity) and self.get_from_remote_missing_cache( - self.identifiableAdapter.get_identifiable(el)) is not None): + self.identifiableAdapter.get_identifiable(el, referencing_entities)) is not None): return True if (isinstance(pvalue, db.Entity) and self.get_from_remote_missing_cache( - self.identifiableAdapter.get_identifiable(pvalue)) is not None): + self.identifiableAdapter.get_identifiable(pvalue, referencing_entities)) is not None): # might be checked when reference is resolved return True return False - def replace_references_with_cached(self, record: db.Record): + def replace_references_with_cached(self, record: db.Record, referencing_entities: list): """ Replace all references with the versions stored in the cache. @@ -603,7 +602,7 @@ class Crawler(object): for el in p.value: if (isinstance(el, db.Entity) and el.id is None): cached = self.get_from_any_cache( - self.identifiableAdapter.get_identifiable(el)) + self.identifiableAdapter.get_identifiable(el, referencing_entities)) if cached is None: raise RuntimeError("Not in cache.") if not check_identical(cached, el, True): @@ -618,7 +617,7 @@ class Crawler(object): p.value = lst if (isinstance(p.value, db.Entity) and p.value.id is None): cached = self.get_from_any_cache( - self.identifiableAdapter.get_identifiable(p.value)) + self.identifiableAdapter.get_identifiable(p.value, referencing_entities)) if cached is None: raise RuntimeError("Not in cache.") if not check_identical(cached, p.value, True): @@ -659,31 +658,31 @@ class Crawler(object): else: return None - def add_to_remote_missing_cache(self, record: db.Record): + def add_to_remote_missing_cache(self, record: db.Record, identifiable: Identifiable): """ stores the given Record in the remote_missing_cache. - If no identifiable can be created for the given Record, the Record is NOT stored. + If identifiable is None, the Record is NOT stored. """ - self.add_to_cache(record=record, cache=self.remote_missing_cache) + self.add_to_cache(record=record, cache=self.remote_missing_cache, + identifiable=identifiable) - def add_to_remote_existing_cache(self, record: db.Record): + def add_to_remote_existing_cache(self, record: db.Record, identifiable: Identifiable): """ stores the given Record in the remote_existing_cache. - If no identifiable can be created for the given Record, the Record is NOT stored. + If identifiable is None, the Record is NOT stored. """ - self.add_to_cache(record=record, cache=self.remote_existing_cache) + self.add_to_cache(record=record, cache=self.remote_existing_cache, + identifiable=identifiable) - def add_to_cache(self, record: db.Record, cache) -> Union[Identifiable, None]: + def add_to_cache(self, record: db.Record, cache: IdentifiedCache, + identifiable: Identifiable) -> None: """ stores the given Record in the given cache. - If no identifiable can be created for the given Record, the Record is NOT stored. + If identifiable is None, the Record is NOT stored. """ - if self.identifiableAdapter is None: - raise RuntimeError("Should not happen.") - identifiable = self.identifiableAdapter.get_identifiable(record) if identifiable is not None: cache.add(identifiable=identifiable, record=record) @@ -703,9 +702,38 @@ class Crawler(object): if p.value is old: p.value = new + @staticmethod + def create_reference_mapping(flat: list[db.Entity]): + """ + Create a dictionary of dictionaries of the form: + dict[int, dict[str, list[db.Entity]]] + + - The integer index is the Python id of the value object. + - The string is the name of the first parent of the referencing object. + + Each value objects is taken from the values of all properties from the list flat. + + So the returned mapping maps ids of entities to the objects which are referring + to them. + """ + # TODO we need to treat children of RecordTypes somehow. + references: dict[int, dict[str, list[db.Entity]]] = {} + for ent in flat: + for p in ent.properties: + val = p.value + if not isinstance(val, list): + val = [val] + for v in val: + if isinstance(v, db.Entity): + if id(v) not in references: + references[id(v)] = {} + if ent.parents[0].name not in references[id(v)]: + references[id(v)][ent.parents[0].name] = [] + references[id(v)][ent.parents[0].name].append(ent) + + return references + def split_into_inserts_and_updates(self, ent_list: list[db.Entity]): - if self.identifiableAdapter is None: - raise RuntimeError("Should not happen.") to_be_inserted: list[db.Entity] = [] to_be_updated: list[db.Entity] = [] flat = list(ent_list) @@ -721,6 +749,8 @@ class Crawler(object): # flat contains Entities which could not yet be checked against the remote server while resolved_references and len(flat) > 0: resolved_references = False + referencing_entities = self.create_reference_mapping( + flat + to_be_updated + to_be_inserted) # For each element we try to find out whether we can find it in the server or whether # it does not yet exist. Since a Record may reference other unkown Records it might not @@ -733,7 +763,9 @@ class Crawler(object): # 5. Does it have to be new since a needed reference is missing? for i in reversed(range(len(flat))): record = flat[i] - identifiable = self.identifiableAdapter.get_identifiable(record) + identifiable = self.identifiableAdapter.get_identifiable( + record, + referencing_entities=referencing_entities) # TODO remove if the exception is never raised if record in to_be_inserted: @@ -742,14 +774,14 @@ class Crawler(object): # 1. Can it be identified via an ID? elif record.id is not None: to_be_updated.append(record) - self.add_to_remote_existing_cache(record) + self.add_to_remote_existing_cache(record, identifiable) del flat[i] # 2. Can it be identified via a path? elif record.path is not None: existing = self._get_entity_by_path(record.path) if existing is None: to_be_inserted.append(record) - self.add_to_remote_missing_cache(record) + self.add_to_remote_missing_cache(record, identifiable) del flat[i] else: record.id = existing.id @@ -758,7 +790,7 @@ class Crawler(object): record._size = existing._size record._checksum = existing._checksum to_be_updated.append(record) - self.add_to_remote_existing_cache(record) + self.add_to_remote_existing_cache(record, identifiable) del flat[i] # 3. Is it in the cache of already checked Records? elif self.get_from_any_cache(identifiable) is not None: @@ -782,27 +814,27 @@ class Crawler(object): if identified_record is None: # identifiable does not exist remotely -> record needs to be inserted to_be_inserted.append(record) - self.add_to_remote_missing_cache(record) + self.add_to_remote_missing_cache(record, identifiable) del flat[i] else: # side effect record.id = identified_record.id to_be_updated.append(record) - self.add_to_remote_existing_cache(record) + self.add_to_remote_existing_cache(record, identifiable) del flat[i] resolved_references = True # 5. Does it have to be new since a needed reference is missing? # (Is it impossible to check this record because an identifiable references a # missing record?) - elif self._has_missing_object_in_references(identifiable): + elif self._has_missing_object_in_references(identifiable, referencing_entities): to_be_inserted.append(record) - self.add_to_remote_missing_cache(record) + self.add_to_remote_missing_cache(record, identifiable) del flat[i] resolved_references = True for record in flat: - self.replace_references_with_cached(record) + self.replace_references_with_cached(record, referencing_entities) if len(flat) > 0: raise RuntimeError( @@ -1002,11 +1034,8 @@ class Crawler(object): Return the final to_be_inserted and to_be_updated as tuple. """ - if self.identifiableAdapter is None: - raise RuntimeError("Should not happen.") - - to_be_inserted, to_be_updated = self.split_into_inserts_and_updates( - crawled_data) + to_be_inserted, to_be_updated = self.split_into_inserts_and_updates(crawled_data) + referencing_entities = self.create_reference_mapping(to_be_updated + to_be_inserted) # TODO: refactoring of typo for el in to_be_updated: @@ -1014,7 +1043,8 @@ class Crawler(object): self.replace_entities_with_ids(el) identified_records = [ - self.identifiableAdapter.retrieve_identified_record_for_record(record) + self.identifiableAdapter.retrieve_identified_record_for_record(record, + referencing_entities) for record in to_be_updated] # Merge with existing data to prevent unwanted overwrites to_be_updated = self._merge_properties_from_remote(to_be_updated, diff --git a/src/caoscrawler/identifiable.py b/src/caoscrawler/identifiable.py index 0db566bf34e62d09553a4890ceb7bab4375079f2..7ff7172576be08e068ba412f319b059fb349bbeb 100644 --- a/src/caoscrawler/identifiable.py +++ b/src/caoscrawler/identifiable.py @@ -50,8 +50,9 @@ class Identifiable(): def __init__(self, record_id: int = None, path: str = None, record_type: str = None, name: str = None, properties: dict = None, backrefs: list[Union[int, str]] = None): - if (record_id is None and path is None and name is None and backrefs is None and ( - properties is None or len(properties) == 0)): + if (record_id is None and path is None and name is None + and (backrefs is None or len(backrefs) == 0) + and (properties is None or len(properties) == 0)): raise ValueError("There is no identifying information. You need to add a path or " "properties or other identifying attributes.") if properties is not None and 'name' in [k.lower() for k in properties.keys()]: @@ -64,7 +65,7 @@ class Identifiable(): self.properties: dict = {} if properties is not None: self.properties = properties - self.backrefs: list = [] + self.backrefs: list[Union[int, db.Entity]] = [] if backrefs is not None: self.backrefs = backrefs @@ -101,9 +102,12 @@ class Identifiable(): def _create_hashable_string(identifiable: Identifiable) -> str: """ creates a string from the attributes of an identifiable that can be hashed - String has the form "P<parent>N<name>a:5b:10" + String has the form "P<parent>N<name>R<reference-ids>a:5b:10" """ - rec_string = "P<{}>N<{}>".format(identifiable.record_type, identifiable.name) + rec_string = "P<{}>N<{}>R<{}>".format( + identifiable.record_type, + identifiable.name, + [Identifiable._value_representation(el) for el in identifiable.backrefs]) # TODO this structure neglects Properties if multiple exist for the same name for pname in sorted(identifiable.properties.keys()): rec_string += ("{}:".format(pname) + diff --git a/src/caoscrawler/identifiable_adapters.py b/src/caoscrawler/identifiable_adapters.py index 0abeab6252e031bf8493a1ad04dc960f70de4e4c..653951f404f2ccbd71f6ff998e6713a4f449dade 100644 --- a/src/caoscrawler/identifiable_adapters.py +++ b/src/caoscrawler/identifiable_adapters.py @@ -23,6 +23,7 @@ # ** end header # +from __future__ import annotations import yaml from datetime import datetime @@ -95,9 +96,15 @@ class IdentifiableAdapter(metaclass=ABCMeta): uses the properties of ident to create a query that can determine whether the required record already exists. """ + query_string = "FIND Record " if ident.record_type is not None: query_string += ident.record_type + for ref in ident.backrefs: + eid = ref + if isinstance(ref, db.Entity): + eid = ref.id + query_string += (" WHICH IS REFERENCED BY " + str(eid) + " AND") query_string += " WITH " @@ -155,17 +162,29 @@ class IdentifiableAdapter(metaclass=ABCMeta): """ pass - def get_identifiable(self, record: db.Record): + def get_identifiable(self, record: db.Record, referencing_entities=None): """ retrieve the registred identifiable and fill the property values to create an identifiable + + Args: + record: the record for which the Identifiable shall be created. + referencing_entities: a dictionary (Type: dict[int, dict[str, list[db.Entity]]]), that + allows to look up entities with a certain RecordType, that reference ``record`` + + Returns: + Identifiable, the identifiable for record. """ registered_identifiable = self.get_registered_identifiable(record) + if referencing_entities is None: + referencing_entities = {} + property_name_list_A = [] property_name_list_B = [] identifiable_props = {} + identifiable_backrefs = [] if registered_identifiable is not None: # fill the values: @@ -177,10 +196,26 @@ class IdentifiableAdapter(metaclass=ABCMeta): # case A: in the registered identifiable # case B: in the identifiable + # TODO: similar to the Identifiable class, Registred Identifiable should be a + # separate class too + if prop.name.lower() == "is_referenced_by": + for rtname in prop.value: + if (id(record) in referencing_entities + and rtname in referencing_entities[id(record)]): + identifiable_backrefs.extend(referencing_entities[id(record)][rtname]) + else: + # TODO: is this the appropriate error? + raise NotImplementedError( + f"The following record is missing an identifying property:" + f"RECORD\n{record}\nIdentifying PROPERTY\n{prop.name}" + ) + continue + record_prop = record.get_property(prop.name) if record_prop is None: # TODO: how to handle missing values in identifiables # raise an exception? + # TODO: is this the appropriate error? raise NotImplementedError( f"The following record is missing an identifying property:" f"RECORD\n{record}\nIdentifying PROPERTY\n{prop.name}" @@ -205,7 +240,8 @@ class IdentifiableAdapter(metaclass=ABCMeta): if registered_identifiable else None), name=record.name, properties=identifiable_props, - path=record.path + path=record.path, + backrefs=identifiable_backrefs ) @abstractmethod @@ -222,7 +258,7 @@ class IdentifiableAdapter(metaclass=ABCMeta): # TODO: remove side effect # TODO: use ID if record has one? - def retrieve_identified_record_for_record(self, record: db.Record): + def retrieve_identified_record_for_record(self, record: db.Record, referencing_entities=None): """ This function combines all functionality of the IdentifierAdapter by returning the identifiable after having checked for an appropriate @@ -231,7 +267,7 @@ class IdentifiableAdapter(metaclass=ABCMeta): In case there was no appropriate registered identifiable or no identifiable could be found return value is None. """ - identifiable = self.get_identifiable(record) + identifiable = self.get_identifiable(record, referencing_entities=referencing_entities) if identifiable.path is not None: return self.get_file(identifiable) @@ -392,7 +428,14 @@ class CaosDBIdentifiableAdapter(IdentifiableAdapter): for key, value in identifiable_data.items(): rt = db.RecordType().add_parent(key) for prop_name in value: - rt.add_property(name=prop_name) + if isinstance(prop_name, str): + rt.add_property(name=prop_name) + elif isinstance(prop_name, dict): + for k, v in prop_name.items(): + rt.add_property(name=k, value=v) + else: + NotImplementedError("YAML is not structured correctly") + self.register_identifiable(key, rt) def register_identifiable(self, name: str, definition: db.RecordType): diff --git a/src/doc/concepts.rst b/src/doc/concepts.rst index 012f393bca2caf1f0107ea462fdc531a1a8c799e..89757f21958f3d94649b33e9f9112593f703191d 100644 --- a/src/doc/concepts.rst +++ b/src/doc/concepts.rst @@ -38,36 +38,52 @@ Identifiables An Identifiable of a Record is like the fingerprint of a Record. The identifiable contains the information that is used by the CaosDB Crawler to identify Records. -In order to check whether a Record exits in the CaosDB Server, the CaosDB Crawler creates a query +For example, in order to check whether a Record exits in the CaosDB Server, the CaosDB Crawler creates a query using the information contained in the Identifiable. -For example, suppose a certain experiment is at most done once per day, then the identifiable could -consist of the RecordType "SomeExperiment" (as a parent) and the Property "date". +Suppose a certain experiment is at most done once per day, then the identifiable could +consist of the RecordType "SomeExperiment" (as a parent) and the Property "date" with the respective value. You can think of the properties that are used by the identifiable as a dictionary. For each property name there can be one value. However, this value can be a list such that the created query can look like "FIND RECORD ParamenterSet WITH a=5 AND a=6". This is meaningful if there is a ParamenterSet with two Properties with the name 'a' (multi property) or if 'a' is a list containing at least the values 5 and 6. -The path of a File object can serve as a Property that identifies files and similarly the name of +When we use a reference Property in the identifiable, we effectively use the reference from the object to +be identified pointing to some other object as an identifying attribute. We can also use references that point +in the other direction, i.e. towards the object to be identified. An identifiable may denote one or more +Entities that are referencing the object to be identified. + +The path of a File object can serve as a Property that identifies files and similarly the name of Records can be used. -In the current implementation an identifiable can only use one RecordType even though the identified Records might have multiple -Parents. +In the current implementation an identifiable can only use one RecordType even though the identified Records might have multiple Parents. Relevant sources in - ``src/identifiable_adapters.py`` - ``src/identifiable.py`` -RegisteredIdentifiables -+++++++++++++++++++++++ -A Registered Identifiable is the blue print for Identifiables. +Registered Identifiables +++++++++++++++++++++++++ +A Registered Identifiable is the blue print for Identifiables. You can think of registered identifiables as identifiables without concrete values for properties. RegisteredIdentifiables are associated with RecordTypes and define of what information an identifiable for that RecordType exists. There can be multiple Registered Identifiables for one RecordType. +If identifiables shall contain references to the object to be identified, the Registered +Identifiable must list the RecordTypes of the Entities that have those references. +For example, the Registered Identifiable for the "Experiment" RecordType may contain +the "date" Property and "Project" as the RecordType of an Entity that is referencing +the object to be identified. Then if we have a structure of some Records at hand, +we can check whether a Record with the parent "Project" is referencing the "Experiment" +Record. If that is the case, this reference is part of the identifiable for the "Experiment" +Record. Note, that if there are multiple Records with the appropriate parent (e.g. +multiple "Project" Records in the above example) it will be required that all of them +reference the object to be identified. + + Identified Records ++++++++++++++++++ TODO diff --git a/synchronize.md b/synchronize.md index 30cf342cd4c4342ab43ed05799faf8b89abce71a..b178e647866d1e01c85ccfc8bff3383d5f93d21d 100644 --- a/synchronize.md +++ b/synchronize.md @@ -31,4 +31,3 @@ Maybe keep another dict that tracks what Record objects are in the to_be_updated After treating leaf Records, Records that could not be checked before can be checked: Either referenced Records now have an ID or they are in the to_be_inserted dict such that it is clear that the identifiable at hand does not exist in the server. This way, the whole structure can be resolved except if there are circular dependencies: Those can be added fully to the to_be_inserted dict. (???) - diff --git a/unittests/test_directories/single_file_test_data/identifiables.yml b/unittests/test_directories/single_file_test_data/identifiables.yml index e32746d5a6984096cc46fa618250832b325965b0..c6f82be3dbf11db3f69e06d9a6fd2ee692901212 100644 --- a/unittests/test_directories/single_file_test_data/identifiables.yml +++ b/unittests/test_directories/single_file_test_data/identifiables.yml @@ -5,3 +5,7 @@ Keyword: Project: - project_id - title +Unknown: + - propa + - is_referenced_by: [Some] + diff --git a/unittests/test_file_identifiables.py b/unittests/test_file_identifiables.py index c7821f396a3f55634042f74dbe5e6f2d7e223811..aff174d0228d2750efd1cca129547c821c974127 100644 --- a/unittests/test_file_identifiables.py +++ b/unittests/test_file_identifiables.py @@ -16,7 +16,7 @@ def test_file_identifiable(): # Without a path there is no identifying information with raises(ValueError): - ident.get_identifiable(db.File()) + ident.get_identifiable(db.File(), []) fp = "/test/bla/bla.txt" file_obj = db.File(path=fp) diff --git a/unittests/test_identifiable.py b/unittests/test_identifiable.py index 847438cac752ea91a94adf6fb5dc81e54b87882d..3f3c606b163df4dc238be9a669fd31eb630a582d 100644 --- a/unittests/test_identifiable.py +++ b/unittests/test_identifiable.py @@ -32,9 +32,9 @@ from caoscrawler.identified_cache import IdentifiedCache def test_create_hashable_string(): assert Identifiable._create_hashable_string( - Identifiable(name="A", record_type="B")) == "P<B>N<A>" + Identifiable(name="A", record_type="B")) == "P<B>N<A>R<[]>" assert Identifiable._create_hashable_string( - Identifiable(name="A", record_type="B", properties={'a': 5})) == "P<B>N<A>a:5" + Identifiable(name="A", record_type="B", properties={'a': 5})) == "P<B>N<A>R<[]>a:5" a = Identifiable._create_hashable_string( Identifiable(name="A", record_type="B", properties={'a': 4, 'b': 5})) b = Identifiable._create_hashable_string( @@ -44,22 +44,25 @@ def test_create_hashable_string(): Identifiable._create_hashable_string( Identifiable(name="A", record_type="B", properties={'a': db.Record(id=12)}) - ) == "P<B>N<A>a:12") + ) == "P<B>N<A>R<[]>a:12") a = Identifiable._create_hashable_string( Identifiable(name="A", record_type="B", properties={'a': [db.Record(id=12)]})) - assert (a == "P<B>N<A>a:[12]") + assert (a == "P<B>N<A>R<[]>a:[12]") assert (Identifiable._create_hashable_string( - Identifiable(name="A", record_type="B", properties={'a': [12]})) == "P<B>N<A>a:[12]") + Identifiable(name="A", record_type="B", properties={'a': [12]})) == "P<B>N<A>R<[]>a:[12]") assert ( Identifiable._create_hashable_string( Identifiable(name="A", record_type="B", properties={ 'a': [db.Record(id=12), 11]}) - ) == "P<B>N<A>a:[12, 11]") + ) == "P<B>N<A>R<[]>a:[12, 11]") assert ( Identifiable._create_hashable_string( Identifiable(record_type="B", properties={'a': [db.Record()]}) ) != Identifiable._create_hashable_string( Identifiable(record_type="B", properties={'a': [db.Record()]}))) + assert Identifiable._create_hashable_string( + Identifiable(name="A", record_type="B", backrefs=[123, db.Entity(id=124)], + properties={'a': 5})) == "P<B>N<A>R<['123', '124']>a:5" def test_name(): diff --git a/unittests/test_identifiable_adapters.py b/unittests/test_identifiable_adapters.py index 743f64c19653806519074d5ef6fb272ca2847e7f..6817b9e6993c0ec509354b68ff60d9a9caf534ae 100644 --- a/unittests/test_identifiable_adapters.py +++ b/unittests/test_identifiable_adapters.py @@ -59,6 +59,18 @@ def test_create_query_for_identifiable(): Identifiable(name="TestRecord", record_type="TestType")) assert query.lower() == "find record testtype with name='testrecord'" + # With referencing entity (backref) + query = IdentifiableAdapter.create_query_for_identifiable( + Identifiable(record_type="Person", backrefs=[14433], properties={'last_name': "B"})) + assert query.lower() == ("find record person which is referenced by 14433 and with " + "'last_name'='b' ") + + # With two referencing entities (backref) + query = IdentifiableAdapter.create_query_for_identifiable( + Identifiable(record_type="Person", backrefs=[14433, 333], properties={'last_name': "B"})) + assert query.lower() == ("find record person which is referenced by 14433 and which is " + "referenced by 333 and with 'last_name'='b' ") + def test_load_from_yaml_file(): ident = CaosDBIdentifiableAdapter() diff --git a/unittests/test_issues.py b/unittests/test_issues.py index 0e024429400b00441ca113a5d020adb9b2b77d12..6b7b0d52ce5f4a1cfe5e4ac189d72eafd1454db7 100644 --- a/unittests/test_issues.py +++ b/unittests/test_issues.py @@ -25,6 +25,7 @@ from pytest import mark import caosdb as db from caoscrawler.crawl import Crawler +from caoscrawler.identifiable import Identifiable from caoscrawler.identifiable_adapters import CaosDBIdentifiableAdapter from caoscrawler.structure_elements import DictElement from test_tool import rfp @@ -105,8 +106,10 @@ def test_issue_39(): flat_list = [b1, a1, a2, b2] # the two records with ids exist remotely - crawler.add_to_remote_existing_cache(a1) - crawler.add_to_remote_existing_cache(b2) + crawler.add_to_remote_existing_cache(a1, + Identifiable(name="A", record_id=101, record_type="RT_A")) + crawler.add_to_remote_existing_cache(b2, + Identifiable(name="B", record_id=102, record_type="RT_B")) # this would result in a merge conflict before ins, ups = crawler.split_into_inserts_and_updates(flat_list) diff --git a/unittests/test_tool.py b/unittests/test_tool.py index 9f49bfb9ebfd47dfaf1df120039e04f7ced06ed2..71180b17e22409bc2491a51d4cdd45ed6f4aa346 100755 --- a/unittests/test_tool.py +++ b/unittests/test_tool.py @@ -259,14 +259,6 @@ def test_synchronization(crawler, ident): assert len(updl) == 0 -def test_identifiable_adapter(): - query = IdentifiableAdapter.create_query_for_identifiable( - Identifiable(record_type="Person", - properties={"first_name": "A", - "last_name": "B"})) - assert query.lower() == "find record person with 'first_name'='a' and 'last_name'='b' " - - def test_remove_unnecessary_updates(): # test trvial case upl = [db.Record().add_parent("A")] @@ -338,7 +330,7 @@ def test_split_into_inserts_and_updates_trivial(crawler): crawler.split_into_inserts_and_updates([]) -def basic_retrieve_by_name_mock_up(rec, known): +def basic_retrieve_by_name_mock_up(rec, referencing_entities=None, known=None): """ returns a stored Record if rec.name is an existing key, None otherwise """ if rec.name in known: return known[rec.name] @@ -488,29 +480,30 @@ def test_has_missing_object_in_references(crawler): # one reference with id -> check assert not crawler._has_missing_object_in_references( - Identifiable(name="C", record_type="RTC", properties={'d': 123})) + Identifiable(name="C", record_type="RTC", properties={'d': 123}), []) # one ref with Entity with id -> check assert not crawler._has_missing_object_in_references( Identifiable(name="C", record_type="RTC", properties={'d': db.Record(id=123) - .add_parent("C")})) + .add_parent("C")}), []) # one ref with id one with Entity with id (mixed) -> check assert not crawler._has_missing_object_in_references( Identifiable(name="C", record_type="RTD", - properties={'d': 123, 'b': db.Record(id=123).add_parent("RTC")})) + properties={'d': 123, 'b': db.Record(id=123).add_parent("RTC")}), []) # entity to be referenced in the following a = db.Record(name="C").add_parent("C").add_property("d", 12311) # one ref with id one with Entity without id (but not identifying) -> fail assert not crawler._has_missing_object_in_references( - Identifiable(name="C", record_type="RTC", properties={'d': 123, 'e': a})) + Identifiable(name="C", record_type="RTC", properties={'d': 123, 'e': a}), []) # one ref with id one with Entity without id (mixed) -> fail assert not crawler._has_missing_object_in_references( - Identifiable(name="D", record_type="RTD", properties={'d': 123, 'e': a})) + Identifiable(name="D", record_type="RTD", properties={'d': 123, 'e': a}), []) - crawler.add_to_remote_missing_cache(a) + crawler.add_to_remote_missing_cache(a, Identifiable(name="C", record_type="RTC", + properties={'d': 12311})) # one ref with id one with Entity without id but in cache -> check assert crawler._has_missing_object_in_references( - Identifiable(name="D", record_type="RTD", properties={'d': 123, 'e': a})) + Identifiable(name="D", record_type="RTD", properties={'d': 123, 'e': a}), []) # if this ever fails, the mock up may be removed crawler.identifiableAdapter.get_registered_identifiable.assert_called() @@ -705,7 +698,117 @@ def test_security_mode(updateCacheMock, upmock, insmock, ident): ident._records = deepcopy(records_backup) +def test_create_reference_mapping(): + a = db.Record().add_parent("A") + b = db.Record().add_parent("B").add_property('a', a) + ref = Crawler.create_reference_mapping([a, b]) + assert id(a) in ref + assert id(b) not in ref + assert "B" in ref[id(a)] + assert ref[id(a)]["B"] == [b] + + def test_create_flat_list(): a = db.Record() a.add_property(name="a", value=a) Crawler.create_flat_list([a], []) + + +@pytest.fixture +def crawler_mocked_for_backref_test(crawler): + # mock retrieval of registered identifiabls: return Record with just a parent + def get_reg_ident(x): + if x.parents[0].name == "C": + return db.Record().add_parent(x.parents[0].name).add_property( + "is_referenced_by", value=["BR"]) + elif x.parents[0].name == "D": + return db.Record().add_parent(x.parents[0].name).add_property( + "is_referenced_by", value=["BR", "BR2"]) + else: + return db.Record().add_parent(x.parents[0].name) + crawler.identifiableAdapter.get_registered_identifiable = Mock(side_effect=get_reg_ident) + + # Simulate remote server content by using the names to identify records + # There is only a single known Record with name A + crawler.identifiableAdapter.retrieve_identified_record_for_record = Mock(side_effect=partial( + basic_retrieve_by_name_mock_up, known={"A": + db.Record(id=1111, name="A").add_parent("BR")})) + crawler.identifiableAdapter.retrieve_identified_record_for_identifiable = Mock( + side_effect=partial( + basic_retrieve_by_name_mock_up, known={"A": + db.Record(id=1111, name="A").add_parent("BR")})) + return crawler + + +def test_split_into_inserts_and_updates_backref(crawler_mocked_for_backref_test): + crawler = crawler_mocked_for_backref_test + identlist = [Identifiable(name="A", record_type="BR"), + Identifiable(name="B", record_type="C", backrefs=[db.Entity()])] + referenced = db.Record(name="B").add_parent("C") + entlist = [referenced, db.Record(name="A").add_parent("BR").add_property("ref", referenced), ] + + # Test without referencing object + # currently a NotImplementedError is raised if necessary properties are missing. + with raises(NotImplementedError): + crawler.split_into_inserts_and_updates([db.Record(name="B").add_parent("C")]) + + # identifiables were not yet checked + assert crawler.get_from_any_cache(identlist[0]) is None + assert crawler.get_from_any_cache(identlist[1]) is None + # one with reference, one without + assert not crawler._has_reference_value_without_id(identlist[0]) + assert crawler._has_reference_value_without_id(identlist[1]) + # one can be found remotely, one not + assert crawler.identifiableAdapter.retrieve_identified_record_for_record( + identlist[0]).id == 1111 + assert crawler.identifiableAdapter.retrieve_identified_record_for_record( + identlist[1]) is None + + # check the split... + insert, update = crawler.split_into_inserts_and_updates(deepcopy(entlist)) + # A was found remotely and is therefore in the update list + assert len(update) == 1 + assert update[0].name == "A" + # B does not exist on the (simulated) remote server + assert len(insert) == 1 + assert insert[0].name == "B" + + +def test_split_into_inserts_and_updates_mult_backref(crawler_mocked_for_backref_test): + # test whether multiple references of the same record type are correctly used + crawler = crawler_mocked_for_backref_test + referenced = db.Record(name="B").add_parent("C") + entlist = [referenced, + db.Record(name="A").add_parent("BR").add_property("ref", referenced), + db.Record(name="C").add_parent("BR").add_property("ref", referenced), + ] + + # test whether both entities are listed in the backref attribute of the identifiable + referencing_entities = crawler.create_reference_mapping(entlist) + identifiable = crawler.identifiableAdapter.get_identifiable(referenced, referencing_entities) + assert len(identifiable.backrefs) == 2 + + # check the split... + insert, update = crawler.split_into_inserts_and_updates(deepcopy(entlist)) + assert len(update) == 1 + assert len(insert) == 2 + + +def test_split_into_inserts_and_updates_diff_backref(crawler_mocked_for_backref_test): + # test whether multiple references of the different record types are correctly used + crawler = crawler_mocked_for_backref_test + referenced = db.Record(name="B").add_parent("D") + entlist = [referenced, + db.Record(name="A").add_parent("BR").add_property("ref", referenced), + db.Record(name="A").add_parent("BR2").add_property("ref", referenced), + ] + + # test whether both entities are listed in the backref attribute of the identifiable + referencing_entities = crawler.create_reference_mapping(entlist) + identifiable = crawler.identifiableAdapter.get_identifiable(referenced, referencing_entities) + assert len(identifiable.backrefs) == 2 + + # check the split... + insert, update = crawler.split_into_inserts_and_updates(deepcopy(entlist)) + assert len(update) == 2 + assert len(insert) == 1