diff --git a/integrationtests/basic_example/test_basic.py b/integrationtests/basic_example/test_basic.py index c906a81d86af56669f7c522169bceb3b5fcb3e01..55ba8c2267b96560212a1ecffbdafcd84908a519 100755 --- a/integrationtests/basic_example/test_basic.py +++ b/integrationtests/basic_example/test_basic.py @@ -139,6 +139,7 @@ def test_single_insertion(clear_database, usemodel, crawler, ident): # xml.remove(xml.find(tag)) # f.write(db.common.utils.xml2str(xml)) + breakpoint() assert len(ins) == 18 assert len(ups) == 0 diff --git a/src/caoscrawler/crawl.py b/src/caoscrawler/crawl.py index 23707a6c3ca1f1893fd870e4b11a3c56bb8f06ba..d545403f049784673b0bd567f2e5011aad866372 100644 --- a/src/caoscrawler/crawl.py +++ b/src/caoscrawler/crawl.py @@ -387,10 +387,13 @@ class Crawler(object): # 3. check on the remote server else: st.check_remote_server(se) + print("checked", se.id) if se.id is None: st.set_missing(se) + print("missing") else: st.set_existing(se) + print("exisitng") entity_was_treated = True # TODO @@ -737,6 +740,8 @@ class Crawler(object): for record in to_be_updated: if record.id is not None: # TODO: use cache here? + print(record.id) + print(record) identified_records.append(cached_get_entity_by(eid=record.id)) else: raise Exception("Please report a bug: At this stage all records to be updated" diff --git a/src/caoscrawler/identifiable.py b/src/caoscrawler/identifiable.py index 3df5bfa77ccb8965dfd660ad1ce2a8b7cafcff96..e8ee521d2af8f5e6891c52ecf024d6ca0f0c19fb 100644 --- a/src/caoscrawler/identifiable.py +++ b/src/caoscrawler/identifiable.py @@ -96,7 +96,8 @@ class Identifiable(): if value.id is not None: return str(value.id) else: - return "PyID=" + str(id(value)) + print(value) + raise RuntimeError("Python Entity without id not allowed") elif isinstance(value, list): return "[" + ", ".join([Identifiable._value_representation(el) for el in value]) + "]" elif (isinstance(value, str) or isinstance(value, int) or isinstance(value, float) @@ -105,7 +106,7 @@ class Identifiable(): else: raise ValueError(f"Unknown datatype of the value: {value}") - @staticmethod + @ staticmethod def _create_hashable_string(identifiable: Identifiable) -> str: """ creates a string from the attributes of an identifiable that can be hashed diff --git a/src/caoscrawler/identifiable_adapters.py b/src/caoscrawler/identifiable_adapters.py index 85f57449b8e862b4d558f8295f6de6f1a44f9d82..dcf5babddcc66b920973f76d0d48ab3ee4e6a230 100644 --- a/src/caoscrawler/identifiable_adapters.py +++ b/src/caoscrawler/identifiable_adapters.py @@ -284,18 +284,24 @@ startswith: bool, optional ) continue - options = [f.get_property(prop.name) for f in se.fragments + options = [f.get_property(prop.name).value for f in se.fragments if f.get_property(prop.name) is not None] if len(options) == 0: raise NotImplementedError( f"The following record is missing an identifying property:\n" f"RECORD\n{se.fragments[0]}\nIdentifying PROPERTY\n{prop.name}" ) - if not all([f.value == options[0].value for f in options]): + for ii, el in enumerate(options): + if isinstance(el, db.Entity): + options[ii] = el.id + if el.id is None: + raise RuntimeError("reference to unchecked in identifiable") + else: + options[ii] = el + if not all([f == options[0] for f in options]): raise RuntimeError("differing prop values in fragments") - record_prop = options[0] - identifiable_props[record_prop.name] = record_prop.value + identifiable_props[prop.name] = options[0] property_name_list_A.append(prop.name) # check for multi properties in the record: diff --git a/src/caoscrawler/semantic_target.py b/src/caoscrawler/semantic_target.py index dfd817b932b543990515c86bb8c5a610a0315b9d..5a852eb19c8d31431c5464771cf775363f61fbf5 100644 --- a/src/caoscrawler/semantic_target.py +++ b/src/caoscrawler/semantic_target.py @@ -87,6 +87,15 @@ class SemanticEntity(): class SemanticTarget(): """ models the target structure of Entities as it shall be created by the Crawler + The target entities are composed using the information of the entity fragments (db.Entity + objects) of SemanticEntities. This is information like name, parents and properties and, + importantly, references. Those references are typically given by a Python reference to some + other db.Entity object. These references are scanned initially and used to create multiple + reference maps that track how SemanticEntities reference each other. These maps are kept up to + date when SemanticEntities are merged because they are identified with each other. When + creating the final list of db.Entity objects (``create_record_lists``) the Python references + are updated according to the reference map. + This model should only be manipulated via three functions: - make_identifiable: adds an identifiable to a SemanticEntity what possibly allows to merge it with another SemanticEntity @@ -105,7 +114,8 @@ class SemanticTarget(): self._identifiable_look_up: Dict[str, SemanticEntity] = {} self._missing: Dict[int, SemanticEntity] = {} self._existing: Dict[int, SemanticEntity] = {} - self._remote_missing_counter = -1 # TODO: I guess we can now get rid of this... + # entities that are missing get negative IDs to allow identifiable creation + self._remote_missing_counter = -1 # create initial set of SemanticEntities from provided Entity list self.se: List[SemanticEntity] = [] # list of all SemanticEntities @@ -113,7 +123,9 @@ class SemanticTarget(): self.se_lookup: Dict[str, SemanticEntity] = {} # lookup: UUID -> SemanticEntity entities = self._create_flat_list(entities) self._sanity_check(entities) + print("ids") for el in entities: + print(el.id) self.se.append(SemanticEntity( el, self.identifiableAdapter.get_registered_identifiable(el))) @@ -166,6 +178,8 @@ class SemanticTarget(): if se.path is None and se.identifiable is None: raise RuntimeError("no identifying information") se.id = self._remote_missing_counter + for f in se.fragments: + f.id = self._remote_missing_counter self._remote_missing_counter -= 1 self._add_any(se, self._missing) self.unchecked.remove(se) @@ -177,6 +191,7 @@ class SemanticTarget(): on the remote server. """ assert se.id is not None + assert se.id > 0 self._add_any(se, self._existing) self.unchecked.remove(se) @@ -191,6 +206,10 @@ class SemanticTarget(): return self._id_look_up[entity.id] if entity.path is not None and entity.path in self._path_look_up: return self._path_look_up[entity.path] + for e in self.se: + if e.identifiable is not None: + print(e.identifiable._create_hashable_string(e.identifiable)) + print(self._identifiable_look_up) if (entity.identifiable is not None and entity.identifiable.get_representation() in self._identifiable_look_up): return self._identifiable_look_up[entity.identifiable.get_representation()] @@ -217,12 +236,21 @@ class SemanticTarget(): else: return None + def combine_fragments(self): + for se in self.se: + if len(se.fragments) < 2: + continue + for ent in se.fragments[1:]: + merge_entities(se.fragments[0], ent, merge_id_with_resolved_entity=True) + def create_record_lists(self): for se in self.se: for f in se.fragments: f.id = se.id f.path = se.path + self._update_reference_values() self.combine_fragments() + # TODO assure that there is only one fragment each missing = [el.fragments[0] for el in self._missing.values()] # remove negative IDs @@ -235,6 +263,18 @@ class SemanticTarget(): return (missing, [el.fragments[0] for el in self._existing.values()]) + def _update_reference_values(self): + for se in self.se: + for f in se.fragments: + for p in f.properties: + if isinstance(p.value, list): + for index, val in enumerate(p.value): + if id(val) in self.se_lookup: + p.value[index] = self.se_lookup[id(val)].fragments[0] + else: + if id(p.value) in self.se_lookup: + p.value = self.se_lookup[id(p.value)].fragments[0] + def identity_relies_on_unchecked_entity(self, se: SemanticEntity): """ If a record for which it could not yet be verified whether it exists in LA or not is part @@ -262,14 +302,6 @@ class SemanticTarget(): if ent.role == "Record" and len(ent.parents) == 0: raise RuntimeError(f"Records must have a parent.\n{ent}") - def combine_fragments(self): - for se in self.se: - if len(se.fragments) < 2: - continue - for ent in se.fragments[1:]: - merge_entities(se.fragments[0], ent, merge_id_with_resolved_entity=True) - se.fragments = [se.fragments[0]] - @ staticmethod def _create_flat_list(ent_list: List[db.Entity], flat: Optional[List[db.Entity]] = None): """ @@ -394,8 +426,9 @@ class SemanticTarget(): el.name == p.name]) > 0: forward_id_references[se.uuid].add(vse) backward_id_references[vse.uuid].add(se) - if IdentifiableAdapter.referencing_entity_has_appropriate_type( - ent.parents, vse.registered_identifiable): + if (vse.registered_identifiable is not None and + IdentifiableAdapter.referencing_entity_has_appropriate_type( + ent.parents, vse.registered_identifiable)): forward_id_referenced_by[se.uuid].add(vse) backward_id_referenced_by[vse.uuid].add(se) @@ -407,6 +440,7 @@ class SemanticTarget(): """ A path or an ID is sufficiently identifying. Thus, those entities can be marked as checked """ for semantic_entity in list(self.se[::-1]): + print(semantic_entity.uuid) assert len(semantic_entity.fragments) == 1 entity = semantic_entity.fragments[0] if entity.id is None and entity.path is None: @@ -419,9 +453,10 @@ class SemanticTarget(): if existing is not None: semantic_entity.identify_with(existing) + # at this point, semantic_entity has an ID if it is existing treated_before = self.get_checked_equivalent(semantic_entity) if treated_before is None: - if semantic_entity.id is None: + if semantic_entity.id is None or semantic_entity.id < 0: self.set_missing(semantic_entity) else: self.set_existing(semantic_entity) @@ -451,23 +486,6 @@ class SemanticTarget(): return None return circle - @ staticmethod - def _bend_references_to_new_object(old, new, entities): - # TODO still needed??? - """ Bend references to the other object - Iterate over all entities in `entities` and check the values of all properties of - occurances of old Entity and replace them with new Entity - """ - for el in entities: - for p in el.properties: - if isinstance(p.value, list): - for index, val in enumerate(p.value): - if val is old: - p.value[index] = new - else: - if p.value is old: - p.value = new - def _add_any(self, entity: SemanticEntity, lookup): if entity.id is not None: self._id_look_up[entity.id] = entity diff --git a/unittests/test_crawler.py b/unittests/test_crawler.py index b18f6062ba045d69669fa08efcc7493887a5b0a6..85564705adff2646a618e1e781341cfe6309aeb0 100644 --- a/unittests/test_crawler.py +++ b/unittests/test_crawler.py @@ -413,6 +413,8 @@ def test_split_into_inserts_and_updates_with_copy_attr(crawler_mocked_identifiab crawler.identifiableAdapter.retrieve_identified_record_for_identifiable.assert_called() +@ patch("caoscrawler.crawl.cached_get_entity_by", + new=Mock(side_effect=mock_get_entity_by)) @patch("caoscrawler.identifiable_adapters.cached_query", new=Mock(side_effect=mock_cached_only_rt)) def test_split_iiau_with_unmergeable_list_items(): @@ -480,7 +482,7 @@ a: ([b1, b2]) crawler = Crawler(identifiableAdapter=ident_adapter) - st = SemanticTarget([rec_a, *rec_b, *rec_c], crawler.identifiableAdapter) + st = SemanticTarget(deepcopy([rec_a, *rec_b, *rec_c]), crawler.identifiableAdapter) assert st.identity_relies_on_unchecked_entity(st.se[0]) is False assert st.identity_relies_on_unchecked_entity(st.se[1]) assert st.identity_relies_on_unchecked_entity(st.se[2]) diff --git a/unittests/test_identifiable.py b/unittests/test_identifiable.py index 28bdb7a2ad75d5b9389b47ca3f0ec2b2e2a1404b..c79498ed8700f8418461c8047f9823529747ae47 100644 --- a/unittests/test_identifiable.py +++ b/unittests/test_identifiable.py @@ -54,11 +54,6 @@ def test_create_hashable_string(): Identifiable(name="A", record_type="B", properties={ 'a': [db.Record(id=12), 11]}) ) == "P<B>N<A>R<[]>a:[12, 11]") - assert ( - Identifiable._create_hashable_string( - Identifiable(record_type="B", properties={'a': [db.Record()]}) - ) != Identifiable._create_hashable_string( - Identifiable(record_type="B", properties={'a': [db.Record()]}))) assert Identifiable._create_hashable_string( Identifiable(name="A", record_type="B", backrefs=[123, db.Entity(id=124)], properties={'a': 5})) == "P<B>N<A>R<['123', '124']>a:5" diff --git a/unittests/test_semantic_target.py b/unittests/test_semantic_target.py index dc6bf0d7a5f4b04fa60082e59b64aa3bd47dff64..14bd845247318b63d4e556638fa40bfbbdecbeb2 100644 --- a/unittests/test_semantic_target.py +++ b/unittests/test_semantic_target.py @@ -314,11 +314,13 @@ def test_merging(): db.File(name='101').add_parent("A").add_property('a', value=1), db.File(name='101').add_parent("A").add_property('a', value=1)] st = SemanticTarget(entlist, ident_adapter) + assert len(st.unchecked) == 2 st.make_identifiable(st.se[0]) st.check_remote_server(st.se[0]) st.set_missing(st.se[0]) assert len(st.unchecked) == 1 st.make_identifiable(st.se[1]) + assert st.se[1].id is None assert st.merge_with_equivalent(st.se[1]) assert len(st.se) == 1 assert len(st.unchecked) == 0