diff --git a/src/caoscrawler/crawl.py b/src/caoscrawler/crawl.py index d5210a189a52d538e9bee5a8669695136182e5b3..b9510ce44380af16e33d79018c1c2da6a2e9e280 100644 --- a/src/caoscrawler/crawl.py +++ b/src/caoscrawler/crawl.py @@ -299,46 +299,46 @@ class Crawler(object): return data def split_into_inserts_and_updates(self, st: SyncGraph): + """ iteratively identifies nodes in the SyncGraph st and checks whether those exist on the + remote server such that in the end two list are being created that list entities that need + to be update or inserted""" entity_was_treated = True - # st.entities contains Entities which could not yet be checked against the remote server + # st.unchecked contains Entities which could not yet be checked against the remote server while entity_was_treated and len(st.unchecked) > 0: entity_was_treated = False # For each element we try to find out whether we can find it in the server or whether # it does not yet exist. Since a Record may reference other unkown Records it might not # be possible to answer this right away. - # The following checks are done on each Record: - # 1. Is it in the cache of already checked Records? - # 2. Does it have to be new since a needed reference is missing? - # 3. Can it be checked on the remote server? 
for se in list(st.unchecked): - if se not in st.unchecked: + if se not in st.unchecked: # the node might have been checke in the mean time continue - if st.identity_relies_on_unchecked_entity(se): + if st.identity_relies_on_unchecked_entity(se): # no remote server check possible continue - if se.identifiable is None: + if se.identifiable is None: # generate identifiable if it is missing + self.identifiableAdapter.check_identifying_props(se) st.set_identifiable_of_node(se, st.identifiableAdapter.get_identifiable( se, st.backward_id_referenced_by[se.uuid])) - # entity was merged with another due to the new identifiable + + entity_was_treated = True + # if the node was merged with another due to the new identifiable, we skip if se not in st.unchecked: continue + # check remote server identified_record = ( st.identifiableAdapter.retrieve_identified_record_for_identifiable( se.identifiable)) remote_id = None if identified_record is not None: remote_id = identified_record.id + # set id of node. if node is missing, remote_id is None and the SyncGraph marks it + # as missing st.set_id_of_node(se, remote_id) entity_was_treated = True - # TODO - # for record in st.entities: - # self.replace_references_with_cached(record, referencing_entities) - - # We postponed the merge for records where it failed previously and try it again now. # This only might add properties of the postponed records to the already used ones. 
if len(st.unchecked) > 0: circle = st.unchecked_contains_circular_dependency() diff --git a/src/caoscrawler/identifiable.py b/src/caoscrawler/identifiable.py index 91eb45ce787128c55ab361c9c54821eda115cfd5..3114abc4a9594d93933c344ed7b9b7ebc22ad811 100644 --- a/src/caoscrawler/identifiable.py +++ b/src/caoscrawler/identifiable.py @@ -29,6 +29,7 @@ from typing import Union import linkahead as db +from .exceptions import MissingIdentifyingProperty from .sync_node import SyncNode logger = logging.getLogger(__name__) @@ -60,8 +61,9 @@ class Identifiable(): if (record_id is None and path is None and name is None and (backrefs is None or len(backrefs) == 0) and (properties is None or len(properties) == 0)): - raise ValueError("There is no identifying information. You need to add a path or " - "properties or other identifying attributes.") + raise ValueError( + "There is no identifying information. You need to add a path or " + "properties or other identifying attributes.") if properties is not None and 'name' in [k.lower() for k in properties.keys()]: raise ValueError("Please use the separete 'name' keyword instead of the properties " "dict for name") diff --git a/src/caoscrawler/identifiable_adapters.py b/src/caoscrawler/identifiable_adapters.py index ef27515f09f1a9f9fa763e934b20e19c5a25abb6..5ccff2766232b93644f647a727e74f049f3dac24 100644 --- a/src/caoscrawler/identifiable_adapters.py +++ b/src/caoscrawler/identifiable_adapters.py @@ -148,6 +148,24 @@ identifiabel, identifiable and identified record) for a Record. 
def check_identifying_props(self, node):
    """Ensure ``node`` provides every property named by its registered identifiable.

    Each property of ``node.registered_identifiable`` is checked, comparing
    names case-insensitively:

    * ``is_referenced_by`` entries are skipped by this check.
    * ``name`` is satisfied by ``node.name`` being set (not by an entry in
      ``node.properties``).
    * any other property must occur in ``node.properties``.

    Parameters
    ----------
    node
        Object exposing ``registered_identifiable``, ``name`` and
        ``properties`` (presumably a SyncNode -- confirm against callers).

    Raises
    ------
    RuntimeError
        If ``node.registered_identifiable`` is None.
    MissingIdentifyingProperty
        If the name or a required property is absent.  The exception's
        ``prop`` attribute holds the name of the first missing property.
    """
    if node.registered_identifiable is None:
        raise RuntimeError("no registered_identifiable")

    # Lower-cased names the node actually carries; built once instead of
    # rescanning node.properties for every identifying property.
    present = {el.name.lower() for el in node.properties}

    for prop in node.registered_identifiable.properties:
        wanted = prop.name.lower()
        if wanted == "is_referenced_by":
            continue
        if wanted == "name":
            if node.name is None:
                err = MissingIdentifyingProperty("The node has no name.")
                err.prop = "name"
                raise err
            continue

        if wanted not in present:
            err = MissingIdentifyingProperty(f"The property {prop.name} is missing.")
            err.prop = prop.name
            # The exception used to be constructed but never raised, so a
            # missing identifying property went unnoticed.
            raise err
value=db.Record().add_parent("RT5")), - ] - - st = SyncGraph(ent_list, simple_adapter) - assert st.nodes[2].is_unidentifiable() - assert st.nodes[3].is_unidentifiable() - assert len(st.nodes) == 4 - assert len(st.unchecked) == 1 - st.set_id_of_node(st.nodes[1], 101) - assert len(st.nodes) == 3 - assert len(st.unchecked) == 0 - # until implementation of it ... - with pytest.raises(NotImplementedError): - # with pytest.raises(ImpossibleMergeError): - st.export_record_lists() @patch("caoscrawler.sync_graph.cached_get_entity_by", @@ -657,26 +638,3 @@ def test_export_node(): assert len(p.value) == len(exp.get_property(p.name).value) assert len(exp.properties) == len(rec_a.properties) assert len(exp.parents) == len(rec_a.parents) - - -def test_remove_merged(simple_adapter): - # We reference an entity that is merged into another node and then remove the merged node - # This should result in the reference being removed - b = db.Record().add_parent("RT3").add_property('a', value=1) - ent_list = [ - db.Record().add_parent("RT3").add_property('a', value=1), - b, - db.Record().add_parent("RT3").add_property('a', value=3).add_property('RT3', value=b), - ] - - st = SyncGraph(ent_list, simple_adapter) - se_a = st.nodes[0] - se_c = st.nodes[1] - for node in st.nodes: - print(node) - assert len(st.nodes) == 2 - assert len(st.unchecked) == 2 - st.remove_failed(se_a) - assert len(st.nodes) == 1 - assert len(st.unchecked) == 1 - assert "RT3" not in [p.name for p in se_c.properties]