diff --git a/src/caoscrawler/crawl.py b/src/caoscrawler/crawl.py index 4b5e6a5d3b359df1ad5a9135d1299290fb7ff1d4..d88f8b939eb6fdac4b3aadeee3297816198a19bc 100644 --- a/src/caoscrawler/crawl.py +++ b/src/caoscrawler/crawl.py @@ -373,10 +373,8 @@ class Crawler(object): continue if st.identity_relies_on_unchecked_entity(se): - print(st.nodes.index(se), "relies on unchecked") continue - print(se.identifiable) if se.identifiable is None: try: self.identifiableAdapter.check_identifying_props(se) @@ -429,14 +427,13 @@ class Crawler(object): circle = st.unchecked_contains_circular_dependency() if circle is None: logger.error("Failed, but found NO circular dependency. The data is as follows:" - # + str(self.compact_entity_list_representation(st.entities, - # referencing_entities)) + + "\n".join([str(el) for el in st.unchecked]) + ) else: logger.error("Found circular dependency (Note that this might include references " "that are not identifying properties): " - # + self.compact_entity_list_representation(circle, - # referencing_entities) + + "\n".join([str(el) for el in st.unchecked]) ) raise RuntimeError( @@ -769,8 +766,6 @@ class Crawler(object): for record in to_be_updated: if record.id is not None: # TODO: use cache here? - print(record.id) - print(record) identified_records.append(cached_get_entity_by(eid=record.id)) else: raise Exception("Please report a bug: At this stage all records to be updated" diff --git a/src/caoscrawler/sync_graph.py b/src/caoscrawler/sync_graph.py index 1b51952c8927dec74a91fc7845169f8c0e7b1a5d..a4d321681473f02e12b79960e09ed41bec4aa986 100644 --- a/src/caoscrawler/sync_graph.py +++ b/src/caoscrawler/sync_graph.py @@ -34,7 +34,7 @@ from linkahead.apiutils import (EntityMergeConflictError, compare_entities, from linkahead.cached import cache_clear, cached_get_entity_by from linkahead.exceptions import EmptyUniqueQueryError -from .exceptions import MissingReferencingEntityError +from .exceptions import ImpossibleMergeError, MissingReferencingEntityError from .identifiable_adapters import IdentifiableAdapter from .sync_node import SyncNode @@ -127,6 +127,7 @@ class SyncGraph(): def remove_unidentifiable(self, node): self.unchecked.remove(node) + # TODO remove from identifying references! def set_id_of_node(self, node: SyncNode, node_id: Optional[str] = None): """sets the ID attribute of the given SyncNode. If node_id is None, a negative Id will be @@ -181,16 +182,47 @@ class SyncGraph(): entities = [] node_map = {} self.deal_with_non_ident() + for k, el in self.forward_references.items(): + print(k, "references") + for e in el: + print(e.uuid) + print('UNCHECKED ENTITIES') + for el in self.unchecked: + print(el.uuid) + print(el) for el in self.nodes: - entities.append(el.export_entity()) - node_map[id(el)] = entities[-1] - for oel in el.other: - node_map[id(oel)] = entities[-1] + try: + el.export_entity() + except ImpossibleMergeError as exc: + if self.raise_problems: + raise + else: + remove, rmrefs = self.remove_failed(el) + logger.error(exc) + logger.error(f"{len(remove)}, {len(rmrefs)}") + for el in self.nodes: + try: + entities.append(el.export_entity()) + node_map[id(el)] = entities[-1] + for oel in el.other: + node_map[id(oel)] = entities[-1] + except ImpossibleMergeError as exc: + if self.raise_problems: + raise + else: + remove, rmrefs = self.remove_failed(el) + logger.error(exc) + logger.error(f"{len(remove)}, {len(rmrefs)}") + + if len(self.unchecked) > 1: + self.unchecked_contains_circular_dependency() for ent in entities: for p in ent.properties: if isinstance(p.value, list): for ii, el in enumerate(p.value): if isinstance(el, SyncNode): + # TODO how is it possible that id(el) is not in node_map + # probably because el was removed from nodes. p.value[ii] = node_map[id(el)] elif isinstance(p.value, SyncNode): @@ -321,23 +353,29 @@ class SyncGraph(): return flat def remove_failed(self, node): + if node not in self.nodes: + return [], [] self.nodes.remove(node) if node in self.unchecked: self.unchecked.remove(node) refs2 = [] # remove reference property or value from referencing nodes for referencing in self.backward_references[node.uuid]: - for p in el.properties: + for p in referencing.properties: v = p.value if not isinstance(p.value, list): v = [v] for vv in v: - if vv == record: + if vv is node: if not isinstance(p.value, list): - el.properties.remove(p) + referencing.properties.remove(p) else: - p.value.remove(record) - refs2.append(el) + p.value.remove(node) + refs2.append(referencing) + + remove = [] + remove.extend(self.forward_id_referenced_by[node.uuid]) + remove.extend(self.backward_id_references[node.uuid]) # update reference mappings for other in self.forward_references.pop(node.uuid): @@ -355,9 +393,6 @@ class SyncGraph(): for other in self.backward_id_referenced_by.pop(node.uuid): self.forward_id_referenced_by[other.uuid].remove(node) - remove = [] - remove.extend(self.forward_id_referenced_by[node.uuid]) - remove.extend(self.backward_id_references[node.uuid]) for el in remove: rm, rf = self.remove_failed(el) remove.extend(rm)