diff --git a/src/newcrawler/crawl.py b/src/newcrawler/crawl.py index ab18446943a8dbab2e77496925b2028bd59191c7..f4c15fee33728c60a49cd3c7ad8f835de5b25491 100644 --- a/src/newcrawler/crawl.py +++ b/src/newcrawler/crawl.py @@ -226,6 +226,7 @@ class Crawler(object): comp = compare_entities(record, identifiable) identical = True for j in range(2): + # TODO: should be implemented elsewhere for label in ("properties", "parents"): if len(comp[j][label]) > 0: identical = False diff --git a/src/newcrawler/identifiable_adapters.py b/src/newcrawler/identifiable_adapters.py index 0f374e87702cb0fd3a412a5c199f1df8c151a25f..00c8597aaef80a65bc89388dbe9fc3e8accbb87e 100644 --- a/src/newcrawler/identifiable_adapters.py +++ b/src/newcrawler/identifiable_adapters.py @@ -63,6 +63,7 @@ class IdentifiableAdapter(object): if len(ident.parents) != 1: raise RuntimeError("Multiple parents for identifiables not supported.") + # TODO prevent multiple parents query_string = "FIND Record " + ident.get_parents()[0].name query_string += " WITH " @@ -74,6 +75,8 @@ class IdentifiableAdapter(object): query_string += "name='{}' AND".format(ident.name) for p in ident.get_properties(): + # TODO this is badly wrong :-| + if p.datatype is not None and p.datatype.startswith("LIST<"): for v in p.value: query_string += ("references " + str(v.id if isinstance(v, db.Entity) diff --git a/synchronize.md b/synchronize.md new file mode 100644 index 0000000000000000000000000000000000000000..7d240095a770a33d55370d9e48a1ab89b60c02dd --- /dev/null +++ b/synchronize.md @@ -0,0 +1,34 @@ +# Synchronization + +## Goals +Ideally, with the current XML API, we only issue two transactions in order to be as atomic as possible: one insert and one update. +(This should also allow removing all inserted objects if the update fails...) + +## Difficulties + +### Recursive References + +A Record might reference another Record from the list. 
This Record then has to be identified using the appropriate identifiable before the former Record can be inserted or updated. Thus, starting with the leaves of this structure, it needs to be checked for each Record (using the identifiable) whether it exists. If it does not exist, it can be added to the to_be_inserted list. + +If an identifiable contains a reference to, or shall be referenced by, an object in the to_be_inserted list, then it also does not exist and can be added to the to_be_inserted list. + +### Duplicates +It must not happen that an identifiable is checked twice and added twice (or more times) to the to_be_inserted list (e.g. two Experiments with a certain date). Inserting duplicates could be prevented by inserting one Record and then checking the identifiable of the next before another insertion. However, this violates the above goal of having only one insertion. Thus, it is necessary to check whether an identifiable is already in the list without server interaction. + +This should be possible by using a dict with hashes as keys. The hashes can be computed from the identifiables as follows: RT+name+prop1=val+prop2=val+referencedby=A+B+C. Here, references to other objects in the to_be_inserted list (or dict) can be replaced by their hashes. Creating the hash carefully (sorted props etc.) should make it unique such that an identification is possible without a server check. + + + +## Implementation Sketch +Recursively run through the created objects and check whether the identifiable references or is referenced by Records that are not yet checked. Once the identifiable is checked against the server, the following applies: +a) identifiable exists on the server: set the id of the Record object and add it to the to_be_updated list (if an update is required) +b) identifiable does not exist: check the to_be_inserted dict (key is a hash computed as described above and value is the Record object) + 1. hash exists: reuse the value corresponding to the key + 2. 
hash does not exist: add hash and value (Record object) to the dict + +Maybe keep another dict that tracks which Record objects are in the to_be_updated list (id(rec) as key?) + +After treating leaf Records, Records that could not be checked before can be checked: Either the referenced Records now have an ID, or they are in the to_be_inserted dict, such that it is clear that the identifiable at hand does not exist on the server. + +This way, the whole structure can be resolved except if there are circular dependencies: Those can be added fully to the to_be_inserted dict. (???) +