diff --git a/src/newcrawler/crawl.py b/src/newcrawler/crawl.py index b4ec9e4c02b507779153f2cda14e805ebb049028..ab18446943a8dbab2e77496925b2028bd59191c7 100644 --- a/src/newcrawler/crawl.py +++ b/src/newcrawler/crawl.py @@ -205,23 +205,29 @@ class Crawler(object): insertList: list[db.Record] = [] # Walk backwards through list, so that deletion is possible: - for i in range(len(updateList)-1, 0, -1): + for i in reversed(range(len(updateList))): record = updateList[i] identifiable = self.identifiableAdapter.retrieve_identifiable(record) - print(record) - print(identifiable) # if there is no identifiable, move record from update list to insert list: if identifiable is None: insertList.append(record) del updateList[i] + # also update all references to this entity to get the value -x + # where -x is the id of the new entity in insert list + + # also add this entity directly to the list of known entities of the current + # identifiable adapter. + + # any reference to this entity does not need to be compared anymore, as it + # definitely needs updating (the new value cannot have existed before) continue - + comp = compare_entities(record, identifiable) identical = True - for i in range(2): + for j in range(2): for label in ("properties", "parents"): - if len(comp[i][label]) > 0: + if len(comp[j][label]) > 0: identical = False break if not identical: @@ -230,14 +236,13 @@ class Crawler(object): if identical: del updateList[i] continue + else: + pass return (insertList, updateList) - - @staticmethod def debug_build_usage_tree(converter: Converter): - print(converter) res: dict[str, dict[str, Any]] = { converter.name: { "usage": ", ".join(converter.metadata["usage"]), diff --git a/src/newcrawler/identifiable_adapters.py b/src/newcrawler/identifiable_adapters.py index 62868d4d6b0c2a328387e8566cfbdb2d314dd170..0f374e87702cb0fd3a412a5c199f1df8c151a25f 100644 --- a/src/newcrawler/identifiable_adapters.py +++ b/src/newcrawler/identifiable_adapters.py @@ -207,17 +207,28 @@ class 
LocalStorageIdentifiableAdapter(IdentifiableAdapter): return None return identifiable_candidates[0] - @staticmethod - def check_record(record: db.Record, identifiable: db.Record): + def check_record(self, record: db.Record, identifiable: db.Record): + """ + Naming of the parameters could be confusing: + record is the record from the local database to check against. + identifiable is the record that was created during the crawler run. + """ if len(identifiable.parents) != 1: raise RuntimeError("Multiple parents for identifiables not supported.") if not has_parent(record, identifiable.parents[0].name): return False for prop in identifiable.properties: prop_record = record.get_property(prop.name) + # if prop is an entity, it needs to be resolved first. + # there are two different cases: + # a) prop_record.value has a registered identifiable: + # in this case, fetch the identifiable and set the value accordingly + if prop_record is None: return False if prop.value != prop_record.value: + if prop.name == "project": + breakpoint() return False return True diff --git a/unittests/test_tool.py b/unittests/test_tool.py index 0a5a5f67ed15661cb2be894bfea393ef95e0d4ea..38280773276f9fbc86027058d607d16e1322ef82 100755 --- a/unittests/test_tool.py +++ b/unittests/test_tool.py @@ -291,10 +291,8 @@ def test_crawler_update_list(): assert len(comp[1]["properties"]) == 0 insl, updl = crawler.synchronize(crawler.updateList) - # print(insl) - # print("-" * 28) - # print(updl) - assert 0 + assert len(insl) == 0 + assert len(updl) == 0 def test_identifiable_adapter(): query = IdentifiableAdapter.create_query_for_identifiable(