From e4de31312b54b8310ed7fb7b5698d4e4d3fb2268 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Henrik=20tom=20W=C3=B6rden?= <henrik@trineo.org> Date: Fri, 17 Dec 2021 17:13:13 +0100 Subject: [PATCH] MAINT: introduce the cache in the crawler --- src/newcrawler/crawl.py | 15 +++++++++++---- src/newcrawler/identifiable_adapters.py | 3 +-- 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/src/newcrawler/crawl.py b/src/newcrawler/crawl.py index 48e9c9a5..c1760587 100644 --- a/src/newcrawler/crawl.py +++ b/src/newcrawler/crawl.py @@ -67,6 +67,7 @@ from argparse import RawTextHelpFormatter import caosdb as db from caosdb.common.datatype import is_reference from .stores import GeneralStore, RecordStore +from .identified_cache import IdentifiedCache from .structure_elements import StructureElement, Directory from .converters import Converter, DirectoryConverter from .identifiable_adapters import IdentifiableAdapter, LocalStorageIdentifiableAdapter @@ -103,6 +104,7 @@ class Crawler(object): """ self.global_converters = converters + self.identified_cache = IdentifiedCache() self.recordStore = RecordStore() self.generalStore = generalStore @@ -238,15 +240,19 @@ class Crawler(object): return False return True - def get_identified_record_from_local_cache(self, identifiable: db.Record): + def get_identified_record_from_local_cache(self, record: db.Record): """ returns the identifiable if an identifiable with the same values already exists locally (Each identifiable that is not found on the remote server, is 'cached' locally to prevent that the same identifiable exists twice) """ - raise NotImplementedError() + identifiable = self.identifiableAdapter.get_identifiable(record) + if identifiable in self.identified_cache: + return self.identified_cache[identifiable] + else: + return None - def add_identified_record_to_local_cache(self, identifiable: db.Record): + def add_identified_record_to_local_cache(self, record: db.Record): """ adds the given identifiable to the local cache @@ -254,7 +260,8 @@ class Crawler(object): (Each identifiable that is not found on the remote server, is 'cached' locally to prevent that the same identifiable exists twice) """ - raise NotImplementedError() + identifiable = self.identifiableAdapter.get_identifiable(record) + self.identified_cache.add(identifiable=identifiable, record=record) def copy_attributes(self, fro: db.Entity, to: db.Entity): raise NotImplementedError() diff --git a/src/newcrawler/identifiable_adapters.py b/src/newcrawler/identifiable_adapters.py index 06573d3a..27f2bae1 100644 --- a/src/newcrawler/identifiable_adapters.py +++ b/src/newcrawler/identifiable_adapters.py @@ -102,7 +102,6 @@ class IdentifiableAdapter(object): def resolve_references(self, record: db.Record): pass - # TODO rename retrieve_registered_identifiable? def get_identifiable(self, record: db.Record): """ retrieve the registred identifiable and fill the property values to create an @@ -126,7 +125,7 @@ class IdentifiableAdapter(object): # case A: in the registered identifiable # case B: in the identifiable - #TODO use id if value is Entity + # TODO use id if value is Entity identifiable.add_property(record.get_property(prop.name)) property_name_list_A.append(prop.name) -- GitLab