Commit bd146bd3 authored by Alexander Schlemmer

MAINT: removed old synchronization function and refactored init method

parent f1636f29
2 merge requests: !108 Release 0.5.0, !104 Create a new scanner module and move functions from crawl module there
@@ -4,8 +4,8 @@
 # ** header v3.0
 # This file is a part of the CaosDB Project.
 #
-# Copyright (C) 2021 Henrik tom Wörden
-#               2021-2023 Alexander Schlemmer (alexander.schlemmer@ds.mpg.de)
+# Copyright (C) 2021 Henrik tom Wörden <h.tomwoerden@indiscale.com>
+#               2021-2023 Alexander Schlemmer <alexander.schlemmer@ds.mpg.de>
 #
 # This program is free software: you can redistribute it and/or modify
 # it under the terms of the GNU Affero General Public License as
@@ -67,6 +67,8 @@
 from .stores import GeneralStore, RecordStore
 from .structure_elements import StructureElement, Directory, NoneElement
 from .version import check_cfood_version
+from .scanner import scan_directory
+
 logger = logging.getLogger(__name__)

 SPECIAL_PROPERTIES_STRICT = ("description", "name", "id", "path")
@@ -175,27 +177,13 @@ class Crawler(object):
     """

     def __init__(self,
-                 generalStore: Optional[GeneralStore] = None,
-                 debug: bool = False,
-                 identifiableAdapter: IdentifiableAdapter = None,
-                 securityMode: SecurityMode = SecurityMode.UPDATE
-                 ):
+                 identifiableAdapter: Optional[IdentifiableAdapter] = None,
+                 securityMode: SecurityMode = SecurityMode.UPDATE):
         """
         Create a new crawler and initialize an empty RecordStore and GeneralStore.

         Parameters
         ----------
-        recordStore : GeneralStore
-             An initial GeneralStore which might store e.g. environment variables.
-        debug : bool
-             Create a debugging information tree when set to True.
-             The debugging information tree is a variable stored in
-             self.debug_tree. It is a dictionary mapping directory entries
-             to a tuple of general stores and record stores which are valid for
-             the directory scope.
-             Furthermore, it is stored in a second tree named self.debug_copied whether the
-             objects in debug_tree had been copied from a higher level in the hierarchy
-             of the structureelements.
         identifiableAdapter : IdentifiableAdapter
              TODO describe
         securityMode : int
@@ -208,30 +196,30 @@ class Crawler(object):
         # different caches.
         self.remote_existing_cache = IdentifiedCache()
         self.remote_missing_cache = IdentifiedCache()
-        self.recordStore = RecordStore()
         self.securityMode = securityMode
-        self.generalStore = generalStore
-        if generalStore is None:
-            self.generalStore = GeneralStore()
         self.identifiableAdapter: IdentifiableAdapter = LocalStorageIdentifiableAdapter()
         if identifiableAdapter is not None:
             self.identifiableAdapter = identifiableAdapter
-        # If a directory is crawled this may hold the path to that directory
-        self.crawled_directory: Optional[str] = None
-        self.debug = debug

-    def synchronize(self, commit_changes: bool = True, unique_names=True):
+    def crawl_directory(self,
+                        crawled_directory: str,
+                        crawler_definition_path: str,
+                        restricted_path: Optional[list[str]] = None):
         """
-        Carry out the actual synchronization.
+        The new main function to run the crawler on a directory.
         """
-        # After the crawling, the actual synchronization with the database, based on the
-        # update list is carried out:
-        return self._synchronize(self.crawled_data, commit_changes, unique_names=unique_names)
+        self.crawled_directory = crawled_directory
+        self.run_id = uuid.uuid1()
+        # TODO: This is not ideal yet, the data is just returned and needs to be
+        # separately supplied to the synchronize function.
+        return scan_directory(crawled_directory,
+                              crawler_definition_path,
+                              restricted_path)

     def _has_reference_value_without_id(self, ident: Identifiable) -> bool:
         """
...@@ -704,7 +692,8 @@ class Crawler(object): ...@@ -704,7 +692,8 @@ class Crawler(object):
return db.Entity(id=id).retrieve() return db.Entity(id=id).retrieve()
@staticmethod @staticmethod
def execute_inserts_in_list(to_be_inserted, securityMode, run_id: uuid.UUID = None, def execute_inserts_in_list(to_be_inserted, securityMode,
run_id: Optional[uuid.UUID] = None,
unique_names=True): unique_names=True):
for record in to_be_inserted: for record in to_be_inserted:
for prop in record.properties: for prop in record.properties:
...@@ -732,7 +721,8 @@ class Crawler(object): ...@@ -732,7 +721,8 @@ class Crawler(object):
_resolve_datatype(prop, entity) _resolve_datatype(prop, entity)
@staticmethod @staticmethod
def execute_updates_in_list(to_be_updated, securityMode, run_id: uuid.UUID = None, def execute_updates_in_list(to_be_updated, securityMode,
run_id: Optional[uuid.UUID] = None,
unique_names=True): unique_names=True):
Crawler.set_ids_and_datatype_of_parents_and_properties(to_be_updated) Crawler.set_ids_and_datatype_of_parents_and_properties(to_be_updated)
logger.debug("UPDATE") logger.debug("UPDATE")
@@ -744,8 +734,10 @@ class Crawler(object):
             update_cache = UpdateCache()
             update_cache.insert(to_be_updated, run_id)

-    def _synchronize(self, crawled_data: list[db.Record], commit_changes: bool = True,
-                     unique_names=True):
+    def synchronize(self,
+                    crawled_data: list[db.Record],
+                    commit_changes: bool = True,
+                    unique_names=True):
         """
         This function applies several stages:
         1) Retrieve identifiables for all records in crawled_data.
......
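With _synchronize() promoted to the public synchronize(), the data returned by crawl_directory() is passed in explicitly rather than read from the crawler. Continuing the sketch above, the second step would presumably be:

    # Second step of the sketched workflow: apply the scanned records to the
    # server, using the keyword arguments from the new signature.
    crawler.synchronize(crawled_data, commit_changes=True, unique_names=True)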