Commit edf47c2d authored by Henrik tom Wörden

suggestions

parent 221345d0
3 merge requests: !108 Release 0.5.0, !106 Suggestion htw, !104 Create a new scanner module and move functions from crawl module there
Pipeline #34910 failed
@@ -204,6 +204,10 @@ class Crawler(object):
             Please use SecurityMode Enum
         """
+        # Remove this once the property `crawled_data` is no longer needed for compatibility
+        # reasons
+        self._crawled_data = None
+
         # The following caches store records for which we checked whether they exist on the remote
         # server. Since it is important to know whether they exist or not, we store them in two
         # different caches.
@@ -254,10 +258,22 @@ class Crawler(object):
             "The function start_crawling in the crawl module is deprecated. "
             "Please use scan_structure_elements from the scanner module."))
-        self.generate_run_id()
-        return scan_structure_elements(
+        data = scan_structure_elements(
             items, crawler_definition, converter_registry, restrict_path)
+        self.crawled_data = data
+        return data
+
+    @property
+    def crawled_data(self):
+        warnings.warn(DeprecationWarning(
+            "The use of self.crawled_data is deprecated. You should not access this variable. "
+            "Instead, create the data with the scanner and then pass it as an argument to "
+            "Crawler functions."))
+        return self._crawled_data
+
+    @crawled_data.setter
+    def crawled_data(self, arg):
+        self._crawled_data = arg
+
     def crawl_directory(self,
                         crawled_directory: str,
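The property added above follows the standard Python pattern for deprecating direct attribute access: reads go through a getter that emits a DeprecationWarning, while the value itself lives in the private backing field `_crawled_data` initialized in the first hunk, and the setter stays silent so internal code can keep populating it. A minimal self-contained sketch of that pattern (the `Demo` class below is illustrative, not part of the crawler API):

    import warnings

    class Demo:
        """Illustrative only: mirrors the deprecation pattern used for crawled_data."""

        def __init__(self):
            self._crawled_data = None  # private backing field

        @property
        def crawled_data(self):
            # Reading the attribute warns, but still returns the stored value.
            warnings.warn(DeprecationWarning("crawled_data is deprecated."))
            return self._crawled_data

        @crawled_data.setter
        def crawled_data(self, arg):
            # Assignment is silent, so deprecated writes keep working unchanged.
            self._crawled_data = arg

    demo = Demo()
    demo.crawled_data = ["some", "records"]  # setter: no warning
    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")      # DeprecationWarning is hidden by default
        _ = demo.crawled_data                # getter: emits DeprecationWarning
    assert issubclass(caught[0].category, DeprecationWarning)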
@@ -267,15 +283,16 @@ class Crawler(object):
         The new main function to run the crawler on a directory.
         """
+        warnings.warn(DeprecationWarning(
+            "The function crawl_directory in the crawl module is deprecated. "
+            "Please use scan_directory from the scanner module."))
         self.crawled_directory = crawled_directory
-        self.generate_run_id()
-        # TODO: This is not ideal yet, the data is just returned and needs to be
-        # separately supplied to the synchronize function.
-        return scan_directory(crawled_directory,
+        data = scan_directory(crawled_directory,
                               crawler_definition_path,
                               restricted_path)
+        self.crawled_data = data
+        return data
 
     def _has_reference_value_without_id(self, ident: Identifiable) -> bool:
         """
@@ -791,9 +808,10 @@ class Crawler(object):
         update_cache.insert(to_be_updated, run_id)
 
     def synchronize(self,
-                    crawled_data: list[db.Record],
                     commit_changes: bool = True,
-                    unique_names=True):
+                    unique_names: bool = True,
+                    crawled_data: Optional[list[db.Record]] = None,
+                    ):
         """
         This function applies several stages:
         1) Retrieve identifiables for all records in crawled_data.
@@ -808,6 +826,13 @@
         Return the final to_be_inserted and to_be_updated as tuple.
         """
+        if crawled_data is None:
+            warnings.warn(DeprecationWarning(
+                "Calling synchronize without the data to be synchronized is deprecated. "
+                "Please use, for example, the scanner to create this data."))
+            crawled_data = self.crawled_data
+
+        self.generate_run_id()
+
         to_be_inserted, to_be_updated = self.split_into_inserts_and_updates(crawled_data)
         referencing_entities = self.create_reference_mapping(to_be_updated + to_be_inserted)
...
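Taken together, these hunks invert the recommended call pattern: records are produced by the scanner module and passed explicitly to synchronize(), instead of being stashed on the crawler instance by crawl_directory() or start_crawling(). A hedged sketch of the new usage next to the old one (the import paths and the "cfood.yml" definition file are assumptions inferred from this diff, not verified against the actual package layout):

    # Sketch only: module paths are inferred from the diff's references to the
    # "crawl" and "scanner" modules and may differ in the real package.
    from caoscrawler.crawl import Crawler
    from caoscrawler.scanner import scan_directory

    crawler = Crawler()

    # New style: the scanner produces the records ...
    data = scan_directory("/path/to/data", "cfood.yml")
    # ... and they are handed to synchronize() explicitly, which returns the
    # final to_be_inserted and to_be_updated entities as a tuple.
    inserts, updates = crawler.synchronize(crawled_data=data)

    # Old style: still works, but both calls now emit a DeprecationWarning and
    # communicate through the deprecated crawled_data property instead.
    crawler.crawl_directory("/path/to/data", "cfood.yml")
    inserts, updates = crawler.synchronize()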