diff --git a/src/caoscrawler/crawl.py b/src/caoscrawler/crawl.py
index f93c0ec436e4106c22f3bb065137394c54f12754..fd348e224131dc301c7d99555c80948392b5cbff 100644
--- a/src/caoscrawler/crawl.py
+++ b/src/caoscrawler/crawl.py
@@ -204,6 +204,10 @@ class Crawler(object):
             Please use SecurityMode Enum
         """
 
+        # Remove this once the property `crawled_data` is no longer needed for compatibility
+        # reasons
+        self._crawled_data = None
+
         # The following caches store records, where we checked whether they exist on the remote
         # server. Since, it is important to know whether they exist or not, we store them into two
         # different caches.
@@ -254,10 +258,22 @@ class Crawler(object):
             "The function start_crawling in the crawl module is deprecated. "
             "Please use scan_structure_elements from the scanner module."))
 
-        self.generate_run_id()
-
-        return scan_structure_elements(
+        data = scan_structure_elements(
             items, crawler_definition, converter_registry, restrict_path)
+        self.crawled_data = data
+        return data
+
+    @property
+    def crawled_data(self):
+        warnings.warn(DeprecationWarning(
+            "The use of self.crawled_data is deprecated. You should not access this variable. "
+            "Instead, create the data with the scanner and then pass it as an argument to "
+            "Crawler functions."))
+        return self._crawled_data
+
+    @crawled_data.setter
+    def crawled_data(self, arg):
+        self._crawled_data = arg
 
     def crawl_directory(self,
                         crawled_directory: str,
@@ -267,15 +283,16 @@ class Crawler(object):
 
         The new main function to run the crawler on a directory.
         """
 
+        warnings.warn(DeprecationWarning(
+            "The function crawl_directory in the crawl module is deprecated. "
+            "Please use scan_directory from the scanner module."))
         self.crawled_directory = crawled_directory
-        self.generate_run_id()
-
-        # TODO: This is not ideal yet, the data is just returned and needs to be
-        # separately supplied to the synchronize function.
-        return scan_directory(crawled_directory,
+        data = scan_directory(crawled_directory,
                               crawler_definition_path,
                               restricted_path)
+        self.crawled_data = data
+        return data
 
     def _has_reference_value_without_id(self, ident: Identifiable) -> bool:
         """
@@ -791,9 +808,10 @@ class Crawler(object):
         update_cache.insert(to_be_updated, run_id)
 
     def synchronize(self,
-                    crawled_data: list[db.Record],
                     commit_changes: bool = True,
-                    unique_names=True):
+                    unique_names: bool = True,
+                    crawled_data: Optional[list[db.Record]] = None,
+                    ):
         """
         This function applies several stages:
         1) Retrieve identifiables for all records in crawled_data.
@@ -808,6 +826,13 @@ class Crawler(object):
 
         Return the final to_be_inserted and to_be_updated as tuple.
         """
+        if crawled_data is None:
+            warnings.warn(DeprecationWarning(
+                "Calling synchronize without the data to be synchronized is deprecated. "
+                "Please use, for example, the scanner to create this data."))
+            crawled_data = self.crawled_data
+
+        self.generate_run_id()
         to_be_inserted, to_be_updated = self.split_into_inserts_and_updates(crawled_data)
         referencing_entities = self.create_reference_mapping(to_be_updated + to_be_inserted)
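
The migration path implied by this diff, as a minimal usage sketch. The import paths, the no-argument Crawler() constructor, and the "cfood.yml" definition path are assumptions inferred from the file paths and signatures shown above, not verified against the package:

    # Deprecated pattern: crawl_directory() scans and stashes the result in the
    # crawled_data compatibility property, which synchronize() falls back to.
    from caoscrawler.crawl import Crawler  # assumed import path

    crawler = Crawler()
    crawler.crawl_directory("/path/to/data", "cfood.yml")  # emits DeprecationWarning
    crawler.synchronize()                                  # emits DeprecationWarning

    # Recommended pattern: scan with the scanner module, then hand the records
    # to synchronize() explicitly. Per the docstring, synchronize() returns the
    # final to_be_inserted and to_be_updated as a tuple.
    from caoscrawler.scanner import scan_directory  # assumed import path

    records = scan_directory("/path/to/data", "cfood.yml")
    crawler = Crawler()
    to_be_inserted, to_be_updated = crawler.synchronize(crawled_data=records)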