caosdb / Software / CaosDB Crawler · Commits · edf47c2d

Commit edf47c2d, authored Mar 24, 2023 by Henrik tom Wörden

    suggestions

Parent: 221345d0
Part of 3 merge requests: !108 "Release 0.5.0", !106 "Suggestion htw", !104 "Create a new scanner module and move functions from crawl module there"
Pipeline #34910 failed on Mar 24, 2023 (stages: info, setup, cert, style, test)
Showing 1 changed file: src/caoscrawler/crawl.py (+35 additions, −10 deletions)
@@ -204,6 +204,10 @@ class Crawler(object):
             Please use SecurityMode Enum
         """
+        # Remove this once the property `crawled_data` is no longer needed for compatibility
+        # reasons
+        self._crawled_data = None
+
         # The following caches store records, where we checked whether they exist on the remote
         # server. Since, it is important to know whether they exist or not, we store them into two
         # different caches.
@@ -254,10 +258,22 @@ class Crawler(object):
             "The function start_crawling in the crawl module is deprecated. "
             "Please use scan_structure_elements from the scanner module."))
-        self.generate_run_id()
-        data = scan_structure_elements(
+        return scan_structure_elements(
             items, crawler_definition, converter_registry, restrict_path)
-        self.crawled_data = data
-        return data
+
+    @property
+    def crawled_data(self):
+        warnings.warn(DeprecationWarning(
+            "The use of self.crawled_data is depricated. You should not access this variable. "
+            "Instead, create the data with the scanner and then pass it as argument to Crawler "
+            "functions"))
+        return self._crawled_data
+
+    @crawled_data.setter
+    def crawled_data(self, arg):
+        self._crawled_data = arg

     def crawl_directory(self,
                         crawled_directory: str,
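The property added in this hunk keeps old `crawler.crawled_data` accesses working while steering callers away from them. A minimal, self-contained sketch of the same back-compatibility pattern (the `Legacy` and `data` names are illustrative, not part of the crawler):

import warnings


class Legacy:
    """Illustrative stand-in, not the crawler itself."""

    def __init__(self):
        # Backing field; remove once the deprecated property is dropped.
        self._data = None

    @property
    def data(self):
        # Old attribute reads still work but warn the caller.
        warnings.warn(DeprecationWarning(
            "Accessing `data` directly is deprecated; pass the value "
            "explicitly to the functions that need it."))
        return self._data

    @data.setter
    def data(self, value):
        self._data = value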
@@ -267,15 +283,16 @@ class Crawler(object):
         The new main function to run the crawler on a directory.
         """
+        warnings.warn(DeprecationWarning(
+            "The function crawl_directory in the crawl module is deprecated. "
+            "Please use scan_directory from the scanner module."))
         self.crawled_directory = crawled_directory
-        self.generate_run_id()
-        data = scan_directory(crawled_directory,
+        # TODO: This is not ideal yet, the data is just returned and needs to be
+        # separately supplied to the synchronize function.
+        return scan_directory(crawled_directory,
                               crawler_definition_path,
                               restricted_path)
-        self.crawled_data = data
-        return data

     def _has_reference_value_without_id(self, ident: Identifiable) -> bool:
         """
@@ -791,9 +808,10 @@ class Crawler(object):
             update_cache.insert(to_be_updated, run_id)

     def synchronize(self,
-                    crawled_data: list[db.Record],
                     commit_changes: bool = True,
-                    unique_names=True):
+                    unique_names: bool = True,
+                    crawled_data: Optional[list[db.Record]] = None,
+                    ):
         """
         This function applies several stages:
         1) Retrieve identifiables for all records in crawled_data.
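Moving `crawled_data` to the end of the signature keeps keyword callers working and makes the argument optional, but a bare positional call now binds to `commit_changes` instead. A small runnable sketch of that hazard (the function below is a stand-in, not the crawler's implementation):

from typing import Optional


def synchronize(commit_changes: bool = True,
                unique_names: bool = True,
                crawled_data: Optional[list] = None):
    # Stand-in body: just report how the arguments were bound.
    return commit_changes, unique_names, crawled_data


records = ["rec_a", "rec_b"]
print(synchronize(records))               # records is bound to commit_changes!
print(synchronize(crawled_data=records))  # correct: pass by keyword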
@@ -808,6 +826,13 @@ class Crawler(object):
         Return the final to_be_inserted and to_be_updated as tuple.
         """
+        if crawled_data is None:
+            warnings.warn(DeprecationWarning(
+                "Calling synchronize without the data to be synchronized is depricated. Please "
+                "use for example the Scanner to create this data."))
+            crawled_data = self.crawled_data
+        self.generate_run_id()
+
         to_be_inserted, to_be_updated = self.split_into_inserts_and_updates(crawled_data)
         referencing_entities = self.create_reference_mapping(to_be_updated + to_be_inserted)
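Taken together, the hunks implement the migration the deprecation messages describe: scan first, then hand the records to `synchronize`. A sketch of the intended new workflow, assuming the scanner functions are importable from a `caoscrawler.scanner` module as the title of MR !104 suggests (the directory path and the `cfood.yml` definition file are placeholders):

from caoscrawler.crawl import Crawler
# Assumption: scan_directory lives in the scanner module introduced
# in MR !104; the exact import path is not shown in this diff.
from caoscrawler.scanner import scan_directory

# Deprecated style, kept working by this commit:
#   crawler = Crawler()
#   crawler.crawl_directory("/path/to/data", "cfood.yml")
#   crawler.synchronize()
#
# New style: scan separately, then pass the records in by keyword
# (crawled_data is now the last, optional parameter of synchronize).
crawler = Crawler()
records = scan_directory("/path/to/data", "cfood.yml")
crawler.synchronize(crawled_data=records)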