From 9bbb91e929f9543503ade21c3a9b8b468f4e3d5b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Henrik=20tom=20W=C3=B6rden?= <h.tomwoerden@indiscale.com>
Date: Tue, 1 Jun 2021 11:06:33 +0000
Subject: [PATCH] MAINT: make output of crawler a bit more precise

---
 README_SETUP.md                                    |  2 +-
 integrationtests/crawl.py                          |  2 +-
 .../extroot/{.cerate_dir => .create_dir}           |  0
 .../2010_TestProject/2019-02-03/README.md          |  2 +-
 .../2010_TestProject/2019-02-03/README.md          |  2 +-
 .../2019-02-03_really_cool_finding/README.md       |  2 +-
 .../2010_TestProject/2019-02-03/README.md          |  2 +-
 setup.py                                           |  4 ++--
 src/caosadvancedtools/crawler.py                   |  9 +++++++--
 src/doc/crawler.rst                                | 14 +++++++++-----
 10 files changed, 24 insertions(+), 15 deletions(-)
 rename integrationtests/extroot/{.cerate_dir => .create_dir} (100%)

diff --git a/README_SETUP.md b/README_SETUP.md
index 9b7b27ec..19f05163 100644
--- a/README_SETUP.md
+++ b/README_SETUP.md
@@ -34,7 +34,7 @@ For testing:
 3. Start an empty (!) CaosDB instance (with the mounted extroot). The
    database will be cleared during testing, so it's important to use
    an empty instance.
-4. Run `test.sh`.
+4. Run `test.sh`.  Note that this may modify the content of the `integrationtests/extroot/` directory.
 
 ## Code Formatting
 `autopep8 -i -r ./`
diff --git a/integrationtests/crawl.py b/integrationtests/crawl.py
index bf72b5f7..65600016 100755
--- a/integrationtests/crawl.py
+++ b/integrationtests/crawl.py
@@ -43,7 +43,7 @@ except ModuleNotFoundError:
         return argparse.ArgumentParser()
 
     def print_success(text):
-        print("Success: "+text)
+        print("Success: " + text)
 
 
 def get_parser():
diff --git a/integrationtests/extroot/.cerate_dir b/integrationtests/extroot/.create_dir
similarity index 100%
rename from integrationtests/extroot/.cerate_dir
rename to integrationtests/extroot/.create_dir
diff --git a/integrationtests/extroot/DataAnalysis/2010_TestProject/2019-02-03/README.md b/integrationtests/extroot/DataAnalysis/2010_TestProject/2019-02-03/README.md
index 71454e89..ce0111b8 100644
--- a/integrationtests/extroot/DataAnalysis/2010_TestProject/2019-02-03/README.md
+++ b/integrationtests/extroot/DataAnalysis/2010_TestProject/2019-02-03/README.md
@@ -1,7 +1,7 @@
 ---
 responsible:	
 - Only Responsible
-description: 	A description of another example analysis.
+description: 	A description of this example analysis.
 
 sources:
 - file:	"/ExperimentalData/2010_TestProject/2019-02-03/*.dat"
diff --git a/integrationtests/extroot/ExperimentalData/2010_TestProject/2019-02-03/README.md b/integrationtests/extroot/ExperimentalData/2010_TestProject/2019-02-03/README.md
index b7e5051c..b9d7ef5e 100644
--- a/integrationtests/extroot/ExperimentalData/2010_TestProject/2019-02-03/README.md
+++ b/integrationtests/extroot/ExperimentalData/2010_TestProject/2019-02-03/README.md
@@ -1,7 +1,7 @@
 ---
 responsible:	
 - Only Responsible
-description: 	A description of another example experiment.
+description: 	A description of this example experiment.
 
 results:
 - file:	"/ExperimentalData/2010_TestProject/2019-02-03/*.dat"
diff --git a/integrationtests/extroot/Publications/Posters/2019-02-03_really_cool_finding/README.md b/integrationtests/extroot/Publications/Posters/2019-02-03_really_cool_finding/README.md
index c95e37ec..6d378818 100644
--- a/integrationtests/extroot/Publications/Posters/2019-02-03_really_cool_finding/README.md
+++ b/integrationtests/extroot/Publications/Posters/2019-02-03_really_cool_finding/README.md
@@ -1,7 +1,7 @@
 ---
 responsible:	
 - Only Responsible
-description: 	A description of another example experiment.
+description: 	A description of this example experiment.
 
 sources:
 - /DataAnalysis/2010_TestProject/2019-02-03/results.pdf
diff --git a/integrationtests/extroot/SimulationData/2010_TestProject/2019-02-03/README.md b/integrationtests/extroot/SimulationData/2010_TestProject/2019-02-03/README.md
index fba1bd48..f978bb77 100644
--- a/integrationtests/extroot/SimulationData/2010_TestProject/2019-02-03/README.md
+++ b/integrationtests/extroot/SimulationData/2010_TestProject/2019-02-03/README.md
@@ -1,7 +1,7 @@
 ---
 responsible:	
 - Only Responsible
-description: 	A description of another example experiment.
+description: 	A description of this example experiment.
 
 results:
 - file:	"*.dat"
diff --git a/setup.py b/setup.py
index 8902af8c..1d824f51 100755
--- a/setup.py
+++ b/setup.py
@@ -155,8 +155,8 @@ def setup_package():
         author='Henrik tom Wörden',
         author_email='h.tomwoerden@indiscale.com',
         install_requires=["caosdb>=0.4.0",
-                          "openpyxl>=3.0.0",
-                          "xlrd==1.2.0",
+                          "pandas>=1.2.0",
+                          "xlrd>=2.0",
                           ],
         packages=find_packages('src'),
         package_dir={'': 'src'},
diff --git a/src/caosadvancedtools/crawler.py b/src/caosadvancedtools/crawler.py
index 1684c970..0a36284a 100644
--- a/src/caosadvancedtools/crawler.py
+++ b/src/caosadvancedtools/crawler.py
@@ -310,8 +310,6 @@ class Crawler(object):
         if self.interactive and "y" != input("Do you want to continue? (y)"):
             return
 
-        logger.info("Inserting or updating Records...")
-
         for cfood in cfoods:
             try:
                 cfood.create_identifiables()
@@ -544,6 +542,10 @@ carefully and if the changes are ok, click on the following link:
 
         logger.debug(cfood.to_be_updated)
         try:
+            if len(cfood.to_be_updated) > 0:
+                logger.info(
+                        "Updating {} Records...".format(
+                            len(cfood.to_be_updated)))
             guard.safe_update(cfood.to_be_updated, unique=False)
         except FileNotFoundError as e:
             logger.info("Cannot access {}. However, it might be needed for"
@@ -605,6 +607,9 @@ carefully and if the changes are ok, click on the following link:
             logger.debug("No new entities to be inserted.")
         else:
             try:
+                logger.info(
+                    "Inserting {} Records...".format(
+                        len(missing_identifiables)))
                 guard.safe_insert(missing_identifiables, unique=False)
             except Exception as e:
                 DataModelProblems.evaluate_exception(e)
diff --git a/src/doc/crawler.rst b/src/doc/crawler.rst
index 7c95dad9..0a710fa1 100644
--- a/src/doc/crawler.rst
+++ b/src/doc/crawler.rst
@@ -75,7 +75,7 @@ The crawler can be executed directly via a python script (usually called
 ``crawl.py``). The script prints the progress and reports potential
 problems. The exact behavior depends on your setup. However, you can
 have a look at the example in the
-`tests <https://gitlab.com/caosdb/caosdb-advanced-user-tools/-/blob/main/integrationtests/full_test/crawl.py>`__.
+`tests <https://gitlab.indiscale.com/caosdb/src/caosdb-advanced-user-tools/-/blob/main/integrationtests/crawl.py>`__.
 
 .. Note:: The crawler depends on the CaosDB Python client, so make sure to install :doc:`pycaosdb
           <caosdb-pylib:getting_started>`.
@@ -86,14 +86,18 @@ Typically, an invocation looks like:
 
 .. code:: python
 
-   python3 crawl.py "/TestData/"
+   python3 crawl.py /someplace/
 
-In this case ``/TestData/`` identifies the path to be crawled **within
-the CaosDB file system**. You can browse the CaosDB file system by
+.. Note:: For trying out the above-mentioned example crawler from the integration tests,
+          make sure that the ``extroot`` directory in the ``integrationtests`` folder is used as
+          CaosDB's extroot directory, and call the crawler indirectly via ``./test.sh``.
+
+In this case ``/someplace/`` identifies the path to be crawled **within
+CaosDB's file system**. You can browse the CaosDB file system by
 opening the WebUI of your CaosDB instance and clicking on “File System”.
 
 In the backend, ``crawl.py`` starts a CQL query
-``FIND File WHICH IS STORED AT /TestData/**`` and crawls the resulting
+``FIND File WHICH IS STORED AT /someplace/**`` and crawls the resulting
 files according to your customized ``CFoods``.
 
 Crawling may consist of two distinct steps: 1. Insertion of files (use
-- 
GitLab