Skip to content
Snippets Groups Projects
Commit b35f2030 authored by Henrik tom Wörden's avatar Henrik tom Wörden
Browse files

Merge branch 'master' into inaccurate_time

parents 68ce0fcc c4692684
No related tags found
No related merge requests found
......@@ -22,7 +22,6 @@ variables:
CI_REGISTRY_IMAGE: $CI_REGISTRY/caosdb-advanced-testenv
# When using dind, it's wise to use the overlayfs driver for
# improved performance.
DOCKER_DRIVER: overlay2
services:
- docker:19.03.0-dind
......@@ -42,8 +41,13 @@ test:
CAOSDB_TAG=dev-latest;
fi
- echo $CAOSDB_TAG
- docker login -u testuser -p $CI_REGISTRY_PASSWORD $CI_REGISTRY
- docker pull $CI_REGISTRY_IMAGE:latest
- time docker load < /image-cache/caosdb-advanced-testenv.tar || true
- time docker load < /image-cache/mariadb.tar || true
- time docker load < /image-cache/caosdb.tar || true
- docker login -u indiscale -p $CI_REGISTRY_PASSWORD $CI_REGISTRY
- time docker pull $CI_REGISTRY_IMAGE:latest
- time docker pull mariadb:10.4
- time docker pull $CI_REGISTRY/caosdb:$CAOSDB_TAG
- EXEPATH=`pwd` CAOSDB_TAG=$CAOSDB_TAG docker-compose
-f .docker/docker-compose.yml up -d
- cd .docker
......@@ -63,19 +67,20 @@ test:
expire_in: 1 week
build-testenv:
tags: [notls]
tags: [cached-dind]
image: docker:18.09
stage: setup
only:
- schedules
script:
- df -h
- docker login -u testuser -p $CI_REGISTRY_PASSWORD $CI_REGISTRY
- docker login -u indiscale -p $CI_REGISTRY_PASSWORD $CI_REGISTRY
# use here general latest or specific branch latest...
- docker build
--file .docker/Dockerfile
-t $CI_REGISTRY_IMAGE:latest .
- docker push $CI_REGISTRY_IMAGE:latest
- docker save $CI_REGISTRY_IMAGE:latest > /image-cache/caosdb-advanced-testenv.tar
cert:
tags: [docker]
......@@ -89,7 +94,7 @@ cert:
- cd .docker
- CAOSHOSTNAME=caosdb-server ./cert.sh
style:
tags: [notls]
tags: [docker]
stage: style
image: $CI_REGISTRY_IMAGE:latest
script:
......
......@@ -24,12 +24,14 @@
#
import argparse
import logging
import sys
from argparse import RawTextHelpFormatter
import caosdb as db
from caosadvancedtools.crawler import Crawler
from caosadvancedtools.guard import INSERT, RETRIEVE, UPDATE, Guard
from caosadvancedtools.verbosity import DEBUG, INFO, QUIET, VERBOSE
from caosadvancedtools.utils import set_log_level
from scifolder import (AnalysisCFood, ExperimentCFood, ProjectCFood,
PublicationCFood, SimulationCFood)
......@@ -48,6 +50,8 @@ def access(path):
if __name__ == "__main__":
set_log_level(logging.DEBUG)
parser = get_parser()
args = parser.parse_args()
......@@ -56,9 +60,8 @@ if __name__ == "__main__":
print("Query done...")
config = db.configuration.get_config()
c = Crawler(use_cache=True, access=access,
verbosity=DEBUG,
food=[ProjectCFood,
ExperimentCFood, AnalysisCFood,
ExperimentCFood, AnalysisCFood,
PublicationCFood, SimulationCFood,
])
c.crawl(files, interactive=False, security_level=UPDATE)
import logging
import sys
logger = logging.getLogger("caosadvancedtools")
logger.setLevel(level=logging.INFO)
logger.addHandler(logging.StreamHandler(sys.stdout))
......@@ -34,15 +34,17 @@ match. This occurs in basically three steps:
3. the identifiables are update based on the date in the file structure
"""
import logging
import re
import caosdb as db
from .guard import global_guard as guard
from .verbosity import DEBUG, INFO, VERBOSE
ENTITIES = {}
logger = logging.getLogger(__name__)
def get_entity(name):
""" Returns the entity with a given name, preferably from a local cache.
......@@ -63,7 +65,7 @@ class AbstractCFood(object):
# function match()
_pattern = None
def __init__(self, crawled_file, access=lambda x: x, verbosity=INFO):
def __init__(self, crawled_file, access=lambda x: x):
""" Abstract base class for Crawler food (CFood).
Parameters
......@@ -80,7 +82,6 @@ class AbstractCFood(object):
self.match = type(self).match(crawled_file)
self.to_be_updated = db.Container()
self.identifiables = db.Container()
self.verbosity = verbosity
self.attached_ones = []
self.attached_filenames = []
......@@ -174,16 +175,12 @@ class AbstractCFood(object):
self.to_be_updated = tmp
if self.verbosity >= INFO:
print("/"*60)
print("UPDATE: updating the following entities")
logger.info("UPDATE: updating the following entities")
for el in self.to_be_updated:
print(el.name if el.name is not None else el.id)
for el in self.to_be_updated:
logger.info("\t" + el.name if el.name is not None else el.id)
if self.verbosity >= DEBUG:
print(self.to_be_updated)
print("/"*60)
logger.debug(self.to_be_updated)
guard.safe_update(self.to_be_updated)
def attach(self, crawled_file):
......@@ -239,7 +236,7 @@ class AbstractCFood(object):
def assure_object_is_in_list(obj, containing_object, property_name,
to_be_updated, datatype=None, verbosity=INFO):
to_be_updated, datatype=None):
"""
Checks whether `obj` is one of the values in the list property
`property_name` of the supplied entity containing_object`.
......@@ -289,16 +286,12 @@ def assure_object_is_in_list(obj, containing_object, property_name,
break
if contained:
if verbosity >= VERBOSE:
print("{} is in {} of entity {}".format(
logger.debug("{} is in {} of entity {}".format(
o, property_name, containing_object.id))
else:
if verbosity >= INFO:
print("/"*60)
print("UPDATE: Appending {} to {} of entity {}".format(
logger.info("UPDATE: Appending {} to {} of entity {}".format(
o, property_name, containing_object.id))
print("/"*60)
current_list.append(o)
update = True
......@@ -307,7 +300,7 @@ def assure_object_is_in_list(obj, containing_object, property_name,
def assure_has_description(entity, description, to_be_updated=None,
verbosity=INFO, force=False):
force=False):
"""
Checks whether `entity` has the description that is passed.
......@@ -320,9 +313,7 @@ def assure_has_description(entity, description, to_be_updated=None,
if entity.description == description:
return
print("/"*60)
print("UPDATE: set description of entity {}".format(entity.id))
print("/"*60)
logger.info("UPDATE: set description of entity {}".format(entity.id))
entity.description = description
if to_be_updated is None:
......@@ -334,7 +325,7 @@ def assure_has_description(entity, description, to_be_updated=None,
to_be_updated.append(entity)
def assure_has_parent(entity, parent, to_be_updated=None, verbosity=INFO,
def assure_has_parent(entity, parent, to_be_updated=None,
force=False, unique=True):
"""
Checks whether `entity` has a parent with name `parent`.
......@@ -354,16 +345,12 @@ def assure_has_parent(entity, parent, to_be_updated=None, verbosity=INFO,
break
if contained:
if verbosity >= VERBOSE:
print("entity {} has parent {}".format(entity.id, parent))
logger.debug("entity {} has parent {}".format(entity.id, parent))
return
if verbosity >= INFO:
print("/"*60)
print("UPDATE: Adding parent {} to entity {}".format(parent,
entity.id))
print("/"*60)
logger.info("UPDATE: Adding parent {} to entity {}".format(parent,
entity.id))
entity.add_parent(parent)
if to_be_updated is None:
......@@ -381,7 +368,7 @@ def assure_has_parent(entity, parent, to_be_updated=None, verbosity=INFO,
def assure_has_property(entity, name, value, to_be_updated=None,
verbosity=INFO, datatype=None):
datatype=None):
"""
Checks whether `entity` has a property `name` with the value `value`.
......@@ -395,11 +382,8 @@ def assure_has_property(entity, name, value, to_be_updated=None,
if entity.description == value:
return
else:
print("/"*60)
print("UPDATE: Adding description with value {} to "
"entity {}".format(
value, entity.id))
print("/"*60)
logger.info("UPDATE: Adding description with value {} to "
"entity {}".format(value, entity.id))
entity.description = value
if to_be_updated is None:
......@@ -421,19 +405,14 @@ def assure_has_property(entity, name, value, to_be_updated=None,
break
if contained:
if verbosity >= VERBOSE:
print("entity {} has property {} with value {}".format(
entity.id,
name, value))
logger.debug("entity {} has property {} with value {}".format(
entity.id, name, value))
return
if verbosity >= INFO:
print("/"*60)
print("UPDATE: Adding property {} with value {} to entity {}".format(
name,
value, entity.id))
print("/"*60)
logger.info(
"UPDATE: Adding property {} with value {} to entity {}".format(
name, value, entity.id))
if datatype is None:
entity.add_property(name=name, value=value)
......
......@@ -36,6 +36,7 @@ match. This occurs in basically three steps:
"""
import logging
import traceback
from datetime import datetime
......@@ -45,13 +46,38 @@ from caosdb.exceptions import TransactionError
from .cache import Cache
from .guard import INSERT, RETRIEVE, UPDATE
from .guard import global_guard as guard
from .verbosity import DEBUG, INFO, VERBOSE
logger = logging.getLogger(__name__)
def separated(text):
    """Return *text* prefixed by a 60-dash separator line and a newline."""
    ruler = "-" * 60
    return ruler + "\n" + text
class UnknownCache(object):
    """Cache of filenames that previously caused match problems.

    The cache is persisted in ``known_cache.db`` in the current working
    directory so that already-reported problem files can be logged at a
    lower level on subsequent runs.
    """

    def __init__(self, interactive=False):
        """Initialize the cache, optionally loading previous entries.

        Parameters
        ----------
        interactive : bool
            If True, the user is asked whether previously unmatched
            filenames shall be loaded from ``known_cache.db``; otherwise
            (or if the user declines) the cache starts empty.
        """
        self.filenames = []
        if interactive and "y" == input(
                "\nDo you want to load filenames that previously were not "
                "matched by any CFood?\nIn that case, they will not show up "
                "again. (y)"):
            # The cache file does not exist before the first save(); do not
            # crash on a fresh working directory.
            try:
                with open("known_cache.db") as fi:
                    self.filenames = [el.strip("\n") for el in fi.readlines()]
            except FileNotFoundError:
                pass

    def save(self):
        """Write all cached filenames to ``known_cache.db``, one per line."""
        with open("known_cache.db", "w") as fi:
            for name in self.filenames:
                fi.write(name + "\n")

    def add(self, el):
        """Append the filename `el` to the in-memory cache."""
        self.filenames.append(el)
from datetime import timedelta
class Crawler(object):
def __init__(self, food, access=lambda x: x, use_cache=False,
abort_on_exception=True, verbosity=INFO):
abort_on_exception=True):
"""
Parameters
----------
......@@ -69,40 +95,31 @@ class Crawler(object):
self.access = access
self.report = db.Container()
self.use_cache = use_cache
self.verbosity = verbosity
self.abort_on_exception = abort_on_exception
if self.use_cache:
self.cache = Cache()
def match(self, files):
def match(self, files, interactive):
errors_occured = False
tbs = []
cfoods = []
matches = {f: 0 for f in files}
matches = {f: [] for f in files}
if self.verbosity >= INFO:
print("-"*60)
print("Matching files against CFoods")
logger.info(separated("Matching files against CFoods"))
for Cfood in self.food:
if self.verbosity >= VERBOSE:
print("Matching against {}...".format(Cfood.__name__))
logger.debug("Matching against {}...".format(Cfood.__name__))
for crawled_file in files:
if self.verbosity >= DEBUG:
print("Matching {}...".format(crawled_file))
if Cfood.match(crawled_file) is not None:
matches[crawled_file] += 1
matches[crawled_file].append(Cfood.__name__)
if self.verbosity >= VERBOSE:
print("{} matched\n{}.".format(
logger.debug("{} matched\n{}.".format(
Cfood.__class__.__name__,
crawled_file))
try:
cfood = Cfood.cook(crawled_file, access=self.access,
verbosity=self.verbosity)
cfood = Cfood.cook(crawled_file, access=self.access)
if cfood is not None:
cfoods.append(cfood)
......@@ -115,42 +132,53 @@ class Crawler(object):
errors_occured = True
tbs.append(e)
if self.verbosity >= INFO:
print("-"*60)
print("CFoods are collecting information...")
logger.info(separated("CFoods are collecting information..."))
for cfood in cfoods:
cfood.collect_information()
if self.verbosity >= INFO:
print("-"*60)
print("Trying to attach files to created CFoods")
logger.info(separated("Trying to attach files to created CFoods"))
for cfood in cfoods:
if self.verbosity >= VERBOSE:
print("Matching against {}...".format(Cfood.__name__))
logger.debug("Matching against {}...".format(Cfood.__name__))
for crawled_file in files:
if self.verbosity >= DEBUG:
print("Matching {}...".format(crawled_file))
if cfood.looking_for(crawled_file):
if self.verbosity >= VERBOSE:
print("{} matched\n{}.".format(
logger.debug("{} matched\n{}.".format(
Cfood.__class__.__name__,
crawled_file))
cfood.attach(crawled_file)
matches[crawled_file] += 1
matches[crawled_file].append(Cfood.__name__)
if self.verbosity >= INFO:
for crawled_file in files:
if matches[crawled_file] == 0:
print("ATTENTION: No matching cfood!")
print("Tried to match {}".format(crawled_file))
# possibly load previously encountered "Missing matches" and
# "Multiple matches"
ucache = UnknownCache(interactive=interactive)
if matches[crawled_file] > 1:
print("Attention: More than one matching cfood!")
print("Tried to match {}".format(crawled_file))
for crawled_file in files:
if len(matches[crawled_file]) == 0:
msg = ("ATTENTION: No matching cfood!\n"
"Tried to match {}".format(crawled_file))
if crawled_file in ucache.filenames:
logger.debug(msg)
else:
logger.warn(msg)
ucache.add(crawled_file)
if len(matches[crawled_file]) > 1:
msg = ("Attention: More than one matching cfood!"
+ "Tried to match {}\n".format(crawled_file)
+ "\tRecordTypes:\t" + ", ".join(
matches[crawled_file]))
if crawled_file in ucache.filenames:
logger.debug(msg)
else:
logger.warn(msg)
ucache.add(crawled_file)
# Save the encountered problem matches
ucache.save()
return cfoods, matches, tbs, errors_occured
......@@ -159,14 +187,12 @@ class Crawler(object):
files = sorted([f.path for f in files])
cfoods, matches, tbs, errors_occured = self.match(files)
cfoods, matches, tbs, errors_occured = self.match(files, interactive)
if interactive and "y" != input("Do you want to continue? (y)"):
return
if self.verbosity >= INFO:
print("-"*60)
print("Creating and updating Identifiables")
logger.info(separated("Creating and updating Identifiables"))
for cfood in cfoods:
try:
......@@ -176,9 +202,7 @@ class Crawler(object):
hashes = self.cache.update_ids_from_cache(
cfood.identifiables)
self.find_or_insert_identifiables(cfood.identifiables,
self.verbosity,
)
self.find_or_insert_identifiables(cfood.identifiables)
if self.use_cache:
self.cache.insert_list(hashes, cfood.identifiables)
......@@ -194,19 +218,15 @@ class Crawler(object):
errors_occured = True
tbs.append(e)
if self.verbosity >= INFO:
print("-"*60)
if errors_occured:
print("Crawler terminated with failures!")
print(tbs)
else:
print("Crawler terminated successfully!")
print("-"*60)
if errors_occured:
logger.warn("Crawler terminated with failures!")
logger.warn(tbs)
else:
logger.info("Crawler terminated successfully!")
# TODO remove static?
@staticmethod
def find_or_insert_identifiables(identifiables, verbosity=INFO):
def find_or_insert_identifiables(identifiables):
""" Sets the ids of identifiables (that do not have already an id from the
cache) based on searching CaosDB and retrieves those entities.
The remaining entities (those which can not be retrieved) have no
......@@ -215,22 +235,15 @@ class Crawler(object):
# looking for matching entities in CaosDB when there is no valid id
# i.e. there was none set from a cache
if verbosity >= VERBOSE:
print("-----------------------------------------------------")
for ent in identifiables:
if ent.id is None or ent.id < 0:
if verbosity >= VERBOSE:
print("Looking for:")
print(ent)
existing = Crawler.find_existing(ent, verbosity=verbosity)
logger.debug("Looking for: {}".format(ent))
existing = Crawler.find_existing(ent)
if existing is not None:
ent.id = existing.id
else:
if verbosity >= DEBUG:
print("Id is known of:")
print(ent)
logger.debug("Id is known of: {}".format(ent))
# insert missing, i.e. those which are not valid
missing_identifiables = db.Container()
......@@ -241,27 +254,22 @@ class Crawler(object):
for ent in missing_identifiables:
ent.id = None
if verbosity >= INFO and len(missing_identifiables) > 0:
print("Going to insert the following entities:")
if len(missing_identifiables) > 0:
logger.info("Going to insert the following entities:")
for ent in missing_identifiables:
print(ent)
logger.info(ent)
if len(missing_identifiables) == 0:
if verbosity >= VERBOSE:
print("No new entities to be inserted.")
logger.debug("No new entities to be inserted.")
else:
guard.safe_insert(missing_identifiables)
if verbosity >= VERBOSE:
print("Retrieving entities from CaosDB...")
logger.debug("Retrieving entities from CaosDB...")
identifiables.retrieve(unique=True, raise_exception_on_error=False)
if verbosity >= VERBOSE:
print("-----------------------------------------------------")
@staticmethod
def find_existing(entity, verbosity=INFO):
def find_existing(entity):
"""searches for an entity that matches the identifiable in CaosDB
Characteristics of the identifiable like, properties, name or id are
......@@ -288,8 +296,7 @@ class Crawler(object):
else:
query_string = "FIND '{}'".format(entity.name)
if verbosity >= VERBOSE:
print(query_string)
logger.debug(query_string)
q = db.Query(query_string)
# the identifiable should identify an object uniquely. Thus the query
# is using the unique keyword
......@@ -309,9 +316,9 @@ class Crawler(object):
def query_files(path):
query_str = "FIND FILE WHICH IS STORED AT " + \
(path if path.endswith("/") else path + "/") + "**"
print("FILES QUERY: " + query_str)
logger.info("FILES QUERY: " + query_str)
files = db.execute_query(query_str)
print("{} FILES TO BE PROCESSED.".format(len(files)))
logger.info("{} FILES TO BE PROCESSED.".format(len(files)))
return files
......
......@@ -45,6 +45,7 @@ class NoValidHeader(Exception):
"using -f option".format(filename))
super().__init__(msg, *args, **kwargs)
class MetadataFileMissing(Exception):
def __init__(self, filename, *args, **kwargs):
......@@ -52,6 +53,7 @@ class MetadataFileMissing(Exception):
msg = "Metadata file README.md missing in " + filename
super().__init__(msg, *args, **kwargs)
class ParseErrorsInHeader(Exception):
def __init__(self, filename, reason, *args, **kwargs):
self.filename = filename
......@@ -59,6 +61,7 @@ class ParseErrorsInHeader(Exception):
msg = "Invalid header in {}. Reason: {}".format(filename, reason)
super().__init__(msg, *args, **kwargs)
TEMPLATEHEADER = """
---
responsible:
......@@ -185,7 +188,6 @@ def save_header(filename, header_data):
f.writelines(textlines)
def add_header(filename, header_dict=None):
"""
Add a header to an md file.
......@@ -195,10 +197,10 @@ def add_header(filename, header_dict=None):
If header_dict is a dictionary and not None the header
will be created based on the keys and values of that dictionary.
"""
if os.path.isdir(filename):
filename = os.path.join(filename, "README.md")
if os.path.exists(filename):
with open(filename) as f:
textlines = f.readlines()
......
......@@ -27,6 +27,7 @@
from . import pandoc_header_tools
def get_header(fn):
    """Return only the parsed header mapping for the file *fn*.

    Thin wrapper around ``pandoc_header_tools.get_header``, which returns a
    tuple; index 2 holds the header data.
    """
    parsed = pandoc_header_tools.get_header(fn)
    return parsed[2]
......
......@@ -22,11 +22,17 @@
# ** end header
#
import logging
import os
import caosdb as db
def set_log_level(level):
    """Set the log level of the ``caosadvancedtools`` package logger.

    Parameters
    ----------
    level : int
        A ``logging`` level constant (e.g. ``logging.DEBUG``,
        ``logging.INFO``).
    """
    logger = logging.getLogger("caosadvancedtools")
    # Bug fix: previously logging.DEBUG was hard-coded here, silently
    # ignoring the `level` argument.
    logger.setLevel(level=level)
def replace_path_prefix(path, old_prefix, new_prefix):
"""
Replaces the prefix old_prefix in path with new_prefix.
......
#!/usr/bin/env python
# encoding: utf-8
#
# This file is a part of the CaosDB Project.
#
# Copyright (C) 2018 Research Group Biomedical Physics,
# Max-Planck-Institute for Dynamics and Self-Organization Göttingen
# Copyright (C) 2019 Henrik tom Wörden
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
"""
Verbosity levels are: quiet, info, verbose, debug
"""
QUIET = 0
INFO = 1
VERBOSE = 2
DEBUG = 3
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment