From 0795545ecac22cff1e05c12c35ff93f07bb1b3f3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Henrik=20tom=20W=C3=B6rden?= <henrik@trineo.org>
Date: Sun, 19 May 2019 17:49:33 +0200
Subject: [PATCH] update crawler

---
 src/caosadvancedtools/cfood.py          |  7 ++-
 src/caosadvancedtools/read_md_header.py | 74 +-----------------------
 src/caosadvancedtools/utils.py          | 77 +++++++++++++++++++++++++
 unittests/test_read_md_header.py        | 19 ++++--
 4 files changed, 99 insertions(+), 78 deletions(-)

diff --git a/src/caosadvancedtools/cfood.py b/src/caosadvancedtools/cfood.py
index 94ac85a0..e89216dc 100644
--- a/src/caosadvancedtools/cfood.py
+++ b/src/caosadvancedtools/cfood.py
@@ -128,7 +128,7 @@ class AbstractCFood(object):
 
     @staticmethod
     # move to api?
-    def set_property(entity, prop, value):
+    def remove_property(entity, prop):
         # TODO only do something when it is necessary?
 
         if isinstance(prop, db.Entity):
@@ -138,6 +138,11 @@ class AbstractCFood(object):
 
         while entity.get_property(name) is not None:
             entity.remove_property(name)
+
+    @staticmethod
+    # move to api? Replaces existing occurrences of prop on entity by value.
+    def set_property(entity, prop, value):
+        AbstractCFood.remove_property(entity, prop)
         entity.add_property(prop, value)
 
     @staticmethod
diff --git a/src/caosadvancedtools/read_md_header.py b/src/caosadvancedtools/read_md_header.py
index 8a6086c5..724b6d51 100644
--- a/src/caosadvancedtools/read_md_header.py
+++ b/src/caosadvancedtools/read_md_header.py
@@ -32,6 +32,7 @@ import caosdb as db
 import yaml
 
 from .cfood import AbstractCFood, get_entity
+from .utils import string_to_person
 
 
 def _clean_header(header):
@@ -140,77 +141,6 @@ it is not at the beginning, it must be preceded by a blank line.
             # print(headerlines)
             raise NoValidHeader(filename)
 
-        return (found_1, found_2, _clean_header(yaml_part))
+        return _clean_header(yaml_part)
 
     raise NoValidHeader(filename)
-
-def parse_responsibles(header):
-    """
-    Extract the responsible person(s) from the yaml header.
-
-    If field responsible is a list every entry from that list will be added as a person.
-    Currently only the format <Firstname> <Lastname> <*> is supported.
-    If it is a simple string, it is added as the only person.
-    """
-    people = []
-
-    if type(header["responsible"]) == list:
-        resplist = header["responsible"]
-    else:
-        resplist = [header["responsible"]]
-
-    for person in header["responsible"]:
-        firstname = person.split(" ")[0]
-        lastname = person.split(" ")[1]
-        p = db.Record()
-        p.add_parent("Person")
-        p.add_property("lastname", lastname)
-        p.add_property("firstname", firstname)
-        people.append(p)
-
-    return people
-
-
-def reference_data_files(entity, header, prefix=""):
-    referenced_files = []
-
-    for glob in get_data_glob(header):
-        if not glob.startswith("/"):
-            glob = os.path.normpath(os.path.join(prefix, glob))
-
-        for dfile in find_file_for_fileglob(glob):
-            if dfile is not None:
-                referenced_files.append(dfile)
-    AbstractCFood.set_property(entity, get_entity("Data"), referenced_files)
-
-
-def get_data_glob(header):
-    if "data" in header:
-        name = "data"
-    elif "results" in header:
-        name = "results"
-    else:
-        raise Exception("no suitable header fields")
-    globs = []
-
-    for el in header[name]:
-        if isinstance(el, dict) and "filename" in el:
-            globs.append(el["filename"])
-        else:
-            globs.append(el)
-
-    return globs
-
-
-def find_file_for_fileglob(glob):
-    # TODO this probalby needs to be extended for different kinds of glob
-    query_string = "FIND file which is stored at {}".format(glob)
-    print(query_string)
-
-    return db.execute_query(query_string)
-
-
-def find_records_referencing_files(glob):
-    # is it possible to reference the "data records"? anything might
-    # reference this file
-    db.execute_query("FIND Record which references file which is stored at")
diff --git a/src/caosadvancedtools/utils.py b/src/caosadvancedtools/utils.py
index 1a8eaf98..b33fee4b 100644
--- a/src/caosadvancedtools/utils.py
+++ b/src/caosadvancedtools/utils.py
@@ -24,6 +24,8 @@
 
 import os
 
+import caosdb as db
+
 
 def replace_path_prefix(path, old_prefix, new_prefix):
     """
@@ -31,8 +33,83 @@ def replace_path_prefix(path, old_prefix, new_prefix):
 
     Raises a RuntimeError when the path does not start with old_prefix.
     """
+
     if not path.startswith(old_prefix):
         raise RuntimeError("Path does not start with old_prefix")
     path = path[len(old_prefix):]
 
     return os.path.join(new_prefix, path)
+
+
+def string_to_person(person):
+    """
+    Creates a Person Record from a string.
+
+    Only the format "<Firstname> <Lastname> <*>" is supported. Words may be
+    separated by any whitespace; fewer than two words raise an IndexError.
+    """
+    words = person.split()
+    pr = db.Record()
+    pr.add_parent("Person")
+    pr.add_property("lastname", words[1])
+    pr.add_property("firstname", words[0])
+
+    return pr
+
+
+def read_field_as_list(field):
+    """
+    Returns field unchanged if it is a list and wraps it in a list otherwise.
+
+    E.g. in yaml headers entries can be single values or lists; with this
+    function both cases can be treated uniformly as a list.
+    """
+    is_list = isinstance(field, list)
+
+    return field if is_list else [field]
+
+
+def find_file_included_by(glob):
+    """
+    Queries CaosDB for the files that are included by a glob. The glob needs
+    to be according to CaosDB rules. The query string is printed beforehand.
+    """
+    query = "FIND file which is stored at {}".format(glob)
+    print(query)
+    found = db.execute_query(query)
+
+    return found
+
+
+def assure_absolute_path_in_glob(glob, prefix):
+    """
+    Prefixes a relative glob with some path.
+
+    Sometimes files are defined by a relative glob (e.g. "scripts/**"). In
+    order to search for such files in CaosDB, these globs are prefixed with
+    the current location.
+
+    A relative glob is identified by a missing "/" in the beginning. The
+    returned glob is normalized with os.path.normpath in either case.
+    """
+    if glob.startswith("/"):
+        absolute = glob
+    else:
+        absolute = os.path.join(prefix, glob)
+
+    return os.path.normpath(absolute)
+
+
+def return_field_or_property(value, prop=None):
+    """
+    Returns the value itself or a property of it.
+
+    Typical in yaml headers is that a field might sometimes contain a single
+    value and other times a dict itself. This function returns ``value[prop]``
+    if value is a dict that contains the key prop and value itself otherwise.
+    """
+
+    if isinstance(value, dict) and prop in value:
+        return value[prop]
+
+    return value
diff --git a/unittests/test_read_md_header.py b/unittests/test_read_md_header.py
index ab4ba217..21c641f7 100644
--- a/unittests/test_read_md_header.py
+++ b/unittests/test_read_md_header.py
@@ -26,7 +26,7 @@ from copy import deepcopy
 from tempfile import NamedTemporaryFile
 
 import caosdb as db
-from caosadvancedtools.read_md_header import get_data_glob, get_header
+from caosadvancedtools.read_md_header import get_header
 
 
 def to_file(string):
@@ -44,12 +44,21 @@ class ReadHeaderTest(unittest.TestCase):
     def tearDown(self):
         pass
 
-    def test_hash(self):
+    def test_list(self):
         file_content = """
 ---
 data:
-    - results/**
+    - results
+    - data
 ...
 """
-        header = get_header(to_file(file_content))[2]
-        assert get_data_glob(header)[0] == "results/**"
+        # a field given as yaml list has to be returned as a list ...
+        assert type(get_header(to_file(file_content))["data"]) is list
+        file_content = """
+---
+data:
+    - data
+...
+"""
+        # ... even if that list has only a single entry
+        assert type(get_header(to_file(file_content))["data"]) is list
-- 
GitLab