new version with variable information backends

cd2ff85c · Alexander Schlemmer · 0e743800 · cd2ff85c · cd2ff85c
Commit cd2ff85c authored 3 years ago by Alexander Schlemmer
--- a/src/newcrawler/crawl.py
+++ b/src/newcrawler/crawl.py
@@ -8,89 +8,12 @@ import sys
 import yaml
 import re
 import json
+import yaml_header_tools
 from abc import abstractmethod

-def match_file_object(node: dict,
-                      filename: str):
-    """
-    Try to match a filename with the supplied node.
-
-    This function only uses the current path name specified by filename.
-    It does not check whether the file system object behind that path is valid
-    and matching the type of the node.
-
-    Parameters
-    ----------
-    node : A dictionary containing the matcher.
-    filename : A filename to match.
-
-    Returns
-    -------
-    None if the matcher does not match and otherwise a dict with the values of the matcher.
-    """
-
-    flags = 0
-    if node["case"] == "insensitive":
-        flags += re.IGNORECASE
-        
-    regexp = node["re"]
-    pattern = re.compile(regexp)
-    matcher = re.match(pattern, filename)
-
-    if matcher is None:
-        return None
-
-    # Value of node:
-    # - Add the numeric groups
-    # - Add the dictionary groups as well
-
-    valdict = {0: matcher.group()}
-    for i in range(len(matcher.groups())):
-        valdict[i+1] = matcher.group(i+1)
-    for k, v in matcher.groupdict().items():
-        valdict[k] = v
-
-    return valdict
-
-def get_subnode_with_defaults(node: dict,
-                              key: str):
-    """
-    Return the key from node as subnode setting some important defaults for
-    the cfood specification.
-
-    Currently this is:
-    - Creating an "re" (regular expression) from the key, if no re is set.
-    - Add type "dir" if no type is present.
-    - Add default case "sensitive" to the node.
-    
-    Parameters
-    ----------
-    node : The dictionary containing the subnode as key.
-    key : The key of the dictionary.
-
-    Returns
-    -------
-    The subnode including the defaults.
-    """
-
-    if key not in node:
-        raise ValueError("Key {} is not in node.".format(key))
-
-    subnode = node[key]
-
-    if "re" not in subnode:
-        subnode["re"] = re.escape(key)

-    if "type" not in subnode:
-        subnode["type"] = "dir"

-    if "case" not in subnode:
-        subnode["case"] = "sensitive"

-    if "nodeName" not in subnode:
-        subnode["nodeName"] = key
-
-    return subnode

 def match_complete(node: dict):
    """Determine whether the match is complete.
@@ -121,13 +44,84 @@ class InformationBackend(object):
        return

    @abstractmethod
-    def sub_matcher(self, current_node, current_element):
+    def sub_matcher(self, current_node, current_element, subelement):
        pass

 class DirectoryInformationBackend(InformationBackend):
-    def __init__(current_dir):
+    def __init__(self, current_dir):
        self.current_dir = current_dir

+    def add_defaults(self, node: dict):
+        """
+        Set some important defaults for the cfood specification directly on
+        the supplied node, which is modified in place.
+
+        Currently this is:
+        - Creating an "re" (regular expression) from node["nodeName"], if no re is set.
+        - Add type "dir" if no type is present.
+        - Add default case "sensitive" to the node.
+
+        Parameters
+        ----------
+        node : The dictionary that receives the defaults.
+               It must contain the key "nodeName" if no "re" is set.
+
+        Returns
+        -------
+        Nothing; the defaults are stored in node itself.
+        """
+
+        if "re" not in node:
+            node["re"] = re.escape(node["nodeName"])
+
+        if "type" not in node:
+            node["type"] = "dir"
+
+        if "case" not in node:
+            node["case"] = "sensitive"
+
+    def match_file_object(self, current_node: dict,
+                          current_element: str):
+        """
+        Try to match a filename with the supplied current_node.
+
+        This function only uses the current path name specified by current_element.
+        It does not check whether the file system object behind that path is valid
+        and matches the type of the current_node.
+
+        Parameters
+        ----------
+        current_node : A dictionary containing the matcher.
+        current_element : A filename to match.
+
+        Returns
+        -------
+        None if the matcher does not match and otherwise a dict with the values of the matcher.
+        """
+
+        flags = 0
+        if current_node["case"] == "insensitive":
+            flags += re.IGNORECASE
+
+        regexp = current_node["re"]
+        pattern = re.compile(regexp)
+        matcher = re.match(pattern, current_element)
+
+        if matcher is None:
+            return None
+
+        # Value of current_node:
+        # - Add the numeric groups
+        # - Add the dictionary groups as well
+
+        valdict = {0: matcher.group()}
+        for i in range(len(matcher.groups())):
+            valdict[i+1] = matcher.group(i+1)
+        for k, v in matcher.groupdict().items():
+            valdict[k] = v
+
+        return valdict
+
    def check_type(self, current_node, current_element):
        path = os.path.join(self.current_dir, current_element)
        
@@ -141,74 +135,82 @@ class DirectoryInformationBackend(InformationBackend):
    def list_elements_function(self):
        return os.listdir(self.current_dir)

-    def sub_matcher(self, current_node, subelement):
+    def sub_matcher(self, current_node, current_element, subelement):
        path = os.path.join(self.current_dir, current_element)
        if current_node["type"] == "dir":
-            match_current_dir_node(path, subelement)
+            match_current_dir_node(subelement,
+                                   DirectoryInformationBackend(path))
        elif current_node["type"] == "file":
            if current_node["representer"] == "markdown":
-                match_markdown_node(path, subelement)
+                print("MARKDOWN")
+                match_current_dir_node(subelement,
+                                    MarkdownInformationBackend(path))
            else:
                raise RuntimeError("Not implemented")

-def match_current_dir_node(current_dir, current_node):
-    """Do the recursive matching in the file tree.
+class MarkdownInformationBackend(InformationBackend):
+    def __init__(self, filename=None, header=None):
+        """
+
+        Parameters
+        ----------
+        filename : str
+                   The filename of the markdown file. If None, header will be used directly.
+        header : dict
+                 The header dictionary object.
+        """
+        if filename is None and header is None:
+            raise ValueError("filename and header cannot both be None.")
+        
+        if filename is not None:
+            self.header = yaml_header_tools.get_header_from_file(filename, clean=False)
+        else:
+            self.header = header

-    """
+    def add_defaults(self, node: dict):
+        if "re" not in node:
+            node["re"] = ".*"

-    for element in os.listdir(current_dir):
-        path = os.path.join(current_dir, element)
-        
-        if current_node["type"] == "dir" and not os.path.isdir(path):
-            continue
-        elif current_node["type"] == "file" and os.path.isdir(path):
-            continue
-        
-        match = match_file_object(current_node, element)
-        if match is not None:
-            if "value" not in current_node:
-                current_node["value"] = []
-            current_node["value"].append(match)
+        if "type" not in node:
+            node["type"] = "LIST"

-            if "children" in current_node:
-                match["children"] = []
-                for subelement_name in current_node["children"]:
-                    subelement = get_subnode_with_defaults(
-                        current_node["children"], subelement_name).copy()
-                    match["children"].append(subelement)
-                    
-                    if current_node["type"] == "dir":
-                        match_current_dir_node(path, subelement)
-                    elif current_node["type"] == "file":
-                        if current_node["representer"] == "markdown":
-                            match_markdown_node(path, subelement)
-
-def get_dict_match(node, key, value):
-    """
-    Try to match a dict element with key and value with the information supplied in node.
+        if "case" not in node:
+            node["case"] = "sensitive"

-    This is absolutely work-in-progress also in the specification, e.g.:
-    - It is currently not possible to match the name with a regexp.
-    """
+    def match_file_object(self, current_node: dict,
+                          current_element: str):
+        """
+        Try to match a header entry with the supplied current_node.
+
+        The entry only matches if its key current_element is equal to the nodeName
+        of current_node. In that case the "re" of current_node is matched against
+        the header value stored under current_element.
+
+        Parameters
+        ----------
+        current_node : A dictionary containing the matcher.
+        current_element : The key of the header entry to match.
+
+        Returns
+        -------
+        None if the matcher does not match and otherwise a dict with the values of the matcher.
+        """
+
+        if current_node["nodeName"] != current_element:
+            return None

-    if node["type"] == "TEXT":
-        
        flags = 0
-        if node["case"] == "insensitive":
+        if current_node["case"] == "insensitive":
            flags += re.IGNORECASE

-        if "re" in node:
-            regexp = node["re"]
-        else:
-            regexp = ".*"
-            
+        regexp = current_node["re"]
        pattern = re.compile(regexp)
-        matcher = re.match(pattern, )
+        matcher = re.match(pattern, self.header[current_element])

        if matcher is None:
            return None

-        # Value of node:
+        # Value of current_node:
        # - Add the numeric groups
        # - Add the dictionary groups as well

@@ -217,22 +219,55 @@ def get_dict_match(node, key, value):
            valdict[i+1] = matcher.group(i+1)
        for k, v in matcher.groupdict().items():
            valdict[k] = v
-    else:
-        raise RuntimeError("Only TEXT is supported at the moment.")

-    return valdict
-
-def match_dict_node(current_dict, current_node):
-    for key, value in current_dict:
+        return valdict
    
+    def list_elements_function(self):
+        print(list(self.header.keys()))
+        return self.header
+
+    def check_type(self, current_node, current_element):
+        if current_node["type"] == "LIST" and not type(self.header[current_element]) == list:
+            return False
+        if current_node["type"] == "TEXT" and not type(self.header[current_element]) == str:
+            return False
+        return True

-def match_markdown_node(current_dir, current_node):
-    import yaml_header_tools
+    def sub_matcher(self, current_node, current_element, subelement):
+        print(current_node)
+        if current_node["type"] == "LIST":
+            print("sub ok")
+            match_current_dir_node(subelement,
+                                   MarkdownInformationBackend(header=self.header[current_element]))
+        else:
+            pass

-    header = yaml_header_tools.get_header_from_file(current_dir)

-    match_dict_node(header, current_node)
-    
+def match_current_dir_node(current_node, information_backend):
+    """Do the recursive matching using the supplied information backend.
+
+    """
+    information_backend.add_defaults(current_node)
+
+    for element in information_backend.list_elements_function():
+        if not information_backend.check_type(current_node, element):
+            continue
+
+        match = information_backend.match_file_object(current_node, element)
+        if match is not None:
+            if "value" not in current_node:
+                current_node["value"] = []
+            current_node["value"].append(match)
+
+            if "children" in current_node:
+                match["children"] = []
+                for subelement_name in current_node["children"]:
+                    subelement = current_node["children"][subelement_name].copy()
+                    subelement["nodeName"] = subelement_name
+                    match["children"].append(subelement)
+
+                    information_backend.sub_matcher(current_node, element, subelement)
+

 def crawl_cfood(dirname: str,
                cfood: str):
@@ -249,12 +284,13 @@ def crawl_cfood(dirname: str,
    # Assume root to have a single element (for now):
    if len(root_node) != 1:
        raise ValueError("Only a single cfood root is allowed.")
-    current_node = get_subnode_with_defaults(root_node, list(root_node.keys())[0])
-    current_dir = dirname

-    match_current_dir_node(current_dir, current_node)
+    root_node_name = list(root_node.keys())[0]
+    root_node[root_node_name]["nodeName"] = root_node_name
+    match_current_dir_node(root_node[root_node_name],
+        DirectoryInformationBackend(dirname))

-    return current_node
+    return root_node


            

--- a/tests/scifolder_cfood.yml
+++ b/tests/scifolder_cfood.yml
@@ -25,6 +25,7 @@ root:
                  description:
                    type: TEXT
                  responsible:
+                    type: LIST
                    children:
                      person:
                        type: TEXT