folder structure for module corrected

532aa60b · Alexander Schlemmer · e179b2ce · e179b2ce · 532aa60b · 532aa60b
Commit 532aa60b authored 3 years ago by Alexander Schlemmer
--- a/src/crawl.py
+++ b/src/crawl.py
-#!/usr/bin/env python
-# The prototype for a new crawler for CaosDB
-# A. Schlemmer, 06/2021
-import argparse
-import os
-import sys
-import yaml
-import re
-def crawl_cfood(dirname: str,
-                cfood: str):
-    """
-    Crawl a single cfood.
-    """
-    # Load the cfood from a yaml file:
-    with open(cfood, "r") as f:
-        cf = yaml.load(f, Loader=yaml.SafeLoader)
-    for currentpath, dirs, files in os.walk(dirname):
-        # for current nodes of type dir look in the list of dirs for matches
-        # dir is the default
-        if current_node["type"] == "dir":
-            for dirname in dirs:
-                pass
-        elif current_node["type"] == "file":
-            for filename in files:
-                pass
-        else:
-            # work in progress
-            pass
-def crawl(dirname: str,
-          cfoods: list[str]):
-    """
-    Craw a given file hierarchy.
-    dirname : the root path of the file tree to be crawled
-    cfoods  : a list of filenames of cfood files
-    """
-    # simplified for testing:
-    for cfood in cfoods:
-        crawl_cfood(dirname, cfood)
-def main():
-    crawl(sys.args[1], [sys.args[2]])
-if __name__ == "__main__":
-    main()
--- a/src/__init__.py
+++ b/src/__init__.py
--- a/src/newcrawler/crawl.py
+++ b/src/newcrawler/crawl.py
+#!/usr/bin/env python
+# The prototype for a new crawler for CaosDB
+# A. Schlemmer, 06/2021
+import argparse
+import os
+import sys
+import yaml
+import re
+import json
+def match_file_object(node: dict,
+                      filename: str):
+    """
+    Try to match a filename with the supplied node.
+    This function only uses the current path name specified by filename.
+    It does not check whether the file system object behind that path is valid
+    and matching the type of the node.
+    The regular expression in node["re"] is applied with re.match semantics,
+    i.e. it is anchored at the beginning of filename but not at its end.
+    If node["case"] is "insensitive" the match ignores case; a missing "case"
+    entry is treated as "sensitive".
+    Parameters
+    ----------
+    node : A dictionary containing the matcher (at least the key "re").
+    filename : A filename to match.
+    Returns
+    -------
+    A (shallow) copy of the node with an additional key "value" if the node
+    matches. The value is a dictionary mapping 0 to the complete match, 1..n to
+    the numbered groups and each group name to the corresponding named group.
+    If it does not match this function returns None.
+    Raises
+    ------
+    ValueError if the node already contains the key "value".
+    """
+    if "value" in node:
+        raise ValueError("This node already contains a value.")
+    flags = 0
+    if node.get("case", "sensitive") == "insensitive":
+        flags |= re.IGNORECASE
+    # The flags have to be handed to the compiler, otherwise the "case"
+    # setting of the node has no effect on the match.
+    pattern = re.compile(node["re"], flags)
+    matcher = pattern.match(filename)
+    if matcher is None:
+        return None
+    valnode = node.copy()
+    # Value of node:
+    # - Add the numeric groups
+    # - Add the dictionary groups as well
+    valdict = {0: matcher.group()}
+    for index, group in enumerate(matcher.groups(), start=1):
+        valdict[index] = group
+    valdict.update(matcher.groupdict())
+    valnode["value"] = valdict
+    return valnode
+def get_subnode_with_defaults(node: dict,
+                              key: str):
+    """
+    Return the key from node as subnode setting some important defaults for
+    the cfood specification.
+    Currently this is:
+    - Creating an "re" (regular expression) from the escaped key, if no re is set.
+    - Add type "dir" if no type is present.
+    - Add default case "sensitive" to the node.
+    A subnode that is None (e.g. a yaml key without any content) is treated
+    like an empty dictionary, so it receives all defaults.
+    Parameters
+    ----------
+    node : The dictionary containing the subnode as key.
+    key : The key of the dictionary.
+    Returns
+    -------
+    A (shallow) copy of the subnode including the defaults. The subnode stored
+    in node is not modified.
+    Raises
+    ------
+    ValueError if key is not contained in node.
+    """
+    if key not in node:
+        raise ValueError("Key {} is not in node.".format(key))
+    original = node[key]
+    # An empty yaml mapping entry is loaded as None instead of {}.
+    subnode = {} if original is None else original.copy()
+    if "re" not in subnode:
+        # The key is taken literally, therefore special characters are escaped.
+        subnode["re"] = re.escape(key)
+    subnode.setdefault("type", "dir")
+    subnode.setdefault("case", "sensitive")
+    # also add a node name?
+    return subnode
+def crawl_cfood(dirname: str,
+                cfood: str):
+    """
+    Crawl a single cfood.
+    The cfood is loaded from a yaml file. Its "root" entry must contain exactly
+    one key; that entry (with defaults applied) is the node which the names
+    found while walking the file tree are matched against. Depending on the
+    type of the node either directory names or file names are tested.
+    Matches are printed to stdout.
+    Parameters
+    ----------
+    dirname : The root path of the file tree to be crawled.
+    cfood : The filename of the cfood (yaml) file.
+    Raises
+    ------
+    ValueError if the root of the cfood does not contain exactly one element.
+    """
+    # Load the cfood from a yaml file:
+    with open(cfood, "r") as f:
+        cf = yaml.load(f, Loader=yaml.SafeLoader)
+    # Current way of determining the root node:
+    root_node = cf["root"]
+    # Assume root to have a single element (for now):
+    if len(root_node) != 1:
+        raise ValueError("Only a single cfood root is allowed.")
+    current_node = get_subnode_with_defaults(root_node, next(iter(root_node)))
+    # Strategy: keep a list of currently matching candidates...
+    # NOTE: current_node is not updated during the walk (yet), so the root node
+    # is matched against the entries of every visited directory.
+    for currentpath, dirs, files in os.walk(dirname):
+        # for current nodes of type dir look in the list of dirs for matches
+        # dir is the default
+        if current_node["type"] == "dir":
+            # A separate loop variable keeps the dirname parameter intact.
+            for subdirname in dirs:
+                match = match_file_object(current_node, subdirname)
+                if match is not None:
+                    print(json.dumps(match, indent=2))
+        elif current_node["type"] == "file":
+            for filename in files:
+                # Match the name of the file itself, not a directory name.
+                match = match_file_object(current_node, filename)
+                if match is not None:
+                    print(match)
+        else:
+            # work in progress
+            pass
+def crawl(dirname: str,
+          cfoods: list[str]):
+    """
+    Crawl a given file hierarchy.
+    Each cfood is applied to the file tree one after another using crawl_cfood.
+    Parameters
+    ----------
+    dirname : The root path of the file tree to be crawled.
+    cfoods : A list of filenames of cfood files.
+    """
+    # simplified for testing:
+    for cfood_file in cfoods:
+        crawl_cfood(dirname=dirname, cfood=cfood_file)
+def main():
+    """
+    Command line entry point.
+    Expects two command line arguments: the root path of the file tree to be
+    crawled and the filename of a single cfood file.
+    """
+    # The command line is available as sys.argv (sys.args does not exist).
+    if len(sys.argv) < 3:
+        sys.exit("usage: {} DIRNAME CFOOD".format(sys.argv[0]))
+    crawl(sys.argv[1], [sys.argv[2]])
+if __name__ == "__main__":
+    main()