diff --git a/src/crawl.py b/src/crawl.py
deleted file mode 100755
index 6a348f8ab5176665aae048c6e244a26af7384a66..0000000000000000000000000000000000000000
--- a/src/crawl.py
+++ /dev/null
@@ -1,53 +0,0 @@
-#!/usr/bin/env python
-# The prototype for a new crawler for CaosDB
-# A. Schlemmer, 06/2021
-
-import argparse
-import os
-import sys
-import yaml
-import re
-
-def crawl_cfood(dirname: str,
-                cfood: str):
-    """
-    Crawl a single cfood.
-    """
-
-    # Load the cfood from a yaml file:
-    with open(cfood, "r") as f:
-        cf = yaml.load(f, Loader=yaml.SafeLoader)
-
-    for currentpath, dirs, files in os.walk(dirname):
-        # for current nodes of type dir look in the list of dirs for matches
-        # dir is the default
-        if current_node["type"] == "dir":
-            for dirname in dirs:
-                pass
-        elif current_node["type"] == "file":
-            for filename in files:
-                pass
-        else:
-            # work in progress
-            pass
-
-
-def crawl(dirname: str,
-          cfoods: list[str]):
-    """
-    Craw a given file hierarchy.
-
-    dirname : the root path of the file tree to be crawled
-    cfoods : a list of filenames of cfood files
-    """
-
-    # simplified for testing:
-    for cfood in cfoods:
-        crawl_cfood(dirname, cfood)
-
-def main():
-    crawl(sys.args[1], [sys.args[2]])
-
-
-if __name__ == "__main__":
-    main()
diff --git a/src/__init__.py b/src/newcrawler/__init__.py
similarity index 100%
rename from src/__init__.py
rename to src/newcrawler/__init__.py
diff --git a/src/newcrawler/crawl.py b/src/newcrawler/crawl.py
new file mode 100755
index 0000000000000000000000000000000000000000..f441a1a0dea6cfa1ba8134e1d2a3cd6272078503
--- /dev/null
+++ b/src/newcrawler/crawl.py
@@ -0,0 +1,157 @@
+#!/usr/bin/env python
+# The prototype for a new crawler for CaosDB
+# A. Schlemmer, 06/2021
+
+import argparse
+import os
+import sys
+import yaml
+import re
+import json
+
+def match_file_object(node: dict,
+                      filename: str):
+    """
+    Try to match a filename with the supplied node.
+
+    This function only uses the current path name specified by filename.
+    It does not check whether the file system object behind that path is valid
+    and matching the type of the node.
+
+    Parameters
+    ----------
+    node : A dictionary containing the matcher.
+    filename : A filename to match.
+
+    Returns
+    -------
+    A copy of the node with values from the re match object if the node matches.
+    If it does not match this function returns None.
+    """
+
+    if "value" in node:
+        raise ValueError("This node already contains a value.")
+
+    flags = 0
+    if node["case"] == "insensitive":
+        flags += re.IGNORECASE
+
+    regexp = node["re"]
+    # Pass flags so that "case: insensitive" actually takes effect.
+    pattern = re.compile(regexp, flags)
+    matcher = re.match(pattern, filename)
+
+    if matcher is None:
+        return None
+
+    valnode = node.copy()
+
+    # Value of node:
+    # - Add the numeric groups
+    # - Add the dictionary groups as well
+
+    valdict = {0: matcher.group()}
+    for i in range(len(matcher.groups())):
+        valdict[i+1] = matcher.group(i+1)
+    for k, v in matcher.groupdict().items():
+        valdict[k] = v
+
+    valnode["value"] = valdict
+
+    return valnode
+
+
+def get_subnode_with_defaults(node: dict,
+                              key: str):
+    """
+    Return the key from node as subnode setting some important defaults for
+    the cfood specification.
+
+    Currently this is:
+    - Creating an "re" (regular expression) from the key, if no re is set.
+    - Add type "dir" if no type is present.
+    - Add default case "sensitive" to the node.
+
+    Parameters
+    ----------
+    node : The dictionary containing the subnode as key.
+    key : The key of the dictionary.
+
+    Returns
+    -------
+    A copy of the subnode including the defaults.
+    """
+
+    if key not in node:
+        raise ValueError("Key {} is not in node.".format(key))
+
+    subnode = node[key].copy()
+
+    if "re" not in subnode:
+        subnode["re"] = re.escape(key)
+
+    if "type" not in subnode:
+        subnode["type"] = "dir"
+
+    if "case" not in subnode:
+        subnode["case"] = "sensitive"
+
+    # also add a node name?
+
+    return subnode
+
+def crawl_cfood(dirname: str,
+                cfood: str):
+    """
+    Crawl a single cfood.
+    """
+
+    # Load the cfood from a yaml file:
+    with open(cfood, "r") as f:
+        cf = yaml.load(f, Loader=yaml.SafeLoader)
+
+    # Current way of determining the root node:
+    root_node = cf["root"]
+    # Assume root to have a single element (for now):
+    if len(root_node) != 1:
+        raise ValueError("Only a single cfood root is allowed.")
+    current_node = get_subnode_with_defaults(root_node, list(root_node.keys())[0])
+
+    # Strategy: keep a list of currently matching candidates...
+
+    for currentpath, dirs, files in os.walk(dirname):
+        # for current nodes of type dir look in the list of dirs for matches
+        # dir is the default
+        if current_node["type"] == "dir":
+            for dirname in dirs:
+                match = match_file_object(current_node, dirname)
+                if match is not None:
+                    print(json.dumps(match, indent=2))
+        elif current_node["type"] == "file":
+            for filename in files:
+                # Match the file name, not the directory name.
+                match = match_file_object(current_node, filename)
+                if match is not None:
+                    print(match)
+        else:
+            # work in progress
+            pass
+
+
+def crawl(dirname: str,
+          cfoods: list[str]):
+    """
+    Crawl a given file hierarchy.
+
+    dirname : the root path of the file tree to be crawled
+    cfoods : a list of filenames of cfood files
+    """
+
+    # simplified for testing:
+    for cfood in cfoods:
+        crawl_cfood(dirname, cfood)
+
+def main():
+    # sys.argv, not sys.args: args does not exist on the sys module.
+    crawl(sys.argv[1], [sys.argv[2]])
+
+
+if __name__ == "__main__":
+    main()