diff --git a/src/newcrawler/crawl.py b/src/newcrawler/crawl.py index 5c02ed922d6d404d6a9794d7f53ddc45819d11a0..923bb2baa797e2b6bcee196592b0cb33827f9d87 100755 --- a/src/newcrawler/crawl.py +++ b/src/newcrawler/crawl.py @@ -8,6 +8,7 @@ import sys import yaml import re import json +from abc import abstractmethod def match_file_object(node: dict, filename: str): @@ -25,13 +26,9 @@ def match_file_object(node: dict, Returns ------- - A copy of the node with values from the re match object if the node matches. - If it does not match this function returns None. + None if the matcher does not match and otherwise a dict with the values of the matcher. """ - if "value" in node: - raise ValueError("This node already contains a value.") - flags = 0 if node["case"] == "insensitive": flags += re.IGNORECASE @@ -43,8 +40,6 @@ def match_file_object(node: dict, if matcher is None: return None - valnode = node.copy() - # Value of node: # - Add the numeric groups # - Add the dictionary groups as well @@ -55,10 +50,7 @@ def match_file_object(node: dict, for k, v in matcher.groupdict().items(): valdict[k] = v - valnode["value"] = valdict - - return valnode - + return valdict def get_subnode_with_defaults(node: dict, key: str): @@ -78,13 +70,13 @@ def get_subnode_with_defaults(node: dict, Returns ------- - A copy of the subnode including the defaults. + The subnode including the defaults. """ if key not in node: raise ValueError("Key {} is not in node.".format(key)) - subnode = node[key].copy() + subnode = node[key] if "re" not in subnode: subnode["re"] = re.escape(key) @@ -95,7 +87,8 @@ def get_subnode_with_defaults(node: dict, if "case" not in subnode: subnode["case"] = "sensitive" - # also add a node name? 
# Imported here because the file's import block only brings in
# ``abstractmethod``; ``ABC`` is required for it to have any effect.
from abc import ABC, abstractmethod


class InformationBackend(ABC):
    """Interface for a source of elements that tree nodes are matched against.

    Deriving from ``ABC`` makes the ``@abstractmethod`` markers effective:
    a subclass can only be instantiated once it overrides all three methods.
    """

    @abstractmethod
    def check_type(self, current_node, current_element):
        """Return whether ``current_element`` fits the type of ``current_node``."""

    @abstractmethod
    def list_elements_function(self):
        """Return the elements that are candidates for matching."""

    @abstractmethod
    def sub_matcher(self, current_node, current_element):
        """Run the matching for the level below an element."""


class DirectoryInformationBackend(InformationBackend):
    """Information backend that reads its elements from a directory.

    Parameters
    ----------
    current_dir
        Path of the directory whose entries are offered for matching.
    """

    def __init__(self, current_dir):
        self.current_dir = current_dir

    def check_type(self, current_node, current_element):
        """Check that a directory entry fits the node's ``"type"``.

        Returns
        -------
        False if the node has type "dir" and the entry is not a directory, or
        if the node has type "file" and the entry is a directory. True in all
        other cases (including node types other than "dir" and "file").
        """
        path = os.path.join(self.current_dir, current_element)
        is_directory = os.path.isdir(path)

        if current_node["type"] == "dir" and not is_directory:
            return False
        if current_node["type"] == "file" and is_directory:
            return False
        return True

    def list_elements_function(self):
        """Return the names of all entries of ``self.current_dir``."""
        return os.listdir(self.current_dir)

    def sub_matcher(self, current_node, subelement, current_element=None):
        """Match the child node ``subelement`` below a matched entry.

        Parameters
        ----------
        current_node
            The node that was matched; its "type" (and, for files, its
            "representer") selects the matcher that is used.
        subelement
            The child node that is matched one level below.
        current_element
            Name of the matched entry inside ``self.current_dir``. If omitted,
            ``self.current_dir`` itself is used as the path.

        Raises
        ------
        RuntimeError
            If the node is a file whose representer is not "markdown".
        """
        # NOTE(review): the abstract signature names the second parameter
        # ``current_element`` while this implementation receives the child
        # node there - confirm which one is intended.
        if current_element is None:
            path = self.current_dir
        else:
            path = os.path.join(self.current_dir, current_element)

        if current_node["type"] == "dir":
            match_current_dir_node(path, subelement)
        elif current_node["type"] == "file":
            if current_node.get("representer") == "markdown":
                match_markdown_node(path, subelement)
            else:
                raise RuntimeError("Not implemented")


def match_current_dir_node(current_dir, current_node):
    """Do the recursive matching in the file tree.

    Every entry of ``current_dir`` whose kind (directory or file) fits the
    node's "type" is matched with ``match_file_object``. Each match is appended
    to ``current_node["value"]``. If the node has "children", a copy of every
    child node (with defaults applied) is stored in ``match["children"]`` and
    matched one level below.

    Parameters
    ----------
    current_dir
        Path of the directory whose entries are examined.
    current_node
        The node to match; it is modified in place.
    """
    for element in os.listdir(current_dir):
        path = os.path.join(current_dir, element)
        is_directory = os.path.isdir(path)

        if current_node["type"] == "dir" and not is_directory:
            continue
        if current_node["type"] == "file" and is_directory:
            continue

        match = match_file_object(current_node, element)
        if match is None:
            continue

        current_node.setdefault("value", []).append(match)

        if "children" not in current_node:
            continue

        match["children"] = []
        for subelement_name in current_node["children"]:
            # Copy, so that every match carries its own result for the child.
            subelement = get_subnode_with_defaults(
                current_node["children"], subelement_name).copy()
            match["children"].append(subelement)

            if current_node["type"] == "dir":
                match_current_dir_node(path, subelement)
            elif current_node["type"] == "file":
                # ``.get``: a file node without a representer is skipped just
                # like one with an unsupported representer.
                if current_node.get("representer") == "markdown":
                    match_markdown_node(path, subelement)


def get_dict_match(node, key, value):
    """
    Try to match a dict element with key and value with the information supplied in node.

    This is absolutely work-in-progress also in the specification, e.g.:
    - It is currently not possible to match the name with a regexp.

    Parameters
    ----------
    node
        The node to match. Only the type "TEXT" is supported. "re" is the
        regular expression (default ".*") and "case" may be "insensitive".
    key
        The key of the dict element. It is currently not used for matching.
    value
        The value of the dict element; the regular expression is applied to it.

    Returns
    -------
    None if the value does not match (or is not a string) and otherwise a dict
    with the values of the matcher: the complete match under 0, the numbered
    groups under 1, 2, ... and the named groups under their names.

    Raises
    ------
    RuntimeError
        If the type of the node is not "TEXT".
    """
    if node["type"] != "TEXT":
        raise RuntimeError("Only TEXT is supported at the moment.")

    flags = 0
    # "sensitive" is the default that get_subnode_with_defaults applies.
    if node.get("case", "sensitive") == "insensitive":
        flags |= re.IGNORECASE

    # A regular expression cannot be applied to non-text values.
    if not isinstance(value, str):
        return None

    pattern = re.compile(node.get("re", ".*"), flags)
    matcher = pattern.match(value)
    if matcher is None:
        return None

    # Value of node:
    # - Add the numeric groups
    # - Add the dictionary groups as well
    valdict = {0: matcher.group()}
    for index, group in enumerate(matcher.groups(), start=1):
        valdict[index] = group
    valdict.update(matcher.groupdict())
    return valdict


def match_dict_node(current_dict, current_node):
    """Match all elements of ``current_dict`` against ``current_node``.

    Every element for which ``get_dict_match`` returns a match is appended to
    ``current_node["value"]``, analogous to ``match_current_dir_node``.

    Parameters
    ----------
    current_dict
        The dict whose key/value pairs are examined.
    current_node
        The node to match; it is modified in place.
    """
    # NOTE(review): children of ``current_node`` are not processed here -
    # confirm whether nested matching is needed for dict nodes.
    for key, value in current_dict.items():
        match = get_dict_match(current_node, key, value)
        if match is not None:
            current_node.setdefault("value", []).append(match)


def match_markdown_node(current_dir, current_node):
    """Match the header of a markdown file against ``current_node``.

    Parameters
    ----------
    current_dir
        Despite its name this is passed to
        ``yaml_header_tools.get_header_from_file`` as the file to read.
    current_node
        The node to match against the header; it is modified in place.
    """
    # Imported lazily so the module can be used without this package as long
    # as no markdown file is processed.
    import yaml_header_tools

    header = yaml_header_tools.get_header_from_file(current_dir)

    match_dict_node(header, current_node)