diff --git a/src/newcrawler/crawl.py b/src/newcrawler/crawl.py index 923bb2baa797e2b6bcee196592b0cb33827f9d87..862600d220010cb6142f0511f1680189fb949a9c 100755 --- a/src/newcrawler/crawl.py +++ b/src/newcrawler/crawl.py @@ -8,89 +8,12 @@ import sys import yaml import re import json +import yaml_header_tools from abc import abstractmethod -def match_file_object(node: dict, - filename: str): - """ - Try to match a filename with the supplied node. - - This function only uses the current path name specified by filename. - It does not check whether the file system object behind that path is valid - and matching the type of the node. - - Parameters - ---------- - node : A dictionary containing the matcher. - filename : A filename to match. - - Returns - ------- - None if the matcher does not match and otherwise a dict with the values of the matcher. - """ - - flags = 0 - if node["case"] == "insensitive": - flags += re.IGNORECASE - - regexp = node["re"] - pattern = re.compile(regexp) - matcher = re.match(pattern, filename) - - if matcher is None: - return None - - # Value of node: - # - Add the numeric groups - # - Add the dictionary groups as well - - valdict = {0: matcher.group()} - for i in range(len(matcher.groups())): - valdict[i+1] = matcher.group(i+1) - for k, v in matcher.groupdict().items(): - valdict[k] = v - - return valdict - -def get_subnode_with_defaults(node: dict, - key: str): - """ - Return the key from node as subnode setting some important defaults for - the cfood specification. - - Currently this is: - - Creating an "re" (regular expression) from the key, if no re is set. - - Add type "dir" if no type is present. - - Add default case "sensitive" to the node. - - Parameters - ---------- - node : The dictionary containing the subnode as key. - key : The key of the dictionary. - - Returns - ------- - The subnode including the defaults. - """ - - if key not in node: - raise ValueError("Key {} is not in node.".format(key)) - - subnode = node[key] - - if "re" not in subnode: - subnode["re"] = re.escape(key) - if "type" not in subnode: - subnode["type"] = "dir" - if "case" not in subnode: - subnode["case"] = "sensitive" - if "nodeName" not in subnode: - subnode["nodeName"] = key - - return subnode def match_complete(node: dict): """Determine whether the match is complete. @@ -121,13 +44,84 @@ class InformationBackend(object): return @abstractmethod - def sub_matcher(self, current_node, current_element): + def sub_matcher(self, current_node, current_element, subelement): pass class DirectoryInformationBackend(InformationBackend): - def __init__(current_dir): + def __init__(self, current_dir): self.current_dir = current_dir + def add_defaults(self, node: dict): + """ + Return the key from node as subnode setting some important defaults for + the cfood specification. + + Currently this is: + - Creating an "re" (regular expression) from the key, if no re is set. + - Add type "dir" if no type is present. + - Add default case "sensitive" to the node. + + Parameters + ---------- + node : The dictionary containing the subnode as key. + key : The key of the dictionary. + + Returns + ------- + The subnode including the defaults. + """ + + if "re" not in node: + node["re"] = re.escape(node["nodeName"]) + + if "type" not in node: + node["type"] = "dir" + + if "case" not in node: + node["case"] = "sensitive" + + def match_file_object(self, current_node: dict, + current_element: str): + """ + Try to match a filename with the supplied current_node. + + This function only uses the current path name specified by filename. + It does not check whether the file system object behind that path is valid + and matching the type of the current_node. + + Parameters + ---------- + current_node : A dictionary containing the matcher. + filename : A filename to match. + + Returns + ------- + None if the matcher does not match and otherwise a dict with the values of the matcher. + """ + + flags = 0 + if current_node["case"] == "insensitive": + flags += re.IGNORECASE + + regexp = current_node["re"] + pattern = re.compile(regexp) + matcher = re.match(pattern, current_element) + + if matcher is None: + return None + + # Value of current_node: + # - Add the numeric groups + # - Add the dictionary groups as well + + valdict = {0: matcher.group()} + for i in range(len(matcher.groups())): + valdict[i+1] = matcher.group(i+1) + for k, v in matcher.groupdict().items(): + valdict[k] = v + + return valdict + def check_type(self, current_node, current_element): path = os.path.join(self.current_dir, current_element) @@ -141,74 +135,82 @@ class DirectoryInformationBackend(InformationBackend): def list_elements_function(self): return os.listdir(self.current_dir) - def sub_matcher(self, current_node, subelement): + def sub_matcher(self, current_node, current_element, subelement): path = os.path.join(self.current_dir, current_element) if current_node["type"] == "dir": - match_current_dir_node(path, subelement) + match_current_dir_node(subelement, + DirectoryInformationBackend(path)) elif current_node["type"] == "file": if current_node["representer"] == "markdown": - match_markdown_node(path, subelement) + print("MARKDOWN") + match_current_dir_node(subelement, + MarkdownInformationBackend(path)) else: raise RuntimeError("Not implemented") -def match_current_dir_node(current_dir, current_node): - """Do the recursive matching in the file tree. +class MarkdownInformationBackend(InformationBackend): + def __init__(self, filename=None, header=None): + """ + + Parameters + ---------- + filename : str + The filename of the markdown file. If None, header will be used directly. + header : dict + The header dictionary object. + """ + if filename is None and header is None: + raise ValueError("filename and header cannot both be None.") + + if filename is not None: + self.header = yaml_header_tools.get_header_from_file(filename, clean=False) + else: + self.header = header - """ + def add_defaults(self, node: dict): + if "re" not in node: + node["re"] = ".*" - for element in os.listdir(current_dir): - path = os.path.join(current_dir, element) - - if current_node["type"] == "dir" and not os.path.isdir(path): - continue - elif current_node["type"] == "file" and os.path.isdir(path): - continue - - match = match_file_object(current_node, element) - if match is not None: - if "value" not in current_node: - current_node["value"] = [] - current_node["value"].append(match) + if "type" not in node: + node["type"] = "LIST" - if "children" in current_node: - match["children"] = [] - for subelement_name in current_node["children"]: - subelement = get_subnode_with_defaults( - current_node["children"], subelement_name).copy() - match["children"].append(subelement) - - if current_node["type"] == "dir": - match_current_dir_node(path, subelement) - elif current_node["type"] == "file": - if current_node["representer"] == "markdown": - match_markdown_node(path, subelement) - -def get_dict_match(node, key, value): - """ - Try to match a dict element with key and value with the information supplied in node. + if "case" not in node: + node["case"] = "sensitive" - This is absolutely work-in-progress also in the specification, e.g.: - - It is currently not possible to match the name with a regexp. - """ + def match_file_object(self, current_node: dict, + current_element: str): + """ + Try to match a filename with the supplied current_node. + + This function only uses the current path name specified by filename. + It does not check whether the file system object behind that path is valid + and matching the type of the current_node. + + Parameters + ---------- + current_node : A dictionary containing the matcher. + filename : A filename to match. + + Returns + ------- + None if the matcher does not match and otherwise a dict with the values of the matcher. + """ + + if current_node["nodeName"] != current_element: + return None - if node["type"] == "TEXT": - flags = 0 - if node["case"] == "insensitive": + if current_node["case"] == "insensitive": flags += re.IGNORECASE - if "re" in node: - regexp = node["re"] - else: - regexp = ".*" - + regexp = current_node["re"] pattern = re.compile(regexp) - matcher = re.match(pattern, ) + matcher = re.match(pattern, self.header[current_element]) if matcher is None: return None - # Value of node: + # Value of current_node: # - Add the numeric groups # - Add the dictionary groups as well @@ -217,22 +219,55 @@ def get_dict_match(node, key, value): valdict[i+1] = matcher.group(i+1) for k, v in matcher.groupdict().items(): valdict[k] = v - else: - raise RuntimeError("Only TEXT is supported at the moment.") - return valdict - -def match_dict_node(current_dict, current_node): - for key, value in current_dict: + return valdict + def list_elements_function(self): + print(list(self.header.keys())) + return self.header + + def check_type(self, current_node, current_element): + if current_node["type"] == "LIST" and not type(self.header[current_element]) == list: + return False + if current_node["type"] == "TEXT" and not type(self.header[current_element]) == str: + return False + return True -def match_markdown_node(current_dir, current_node): - import yaml_header_tools + def sub_matcher(self, current_node, current_element, subelement): + print(current_node) + if current_node["type"] == "LIST": + print("sub ok") + match_current_dir_node(subelement, + MarkdownInformationBackend(header=self.header[current_element])) + else: + pass - header = yaml_header_tools.get_header_from_file(current_dir) - match_dict_node(header, current_node) - +def match_current_dir_node(current_node, information_backend): + """Do the recursive matching in the file tree. + + """ + information_backend.add_defaults(current_node) + + for element in information_backend.list_elements_function(): + if not information_backend.check_type(current_node, element): + continue + + match = information_backend.match_file_object(current_node, element) + if match is not None: + if "value" not in current_node: + current_node["value"] = [] + current_node["value"].append(match) + + if "children" in current_node: + match["children"] = [] + for subelement_name in current_node["children"]: + subelement = current_node["children"][subelement_name].copy() + subelement["nodeName"] = subelement_name + match["children"].append(subelement) + + information_backend.sub_matcher(current_node, element, subelement) + def crawl_cfood(dirname: str, cfood: str): @@ -249,12 +284,13 @@ def crawl_cfood(dirname: str, # Assume root to have a single element (for now): if len(root_node) != 1: raise ValueError("Only a single cfood root is allowed.") - current_node = get_subnode_with_defaults(root_node, list(root_node.keys())[0]) - current_dir = dirname - match_current_dir_node(current_dir, current_node) + root_node_name = list(root_node.keys())[0] + root_node[root_node_name]["nodeName"] = root_node_name + match_current_dir_node(root_node[root_node_name], + DirectoryInformationBackend(dirname)) - return current_node + return root_node diff --git a/tests/scifolder_cfood.yml b/tests/scifolder_cfood.yml index f8dc2023bf6139011420953ceab09c3fa2621b5c..048e194b53ff792f18ac5299f0a4476b9c97bf15 100644 --- a/tests/scifolder_cfood.yml +++ b/tests/scifolder_cfood.yml @@ -25,6 +25,7 @@ root: description: type: TEXT responsible: + type: LIST children: person: type: TEXT