Skip to content
Snippets Groups Projects
Commit 532aa60b authored by Alexander Schlemmer
Browse files

folder structure for module corrected

parent e179b2ce
No related branches found
No related tags found
No related merge requests found
#!/usr/bin/env python
# The prototype for a new crawler for CaosDB
# A. Schlemmer, 06/2021
import argparse
import os
import sys
import yaml
import re
def crawl_cfood(dirname: str,
                cfood: str):
    """
    Crawl a single cfood.

    Parameters
    ----------
    dirname : The root path of the file tree to be crawled.
    cfood : The filename of the cfood (a yaml file) to be applied.

    Raises
    ------
    ValueError if the "root" entry of the cfood does not contain exactly
    one element.
    """
    # Load the cfood from a yaml file:
    with open(cfood, "r") as f:
        cf = yaml.load(f, Loader=yaml.SafeLoader)

    # The node the walk below is compared against has to be defined before
    # the loop; it is the single element below "root" (for now).
    root_node = cf["root"]
    if len(root_node) != 1:
        raise ValueError("Only a single cfood root is allowed.")
    # A yaml key without content is loaded as None, treat it as empty node.
    current_node = dict(next(iter(root_node.values())) or {})
    # dir is the default
    current_node.setdefault("type", "dir")

    for currentpath, dirs, files in os.walk(dirname):
        # for current nodes of type dir look in the list of dirs for matches
        if current_node["type"] == "dir":
            # Use a distinct loop variable: "dirname" is the crawl root.
            for subdir in dirs:
                pass
        elif current_node["type"] == "file":
            for filename in files:
                pass
        else:
            # work in progress
            pass
def crawl(dirname: str,
          cfoods: list[str]):
    """
    Crawl a given file hierarchy with every supplied cfood.

    Parameters
    ----------
    dirname : The root path of the file tree to be crawled.
    cfoods : A list of filenames of cfood files.
    """
    # simplified for testing: each cfood is applied on its own, in order
    for cfood_file in cfoods:
        crawl_cfood(dirname, cfood_file)
def main():
    """
    Command line entry point: crawl a file tree using a single cfood.

    The first positional command line argument is the root path of the file
    tree to be crawled, the second one is the filename of the cfood.
    Missing arguments terminate the program with a usage message.
    """
    # The command line lives in sys.argv (there is no sys.args); argparse
    # reads it and additionally provides usage/help output.
    parser = argparse.ArgumentParser(
        description="Prototype of a new crawler for CaosDB")
    parser.add_argument("dirname",
                        help="root path of the file tree to be crawled")
    parser.add_argument("cfood",
                        help="filename of the cfood (yaml) to be applied")
    args = parser.parse_args()
    # simplified for testing: exactly one cfood
    crawl(args.dirname, [args.cfood])


if __name__ == "__main__":
    main()
File moved
#!/usr/bin/env python
# The prototype for a new crawler for CaosDB
# A. Schlemmer, 06/2021
import argparse
import os
import sys
import yaml
import re
import json
def match_file_object(node: dict,
                      filename: str):
    """
    Try to match a filename with the supplied node.

    This function only uses the current path name specified by filename.
    It does not check whether the file system object behind that path is
    valid and matching the type of the node.

    Parameters
    ----------
    node : A dictionary containing the matcher. The regular expression is
           read from the key "re". If the key "case" is set to
           "insensitive" the match ignores case; any other value (or a
           missing key) results in a case sensitive match.
    filename : A filename to match.

    Returns
    -------
    A copy of the node with values from the re match object if the node
    matches. The values are stored under the key "value" as a dictionary
    containing the whole match (key 0), the numbered groups (keys 1, 2, ...)
    and the named groups (by name).
    If it does not match this function returns None.

    Raises
    ------
    ValueError if the node already contains a "value".
    """
    if "value" in node:
        raise ValueError("This node already contains a value.")

    flags = 0
    if node.get("case") == "insensitive":
        flags |= re.IGNORECASE

    # The flags must be handed over to the match, otherwise the "case"
    # setting of the node has no effect at all.
    matcher = re.match(node["re"], filename, flags)
    if matcher is None:
        return None

    # Value of node:
    # - The complete match as group 0
    # - Add the numeric groups
    # - Add the dictionary groups as well
    valdict = {0: matcher.group()}
    valdict.update(enumerate(matcher.groups(), start=1))
    valdict.update(matcher.groupdict())

    # Shallow copy: the supplied node itself is left untouched.
    valnode = node.copy()
    valnode["value"] = valdict
    return valnode
def get_subnode_with_defaults(node: dict,
                              key: str):
    """
    Return the key from node as subnode setting some important defaults for
    the cfood specification.

    Currently this is:
    - Creating an "re" (regular expression) from the key, if no re is set.
      The key is escaped, so it is matched literally.
    - Add type "dir" if no type is present.
    - Add default case "sensitive" to the node.

    Parameters
    ----------
    node : The dictionary containing the subnode as key.
    key : The key of the dictionary.

    Returns
    -------
    A copy of the subnode including the defaults.

    Raises
    ------
    ValueError if key is not contained in node.
    """
    if key not in node:
        raise ValueError("Key {} is not in node.".format(key))

    # A yaml key without any content is loaded as None. Such a node simply
    # consists of defaults only, so treat it like an empty dictionary.
    raw_subnode = node[key]
    subnode = {} if raw_subnode is None else raw_subnode.copy()

    # setdefault only fills in values which are not present yet.
    subnode.setdefault("re", re.escape(key))
    subnode.setdefault("type", "dir")
    subnode.setdefault("case", "sensitive")
    # also add a node name?
    return subnode
def crawl_cfood(dirname: str,
                cfood: str):
    """
    Crawl a single cfood.

    The file tree below dirname is walked and the names of the directories
    (or files, depending on the type of the root node of the cfood) are
    matched against the root node. Every match is printed.

    Parameters
    ----------
    dirname : The root path of the file tree to be crawled.
    cfood : The filename of the cfood (a yaml file) to be applied.

    Raises
    ------
    ValueError if the "root" entry of the cfood does not contain exactly
    one element.
    """
    # Load the cfood from a yaml file:
    with open(cfood, "r") as f:
        cf = yaml.load(f, Loader=yaml.SafeLoader)

    # Current way of determining the root node:
    root_node = cf["root"]
    # Assume root to have a single element (for now):
    if len(root_node) != 1:
        raise ValueError("Only a single cfood root is allowed.")
    current_node = get_subnode_with_defaults(root_node, next(iter(root_node)))

    # Strategy: keep a list of currently matching candidates...
    for currentpath, dirs, files in os.walk(dirname):
        # for current nodes of type dir look in the list of dirs for matches
        # dir is the default
        if current_node["type"] == "dir":
            # Use a distinct loop variable: "dirname" is the crawl root.
            for subdir in dirs:
                match = match_file_object(current_node, subdir)
                if match is not None:
                    print(json.dumps(match, indent=2))
        elif current_node["type"] == "file":
            for filename in files:
                # Match the file itself, not the directory name.
                match = match_file_object(current_node, filename)
                if match is not None:
                    print(match)
        else:
            # work in progress
            pass
def crawl(dirname: str,
          cfoods: list[str]):
    """
    Crawl a given file hierarchy with every supplied cfood.

    Parameters
    ----------
    dirname : The root path of the file tree to be crawled.
    cfoods : A list of filenames of cfood files.
    """
    # simplified for testing: each cfood is applied on its own, in order
    for cfood_file in cfoods:
        crawl_cfood(dirname, cfood_file)
def main():
    """
    Command line entry point: crawl a file tree using a single cfood.

    The first positional command line argument is the root path of the file
    tree to be crawled, the second one is the filename of the cfood.
    Missing arguments terminate the program with a usage message.
    """
    # The command line lives in sys.argv (there is no sys.args); argparse
    # reads it and additionally provides usage/help output.
    parser = argparse.ArgumentParser(
        description="Prototype of a new crawler for CaosDB")
    parser.add_argument("dirname",
                        help="root path of the file tree to be crawled")
    parser.add_argument("cfood",
                        help="filename of the cfood (yaml) to be applied")
    args = parser.parse_args()
    # simplified for testing: exactly one cfood
    crawl(args.dirname, [args.cfood])


if __name__ == "__main__":
    main()
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment