Skip to content
Snippets Groups Projects
Commit e179b2ce authored by Alexander Schlemmer
Browse files

Merge branch 'main' of ../2021/210414_NewCrawlerSpecification/caosdb-newcrawler

parents 8fd73c4e f4f3a766
No related branches found
No related tags found
No related merge requests found
Pipeline #10105 failed
setup.py 0 → 100644
# This is a hack for:
# https://github.com/pypa/pip/issues/7953
#
# The user site directory is enabled exactly when "--user" is among the
# command line arguments. This has to happen before setuptools.setup() runs.
import setuptools
import site
import sys

site.ENABLE_USER_SITE = "--user" in sys.argv[1:]

if __name__ == "__main__":
    # All package metadata is left to setuptools' own configuration lookup;
    # no arguments are passed here.
    setuptools.setup()
src/crawl.py 100644 → 100755
......@@ -5,8 +5,32 @@
import argparse
import os
import sys
import yaml
import re
def crawl_cfood(dirname: str,
                cfood: str):
    """
    Crawl a single cfood.

    Parameters
    ----------
    dirname : the directory that is walked recursively with os.walk
    cfood : filename of the yaml file that contains the cfood definition

    The matching of directory entries against the cfood is still work in
    progress: the tree is traversed, but nothing is matched or returned yet.
    """

    # Load the cfood from a yaml file:
    with open(cfood, "r") as f:
        cf = yaml.load(f, Loader=yaml.SafeLoader)

    # NOTE(review): the previous code read an undefined name ``current_node``
    # and therefore failed with a NameError on the first visited directory.
    # Tracking of the node that belongs to the current path is not implemented
    # yet, so the loaded cfood itself is used as a placeholder -- confirm the
    # intended node selection. An empty yaml file loads as None, hence the
    # fallback to an empty mapping.
    current_node = cf if isinstance(cf, dict) else {}

    for currentpath, dirs, files in os.walk(dirname):
        # for current nodes of type dir look in the list of dirs for matches
        # dir is the default
        node_type = current_node.get("type", "dir")

        if node_type == "dir":
            # The loop variable must not be called ``dirname``: that would
            # overwrite the parameter of the same name.
            for subdir in dirs:
                pass
        elif node_type == "file":
            for filename in files:
                pass
        else:
            # work in progress
            pass
def crawl(dirname: str,
          cfoods: list[str]):
    """
    Crawl a directory using a list of cfoods.

    Parameters
    ----------
    dirname : the directory that is handed to crawl_cfood for every cfood
    cfoods : a list of filenames of cfood files
    """

    # simplified for testing:
    for cfood in cfoods:
        crawl_cfood(dirname, cfood)
def main():
    """
    Command line entry point.

    Expects the directory to crawl as first and the filename of a single
    cfood file as second command line argument and passes both on to crawl.
    Exits with a usage message when fewer than two arguments are given.
    """
    if len(sys.argv) < 3:
        sys.exit("usage: {} DIRNAME CFOOD".format(sys.argv[0]))

    # The command line arguments live in sys.argv; sys has no attribute "args".
    crawl(sys.argv[1], [sys.argv[2]])
if __name__ == "__main__":
......
DataAnalysis:
children:
project_dir:
re: (?P<date>.*?)_(?P<identifier>.*)
handlers:
- type: identifiable
name: idf_project
children:
single:
re: (?P<date>.*?)_(?P<identifier>.*)
children:
README:
type: file
case: insensitive
re: README\.md
children:
- description:
type: TEXT
- responsible:
children:
type: TEXT
re: (?P<first_name>.+) (?P<last_name>.+)
handlers:
type: identifiable
name: idf_person
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment