From b4e115f5647255cd38692a4950677152a20c1d84 Mon Sep 17 00:00:00 2001 From: Alexander Schlemmer <alexander@mail-schlemmer.de> Date: Fri, 9 Jul 2021 13:03:37 +0200 Subject: [PATCH] first tests for base functions --- src/newcrawler.egg-info/PKG-INFO | 61 ++++++++++++++++++++ src/newcrawler.egg-info/SOURCES.txt | 11 ++++ src/newcrawler.egg-info/dependency_links.txt | 1 + src/newcrawler.egg-info/top_level.txt | 1 + src/newcrawler/crawl.py | 51 +++++++++++----- tests/test_functions.py | 34 +++++++++++ tests/test_tool.py | 8 +-- 7 files changed, 146 insertions(+), 21 deletions(-) create mode 100644 src/newcrawler.egg-info/PKG-INFO create mode 100644 src/newcrawler.egg-info/SOURCES.txt create mode 100644 src/newcrawler.egg-info/dependency_links.txt create mode 100644 src/newcrawler.egg-info/top_level.txt create mode 100644 tests/test_functions.py diff --git a/src/newcrawler.egg-info/PKG-INFO b/src/newcrawler.egg-info/PKG-INFO new file mode 100644 index 00000000..4c6e7241 --- /dev/null +++ b/src/newcrawler.egg-info/PKG-INFO @@ -0,0 +1,61 @@ +Metadata-Version: 2.1 +Name: newcrawler +Version: 0.1 +Summary: A new crawler for caosdb +Home-page: UNKNOWN +Author: Alexander Schlemmer +Author-email: alexander.schlemmer@ds.mpg.de +License: UNKNOWN +Platform: UNKNOWN +Classifier: Programming Language :: Python :: 3 +Classifier: License :: OSI Approved :: AGPLv3 +Classifier: Operating System :: OS Independent +Requires-Python: >=3.6 +Description-Content-Type: text/markdown +License-File: LICENSE + +# newcrawler + +A new crawler for CaosDB. + + +This package has yaml-header-tools as a dependency: +https://gitlab.com/salexan/yaml-header-tools + + + +This python package can be installed using `pip`, e.g.: +```bash +pip install --user . 
+``` + +# Usage + +work in progress + +# Running the tests + +After installation of the package run (within the project folder): + +```bash +pytest +``` + + +# Contributors + +The original authors of this package are: + +- Alexander Schlemmer +- Henrik tom Wörden +- Florian Spreckelsen + +# License + +Copyright (C) 2021 Research Group Biomedical Physics, Max Planck Institute for +Dynamics and Self-Organization Göttingen. + +All files in this repository are licensed under a [GNU Affero General Public +License](LICENSE) (version 3 or later). + + diff --git a/src/newcrawler.egg-info/SOURCES.txt b/src/newcrawler.egg-info/SOURCES.txt new file mode 100644 index 00000000..4b7fb6ab --- /dev/null +++ b/src/newcrawler.egg-info/SOURCES.txt @@ -0,0 +1,11 @@ +LICENSE +README.md +pyproject.toml +setup.cfg +setup.py +src/newcrawler/__init__.py +src/newcrawler/crawl.py +src/newcrawler.egg-info/PKG-INFO +src/newcrawler.egg-info/SOURCES.txt +src/newcrawler.egg-info/dependency_links.txt +src/newcrawler.egg-info/top_level.txt \ No newline at end of file diff --git a/src/newcrawler.egg-info/dependency_links.txt b/src/newcrawler.egg-info/dependency_links.txt new file mode 100644 index 00000000..8b137891 --- /dev/null +++ b/src/newcrawler.egg-info/dependency_links.txt @@ -0,0 +1 @@ + diff --git a/src/newcrawler.egg-info/top_level.txt b/src/newcrawler.egg-info/top_level.txt new file mode 100644 index 00000000..db6194c5 --- /dev/null +++ b/src/newcrawler.egg-info/top_level.txt @@ -0,0 +1 @@ +newcrawler diff --git a/src/newcrawler/crawl.py b/src/newcrawler/crawl.py index f441a1a0..5c02ed92 100755 --- a/src/newcrawler/crawl.py +++ b/src/newcrawler/crawl.py @@ -99,6 +99,25 @@ def get_subnode_with_defaults(node: dict, return subnode +def match_complete(node: dict): + """Determine whether the match is complete. + + This function checks whether all nodes and subnodes have a value. + + Parameters + ---------- + node : The node to check. 
+ + Returns + ------- + True if the match is complete and False otherwise. + """ + if "value" not in node: + return False + if "children" in node: + return all([match_complete(element) for element in node["children"]]) + return True + def crawl_cfood(dirname: str, cfood: str): """ @@ -115,25 +134,25 @@ def crawl_cfood(dirname: str, if len(root_node) != 1: raise ValueError("Only a single cfood root is allowed.") current_node = get_subnode_with_defaults(root_node, list(root_node.keys())[0]) + current_dir = dirname # Strategy: keep a list of currently matching candidates... - for currentpath, dirs, files in os.walk(dirname): - # for current nodes of type dir look in the list of dirs for matches - # dir is the default - if current_node["type"] == "dir": - for dirname in dirs: - match = match_file_object(current_node, dirname) - if match is not None: - print(json.dumps(match, indent=2)) - elif current_node["type"] == "file": - for filename in files: - match = match_file_object(current_node, dirname) - if match is not None: - print(match) - else: - # work in progress - pass + matches = [] + for element in os.listdir(current_dir): + path = os.path.join(dirname, element) + + + if current_node["type"] == "dir" and os.path.isdir(path): + match = match_file_object(current_node, element) + if match is not None: + matches.append((path, match)) + elif current_node["type"] == "file" and not os.path.isdir(path): + match = match_file_object(current_node, element) + if match is not None: + matches.append((path, match)) + + def crawl(dirname: str, diff --git a/tests/test_functions.py b/tests/test_functions.py new file mode 100644 index 00000000..a33f0d21 --- /dev/null +++ b/tests/test_functions.py @@ -0,0 +1,34 @@ +#!/bin/python +# Tests for main functions of crawler +# A. 
Schlemmer, 07/2021 + +from newcrawler import match_complete + +def test_match_complete(): + node = {"name": "bla"} + assert match_complete(node) == False + + node = {"name": "bla", + "children": [{ + "name": "test", + "value": 234}, { + "name": "test", + "value": 234}]} + assert match_complete(node) == False + + node = {"name": "bla", + "value": "ok", + "children": [{ + "name": "test", + "value": 234}, { + "name": "test", + "value": 234}]} + assert match_complete(node) == True + + node = {"name": "bla", + "value": "ok", + "children": [{ + "name": "test"}, { + "name": "test", + "value": 234}]} + assert match_complete(node) == False diff --git a/tests/test_tool.py b/tests/test_tool.py index 28add00b..3ba782c3 100755 --- a/tests/test_tool.py +++ b/tests/test_tool.py @@ -7,8 +7,6 @@ from newcrawler import crawl from os.path import join, dirname def test_examples_article(): - m = crawl(join(dirname(__file__), "test_directories/examples_article")) - assert len(m) == 14 - - for r in m: - assert len(r[2]) == 0 + m = crawl(join(dirname(__file__), "test_directories/examples_article"), + [join(dirname(__file__), "scifolder_cfood.yml")]) + assert True -- GitLab