diff --git a/src/newcrawler.egg-info/PKG-INFO b/src/newcrawler.egg-info/PKG-INFO new file mode 100644 index 0000000000000000000000000000000000000000..4c6e7241e76b3c419a59d21f9a43666dd76a1fab --- /dev/null +++ b/src/newcrawler.egg-info/PKG-INFO @@ -0,0 +1,61 @@ +Metadata-Version: 2.1 +Name: newcrawler +Version: 0.1 +Summary: A new crawler for caosdb +Home-page: UNKNOWN +Author: Alexander Schlemmer +Author-email: alexander.schlemmer@ds.mpg.de +License: UNKNOWN +Platform: UNKNOWN +Classifier: Programming Language :: Python :: 3 +Classifier: License :: OSI Approved :: AGPLv3 +Classifier: Operating System :: OS Independent +Requires-Python: >=3.6 +Description-Content-Type: text/markdown +License-File: LICENSE + +# newcrawler + +A new crawler for CaosDB. + + +This package has yaml-header-tools as a dependency: +https://gitlab.com/salexan/yaml-header-tools + + + +This python package can be installed using `pip`, e.g.: +```bash +pip install --user . +``` + +# Usage + +work in progress + +# Running the tests + +After installation of the package run (within the project folder): +```bash +pytest +``` + + +# Contributors + +The original authors of this package are: + +- Alexander Schlemmer +- Henrik tom Wörden +- Florian Spreckelsen + +# License + +Copyright (C) 2021 Research Group Biomedical Physics, Max Planck Institute for +Dynamics and Self-Organization Göttingen. + +All files in this repository are licensed under a [GNU Affero General Public +License](LICENCE) (version 3 or later). 
+ + diff --git a/src/newcrawler.egg-info/SOURCES.txt b/src/newcrawler.egg-info/SOURCES.txt new file mode 100644 index 0000000000000000000000000000000000000000..4b7fb6aba468a63d558145b360bb2eb50e239bfd --- /dev/null +++ b/src/newcrawler.egg-info/SOURCES.txt @@ -0,0 +1,11 @@ +LICENSE +README.md +pyproject.toml +setup.cfg +setup.py +src/newcrawler/__init__.py +src/newcrawler/crawl.py +src/newcrawler.egg-info/PKG-INFO +src/newcrawler.egg-info/SOURCES.txt +src/newcrawler.egg-info/dependency_links.txt +src/newcrawler.egg-info/top_level.txt \ No newline at end of file diff --git a/src/newcrawler.egg-info/dependency_links.txt b/src/newcrawler.egg-info/dependency_links.txt new file mode 100644 index 0000000000000000000000000000000000000000..8b137891791fe96927ad78e64b0aad7bded08bdc --- /dev/null +++ b/src/newcrawler.egg-info/dependency_links.txt @@ -0,0 +1 @@ + diff --git a/src/newcrawler.egg-info/top_level.txt b/src/newcrawler.egg-info/top_level.txt new file mode 100644 index 0000000000000000000000000000000000000000..db6194c54d7e8b2645c547662714ef69d22c9afa --- /dev/null +++ b/src/newcrawler.egg-info/top_level.txt @@ -0,0 +1 @@ +newcrawler diff --git a/src/newcrawler/crawl.py b/src/newcrawler/crawl.py index f441a1a0dea6cfa1ba8134e1d2a3cd6272078503..5c02ed922d6d404d6a9794d7f53ddc45819d11a0 100755 --- a/src/newcrawler/crawl.py +++ b/src/newcrawler/crawl.py @@ -99,6 +99,25 @@ def get_subnode_with_defaults(node: dict, return subnode +def match_complete(node: dict): + """Determine whether the match is complete. + + This function checks whether all nodes and subnodes have a value. + + Parameters + ---------- + node : The node to check. + + Returns + ------- + True if the match is complete and False otherwise. 
+ """ + if "value" not in node: + return False + if "children" in node: + return all([match_complete(element) for element in node["children"]]) + return True + def crawl_cfood(dirname: str, cfood: str): """ @@ -115,25 +134,25 @@ if len(root_node) != 1: raise ValueError("Only a single cfood root is allowed.") current_node = get_subnode_with_defaults(root_node, list(root_node.keys())[0]) + current_dir = dirname # Strategy: keep a list of currently matching candidates... - for currentpath, dirs, files in os.walk(dirname): - # for current nodes of type dir look in the list of dirs for matches - # dir is the default - if current_node["type"] == "dir": - for dirname in dirs: - match = match_file_object(current_node, dirname) - if match is not None: - print(json.dumps(match, indent=2)) - elif current_node["type"] == "file": - for filename in files: - match = match_file_object(current_node, dirname) - if match is not None: - print(match) - else: - # work in progress - pass + matches = [] + for element in os.listdir(current_dir): + path = os.path.join(dirname, element) + + + if current_node["type"] == "dir" and os.path.isdir(path): + match = match_file_object(current_node, element) + if match is not None: + matches.append((path, match)) + elif current_node["type"] == "file" and not os.path.isdir(path): + match = match_file_object(current_node, element) + if match is not None: + matches.append((path, match)) + + def crawl(dirname: str, diff --git a/tests/test_functions.py b/tests/test_functions.py new file mode 100644 index 0000000000000000000000000000000000000000..a33f0d2182474e960fa42c2343db533c3d21d41f --- /dev/null +++ b/tests/test_functions.py @@ -0,0 +1,34 @@ +#!/bin/python +# Tests for main functions of crawler +# A. 
Schlemmer, 07/2021 + +from newcrawler import match_complete + +def test_match_complete(): + node = {"name": "bla"} + assert match_complete(node) == False + + node = {"name": "bla", + "children": [{ + "name": "test", + "value": 234}, { + "name": "test", + "value": 234}]} + assert match_complete(node) == False + + node = {"name": "bla", + "value": "ok", + "children": [{ + "name": "test", + "value": 234}, { + "name": "test", + "value": 234}]} + assert match_complete(node) == True + + node = {"name": "bla", + "value": "ok", + "children": [{ + "name": "test"}, { + "name": "test", + "value": 234}]} + assert match_complete(node) == False diff --git a/tests/test_tool.py b/tests/test_tool.py index 28add00bedb342e6625b7d3fd426682eadc87151..3ba782c3b651dea57c84e9531a48f32a076074ad 100755 --- a/tests/test_tool.py +++ b/tests/test_tool.py @@ -7,8 +7,6 @@ from newcrawler import crawl from os.path import join, dirname def test_examples_article(): - m = crawl(join(dirname(__file__), "test_directories/examples_article")) - assert len(m) == 14 - - for r in m: - assert len(r[2]) == 0 + m = crawl(join(dirname(__file__), "test_directories/examples_article"), + [join(dirname(__file__), "scifolder_cfood.yml")]) + assert True