diff --git a/README.md b/README.md index 7e99a70ce6b4fce574fc7a1b02e2dfed0e1d6e7b..88d8a6d9965e67ec268bff979ceb709dbf650129 100644 --- a/README.md +++ b/README.md @@ -38,6 +38,7 @@ The original authors of this package are: Copyright (C) 2021 Research Group Biomedical Physics, Max Planck Institute for Dynamics and Self-Organization Göttingen. +Copyright (C) 2021 IndiScale GmbH All files in this repository are licensed under a [GNU Affero General Public License](LICENCE) (version 3 or later). diff --git a/src/newcrawler/crawl-alt.py b/src/newcrawler/crawl-alt.py new file mode 100644 index 0000000000000000000000000000000000000000..f71d4dae0e24ae38609b7c7443c990f6f894b9fe --- /dev/null +++ b/src/newcrawler/crawl-alt.py @@ -0,0 +1,206 @@ +#!/usr/bin/env python3 +# encoding: utf-8 +# +# ** header v3.0 +# This file is a part of the CaosDB Project. +# +# Copyright (C) 2021 Henrik tom Wörden +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <https://www.gnu.org/licenses/>. +# +# ** end header +# + +""" +Data that is contained in a hierarchical structure is converted to a data +structure that is consistent with a predefined semantic data model. + +The hierarchical sturcture can be for example a file tree. However it can be +also something different like the contents of a json file or a file tree with +json files. 
+ + +This hierarchical structure is assumed to be constituted of a tree of +StructureElements. The tree is created on the fly by so-called Converters which +are defined in a yaml file. The tree of StructureElements is therefore a model +of the existing data (for example, a tree of Python file objects +(StructureElements) could represent a file tree that exists on some file +server). + +Converters treat StructureElements and thereby create the StructureElements that +are the children of the treated StructureElement. Converters therefore create +the above-named tree. The definition of a Converter also states which +Converters shall be used to treat the generated child-StructureElements. The +definition is therefore a tree itself. (Question: Should there be global Converters +that are always checked when treating a StructureElement? Should Converters be +associated with generated child-StructureElements? Currently, all children are +created and checked against all Converters. It could be that one would like to +check file-StructureElements against one set of Converters and +directory-StructureElements against another.) + +Each StructureElement in the tree has a set of data values, i.e. a dictionary. +Some of those values are set due to the kind of StructureElement. For example, +a file could have the file name as such a key-value pair: 'filename': <sth>. +Converters may define additional functions that create further values. For +example, a regular expression could be used to get a date from a file name. 
import argparse
import os
import re
import sys
from argparse import RawTextHelpFormatter


class StructureElement(object):
    """Base class for elements in the hierarchical data structure.

    Concrete elements store their data values in ``self.values`` (a dict).
    """
    pass


class Directory(StructureElement):
    """A directory of a file tree.

    Parameters
    ----------
    name : str
        Name of the directory (last path component).
    path : str
        Path under which the directory can be listed.
    """

    def __init__(self, name, path):
        # Plain attributes are what the children generator and the
        # converters' name matching read; ``values`` is the data dict.
        self.name = name
        self.path = path
        self.values = {}
        self.values['directory_name'] = name
        self.values['directory_path'] = path


class File(StructureElement):
    """A regular file of a file tree.

    Parameters
    ----------
    name : str
        Name of the file (last path component).
    path : str
        Path of the file.
    """

    def __init__(self, name, path):
        self.name = name
        self.path = path
        self.values = {}
        self.values['file_name'] = name
        self.values['file_path'] = path


def create_children_from_directory(element):
    """Create StructureElements for the direct entries of a directory.

    Parameters
    ----------
    element : Directory
        The directory whose ``path`` is listed.

    Returns
    -------
    list of StructureElement
        A Directory for every sub-directory and a File for every regular
        file, ordered by entry name. Entries that are neither (e.g. broken
        symlinks) are skipped.
    """
    children = []

    # os.listdir returns entries in arbitrary order; sorting makes the
    # result reproducible.
    for name in sorted(os.listdir(element.path)):
        path = os.path.join(element.path, name)

        if os.path.isdir(path):
            children.append(Directory(name, path))
        elif os.path.isfile(path):
            children.append(File(name, path))

    return children


# Maps the ``type`` keyword of a converter definition to the class of
# StructureElements the converter treats.
ELEMENT_TYPES = {
    "directory": Directory,
    "file": File,
}

# Children generators that may be named under ``childrengenerators``.
# Names are looked up here instead of being passed to eval(), so a
# definition file can only trigger functions that are listed explicitly.
CHILDREN_GENERATORS = {
    "create_children_from_directory": create_children_from_directory,
}


class Converter(object):
    """
    Converters treat StructureElements contained in the hierarchical structure.

    A converter is defined via a yml file or part of it. The definition states
    what kind of StructureElement it treats (typically one).
    Also, it defines how children of the current StructureElement are
    created and what Converters shall be used to treat those.

    The yaml definition looks like the following:

    converter-name:
        type: <StructureElement Type>
        match: ".*"
        recordtypes:
            Experiment:
                <...>
        valuegenerators:
            datepattern:
                <...>
        childrengenerators:
            create_children_from_directory
        subtree:

    The converter-name is a description of what it represents (e.g.
    'experiment-folder') and is used as identifier.

    The type restricts what kind of StructureElements are treated.
    The match is by default a regular expression that is matched against the
    name of StructureElements. Discussion: StructureElements might not have a
    name (e.g. a dict) or should a name be created artificially if necessary
    (e.g. "root-dict")? It might make sense to allow keywords like "always" and
    other kinds of checks. For example a dictionary could be checked against a
    json-schema definition.

    recordtypes is a list of definitions that define the semantic structure
    (see details below).

    valuegenerators allow to provide additional functionality that creates
    data values in addition to the ones given by default via the
    StructureElement. This can be for example a match group of a regular
    expression applied to the filename.
    It should be possible to access the values of parent nodes. For example,
    the name of a parent node could be accessed with $converter-name.name.
    Discussion: This can introduce conflicts, if the key <converter-name>
    already exists. An alternative would be to identify those lookups. E.g.
    $$converter-name.name (2x$).

    childrengenerators denotes how StructureElements shall be created that are
    children of the current one.

    subtree contains a list of Converter definitions that look like the one
    described here.

    Those keywords should be allowed but not required. I.e. if no
    valuegenerators shall be defined, the keyword may be omitted.

    Implementation status: ``type``, ``match``, ``childrengenerators`` and
    ``subtree`` are evaluated; ``valuegenerators`` and ``recordtypes`` are
    stored in ``self.definition`` but not interpreted yet.
    """

    def __init__(self, definition, name):
        """
        Parameters
        ----------
        definition : dict
            The parsed definition of this converter (the mapping below the
            converter-name in the yaml file).
        name : str
            The converter-name, used as identifier.
        """
        self.definition = definition
        self.name = name

        # ``subtree`` is optional and an empty yaml key is parsed as None.
        subtree = definition.get('subtree') or []

        # Accept both a mapping ``{name: definition, ...}`` and a list of
        # single-entry mappings ``[{name: definition}, ...]``.
        if isinstance(subtree, dict):
            subtree = [{sub_name: sub_definition}
                       for sub_name, sub_definition in subtree.items()]

        # Converters that treat the children created by this converter.
        self.converters = [create_converter(converter_def)
                           for converter_def in subtree]

    @property
    def type(self):
        """The StructureElement class this converter treats.

        Returns StructureElement itself (i.e. no restriction) if the
        definition has no ``type``.

        Raises
        ------
        ValueError
            If the ``type`` of the definition is not in ELEMENT_TYPES.
        """
        type_name = self.definition.get('type')

        if type_name is None:
            return StructureElement

        try:
            return ELEMENT_TYPES[type_name]
        except KeyError:
            raise ValueError(
                "unknown StructureElement type {!r} in converter {!r}".format(
                    type_name, self.name)) from None

    def match(self, element):
        """Check the ``match`` regular expression against the element name.

        The pattern is applied with ``re.match``, i.e. anchored at the start
        of the name only.

        Returns
        -------
        bool
            True if the definition has no ``match`` keyword or if the
            pattern matches ``element.name``; False if it does not match or
            if the element has no name.
        """
        pattern = self.definition.get('match')

        if pattern is None:
            return True

        name = getattr(element, 'name', None)

        if name is None:
            return False

        return re.match(pattern, name) is not None

    def create_values(self, values, element):
        """Return the data values that are valid for ``element``.

        Parameters
        ----------
        values : dict
            Values inherited from the parent level. Not modified.
        element : StructureElement
            The element that is treated; its ``values`` take precedence over
            inherited ones with the same key.

        Returns
        -------
        dict
            A new dict with the merged values.
        """
        vals = dict(values)
        vals.update(element.values)

        return vals

    def create_records(self, values, datastructure):
        """Create the records defined by ``recordtypes``.

        The record format is not specified yet, so this is a hook for
        subclasses.

        Raises
        ------
        NotImplementedError
            Always, unless overridden.
        """
        raise NotImplementedError(
            "converter {!r} does not implement record creation".format(
                self.name))

    def create_children(self, values, element):
        """Create the child StructureElements of ``element``.

        Parameters
        ----------
        values : dict
            Values of the current level; currently unused and kept for the
            interface.
        element : StructureElement
            The element whose children shall be created.

        Returns
        -------
        list of StructureElement
            The concatenated results of all generators named under
            ``childrengenerators``; empty if the keyword is missing or empty.

        Raises
        ------
        ValueError
            If a generator name is not registered in CHILDREN_GENERATORS.
        """
        generator_names = self.definition.get("childrengenerators") or []

        # A single generator may be given as a plain yaml scalar; iterating
        # a str would yield its characters.
        if isinstance(generator_names, str):
            generator_names = [generator_names]

        children = []

        for func_name in generator_names:
            try:
                generator = CHILDREN_GENERATORS[func_name]
            except KeyError:
                raise ValueError(
                    "unknown children generator {!r} in converter {!r}".format(
                        func_name, self.name)) from None
            children.extend(generator(element))

        return children


def create_converter(converter_def):
    """Create a Converter from a single-entry mapping.

    Parameters
    ----------
    converter_def : dict
        ``{converter-name: definition}``; an empty definition (None) is
        treated as ``{}``.

    Raises
    ------
    ValueError
        If ``converter_def`` does not have exactly one entry.
    """
    if len(converter_def) != 1:
        raise ValueError(
            "a converter definition must have exactly one name, got "
            "{!r}".format(sorted(converter_def)))

    (name, definition), = converter_def.items()

    return Converter(definition or {}, name)


def crawl(items, global_converters, local_converters, values, datastructure):
    """Treat StructureElements recursively with all applicable converters.

    Every element is checked against the global converters followed by the
    local ones. For each converter whose type and match apply, the values
    and records are created and the children of the element are crawled
    with the sub-converters of that converter.

    Parameters
    ----------
    items : iterable of StructureElement
        The elements of the current level.
    global_converters : list of Converter
        Converters that are checked on every level.
    local_converters : list of Converter
        Converters that are only checked on the current level.
    values : dict
        Data values inherited from the parent level.
    datastructure
        Passed to ``Converter.create_records``, which is expected to add the
        created records to it.
    """
    converters = list(global_converters) + list(local_converters)

    for element in items:
        for converter in converters:
            if not (isinstance(element, converter.type) and
                    converter.match(element)):
                continue
            element_values = converter.create_values(values, element)
            converter.create_records(element_values, datastructure)
            children = converter.create_children(element_values, element)
            crawl(children, global_converters,
                  converter.converters, element_values, datastructure)


def main(*args):
    """Entry point of the command line tool; placeholder that does nothing."""
    pass


def parse_args():
    """Parse the command line arguments (read from ``sys.argv``)."""
    parser = argparse.ArgumentParser(description=__doc__,
                                     formatter_class=RawTextHelpFormatter)
    parser.add_argument("path",
                        help="the subtree of files below the given path will "
                        "be considered. Use '/' for everything.")

    return parser.parse_args()


if __name__ == "__main__":
    args = parse_args()
    # A Namespace cannot be unpacked with *; pass the parsed path explicitly.
    sys.exit(main(args.path))
# Example converter definition for a directory that represents an experiment.
# NOTE(review): the nesting below was reconstructed; confirm it against the
# intended schema.
experiment-directory:
    type: directory  #type defines that certain values are set; e.g. dir name
    match: ".*"
    recordtypes:
        Experiment:
            properties:
                date: $date  #in order to set property values, keys from this dict can be used
                super: something.$date
    valuegenerators:
        - datepattern:
              # Single quotes: inside a double-quoted YAML scalar "\d" is an
              # invalid escape sequence and the file cannot be parsed.
              regexp: '8\d'
              key: date  #the value is stored under the key in the dict of this level
    childrengenerators:
        create_children_from_directory
    subtree:
        readme-file:
            match: "README.md"
            type: file
            recordtypes:
                Experiment:
                    properties:
                        responsible: responsible
            valuegenerators:
                - jsonloader:
                      regexp: '8\d'
                      value: date
class BasicExperimentTest(unittest.TestCase):
    """Specify the intended crawler behaviour on small yaml structures.

    NOTE(review): as far as visible here, ``crawl`` is not imported and is
    called without arguments (the calls are marked as dummy lines), so these
    tests cannot pass yet. ``structure`` and ``definition`` document the
    intended input and are not passed to the crawler so far.
    """
    # def setUp(self):

    def test_single(self):
        """A single dictionary is treated entirely as one experiment."""
        # There is one dictionary, that is treated entirely as an experiment
        structure = """
stuff: 5
"""
        definition = """
experiment:
    type: dictionary
    match: always
    recordtypes:
        Experiment:
            properties:
                stuff: $stuff
        """
        result = crawl()  # dummy line
        # 1 record
        self.assertEqual(len(result), 1)
        # only parent is experiment
        self.assertEqual(len(result[0].parents), 1)
        # NOTE(review): a container of length 1 cannot equal this str unless
        # the parents container defines a custom comparison - confirm.
        self.assertEqual(result[0].parents, "Experiment")
        # name is set correctly
        # NOTE(review): the structure above does not contain "cool-exp".
        self.assertEqual(result[0].name, "cool-exp")

    def test_multiple(self):
        """Each entry of the outermost dictionary becomes one experiment."""
        # multiple dicts that each shall represent an experiment
        structure = """
cool-exp:
    stuff: 5
second-exp:
another-exp:
    stuff: 5
"""
        # The outermost dictionary does not represent an entity. Therefore we
        # need an additional level that creates child-nodes for each
        # experiment.
        definition = """
toplevel:
    type: dictionary
    match: ".*"
    subtree:
        experiment:
            type: dictionary
            match: ".*"
            recordtypes:
                Experiment:
                    properties:
                        name: $name
                        stuff: $stuff
        """
        result = crawl()  # dummy line
        # 3 records
        self.assertEqual(len(result), 3)

        for r in result:
            # only parent is experiment
            self.assertEqual(len(r.parents), 1)
            self.assertEqual(r.parents, "Experiment")
            # names are set correctly
            self.assertIn(
                r.name, ["cool-exp", "second-exp", "another-exp"])

            # second-exp shall not have a value for stuff

            if r.name == "second-exp":
                self.assertIsNone(r.get_property("stuff"))

    def test_three_level(self):
        """Experiments are nested below an intermediate level."""
        # multiple dicts that each shall represent an experiment
        structure = """
inter1:
    cool-exp:
        stuff: 5
inter2:
    second-exp:
    another-exp:
        stuff: 5
"""
        # The two outermost dictionaries do not represent an entity. We want to
        # use the names of the intermediate levels in the Records.
        definition = """
toplevel:
    type: dictionary
    match: ".*"
    subtree:
        interlevel:
            type: dictionary
            match: ".*"
            subtree:
                experiment:
                    type: dictionary
                    match: ".*"
                    recordtypes:
                        Experiment:
                            properties:
                                name: $name
                                intermediate: $interlevel.name
                                stuff: $stuff
        """
        result = crawl()  # dummy line
        # 3 records
        self.assertEqual(len(result), 3)

        for r in result:
            # only parent is experiment
            self.assertEqual(len(r.parents), 1)
            self.assertEqual(r.parents, "Experiment")
            # names are set correctly
            self.assertIn(
                r.name, ["cool-exp", "second-exp", "another-exp"])

            # second-exp shall not have a value for stuff

            if r.name == "second-exp":
                self.assertIsNone(r.get_property("stuff"))

        # The two definitions below are not used by any assertion; they are
        # kept as drafts of further definition variants.
        definition = """
experiment:
    type: dictionary
    match: ".*"
    recordtypes:
        Experiment:
            properties:
                name: $date #in order to set property values, keys from this dict can be used
    valuegenerators:
    childrengenerators:
        create_children_from_dictionaries
    subtree:
"""
        definition = """
toplevel:
    type: dictionary
    match: ".*"
    subtree:
        experiment:
            type: dictionary
            match: ".*"
            recordtypes:
                Experiment:
                    properties:
                        stuff: $stuff
            valuegenerators:
            childrengenerators:
            subtree:
        """