snippets for exemplary crawler usage

661d63fc · Henrik tom Wörden · 661d63fc · 661d63fc · 661d63fc · 661d63fc
Commit 661d63fc authored 3 years ago by Henrik tom Wörden
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
+# Changelog
+All notable changes to this project will be documented in this file.
+The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
+and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
+## [Unreleased] ##
+### Added ###
+### Changed ###
+### Deprecated ###
+### Removed ###
+### Fixed ###
+### Security ###
--- a/LICENSE.md
+++ b/LICENSE.md
--- a/README.md
+++ b/README.md
+# README
+## Welcome
+This is a small collection of code snippets that may serve as a basis for the
+development of CaosDB Crawler modules.
+The setup is chosen such that it is very easy to get started. This also implies
+that we are working with files that lie locally (where the script is executed)
+and are not accessible by the server.
+## Prerequisites
+You need to have [caosdb-pylib](https://docs.indiscale.com/caosdb-pylib/)
+and [caosdb-advancedtools](https://docs.indiscale.com/caosdb-advanced-user-tools/) installed.
+You need a working connection to a CaosDB instance. We strongly recommend using one that
+is explicitly intended for testing and not for production use.
+## Setup
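+You can check out the data model that will be used in `insert_model.py` and
+insert it by running `python3 insert_model.py`. You may check in your CaosDB
+instance beforehand that no such model exists (although it may; entities that
+are already present are not inserted again).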
+You can simply test the crawler by calling `python3 crawl.py data` in the directory 
+where this file resides.
+Afterwards adjust the data model, the data and the crawler implementation in 
+`example_cfood.py` as you like.
+## Contributing
+Thank you very much to all contributors—[past, present](https://gitlab.com/caosdb/caosdb/-/blob/dev/HUMANS.md), and prospective ones.
+### Code of Conduct
+By participating, you are expected to uphold our [Code of Conduct](https://gitlab.com/caosdb/caosdb/-/blob/dev/CODE_OF_CONDUCT.md).
+## License
+* Copyright (C) 2021 Indiscale GmbH <info@indiscale.com>
+All files in this repository are licensed under a [GNU Affero General Public
+License](LICENSE.md) (version 3 or later).
--- a/crawl.py
+++ b/crawl.py
+#!/usr/bin/env python3
+# encoding: utf-8
+#
+# ** header v3.0
+# This file is a part of the CaosDB Project.
+#
+# Copyright (C) 2020,2021 IndiScale GmbH <info@indiscale.com>
+# Copyright (C) 2020,2021 Henrik tom Wörden <h.tomwoerden@indiscale.com>
+# Copyright (C) 2018 Research Group Biomedical Physics
+# Max-Planck-Institute for Dynamics and Self-Organization Göttingen
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <https://www.gnu.org/licenses/>.
+#
+# ** end header
+#
+"""
+This script invokes a CaosDB Crawler.
+"""
+import argparse
+import logging
+import os
+import sys
+from argparse import RawTextHelpFormatter
+import caosdb as db
+from caosadvancedtools.cfood import fileguide
+from caosadvancedtools.crawler import FileCrawler
+from caosadvancedtools.guard import UPDATE
+from example_cfood import ExampleCFood
+def get_parser():
+    parser = argparse.ArgumentParser(description=__doc__,
+                                     formatter_class=RawTextHelpFormatter)
+    parser.add_argument("path",
+                        help="the subtree of files below the given (local) "
+                        " path will be considered.")
+    return parser
+def local_access(path):
+    """Return ``path`` unchanged; installed below as ``fileguide.access``."""
+    return path
+if __name__ == "__main__":
+    parser = get_parser()
+    args = parser.parse_args()
+    fileguide.access = local_access
+    # logging config
+    logger = logging.getLogger("caosadvancedtools")
+    logger.addHandler(logging.StreamHandler(sys.stdout))
+    logger.setLevel(logging.DEBUG)
+    # create file objects manually since the files do not exist on the server
+    # and cannot be queried using 'FileCrawler.query_files`
+    to_be_crawled = []
+    for root, dirs, files in os.walk(args.path):
+        to_be_crawled.extend(
+            [db.File(path=os.path.join(root, f)) for f in files])
+    c = FileCrawler(files=to_be_crawled, use_cache=False,
+                    abort_on_exception=False,
+                    interactive=False,
+                    cfood_types=[
+                        ExampleCFood,
+                    ],
+                    )
+    c.crawl(security_level=UPDATE, path=args.path)
--- a/data/.create_dir
+++ b/data/.create_dir
--- a/data/2010_TestProject/2019-02-03/result.yml
+++ b/data/2010_TestProject/2019-02-03/result.yml
+cdoefficient: 7.8
--- a/data/2010_TestProject/2019-02-04/result.yml
+++ b/data/2010_TestProject/2019-02-04/result.yml
+coefficient: 10.3
--- a/example_cfood.py
+++ b/example_cfood.py
+#!/usr/bin/env python
+# encoding: utf-8
+#
+# ** header v3.0
+# This file is a part of the CaosDB Project.
+#
+# Copyright (C) 2018 Research Group Biomedical Physics,
+# Max-Planck-Institute for Dynamics and Self-Organization Göttingen
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <https://www.gnu.org/licenses/>.
+import caosdb as db
+import yaml
+from caosadvancedtools.cfood import (AbstractFileCFood, assure_property_is,
+                                     fileguide)
+from caosadvancedtools.datainconsistency import DataInconsistencyError
+try:
+    from yaml import CDumper as Dumper
+    from yaml import CLoader as Loader
+except ImportError:
+    from yaml import Dumper, Loader
+class ExampleCFood(AbstractFileCFood):
+    @classmethod
+    def get_re(cls):
+        # matches for example `data/2010_TestProject/2019-02-03/README.md`
+        # The following groups are created (values for the above example):
+        # - project_identifier: 2010_TestProject
+        # - project_year: 2010
+        # - project_name: TestProject
+        # - date: 2019-02-03
+        return (r".*/(?P<project_identifier>"
+                r"(?P<project_year>\d{4})_?(?P<project_name>((?!/).)*))/"
+                r"(?P<date>\d{4}-\d{2}-\d{2})/result.yml")
+    def create_identifiables(self):
+        self.experiment = db.Record()
+        self.experiment.add_parent(name="Experiment")
+        self.experiment.add_property(name="date",
+                                     value=self.match.group('date'))
+        self.identifiables.append(self.experiment)
+    def update_identifiables(self):
+        with open(fileguide.access(self.crawled_path)) as fi:
+            data = yaml.load(fi, Loader=Loader)
+        # check that the expected data exits and raise an error otherwise
+        if 'coefficient' not in data:
+            raise DataInconsistencyError(
+                "The measured coefficient of experiment must be given in "
+                "the results file! Please check:\n{}\n".format(
+                    self.crawled_path))
+        # add the data to the experiment Record
+        assure_property_is(self.experiment, "coefficient", data['coefficient'])
--- a/insert_model.py
+++ b/insert_model.py
+#!/usr/bin/env python3
+import caosdb as db
+# This creates a very simple data model (if it is not yet present)
+# This snippet illustrates the basic way to work with the data model in python.
+# If you want to create large models for an initial setup, you might want to
+# have a look at https://docs.indiscale.com/caosdb-advanced-user-tools/yaml_interface.html
+if len(db.execute_query("FIND date")) == 0:
+    date = db.Property(name="date", datatype=db.DATETIME)
+    date.insert()
+if len(db.execute_query("FIND coefficient")) == 0:
+    date = db.Property(name="coefficient", datatype=db.DOUBLE)
+    date.insert()
+if len(db.execute_query("FIND Project")) == 0:
+    pro = db.RecordType(name="Project")
+    pro.insert()
+if len(db.execute_query("FIND Experiment")) == 0:
+    exp = db.RecordType(name="Experiment")
+    exp.add_property(name="date", importance=db.RECOMMENDED)
+    exp.add_property(name="coefficient", importance=db.RECOMMENDED)
+    exp.add_property(name="Project", importance=db.RECOMMENDED)
+    exp.insert()