Skip to content
Snippets Groups Projects
Commit 661d63fc authored by Henrik tom Wörden's avatar Henrik tom Wörden
Browse files

snippets for exemplary crawler usage

parents
No related branches found
No related tags found
No related merge requests found
# Changelog
All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
## [Unreleased] ##
### Added ###
### Changed ###
### Deprecated ###
### Removed ###
### Fixed ###
### Security ###
This diff is collapsed.
# README
## Welcome
This is a small collection of code snippets that may serve as a basis for the
development of CaosDB Crawler modules.
The setup is designed to make it very easy to get started. This also implies
that we are working with files that lie locally (where the script is executed)
and are not accessible by the server.
## Prerequisites
You need to have [caosdb-pylib](https://docs.indiscale.com/caosdb-pylib/)
and [caosdb-advancedtools](https://docs.indiscale.com/caosdb-advanced-user-tools/) installed.
You need a working connection to a CaosDB instance. We strongly recommend using one that
is explicitly intended for testing and not for production use.
## Setup
You can check out the data model that will be used in `insert_model.py`. You
may check in your CaosDB instance that no such model exists (although it may).
You can simply test the crawler by calling `python3 crawl.py data` in the directory
where this file resides.
Afterwards adjust the data model, the data and the crawler implementation in
`example_cfood.py` as you like.
## Contributing
Thank you very much to all contributors—[past, present](https://gitlab.com/caosdb/caosdb/-/blob/dev/HUMANS.md), and prospective ones.
### Code of Conduct
By participating, you are expected to uphold our [Code of Conduct](https://gitlab.com/caosdb/caosdb/-/blob/dev/CODE_OF_CONDUCT.md).
## License
* Copyright (C) 2021 Indiscale GmbH <info@indiscale.com>
All files in this repository are licensed under a [GNU Affero General Public
License](LICENCE.md) (version 3 or later).
crawl.py 0 → 100755
#!/usr/bin/env python3
# encoding: utf-8
#
# ** header v3.0
# This file is a part of the CaosDB Project.
#
# Copyright (C) 2020,2021 IndiScale GmbH <info@indiscale.com>
# Copyright (C) 2020,2021 Henrik tom Wörden <h.tomwoerden@indiscale.com>
# Copyright (C) 2018 Research Group Biomedical Physics
# Max-Planck-Institute for Dynamics and Self-Organization Göttingen
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
#
# ** end header
#
"""
This script invokes a CaosDB Crawler.
"""
import argparse
import logging
import os
import sys
from argparse import RawTextHelpFormatter
import caosdb as db
from caosadvancedtools.cfood import fileguide
from caosadvancedtools.crawler import FileCrawler
from caosadvancedtools.guard import UPDATE
from example_cfood import ExampleCFood
def get_parser():
    """Build the command line parser of this script.

    The module docstring is used as the description. The parser accepts a
    single positional argument ``path``.

    Returns
    -------
    argparse.ArgumentParser
        The configured parser.
    """
    parser = argparse.ArgumentParser(description=__doc__,
                                     formatter_class=RawTextHelpFormatter)
    # RawTextHelpFormatter prints the help text verbatim, so the two string
    # fragments below must be joined by exactly one space.
    parser.add_argument("path",
                        help="the subtree of files below the given (local) "
                        "path will be considered.")

    return parser
def local_access(path):
    """Return ``path`` exactly as it was given.

    This script assigns the function to ``fileguide.access``; no translation
    of the path is performed.
    """
    return path
if __name__ == "__main__":
    # The only command line argument is the local directory to be crawled.
    args = get_parser().parse_args()

    # Replace the file guide's access function with `local_access`, which
    # returns every path unchanged.
    fileguide.access = local_access

    # logging config: send everything from DEBUG upwards of the
    # "caosadvancedtools" logger to stdout
    tools_logger = logging.getLogger("caosadvancedtools")
    tools_logger.addHandler(logging.StreamHandler(sys.stdout))
    tools_logger.setLevel(logging.DEBUG)

    # create file objects manually since the files do not exist on the server
    # and cannot be queried using `FileCrawler.query_files`
    to_be_crawled = [
        db.File(path=os.path.join(folder, filename))
        for folder, _subfolders, filenames in os.walk(args.path)
        for filename in filenames
    ]

    crawler = FileCrawler(
        files=to_be_crawled,
        use_cache=False,
        abort_on_exception=False,
        interactive=False,
        cfood_types=[
            ExampleCFood,
        ],
    )
    crawler.crawl(security_level=UPDATE, path=args.path)
cdoefficient: 7.8
coefficient: 10.3
#!/usr/bin/env python
# encoding: utf-8
#
# ** header v3.0
# This file is a part of the CaosDB Project.
#
# Copyright (C) 2018 Research Group Biomedical Physics,
# Max-Planck-Institute for Dynamics and Self-Organization Göttingen
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
import caosdb as db
import yaml
from caosadvancedtools.cfood import (AbstractFileCFood, assure_property_is,
fileguide)
from caosadvancedtools.datainconsistency import DataInconsistencyError
try:
from yaml import CDumper as Dumper
from yaml import CLoader as Loader
except ImportError:
from yaml import Dumper, Loader
class ExampleCFood(AbstractFileCFood):
    """Example CFood that treats the ``result.yml`` files of experiments.

    For each matching file an ``Experiment`` Record carrying the date from the
    file path is registered as identifiable. The ``coefficient`` stored in the
    file is then passed to ``assure_property_is`` for that Record.
    """

    @classmethod
    def get_re(cls):
        """Return the regular expression describing the treated file paths."""
        # matches for example `data/2010_TestProject/2019-02-03/result.yml`
        # The following groups are created (values for the above example):
        # - project_identifier: 2010_TestProject
        # - project_year: 2010
        # - project_name: TestProject
        # - date: 2019-02-03
        # The dot of the file name is escaped so that it only matches a
        # literal `.` and not an arbitrary character.
        return (r".*/(?P<project_identifier>"
                r"(?P<project_year>\d{4})_?(?P<project_name>((?!/).)*))/"
                r"(?P<date>\d{4}-\d{2}-\d{2})/result\.yml")

    def create_identifiables(self):
        """Create the ``Experiment`` Record and register it as identifiable.

        The Record gets the parent ``Experiment`` and a ``date`` property
        taken from the ``date`` group of the path match.
        """
        self.experiment = db.Record()
        self.experiment.add_parent(name="Experiment")
        self.experiment.add_property(name="date",
                                     value=self.match.group('date'))
        self.identifiables.append(self.experiment)

    def update_identifiables(self):
        """Read the coefficient from the crawled file and assure it is set.

        Raises
        ------
        DataInconsistencyError
            If the file does not contain a mapping with a ``coefficient``
            key. This includes empty files.
        """
        with open(fileguide.access(self.crawled_path)) as fi:
            # NOTE(review): `Loader` is PyYAML's full loader, which can
            # construct arbitrary Python objects. Consider `SafeLoader` if
            # the crawled files may come from an untrusted source.
            data = yaml.load(fi, Loader=Loader)

        # check that the expected data exists and raise an error otherwise;
        # an empty file is loaded as None and other documents may be lists or
        # scalars, so the type is checked before looking for the key
        if not isinstance(data, dict) or 'coefficient' not in data:
            raise DataInconsistencyError(
                "The measured coefficient of experiment must be given in "
                "the results file! Please check:\n{}\n".format(
                    self.crawled_path))

        # add the data to the experiment Record
        assure_property_is(self.experiment, "coefficient", data['coefficient'])
#!/usr/bin/env python3
import caosdb as db
# This creates a very simple data model (if it is not yet present)
# This snippet illustrates the basic way to work with the data model in python.
# If you want to create large models for an initial setup, you might want to
# have a look at https://docs.indiscale.com/caosdb-advanced-user-tools/yaml_interface.html
def _is_missing(name):
    """Tell whether the query ``FIND <name>`` yields no result."""
    return len(db.execute_query("FIND {}".format(name))) == 0


# Properties
if _is_missing("date"):
    date_property = db.Property(name="date", datatype=db.DATETIME)
    date_property.insert()

if _is_missing("coefficient"):
    coefficient_property = db.Property(name="coefficient", datatype=db.DOUBLE)
    coefficient_property.insert()

# RecordTypes
if _is_missing("Project"):
    project_type = db.RecordType(name="Project")
    project_type.insert()

if _is_missing("Experiment"):
    experiment_type = db.RecordType(name="Experiment")
    # every property of an Experiment is recommended
    for property_name in ("date", "coefficient", "Project"):
        experiment_type.add_property(name=property_name,
                                     importance=db.RECOMMENDED)
    experiment_type.insert()
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment