diff --git a/.gitignore b/.gitignore index 67d5d78d3d3d86a0b9d601a3d9ccc9354f472e2b..daa9defe713083b48fb6501c9195a45dd2617300 100644 --- a/.gitignore +++ b/.gitignore @@ -2,4 +2,7 @@ src/newcrawler.egg-info/ unittests/provenance.yml .coverage TAGS -src/.coverage \ No newline at end of file +src/.coverage +build/ +*~ +.pdbrc \ No newline at end of file diff --git a/src/doc/Makefile b/src/doc/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..dc1690a8f7f74815b25a51e519e4712c7c92b7ec --- /dev/null +++ b/src/doc/Makefile @@ -0,0 +1,49 @@ +# ** header v3.0 +# This file is a part of the CaosDB Project. +# +# Copyright (C) 2020 IndiScale GmbH <info@indiscale.com> +# Copyright (C) 2020 Daniel Hornung <d.hornung@indiscale.com> +# Copyright (C) 2021 Alexander Schlemmer <alexander.schlemmer@ds.mpg.de> +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <https://www.gnu.org/licenses/>. +# +# ** end header + +# This Makefile is a wrapper for sphinx scripts. +# +# It is based upon the autocreated makefile for Sphinx documentation. + +# You can set these variables from the command line, and also +# from the environment for the first two. +SPHINXOPTS ?= -a +SPHINXBUILD ?= sphinx-build +SPHINXAPIDOC ?= sphinx-apidoc +PY_BASEDIR = ../newcrawler +SOURCEDIR = . +BUILDDIR = ../../build/doc + + +.PHONY: doc-help Makefile + +# Put it first so that "make" without argument is like "make help". +doc-help: + @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +# Catch-all target: route all unknown targets to Sphinx using the new +# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). +%: Makefile apidoc + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +apidoc: + @$(SPHINXAPIDOC) -o _apidoc --separate $(PY_BASEDIR) diff --git a/src/doc/conf.py b/src/doc/conf.py new file mode 100644 index 0000000000000000000000000000000000000000..75731285a77f8a30fcb4bfc6be0483c4bba0052a --- /dev/null +++ b/src/doc/conf.py @@ -0,0 +1,217 @@ +# -*- coding: utf-8 -*- +# +# Configuration file for the Sphinx documentation builder. +# +# Based on the configuration for caosdb-pylib. +# +# # Copyright (C) 2021 Alexander Schlemmer <alexander.schlemmer@ds.mpg.de> +# +# This file only contains a selection of the most common options. For a full +# list see the documentation: +# https://www.sphinx-doc.org/en/master/usage/configuration.html + +# -- Path setup -------------------------------------------------------------- + +# If extensions (or modules to document with autodoc) are in another directory, add these +# directories to sys.path here. This is particularly necessary if this package is installed at a +# different version, for example via `pip install`. +# +# If the directory is relative to the documentation root, use os.path.abspath to make it absolute, +# like shown here. +# +import os +import sys +sys.path.insert(0, os.path.abspath('..')) + +import sphinx_rtd_theme # noqa: E402 + + +# -- Project information ----------------------------------------------------- + +project = 'caosdb-newcrawler' +copyright = '2021, MPIDS' +author = 'Alexander Schlemmer' + +# The short X.Y version +version = '0.1' +# The full version, including alpha/beta/rc tags +# release = '0.5.2-rc2' +release = '0.1' + + +# -- General configuration --------------------------------------------------- + +# If your documentation needs a minimal Sphinx version, state it here. +# +# needs_sphinx = '1.0' + +# Add any Sphinx extension module names here, as strings. They can be +# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom +# ones. +extensions = [ + 'sphinx.ext.autodoc', + 'sphinx.ext.autosectionlabel', + 'sphinx.ext.intersphinx', + 'sphinx.ext.napoleon', # For Google style docstrings + "sphinx_rtd_theme", +] + +# Add any paths that contain templates here, relative to this directory. +templates_path = ['_templates'] + +# The suffix(es) of source filenames. +# You can specify multiple suffix as a list of string: +source_suffix = ['.rst'] + +# The master toctree document. +master_doc = 'index' + +# The language for content autogenerated by Sphinx. Refer to documentation +# for a list of supported languages. +# +# This is also used if you do content translation via gettext catalogs. +# Usually you set "language" from the command line for these cases. +language = None + +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. +# This pattern also affects html_static_path and html_extra_path. +exclude_patterns = [] + +# The name of the Pygments (syntax highlighting) style to use. +pygments_style = None + + +# -- Options for HTML output ------------------------------------------------- + +# The theme to use for HTML and HTML Help pages. See the documentation for +# a list of builtin themes. +# + +html_theme = "sphinx_rtd_theme" + +# Theme options are theme-specific and customize the look and feel of a theme +# further. For a list of options available for each theme, see the +# documentation. +# +# html_theme_options = {} + +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. They are copied after the builtin static files, +# so a file named "default.css" will overwrite the builtin "default.css". +html_static_path = ['_static'] + +# Custom sidebar templates, must be a dictionary that maps document names +# to template names. +# +# The default sidebars (for documents that don't match any pattern) are +# defined by theme itself. Builtin themes are using these templates by +# default: ``['localtoc.html', 'relations.html', 'sourcelink.html', +# 'searchbox.html']``. +# +# html_sidebars = {} + + +# -- Options for HTMLHelp output --------------------------------------------- + +# Output file base name for HTML help builder. +htmlhelp_basename = 'caosdb-newcrawlerdoc' + + +# -- Options for LaTeX output ------------------------------------------------ + +latex_elements = { + # The paper size ('letterpaper' or 'a4paper'). + # + # 'papersize': 'letterpaper', + + # The font size ('10pt', '11pt' or '12pt'). + # + # 'pointsize': '10pt', + + # Additional stuff for the LaTeX preamble. + # + # 'preamble': '', + + # Latex figure (float) alignment + # + # 'figure_align': 'htbp', +} + +# Grouping the document tree into LaTeX files. List of tuples +# (source start file, target name, title, +# author, documentclass [howto, manual, or own class]). +latex_documents = [ + (master_doc, 'caosdb-newcrawler.tex', 'caosdb-newcrawler Documentation', + 'MPIDS', 'manual'), +] + + +# -- Options for manual page output ------------------------------------------ + +# One entry per manual page. List of tuples +# (source start file, name, description, authors, manual section). +man_pages = [ + (master_doc, 'caosdb-newcrawler', 'caosdb-newcrawler documentation', + [author], 1) +] + + +# -- Options for Texinfo output ---------------------------------------------- + +# Grouping the document tree into Texinfo files. List of tuples +# (source start file, target name, title, author, +# dir menu entry, description, category) +texinfo_documents = [ + (master_doc, 'caosdb-newcrawler', 'caosdb-newcrawler documentation', + author, 'caosdb-newcrawler', 'One line description of project.', + 'Miscellaneous'), +] + + +# -- Options for Epub output ------------------------------------------------- + +# Bibliographic Dublin Core info. +epub_title = project + +# The unique identifier of the text. This can be a ISBN number +# or the project homepage. +# +# epub_identifier = '' + +# A unique identification for the text. +# +# epub_uid = '' + +# A list of files that should not be packed into the epub file. +epub_exclude_files = ['search.html'] + + +# -- Extension configuration ------------------------------------------------- + +# True to prefix each section label with the name of the document it is in, followed by a colon. For +# example, index:Introduction for a section called Introduction that appears in document +# index.rst. Useful for avoiding ambiguity when the same section heading appears in different +# documents. +# +# Note: This stops "normal" links from working, so it should be kept at False. +# autosectionlabel_prefix_document = True + +# -- Options for intersphinx ------------------------------------------------- + +# https://www.sphinx-doc.org/en/master/usage/extensions/intersphinx.html#confval-intersphinx_mapping +intersphinx_mapping = { + "python": ("https://docs.python.org/", None), + "caosdb-mysqlbackend": ("https://docs.indiscale.com/caosdb-mysqlbackend/", + None), + "caosdb-server": ("https://docs.indiscale.com/caosdb-server/", None), + "caosdb-pylib": ("https://docs.indiscale.com/caosdb-pylib/", None), + "caosdb-advanced-user-tools": ("https://docs.indiscale.com/caosdb-advanced-user-tools/", None), +} + + +# TODO Which options do we want? +autodoc_default_options = { + 'members': None, + 'undoc-members': None, +} diff --git a/src/doc/index.rst b/src/doc/index.rst new file mode 100644 index 0000000000000000000000000000000000000000..5579b93fd406c0169c410b8a0d237586df504a36 --- /dev/null +++ b/src/doc/index.rst @@ -0,0 +1,75 @@ +Crawler 2.0 Documentation +========================= + +Introduction +------------ + +The crawler is the main date integration tool for CaosDB. +Its task is to automatically synchronize data found on file systems or in other +sources of data with the semantic data model of CaosDB. + +More specifically, data that is contained in a hierarchical structure is converted to a data +structure that is consistent with a predefined semantic data model. + +The hierarchical sturcture can be for example a file tree. However it can be +also something different like the contents of a json file or a file tree with +json files. + +Concepts +-------- + +Structure Elements +++++++++++++++++++ + +This hierarchical structure is assumed to be consituted of a tree of +StructureElements. The tree is created on the fly by so called Converters which +are defined in a yaml file. The tree of StructureElements is a model +of the existing data (For example could a tree of Python file objects +(StructureElements) represent a file tree that exists on some file server). + +Relevant sources in: +src/structure_elements.py + +Converters +++++++++++ + +Converters treat StructureElements and thereby create the StructureElement that +are the children of the treated StructureElement. Converters therefore create +the above named tree. The definition of a Converter also contains what +Converters shall be used to treat the generated child-StructureElements. The +definition is there a tree itself. (Question: Should there be global Converters +that are always checked when treating a StructureElement? Should Converters be +associated with generated child-StructureElements? Currently, all children are +created and checked against all Converters. It could be that one would like to +check file-StructureElements against one set of Converters and +directory-StructureElements against another) + +Each StructureElement in the tree has a set of data values, i.e a dictionary of +key value pairs. +Some of those values are set due to the kind of StructureElement. For example, +a file could have the file name as such a key value pair: 'filename': <sth>. +Converters may define additional functions that create further values. For +example, a regular expresion could be used to get a date from a file name. + +Relevant sources in: +src/converters.py + +Identifiables ++++++++++++++ + +Relevant sources in: +src/identifiable_adapters.py + +The Crawler ++++++++++++ + +The crawler can be considered the main program doing the synchronization in basically two steps: +1. Based on a yaml-specification scan the file system (or other sources) and create a set + of CaosDB Entities that are supposed to be inserted or updated in a CaosDB instance. +2. Compare the current state of the CaosDB instance with the set of CaosDB Entities created in + step 1, taking into account the :ref:`registered identifiables<Identifiables>`. Insert or + update entites accordingly. + +Relevant sources in: +src/crawl.py + diff --git a/src/newcrawler/crawl.py b/src/newcrawler/crawl.py index 141558f3c64f0aefa9289dd95bf880e761b486f7..b448dd9169ebd3291b0b0a8b5602d4329025d202 100644 --- a/src/newcrawler/crawl.py +++ b/src/newcrawler/crawl.py @@ -23,41 +23,6 @@ # ** end header # -""" -Data that is contained in a hierarchical structure is converted to a data -structure that is consistent with a predefined semantic data model. - -The hierarchical sturcture can be for example a file tree. However it can be -also something different like the contents of a json file or a file tree with -json files. - - -This hierarchical structure is assumed to be consituted of a tree of -StructureElements. The tree is created on the fly by so called Converters which -are defined in a yaml file. The tree of StructureElements is a model -of the existing data (For example could a tree of Python file objects -(StructureElements) represent a file tree that exists on some file server). - -Converters treat StructureElements and thereby create the StructureElement that -are the children of the treated StructureElement. Converters therefore create -the above named tree. The definition of a Converter also contains what -Converters shall be used to treat the generated child-StructureElements. The -definition is there a tree itself. (Question: Should there be global Converters -that are always checked when treating a StructureElement? Should Converters be -associated with generated child-StructureElements? Currently, all children are -created and checked against all Converters. It could be that one would like to -check file-StructureElements against one set of Converters and -directory-StructureElements against another) - -Each StructureElement in the tree has a set of data values, i.e a dictionary of -key value pairs. -Some of those values are set due to the kind of StructureElement. For example, -a file could have the file name as such a key value pair: 'filename': <sth>. -Converters may define additional functions that create further values. For -example, a regular expresion could be used to get a date from a file name. - - -""" import sys import os