Commit 3f1cfe20 authored by Florian Spreckelsen

Merge branch 'f-small-doc-fixes' into 'dev'

Documentation: many small changes

See merge request !129
parents 32f4bac0 f4906bfc
Related merge requests: !162 "DOC WIP: Tutorial: Single structured file", !129 "Documentation: many small changes"
Pipeline #49018 passed
@@ -17,6 +17,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
### Fixed ###
+ * `crawler_main` doesn't need the deprecated `debug=True` anymore to put out a
+   provenance file if the `provenance_file` parameter is provided.
### Security ###
### Documentation ###
......
@@ -389,8 +389,8 @@ class Converter(object, metaclass=ABCMeta):
Extract information from the structure element and store them as values in the
general store.
- Parameters:
- ------------
+ Parameters
+ ----------
values: GeneralStore
The GeneralStore to store values in.
@@ -409,8 +409,8 @@ class Converter(object, metaclass=ABCMeta):
Check if transformers are defined using the "transform" keyword.
Then apply the transformers to the variables defined in GeneralStore "values".
- Parameters:
- ------------
+ Parameters
+ ----------
values: GeneralStore
The GeneralStore to store values in.
@@ -765,6 +765,12 @@ schema_resource:
class DictElementConverter(Converter):
"""
**Operates on:** :py:class:`caoscrawler.structure_elements.DictElement`
**Generates:** :py:class:`caoscrawler.structure_elements.StructureElement`
"""
def create_children(self, generalStore: GeneralStore, element: StructureElement):
# TODO: See comment on types and inheritance
if not isinstance(element, DictElement):
@@ -1154,6 +1160,12 @@ class TableConverter(Converter):
class XLSXTableConverter(TableConverter):
"""
**Operates on:** :py:class:`caoscrawler.structure_elements.File`
**Generates:** :py:class:`caoscrawler.structure_elements.DictElement`
"""
def get_options(self):
return self._get_options([
("sheet_name", str),
......
@@ -1504,7 +1504,7 @@ def crawler_main(crawled_directory_path: str,
dry_run: bool = False,
prefix: str = "",
securityMode: SecurityMode = SecurityMode.UPDATE,
- unique_names=True,
+ unique_names: bool = True,
restricted_path: Optional[list[str]] = None,
remove_prefix: Optional[str] = None,
add_prefix: Optional[str] = None,
@@ -1520,9 +1520,9 @@ def crawler_main(crawled_directory_path: str,
identifiables_definition_file : str
filename of an identifiable definition yaml file
debug : bool
- DEPRECATED, whether or not to run in debug mode
+ DEPRECATED, use a provenance file instead.
provenance_file : str
- provenance information will be stored in a file with given filename
+ Provenance information will be stored in a file with given filename
dry_run : bool
do not commit any changes to the server
prefix : str
@@ -1562,7 +1562,7 @@ def crawler_main(crawled_directory_path: str,
_fix_file_paths(crawled_data, add_prefix, remove_prefix)
_check_record_types(crawled_data)
- if provenance_file is not None and debug:
+ if provenance_file is not None:
crawler.save_debug_data(debug_tree=debug_tree, filename=provenance_file)
if identifiables_definition_file is not None:
......
@@ -25,7 +25,9 @@
#
"""
- This is the scanner, the original "_crawl" function from crawl.py.
+ This is the scanner.
+
+ This was where formerly the ``_crawl(...)`` function from ``crawl.py`` was located.
+ This is just the functionality that extracts data from the file system.
"""
@@ -257,31 +259,31 @@ def scanner(items: list[StructureElement],
restricted_path: Optional[list[str]] = None,
crawled_data: Optional[list[db.Record]] = None,
debug_tree: Optional[DebugTree] = None,
- registered_transformer_functions: Optional[dict] = None):
+ registered_transformer_functions: Optional[dict] = None) -> list[db.Record]:
"""Crawl a list of StructureElements and apply any matching converters.
- Formerly known as "_crawl".
+ Formerly known as ``_crawl(...)``.
Parameters
----------
- items:
+ items: list[StructureElement]
  structure_elements (e.g. files and folders on one level on the hierarchy)
- converters:
+ converters: list[Converter]
  locally defined converters for treating structure elements. A locally
  defined converter could be one that is only valid for a specific subtree
  of the originally crawled StructureElement structure.
- general_store, record_store:
+ general_store, record_store: GeneralStore, RecordStore, optional
  This recursion of the crawl function should only operate on copies of
  the global stores of the Crawler object.
- restricted_path : list of strings, optional
+ restricted_path : list[str], optional
  traverse the data tree only along the given path. For example, when a
- directory contains files a, b and c and b is given as restricted_path, a
- and c will be ignroed by the crawler. When the end of the given path is
+ directory contains files a, b and c, and b is given as ``restricted_path``, a
+ and c will be ignored by the crawler. When the end of the given path is
  reached, traverse the full tree as normal. The first element of the list
- provided by restricted_path should be the name of the StructureElement
+ provided by ``restricted_path`` should be the name of the StructureElement
at this level, i.e. denoting the respective element in the items
argument.
@@ -292,7 +294,8 @@ def scanner(items: list[StructureElement],
Each function is a dictionary:
- - The key is the name of the function to be looked up in the dictionary of registered transformer functions.
+ - The key is the name of the function to be looked up in the dictionary of registered
+   transformer functions.
- The value is the function which needs to be of the form:
def func(in_value: Any, in_parameters: dict) -> Any:
pass
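For illustration, here is a hedged sketch of a transformer function of the form documented above, together with a matching registration dict; the function name and the ``prefix`` parameter key are illustrative, not part of the package:

.. code-block:: python

    from typing import Any

    def add_prefix(in_value: Any, in_parameters: dict) -> Any:
        # Prepend a configurable prefix to the incoming value.
        return in_parameters.get("prefix", "") + str(in_value)

    # The keys are the names referenced via the "transform" keyword in a cfood:
    registered_transformer_functions = {"add_prefix": add_prefix}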
@@ -457,7 +460,8 @@ def scan_structure_elements(items: Union[list[StructureElement], StructureElemen
converter_registry: dict,
restricted_path: Optional[list[str]] = None,
debug_tree: Optional[DebugTree] = None,
- registered_transformer_functions: Optional[dict] = None):
+ registered_transformer_functions: Optional[dict] = None) -> (
+     list[db.Record]):
"""
Start point of the crawler recursion.
@@ -471,14 +475,14 @@ def scan_structure_elements(items: Union[list[StructureElement], StructureElemen
crawler_definition : dict
A dictionary representing the crawler definition, possibly from a yaml
file.
- restricted_path: optional, list of strings
+ restricted_path: list[str], optional
Traverse the data tree only along the given path. When the end of the
given path is reached, traverse the full tree as normal. See docstring
of 'scanner' for more details.
Returns
-------
- crawled_data : list
+ crawled_data : list[db.Record]
the final list with the target state of Records.
"""
......
@@ -28,9 +28,16 @@ import warnings
class StructureElement(object):
""" base class for elements in the hierarchical data structure """
"""Base class for elements in the hierarchical data structure.
def __init__(self, name):
Parameters
----------
name: str
The name of the StructureElement. May be used for pattern matching by CFood rules.
"""
def __init__(self, name: str):
# Used to store usage information for debugging:
self.metadata: tDict[str, set[str]] = {
"usage": set()
@@ -46,6 +53,18 @@ class StructureElement(object):
class FileSystemStructureElement(StructureElement):
"""StructureElement representing an element of a file system, like a directory or a simple file.
+ Parameters
+ ----------
+ name: str
+     The name of the StructureElement. May be used for pattern matching by CFood rules.
+ path: str
+     The path to the file or directory.
+ """
def __init__(self, name: str, path: str):
super().__init__(name)
self.path = path
@@ -65,6 +84,7 @@ class Directory(FileSystemStructureElement):
class File(FileSystemStructureElement):
"""StrutureElement representing a file."""
pass
......
Concepts
- ))))))))
+ ========
- The CaosDB Crawler can handle any kind of hierarchical data structure. The typical use case is
+ The CaosDB Crawler can handle any kind of hierarchical data structure. The typical use case is a
directory tree that is traversed. We use the following terms/concepts to describe how the CaosDB
Crawler works.
Structure Elements
++++++++++++++++++
- This hierarchical structure is assumed to be consituted of a tree of
- StructureElements. The tree is created on the fly by so called Converters which
- are defined in a yaml file. The tree of StructureElements is a model
- of the existing data (For example could a tree of Python file objects
- (StructureElements) represent a file tree that exists on some file server).
+ The crawled hierarchical structure is represented by a tree of *StructureElements*. This tree is
+ generated on the fly by so called Converters which are defined in a yaml file (usually called
+ ``cfood.yml``). This generated tree of StructureElements is a model of the existing data. For
+ example a tree of Python *file objects* (StructureElements) could correspond to a file system tree.
Relevant sources in:
@@ -23,29 +22,28 @@ Relevant sources in:
Converters
++++++++++
- Converters treat StructureElements and thereby create the StructureElement that
- are the children of the treated StructureElement. Converters therefore create
- the above named tree. The definition of a Converter also contains what
- Converters shall be used to treat the generated child-StructureElements. The
- definition is therefore a tree itself.
- See :std:doc:`converters<converters>` for details.
+ Converters treat a StructureElement and during this process create a number of new
+ StructureElements: the children of the initially treated StructureElement. Thus by treatment of
+ existing StructureElements, Converters create a tree of StructureElements.
+
+ .. image:: img/converter.png
+    :height: 170
+
+ See :std:doc:`converters<converters>` for details.

Relevant sources in:

- ``src/converters.py``
Identifiables
+++++++++++++
- An Identifiable of a Record is like the fingerprint of a Record.
+ An *Identifiable* of a Record is like the fingerprint of a Record.

- The identifiable contains the information that is used by the CaosDB Crawler to identify Records.
- For example, in order to check whether a Record exits in the CaosDB Server, the CaosDB Crawler creates a query
- using the information contained in the Identifiable.
+ The Identifiable contains the information that is used by the CaosDB Crawler to identify Records.
+ For example, the CaosDB Crawler may create a query using the information contained in the
+ Identifiable in order to check whether a Record exists in the CaosDB Server.
Suppose a certain experiment is at most done once per day, then the identifiable could
consist of the RecordType "SomeExperiment" (as a parent) and the Property "date" with the respective value.
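Expressed in the yaml format used for identifiable definitions later in this documentation, this example (with the illustrative RecordType ``SomeExperiment``) could read:

.. code-block:: yaml

    SomeExperiment:
      - date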
@@ -100,7 +98,9 @@ The Crawler
+++++++++++
The crawler can be considered the main program doing the synchronization in basically two steps:
#. Based on a yaml-specification scan the file system (or other sources) and create a set of CaosDB Entities that are supposed to be inserted or updated in a CaosDB instance.
#. Compare the current state of the CaosDB instance with the set of CaosDB Entities created in step 1, taking into account the :ref:`registered identifiables<Identifiables>`. Insert or update entities accordingly.
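Both steps can be triggered through ``crawler_main``, whose signature appears in one of the diffs above. A minimal sketch; the name of the cfood argument is an assumption, since it is not shown in that hunk:

.. code-block:: python

    from caoscrawler.crawl import crawler_main

    crawler_main(
        crawled_directory_path="/data/experiments",         # step 1: scan this tree
        cfood_file_name="cfood.yml",                        # assumed parameter name
        identifiables_definition_file="identifiables.yml",  # used for step 2
        provenance_file="provenance.yml",
        dry_run=True,                                       # compare only, commit no changes
    )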
Relevant sources in:
......
Converters
))))))))))
- Converters treat StructureElements and thereby create the StructureElement that
- are the children of the treated StructureElement. Converters therefore create
- the tree of structure elements. The definition of a Converter also contains what
- Converters shall be used to treat the generated child-StructureElements. The
- definition is therefore a tree itself.
+ Converters treat a StructureElement and during this process create a number of new
+ StructureElements: the children of the initially treated StructureElement. Thus by treatment of
+ existing StructureElements, Converters create a tree of StructureElements.
+
+ .. image:: img/converter.png
+    :height: 170
+
+ The ``cfood.yml`` definition also describes which
+ Converters shall be used to treat the generated child StructureElements. The
+ definition therefore itself also defines a tree.

- Each StructureElement in the tree has a set of data values, i.e a dictionary of
- key value pairs.
- Some of those values are set due to the kind of StructureElement. For example,
- a file could have the file name as such a key value pair: 'filename': <sth>.
+ Each StructureElement in the tree has a set of properties, organized as
+ key-value pairs.
+ Some of those properties are specified by the type of StructureElement. For example,
+ a file could have the file name as property: ``'filename': myfile.dat``.

Converters may define additional functions that create further values. For
- example, a regular expresion could be used to get a date from a file name.
+ example, a regular expression could be used to get a date from a file name.
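For example, a definition along the following lines (a sketch only; the ``SimpleFile`` converter type and all names are illustrative) could capture a date from file names such as ``measurement_2024-01-31.dat``:

.. code-block:: yaml

    Datafile:
      type: SimpleFile
      match: measurement_(?P<date>\d{4}-\d{2}-\d{2})\.dat
      records:
        Measurement:
          date: $date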
A converter is defined via a yml file or part of it. The definition states
@@ -20,7 +25,7 @@ what kind of StructureElement it treats (typically one).
Also, it defines how children of the current StructureElement are
created and what Converters shall be used to treat those.
- The yaml definition looks like the following:
+ The yaml definition may look like this:
TODO: outdated, see cfood-schema.yml
@@ -53,8 +58,9 @@ to generate records (see :py:meth:`~caoscrawler.converters.Converter.create_reco
**records** is a dict of definitions that define the semantic structure
(see details below).
- Subtree contains a list of Converter defnitions that look like the one
- described here.
+ **subtree** makes the yaml recursive: It contains a list of new Converter
+ definitions, which work on the StructureElements that are returned by the
+ current Converter.
Transform Functions
+++++++++++++++++++
@@ -108,6 +114,9 @@ them to the cfood definition (see :doc:`CFood Documentation<cfood>`).
Standard Converters
+++++++++++++++++++
+ These are the standard converters that exist in a default installation. For writing and applying
+ *custom converters*, see :ref:`below <Custom Converters>`.
Directory Converter
===================
The Directory Converter creates StructureElements for each File and Directory
@@ -126,11 +135,14 @@ children elements according to the structure of the header.
DictElement Converter
=====================
+ DictElement → StructureElement
Creates a child StructureElement for each key in the dictionary.
Typical Subtree converters
--------------------------
- The following StructureElement are typically created:
+ The following StructureElement types are typically created by the DictElement converter:
- BooleanElement
- FloatElement
@@ -155,12 +167,12 @@ behavior can be adjusted with the fields `accept_text`, `accept_int`,
The following denotes what kind of StructureElements are accepted by default
(they are defined in `src/caoscrawler/converters.py`):
- - DictBooleanElementConverter: bool, int
- - DictFloatElementConverter: int, float
- - DictTextElementConverter: text, bool, int, float
- - DictIntegerElementConverter: int
- - DictListElementConverter: list
- - DictDictElementConverter: dict
+ - BooleanElementConverter: bool, int
+ - FloatElementConverter: int, float
+ - TextElementConverter: text, bool, int, float
+ - IntegerElementConverter: int
+ - ListElementConverter: list
+ - DictElementConverter: dict
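As a sketch of adjusting this behavior with the fields mentioned above, a ``TextElement`` converter could be told not to treat numbers as text; the ``accept_*`` keys beyond those named in the prose are assumptions:

.. code-block:: yaml

    COMMENT:
      type: TextElement
      match_name: comment
      match_value: (?P<comment>.*)
      accept_int: false    # do not accept integer values as text
      accept_float: false  # assumed analogous key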
YAMLFileConverter
=================
@@ -180,11 +192,13 @@ JSONFileConverter
TableConverter
==============
+ Table → DictElement
A generic converter (abstract) for files containing tables.
- Currently, there are two specialized implementations for xlsx-files and csv-files.
+ Currently, there are two specialized implementations for XLSX files and CSV files.

- All table converters generate a subtree that can be converted with DictDictElementConverters:
- For each row in the table a DictDictElement (structure element) is generated. The key of the
+ All table converters generate a subtree of dicts, which in turn can be converted with DictElementConverters:
+ For each row in the table the TableConverter generates a DictElement (structure element). The key of the
element is the row number. The value of the element is a dict containing the mapping of
column names to values of the respective cell.
@@ -193,21 +207,21 @@ Example:
.. code-block:: yaml
subtree:
- TABLE:
+ TABLE: # Any name for the table as a whole
type: CSVTableConverter
match: ^test_table.csv$
records:
(...) # Records edited for the whole table file
subtree:
- ROW:
- type: DictDictElement
+ ROW: # Any name for a data row in the table
+ type: DictElement
match_name: .*
match_value: .*
records:
(...) # Records edited for each row
subtree:
- COLUMN:
- type: DictFloatElement
+ COLUMN: # Any name for a specific type of column in the table
+ type: FloatElement
match_name: measurement # Name of the column in the table file
match_value: (?P<column_value>.*)
records:
@@ -217,9 +231,13 @@ Example:
XLSXTableConverter
==================
+ XLSX File → DictElement
CSVTableConverter
=================
+ CSV File → DictElement
Further converters
++++++++++++++++++
@@ -322,11 +340,15 @@ file in a text property, the name of which can be configured with the
Custom Converters
+++++++++++++++++
- It was previously mentioned that it is possible to create custom converters.
+ As mentioned before it is possible to create custom converters.
These custom converters can be used to integrate arbitrary data extraction and ETL capabilities
- into the caosdb-crawler and make these extensions available to any yaml specification.
+ into the LinkAhead crawler and make these extensions available to any yaml specification.
Tell the crawler about a custom converter
=========================================
- The basic syntax for adding a custom converter to a yaml cfood definition file is:
+ To use a custom converter, it must be defined in the ``Converters`` section of the CFood yaml file.
+ The basic syntax for adding a custom converter to a definition file is:
.. code-block:: yaml
@@ -335,7 +357,7 @@ The basic syntax for adding a custom converter to a yaml cfood definition file i
package: <python>.<module>.<name>
converter: <PythonClassName>
- The Converters-section can be either put into the first or second document of the cfood yaml file.
+ The Converters section can be either put into the first or the second document of the cfood yaml file.
It can be also part of a single-document yaml cfood file. Please refer to :doc:`the cfood documentation<cfood>` for more details.
Details:
@@ -344,9 +366,16 @@ Details:
- **<python>.<module>.<name>**: The name of the module where the converter class resides.
- **<PythonClassName>**: Within this specified module there must be a class inheriting from base class :py:class:`caoscrawler.converters.Converter`.
Implementing a custom converter
===============================
Converters inherit from the :py:class:`~caoscrawler.converters.Converter` class.
The following methods are abstract and need to be overwritten by your custom converter to make it work:
- - :py:meth:`~caoscrawler.converters.Converter.create_children`
+ :py:meth:`~caoscrawler.converters.Converter.create_children`:
+     Return a list of child StructureElement objects.
- :py:meth:`~caoscrawler.converters.Converter.match`
- :py:meth:`~caoscrawler.converters.Converter.typecheck`
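A minimal sketch of such a custom converter; the class name, the ``.myformat`` suffix, and the exact import paths are illustrative assumptions, while the method signatures follow the base class as shown in the converters diff above:

.. code-block:: python

    from caoscrawler.converters import Converter
    from caoscrawler.stores import GeneralStore
    from caoscrawler.structure_elements import File, StructureElement, TextElement


    class MyFormatConverter(Converter):
        """Treats files with a hypothetical ``.myformat`` suffix."""

        def create_children(self, generalStore: GeneralStore, element: StructureElement):
            # Return a list of child StructureElements extracted from the file.
            with open(element.path) as f:
                return [TextElement("content", f.read())]

        def match(self, element: StructureElement):
            # Return a dict of matched groups on success, None otherwise.
            return {} if element.name.endswith(".myformat") else None

        def typecheck(self, element: StructureElement):
            # This converter only operates on File structure elements.
            return isinstance(element, File)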
......
Further reading
===============
- A simple `documented example <https://gitlab.com/caosdb/documented-crawler-example>`_ which
demonstrates the crawler usage.
- Some useful examples can be found in the `integration tests
<https://gitlab.com/caosdb/caosdb-crawler/-/tree/main/integrationtests>`_ (and to a certain extent
in the unit tests).
@@ -10,6 +10,7 @@ Getting Started
prerequisites
helloworld
optionalfeatures
+ furtherreading
This section will help you get going! From the first installation steps to the first simple crawl.
......
src/doc/img/converter.png: new image (45.9 KiB)
Macros
------
- Macros highly facilitate the writing of complex :doc:`CFoods<cfood>`. Consider the following prevalent example:
+ Macros highly facilitate the writing of complex :doc:`CFoods<cfood>`. Consider the following common
+ example:
.. _example_files:
.. code-block:: yaml
......
@@ -9,4 +9,4 @@ This chapter contains a collection of tutorials.
Parameter File<parameterfile>
Scientific Data Folder<scifolder>
WIP: Single Structured File <single_file>
WIP Tutorial: Single structured file
====================================
.. warning::
This tutorial is still work in progress. It may be better than nothing, but it is still
incomplete and probably contains serious errors.
Use at your own risk.
In this tutorial, we will create a crawler that reads a single structured file, such as an XLSX
file.
Declarations
------------
``identifiables.yml``
.. code-block:: yaml
Präventionsmaßnahme:
- Organisation
- titel
- Laufzeit
``cfood.yml``
.. code-block:: yaml
---
metadata:
crawler-version: 0.6.1
---
Präventionsmaßnahme der Organisation: # An Excel file with prevention measures (Präventionsmaßnahmen)
type: XLSXTableConverter
match: ".*xlsx$" # Any xlsx file.
subtree:
Maßnahme: # One row in the file
type: DictElement
match_name: .*
match_value: .*
records:
Präventionsmaßnahme: # Records edited for each row
name: ""
subtree:
MaßnahmenArt: # Column with the type of the measure
type: IntegerElement
match_name: Art der Maßnahme # Name of the column in the table file
match_value: (?P<column_value>.*)
MaßnahmenTitel:
type: TextElement
match_name: Titel der Maßnahme # Name of the column in the table file
match_value: (?P<column_value>.*)
records: # Records edited for each cell
Präventionsmaßnahme:
titel: $column_value
Python code
-----------
.. code-block:: python
#!/usr/bin/env python3
# Crawler for Präventionsmaßnahmen (prevention measures)
#
# Copyright (C) 2023 IndiScale GmbH <info@indiscale.com>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
"""Crawler für Präventionsmaßnahmen"""
import argparse
from caoscrawler.scanner import load_definition, create_converter_registry, scan_structure_elements
from caoscrawler.structure_elements import File
def crawl_file(filename: str, dry_run: bool = False):
"""Read an XLSX file into a LinkAhead container.
Parameters
----------
filename : str
The name of the XLSX file.
dry_run : bool
If True, do not modify the database.
"""
definition = load_definition("cfood.yml")
converter_registry = create_converter_registry(definition)
records = scan_structure_elements(items=File(name="somename.xlsx", path=filename),
crawler_definition=definition,
converter_registry=converter_registry)
# WIP: drop into an interactive shell to inspect the scanned records;
# synchronization with the server (and the dry_run flag) is not implemented yet.
from IPython import embed
embed()
def _parse_arguments():
"""Parse the arguments."""
parser = argparse.ArgumentParser(description='Crawler for Präventionsmaßnahmen (prevention measures)')
parser.add_argument('-n', '--dry-run', help="Do not modify the database.", action="store_true")
parser.add_argument('xlsx_file', metavar="XLSX file", help="The xlsx file to be crawled.")
return parser.parse_args()
def main():
"""Main function."""
args = _parse_arguments()
crawl_file(args.xlsx_file, dry_run=args.dry_run)
if __name__ == '__main__':
main()