From c6f10a9ab3d377f03e34a3d3c63246d3137fb814 Mon Sep 17 00:00:00 2001 From: Alexander Schlemmer <alexander@mail-schlemmer.de> Date: Tue, 27 Sep 2022 14:17:45 +0200 Subject: [PATCH] DOC: documentation for custom crawlers --- src/doc/converters.rst | 163 ++++++++++++++++++++++++++++++++++++++++- src/doc/macros.rst | 2 +- 2 files changed, 162 insertions(+), 3 deletions(-) diff --git a/src/doc/converters.rst b/src/doc/converters.rst index 28b81947..99c0bbff 100644 --- a/src/doc/converters.rst +++ b/src/doc/converters.rst @@ -43,7 +43,7 @@ JSONFileConverter TextElementConverter TableConverter -================= +============== A generic converter (abstract) for files containing tables. Currently, there are two specialized implementations for xlsx-files and csv-files. @@ -80,7 +80,7 @@ Example: XLSXTableConverter -================= +================== CSVTableConverter ================= @@ -88,3 +88,162 @@ CSVTableConverter Custom Converters +++++++++++++++++ +It was previously mentioned that it is possible to create custom converters. +These custom converters can be used to integrate arbitrary data extraction and ETL capabilities +into the caosdb-crawler and make these extensions available to any yaml specification. + +The basic syntax for adding a custom converter to a yaml cfood definition file is: + +.. code-block:: yaml + + Converters: + <NameOfTheConverterInYamlFile>: + package: <python>.<module>.<name> + converter: <PythonClassName> + +The Converters-section can be either put into the first or second document of the cfood yaml file. +It can be also part of a single-document yaml cfood file. Please refer to :doc:`the cfood documentation<cfood>` for more details. + +Details: + +- **<NameOfTheConverterInYamlFile>**: This is the name of the converter as it is going to be used in the present yaml file. +- **<python>.<module>.<name>**: The name of the module where the converter class resides. +- **<PythonClassName>**: Within this specified module there must be a class inheriting from base class :py:class:`caoscrawler.converters.Converter`. + +The following methods are abstract and need to be overwritten by your custom converter to make it work: + +- :py:meth:`~caoscrawler.converters.Converter.create_children` +- :py:meth:`~caoscrawler.converters.Converter.match` +- :py:meth:`~caoscrawler.converters.Converter.typecheck` + + +Example +======= + +In the following, we will explain the process of adding a custom converter to a yaml file using +a SourceResolver that is able to attach a source element to another entity. + +**Note**: This example might become a standard crawler soon, as part of the scifolder specification. See https://doi.org/10.3390/data5020043 for details. In this documentation example we will, therefore, add it to a package called "scifolder". + +First we will create our package and module structure, which might be: +.. code-block:: + + scifolder_package/ + README.md + setup.cfg + setup.py + Makefile + tox.ini + src/ + scifolder/ + __init__.py + converters/ + __init__.py + sources.py # <- the actual file containing + # the converter class + doc/ + unittests/ + +Now we need to create a class called "SourceResolver" in the file "sources.py". In this - more advanced - example, we will not inherit our converter directly from :py:class:`~caoscrawler.converters.Converter`, but use :py:class:`~caoscrawler.converters.TextElementConverter`. The latter already implements :py:meth:`~caoscrawler.converters.Converter.match` and :py:meth:`~caoscrawler.converters.Converter.typecheck`, so only an implementation for :py:meth:`~caoscrawler.converters.Converter.create_children` has to be provided by us. +Furthermore we will customize the method :py:meth:`~caoscrawler.converters.Converter.create_records` that allows us to specify a more complex record generation procedure than provided in the standard implementation. One specific limitation of the standard implementation is, that only a fixed +number of records can be generated by the yaml definition. So for any applications - like here - that require an arbitrary number of records to be created, a customized implementation of :py:meth:`~caoscrawler.converters.Converter.create_records` is recommended. +In this context it is recommended to make use of the function :func:`caoscrawler.converters.create_records` that implements creation of record objects from python dictionaries of the same structure +that would be given using a yaml definition. + +.. code-block:: python + + import re + from caoscrawler.stores import GeneralStore, RecordStore + from caoscrawler.converters import TextElementConverter, create_records + from caoscrawler.structure_elements import StructureElement, TextElement + + + class SourceResolver(TextElementConverter): + """ + This resolver uses a source list element (e.g. from the markdown readme file) + to link sources correctly. + """ + + def __init__(self, definition: dict, name: str, + converter_registry: dict): + """ + Initialize a new directory converter. + """ + super().__init__(definition, name, converter_registry) + + def create_children(self, generalStore: GeneralStore, + element: StructureElement): + + # The source resolver does not create children: + + return [] + + def create_records(self, values: GeneralStore, + records: RecordStore, + element: StructureElement, + file_path_prefix): + if not isinstance(element, TextElement): + raise RuntimeError() + + # This function must return a list containing tuples, each one for a modified + # property: (name_of_entity, name_of_property) + keys_modified = [] + + # This is the name of the entity where the source is going to be attached: + attach_to_scientific_activity = self.definition["scientific_activity"] + rec = records[attach_to_scientific_activity] + + # The "source" is a path to a source project, so it should have the form: + # /<Category>/<project>/<scientific_activity>/ + # obtain these information from the structure element: + val = element.value + regexp = (r'/(?P<category>(SimulationData)|(ExperimentalData)|(DataAnalysis))' + '/(?P<project_date>.*?)_(?P<project_identifier>.*)' + '/(?P<date>[0-9]{4,4}-[0-9]{2,2}-[0-9]{2,2})(_(?P<identifier>.*))?/') + + res = re.match(regexp, val) + if res is None: + raise RuntimeError("Source cannot be parsed correctly.") + + # Mapping of categories on the file system to corresponding record types in CaosDB: + cat_map = { + "SimulationData": "Simulation", + "ExperimentalData": "Experiment", + "DataAnalysis": "DataAnalysis"} + linkrt = cat_map[res.group("category")] + + keys_modified.extend(create_records(values, records, { + "Project": { + "date": res.group("project_date"), + "identifier": res.group("project_identifier"), + }, + linkrt: { + "date": res.group("date"), + "identifier": res.group("identifier"), + "project": "$Project" + }, + attach_to_scientific_activity: { + "sources": "+$" + linkrt + }}, file_path_prefix)) + + # Process the records section of the yaml definition: + keys_modified.extend( + super().create_records(values, records, element, file_path_prefix)) + + # The create_records function must return the modified keys to make it compatible + # to the crawler functions: + return keys_modified + + +If the recommended (python) package structure is used, the package containing the converter +definition can just be installed using `pip install .` or `pip install -e .` from the +`scifolder_package` directory. + +The following yaml block will register the converter in a yaml file: + +.. code-block:: yaml + + Converters: + SourceResolver: + package: scifolder.converters.sources + converter: SourceResolver diff --git a/src/doc/macros.rst b/src/doc/macros.rst index d8e81946..3d995c1f 100644 --- a/src/doc/macros.rst +++ b/src/doc/macros.rst @@ -1,7 +1,7 @@ Macros ------ -Macros highly facilitate the writing of complex :doc:`CFoods<cfoods>`. Consider the following prevalent example: +Macros highly facilitate the writing of complex :doc:`CFoods<cfood>`. Consider the following prevalent example: .. _example_files: .. code-block:: yaml -- GitLab