diff --git a/.gitignore b/.gitignore index 9af5ee22fdd68c1c25e98614ab516bf4d384d577..5599d7d263c8927025e128c37eabb185025bf96b 100644 --- a/.gitignore +++ b/.gitignore @@ -14,3 +14,6 @@ provenance.yml *.tar.gz *.sql /integrationtests/test-profile/custom/other/cert/ +src/doc/_apidoc/ +start_caosdb_docker.sh +src/doc/_apidoc diff --git a/src/caoscrawler/converters.py b/src/caoscrawler/converters.py index bfe693cfde8446eb8e585d2575bb979e0f65965e..39a6bf2b987cb68ffffd359923c255c8af54839f 100644 --- a/src/caoscrawler/converters.py +++ b/src/caoscrawler/converters.py @@ -535,6 +535,7 @@ class DictConverter(Converter): return {} +# TODO: difference to SimpleFileConverter? Do we need both? class FileConverter(Converter): def typecheck(self, element: StructureElement): return isinstance(element, File) @@ -566,6 +567,8 @@ class JSONFileConverter(DictConverter): def create_children(self, generalStore: GeneralStore, element: StructureElement): if not self.typecheck(element): raise RuntimeError("A JSON file is needed to create children") + # TODO: either add explicit type check for File structure element here, + # or add a comment to suppress mypy type warning. with open(element.path, 'r') as json_file: json_data = json.load(json_file) if not isinstance(json_data, dict): diff --git a/src/doc/README_SETUP.md b/src/doc/README_SETUP.md new file mode 100644 index 0000000000000000000000000000000000000000..b6995c9a2d950ecd1e832d5b49dac9ed88a7e455 --- /dev/null +++ b/src/doc/README_SETUP.md @@ -0,0 +1,82 @@ +# Getting started with the CaosDB Crawler # + +## Installation ## + +### Requirements ### + + +### How to install ### + +#### Linux #### + +Make sure that Python (at least version 3.8) and pip are installed, using your system tools and +documentation. + +Then open a terminal and continue in the [Generic installation](#generic-installation) section. 
+ +#### Windows #### + +If a Python distribution is not yet installed, we recommend Anaconda Python, which you can download +for free from [https://www.anaconda.com](https://www.anaconda.com). The "Anaconda Individual Edition" provides most of all +packages you will ever need out of the box. If you prefer, you may also install the leaner +"Miniconda" installer, which allows you to install packages as you need them. + +After installation, open an Anaconda prompt from the Windows menu and continue in the [Generic +installation](#generic-installation) section. + +#### MacOS #### + +If there is no Python 3 installed yet, there are two main ways to +obtain it: Either get the binary package from +[python.org](https://www.python.org/downloads/) or, for advanced +users, install via [Homebrew](https://brew.sh/). After installation +from python.org, it is recommended to also update the TLS certificates +for Python (this requires administrator rights for your user): + +```sh +# Replace this with your Python version number: +cd /Applications/Python\ 3.9/ + +# This needs administrator rights: +sudo ./Install\ Certificates.command +``` + +After these steps, you may continue with the [Generic +installation](#generic-installation). + +#### Generic installation #### + +--- + +Obtain the sources from GitLab and install from there (`git` must be installed for +this option): + +```sh +git clone https://gitlab.com/caosdb/caosdb-crawler +cd caosdb-crawler +pip3 install --user . +``` + +**Note**: In the near future, this package will also be made available on PyPi. + +## Configuration ## + + + +## Try it out ## + + + +## Run Unit Tests + +## Documentation ## + +Build documentation in `src/doc` with `make html`. 
+ +### Requirements ### + +- `sphinx` +- `sphinx-autoapi` +- `recommonmark` + +### Troubleshooting ### diff --git a/src/doc/_apidoc/modules.rst b/src/doc/_apidoc/modules.rst deleted file mode 100644 index 17f187982981ffbf7bcc857056d10644c2bd422b..0000000000000000000000000000000000000000 --- a/src/doc/_apidoc/modules.rst +++ /dev/null @@ -1,7 +0,0 @@ -newcrawler -========== - -.. toctree:: - :maxdepth: 4 - - newcrawler diff --git a/src/doc/_apidoc/newcrawler.converters.rst b/src/doc/_apidoc/newcrawler.converters.rst deleted file mode 100644 index 893391c229b94baeed9a44c57877ed33f37b2f5e..0000000000000000000000000000000000000000 --- a/src/doc/_apidoc/newcrawler.converters.rst +++ /dev/null @@ -1,7 +0,0 @@ -newcrawler.converters module -============================ - -.. automodule:: newcrawler.converters - :members: - :undoc-members: - :show-inheritance: diff --git a/src/doc/_apidoc/newcrawler.crawl.rst b/src/doc/_apidoc/newcrawler.crawl.rst deleted file mode 100644 index b00a6ab6498a0482cea3e9faa54d66d66991dc2d..0000000000000000000000000000000000000000 --- a/src/doc/_apidoc/newcrawler.crawl.rst +++ /dev/null @@ -1,7 +0,0 @@ -newcrawler.crawl module -======================= - -.. automodule:: newcrawler.crawl - :members: - :undoc-members: - :show-inheritance: diff --git a/src/doc/_apidoc/newcrawler.identifiable_adapters.rst b/src/doc/_apidoc/newcrawler.identifiable_adapters.rst deleted file mode 100644 index d8926f41b72d2c54931f045d75f9fe59b21e6076..0000000000000000000000000000000000000000 --- a/src/doc/_apidoc/newcrawler.identifiable_adapters.rst +++ /dev/null @@ -1,7 +0,0 @@ -newcrawler.identifiable\_adapters module -======================================== - -.. 
automodule:: newcrawler.identifiable_adapters - :members: - :undoc-members: - :show-inheritance: diff --git a/src/doc/_apidoc/newcrawler.identified_cache.rst b/src/doc/_apidoc/newcrawler.identified_cache.rst deleted file mode 100644 index 6f697362ad44d1fec01f328550dc8667cc889019..0000000000000000000000000000000000000000 --- a/src/doc/_apidoc/newcrawler.identified_cache.rst +++ /dev/null @@ -1,7 +0,0 @@ -newcrawler.identified\_cache module -=================================== - -.. automodule:: newcrawler.identified_cache - :members: - :undoc-members: - :show-inheritance: diff --git a/src/doc/_apidoc/newcrawler.rst b/src/doc/_apidoc/newcrawler.rst deleted file mode 100644 index 202444a5efbde248e52d712575ade49f6dd50601..0000000000000000000000000000000000000000 --- a/src/doc/_apidoc/newcrawler.rst +++ /dev/null @@ -1,24 +0,0 @@ -newcrawler package -================== - -Submodules ----------- - -.. toctree:: - :maxdepth: 4 - - newcrawler.converters - newcrawler.crawl - newcrawler.identifiable_adapters - newcrawler.identified_cache - newcrawler.stores - newcrawler.structure_elements - newcrawler.utils - -Module contents ---------------- - -.. automodule:: newcrawler - :members: - :undoc-members: - :show-inheritance: diff --git a/src/doc/_apidoc/newcrawler.stores.rst b/src/doc/_apidoc/newcrawler.stores.rst deleted file mode 100644 index 7d446c1cd45a6bf1c4b6cf1b1d33e9a2a5ad9751..0000000000000000000000000000000000000000 --- a/src/doc/_apidoc/newcrawler.stores.rst +++ /dev/null @@ -1,7 +0,0 @@ -newcrawler.stores module -======================== - -.. 
automodule:: newcrawler.stores - :members: - :undoc-members: - :show-inheritance: diff --git a/src/doc/_apidoc/newcrawler.structure_elements.rst b/src/doc/_apidoc/newcrawler.structure_elements.rst deleted file mode 100644 index 4613e1d58b0ef9c7cc38096aa25270f469836ce5..0000000000000000000000000000000000000000 --- a/src/doc/_apidoc/newcrawler.structure_elements.rst +++ /dev/null @@ -1,7 +0,0 @@ -newcrawler.structure\_elements module -===================================== - -.. automodule:: newcrawler.structure_elements - :members: - :undoc-members: - :show-inheritance: diff --git a/src/doc/_apidoc/newcrawler.utils.rst b/src/doc/_apidoc/newcrawler.utils.rst deleted file mode 100644 index 4df55a234fd85072068e41d1ce7bb3b17fd1a698..0000000000000000000000000000000000000000 --- a/src/doc/_apidoc/newcrawler.utils.rst +++ /dev/null @@ -1,7 +0,0 @@ -newcrawler.utils module -======================= - -.. automodule:: newcrawler.utils - :members: - :undoc-members: - :show-inheritance: diff --git a/src/doc/cfood.rst b/src/doc/cfood.rst new file mode 100644 index 0000000000000000000000000000000000000000..677cadc55709c6c25d16ff547b311102ee78699a --- /dev/null +++ b/src/doc/cfood.rst @@ -0,0 +1,149 @@ +CFood-Definition +================ + +The crawler specification is called CFood-definition. It is stored inside a yaml file, or - more precisely - inside of one single or two yaml documents inside a yaml file. + +The specification consists of three separate parts: +#. Metadata and macro definitions +#. Custom converter registrations +#. The converter tree specification + +In the simplest case, there is just one yaml file with just a single document including at least +the converter tree specification (see :ref:`example 1<example_1>`). 
Additionally the custom converter part may be also included in +this single document (for historical reasons, see :ref:`example 2<example_2>`), but it is recommended to include them in the separate +document together with the metadata and :doc:`macro<macros>` definitions (see :ref:`below<example_4>`). + +If metadata and macro definitions are provided, there **must** be a second document preceding the +converter tree specification, including these definitions. + +Examples +++++++++ + +A single document with a converter tree specification: + +.. _example_1: +.. code-block:: yaml + + extroot: + type: Directory + match: ^extroot$ + subtree: + DataAnalysis: + type: Directory + match: DataAnalysis + # (...) + + +A single document with a converter tree specification, but also including a custom converters section: + +.. _example_2: +.. code-block:: yaml + + Converters: + CustomConverter_1: + package: mypackage.converters + converter: CustomConverter1 + CustomConverter_2: + package: mypackage.converters + converter: CustomConverter2 + + extroot: + type: Directory + match: ^extroot$ + subtree: + DataAnalysis: + type: Directory + match: DataAnalysis + # (...) + + + +A yaml multi-document, defining metadata and some macros in the first document and declaring +two custom converters in the second document (**not recommended**, see the recommended version :ref:`below<example_4>`). Please note that two separate yaml documents can be defined using the ``---`` syntax: + + +.. _example_3: +.. code-block:: yaml + + --- + metadata: + name: Datascience CFood + description: CFood for data from the local data science work group + macros: + - !defmacro + name: SimulationDatasetFile + params: + match: null + recordtype: null + nodename: null + definition: + # (...) 
+ --- + Converters: + CustomConverter_1: + package: mypackage.converters + converter: CustomConverter1 + CustomConverter_2: + package: mypackage.converters + converter: CustomConverter2 + + extroot: + type: Directory + match: ^extroot$ + subtree: + DataAnalysis: + type: Directory + match: DataAnalysis + # (...) + + + +The **recommended way** of defining metadata, custom converters, macros and the main cfood specification is shown in the following code example: + + +.. _example_4: +.. code-block:: yaml + + --- + metadata: + name: Datascience CFood + description: CFood for data from the local data science work group + macros: + - !defmacro + name: SimulationDatasetFile + params: + match: null + recordtype: null + nodename: null + definition: + # (...) + Converters: + CustomConverter_1: + package: mypackage.converters + converter: CustomConverter1 + CustomConverter_2: + package: mypackage.converters + converter: CustomConverter2 + --- + extroot: + type: Directory + match: ^extroot$ + subtree: + DataAnalysis: + type: Directory + match: DataAnalysis + # (...) + + +List Mode +--------- + +Specifying values of properties can make use of two special characters, in order to automatically +create lists or multi properties instead of single values: + +.. code-block:: yaml + + Experiment1: + Measurement: +Measurement <- Element in List (list is cleared before run) + *Measurement <- Multi Property (properties are removed before run) + Measurement <- Overwrite diff --git a/src/doc/concepts.rst b/src/doc/concepts.rst new file mode 100644 index 0000000000000000000000000000000000000000..c0f21cbaa322caddabed8e045f7b6fc4253d2959 --- /dev/null +++ b/src/doc/concepts.rst @@ -0,0 +1,119 @@ +Concepts +)))))))) + +Structure Elements +++++++++++++++++++ + +This hierarchical structure is assumed to be constituted of a tree of +StructureElements. The tree is created on the fly by so-called Converters which +are defined in a yaml file. 
The tree of StructureElements is a model +of the existing data (For example could a tree of Python file objects +(StructureElements) represent a file tree that exists on some file server). + +Relevant sources in: +src/structure_elements.py + +Converters +++++++++++ + +Converters treat StructureElements and thereby create the StructureElement that +are the children of the treated StructureElement. Converters therefore create +the above named tree. The definition of a Converter also contains what +Converters shall be used to treat the generated child-StructureElements. The +definition is therefore a tree itself. + +See :doc:`converters<converters>` for details. + + + +Relevant sources in: +src/converters.py + + + +Identifiables ++++++++++++++ + +Relevant sources in: +src/identifiable_adapters.py + +The Crawler ++++++++++++ + +The crawler can be considered the main program doing the synchronization in basically two steps: +#. Based on a yaml-specification scan the file system (or other sources) and create a set of CaosDB Entities that are supposed to be inserted or updated in a CaosDB instance. +#. Compare the current state of the CaosDB instance with the set of CaosDB Entities created in step 1, taking into account the :ref:`registered identifiables<Identifiables>`. Insert or update entities accordingly. + +Relevant sources in: +src/crawl.py + + + +Special Cases +============= + +Variable Precedence ++++++++++++++++++++ + +Let's assume the following situation + +.. code-block:: yaml + + description: + type: DictTextElement + match_value: (?P<description>.*) + match_name: description + + +Making use of the $description variable could refer to two different variables created here: +1. The structure element path. +2. The value of the matched expression. + +The matched expression does take precedence over the structure element path and shadows it. 
+ +Make sure, that if you want to be able to use the structure element path, to give unique names +to the variables like: + +.. code-block:: yaml + + description_text_block: + type: DictTextElement + match_value: (?P<description>.*) + match_name: description + + +Scopes +======== + +Example: + +.. code-block:: yaml + + DicomFile: + type: SimpleDicomFile + match: (?P<filename>.*)\.dicom + records: + DicomRecord: + name: $filename + subtree: # header of dicom file + PatientID: + type: DicomHeaderElement + match_name: PatientName + match_value: (?P<patient>.*) + records: + Patient: + name: $patient + dicom_name: $filename # $filename is in same scope! + ExperimentFile: + type: MarkdownFile + match: ^readme.md$ + records: + Experiment: + dicom_name: $filename # does NOT work, because $filename is out of scope! + + +# can variables be used within regexp? + + +File Objects +============ diff --git a/src/doc/conf.py b/src/doc/conf.py index fb37cdd96c440300741aeb49e90caffe4370f5d7..30ce670eb8685e9701eeeb59bf22451a21fb16b9 100644 --- a/src/doc/conf.py +++ b/src/doc/conf.py @@ -53,6 +53,7 @@ extensions = [ 'sphinx.ext.autosectionlabel', 'sphinx.ext.intersphinx', 'sphinx.ext.napoleon', # For Google style docstrings + "recommonmark", # For markdown files. "sphinx_rtd_theme", ] @@ -61,7 +62,7 @@ templates_path = ['_templates'] # The suffix(es) of source filenames. # You can specify multiple suffix as a list of string: -source_suffix = ['.rst'] +source_suffix = ['.rst', '.md'] # The master toctree document. master_doc = 'index' @@ -71,7 +72,7 @@ master_doc = 'index' # # This is also used if you do content translation via gettext catalogs. # Usually you set "language" from the command line for these cases. -language = None +language = "en" # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. 
@@ -99,7 +100,7 @@ html_theme = "sphinx_rtd_theme" # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. They are copied after the builtin static files, # so a file named "default.css" will overwrite the builtin "default.css". -html_static_path = ['_static'] +html_static_path = [] # ['_static'] # Custom sidebar templates, must be a dictionary that maps document names # to template names. diff --git a/src/doc/converters.rst b/src/doc/converters.rst new file mode 100644 index 0000000000000000000000000000000000000000..a30a7d92850f90be14f82a4e563fb56df5fcde88 --- /dev/null +++ b/src/doc/converters.rst @@ -0,0 +1,308 @@ +Converters +)))))))))) + +Converters treat StructureElements and thereby create the StructureElement that +are the children of the treated StructureElement. Converters therefore create +the tree of structure elements. The definition of a Converter also contains what +Converters shall be used to treat the generated child-StructureElements. The +definition is therefore a tree itself. + +Each StructureElement in the tree has a set of data values, i.e. a dictionary of +key value pairs. +Some of those values are set due to the kind of StructureElement. For example, +a file could have the file name as such a key value pair: 'filename': <sth>. +Converters may define additional functions that create further values. For +example, a regular expression could be used to get a date from a file name. + + + + +A converter is defined via a yml file or part of it. The definition states +what kind of StructureElement it treats (typically one). +Also, it defines how children of the current StructureElement are +created and what Converters shall be used to treat those. + +The yaml definition looks like the following: + +TODO: outdated, see cfood-schema.yml + +.. code-block:: yaml + + <NodeName>: + type: <ConverterName> + match: ".*" + records: + Experiment1: + parents: + - Experiment + - Blablabla + date: $DATUM + (...) 
+ Experiment2: + parents: + - Experiment + subtree: + (...) + +The **<NodeName>** is a description of what it represents (e.g. +'experiment-folder') and is used as identifier. + +**<type>** selects the converter that is going to be matched against the current structure +element. If the structure element matches (this is a combination of a typecheck and a detailed +match, see :py:class:`~caoscrawler.converters.Converter` for details) the converter is used +to generate records (see :py:meth:`~caoscrawler.converters.Converter.create_records`) and to possibly process a subtree, as defined by the function :func:`caoscrawler.converters.create_children`. + +**records** is a dict of definitions that define the semantic structure +(see details below). + +Subtree contains a list of Converter definitions that look like the one +described here. + + +Standard Converters ++++++++++++++++++++ + +Directory Converter +=================== + +Simple File Converter +===================== + +Markdown File Converter +======================= + +Dict Converter +============== + +Typical Subtree converters +-------------------------- + +DictBooleanElementConverter +DictFloatElementConverter +DictTextElementConverter +DictIntegerElementConverter +DictListElementConverter +DictDictElementConverter + +YAMLFileConverter +================= + +A specialized Dict Converter for yaml files: Yaml files are opened and the contents are +converted into dictionaries that can be further converted using the typical subtree converters +of dict converter. + +**WARNING**: Currently unfinished implementation. + +JSONFileConverter +================= + + + +TextElementConverter +==================== + +TableConverter +============== + +A generic converter (abstract) for files containing tables. +Currently, there are two specialized implementations for xlsx-files and csv-files. 
+ +All table converters generate a subtree that can be converted with DictDictElementConverters: +For each row in the table a DictDictElement (structure element) is generated. The key of the +element is the row number. The value of the element is a dict containing the mapping of +column names to values of the respective cell. + +Example: + +.. code-block:: yaml + + subtree: + TABLE: + type: CSVTableConverter + match: ^test_table.csv$ + records: + (...) # Records edited for the whole table file + subtree: + ROW: + type: DictDictElement + match_name: .* + match_value: .* + records: + (...) # Records edited for each row + subtree: + COLUMN: + type: DictFloatElement + match_name: measurement # Name of the column in the table file + match_value: (?P<column_value>.*) + records: + (...) # Records edited for each cell + + +XLSXTableConverter +================== + +CSVTableConverter +================= + +Custom Converters ++++++++++++++++++ + +It was previously mentioned that it is possible to create custom converters. +These custom converters can be used to integrate arbitrary data extraction and ETL capabilities +into the caosdb-crawler and make these extensions available to any yaml specification. + +The basic syntax for adding a custom converter to a yaml cfood definition file is: + +.. code-block:: yaml + + Converters: + <NameOfTheConverterInYamlFile>: + package: <python>.<module>.<name> + converter: <PythonClassName> + +The Converters-section can be either put into the first or second document of the cfood yaml file. +It can be also part of a single-document yaml cfood file. Please refer to :doc:`the cfood documentation<cfood>` for more details. + +Details: + +- **<NameOfTheConverterInYamlFile>**: This is the name of the converter as it is going to be used in the present yaml file. +- **<python>.<module>.<name>**: The name of the module where the converter class resides. 
+- **<PythonClassName>**: Within this specified module there must be a class inheriting from base class :py:class:`caoscrawler.converters.Converter`. + +The following methods are abstract and need to be overwritten by your custom converter to make it work: + +- :py:meth:`~caoscrawler.converters.Converter.create_children` +- :py:meth:`~caoscrawler.converters.Converter.match` +- :py:meth:`~caoscrawler.converters.Converter.typecheck` + + +Example +======= + +In the following, we will explain the process of adding a custom converter to a yaml file using +a SourceResolver that is able to attach a source element to another entity. + +**Note**: This example might become a standard crawler soon, as part of the scifolder specification. See https://doi.org/10.3390/data5020043 for details. In this documentation example we will, therefore, add it to a package called "scifolder". + +First we will create our package and module structure, which might be: +.. code-block:: + + scifolder_package/ + README.md + setup.cfg + setup.py + Makefile + tox.ini + src/ + scifolder/ + __init__.py + converters/ + __init__.py + sources.py # <- the actual file containing + # the converter class + doc/ + unittests/ + +Now we need to create a class called "SourceResolver" in the file "sources.py". In this - more advanced - example, we will not inherit our converter directly from :py:class:`~caoscrawler.converters.Converter`, but use :py:class:`~caoscrawler.converters.TextElementConverter`. The latter already implements :py:meth:`~caoscrawler.converters.Converter.match` and :py:meth:`~caoscrawler.converters.Converter.typecheck`, so only an implementation for :py:meth:`~caoscrawler.converters.Converter.create_children` has to be provided by us. +Furthermore we will customize the method :py:meth:`~caoscrawler.converters.Converter.create_records` that allows us to specify a more complex record generation procedure than provided in the standard implementation. 
One specific limitation of the standard implementation is that only a fixed +number of records can be generated by the yaml definition. So for any applications - like here - that require an arbitrary number of records to be created, a customized implementation of :py:meth:`~caoscrawler.converters.Converter.create_records` is recommended. +In this context it is recommended to make use of the function :func:`caoscrawler.converters.create_records` that implements creation of record objects from python dictionaries of the same structure +that would be given using a yaml definition. + +.. code-block:: python + + import re + from caoscrawler.stores import GeneralStore, RecordStore + from caoscrawler.converters import TextElementConverter, create_records + from caoscrawler.structure_elements import StructureElement, TextElement + + + class SourceResolver(TextElementConverter): + """ + This resolver uses a source list element (e.g. from the markdown readme file) + to link sources correctly. + """ + + def __init__(self, definition: dict, name: str, + converter_registry: dict): + """ + Initialize a new source resolver. 
+ """ + super().__init__(definition, name, converter_registry) + + def create_children(self, generalStore: GeneralStore, + element: StructureElement): + + # The source resolver does not create children: + + return [] + + def create_records(self, values: GeneralStore, + records: RecordStore, + element: StructureElement, + file_path_prefix): + if not isinstance(element, TextElement): + raise RuntimeError() + + # This function must return a list containing tuples, each one for a modified + # property: (name_of_entity, name_of_property) + keys_modified = [] + + # This is the name of the entity where the source is going to be attached: + attach_to_scientific_activity = self.definition["scientific_activity"] + rec = records[attach_to_scientific_activity] + + # The "source" is a path to a source project, so it should have the form: + # /<Category>/<project>/<scientific_activity>/ + # obtain these information from the structure element: + val = element.value + regexp = (r'/(?P<category>(SimulationData)|(ExperimentalData)|(DataAnalysis))' + '/(?P<project_date>.*?)_(?P<project_identifier>.*)' + '/(?P<date>[0-9]{4,4}-[0-9]{2,2}-[0-9]{2,2})(_(?P<identifier>.*))?/') + + res = re.match(regexp, val) + if res is None: + raise RuntimeError("Source cannot be parsed correctly.") + + # Mapping of categories on the file system to corresponding record types in CaosDB: + cat_map = { + "SimulationData": "Simulation", + "ExperimentalData": "Experiment", + "DataAnalysis": "DataAnalysis"} + linkrt = cat_map[res.group("category")] + + keys_modified.extend(create_records(values, records, { + "Project": { + "date": res.group("project_date"), + "identifier": res.group("project_identifier"), + }, + linkrt: { + "date": res.group("date"), + "identifier": res.group("identifier"), + "project": "$Project" + }, + attach_to_scientific_activity: { + "sources": "+$" + linkrt + }}, file_path_prefix)) + + # Process the records section of the yaml definition: + keys_modified.extend( + 
super().create_records(values, records, element, file_path_prefix)) + + # The create_records function must return the modified keys to make it compatible + # to the crawler functions: + return keys_modified + + +If the recommended (python) package structure is used, the package containing the converter +definition can just be installed using `pip install .` or `pip install -e .` from the +`scifolder_package` directory. + +The following yaml block will register the converter in a yaml file: + +.. code-block:: yaml + + Converters: + SourceResolver: + package: scifolder.converters.sources + converter: SourceResolver diff --git a/src/doc/index.rst b/src/doc/index.rst index f11d73b58a3216b1d735d6565650148c150ebb68..724bcc543dd1cf0b9af451c487b1b3aab7fa95ca 100644 --- a/src/doc/index.rst +++ b/src/doc/index.rst @@ -1,8 +1,23 @@ Crawler 2.0 Documentation ========================= -Introduction ------------- + +.. toctree:: + :maxdepth: 2 + :caption: Contents: + :hidden: + + Getting started<README_SETUP> + Concepts<concepts> + Converters<converters> + CFoods (Crawler Definitions)<cfood> + Macros<macros> + Tutorials<tutorials/index> + API documentation<_apidoc/modules> + + + +This is the documentation for the crawler (previously known as crawler 2.0) for CaosDB, ``caosdb-crawler``. The crawler is the main date integration tool for CaosDB. Its task is to automatically synchronize data found on file systems or in other @@ -15,172 +30,15 @@ The hierarchical sturcture can be for example a file tree. However it can be also something different like the contents of a json file or a file tree with json files. -Concepts --------- - -Structure Elements -++++++++++++++++++ - -This hierarchical structure is assumed to be consituted of a tree of -StructureElements. The tree is created on the fly by so called Converters which -are defined in a yaml file. 
The tree of StructureElements is a model -of the existing data (For example could a tree of Python file objects -(StructureElements) represent a file tree that exists on some file server). - -Relevant sources in: -src/structure_elements.py - -Converters -++++++++++ - -Converters treat StructureElements and thereby create the StructureElement that -are the children of the treated StructureElement. Converters therefore create -the above named tree. The definition of a Converter also contains what -Converters shall be used to treat the generated child-StructureElements. The -definition is there a tree itself. (Question: Should there be global Converters -that are always checked when treating a StructureElement? Should Converters be -associated with generated child-StructureElements? Currently, all children are -created and checked against all Converters. It could be that one would like to -check file-StructureElements against one set of Converters and -directory-StructureElements against another) - -Each StructureElement in the tree has a set of data values, i.e a dictionary of -key value pairs. -Some of those values are set due to the kind of StructureElement. For example, -a file could have the file name as such a key value pair: 'filename': <sth>. -Converters may define additional functions that create further values. For -example, a regular expresion could be used to get a date from a file name. - - - - -A converter is defined via a yml file or part of it. The definition states -what kind of StructureElement it treats (typically one). -Also, it defines how children of the current StructureElement are -created and what Converters shall be used to treat those. 
- -The yaml definition looks like the following: - -TODO: outdated, see cfood-schema.yml - -converter-name: - type: <StructureElement Type> - match: ".*" - records: - Experiment1: - parents: - - Experiment - - Blablabla - date: $DATUM - <...> - Experiment2: - parents: - - Experiment - valuegenerators: - datepattern: - <...> - childrengenerators: - create_children_from_directory: - sort-by-date: true - subtree: - - -records: - Measurement: <- wird automatisch ein value im valueStore - run_number: 25 - Experiment1: - Measurement: +Measurement <- Element in List (list is cleared before run) - *Measurement <- Multi Property (properties are removed before run) - Measurement <- Overwrite - -UPDATE-Stage prüft ob es z.B. Gleichheit zwischen Listen gibt (die dadurch definiert sein -kann, dass alle Elemente vorhanden, aber nicht zwingend in der richtigen Reihenfolge sind) -evtl. brauchen wir das nicht, weil crawler eh schon deterministisch ist. - -The converter-name is a description of what it represents (e.g. -'experiment-folder') and is used as identifier. - -The type restricts what kind of StructureElements are treated. -The match is by default a regular expression, that is matche against the -name of StructureElements. Discussion: StructureElements might not have a -name (e.g. a dict) or should a name be created artificially if necessary -(e.g. "root-dict")? It might make sense to allow keywords like "always" and -other kinds of checks. For example a dictionary could be checked against a -json-schema definition. - -recordtypes is a list of definitions that define the semantic structure -(see details below). - -valuegenerators allow to provide additional functionality that creates -data values in addition to the ones given by default via the -StructureElement. This can be for example a match group of a regular -expression applied to the filename. -It should be possible to access the values of parent nodes. 
For example, -the name of a parent node could be accessed with $converter-name.name. -Discussion: This can introduce conflicts, if the key <converver-name> -already exists. An alternative would be to identify those lookups. E.g. -$$converter-name.name (2x$). - -childrengenerators denotes how StructureElements shall be created that are -children of the current one. - -subtree contains a list of Converter defnitions that look like the one -described here. - -those keywords should be allowed but not required. I.e. if no -valuegenerators shall be defined, the keyword may be omitted. - - -Relevant sources in: -src/converters.py - -Identifiables -+++++++++++++ - -Relevant sources in: -src/identifiable_adapters.py - -The Crawler -+++++++++++ - -The crawler can be considered the main program doing the synchronization in basically two steps: -1. Based on a yaml-specification scan the file system (or other sources) and create a set - of CaosDB Entities that are supposed to be inserted or updated in a CaosDB instance. -2. Compare the current state of the CaosDB instance with the set of CaosDB Entities created in - step 1, taking into account the :ref:`registered identifiables<Identifiables>`. Insert or - update entites accordingly. - -Relevant sources in: -src/crawl.py - - - -Special Cases -============= - -Variable Precedence -++++++++++++ - -Let's assume the following situation - -.. code-block:: yaml - description: - type: DictTextElement - match_value: (?P<description>.*) - match_name: description +This documentation helps you to :doc:`get started<README_SETUP>`, explains the most important +:doc:`concepts<concepts>` and offers a range of :doc:`tutorials<tutorials/index>`. -Making use of the $description variable could refer to two different variables created here: -1. The structure element path. -2. The value of the matched expression. +Indices and tables +================== -The matched expression does take precedence over the structure element path and shadows it. 
+* :ref:`genindex` +* :ref:`modindex` +* :ref:`search` -Make sure, that if you want to be able to use the structure element path, to give unique names -to the variables like: -.. code-block:: yaml - description_text_block: - type: DictTextElement - match_value: (?P<description>.*) - match_name: description diff --git a/src/doc/macros.rst b/src/doc/macros.rst new file mode 100644 index 0000000000000000000000000000000000000000..3d995c1fbc67b155a6df606ac2f84a0cec26d1a5 --- /dev/null +++ b/src/doc/macros.rst @@ -0,0 +1,88 @@ +Macros +------ + +Macros greatly facilitate the writing of complex :doc:`CFoods<cfood>`. Consider the following prevalent example: + +.. _example_files: +.. code-block:: yaml + + ExperimentalData: + type: Directory + match: ExperimentalData + subtree: + README: + type: SimpleFile + match: ^README.md$ + records: + ReadmeFile: + parents: + - MarkdownFile + role: File + path: $README + file: $README + +This example just inserts a file called ``README.md`` contained in the folder ``ExperimentalData/`` into CaosDB, assigns the parent (RecordType) ``MarkdownFile`` and allows for later referencing this entity within the cfood. As file objects are created in the cfood specification using the ``records`` section with the special role ``File``, defining and using many files can become very cumbersome and make the cfood file difficult to read. + +The same version using cfood macros could be defined as follows: + +.. _example_files_2: +.. code-block:: yaml + + --- + metadata: + macros: + - !defmacro + name: MarkdownFile + params: + name: null + filename: null + definition: + ${name}_filename: + type: SimpleFile + match: $filename + records: + $name: + parents: + - MarkdownFile + role: File + path: ${name}_filename + file: ${name}_filename + --- + ExperimentalData: + type: Directory + match: ExperimentalData + subtree: !macro + MarkdownFile: + - name: README + filename: ^README.md$ + + + + + +Complex Example +=============== + +.. _example_1: +..
code-block:: yaml + + macros: + - !defmacro + name: SimulationDatasetFile + params: + match: null + recordtype: null + nodename: null + definition: + $nodename: + match: $match + type: SimpleFile + records: + File: + parents: + - $recordtype + role: File + path: $$$nodename + file: $$$nodename + Simulation: + $recordtype: +$File diff --git a/src/doc/tutorials/index.rst b/src/doc/tutorials/index.rst new file mode 100644 index 0000000000000000000000000000000000000000..1652515968c3b0025a2916604632d57c042f119b --- /dev/null +++ b/src/doc/tutorials/index.rst @@ -0,0 +1,2 @@ +Tutorials ++++++++++