diff --git a/CHANGELOG.md b/CHANGELOG.md index 6b75284d0f3651610566dfa3dfd54a994524137c..2ba6c84749478314882a4131754bf9cc7fc5b184 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Added ### +* New converters for XML documents/trees/tags: XMLFile, XMLTag, XMLTextNode + ### Changed ### ### Deprecated ### diff --git a/README_SETUP.md b/README_SETUP.md deleted file mode 120000 index d478016ecde09dab8820d398b15df325f4159380..0000000000000000000000000000000000000000 --- a/README_SETUP.md +++ /dev/null @@ -1 +0,0 @@ -src/doc/README_SETUP.md \ No newline at end of file diff --git a/README_SETUP.md b/README_SETUP.md new file mode 100644 index 0000000000000000000000000000000000000000..32f0bb89a6051bc2ec4be0bae6cf06cd1a540f8b --- /dev/null +++ b/README_SETUP.md @@ -0,0 +1,34 @@ +# Getting started with the CaosDB Crawler # + +## Installation +see INSTALL.md + +## Run Unit Tests + +1. Install additional dependencies: + - h5py +2. Run `pytest unittests`. + +## Documentation ## +We use sphinx to create the documentation. Docstrings in the code should comply +with the Googly style (see link below). + +Build documentation in `src/doc` with `make doc`. Note that for the +automatic generation of the complete API documentation, it is +necessary to first install this library with all its optional +dependencies, i.e., `pip install .[h5-crawler,spss]`. 
+ +### Requirements ### + +- `sphinx` +- `sphinx-autoapi` +- `recommonmark` +- `sphinx-rtd-theme` + +### How to contribute ### + +- [Google Style Python Docstrings](https://sphinxcontrib-napoleon.readthedocs.io/en/latest/example_google.html) +- [Google Style Python Docstrings 2nd reference](https://github.com/google/styleguide/blob/gh-pages/pyguide.md#38-comments-and-docstrings) +- [References to other documentation](https://www.sphinx-doc.org/en/master/usage/extensions/intersphinx.html#role-external) + + diff --git a/src/caoscrawler/__init__.py b/src/caoscrawler/__init__.py index 41b96323b1106d8ce28caadc4a2da012f3dc22ea..27bdbfd371e10826d007480b4189bd2cd148344c 100644 --- a/src/caoscrawler/__init__.py +++ b/src/caoscrawler/__init__.py @@ -1,4 +1,4 @@ -from . import converters, utils +from . import converters, utils, xml_converter try: from .conv_impl.spss import SPSSConverter except ImportError as err: diff --git a/src/caoscrawler/cfood-schema.yml b/src/caoscrawler/cfood-schema.yml index 6609d8eb05135b17f5f6d9526df255b810de112a..acc3911f21d320146d0c35abc9d781541ee151ac 100644 --- a/src/caoscrawler/cfood-schema.yml +++ b/src/caoscrawler/cfood-schema.yml @@ -73,6 +73,9 @@ cfood: - H5Dataset - H5Group - H5Ndarray + - XMLFile + - XMLTag + - XMLTextNode - PropertiesFromDictElement description: Type of this converter node. match: diff --git a/src/caoscrawler/converters.py b/src/caoscrawler/converters.py index 0eee8965512ed39add9a9688c531c540f80d7df2..9805d1103e380f688b40a9bfd4c3d03129dbd591 100644 --- a/src/caoscrawler/converters.py +++ b/src/caoscrawler/converters.py @@ -316,7 +316,10 @@ def create_records(values: GeneralStore, records: RecordStore, def_records: dict class Converter(object, metaclass=ABCMeta): - """Converters treat StructureElements contained in the hierarchical sturcture.""" + """Converters treat StructureElements contained in the hierarchical sturcture. + + This is the abstract super class for all Converters. 
+ """ def __init__(self, definition: dict, name: str, converter_registry: dict): """ @@ -582,6 +585,12 @@ class Converter(object, metaclass=ABCMeta): class DirectoryConverter(Converter): + """ + Converter that matches and handles structure elements of type directory. + + This is one typical starting point of a crawling procedure. + """ + def create_children(self, generalStore: GeneralStore, element: StructureElement): # TODO: See comment on types and inheritance if not isinstance(element, Directory): diff --git a/src/caoscrawler/default_converters.yml b/src/caoscrawler/default_converters.yml index 82e2f635f621b2e21e43b728fd9ed6865454f828..cb4a7d8c63489158c15dcf86b83fd940cd608460 100644 --- a/src/caoscrawler/default_converters.yml +++ b/src/caoscrawler/default_converters.yml @@ -94,3 +94,20 @@ SPSSFile: XLSXTableConverter: converter: XLSXTableConverter package: caoscrawler.converters + + +# ------------------------- +# XML +# ------------------------- + +XMLFile: + converter: XMLFileConverter + package: caoscrawler.xml_converter + +XMLTag: + converter: XMLTagConverter + package: caoscrawler.xml_converter + +XMLTextNode: + converter: XMLTextNodeConverter + package: caoscrawler.xml_converter diff --git a/src/caoscrawler/structure_elements.py b/src/caoscrawler/structure_elements.py index 0efba91c185446e0bfbecbbb53f68aaa8a8e15d1..67cd1056b382c92485deada2058526a03b6d8535 100644 --- a/src/caoscrawler/structure_elements.py +++ b/src/caoscrawler/structure_elements.py @@ -24,6 +24,7 @@ # import warnings +import lxml.etree class StructureElement(object): @@ -167,3 +168,53 @@ class DictDictElement(DictElement): def __init__(self, *args, **kwargs): warnings.warn(DeprecationWarning("This class is depricated. Please use DictElement.")) super().__init__(*args, **kwargs) + + +class XMLTagElement(StructureElement): + """ + Stores elements of an XML tree. 
+ """ + + def __init__(self, element: lxml.etree.Element): + super().__init__(element.getroottree().getelementpath(element)) + self.tag = element + + +class XMLTextNode(StructureElement): + """ + Stores text nodes of XML trees. + """ + + def __init__(self, element: lxml.etree.Element): + """ + Initializes this XML text node. + + Please note that, although syntactically similar, it is semantically + different from TextElement: + - TextElements have a meaningful name, e.g. a key in a key-value pair. This name can + be matched using the match_name entry. + - XMLTextNodes just have a text and the name is just for identifying the structure element. + They can only be matched using the match entry in the XMLTextNodeConverter. + """ + super().__init__(element.getroottree().getelementpath(element) + "/text()") + self.tag = element + self.value = element.text + + +class XMLAttributeNode(StructureElement): + """ + Stores text nodes of XML trees. + """ + + def __init__(self, element: lxml.etree.Element, + key: str): + """ + Initializes this XML attribute node. + + element: The xml tree element containing the attribute. + key: The key which identifies the attribute in the list of attributes. + """ + super().__init__(element.getroottree().getelementpath(element) + "@" + key) + self.value = element.attrib[key] + self.key = key + self.tag = element diff --git a/src/caoscrawler/xml_converter.py b/src/caoscrawler/xml_converter.py new file mode 100644 index 0000000000000000000000000000000000000000..6d350c26d467372e65c4acc0fd397d6679279b24 --- /dev/null +++ b/src/caoscrawler/xml_converter.py @@ -0,0 +1,226 @@ +# encoding: utf-8 +# +# This file is a part of the LinkAhead Project. 
+# +# Copyright (C) 2024 Indiscale GmbH <info@indiscale.com> +# Copyright (C) 2024 Alexander Schlemmer +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <https://www.gnu.org/licenses/>. + +"""Converters take structure elements and create Records and new structure elements from them.""" + +from __future__ import annotations + +import datetime +import json +import logging +import os +import re +import warnings +from inspect import signature +from string import Template +from typing import Any, Callable, Optional, Union + +import linkahead as db +from jsonschema import ValidationError, validate + +from .stores import GeneralStore, RecordStore +from .structure_elements import (BooleanElement, DictElement, Directory, File, + FloatElement, IntegerElement, JSONFile, + ListElement, NoneElement, StructureElement, + TextElement, XMLTagElement, XMLTextNode, XMLAttributeNode) +from .utils import has_parent + +import lxml.etree +from .converters import SimpleFileConverter, ConverterValidationError, Converter + + +class XMLFileConverter(SimpleFileConverter): + + """Convert XML files. See + https://gitlab.indiscale.com/caosdb/src/caosdb-crawler/-/issues/145 + for the current suggestion for the specification. 
+ + """ + + def create_children(self, generalStore: GeneralStore, element: StructureElement): + # TODO: See comment on types and inheritance + if not isinstance(element, File): + raise ValueError("create_children was called with wrong type of StructureElement") + with open(element.path, 'r') as xml_file: + xml = lxml.etree.parse(xml_file) + if "validate" in self.definition and self.definition["validate"]: + try: + raise NotImplementedError("XML validation not implemented yet.") + except ConverterValidationError as err: + raise ConverterValidationError( + "Error during the validation of the XML file:\n" + f"{element.path}\n" + err.message) + + return [XMLTagElement(xml.getroot())] + + +class XMLTagConverter(Converter): + def create_children(self, generalStore: GeneralStore, element: StructureElement): + """Children that are generated by this function are the + result of the xpath query given in the yaml property + ``xpath``. Its default (when not given) is ``child::*``, so the + direct children of the current xml node. The xpath expression + must be designed in a way that it returns xml tags (and no + attributes or texts). That means, that the axis ``attribute::`` + and the function ``text()`` must not be used. + + The following yaml properties can be used to generate other + types of nodes (text nodes and attribute nodes) as subtree + structure elements: + + :: + + # _*_ marks the default: + attribs_as_children: true # true / _false_ + text_as_children: true # true / _false_ + tags_as_children: true # _true_ / false + + The default is to generate the tags matched by the xpath expression only. + + - When text_as_children is set to true, text nodes will be generated that contain the text + contained in the matched tags. + - When attribs_as_children is set to true, attribute nodes will be generated from the attributes + of the matched tags. + + Notes + ----- + The default is to take the namespace map from the current node and use it in xpath queries. 
+ Because default namespaces cannot be handled by xpath, it is possible to remap the default namespace + using the key ``default_namespace``. + The key ``nsmap`` can be used to define additional nsmap entries. + + """ + if not isinstance(element, XMLTagElement): + raise TypeError("Element must be an instance of XMLTagElement.") + + # Get the namespace map from the element: + nsmap = element.tag.nsmap + # The default name of the default namespace is "default". + # You can overwrite it using the attribute "default_namespace" in the converter definition: + default_namespace = self.definition.get("default_namespace", "default") + if None in nsmap: + nsmap[default_namespace] = nsmap[None] + del nsmap[None] + + # Set additional nsmap entries from the converter definition: + if "nsmap" in self.definition: + for key, value in self.definition["nsmap"].items(): + nsmap[key] = value + + xpath = self.definition.get("xpath", "child::*") + children = element.tag.xpath(xpath, namespaces=nsmap) + el_lst = [] + for el in children: + if isinstance(el, str): + raise RuntimeError( + "Only standard xml nodes are supported as results of xpath queries.") + elif isinstance(el, lxml.etree._Element): + if self.definition.get("tags_as_children", True): + el_lst.append(XMLTagElement(el)) + if self.definition.get("attribs_as_children", False): + for attrib in el.attrib: + el_lst.append(XMLAttributeNode(el, attrib)) + if self.definition.get("text_as_children", False): + el_lst.append(XMLTextNode(el)) + else: + raise RuntimeError("Unsupported child type.") + return el_lst + + def typecheck(self, element: StructureElement): + """ + Check whether the current structure element can be converted using + this converter. + """ + return isinstance(element, XMLTagElement) + + def match(self, element: StructureElement) -> Optional[dict]: + # See https://gitlab.indiscale.com/caosdb/src/caosdb-crawler/-/issues/145 + # for a suggestion for the design of the matching algorithm. 
+ if not isinstance(element, XMLTagElement): + raise TypeError("Element must be an instance of XMLTagElement.") + + # Store the result of all individual regexp variable results: + vardict = {} + + if "match_tag" in self.definition: + m_tag = re.match(self.definition["match_tag"], element.tag.tag) + if m_tag is None: + return None + vardict.update(m_tag.groupdict()) + + if "match_text" in self.definition: + tagtext = element.tag.text + if element.tag.text is None: + tagtext = "" + m_text = re.match(self.definition["match_text"], tagtext, re.DOTALL) + if m_text is None: + return None + vardict.update(m_text.groupdict()) + + if "match_attrib" in self.definition: + for attrib_def_key, attrib_def_value in self.definition["match_attrib"].items(): + match_counter = 0 + matched_m_attrib = None + matched_m_attrib_value = None + for attr_key, attr_value in element.tag.attrib.items(): + m_attrib = re.match(attrib_def_key, attr_key) + if m_attrib is not None: + match_counter += 1 + matched_m_attrib = m_attrib + m_attrib_value = re.match(attrib_def_value, attr_value) + if m_attrib_value is None: + return None + matched_m_attrib_value = m_attrib_value + # TODO: How to deal with multiple matches? + # There are multiple options: + # - Allow multiple attribute-key matches: Leads to possible overwrites of variables + # - Require unique attribute-key and attribute-value matches: Very complex + # - Only allow one single attribute-key to match and run attribute-value match separately. + # Currently the latter option is implemented. 
+ if match_counter == 0: + return None + elif match_counter > 1: + raise RuntimeError("Multiple attributes match the same match_attrib entry.") + vardict.update(matched_m_attrib.groupdict()) + vardict.update(matched_m_attrib_value.groupdict()) + + return vardict + + +class XMLTextNodeConverter(Converter): + def create_children(self, generalStore: GeneralStore, element: StructureElement): + raise NotImplementedError() + + def typecheck(self, element: StructureElement): + """ + Check whether the current structure element can be converted using + this converter. + """ + return isinstance(element, XMLTextNode) + + def match(self, element: StructureElement) -> Optional[dict]: + # See https://gitlab.indiscale.com/caosdb/src/caosdb-crawler/-/issues/145 + # for a suggestion for the design of the matching algorithm. + if not isinstance(element, XMLTextNode): + raise TypeError("Element must be an instance of XMLTextNode.") + + raise NotImplementedError() + + return None diff --git a/src/doc/README_SETUP.md b/src/doc/README_SETUP.md deleted file mode 100644 index 32f0bb89a6051bc2ec4be0bae6cf06cd1a540f8b..0000000000000000000000000000000000000000 --- a/src/doc/README_SETUP.md +++ /dev/null @@ -1,34 +0,0 @@ -# Getting started with the CaosDB Crawler # - -## Installation -see INSTALL.md - -## Run Unit Tests - -1. Install additional dependencies: - - h5py -2. Run `pytest unittests`. - -## Documentation ## -We use sphinx to create the documentation. Docstrings in the code should comply -with the Googly style (see link below). - -Build documentation in `src/doc` with `make doc`. Note that for the -automatic generation of the complete API documentation, it is -necessary to first install this library with all its optional -dependencies, i.e., `pip install .[h5-crawler,spss]`. 
- -### Requirements ### - -- `sphinx` -- `sphinx-autoapi` -- `recommonmark` -- `sphinx-rtd-theme` - -### How to contribute ### - -- [Google Style Python Docstrings](https://sphinxcontrib-napoleon.readthedocs.io/en/latest/example_google.html) -- [Google Style Python Docstrings 2nd reference](https://github.com/google/styleguide/blob/gh-pages/pyguide.md#38-comments-and-docstrings) -- [References to other documentation](https://www.sphinx-doc.org/en/master/usage/extensions/intersphinx.html#role-external) - - diff --git a/src/doc/cfood.rst b/src/doc/cfood.rst index 07431af0a9fb26e569be5d47f79d6a4f120df269..51c392780b44b73964921506ad3764b95e14d5ed 100644 --- a/src/doc/cfood.rst +++ b/src/doc/cfood.rst @@ -183,7 +183,7 @@ in a vairable with the same name (as it is the case for other Records). Transform Functions ------------------- You can use transform functions to alter variable values that the crawler consumes (e.g. a string -that was matched with a reg exp). See :doc:`Converter Documentation<converters>`. +that was matched with a reg exp). See :doc:`Converter Documentation<converters/index>`. You can define your own transform functions by adding the the same way you add custom converters: diff --git a/src/doc/concepts.rst b/src/doc/concepts.rst index 770731857112b93205f0e80d623fa9183c4aa885..b3aa02a151a4d03c1531094ea01a5246cb02ba73 100644 --- a/src/doc/concepts.rst +++ b/src/doc/concepts.rst @@ -20,7 +20,7 @@ example a tree of Python *file objects* (StructureElements) could correspond to Relevant sources in: -- ``src/structure_elements.py`` +- :py:mod:`caoscrawler.structure_elements` .. _ConceptConverters: @@ -34,11 +34,11 @@ existing StructureElements, Converters create a tree of StructureElements. .. image:: img/converter.png :height: 170 -See the chapter :std:doc:`Converters<converters>` for details. +See the chapter :std:doc:`Converters<converters/index>` for details. 
Relevant sources in: -- ``src/converters.py`` +- :py:mod:`caoscrawler.converters` Identifiables @@ -70,8 +70,8 @@ In the current implementation an identifiable can only use one RecordType even t Relevant sources in -- ``src/identifiable_adapters.py`` -- ``src/identifiable.py`` +- :py:mod:`caoscrawler.identifiable_adapters` +- :py:mod:`caoscrawler.identifiable` Registered Identifiables ++++++++++++++++++++++++ @@ -110,7 +110,7 @@ The crawler can be considered the main program doing the synchronization in basi Relevant sources in: -- ``src/crawl.py`` +- :py:mod:`caoscrawler.crawl` diff --git a/src/doc/converters.rst b/src/doc/converters.rst deleted file mode 100644 index f59e6d3dff0a1f75dc4e0e5bcbbee0b4ceb7e81d..0000000000000000000000000000000000000000 --- a/src/doc/converters.rst +++ /dev/null @@ -1,822 +0,0 @@ -Converters -)))))))))) - -Converters treat a StructureElement and during this process create a number of new -StructureElements: the children of the initially treated StructureElement. Thus by treatment of -existing StructureElements, Converters create a tree of StructureElements. - -.. image:: img/converter.png - :height: 170 - -Each StructureElement in the tree has a set of properties, organized as -key-value pairs. -Some of those properties are specified by the type of StructureElement. For example, -a file could have the file name as property: ``'filename': myfile.dat``. -Converters may define additional functions that create further values. For -example, a regular expression could be used to get a date from a file name. - -CFood definition -++++++++++++++++ - -Converter application to data is specified via a tree-like yml file (called ``cfood.yml``, by -convention). The yml file specifies which Converters shall be used on which StructureElements, and -how to treat the generated *child* StructureElements. - -The yaml definition may look like this: - -.. 
todo:: - - This is outdated, see ``cfood-schema.yml`` for the current specification of a ``cfood.yml``. - -.. code-block:: yaml - - <NodeName>: - type: <ConverterName> - match: ".*" - records: - Experiment1: - parents: - - Experiment - - Blablabla - date: $DATUM - (...) - Experiment2: - parents: - - Experiment - subtree: - (...) - -The **<NodeName>** is a description of what the current block represents (e.g. -``experiment-folder``) and is used as an identifier. - -**<type>** selects the converter that is going to be matched against the current structure -element. If the structure element matches (this is a combination of a typecheck and a detailed -match, see the :py:class:`~caoscrawler.converters.Converter` source documentation for details), the -converter will: - -- generate records (with :py:meth:`~caoscrawler.converters.Converter.create_records`) -- possibly process a subtree (with :py:meth:`caoscrawler.converters.Converter.create_children`) - -**match** *TODO* - -**records** is a dict of definitions that define the semantic structure -(see details below). - -**subtree** makes the yaml recursive: It contains a list of new Converter -definitions, which work on the StructureElements that are returned by the -current Converter. - -Transform Functions -+++++++++++++++++++ -Often the situation arises, that you cannot use a value as it is found. Maybe a value should be -increased by an offset or a string should be split into a list of pieces. In order to allow such -simple conversions, transform functions can be named in the converter definition that are then -applied to the respective variables when the converter is executed. - -.. 
code-block:: yaml - - <NodeName>: - type: <ConverterName> - match: ".*" - transform: - <TransformNodeName>: - in: $<in_var_name> - out: $<out_var_name> - functions: - - <func_name>: # name of the function to be applied - <func_arg1>: <func_arg1_value> # key value pairs that are passed as parameters - <func_arg2>: <func_arg2_value> - # ... - -An example that splits the variable ``a`` and puts the generated list in ``b`` is the following: - -.. code-block:: yaml - - Experiment: - type: Dict - match: ".*" - transform: - param_split: - in: $a - out: $b - functions: - - split: # split is a function that is defined by default - marker: "|" # its only parameter is the marker that is used to split the string - records: - Report: - tags: $b - -This splits the string in '$a' and stores the resulting list in '$b'. This is here used to add a -list valued property to the Report Record. - - -There are a number of transform functions that are defined by default (see -``src/caoscrawler/default_transformers.yml``). You can define custom transform functions by adding -them to the cfood definition (see :doc:`CFood Documentation<cfood>`). - - -Standard Converters -+++++++++++++++++++ - -These are the standard converters that exist in a default installation. For writing and applying -*custom converters*, see :ref:`below <Custom Converters>`. - -Directory Converter -=================== -The Directory Converter creates StructureElements for each File and Directory -inside the current Directory. You can match a regular expression against the -directory name using the 'match' key. - -Simple File Converter -===================== -The Simple File Converter does not create any children and is usually used if -a file shall be used as it is and be inserted and referenced by other entities. - -Markdown File Converter -======================= -Reads a YAML header from Markdown files (if such a header exists) and creates -children elements according to the structure of the header. 
- -DictElement Converter -===================== - -DictElement → StructureElement - -Creates a child StructureElement for each key in the dictionary. - -Typical Subtree converters --------------------------- -The following StructureElement types are typically created by the DictElement converter: - -- BooleanElement -- FloatElement -- TextElement -- IntegerElement -- ListElement -- DictElement - -Note that you may use ``TextElement`` for anything that exists in a text format that can be -interpreted by the server, such as date and datetime strings in ISO-8601 format. - -Scalar Value Converters -======================= -`BooleanElementConverter`, `FloatElementConverter`, `TextElementConverter`, and -`IntegerElementConverter` behave very similarly. - -These converters expect `match_name` and `match_value` in their definition -which allow to match the key and the value, respectively. - -Note that there are defaults for accepting other types. For example, -FloatElementConverter also accepts IntegerElements. The default -behavior can be adjusted with the fields `accept_text`, `accept_int`, -`accept_float`, and `accept_bool`. - -The following denotes what kind of StructureElements are accepted by default -(they are defined in `src/caoscrawler/converters.py`): - -- BooleanElementConverter: bool, int -- FloatElementConverter: int, float -- TextElementConverter: text, bool, int, float -- IntegerElementConverter: int -- ListElementConverter: list -- DictElementConverter: dict - -YAMLFileConverter -================= - -A specialized Dict Converter for yaml files: Yaml files are opened and the contents are -converted into dictionaries that can be further converted using the typical subtree converters -of dict converter. - -**WARNING**: Currently unfinished implementation. - -JSONFileConverter -================= - - - - -TableConverter -============== - -Table → DictElement - -A generic converter (abstract) for files containing tables. 
-Currently, there are two specialized implementations for XLSX files and CSV files. - -All table converters generate a subtree of dicts, which in turn can be converted with DictElementConverters: -For each row in the table the TableConverter generates a DictElement (structure element). The key of the -element is the row number. The value of the element is a dict containing the mapping of -column names to values of the respective cell. - -Example: - -.. code-block:: yaml - - subtree: - TABLE: # Any name for the table as a whole - type: CSVTableConverter - match: ^test_table.csv$ - records: - (...) # Records edited for the whole table file - subtree: - ROW: # Any name for a data row in the table - type: DictElement - match_name: .* - match_value: .* - records: - (...) # Records edited for each row - subtree: - COLUMN: # Any name for a specific type of column in the table - type: FloatElement - match_name: measurement # Name of the column in the table file - match_value: (?P<column_value).*) - records: - (...) # Records edited for each cell - - -XLSXTableConverter -================== - -XLSX File → DictElement - -CSVTableConverter -================= - -CSV File → DictElement - -PropertiesFromDictConverter -=========================== - -The :py:class:`~caoscrawler.converters.PropertiesFromDictConverter` is -a specialization of the -:py:class:`~caoscrawler.converters.DictElementConverter` and offers -all its functionality. It is meant to operate on dictionaries (e.g., -from reading in a json or a table file), the keys of which correspond -closely to properties in a LinkAhead datamodel. This is especially -handy in cases where properties may be added to the data model and -data sources that are not yet known when writing the cfood definition. 
- -The converter definition of the -:py:class:`~caoscrawler.converters.PropertiesFromDictConverter` has an -additional required entry ``record_from_dict`` which specifies the -Record to which the properties extracted from the dict are attached -to. This Record is identified by its ``variable_name`` by which it can -be referred to further down the subtree. You can also use the name of -a Record that was specified earlier in the CFood definition in order -to extend it by the properties extracted from a dict. Let's have a -look at a simple example. A CFood definition - -.. code-block:: yaml - - PropertiesFromDictElement: - type: PropertiesFromDictElement - match: ".*" - record_from_dict: - variable_name: MyRec - parents: - - MyType1 - - MyType2 - -applied to a dictionary - -.. code-block:: json - - { - "name": "New name", - "a": 5, - "b": ["a", "b", "c"], - "author": { - "full_name": "Silvia Scientist" - } - } - -will create a Record ``New name`` with parents ``MyType1`` and -``MyType2``. It has a scalar property ``a`` with value 5, a list -property ``b`` with values "a", "b" and "c", and an ``author`` -property which references an ``author`` with a ``full_name`` property -with value "Silvia Scientist": - -.. image:: img/properties-from-dict-records-author.png - :height: 210 - -Note how the different dictionary keys are handled differently -depending on their types: scalar and list values are understood -automatically, and a dictionary-valued entry like ``author`` is -translated into a reference to an ``author`` Record automatically. - -You can further specify how references are treated with an optional -``references key`` in ``record_from_dict``. Let's assume that in the -above example, we have an ``author`` **Property** with datatype -``Person`` in our data model. We could add this information by -extending the above example definition by - - -.. 
code-block:: yaml - - PropertiesFromDictElement: - type: PropertiesFromDictElement - match: ".*" - record_from_dict: - variable_name: MyRec - parents: - - MyType1 - - MyType2 - references: - author: - parents: - - Person - -so that now, a ``Person`` record with a ``full_name`` property with -value "Silvia Scientist" is created as the value of the ``author`` -property: - -.. image:: img/properties-from-dict-records-person.png - :height: 200 - -For the time being, only the parents of the referenced record can be -set via this option. More complicated treatments can be implemented -via the ``referenced_record_callback`` (see below). - -Properties can be blacklisted with the ``properties_blacklist`` -keyword, i.e., all keys listed under ``properties_blacklist`` will be -excluded from automated treatment. Since the -:py:class:`~caoscrawler.converters.PropertiesFromDictConverter` has -all the functionality of the -:py:class:`~caoscrawler.converters.DictElementConverter`, individual -properties can still be used in a subtree. Together with -``properties_blacklist`` this can be used to add custom treatment to -specific properties by blacklisting them in ``record_from_dict`` and -then treating them in the subtree the same as you would do it in the -standard -:py:class:`~caoscrawler.converters.DictElementConverter`. Note that -the blacklisted keys are excluded on **all** levels of the dictionary, -i.e., also when they occur in a referenced entity. - -For further customization, the -:py:class:`~caoscrawler.converters.PropertiesFromDictConverter` can be -used as a basis for :ref:`custom converters<Custom Converters>` which -can make use of its ``referenced_record_callback`` argument. The -``referenced_record_callback`` can be a callable object which takes -exactly a Record as an argument and needs to return that Record after -doing whatever custom treatment is needed. 
Additionally, it is given -the ``RecordStore`` and the ``ValueStore`` in order to be able to -access the records and values that have already been defined from -within ``referenced_record_callback``. Such a function might look the -following: - -.. code-block:: python - - def my_callback(rec: db.Record, records: RecordStore, values: GeneralStore): - # do something with rec, possibly using other records or values from the stores... - rec.description = "This was updated in a callback" - return rec - -It is applied to all Records that are created from the dictionary and -it can be used to, e.g., transform values of some properties, or add -special treatment to all Records of a specific -type. ``referenced_record_callback`` is applied **after** the -properties from the dictionary have been applied as explained above. - - -Further converters -++++++++++++++++++ - -More converters, together with cfood definitions and examples can be found in -the `LinkAhead Crawler Extensions Subgroup -<https://gitlab.com/linkahead/crawler-extensions>`_ on gitlab. In the following, -we list converters that are shipped with the crawler library itself but are not -part of the set of standard converters and may require this library to be -installed with additional optional dependencies. - -HDF5 Converters -=============== - -For treating `HDF5 Files -<https://docs.hdfgroup.org/hdf5/develop/_s_p_e_c.html>`_, there are in total -four individual converters corresponding to the internal structure of HDF5 -files: the :ref:`H5FileConverter` which opens the file itself and creates -further structure elements from HDF5 groups, datasets, and included -multi-dimensional arrays that are in turn treated by the -:ref:`H5GroupConverter`, the :ref:`H5DatasetConverter`, and the -:ref:`H5NdarrayConverter`, respectively. You need to install the LinkAhead -crawler with its optional ``h5-crawler`` dependency for using these converters. 
- -The basic idea when crawling HDF5 files is to treat them very similar to -:ref:`dictionaries <DictElement Converter>` in which the attributes on root, -group, or dataset level are essentially treated like ``BooleanElement``, -``TextElement``, ``FloatElement``, and ``IntegerElement`` in a dictionary: They -are appended as children and can be accessed via the ``subtree``. The file -itself and the groups within may contain further groups and datasets, which can -have their own attributes, subgroups, and datasets, very much like -``DictElements`` within a dictionary. The main difference to any other -dictionary type is the presence of multi-dimensional arrays within HDF5 -datasets. Since LinkAhead doesn't have any datatype corresponding to these, and -since it isn't desirable to store these arrays directly within LinkAhead for -reasons of performance and of searchability, we wrap them within a specific -Record as explained :ref:`below <H5NdarrayConverter>`, together with more -metadata and their internal path within the HDF5 file. Users can thus query for -datasets and their arrays according to their metadata within LinkAhead and then -use the internal path information to access the dataset within the file -directly. The type of this record and the property for storing the internal path -need to be reflected in the datamodel. Using the default names, you would need a -datamodel like - -.. code-block:: yaml - - H5Ndarray: - obligatory_properties: - internal_hdf5-path: - datatype: TEXT - -although the names of both property and record type can be configured within the -cfood definition. 
- -A simple example of a cfood definition for HDF5 files can be found in the `unit -tests -<https://gitlab.com/linkahead/linkahead-crawler/-/blob/main/unittests/h5_cfood.yml?ref_type=heads>`_ -and shows how the individual converters are used in order to crawl a `simple -example file -<https://gitlab.com/linkahead/linkahead-crawler/-/blob/main/unittests/hdf5_dummy_file.hdf5?ref_type=heads>`_ -containing groups, subgroups, and datasets, together with their respective -attributes. - -H5FileConverter ---------------- - -This is an extension of the -:py:class:`~caoscrawler.converters.SimpleFileConverter` class. It opens the HDF5 -file and creates children for any contained group or dataset. Additionally, the -root-level attributes of the HDF5 file are accessible as children. - -H5GroupConverter ----------------- - -This is an extension of the -:py:class:`~caoscrawler.converters.DictElementConverter` class. Children are -created for all subgroups and datasets in this HDF5 group. Additionally, the -group-level attributes are accessible as children. - -H5DatasetConverter ------------------- - -This is an extension of the -:py:class:`~caoscrawler.converters.DictElementConverter` class. Most -importantly, it stores the array data in HDF5 dataset into -:py:class:`~caoscrawler.hdf5_converter.H5NdarrayElement` which is added to its -children, as well as the dataset attributes. - -H5NdarrayConverter ------------------- - -This converter creates a wrapper record for the contained dataset. The name of -this record needs to be specified in the cfood definition of this converter via -the ``recordname`` option. The RecordType of this record can be configured with -the ``array_recordtype_name`` option and defaults to ``H5Ndarray``. Via the -given ``recordname``, this record can be used within the cfood. 
Most -importantly, this record stores the internal path of this array within the HDF5 -file in a text property, the name of which can be configured with the -``internal_path_property_name`` option which defaults to ``internal_hdf5_path``. - -Custom Converters -+++++++++++++++++ - -As mentioned before it is possible to create custom converters. -These custom converters can be used to integrate arbitrary data extraction and ETL capabilities -into the LinkAhead crawler and make these extensions available to any yaml specification. - -Tell the crawler about a custom converter -========================================= - -To use a custom crawler, it must be defined in the ``Converters`` section of the CFood yaml file. -The basic syntax for adding a custom converter to a definition file is: - -.. code-block:: yaml - - Converters: - <NameOfTheConverterInYamlFile>: - package: <python>.<module>.<name> - converter: <PythonClassName> - -The Converters section can be either put into the first or the second document of the cfood yaml file. -It can be also part of a single-document yaml cfood file. Please refer to :doc:`the cfood documentation<cfood>` for more details. - -Details: - -- **<NameOfTheConverterInYamlFile>**: This is the name of the converter as it is going to be used in the present yaml file. -- **<python>.<module>.<name>**: The name of the module where the converter class resides. -- **<PythonClassName>**: Within this specified module there must be a class inheriting from base class :py:class:`caoscrawler.converters.Converter`. - -Implementing a custom converter -=============================== - -Converters inherit from the :py:class:`~caoscrawler.converters.Converter` class. - -The following methods are abstract and need to be overwritten by your custom converter to make it work: - -:py:meth:`~caoscrawler.converters.Converter.create_children`: - Return a list of child StructureElement objects. 
- -- :py:meth:`~caoscrawler.converters.Converter.match` -- :py:meth:`~caoscrawler.converters.Converter.typecheck` - - -Example -======= - -In the following, we will explain the process of adding a custom converter to a yaml file using -a SourceResolver that is able to attach a source element to another entity. - -**Note**: This example might become a standard crawler soon, as part of the scifolder specification. See https://doi.org/10.3390/data5020043 for details. In this documentation example we will, therefore, add it to a package called "scifolder". - -First we will create our package and module structure, which might be: - -.. code-block:: - - scifolder_package/ - README.md - setup.cfg - setup.py - Makefile - tox.ini - src/ - scifolder/ - __init__.py - converters/ - __init__.py - sources.py # <- the actual file containing - # the converter class - doc/ - unittests/ - -Now we need to create a class called "SourceResolver" in the file "sources.py". In this - more advanced - example, we will not inherit our converter directly from :py:class:`~caoscrawler.converters.Converter`, but use :py:class:`~caoscrawler.converters.TextElementConverter`. The latter already implements :py:meth:`~caoscrawler.converters.Converter.match` and :py:meth:`~caoscrawler.converters.Converter.typecheck`, so only an implementation for :py:meth:`~caoscrawler.converters.Converter.create_children` has to be provided by us. -Furthermore we will customize the method :py:meth:`~caoscrawler.converters.Converter.create_records` that allows us to specify a more complex record generation procedure than provided in the standard implementation. One specific limitation of the standard implementation is, that only a fixed -number of records can be generated by the yaml definition. So for any applications - like here - that require an arbitrary number of records to be created, a customized implementation of :py:meth:`~caoscrawler.converters.Converter.create_records` is recommended. 
-In this context it is recommended to make use of the function :func:`caoscrawler.converters.create_records` that implements creation of record objects from python dictionaries of the same structure -that would be given using a yaml definition (see next section below). - -.. code-block:: python - - import re - from caoscrawler.stores import GeneralStore, RecordStore - from caoscrawler.converters import TextElementConverter, create_records - from caoscrawler.structure_elements import StructureElement, TextElement - - - class SourceResolver(TextElementConverter): - """ - This resolver uses a source list element (e.g. from the markdown readme file) - to link sources correctly. - """ - - def __init__(self, definition: dict, name: str, - converter_registry: dict): - """ - Initialize a new directory converter. - """ - super().__init__(definition, name, converter_registry) - - def create_children(self, generalStore: GeneralStore, - element: StructureElement): - - # The source resolver does not create children: - - return [] - - def create_records(self, values: GeneralStore, - records: RecordStore, - element: StructureElement, - file_path_prefix): - if not isinstance(element, TextElement): - raise RuntimeError() - - # This function must return a list containing tuples, each one for a modified - # property: (name_of_entity, name_of_property) - keys_modified = [] - - # This is the name of the entity where the source is going to be attached: - attach_to_scientific_activity = self.definition["scientific_activity"] - rec = records[attach_to_scientific_activity] - - # The "source" is a path to a source project, so it should have the form: - # /<Category>/<project>/<scientific_activity>/ - # obtain these information from the structure element: - val = element.value - regexp = (r'/(?P<category>(SimulationData)|(ExperimentalData)|(DataAnalysis))' - '/(?P<project_date>.*?)_(?P<project_identifier>.*)' - '/(?P<date>[0-9]{4,4}-[0-9]{2,2}-[0-9]{2,2})(_(?P<identifier>.*))?/') - - res = 
re.match(regexp, val) - if res is None: - raise RuntimeError("Source cannot be parsed correctly.") - - # Mapping of categories on the file system to corresponding record types in CaosDB: - cat_map = { - "SimulationData": "Simulation", - "ExperimentalData": "Experiment", - "DataAnalysis": "DataAnalysis"} - linkrt = cat_map[res.group("category")] - - keys_modified.extend(create_records(values, records, { - "Project": { - "date": res.group("project_date"), - "identifier": res.group("project_identifier"), - }, - linkrt: { - "date": res.group("date"), - "identifier": res.group("identifier"), - "project": "$Project" - }, - attach_to_scientific_activity: { - "sources": "+$" + linkrt - }}, file_path_prefix)) - - # Process the records section of the yaml definition: - keys_modified.extend( - super().create_records(values, records, element, file_path_prefix)) - - # The create_records function must return the modified keys to make it compatible - # to the crawler functions: - return keys_modified - - -If the recommended (python) package structure is used, the package containing the converter -definition can just be installed using `pip install .` or `pip install -e .` from the -`scifolder_package` directory. - -The following yaml block will register the converter in a yaml file: - -.. code-block:: yaml - - Converters: - SourceResolver: - package: scifolder.converters.sources - converter: SourceResolver - - -Using the `create_records` API function -======================================= - -The function :func:`caoscrawler.converters.create_records` was already mentioned above and it is -the recommended way to create new records from custom converters. Let's have a look at the -function signature: - -.. code-block:: python - - def create_records(values: GeneralStore, # <- pass the current variables store here - records: RecordStore, # <- pass the current store of CaosDB records here - def_records: dict): # <- This is the actual definition of new records! 
- - -`def_records` is the actual definition of new records according to the yaml cfood specification -(work in progress, in the docs). Essentially you can do everything here, that you could do -in the yaml document as well, but using python source code. - -Let's have a look at a few examples: - -.. code-block:: yaml - - DirConverter: - type: Directory - match: (?P<dir_name>.*) - records: - Experiment: - identifier: $dir_name - -This block will just create a new record with parent `Experiment` and one property -`identifier` with a value derived from the matching regular expression. - -Let's formulate that using `create_records`: - -.. code-block:: python - - dir_name = "directory name" - - record_def = { - "Experiment": { - "identifier": dir_name - } - } - - keys_modified = create_records(values, records, - record_def) - -The `dir_name` is set explicitely here, everything else is identical to the yaml statements. - - -The role of `keys_modified` -=========================== - -You probably have noticed already, that :func:`caoscrawler.converters.create_records` returns -`keys_modified` which is a list of tuples. Each element of `keys_modified` has two elements: - -- Element 0 is the name of the record that is modified (as used in the record store `records`). -- Element 1 is the name of the property that is modified. - -It is important, that the correct list of modified keys is returned by -:py:meth:`~caoscrawler.converters.Converter.create_records` to make the crawler process work. - -So, a sketch of a typical implementation within a custom converter could look like this: - - -.. code-block:: python - - def create_records(self, values: GeneralStore, - records: RecordStore, - element: StructureElement, - file_path_prefix: str): - - # Modify some records: - record_def = { - # ... 
- } - - keys_modified = create_records(values, records, - record_def) - - # You can of course do it multiple times: - keys_modified.extend(create_records(values, records, - record_def)) - - # You can also process the records section of the yaml definition: - keys_modified.extend( - super().create_records(values, records, element, file_path_prefix)) - # This essentially allows users of your converter to customize the creation of records - # by providing a custom "records" section additionally to the modifications provided - # in this implementation of the Converter. - - # Important: Return the list of modified keys! - return keys_modified - - -More complex example -==================== - -Let's have a look at a more complex examples, defining multiple records: - -.. code-block:: yaml - - DirConverter: - type: Directory - match: (?P<dir_name>.*) - records: - Project: - identifier: project_name - Experiment: - identifier: $dir_name - Project: $Project - ProjectGroup: - projects: +$Project - - -This block will create two new Records: - -- A project with a constant identifier -- An experiment with an identifier, derived from a regular expression and a reference to the new project. - -Furthermore a Record `ProjectGroup` will be edited (its initial definition is not given in the -yaml block): The project that was just created will be added as a list element to the property -`projects`. - -Let's formulate that using `create_records` (again, `dir_name` is constant here): - -.. code-block:: python - - dir_name = "directory name" - - record_def = { - "Project": { - "identifier": "project_name", - } - "Experiment": { - "identifier": dir_name, - "Project": "$Project", - } - "ProjectGroup": { - "projects": "+$Project", - } - - } - - keys_modified = create_records(values, records, - record_def) - -Debugging -========= - -You can add the key `debug_match` to the definition of a Converter in order to create debugging -output for the match step. 
The following snippet illustrates this: - -.. code-block:: yaml - - DirConverter: - type: Directory - match: (?P<dir_name>.*) - debug_match: True - records: - Project: - identifier: project_name - - -Whenever this Converter tries to match a StructureElement, it logs what was tried to macht against -what and what the result was. diff --git a/src/doc/converters/cfood_definition.rst b/src/doc/converters/cfood_definition.rst new file mode 100644 index 0000000000000000000000000000000000000000..13c04fd38df8b00c435192a1c3cf02147f870b4c --- /dev/null +++ b/src/doc/converters/cfood_definition.rst @@ -0,0 +1,50 @@ +CFood definition +++++++++++++++++ + +Converter application to data is specified via a tree-like yml file (called ``cfood.yml``, by +convention). The yml file specifies which Converters shall be used on which StructureElements, and +how to treat the generated *child* StructureElements. + +The yaml definition may look like this: + +.. todo:: + + This is outdated, see ``cfood-schema.yml`` for the current specification of a ``cfood.yml``. + +.. code-block:: yaml + + <NodeName>: + type: <ConverterName> + match: ".*" + records: + Experiment1: + parents: + - Experiment + - Blablabla + date: $DATUM + (...) + Experiment2: + parents: + - Experiment + subtree: + (...) + +The **<NodeName>** is a description of what the current block represents (e.g. +``experiment-folder``) and is used as an identifier. + +**<type>** selects the converter that is going to be matched against the current structure +element. 
If the structure element matches (this is a combination of a typecheck and a detailed
+match, see the :py:class:`~caoscrawler.converters.Converter` source documentation for details), the
+converter will:
+
+- generate records (with :py:meth:`~caoscrawler.converters.Converter.create_records`)
+- possibly process a subtree (with :py:meth:`caoscrawler.converters.Converter.create_children`)
+
+**match** *TODO*
+
+**records** is a dict of definitions that define the semantic structure
+(see details below).
+
+**subtree** makes the yaml recursive: It contains a list of new Converter
+definitions, which work on the StructureElements that are returned by the
+current Converter.
diff --git a/src/doc/converters/custom_converters.rst b/src/doc/converters/custom_converters.rst
new file mode 100644
index 0000000000000000000000000000000000000000..573d9714488eaacd2c794b1fa497306a8d110a5f
--- /dev/null
+++ b/src/doc/converters/custom_converters.rst
@@ -0,0 +1,344 @@
+Custom Converters
++++++++++++++++++
+
+As mentioned before it is possible to create custom converters.
+These custom converters can be used to integrate arbitrary data extraction and ETL capabilities
+into the LinkAhead crawler and make these extensions available to any yaml specification.
+
+Tell the crawler about a custom converter
+=========================================
+
+To use a custom converter, it must be defined in the ``Converters`` section of the CFood yaml file.
+The basic syntax for adding a custom converter to a definition file is:
+
+.. code-block:: yaml
+
+  Converters:
+    <NameOfTheConverterInYamlFile>:
+      package: <python>.<module>.<name>
+      converter: <PythonClassName>
+
+The Converters section can be either put into the first or the second
+document of the cfood yaml file. It can also be part of a
+single-document yaml cfood file. Please refer to :doc:`the cfood
+documentation<../cfood>` for more details. 
+ +Details: + +- **<NameOfTheConverterInYamlFile>**: This is the name of the converter as it is going to be used in the present yaml file. +- **<python>.<module>.<name>**: The name of the module where the converter class resides. +- **<PythonClassName>**: Within this specified module there must be a class inheriting from base class :py:class:`caoscrawler.converters.Converter`. + +Implementing a custom converter +=============================== + +Converters inherit from the :py:class:`~caoscrawler.converters.Converter` class. + +The following methods are abstract and need to be overwritten by your custom converter to make it work: + +:py:meth:`~caoscrawler.converters.Converter.create_children`: + Return a list of child StructureElement objects. + +- :py:meth:`~caoscrawler.converters.Converter.match` +- :py:meth:`~caoscrawler.converters.Converter.typecheck` + + +Example +======= + +In the following, we will explain the process of adding a custom converter to a yaml file using +a SourceResolver that is able to attach a source element to another entity. + +**Note**: This example might become a standard crawler soon, as part of the scifolder specification. See https://doi.org/10.3390/data5020043 for details. In this documentation example we will, therefore, add it to a package called "scifolder". + +First we will create our package and module structure, which might be: + +.. code-block:: + + scifolder_package/ + README.md + setup.cfg + setup.py + Makefile + tox.ini + src/ + scifolder/ + __init__.py + converters/ + __init__.py + sources.py # <- the actual file containing + # the converter class + doc/ + unittests/ + +Now we need to create a class called "SourceResolver" in the file "sources.py". In this - more advanced - example, we will not inherit our converter directly from :py:class:`~caoscrawler.converters.Converter`, but use :py:class:`~caoscrawler.converters.TextElementConverter`. 
The latter already implements :py:meth:`~caoscrawler.converters.Converter.match` and :py:meth:`~caoscrawler.converters.Converter.typecheck`, so only an implementation for :py:meth:`~caoscrawler.converters.Converter.create_children` has to be provided by us. +Furthermore we will customize the method :py:meth:`~caoscrawler.converters.Converter.create_records` that allows us to specify a more complex record generation procedure than provided in the standard implementation. One specific limitation of the standard implementation is, that only a fixed +number of records can be generated by the yaml definition. So for any applications - like here - that require an arbitrary number of records to be created, a customized implementation of :py:meth:`~caoscrawler.converters.Converter.create_records` is recommended. +In this context it is recommended to make use of the function :func:`caoscrawler.converters.create_records` that implements creation of record objects from python dictionaries of the same structure +that would be given using a yaml definition (see next section below). + +.. code-block:: python + + import re + from caoscrawler.stores import GeneralStore, RecordStore + from caoscrawler.converters import TextElementConverter, create_records + from caoscrawler.structure_elements import StructureElement, TextElement + + + class SourceResolver(TextElementConverter): + """ + This resolver uses a source list element (e.g. from the markdown readme file) + to link sources correctly. + """ + + def __init__(self, definition: dict, name: str, + converter_registry: dict): + """ + Initialize a new directory converter. 
+ """ + super().__init__(definition, name, converter_registry) + + def create_children(self, generalStore: GeneralStore, + element: StructureElement): + + # The source resolver does not create children: + + return [] + + def create_records(self, values: GeneralStore, + records: RecordStore, + element: StructureElement, + file_path_prefix): + if not isinstance(element, TextElement): + raise RuntimeError() + + # This function must return a list containing tuples, each one for a modified + # property: (name_of_entity, name_of_property) + keys_modified = [] + + # This is the name of the entity where the source is going to be attached: + attach_to_scientific_activity = self.definition["scientific_activity"] + rec = records[attach_to_scientific_activity] + + # The "source" is a path to a source project, so it should have the form: + # /<Category>/<project>/<scientific_activity>/ + # obtain these information from the structure element: + val = element.value + regexp = (r'/(?P<category>(SimulationData)|(ExperimentalData)|(DataAnalysis))' + '/(?P<project_date>.*?)_(?P<project_identifier>.*)' + '/(?P<date>[0-9]{4,4}-[0-9]{2,2}-[0-9]{2,2})(_(?P<identifier>.*))?/') + + res = re.match(regexp, val) + if res is None: + raise RuntimeError("Source cannot be parsed correctly.") + + # Mapping of categories on the file system to corresponding record types in CaosDB: + cat_map = { + "SimulationData": "Simulation", + "ExperimentalData": "Experiment", + "DataAnalysis": "DataAnalysis"} + linkrt = cat_map[res.group("category")] + + keys_modified.extend(create_records(values, records, { + "Project": { + "date": res.group("project_date"), + "identifier": res.group("project_identifier"), + }, + linkrt: { + "date": res.group("date"), + "identifier": res.group("identifier"), + "project": "$Project" + }, + attach_to_scientific_activity: { + "sources": "+$" + linkrt + }}, file_path_prefix)) + + # Process the records section of the yaml definition: + keys_modified.extend( + 
super().create_records(values, records, element, file_path_prefix)) + + # The create_records function must return the modified keys to make it compatible + # to the crawler functions: + return keys_modified + + +If the recommended (python) package structure is used, the package containing the converter +definition can just be installed using `pip install .` or `pip install -e .` from the +`scifolder_package` directory. + +The following yaml block will register the converter in a yaml file: + +.. code-block:: yaml + + Converters: + SourceResolver: + package: scifolder.converters.sources + converter: SourceResolver + + +Using the `create_records` API function +======================================= + +The function :func:`caoscrawler.converters.create_records` was already mentioned above and it is +the recommended way to create new records from custom converters. Let's have a look at the +function signature: + +.. code-block:: python + + def create_records(values: GeneralStore, # <- pass the current variables store here + records: RecordStore, # <- pass the current store of CaosDB records here + def_records: dict): # <- This is the actual definition of new records! + + +`def_records` is the actual definition of new records according to the yaml cfood specification +(work in progress, in the docs). Essentially you can do everything here, that you could do +in the yaml document as well, but using python source code. + +Let's have a look at a few examples: + +.. code-block:: yaml + + DirConverter: + type: Directory + match: (?P<dir_name>.*) + records: + Experiment: + identifier: $dir_name + +This block will just create a new record with parent `Experiment` and one property +`identifier` with a value derived from the matching regular expression. + +Let's formulate that using `create_records`: + +.. 
code-block:: python
+
+   dir_name = "directory name"
+
+   record_def = {
+     "Experiment": {
+       "identifier": dir_name
+     }
+   }
+
+   keys_modified = create_records(values, records,
+                                  record_def)
+
+The `dir_name` is set explicitly here, everything else is identical to the yaml statements.
+
+
+The role of `keys_modified`
+===========================
+
+You probably have noticed already, that :func:`caoscrawler.converters.create_records` returns
+`keys_modified` which is a list of tuples. Each element of `keys_modified` has two elements:
+
+- Element 0 is the name of the record that is modified (as used in the record store `records`).
+- Element 1 is the name of the property that is modified.
+
+It is important, that the correct list of modified keys is returned by
+:py:meth:`~caoscrawler.converters.Converter.create_records` to make the crawler process work.
+
+So, a sketch of a typical implementation within a custom converter could look like this:
+
+
+.. code-block:: python
+
+   def create_records(self, values: GeneralStore,
+                      records: RecordStore,
+                      element: StructureElement,
+                      file_path_prefix: str):
+
+     # Modify some records:
+     record_def = {
+       # ...
+     }
+
+     keys_modified = create_records(values, records,
+                                    record_def)
+
+     # You can of course do it multiple times:
+     keys_modified.extend(create_records(values, records,
+                                         record_def))
+
+     # You can also process the records section of the yaml definition:
+     keys_modified.extend(
+       super().create_records(values, records, element, file_path_prefix))
+     # This essentially allows users of your converter to customize the creation of records
+     # by providing a custom "records" section additionally to the modifications provided
+     # in this implementation of the Converter.
+
+     # Important: Return the list of modified keys!
+     return keys_modified
+
+
+More complex example
+====================
+
+Let's have a look at a more complex example, defining multiple records:
+
+.. 
code-block:: yaml
+
+  DirConverter:
+    type: Directory
+    match: (?P<dir_name>.*)
+    records:
+      Project:
+        identifier: project_name
+      Experiment:
+        identifier: $dir_name
+        Project: $Project
+      ProjectGroup:
+        projects: +$Project
+
+
+This block will create two new Records:
+
+- A project with a constant identifier
+- An experiment with an identifier, derived from a regular expression and a reference to the new project.
+
+Furthermore a Record `ProjectGroup` will be edited (its initial definition is not given in the
+yaml block): The project that was just created will be added as a list element to the property
+`projects`.
+
+Let's formulate that using `create_records` (again, `dir_name` is constant here):
+
+.. code-block:: python
+
+  dir_name = "directory name"
+
+  record_def = {
+    "Project": {
+      "identifier": "project_name",
+    },
+    "Experiment": {
+      "identifier": dir_name,
+      "Project": "$Project",
+    },
+    "ProjectGroup": {
+      "projects": "+$Project",
+    },
+
+  }
+
+  keys_modified = create_records(values, records,
+                                 record_def)
+
+Debugging
+=========
+
+You can add the key `debug_match` to the definition of a Converter in order to create debugging
+output for the match step. The following snippet illustrates this:
+
+.. code-block:: yaml
+
+  DirConverter:
+    type: Directory
+    match: (?P<dir_name>.*)
+    debug_match: True
+    records:
+      Project:
+        identifier: project_name
+
+
+Whenever this Converter tries to match a StructureElement, it logs what was tried to match against
+what and what the result was. 
diff --git a/src/doc/converters/further_converters.rst b/src/doc/converters/further_converters.rst new file mode 100644 index 0000000000000000000000000000000000000000..539c5159eb1de01765a78e3c04e10fb3f0be9be5 --- /dev/null +++ b/src/doc/converters/further_converters.rst @@ -0,0 +1,98 @@ +Further converters +++++++++++++++++++ + +More converters, together with cfood definitions and examples can be found in +the `LinkAhead Crawler Extensions Subgroup +<https://gitlab.com/linkahead/crawler-extensions>`_ on gitlab. In the following, +we list converters that are shipped with the crawler library itself but are not +part of the set of standard converters and may require this library to be +installed with additional optional dependencies. + +HDF5 Converters +=============== + +For treating `HDF5 Files +<https://docs.hdfgroup.org/hdf5/develop/_s_p_e_c.html>`_, there are in total +four individual converters corresponding to the internal structure of HDF5 +files: the :ref:`H5FileConverter` which opens the file itself and creates +further structure elements from HDF5 groups, datasets, and included +multi-dimensional arrays that are in turn treated by the +:ref:`H5GroupConverter`, the :ref:`H5DatasetConverter`, and the +:ref:`H5NdarrayConverter`, respectively. You need to install the LinkAhead +crawler with its optional ``h5-crawler`` dependency for using these converters. + +The basic idea when crawling HDF5 files is to treat them very similar to +:ref:`dictionaries <DictElement Converter>` in which the attributes on root, +group, or dataset level are essentially treated like ``BooleanElement``, +``TextElement``, ``FloatElement``, and ``IntegerElement`` in a dictionary: They +are appended as children and can be accessed via the ``subtree``. The file +itself and the groups within may contain further groups and datasets, which can +have their own attributes, subgroups, and datasets, very much like +``DictElements`` within a dictionary. 
The main difference to any other +dictionary type is the presence of multi-dimensional arrays within HDF5 +datasets. Since LinkAhead doesn't have any datatype corresponding to these, and +since it isn't desirable to store these arrays directly within LinkAhead for +reasons of performance and of searchability, we wrap them within a specific +Record as explained :ref:`below <H5NdarrayConverter>`, together with more +metadata and their internal path within the HDF5 file. Users can thus query for +datasets and their arrays according to their metadata within LinkAhead and then +use the internal path information to access the dataset within the file +directly. The type of this record and the property for storing the internal path +need to be reflected in the datamodel. Using the default names, you would need a +datamodel like + +.. code-block:: yaml + + H5Ndarray: + obligatory_properties: + internal_hdf5-path: + datatype: TEXT + +although the names of both property and record type can be configured within the +cfood definition. + +A simple example of a cfood definition for HDF5 files can be found in the `unit +tests +<https://gitlab.com/linkahead/linkahead-crawler/-/blob/main/unittests/h5_cfood.yml?ref_type=heads>`_ +and shows how the individual converters are used in order to crawl a `simple +example file +<https://gitlab.com/linkahead/linkahead-crawler/-/blob/main/unittests/hdf5_dummy_file.hdf5?ref_type=heads>`_ +containing groups, subgroups, and datasets, together with their respective +attributes. + +H5FileConverter +--------------- + +This is an extension of the +:py:class:`~caoscrawler.converters.SimpleFileConverter` class. It opens the HDF5 +file and creates children for any contained group or dataset. Additionally, the +root-level attributes of the HDF5 file are accessible as children. + +H5GroupConverter +---------------- + +This is an extension of the +:py:class:`~caoscrawler.converters.DictElementConverter` class. 
Children are +created for all subgroups and datasets in this HDF5 group. Additionally, the +group-level attributes are accessible as children. + +H5DatasetConverter +------------------ + +This is an extension of the +:py:class:`~caoscrawler.converters.DictElementConverter` class. Most +importantly, it stores the array data in HDF5 dataset into +:py:class:`~caoscrawler.hdf5_converter.H5NdarrayElement` which is added to its +children, as well as the dataset attributes. + +H5NdarrayConverter +------------------ + +This converter creates a wrapper record for the contained dataset. The name of +this record needs to be specified in the cfood definition of this converter via +the ``recordname`` option. The RecordType of this record can be configured with +the ``array_recordtype_name`` option and defaults to ``H5Ndarray``. Via the +given ``recordname``, this record can be used within the cfood. Most +importantly, this record stores the internal path of this array within the HDF5 +file in a text property, the name of which can be configured with the +``internal_path_property_name`` option which defaults to ``internal_hdf5_path``. diff --git a/src/doc/converters/index.rst b/src/doc/converters/index.rst new file mode 100644 index 0000000000000000000000000000000000000000..38fc11335a2640f645e9b4e093690d1ffa7cd07f --- /dev/null +++ b/src/doc/converters/index.rst @@ -0,0 +1,29 @@ +Converters +)))))))))) + +Converters treat a StructureElement and during this process create a number of new +StructureElements: the children of the initially treated StructureElement. Thus by treatment of +existing StructureElements, Converters create a tree of StructureElements. + +.. image:: ../img/converter.png + :height: 170 + :alt: Converters are Python classes that tell the crawler how to + interpret StructureElements. + +Each StructureElement in the tree has a set of properties, organized as +key-value pairs. +Some of those properties are specified by the type of StructureElement. 
For example, +a file could have the file name as property: ``'filename': myfile.dat``. +Converters may define additional functions that create further values. For +example, a regular expression could be used to get a date from a file name. + +.. toctree:: + :maxdepth: 1 + :caption: Contents: + + CFood definition<cfood_definition> + Standard converters<standard_converters> + Further converters<further_converters> + Custom converters<custom_converters> + Transform functions<transform_functions> + diff --git a/src/doc/converters/standard_converters.rst b/src/doc/converters/standard_converters.rst new file mode 100644 index 0000000000000000000000000000000000000000..3dc3c882e76e10706d030ba0695d498631bf7b28 --- /dev/null +++ b/src/doc/converters/standard_converters.rst @@ -0,0 +1,333 @@ +Standard Converters ++++++++++++++++++++ + +These are the standard converters that exist in a default installation. For writing and applying +*custom converters*, see :ref:`below <Custom Converters>`. + +Directory Converter +=================== +The Directory Converter creates StructureElements for each File and Directory +inside the current Directory. You can match a regular expression against the +directory name using the 'match' key. + +Simple File Converter +===================== +The Simple File Converter does not create any children and is usually used if +a file shall be used as it is and be inserted and referenced by other entities. + +Markdown File Converter +======================= +Reads a YAML header from Markdown files (if such a header exists) and creates +children elements according to the structure of the header. + +DictElement Converter +===================== + +DictElement → StructureElement + +Creates a child StructureElement for each key in the dictionary. 
+ +Typical Subtree converters +-------------------------- +The following StructureElement types are typically created by the DictElement converter: + +- BooleanElement +- FloatElement +- TextElement +- IntegerElement +- ListElement +- DictElement + +Note that you may use ``TextElement`` for anything that exists in a text format that can be +interpreted by the server, such as date and datetime strings in ISO-8601 format. + +Scalar Value Converters +======================= +`BooleanElementConverter`, `FloatElementConverter`, `TextElementConverter`, and +`IntegerElementConverter` behave very similarly. + +These converters expect `match_name` and `match_value` in their definition +which allow to match the key and the value, respectively. + +Note that there are defaults for accepting other types. For example, +FloatElementConverter also accepts IntegerElements. The default +behavior can be adjusted with the fields `accept_text`, `accept_int`, +`accept_float`, and `accept_bool`. + +The following denotes what kind of StructureElements are accepted by default +(they are defined in `src/caoscrawler/converters.py`): + +- BooleanElementConverter: bool, int +- FloatElementConverter: int, float +- TextElementConverter: text, bool, int, float +- IntegerElementConverter: int +- ListElementConverter: list +- DictElementConverter: dict + +YAMLFileConverter +================= + +A specialized Dict Converter for yaml files: Yaml files are opened and the contents are +converted into dictionaries that can be further converted using the typical subtree converters +of dict converter. + +**WARNING**: Currently unfinished implementation. + +JSONFileConverter +================= + + + + +TableConverter +============== + +Table → DictElement + +A generic converter (abstract) for files containing tables. +Currently, there are two specialized implementations for XLSX files and CSV files. 
+ +All table converters generate a subtree of dicts, which in turn can be converted with DictElementConverters: +For each row in the table the TableConverter generates a DictElement (structure element). The key of the +element is the row number. The value of the element is a dict containing the mapping of +column names to values of the respective cell. + +Example: + +.. code-block:: yaml + + subtree: + TABLE: # Any name for the table as a whole + type: CSVTableConverter + match: ^test_table.csv$ + records: + (...) # Records edited for the whole table file + subtree: + ROW: # Any name for a data row in the table + type: DictElement + match_name: .* + match_value: .* + records: + (...) # Records edited for each row + subtree: + COLUMN: # Any name for a specific type of column in the table + type: FloatElement + match_name: measurement # Name of the column in the table file + match_value: (?P<column_value>.*) + records: + (...) # Records edited for each cell + + +XLSXTableConverter +================== + +XLSX File → DictElement + +CSVTableConverter +================= + +CSV File → DictElement + +PropertiesFromDictConverter +=========================== + +The :py:class:`~caoscrawler.converters.PropertiesFromDictConverter` is +a specialization of the +:py:class:`~caoscrawler.converters.DictElementConverter` and offers +all its functionality. It is meant to operate on dictionaries (e.g., +from reading in a json or a table file), the keys of which correspond +closely to properties in a LinkAhead datamodel. This is especially +handy in cases where properties may be added to the data model and +data sources that are not yet known when writing the cfood definition. + +The converter definition of the +:py:class:`~caoscrawler.converters.PropertiesFromDictConverter` has an +additional required entry ``record_from_dict`` which specifies the +Record to which the properties extracted from the dict are attached +to. 
This Record is identified by its ``variable_name`` by which it can +be referred to further down the subtree. You can also use the name of +a Record that was specified earlier in the CFood definition in order +to extend it by the properties extracted from a dict. Let's have a +look at a simple example. A CFood definition + +.. code-block:: yaml + + PropertiesFromDictElement: + type: PropertiesFromDictElement + match: ".*" + record_from_dict: + variable_name: MyRec + parents: + - MyType1 + - MyType2 + +applied to a dictionary + +.. code-block:: json + + { + "name": "New name", + "a": 5, + "b": ["a", "b", "c"], + "author": { + "full_name": "Silvia Scientist" + } + } + +will create a Record ``New name`` with parents ``MyType1`` and +``MyType2``. It has a scalar property ``a`` with value 5, a list +property ``b`` with values "a", "b" and "c", and an ``author`` +property which references an ``author`` with a ``full_name`` property +with value "Silvia Scientist": + +.. image:: ../img/properties-from-dict-records-author.png + :height: 210 + :alt: A Record "New Name" and an author Record with full_name + "Silvia Scientist" are generated and filled automatically. + +Note how the different dictionary keys are handled differently +depending on their types: scalar and list values are understood +automatically, and a dictionary-valued entry like ``author`` is +translated into a reference to an ``author`` Record automatically. + +You can further specify how references are treated with an optional +``references key`` in ``record_from_dict``. Let's assume that in the +above example, we have an ``author`` **Property** with datatype +``Person`` in our data model. We could add this information by +extending the above example definition by + + +.. 
code-block:: yaml + + PropertiesFromDictElement: + type: PropertiesFromDictElement + match: ".*" + record_from_dict: + variable_name: MyRec + parents: + - MyType1 + - MyType2 + references: + author: + parents: + - Person + +so that now, a ``Person`` record with a ``full_name`` property with +value "Silvia Scientist" is created as the value of the ``author`` +property: + +.. image:: ../img/properties-from-dict-records-person.png + :height: 200 + :alt: A new Person Record is created which is referenced as an + author. + +For the time being, only the parents of the referenced record can be +set via this option. More complicated treatments can be implemented +via the ``referenced_record_callback`` (see below). + +Properties can be blacklisted with the ``properties_blacklist`` +keyword, i.e., all keys listed under ``properties_blacklist`` will be +excluded from automated treatment. Since the +:py:class:`~caoscrawler.converters.PropertiesFromDictConverter` has +all the functionality of the +:py:class:`~caoscrawler.converters.DictElementConverter`, individual +properties can still be used in a subtree. Together with +``properties_blacklist`` this can be used to add custom treatment to +specific properties by blacklisting them in ``record_from_dict`` and +then treating them in the subtree the same as you would do it in the +standard +:py:class:`~caoscrawler.converters.DictElementConverter`. Note that +the blacklisted keys are excluded on **all** levels of the dictionary, +i.e., also when they occur in a referenced entity. + +For further customization, the +:py:class:`~caoscrawler.converters.PropertiesFromDictConverter` can be +used as a basis for :ref:`custom converters<Custom Converters>` which +can make use of its ``referenced_record_callback`` argument. The +``referenced_record_callback`` can be a callable object which takes +exactly a Record as an argument and needs to return that Record after +doing whatever custom treatment is needed. 
Additionally, it is given +the ``RecordStore`` and the ``ValueStore`` in order to be able to +access the records and values that have already been defined from +within ``referenced_record_callback``. Such a function might look the +following: + +.. code-block:: python + + def my_callback(rec: db.Record, records: RecordStore, values: GeneralStore): + # do something with rec, possibly using other records or values from the stores... + rec.description = "This was updated in a callback" + return rec + +It is applied to all Records that are created from the dictionary and +it can be used to, e.g., transform values of some properties, or add +special treatment to all Records of a specific +type. ``referenced_record_callback`` is applied **after** the +properties from the dictionary have been applied as explained above. + +XML Converters +============== + +There are the following converters for XML content: + + +XMLFileConverter +---------------- + +This is a converter that loads an XML file and creates an XMLElement containing the +root element of the XML tree. It can be matched in the subtree using the XMLTagConverter. + +XMLTagConverter +--------------- + +The XMLTagConverter is a generic converter for XMLElements with the following main features: + +- It allows to match a combination of tag name, attribute names and text contents using the keys: + + - ``match_tag``: regexp, default empty string + - ``match_attrib``: dictionary of key-regexps and value-regexp + pairs. Each key matches an attribute name and the corresponding + value matches its attribute value. + - ``match_text``: regexp, default empty string +- It allows to traverse the tree using XPath (using Python lxml's xpath functions): + + - The key ``xpath`` is used to set the xpath expression and has a + default of ``child::*``. Its default would generate just the list of + sub nodes of the current node. The result of the xpath expression + is used to generate structure elements as children. 
It furthermore + uses the keys ``tags_as_children``, ``attribs_as_children`` and + ``text_as_children`` to decide which information from the found + nodes will be used as children: + - ``tags_as_children``: (default ``true``) For each xml tag element + found by the xpath expression, generate one XMLTag structure + element. Its name is the full path to the tag using the function + ``getelementpath`` from ``lxml``. + - ``attribs_as_children``: (default ``false``) For each xml tag element + found by the xpath expression, generate one XMLAttributeNode + structure element for each of its attributes. The name of the + respective attribute node has the form: ``<full path of the tag> @ + <name of the attribute>`` **Please note:** Currently, there is no + converter implemented that can match XMLAttributeNodes. + - ``text_as_children``: (default ``false``) For each xml tag element + found by the xpath expression, generate one XMLTextNode structure + element containing the text content of the tag element. Note that + in case of multiple text elements, only the first one is + added. The name of the respective text node has the form: + ``<full path of the tag> /text()``, where the full path to the tag is again determined using the function + ``getelementpath`` from ``lxml``. **Please note:** Currently, there is + no converter implemented that can match XMLTextNodes. + +Namespaces +********** + +The default is to take the namespace map from the current node and use +it in xpath queries. Because default namespaces cannot be handled by +xpath, it is possible to remap the default namespace using the key +``default_namespace``. The key ``nsmap`` can be used to define +additional nsmap entries. + +XMLTextNodeConverter +-------------------- + +In the future, this converter can be used to match XMLTextNodes that +are generated by the XMLTagConverter. 
diff --git a/src/doc/converters/transform_functions.rst b/src/doc/converters/transform_functions.rst new file mode 100644 index 0000000000000000000000000000000000000000..22df35c8521ea0d70b2ebf7b7c8bc7c52e176bd3 --- /dev/null +++ b/src/doc/converters/transform_functions.rst @@ -0,0 +1,47 @@ +Transform Functions ++++++++++++++++++++ +Often the situation arises, that you cannot use a value as it is found. Maybe a value should be +increased by an offset or a string should be split into a list of pieces. In order to allow such +simple conversions, transform functions can be named in the converter definition that are then +applied to the respective variables when the converter is executed. + +.. code-block:: yaml + + <NodeName>: + type: <ConverterName> + match: ".*" + transform: + <TransformNodeName>: + in: $<in_var_name> + out: $<out_var_name> + functions: + - <func_name>: # name of the function to be applied + <func_arg1>: <func_arg1_value> # key value pairs that are passed as parameters + <func_arg2>: <func_arg2_value> + # ... + +An example that splits the variable ``a`` and puts the generated list in ``b`` is the following: + +.. code-block:: yaml + + Experiment: + type: Dict + match: ".*" + transform: + param_split: + in: $a + out: $b + functions: + - split: # split is a function that is defined by default + marker: "|" # its only parameter is the marker that is used to split the string + records: + Report: + tags: $b + +This splits the string in '$a' and stores the resulting list in '$b'. This is here used to add a +list valued property to the Report Record. + + +There are a number of transform functions that are defined by default (see +``src/caoscrawler/default_transformers.yml``). You can define custom transform functions by adding +them to the cfood definition (see :doc:`CFood Documentation<../cfood>`). 
diff --git a/src/doc/index.rst b/src/doc/index.rst index 8a02ec62e50308a28899e71b4664f626dfa0c27b..a72389b1f4b94430b2c5ff2bfee9757193327ed7 100644 --- a/src/doc/index.rst +++ b/src/doc/index.rst @@ -10,7 +10,7 @@ CaosDB-Crawler Documentation Getting started<getting_started/index> Tutorials<tutorials/index> Concepts<concepts> - Converters<converters> + Converters<converters/index> CFoods (Crawler Definitions)<cfood> Macros<macros> How to upgrade<how-to-upgrade> diff --git a/unittests/test_converters.py b/unittests/test_converters.py index e12302514d16f077882e41d6ff5995953f2228f8..3d4d8dd7a1faf02c49febc1a112fab7c3cef4830 100644 --- a/unittests/test_converters.py +++ b/unittests/test_converters.py @@ -643,11 +643,15 @@ def test_load_converters(): # converter classes can be loaded from their respective packages. # Please adapt, if defaults change! - assert len(converter_registry) == 25 + assert len(converter_registry) == 28 # All of them are contained in caoscrawler.converters + # except for the xml converters: for conv_key, conv in converter_registry.items(): - assert conv["package"] == "caoscrawler.converters" + if conv_key in ("XMLTag", "XMLFile", "XMLTextNode"): + assert conv["package"] == "caoscrawler.xml_converter" + else: + assert conv["package"] == "caoscrawler.converters" # ... and their names all end in "Converter" assert conv["converter"].endswith("Converter") diff --git a/unittests/test_xml_converter.py b/unittests/test_xml_converter.py new file mode 100644 index 0000000000000000000000000000000000000000..93e4a422d94a9315eadca24b8c799682d7d99964 --- /dev/null +++ b/unittests/test_xml_converter.py @@ -0,0 +1,320 @@ +#!/usr/bin/env python3 +# encoding: utf-8 +# +# This file is a part of the LinkAhead Project. 
+# +# Copyright (C) 2024 Indiscale GmbH <info@indiscale.com> +# Copyright (C) 2024 Alexander Schlemmer <a.schlemmer@indiscale.com> +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <https://www.gnu.org/licenses/>. +# + +""" +test the converters module +""" +import datetime +import importlib +import json +import logging +import os +import sys +from itertools import product +from pathlib import Path + +import pytest +import yaml +from caoscrawler.converters import (Converter, ConverterValidationError, + DateElementConverter, DictElementConverter, + DictIntegerElementConverter, + DirectoryConverter, FloatElementConverter, + IntegerElementConverter, JSONFileConverter, + ListElementConverter, + MarkdownFileConverter, YAMLFileConverter, + _AbstractScalarValueElementConverter, + handle_value, replace_variables) +from caoscrawler.crawl import Crawler +from caoscrawler.scanner import (_load_definition_from_yaml_dict, + create_converter_registry, + create_transformer_registry, load_definition) +from caoscrawler.stores import GeneralStore +from caoscrawler.structure_elements import (BooleanElement, DictElement, + Directory, File, FloatElement, + IntegerElement, ListElement, + TextElement, XMLTagElement) +from caoscrawler.xml_converter import XMLTagConverter + +from lxml.etree import fromstring + +UNITTESTDIR = Path(__file__).parent + + +@pytest.fixture +def converter_registry(): + converter_registry: 
dict[str, dict[str, str]] = { + "Directory": { + "converter": "DirectoryConverter", + "package": "caoscrawler.converters"}, + "TextElement": { + "converter": "TextElementConverter", + "package": "caoscrawler.converters"}, + "XMLTag": { + "converter": "XMLTagConverter", + "package": "caoscrawler.xml_converter"}, + + "XMLTextNode": { + "converter": "XMLTextNodeConverter", + "package": "caoscrawler.xml_converter"}, + } + + for key, value in converter_registry.items(): + module = importlib.import_module(value["package"]) + value["class"] = getattr(module, value["converter"]) + return converter_registry + + +@pytest.fixture +def basic_xmltag_converter(converter_registry): + return XMLTagConverter(yaml.safe_load(""" +type: XMLTag +match_tag: a +match_attrib: # default is the empty dictionary + "(?P<ref>(href|url))": "test(?P<number>[0-9])" # either the "href" or the "url" attribute must be set + alt: (.+) # this attribute must be present and contain at least one character +match_text: \\s*(?P<node_text>.+)\\s* + +subtree: + img: + type: XMLTag + match_name: img + match_attrib: + src: test2 +"""), "TestXMLTagConverter", converter_registry) + + +@pytest.fixture +def basic_xpath_xmltag_converter(converter_registry): + return XMLTagConverter(yaml.safe_load(""" +type: XMLTag +match_tag: a +match_attrib: # default is the empty dictionary + "(?P<ref>(href|url))": "test(?P<number>[0-9])" # either the "href" or the "url" attribute must be set + alt: (.+) # this attribute must be present and contain at least one character +match_text: \\s*(?P<node_text>.+)\\s* +xpath: child::*/* + +subtree: + img: + type: XMLTag + match_name: img + match_attrib: + src: test2 + testnode: + type: XMLTag + match_name: testnode +"""), "TestXMLTagConverter", converter_registry) + + +def test_simple_xml(basic_xmltag_converter): + """ + Test for basic xml conversion functionality. 
+ """ + xml_text = """ + <a href="test1" alt="no link"> + test <img src="test2"/> + </a> + """ + + xml = fromstring(xml_text) + tag = XMLTagElement(xml) + assert tag.name == "." + + m = basic_xmltag_converter.match(tag) + + assert m is not None + assert m["ref"] == "href" + assert m["number"] == "1" + assert m["node_text"] == "test " + + +def test_not_matching(basic_xmltag_converter): + m = basic_xmltag_converter.match(XMLTagElement(fromstring(""" + <a href="test1"> + test <img src="test2"/> + </a> + """))) + + assert m is None # alt-attribute was missing + + m = basic_xmltag_converter.match(XMLTagElement(fromstring(""" + <a href="test" alt="no link"> + test <img src="test2"/> + </a> + """))) + + assert m is None # href attribute did not match + + m = basic_xmltag_converter.match(XMLTagElement(fromstring(""" + <a href="test1" url="http" alt="no link"> + test <img src="test2"/> + </a> + """))) + + assert m is None # href and url must not be present simultaneously + + m = basic_xmltag_converter.match(XMLTagElement(fromstring(""" + <a href="test1" alt="no link"><img src="test2"/></a> + """))) + + assert m is None # text node is empty + + m = basic_xmltag_converter.match(XMLTagElement(fromstring(""" + <a href="test1" alt="no link"/> + """))) + + assert m is None # text node is empty + + # TODO: adapt converter -> empty (==None) text node is equivalent to empty string text node + # TODO: adapt tests + # TODO: how to match " ajskdlfjaldsf ajsdklfjadkl " without the whitespaces in regexp correctly? + + +def test_nested_simple_xml(basic_xmltag_converter, basic_xpath_xmltag_converter): + """ + Test for xml conversion including children. 
+ """ + xml_text = """ + <a href="test1" alt="no link"> + test <img src="test2"/> + </a> + """ + + tag = XMLTagElement(fromstring(xml_text)) + m = basic_xmltag_converter.match(tag) + assert m is not None + + general_store = GeneralStore() + children = basic_xmltag_converter.create_children(general_store, tag) + + assert len(children) == 1 + assert isinstance(children[0], XMLTagElement) + assert children[0].name == "img" + + xml_text = """ + <a href="test1" alt="no link"> + test <img src="test2"> + <testnode/> </img> + </a> + """ + + tag = XMLTagElement(fromstring(xml_text)) + m = basic_xpath_xmltag_converter.match(tag) + assert m is not None + + general_store = GeneralStore() + children = basic_xpath_xmltag_converter.create_children(general_store, tag) + + assert len(children) == 1 + assert isinstance(children[0], XMLTagElement) + assert children[0].name == "img/testnode" + + +def test_namespace_xml(converter_registry): + """ + Test for xml conversion including children. + Nodes have namespaces. 
+ """ + + xml_text = """ + <root xmlns="default-namespace" xmlns:test="alternative-namespace"> + <node1 active="true"> + Bla + </node1> + <node1 active="true" size="45"> + text + <node2 xmlns="sub-namespace"> + <node3> + ok + </node3> + </node2> + <test:node2> + sep + </test:node2> + </node1> + </root> +""" + + # Test unsupported xpath (containing text()): + converter = XMLTagConverter(yaml.safe_load(""" +type: XMLTag +match_tag: "{default-namespace}root" +xpath: "default:node1/text()" +default_namespace: default +"""), "TestXMLTagConverter", converter_registry) + + tag = XMLTagElement(fromstring(xml_text)) + m = converter.match(tag) + assert m is not None + + with pytest.raises(RuntimeError, match="Only standard xml nodes.*"): + converter.create_children(GeneralStore(), tag) + + # Test complex xml using namespaces and text nodes: + converter = XMLTagConverter(yaml.safe_load(""" +type: XMLTag +match_tag: "{default-namespace}root" +xpath: "default:node1" +default_namespace: default +attribs_as_children: false +text_as_children: true +tags_as_children: false +"""), "TestXMLTagConverter", converter_registry) + children = converter.create_children(GeneralStore(), tag) + assert len(children) == 2 + assert children[0].name == "{default-namespace}node1[1]/text()" + assert children[0].value.strip() == "Bla" + assert children[1].name == "{default-namespace}node1[2]/text()" + assert children[1].value.strip() == "text" + + # Check child generation of attributes: + converter = XMLTagConverter(yaml.safe_load(""" +type: XMLTag +match_tag: "{default-namespace}root" +xpath: "default:node1" +default_namespace: default +attribs_as_children: true +text_as_children: false +tags_as_children: false +"""), "TestXMLTagConverter", converter_registry) + children = converter.create_children(GeneralStore(), tag) + + assert len(children) == 3 + assert children[0].name == "{default-namespace}node1[1]@active" + assert children[0].value.strip() == "true" + assert children[1].name == 
"{default-namespace}node1[2]@active" + assert children[1].value.strip() == "true" + assert children[2].name == "{default-namespace}node1[2]@size" + assert children[2].value.strip() == "45" + + # Test setting nsmap entries: + converter = XMLTagConverter(yaml.safe_load(""" +type: XMLTag +match_tag: "{default-namespace}root" +xpath: "//s:node2" +default_namespace: default +nsmap: + s: sub-namespace +"""), "TestXMLTagConverter", converter_registry) + children = converter.create_children(GeneralStore(), tag) + assert len(children) == 1 + assert children[0].name == "{default-namespace}node1[2]/{sub-namespace}node2"