diff --git a/src/caoscrawler/converters.py b/src/caoscrawler/converters.py index 7150db751f429e2397df878487d165cc9e451866..eb41325bd798126c4d06d92f6ea398eaa1ebbb72 100644 --- a/src/caoscrawler/converters.py +++ b/src/caoscrawler/converters.py @@ -23,13 +23,6 @@ # ** end header # -""" -TODO place -- The key is the name of the function to be looked up in the dictionary -of registered transformer functions. -- The value is a dictionary with key, value-assignments which will be -passed to the transformer function. -""" from __future__ import annotations @@ -325,6 +318,15 @@ class Converter(object, metaclass=ABCMeta): """Converters treat StructureElements contained in the hierarchical sturcture.""" def __init__(self, definition: dict, name: str, converter_registry: dict): + """ + + Parameters + ---------- + definition: dict, Please refer to XX to learn about the structure that the definition dict must have + converter_registry: dict, A dictionary that contains converter names as keys and dicts as + values. Those value dicts have the keys 'converter' and 'package'. 
+ """ + self.definition = definition self.name = name @@ -334,6 +336,23 @@ class Converter(object, metaclass=ABCMeta): } self.converters = [] + if "transform" in self.definition: + if not isinstance(self.definition["transform"], dict): + raise RuntimeError("The value corresponding to the 'transform' key in the " + "converter definition must be a dict") + for transformer_key, transformer in self.definition["transform"].items(): + if "in" not in transformer: + raise RuntimeError("In-variable not defined!") + if "out" not in transformer: + raise RuntimeError("Out-variable not defined!") + if "functions" not in transformer: + raise RuntimeError("No functions given for transformer!") + if not isinstance(transformer["functions"], list): + raise RuntimeError("The value corresponding to the 'functions' key in the " + "transform section must be a list") + + if not isinstance(transformer["in"], str): + raise RuntimeError("You should provide the variable name as string") if "subtree" in definition: for converter_name in definition['subtree']: @@ -404,51 +423,36 @@ class Converter(object, metaclass=ABCMeta): pass """ - if "transform" in self.definition: - if not isinstance(self.definition["transform"], dict): - raise RuntimeError("The value corresponding to the 'transform' key in the " - "converter definition must be a dict") - for transformer_key, transformer in self.definition["transform"].items(): - if "in" not in transformer: - raise RuntimeError("In-variable not defined!") - if "out" not in transformer: - raise RuntimeError("Out-variable not defined!") - if "functions" not in transformer: - raise RuntimeError("No functions given for transformer!") - if not isinstance(transformer["functions"], list): - raise RuntimeError("The value corresponding to the 'functions' key in the " - "transform section must be a list") - - if not isinstance(transformer["in"], str): - raise RuntimeError("You should provide the variable name as string") - in_value = 
replace_variables(transformer["in"], values) - - for tr_func_el in transformer["functions"]: - if not isinstance(tr_func_el, dict): - raise RuntimeError("Elements of the list of the functions key " - "must be dictonaries!") - if len(tr_func_el) != 1: - raise RuntimeError("List element dictionaries must have exactly" - " one element with they key being the name" - " of the function!") - tr_func_key = list(tr_func_el.keys())[0] - tr_func_params = tr_func_el[tr_func_key] - if tr_func_key not in transformer_functions: - raise RuntimeError("Unknown transformer function: {}".format(tr_func_key)) - - # Retrieve the function from the dictionary: - tr_func = transformer_functions[tr_func_key] - # Call the function: - out_value = tr_func(in_value, tr_func_params) - # The next in_value is the current out_value: - in_value = out_value - # If everything succeeded, store the final value in the general store: - match = SINGLE_VAR_RE.match(transformer["out"]) - if match is None: - raise RuntimeError("'out' of the transformer definition must specify a single" - f" variable name. 
It was {transformer['out']}") - print(f"set {match.group('varname')} to {out_value}") - values[match.group('varname')] = out_value + if not "transform" in self.definition: + return + for transformer_key, transformer in self.definition["transform"].items(): + in_value = replace_variables(transformer["in"], values) + + for tr_func_el in transformer["functions"]: + if not isinstance(tr_func_el, dict): + raise RuntimeError("Elements of the list of the functions key " + "must be dictonaries!") + if len(tr_func_el) != 1: + raise RuntimeError("List element dictionaries must have exactly" + " one element with they key being the name" + " of the function!") + tr_func_key = list(tr_func_el.keys())[0] + tr_func_params = tr_func_el[tr_func_key] + if tr_func_key not in transformer_functions: + raise RuntimeError("Unknown transformer function: {}".format(tr_func_key)) + + # Retrieve the function from the dictionary: + tr_func = transformer_functions[tr_func_key] + # Call the function: + out_value = tr_func(in_value, tr_func_params) + # The next in_value is the current out_value: + in_value = out_value + # If everything succeeded, store the final value in the general store: + match = SINGLE_VAR_RE.match(transformer["out"]) + if match is None: + raise RuntimeError("'out' of the transformer definition must specify a single" + f" variable name. It was {transformer['out']}") + values[match.group('varname')] = out_value @abstractmethod def create_children(self, values: GeneralStore, diff --git a/src/doc/cfood.rst b/src/doc/cfood.rst index 6564ee677f0b363a52c44dd5ceabe5378c255105..c12e251d49e164a737b20e92e56e7b3e10149d4f 100644 --- a/src/doc/cfood.rst +++ b/src/doc/cfood.rst @@ -4,6 +4,7 @@ CFood-Definition The crawler specification is called CFood-definition. It is stored inside a yaml file, or - more precisely - inside of one single or two yaml documents inside a yaml file. The specification consists of three separate parts: + #. Metadata and macro definitions #. 
Custom converter registrations #. The converter tree specification @@ -178,6 +179,23 @@ in a vairable with the same name (as it is the case for other Records). SomeRecord: ParameterFile: $fileEntity # creates a reference to the file + +Transform Functions +------------------- +You can use transform functions to alter variable values that the crawler consumes (e.g. a string +that was matched with a reg exp). See :doc:`Converter Documentation<converters>`. + +You can define your own transform functions by adding them the same way you add custom converters: + +.. code-block:: yaml + + Transformers: + transform_foo: + package: some.package + function: some_foo + + + Automatically generated keys ++++++++++++++++++++++++++++ diff --git a/src/doc/converters.rst b/src/doc/converters.rst index 98849609f0cab2afba037a82fe4ae6802caa5956..be991dfe1ba10034670693a0efd192fc8bf9ecee 100644 --- a/src/doc/converters.rst +++ b/src/doc/converters.rst @@ -56,6 +56,48 @@ to generate records (see :py:meth:`~caoscrawler.converters.Converter.create_reco Subtree contains a list of Converter defnitions that look like the one described here. +Transform Functions ++++++++++++++++++++ +Often the situation arises that you cannot use a value as it is found. Maybe a value should be +increased by an offset or a string should be split into a list of pieces. In order to allow such +simple conversions, transform functions can be named in the converter definition that are then +applied to the respective variables when the converter is executed. + +.. code-block:: yaml + + <NodeName>: + type: <ConverterName> + match: ".*" + transform: + <TransformNodeName>: + in: $<in_var_name> + out: $<out_var_name> + functions: + - <func_name>: # name of the function to be applied + <func_arg1>: <func_arg1_value> # key value pairs that are passed as parameters + <func_arg2>: <func_arg2_value> + # ... + +An example that splits the variable ``a`` and puts the generated list in ``b`` is the following: + +.. 
code-block:: yaml + + Experiment: + type: Dict + match: ".*" + transform: + param_split: + in: $a + out: $b + functions: + - split: # split is a function that is defined by default + marker: "|" # its only parameter is the marker that is used to split the string + + +There are a number of transform functions that are defined by default (see +``src/caoscrawler/default_transformers.yml``). You can define custom transform functions by adding +them to the cfood definition (see :doc:`CFood Documentation<cfood>`). + Standard Converters +++++++++++++++++++