diff --git a/CHANGELOG.md b/CHANGELOG.md index 995ad6eedf391f2219cbd25fbd7fa12e1f32126a..81e32c0560a49c624d88931c57b9726409a3c90a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -21,6 +21,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 * New transformer functions: * `date_parse` * `datetime_parse` +* New ``PropertiesFromDictConverter`` which allows to automatically + create property values from dictionary keys. ### Changed ### diff --git a/src/caoscrawler/cfood-schema.yml b/src/caoscrawler/cfood-schema.yml index 340e5b9dec0e8f05b1c39ec2511196249ec87d31..6609d8eb05135b17f5f6d9526df255b810de112a 100644 --- a/src/caoscrawler/cfood-schema.yml +++ b/src/caoscrawler/cfood-schema.yml @@ -1,9 +1,44 @@ cfood: type: object + properties: + Converters: + description: Definition of custom converters + type: object + additionalProperties: + type: object + properties: + converter: + type: string + package: + type: string + required: + - converter + - package + macros: + description: Macro definitions + type: array + Transformers: + description: Variable transformer definition + type: object + additionalProperties: + type: object + properties: + function: + type: string + package: + type: string + required: + - package + - function additionalProperties: $ref: "#/$defs/converter" $defs: + parents: + description: Parents for this record are given here as a list of names. + type: array + items: + type: string converter: properties: type: @@ -38,6 +73,7 @@ cfood: - H5Dataset - H5Group - H5Ndarray + - PropertiesFromDictElement description: Type of this converter node. match: description: typically a regexp which is matched to a structure element name @@ -48,15 +84,46 @@ cfood: match_value: description: a regexp that is matched to the value of a key-value pair type: string - records: - description: This field is used to define new records or to modify records which have been defined on a higher level. 
+ record_from_dict: + description: Only relevant for PropertiesFromDictElement. Specify the root record which is generated from the contained dictionary. type: object + required: + - variable_name properties: - parents: - description: Parents for this record are given here as a list of names. + variable_name: + description: | + Name of the record by which it can be accessed in the + cfood definition. Can also be the name of an existing + record in which case that record will be updated by + the PropertiesFromDictConverter. + type: string + properties_blacklist: + description: List of keys to be ignored in the automatic treatment. They will be ignored on all levels of the dictionary. type: array items: type: string + references: + description: List of keys that will be transformed into named reference properties. + type: object + additionalProperties: + type: object + properties: + parents: + $ref: + "#/$defs/parents" + name: + description: Name of this record. If none is given, variable_name is used. + type: string + parents: + $ref: + "#/$defs/parents" + records: + description: This field is used to define new records or to modify records which have been defined on a higher level. 
+ type: object + properties: + parents: + $ref: + "#/$defs/parents" additionalProperties: oneOf: - type: object @@ -78,3 +145,15 @@ cfood: additionalProperties: $ref: "#/$defs/converter" + if: + properties: + type: + const: + "PropertiesFromDictElement" + then: + required: + - type + - record_from_dict + else: + required: + - type diff --git a/src/caoscrawler/converters.py b/src/caoscrawler/converters.py index 40d3b72bfe7564cfb815e11a69a952f9142c3e55..0eee8965512ed39add9a9688c531c540f80d7df2 100644 --- a/src/caoscrawler/converters.py +++ b/src/caoscrawler/converters.py @@ -818,6 +818,180 @@ class DictElementConverter(Converter): return match_name_and_value(self.definition, element.name, element.value) +class PropertiesFromDictConverter(DictElementConverter): + """Extend the :py:class:`DictElementConverter` by a heuristic to set + property values from the dictionary keys. + + """ + + def __init__(self, definition: dict, name: str, converter_registry: dict, + referenced_record_callback: Optional[callable] = None): + + super().__init__(definition, name, converter_registry) + self.referenced_record_callback = referenced_record_callback + + def _recursively_create_records(self, subdict: dict, root_record: db.Record, + root_rec_name: str, + values: GeneralStore, records: RecordStore, + referenced_record_callback: callable, + keys_modified: list = [] + ): + """Create a record from the given `subdict` and recursively create referenced records.""" + + blacklisted_keys = self.definition["record_from_dict"][ + "properties_blacklist"] if "properties_blacklist" in self.definition["record_from_dict"] else [] + special_references = self.definition["record_from_dict"]["references"] if "references" in self.definition["record_from_dict"] else [ + ] + + for key, value in subdict.items(): + + if key in blacklisted_keys: + # We ignore this in the automated property generation + continue + if isinstance(value, list): + if not any([isinstance(val, dict) for val in value]): + # no dict in
list, i.e., no references, so this is simple + root_record.add_property(name=key, value=value) + else: + if not all([isinstance(val, dict) for val in value]): + # if this is not an error (most probably it is), this + # needs to be handled manually for now. + raise ValueError( + f"{key} in {subdict} contains a mixed list of references and scalars.") + ref_recs = [] + for ii, ref_dict in enumerate(value): + ref_var_name = f"{root_rec_name}.{key}.{ii+1}" + ref_rec, keys_modified = self._create_ref_rec( + ref_var_name, + key, + ref_dict, + special_references, + records, + values, + keys_modified, + referenced_record_callback + ) + ref_recs.append(ref_rec) + root_record.add_property(name=key, value=ref_recs) + + elif isinstance(value, dict): + # Treat scalar reference + ref_var_name = f"{root_rec_name}.{key}" + ref_rec, keys_modified = self._create_ref_rec( + ref_var_name, + key, + value, + special_references, + records, + values, + keys_modified, + referenced_record_callback + ) + root_record.add_property(key, ref_rec) + else: + # All that remains are scalar properties which may or + # may not be special attributes like name. + if key.lower() in SPECIAL_PROPERTIES: + setattr(root_record, key.lower(), value) + else: + root_record.add_property(name=key, value=value) + keys_modified.append((root_rec_name, key)) + + if referenced_record_callback: + root_record = referenced_record_callback(root_record, records, values) + + return keys_modified + + def _create_ref_rec( + self, + name: str, + key: str, + subdict: dict, + special_references: dict, + records: RecordStore, + values: GeneralStore, + keys_modified: list, + referenced_record_callback: callable + ): + """Create the referenced Record and forward the stores etc. to + ``_recursively_create_records``. + + Parameters: + ----------- + name : str + name of the referenced record to be created in RecordStore and Value Store. + key : str + name of the key this record's definition had in the original dict. 
+ subdict : dict + subdict containing this record's definition from the original dict. + special_references : dict + special treatment of referenced records from the converter definition. + records : RecordStore + RecordStore for entering new Records + values : GeneralStore + ValueStore for entering new Records + keys_modified : list + List for keeping track of changes + referenced_record_callback : callable + Advanced treatment of referenced records as given in the + converter initialization. + """ + ref_rec = db.Record() + if key in special_references: + for par in special_references[key]["parents"]: + ref_rec.add_parent(par) + else: + ref_rec.add_parent(key) + records[name] = ref_rec + values[name] = ref_rec + keys_modified = self._recursively_create_records( + subdict=subdict, + root_record=ref_rec, + root_rec_name=name, + values=values, + records=records, + referenced_record_callback=referenced_record_callback, + keys_modified=keys_modified + ) + return ref_rec, keys_modified + + def create_records(self, values: GeneralStore, records: RecordStore, + element: StructureElement): + + keys_modified = [] + + rfd = self.definition["record_from_dict"] + if rfd["variable_name"] not in records: + rec = db.Record() + if "name" in rfd: + rec.name = rfd["name"] + if "parents" in rfd: + for par in rfd["parents"]: + rec.add_parent(par) + else: + rec.add_parent(rfd["variable_name"]) + records[rfd["variable_name"]] = rec + values[rfd["variable_name"]] = rec + + else: + rec = records[rfd["variable_name"]] + + keys_modified = self._recursively_create_records( + subdict=element.value, + root_record=rec, + root_rec_name=rfd["variable_name"], + values=values, + records=records, + referenced_record_callback=self.referenced_record_callback, + keys_modified=keys_modified, + ) + + keys_modified.extend(super().create_records( + values=values, records=records, element=element)) + + return keys_modified + + class DictConverter(DictElementConverter): def __init__(self, *args, **kwargs): 
warnings.warn(DeprecationWarning( diff --git a/src/caoscrawler/default_converters.yml b/src/caoscrawler/default_converters.yml index 9a5fc248c45a77b848611c322ed7d2a5fdbd3721..82e2f635f621b2e21e43b728fd9ed6865454f828 100644 --- a/src/caoscrawler/default_converters.yml +++ b/src/caoscrawler/default_converters.yml @@ -14,6 +14,9 @@ Datetime: Dict: converter: DictElementConverter package: caoscrawler.converters +PropertiesFromDictElement: + converter: PropertiesFromDictConverter + package: caoscrawler.converters FloatElement: converter: FloatElementConverter package: caoscrawler.converters diff --git a/src/doc/README_SETUP.md b/src/doc/README_SETUP.md index a75193783d861707adf3b3d45311c392e22626f4..32f0bb89a6051bc2ec4be0bae6cf06cd1a540f8b 100644 --- a/src/doc/README_SETUP.md +++ b/src/doc/README_SETUP.md @@ -13,7 +13,10 @@ see INSTALL.md We use sphinx to create the documentation. Docstrings in the code should comply with the Googly style (see link below). -Build documentation in `src/doc` with `make html`. +Build documentation in `src/doc` with `make doc`. Note that for the +automatic generation of the complete API documentation, it is +necessary to first install this library with all its optional +dependencies, i.e., `pip install .[h5-crawler,spss]`. ### Requirements ### diff --git a/src/doc/converters.rst b/src/doc/converters.rst index d7e11c235fafa1e42f53342a24255ceb0d275ed4..f59e6d3dff0a1f75dc4e0e5bcbbee0b4ceb7e81d 100644 --- a/src/doc/converters.rst +++ b/src/doc/converters.rst @@ -31,20 +31,20 @@ The yaml definition may look like this: .. code-block:: yaml <NodeName>: - type: <ConverterName> - match: ".*" - records: - Experiment1: - parents: - - Experiment - - Blablabla - date: $DATUM - (...) - Experiment2: - parents: - - Experiment - subtree: - (...) + type: <ConverterName> + match: ".*" + records: + Experiment1: + parents: + - Experiment + - Blablabla + date: $DATUM + (...) + Experiment2: + parents: + - Experiment + subtree: + (...) 
The **<NodeName>** is a description of what the current block represents (e.g. ``experiment-folder``) and is used as an identifier. @@ -76,35 +76,35 @@ applied to the respective variables when the converter is executed. .. code-block:: yaml <NodeName>: - type: <ConverterName> - match: ".*" - transform: - <TransformNodeName>: - in: $<in_var_name> - out: $<out_var_name> - functions: - - <func_name>: # name of the function to be applied - <func_arg1>: <func_arg1_value> # key value pairs that are passed as parameters - <func_arg2>: <func_arg2_value> - # ... + type: <ConverterName> + match: ".*" + transform: + <TransformNodeName>: + in: $<in_var_name> + out: $<out_var_name> + functions: + - <func_name>: # name of the function to be applied + <func_arg1>: <func_arg1_value> # key value pairs that are passed as parameters + <func_arg2>: <func_arg2_value> + # ... An example that splits the variable ``a`` and puts the generated list in ``b`` is the following: .. code-block:: yaml Experiment: - type: Dict - match: ".*" - transform: - param_split: - in: $a - out: $b - functions: - - split: # split is a function that is defined by default - marker: "|" # its only parameter is the marker that is used to split the string - records: - Report: - tags: $b + type: Dict + match: ".*" + transform: + param_split: + in: $a + out: $b + functions: + - split: # split is a function that is defined by default + marker: "|" # its only parameter is the marker that is used to split the string + records: + Report: + tags: $b This splits the string in '$a' and stores the resulting list in '$b'. This is here used to add a list valued property to the Report Record. @@ -218,21 +218,21 @@ Example: type: CSVTableConverter match: ^test_table.csv$ records: - (...) # Records edited for the whole table file + (...) # Records edited for the whole table file subtree: - ROW: # Any name for a data row in the table - type: DictElement - match_name: .* - match_value: .* - records: - (...) 
# Records edited for each row - subtree: - COLUMN: # Any name for a specific type of column in the table - type: FloatElement - match_name: measurement # Name of the column in the table file - match_value: (?P<column_value).*) - records: - (...) # Records edited for each cell + ROW: # Any name for a data row in the table + type: DictElement + match_name: .* + match_value: .* + records: + (...) # Records edited for each row + subtree: + COLUMN: # Any name for a specific type of column in the table + type: FloatElement + match_name: measurement # Name of the column in the table file + match_value: (?P<column_value).*) + records: + (...) # Records edited for each cell XLSXTableConverter @@ -245,6 +245,140 @@ CSVTableConverter CSV File → DictElement +PropertiesFromDictConverter +=========================== + +The :py:class:`~caoscrawler.converters.PropertiesFromDictConverter` is +a specialization of the +:py:class:`~caoscrawler.converters.DictElementConverter` and offers +all its functionality. It is meant to operate on dictionaries (e.g., +from reading in a json or a table file), the keys of which correspond +closely to properties in a LinkAhead datamodel. This is especially +handy in cases where properties may be added to the data model and +data sources that are not yet known when writing the cfood definition. + +The converter definition of the +:py:class:`~caoscrawler.converters.PropertiesFromDictConverter` has an +additional required entry ``record_from_dict`` which specifies the +Record to which the properties extracted from the dict are attached +to. This Record is identified by its ``variable_name`` by which it can +be referred to further down the subtree. You can also use the name of +a Record that was specified earlier in the CFood definition in order +to extend it by the properties extracted from a dict. Let's have a +look at a simple example. A CFood definition + +.. 
code-block:: yaml + + PropertiesFromDictElement: + type: PropertiesFromDictElement + match: ".*" + record_from_dict: + variable_name: MyRec + parents: + - MyType1 + - MyType2 + +applied to a dictionary + +.. code-block:: json + + { + "name": "New name", + "a": 5, + "b": ["a", "b", "c"], + "author": { + "full_name": "Silvia Scientist" + } + } + +will create a Record ``New name`` with parents ``MyType1`` and +``MyType2``. It has a scalar property ``a`` with value 5, a list +property ``b`` with values "a", "b" and "c", and an ``author`` +property which references an ``author`` with a ``full_name`` property +with value "Silvia Scientist": + +.. image:: img/properties-from-dict-records-author.png + :height: 210 + +Note how the different dictionary keys are handled differently +depending on their types: scalar and list values are understood +automatically, and a dictionary-valued entry like ``author`` is +translated into a reference to an ``author`` Record automatically. + +You can further specify how references are treated with an optional +``references`` key in ``record_from_dict``. Let's assume that in the +above example, we have an ``author`` **Property** with datatype +``Person`` in our data model. We could add this information by +extending the above example definition by + + +.. code-block:: yaml + + PropertiesFromDictElement: + type: PropertiesFromDictElement + match: ".*" + record_from_dict: + variable_name: MyRec + parents: + - MyType1 + - MyType2 + references: + author: + parents: + - Person + +so that now, a ``Person`` record with a ``full_name`` property with +value "Silvia Scientist" is created as the value of the ``author`` +property: + +.. image:: img/properties-from-dict-records-person.png + :height: 200 + +For the time being, only the parents of the referenced record can be +set via this option. More complicated treatments can be implemented +via the ``referenced_record_callback`` (see below).
+ +Properties can be blacklisted with the ``properties_blacklist`` +keyword, i.e., all keys listed under ``properties_blacklist`` will be +excluded from automated treatment. Since the +:py:class:`~caoscrawler.converters.PropertiesFromDictConverter` has +all the functionality of the +:py:class:`~caoscrawler.converters.DictElementConverter`, individual +properties can still be used in a subtree. Together with +``properties_blacklist`` this can be used to add custom treatment to +specific properties by blacklisting them in ``record_from_dict`` and +then treating them in the subtree the same as you would do it in the +standard +:py:class:`~caoscrawler.converters.DictElementConverter`. Note that +the blacklisted keys are excluded on **all** levels of the dictionary, +i.e., also when they occur in a referenced entity. + +For further customization, the +:py:class:`~caoscrawler.converters.PropertiesFromDictConverter` can be +used as a basis for :ref:`custom converters<Custom Converters>` which +can make use of its ``referenced_record_callback`` argument. The +``referenced_record_callback`` can be a callable object which takes +exactly a Record as an argument and needs to return that Record after +doing whatever custom treatment is needed. Additionally, it is given +the ``RecordStore`` and the ``ValueStore`` in order to be able to +access the records and values that have already been defined from +within ``referenced_record_callback``. Such a function might look the +following: + +.. code-block:: python + + def my_callback(rec: db.Record, records: RecordStore, values: GeneralStore): + # do something with rec, possibly using other records or values from the stores... + rec.description = "This was updated in a callback" + return rec + +It is applied to all Records that are created from the dictionary and +it can be used to, e.g., transform values of some properties, or add +special treatment to all Records of a specific +type. 
``referenced_record_callback`` is applied **after** the +properties from the dictionary have been applied as explained above. + + Further converters ++++++++++++++++++ @@ -293,7 +427,7 @@ datamodel like H5Ndarray: obligatory_properties: internal_hdf5-path: - datatype: TEXT + datatype: TEXT although the names of both property and record type can be configured within the cfood definition. @@ -407,11 +541,11 @@ First we will create our package and module structure, which might be: tox.ini src/ scifolder/ - __init__.py - converters/ - __init__.py - sources.py # <- the actual file containing - # the converter class + __init__.py + converters/ + __init__.py + sources.py # <- the actual file containing + # the converter class doc/ unittests/ @@ -436,74 +570,74 @@ that would be given using a yaml definition (see next section below). """ def __init__(self, definition: dict, name: str, - converter_registry: dict): - """ - Initialize a new directory converter. - """ - super().__init__(definition, name, converter_registry) + converter_registry: dict): + """ + Initialize a new directory converter. 
+ """ + super().__init__(definition, name, converter_registry) def create_children(self, generalStore: GeneralStore, - element: StructureElement): + element: StructureElement): - # The source resolver does not create children: + # The source resolver does not create children: - return [] + return [] def create_records(self, values: GeneralStore, - records: RecordStore, - element: StructureElement, - file_path_prefix): - if not isinstance(element, TextElement): - raise RuntimeError() - - # This function must return a list containing tuples, each one for a modified - # property: (name_of_entity, name_of_property) - keys_modified = [] - - # This is the name of the entity where the source is going to be attached: - attach_to_scientific_activity = self.definition["scientific_activity"] - rec = records[attach_to_scientific_activity] - - # The "source" is a path to a source project, so it should have the form: - # /<Category>/<project>/<scientific_activity>/ - # obtain these information from the structure element: - val = element.value - regexp = (r'/(?P<category>(SimulationData)|(ExperimentalData)|(DataAnalysis))' - '/(?P<project_date>.*?)_(?P<project_identifier>.*)' - '/(?P<date>[0-9]{4,4}-[0-9]{2,2}-[0-9]{2,2})(_(?P<identifier>.*))?/') - - res = re.match(regexp, val) - if res is None: - raise RuntimeError("Source cannot be parsed correctly.") - - # Mapping of categories on the file system to corresponding record types in CaosDB: - cat_map = { - "SimulationData": "Simulation", - "ExperimentalData": "Experiment", - "DataAnalysis": "DataAnalysis"} - linkrt = cat_map[res.group("category")] - - keys_modified.extend(create_records(values, records, { - "Project": { - "date": res.group("project_date"), - "identifier": res.group("project_identifier"), - }, - linkrt: { - "date": res.group("date"), - "identifier": res.group("identifier"), - "project": "$Project" - }, - attach_to_scientific_activity: { - "sources": "+$" + linkrt - }}, file_path_prefix)) - - # Process the records 
section of the yaml definition: - keys_modified.extend( - super().create_records(values, records, element, file_path_prefix)) - - # The create_records function must return the modified keys to make it compatible - # to the crawler functions: - return keys_modified + records: RecordStore, + element: StructureElement, + file_path_prefix): + if not isinstance(element, TextElement): + raise RuntimeError() + + # This function must return a list containing tuples, each one for a modified + # property: (name_of_entity, name_of_property) + keys_modified = [] + + # This is the name of the entity where the source is going to be attached: + attach_to_scientific_activity = self.definition["scientific_activity"] + rec = records[attach_to_scientific_activity] + + # The "source" is a path to a source project, so it should have the form: + # /<Category>/<project>/<scientific_activity>/ + # obtain these information from the structure element: + val = element.value + regexp = (r'/(?P<category>(SimulationData)|(ExperimentalData)|(DataAnalysis))' + '/(?P<project_date>.*?)_(?P<project_identifier>.*)' + '/(?P<date>[0-9]{4,4}-[0-9]{2,2}-[0-9]{2,2})(_(?P<identifier>.*))?/') + + res = re.match(regexp, val) + if res is None: + raise RuntimeError("Source cannot be parsed correctly.") + + # Mapping of categories on the file system to corresponding record types in CaosDB: + cat_map = { + "SimulationData": "Simulation", + "ExperimentalData": "Experiment", + "DataAnalysis": "DataAnalysis"} + linkrt = cat_map[res.group("category")] + + keys_modified.extend(create_records(values, records, { + "Project": { + "date": res.group("project_date"), + "identifier": res.group("project_identifier"), + }, + linkrt: { + "date": res.group("date"), + "identifier": res.group("identifier"), + "project": "$Project" + }, + attach_to_scientific_activity: { + "sources": "+$" + linkrt + }}, file_path_prefix)) + + # Process the records section of the yaml definition: + keys_modified.extend( + 
super().create_records(values, records, element, file_path_prefix)) + + # The create_records function must return the modified keys to make it compatible + # to the crawler functions: + return keys_modified If the recommended (python) package structure is used, the package containing the converter @@ -530,8 +664,8 @@ function signature: .. code-block:: python def create_records(values: GeneralStore, # <- pass the current variables store here - records: RecordStore, # <- pass the current store of CaosDB records here - def_records: dict): # <- This is the actual definition of new records! + records: RecordStore, # <- pass the current store of CaosDB records here + def_records: dict): # <- This is the actual definition of new records! `def_records` is the actual definition of new records according to the yaml cfood specification @@ -547,7 +681,7 @@ Let's have a look at a few examples: match: (?P<dir_name>.*) records: Experiment: - identifier: $dir_name + identifier: $dir_name This block will just create a new record with parent `Experiment` and one property `identifier` with a value derived from the matching regular expression. @@ -565,7 +699,7 @@ Let's formulate that using `create_records`: } keys_modified = create_records(values, records, - record_def) + record_def) The `dir_name` is set explicitely here, everything else is identical to the yaml statements. @@ -588,9 +722,9 @@ So, a sketch of a typical implementation within a custom converter could look li .. 
code-block:: python def create_records(self, values: GeneralStore, - records: RecordStore, - element: StructureElement, - file_path_prefix: str): + records: RecordStore, + element: StructureElement, + file_path_prefix: str): # Modify some records: record_def = { @@ -598,15 +732,15 @@ So, a sketch of a typical implementation within a custom converter could look li } keys_modified = create_records(values, records, - record_def) + record_def) # You can of course do it multiple times: keys_modified.extend(create_records(values, records, - record_def)) + record_def)) # You can also process the records section of the yaml definition: keys_modified.extend( - super().create_records(values, records, element, file_path_prefix)) + super().create_records(values, records, element, file_path_prefix)) # This essentially allows users of your converter to customize the creation of records # by providing a custom "records" section additionally to the modifications provided # in this implementation of the Converter. @@ -627,12 +761,12 @@ Let's have a look at a more complex examples, defining multiple records: match: (?P<dir_name>.*) records: Project: - identifier: project_name + identifier: project_name Experiment: - identifier: $dir_name - Project: $Project + identifier: $dir_name + Project: $Project ProjectGroup: - projects: +$Project + projects: +$Project This block will create two new Records: @@ -665,7 +799,7 @@ Let's formulate that using `create_records` (again, `dir_name` is constant here) } keys_modified = create_records(values, records, - record_def) + record_def) Debugging ========= @@ -681,7 +815,7 @@ output for the match step. 
The following snippet illustrates this: debug_match: True records: Project: - identifier: project_name + identifier: project_name Whenever this Converter tries to match a StructureElement, it logs what was tried to macht against diff --git a/src/doc/img/properties-from-dict-records-author.png b/src/doc/img/properties-from-dict-records-author.png new file mode 100644 index 0000000000000000000000000000000000000000..20ee9497ab5ae577c3d515f11da6294c88601fed Binary files /dev/null and b/src/doc/img/properties-from-dict-records-author.png differ diff --git a/src/doc/img/properties-from-dict-records-person.png b/src/doc/img/properties-from-dict-records-person.png new file mode 100644 index 0000000000000000000000000000000000000000..8b026056a42ff3ba203c6077a426640c864b24c1 Binary files /dev/null and b/src/doc/img/properties-from-dict-records-person.png differ diff --git a/unittests/broken_cfoods/broken_record_from_dict.yml b/unittests/broken_cfoods/broken_record_from_dict.yml new file mode 100644 index 0000000000000000000000000000000000000000..fd8ffdbd29f6ad7b8b38fc17eb43686f4170dbcb --- /dev/null +++ b/unittests/broken_cfoods/broken_record_from_dict.yml @@ -0,0 +1,7 @@ +RecordFromDictElement: + type: PropertiesFromDictElement + match: "(.*)" + subtree: + AnotherElement: + type: Text + match_name: "(.*)" diff --git a/unittests/broken_cfoods/broken_record_from_dict_2.yml b/unittests/broken_cfoods/broken_record_from_dict_2.yml new file mode 100644 index 0000000000000000000000000000000000000000..ca321373c6c4d6bcc8c104c8c4b3c7147bf71375 --- /dev/null +++ b/unittests/broken_cfoods/broken_record_from_dict_2.yml @@ -0,0 +1,11 @@ +RecordFromDictElement: + type: PropertiesFromDictElement + record_from_dict: + parents: + - MyType1 + - MyType2 + match: "(.*)" + subtree: + AnotherElement: + type: Text + match_name: "(.*)" diff --git a/unittests/record_from_dict_cfood.yml b/unittests/record_from_dict_cfood.yml new file mode 100644 index 
0000000000000000000000000000000000000000..1ea2159df9d63256d9a0b2e293d82a9ad694608f --- /dev/null +++ b/unittests/record_from_dict_cfood.yml @@ -0,0 +1,12 @@ +PropertiesFromDictElement: + type: PropertiesFromDictElement + match: ".*" + record_from_dict: + variable_name: MyRec + parents: + - MyType1 + - MyType2 + references: + author: + parents: + - Person diff --git a/unittests/test_converters.py b/unittests/test_converters.py index f5125e61efa49fe627480696703e570ef9b70e6f..e12302514d16f077882e41d6ff5995953f2228f8 100644 --- a/unittests/test_converters.py +++ b/unittests/test_converters.py @@ -29,26 +29,32 @@ import importlib import json import logging import os +import pytest import sys +import yaml + from itertools import product from pathlib import Path -import pytest -import yaml +import linkahead as db + from caoscrawler.converters import (Converter, ConverterValidationError, DateElementConverter, DictElementConverter, DictIntegerElementConverter, DirectoryConverter, FloatElementConverter, IntegerElementConverter, JSONFileConverter, - ListElementConverter, - MarkdownFileConverter, YAMLFileConverter, + ListElementConverter, MarkdownFileConverter, + PropertiesFromDictConverter, + YAMLFileConverter, _AbstractScalarValueElementConverter, handle_value, replace_variables) from caoscrawler.crawl import Crawler from caoscrawler.scanner import (_load_definition_from_yaml_dict, create_converter_registry, - create_transformer_registry, load_definition) -from caoscrawler.stores import GeneralStore + create_transformer_registry, + load_definition, + scan_structure_elements) +from caoscrawler.stores import GeneralStore, RecordStore from caoscrawler.structure_elements import (BooleanElement, DictElement, Directory, File, FloatElement, IntegerElement, ListElement, @@ -73,6 +79,10 @@ def converter_registry(): "DictElement": { "converter": "DictElementConverter", "package": "caoscrawler.converters"}, + "PropertiesFromDictElement": { + "converter": "PropertiesFromDictConverter", 
+ "package": "caoscrawler.converters" + }, "TextElement": { "converter": "TextElementConverter", "package": "caoscrawler.converters"}, @@ -633,7 +643,7 @@ def test_load_converters(): # converter classes can be loaded from their respective packages. # Please adapt, if defaults change! - assert len(converter_registry) == 24 + assert len(converter_registry) == 25 # All of them are contained in caoscrawler.converters for conv_key, conv in converter_registry.items(): @@ -660,3 +670,342 @@ def test_create_path_value(converter_registry): dc.create_values(values, Directory("a", "/a")) assert "Test.path" in values assert values["Test.path"] == "/a" + + +def test_properties_from_dict_basic(converter_registry): + """Test that a record with the correct name and properties is created, and + that the children are still created correctly. + + """ + # definitions with blacklist and named references + pfdc = PropertiesFromDictConverter( + definition={ + "type": "PropertiesFromDictElement", + "match": ".*", + "record_from_dict": { + "variable_name": "MyRec", + "parents": ["DictRT1", "DictRT2"], + "properties_blacklist": ["blacklisted_int", "blacklisted_ref"], + "references": { + "authors": { + "parents": ["Person"] + } + } + } + }, + name="Test", converter_registry=converter_registry) + # Tests for Dict with scalars, dict with lists, dict with reference, + # dict with list of references, dict with reference with reference, named + # reference + values = GeneralStore() + records = RecordStore() + test_dict_element = DictElement("TestDictElement", { + "a": 5, + "b": ["a", "b", "c"], + "scalar_ref": { + "name": "Scalar Ref", + "a": 23, + "blacklisted_int": 42 + }, + "list_ref": [ + { + "c": True + }, + { + "c": False + } + ], + "ref_with_ref": { + "a": 789, + "ref_in_ref": { + "b": "something" + } + }, + "blacklisted_int": -123, + "blacklisted_ref": { + "a": 25 + }, + "authors": { + "full_name": "Some Author" + } + }) + pfdc.create_records(values=values, records=records, 
element=test_dict_element) + assert "MyRec" in records + my_rec = records["MyRec"] + assert isinstance(my_rec, db.Record) + assert len(my_rec.parents) == 2 + assert "DictRT1" in [par.name for par in my_rec.parents] + assert "DictRT2" in [par.name for par in my_rec.parents] + + # scalar prop + assert my_rec.get_property("a") is not None + assert my_rec.get_property("a").value == 5 + + # list prop + assert my_rec.get_property("b") is not None + assert len(my_rec.get_property("b").value) == 3 + for elt in ["a", "b", "c"]: + assert elt in my_rec.get_property("b").value + + # scalar ref + assert my_rec.get_property("scalar_ref") is not None + referenced = my_rec.get_property("scalar_ref").value + assert isinstance(referenced, db.Record) + assert referenced.name == "Scalar Ref" + assert len(referenced.parents) == 1 + assert "scalar_ref" in [par.name for par in referenced.parents] + assert referenced.get_property("a") is not None + assert referenced.get_property("a").value == 23 + # blacklisted + assert referenced.get_property("blacklisted_int") is None + + # list of ref + assert my_rec.get_property("list_ref") is not None + assert isinstance(my_rec.get_property("list_ref").value, list) + assert len(my_rec.get_property("list_ref").value) == 2 + for rec in my_rec.get_property("list_ref").value: + assert isinstance(rec, db.Record) + assert len(rec.parents) == 1 + assert "list_ref" in [par.name for par in rec.parents] + assert rec.get_property("c") is not None + assert type(rec.get_property("c").value) is bool + assert True in [rec.get_property("c").value for rec in my_rec.get_property("list_ref").value] + assert False in [rec.get_property("c").value for rec in my_rec.get_property("list_ref").value] + + # ref with ref + assert my_rec.get_property("ref_with_ref") is not None + outer_rec = my_rec.get_property("ref_with_ref").value + assert isinstance(outer_rec, db.Record) + assert len(outer_rec.parents) == 1 + assert "ref_with_ref" in [par.name for par in outer_rec.parents] + 
assert outer_rec.get_property("a") is not None + assert outer_rec.get_property("a").value == 789 + assert outer_rec.get_property("ref_in_ref") is not None + inner_rec = outer_rec.get_property("ref_in_ref").value + assert isinstance(inner_rec, db.Record) + assert len(inner_rec.parents) == 1 + assert "ref_in_ref" in [par.name for par in inner_rec.parents] + assert inner_rec.get_property("b") is not None + assert inner_rec.get_property("b").value == "something" + + # blacklisted + assert my_rec.get_property("blacklisted_int") is None + assert my_rec.get_property("blacklisted_ref") is None + + # named reference property + assert my_rec.get_property("authors") is not None + author_rec = my_rec.get_property("authors").value + assert isinstance(author_rec, db.Record) + assert len(author_rec.parents) == 1 + assert "Person" in [par.name for par in author_rec.parents] + assert author_rec.get_property("full_name") is not None + assert author_rec.get_property("full_name").value == "Some Author" + + +def test_properties_from_dict_callable(converter_registry): + + def convert_some_values(rec: db.Record, records: RecordStore, values: GeneralStore): + """Add a URL prefix to a property value if applicable.""" + + if rec.get_property("url") is not None: + + old_val = rec.get_property("url").value + if not (old_val is None or old_val.startswith("http")): + + # only add if there is a value that doesn't look like a URL + rec.get_property("url").value = f"https://test.com/{old_val}" + + return rec + + pdfc = PropertiesFromDictConverter( + definition={ + "record_from_dict": { + "variable_name": "MyRec", + "name": "My New Record" + } + }, + name="TestConverter", + converter_registry=converter_registry, + referenced_record_callback=convert_some_values + ) + + values = GeneralStore() + records = RecordStore() + test_dict_element = DictElement("TestDictElement", { + "url": "something", + "referenced1": { + "url": "referenced" + }, + "referenced2": { + "nourl": "something else", + "url": 
"https://indiscale.com" + } + }) + pdfc.create_records(values=values, records=records, element=test_dict_element) + assert "MyRec" in records + my_rec = records["MyRec"] + assert isinstance(my_rec, db.Record) + assert len(my_rec.parents) == 1 + assert "MyRec" in [par.name for par in my_rec.parents] + assert my_rec.name == "My New Record" + + # simple conversion + assert my_rec.get_property("url") is not None + assert my_rec.get_property("url").value == "https://test.com/something" + + # also works in referenced + assert my_rec.get_property("referenced1") is not None + referenced1 = my_rec.get_property("referenced1").value + assert isinstance(referenced1, db.Record) + assert referenced1.get_property("url") is not None + assert referenced1.get_property("url").value == "https://test.com/referenced" + + # ... and works as expected + assert my_rec.get_property("referenced2") is not None + referenced2 = my_rec.get_property("referenced2").value + assert isinstance(referenced2, db.Record) + assert referenced2.get_property("nourl") is not None + assert referenced2.get_property("nourl").value == "something else" + assert referenced2.get_property("url") is not None + assert referenced2.get_property("url").value == "https://indiscale.com" + + +def test_properties_from_dict_nested(converter_registry): + """Test the PropertiesFromDictConverter with a nested dict, + together with the regular DictElementConverter and Records created + and used on different subtree levels. 
+ + """ + root_dict_element = DictElement("RootDict", { + "TopLevelRec": "MyRec", + "propertiesDict": { + "a": 5, + "blacklisted": { + "bl_name": "BlackList", + "date": "2023-12-31" + } + }, + "otherDict": { + "additional_from_other": "other" + } + }) + def_dict = { + "RootElt": { + # Root dictionary + "type": "DictElement", + "match": ".*", + "records": { + # Define top-level, use below in subtrees + "MyRec": { + "parents": ["MyType"] + } + }, + "subtree": { + # Top-level text element for the Record name + "NameElt": { + "type": "TextElement", + "match_name": "^TopLevelRec$", + "match_value": "(?P<name>.*)", + "records": { + "MyRec": { + "name": "$name" + } + } + }, + "PFDElement": { + "type": "PropertiesFromDictElement", + "match_name": "^propertiesDict$", + "record_from_dict": { + "variable_name": "MyRec", + "properties_blacklist": ["blacklisted"] + }, + "subtree": { + "BLElement": { + "type": "DictElement", + "match_name": "^blacklisted$", + "records": { + "BLRec": { + "parents": ["BlackListedType"], + "MyRec": "$MyRec" + } + }, + "subtree": { + "BLNameElt": { + "type": "TextElement", + "match_name": "^bl_name$", + "match_value": "(?P<name>.*)", + "records": { + "BLRec": { + "name": "$name" + } + } + }, + "BLDateElt": { + "type": "TextElement", + "match_name": "^date$", + "match_value": "(?P<date>.*)", + "records": { + "BLRec": { + "creation_date": "$date" + } + } + } + } + } + } + }, + # Other dict which uses the DictElementConverter + "OtherDictElement": { + "type": "DictElement", + "match_name": "^otherDict$", + "subtree": { + "additionalElt": { + "type": "TextElement", + "match_name": "^additional_from_other$", + "match_value": "(?P<val>.*)", + "records": { + "MyRec": { + "additional_from_other": "$val" + } + } + } + } + } + } + } + } + + records = scan_structure_elements(root_dict_element, def_dict, converter_registry) + + # All records need to be there + assert len(records) == 2 + myrec = None + blrec = None + for rec in records: + if rec.name == "MyRec": 
+ myrec = rec + elif rec.name == "BlackList": + blrec = rec + assert myrec is not None + assert blrec is not None + + # Parent is set from top level + assert len(myrec.parents) == 1 + assert "MyType" in [par.name for par in myrec.parents] + + # Set automatically, with blacklist + assert myrec.get_property("a") is not None + assert myrec.get_property("a").value == 5 + assert myrec.get_property("blacklisted") is None + + # Now check blacklisted record from subtree + assert len(blrec.parents) == 1 + assert "BlackListedType" in [par.name for par in blrec.parents] + assert blrec.get_property("MyRec") is not None + assert blrec.get_property("MyRec").value == myrec + assert blrec.get_property("creation_date") is not None + assert blrec.get_property("creation_date").value == "2023-12-31" + + # The "old" DictConverter should have added the additional property: + assert myrec.get_property("additional_from_other") is not None + assert myrec.get_property("additional_from_other").value == "other" diff --git a/unittests/test_directories/examples_json/invalidjson.json b/unittests/test_directories/examples_json/invalidjson.json index 9c012bf062264014278fc2df7be6cf33b65c7469..49a00fc6df33fe8d82ec2735e39c400a2342f0bf 100644 --- a/unittests/test_directories/examples_json/invalidjson.json +++ b/unittests/test_directories/examples_json/invalidjson.json @@ -1,13 +1,13 @@ { - "projectId": 10002, - "archived": false, - "coordinator": { - "firstname": "Miri", - "lastname": "Mueller", - "email": "miri.mueller@science.de" - }, - "start_date": "2022-03-01", - "candidates": ["Mouse", "Penguine"], - "rvalue": 0.4444, - "url": "https://site.de/index.php/" + "projectId": 10002, + "archived": false, + "coordinator": { + "firstname": "Miri", + "lastname": "Mueller", + "email": "miri.mueller@science.de" + }, + "start_date": "2022-03-01", + "candidates": ["Mouse", "Penguine"], + "rvalue": 0.4444, + "url": "https://site.de/index.php/" } diff --git 
a/unittests/test_directories/examples_json/testjson.json b/unittests/test_directories/examples_json/testjson.json index d37ea2defc21d767e4e13ad3b39d6682b3c452ef..29d59780f4824d9c2edbc8fe1da3a6b380def57b 100644 --- a/unittests/test_directories/examples_json/testjson.json +++ b/unittests/test_directories/examples_json/testjson.json @@ -1,22 +1,21 @@ { - "name": "DEMO", - "projectId": 10002, - "archived": false, - "Person": [ - { - "firstname": "Miri", - "lastname": "Mueller", - "other": null, - "email": "miri.mueller@science.de" - }, + "name": "DEMO", + "projectId": 10002, + "archived": false, + "Person": [{ + "firstname": "Miri", + "lastname": "Mueller", + "other": null, + "email": "miri.mueller@science.de" + }, { "firstname": "Mara", "lastname": "Mueller", - "email": "mara.mueller@science.de" + "email": "mara.mueller@science.de" } ], - "start_date": "2022-03-01", - "candidates": ["Mouse", "Penguine"], - "rvalue": 0.4444, - "url": "https://site.de/index.php/" + "start_date": "2022-03-01", + "candidates": ["Mouse", "Penguine"], + "rvalue": 0.4444, + "url": "https://site.de/index.php/" } diff --git a/unittests/test_macros.py b/unittests/test_macros.py index 85fe56cd2d49581bcf07b1c7af8456ad219b0111..020098676407f1f70932559b1a995af9f9644fe9 100644 --- a/unittests/test_macros.py +++ b/unittests/test_macros.py @@ -59,7 +59,7 @@ def _temp_file_load(txt: str): def test_macros(register_macros, macro_store_reset): dat = yaml.load(""" -defs: +macros: - !defmacro name: test params: @@ -85,7 +85,7 @@ testnode: def test_macro_list_replacment(register_macros, macro_store_reset): dat = yaml.load(""" -defs: +macros: - !defmacro name: test params: @@ -112,7 +112,7 @@ testnode: def test_multi_macros(register_macros, macro_store_reset): dat = yaml.load(""" -defs: +macros: - !defmacro name: test_one params: {} @@ -194,6 +194,7 @@ metadata: name: test_one params: {} definition: + type: TextElement replaced1: ok - !defmacro name: test_two @@ -213,6 +214,7 @@ extroot: extroot2: !macro # 
test top level macro test_one: extroot3: + type: Directory subtree: SimulationData: !macro test_two: @@ -228,7 +230,7 @@ def test_replace_arbitrary_objects(register_macros, macro_store_reset): See: https://gitlab.indiscale.com/caosdb/src/caosdb-crawler/-/issues/24 """ dat = yaml.load(""" -defs: +macros: - !defmacro name: test params: @@ -277,6 +279,7 @@ metadata: params: a: 25 definition: + type: DictElement macro_sub_$a: b: $a another_param: 3 @@ -284,6 +287,7 @@ metadata: name: test_macrodef params: {} definition: + type: DictElement macro_top: !macro one_macro: - a: 17 @@ -352,6 +356,7 @@ metadata: name: test_one params: {} definition: !macro + type: TextElement test_two: - !defmacro name: test_two @@ -367,6 +372,7 @@ metadata: name: test_four params: {} definition: !macro + type: TextElement test_four: --- extroot: !macro @@ -403,6 +409,7 @@ metadata: macro_name: default_name a: 4 definition: + type: DictElement $macro_name: something: a: $a @@ -555,7 +562,7 @@ extroot: !macro def test_list_macro_application(register_macros, macro_store_reset): dat = yaml.load(""" -defs: +macros: - !defmacro name: test params: @@ -586,7 +593,7 @@ testnode: def test_variable_in_macro_definition(register_macros, macro_store_reset): dat = yaml.load(""" -defs: +macros: - !defmacro name: test params: diff --git a/unittests/test_schema.py b/unittests/test_schema.py index 3b576c9b72e41b799355f927d6e5387f1c187a18..ea8549b0b8dfd1f1af35784082a9e46320cfcff4 100644 --- a/unittests/test_schema.py +++ b/unittests/test_schema.py @@ -27,6 +27,13 @@ def rfp(*pathcomponents): def test_schema_validation(): load_definition(rfp("scifolder_cfood.yml")) load_definition(rfp("scifolder_extended.yml")) + load_definition(rfp("record_from_dict_cfood.yml")) with raises(ValidationError, match=".*enum.*"): load_definition(rfp("broken_cfoods", "broken1.yml")) + + with raises(ValidationError, match=".*required.*"): + load_definition(rfp("broken_cfoods", "broken_record_from_dict.yml")) + + with 
raises(ValidationError, match=".*required.*"): + load_definition(rfp("broken_cfoods", "broken_record_from_dict_2.yml"))