diff --git a/src/caoscrawler/hdf5_converter.py b/src/caoscrawler/hdf5_converter.py index 47144575b45b7634014f172a6df1988246b53dbd..506c7b3942cc2518ffa47762c4bed742b9f09b83 100644 --- a/src/caoscrawler/hdf5_converter.py +++ b/src/caoscrawler/hdf5_converter.py @@ -39,6 +39,20 @@ from .structure_elements import DictElement, File, FloatElement, IntegerElement, def convert_attributes(elt: Union[h5py.File, h5py.Group, h5py.Dataset]): + """Convert hdf5 attributes to a list of either basic scalar structure elements or ndarrays. + + Parameters + ---------- + elt : Union[h5py.File, h5py.Group, h5py.Dataset] + The hdf5 element the attributes of which will be converted to structure + elements. + + Returns + ------- + converted : list[StructureElement] + A list of the attributes converted to StructureElements (either basic + scalar elements or ndarray). + """ converted = [] for name, value in elt.attrs.items(): @@ -49,6 +63,25 @@ def convert_attributes(elt: Union[h5py.File, h5py.Group, h5py.Dataset]): def convert_h5_element(elt: Union[h5py.Group, h5py.Dataset], name: str): + """Convert a given HDF5 element to the corresponding StructureElement. + + Parameters + ---------- + elt : Union[h5py.Group, h5py.Dataset] + The hdf5 element to be converted. + name : str + The name of the StructureElement that the hdf5 element is converted to. + + Raises + ------ + ValueError + In case of anything that is not convertible to an HDF5 structure element. + + Returns + ------- + StructureElement + The converted StructureElement. + """ if isinstance(elt, h5py.Group): @@ -63,6 +96,34 @@ def convert_h5_element(elt: Union[h5py.Group, h5py.Dataset], name: str): def convert_basic_element_with_nd_array(value, name: str = None, internal_path: str = None, msg_prefix: str = ""): + """Convert a given object either to an ndarray structure element or to a + basic scalar structure element. 
+ + This function extends :func:`~caoscrawler.converters.convert_basic_element` + by a special treatment for certain numpy objects, most importantly + ndarrays. They are converted to a scalar in case of a size-1 array, to a + list in case of a 1-d array, and to an ``H5NdarrayElement`` in all other + cases. In addition, numpy integers and floats are also converted to + IntegerElements and FloatElements, respectively. + + Parameters + ---------- + value + The object to be converted. + name : str, optional + The name of the structure element ``value`` is being converted + to. Default is None. + internal_path : str, optional + The internal path of ``value`` within the HDF5 file. Default is None. + msg_prefix : str, optional + The prefix of the error message that will be raised. Default is ``""``. + + Returns + ------- + StructureElement + The StructureElement ``value`` was converted to. + + """ if isinstance(value, np.ndarray): @@ -92,24 +153,43 @@ def convert_basic_element_with_nd_array(value, name: str = None, class H5GroupElement(DictElement): + """StructureElement specific for HDF5 groups.""" + def __init__(self, name: str, value: h5py.Group): super().__init__(name, value) class H5DatasetElement(DictElement): + """StructureElement specific for HDF5 datasets.""" + def __init__(self, name: str, value: h5py.Dataset): super().__init__(name, value) class H5NdarrayElement(DictElement): + """StructureElement specific for NDArrays within HDF5 files. + + Also store the internal path of the array within the HDF5 file in its + ``internal_path`` attribute. + + """ + def __init__(self, name: str, value, internal_path: str): super().__init__(name, value) self.internal_path = internal_path class H5FileConverter(SimpleFileConverter): + """Converter for HDF5 files that creates children for the contained + attributes, groups, and datasets. 
+ + """ def create_children(self, generalStore: GeneralStore, element: StructureElement): + """Create children from root-level file attributes and contained hdf5 + elements. + + """ if not isinstance(element, File): @@ -129,12 +209,20 @@ class H5FileConverter(SimpleFileConverter): class H5GroupConverter(DictElementConverter): + """Converter for HDF5 groups that creates children from the group-level + attributes and the contained subgroups and datasets. + + """ def typecheck(self, element: StructureElement): return isinstance(element, H5GroupElement) def create_children(self, generalStore: GeneralStore, element: StructureElement): + """Create children from group attributes and hdf5 elements contained in + this group. + + """ if not isinstance(element.value, h5py.Group): @@ -152,12 +240,20 @@ class H5GroupConverter(DictElementConverter): class H5DatasetConverter(DictElementConverter): + """Converter for HDF5 datasets that creates children from the dataset + attributes and the contained array data. + + """ def typecheck(self, element: StructureElement): return isinstance(element, H5DatasetElement) def create_children(self, generalStore: GeneralStore, element: StructureElement): + """Create children from the dataset attributes and append the array data + contained in this dataset. + + """ if not isinstance(element.value, h5py.Dataset): @@ -176,6 +272,10 @@ class H5DatasetConverter(DictElementConverter): class H5NdarrayConverter(Converter): + """Converter for ndarrays contained in HDF5 files. Creates the wrapper + record for this ndarray. + + """ def __init__(self, definition: dict, name: str, converter_registry: dict): @@ -188,11 +288,17 @@ class H5NdarrayConverter(Converter): super().__init__(definition, name, converter_registry) def create_children(self, values: GeneralStore, element: StructureElement): + """The ndarray doesn't have any further children.""" - # The ndarray doesn't have any further children. 
return [] def create_records(self, values: GeneralStore, records: RecordStore, element: StructureElement): + """Create a wrapper record with name ``recordname``, type + ``array_recordtype_name`` (default ``H5Ndarray``) and the internal path + stored in a property with name ``internal_path_property_name`` (default + ``internal_hdf5_path``). + + """ rname = self.definition["recordname"] if "array_recordtype_name" in self.definition: diff --git a/src/doc/converters.rst b/src/doc/converters.rst index 2bbc67dcb3791c2e817ba62ec55e9ac3905e7438..44988fbd497cdb57023b5a696f83d55e7eb5113a 100644 --- a/src/doc/converters.rst +++ b/src/doc/converters.rst @@ -278,7 +278,7 @@ tests <https://gitlab.com/linkahead/linkahead-crawler/-/blob/main/unittests/h5_cfood.yml?ref_type=heads>`_ and shows how the individual converters are used in order to crawl a `simple example file -<https://gitlab.com/linkahead/linkahead-crawler/-/blob/main/unittests/hdf5_dummy_file.hdf5?ref_type=heads`_ +<https://gitlab.com/linkahead/linkahead-crawler/-/blob/main/unittests/hdf5_dummy_file.hdf5?ref_type=heads>`_ containing groups, subgroups, and datasets, together with their respective attributes.