# Source code for dantro.proxy.hdf5

"""This module implements a :py:class:`dantro.base.BaseDataProxy`
specialization for HDF5 data.
"""

import logging

import numpy as np

from .._import_tools import LazyLoader
from ..base import BaseDataProxy

log = logging.getLogger(__name__)

da = LazyLoader("dask.array")
h5 = LazyLoader("h5py")

# -----------------------------------------------------------------------------


class Hdf5DataProxy(BaseDataProxy):
    """The Hdf5DataProxy is a placeholder for a :py:class:`h5py.Dataset`.

    It saves the filename and dataset name needed to later load the dataset.
    Additionally, it caches some values that give information on the shape
    and dtype of the dataset, thus further delaying the load to the time the
    *actual* data is required.

    Depending on the type that this proxy is resolved as via the
    :py:meth:`.resolve` method, the corresponding :py:class:`h5py.File`
    object needs to stay open and in memory; it is closed upon
    garbage-collection of this object.
    """

    def __init__(self, obj: "h5py.Dataset", *, resolve_as_dask: bool = False):
        """Initializes a proxy object for a :py:class:`h5py.Dataset` object.

        Args:
            obj (h5py.Dataset): The dataset object to be proxy for
            resolve_as_dask (bool, optional): Whether to resolve the dataset
                object as a delayed :py:class:`dask.array.Array` object,
                using an :py:class:`h5py.Dataset` to initialize it and
                passing over chunk information.
        """
        # If file objects need be kept in scope, this is the list to store
        # them. It is assigned first so that ``__del__`` finds it even if a
        # later statement of this initializer raises.
        self._h5files = []

        super().__init__(obj)

        # Information to later resolve the data
        self._fname = obj.file.filename
        self._name = obj.name  # is the full path within the file

        # Extract some further information from the dataset before, basically
        # all information that can be known without loading the data
        self._shape = obj.shape
        self._dtype = obj.dtype
        self._ndim = obj.ndim
        self._size = obj.size
        self._chunks = obj.chunks

        # Whether to load the hdf5 data through dask.array.from_array
        self._resolve_as_dask = resolve_as_dask

        # Set the tags
        # NOTE(review): ``_tags`` is not defined in this class; presumably it
        #               is a tuple provided by BaseDataProxy -- confirm.
        self._tags += ("hdf5",)
        if self._resolve_as_dask:
            self._tags += ("dask",)

    def resolve(self, *, astype: type = None):
        """Resolve the data of this proxy by opening the hdf5 file and loading
        the dataset into a :py:class:`numpy.ndarray` or a type specified by
        the ``astype`` argument.

        Args:
            astype (type, optional): As which type to return the data from the
                dataset this object is proxy for. If None, will return as
                :py:class:`numpy.ndarray`. For :py:class:`h5py.Dataset`, the
                :py:class:`h5py.File` object stays in memory until the proxy
                is deleted.
                Note that if ``resolve_as_dask`` was specified during proxy
                initialization, the data will be loaded as
                :py:class:`dask.array.Array` only if ``astype`` is **not**
                specified in this call!

        Returns:
            type specified by ``astype``: the resolved data.
        """
        # An explicitly requested h5py.Dataset takes precedence over the
        # ``resolve_as_dask`` setting, as documented above. It must not go
        # through the generic branch below: there, the file is closed before
        # returning and ``astype`` is called on the dataset, neither of which
        # yields a usable h5py.Dataset.
        if astype is h5.Dataset:
            log.debug(
                "Resolving %s as h5py.Dataset from dataset %s in file "
                "at %s ...",
                self.classname,
                self._name,
                self._fname,
            )

            # Open the file and keep it in scope
            h5file = self._open_h5file()

            # Return the dataset object, which remains valid until the file
            # object is closed, i.e. the proxy goes out of scope
            return h5file[self._name]

        if astype is None and self._resolve_as_dask:
            log.debug(
                "Resolving %s as h5py.Dataset from dataset %s in file "
                "at %s into a delayed dask array object...",
                self.classname,
                self._name,
                self._fname,
            )

            # Open the file and keep it in scope
            h5file = self._open_h5file()

            # Build the delayed dask array from the h5.Dataset and chunk info.
            # The cached chunks are None for datasets without chunked
            # storage; in that case, the argument is omitted such that dask
            # applies its own default chunking instead of rejecting None.
            chunk_kwargs = {}
            if self._chunks is not None:
                chunk_kwargs["chunks"] = self._chunks

            return da.from_array(h5file[self._name], **chunk_kwargs)

        # By default, return as numpy array
        astype = astype if astype is not None else np.array

        log.debug(
            "Resolving %s as %s.%s from dataset %s in file at %s ...",
            self.classname,
            astype.__module__,
            astype.__name__,
            self._name,
            self._fname,
        )

        # The conversion happens while the file is still open; the file is
        # closed again when leaving the context.
        with h5.File(self._fname, "r") as h5file:
            return astype(h5file[self._name])

    # Handling of HDF5 files ..................................................

    def _open_h5file(self) -> "h5py.File":
        """Opens the associated HDF5 file and stores it in ``_h5files`` in
        order to keep it in scope. These file objects are only closed upon
        deletion of this proxy object!

        Note that every call opens (and stores) a *new* file object.

        Returns:
            h5py.File: The newly opened HDF5 file
        """
        h5file = h5.File(self._fname, "r")
        self._h5files.append(h5file)
        return h5file

    def __del__(self):
        """Make sure all potentially still open h5py.File objects are closed"""
        # The finalizer also runs for instances whose initialization failed
        # part-way, in which case ``_h5files`` might not exist.
        for f in getattr(self, "_h5files", ()):
            try:
                f.close()

            except Exception:
                # Can no longer close it; garbage collection probably already
                # took care of it ... which is fine. This is deliberately
                # best-effort: a finalizer should not raise.
                pass

    # Properties to access information without resolving ......................

    @property
    def shape(self):
        """The cached shape of the dataset, accessible without resolving"""
        return self._shape

    @property
    def dtype(self):
        """The cached dtype of the dataset, accessible without resolving"""
        return self._dtype

    @property
    def ndim(self):
        """The cached ndim of the dataset, accessible without resolving"""
        return self._ndim

    @property
    def size(self):
        """The cached size of the dataset, accessible without resolving"""
        return self._size

    @property
    def chunks(self):
        """The cached chunks of the dataset, accessible without resolving"""
        return self._chunks