# Source code for dantro.containers.xrdatactr

"""This module implements specialisations of the BaseDataContainer class."""

import copy
import logging
from typing import Sequence, Tuple, Union

import numpy as np
import xarray as xr

from ..abc import AbstractDataProxy
from ..base import BaseDataContainer, CheckDataMixin, ItemAccessMixin
from ..mixins import ComparisonMixin, ForwardAttrsToDataMixin, NumbersMixin
from ..utils import Link, extract_coords, extract_dim_names

# Local constants
# Module-level logger, named after this module's import path.
# NOTE(review): code in this module calls ``log.trace(...)``, which is not a
# method of the standard library ``logging.Logger``; presumably the package
# registers a custom TRACE level elsewhere -- confirm.
log = logging.getLogger(__name__)

# -----------------------------------------------------------------------------

class XrDataContainer(
    ForwardAttrsToDataMixin,
    NumbersMixin,
    ComparisonMixin,
    CheckDataMixin,
    ItemAccessMixin,
    BaseDataContainer,
):
    """The XrDataContainer stores numerical xarray.DataArray data associated
    with dimensions, coordinates, and attributes.

    Dimension names and coordinates can either be passed explicitly upon
    initialization or be extracted from the container attributes; the
    ``_XRC_*`` class variables below define which attribute names and
    prefixes are used for that extraction.
    """

    # Specify expected data types for this container class
    DATA_EXPECTED_TYPES = (
        xr.DataArray,
        np.ndarray,
    )
    DATA_ALLOW_PROXY = False
    DATA_UNEXPECTED_ACTION = "raise"

    # Custom class variables for customizing XrDataContainer ..................
    # Define as class variable the name of the attribute that determines the
    # dimensions of the xarray.DataArray
    _XRC_DIMS_ATTR = "dims"

    # Attributes prefixed with this string can be used to set names for
    # specific dimensions. The prefix should be followed by an integer-parsable
    # string, e.g. `dim_name__0` would be the dimension name for the 0th dim.
    _XRC_DIM_NAME_PREFIX = "dim_name__"

    # Attributes prefixed with this string determine the coordinate values for
    # a specific dimension. The prefix should be followed by the _name_ of the
    # dimension, e.g. `coords__time`. The values are interpreted according to
    # the default coordinate mode or, if given, the coords_mode__* attribute
    _XRC_COORDS_ATTR_PREFIX = "coords__"

    # The default mode by which coordinates are interpreted
    _XRC_COORDS_MODE_DEFAULT = "values"

    # Prefix for the coordinate mode if a custom mode is to be used
    _XRC_COORDS_MODE_ATTR_PREFIX = "coords_mode__"

    # Whether to inherit the other container attributes
    _XRC_INHERIT_CONTAINER_ATTRIBUTES = True

    # Whether to use strict attribute checking; throws errors if there are
    # container attributes available that match the prefix but don't match a
    # valid dimension name. Can be disabled for speed improvements
    _XRC_STRICT_ATTR_CHECKING = True

    # .........................................................................

    def __init__(
        self,
        *,
        name: str,
        data: Union[np.ndarray, xr.DataArray],
        dims: Sequence[str] = None,
        coords: dict = None,
        extract_metadata: bool = True,
        apply_metadata: bool = True,
        **dc_kwargs,
    ):
        """Initialize a XrDataContainer and extract dimension and coordinate
        labels.

        Args:
            name (str): which name to give to the XrDataContainer
            data (Union[np.ndarray, xr.DataArray]): The data to store; anything
                that an xr.DataArray can take. A plain ``list`` is accepted
                as well and is converted via ``np.array``.
            dims (Sequence[str], optional): The dimension names.
            coords (dict, optional): The coordinates. The keys of this dict
                have to correspond to the dimension names.
            extract_metadata (bool, optional): If True, missing ``dims`` or
                ``coords`` arguments are tried to be populated from the
                container attributes.
            apply_metadata (bool, optional): Whether to apply the extracted
                or passed ``dims`` and ``coords`` to the underlying data.
                This might not be desired in cases where the given ``data``
                already is a labelled ``xr.DataArray`` or where the data is a
                proxy and the labelling should be postponed.
            **dc_kwargs: passed to parent
        """
        # To be a bit more tolerant, allow lists as data argument
        if isinstance(data, list):
            log.debug(
                "Received a list as `data` argument to %s '%s'. "
                "Calling np.array on it ...",
                self.classname,
                name,
            )
            data = np.array(data)

        # Initialize with parent method
        super().__init__(name=name, data=data, **dc_kwargs)
        # NOTE The _data attribute is now set, but will be changed again below!

        # Set up cache attributes with given arguments
        self._dim_names = dims
        self._dim_to_coords_map = coords

        # Keep track of whether metadata was applied or not
        self._metadata_was_applied = False

        # If metadata is to be extracted from container attributes, do so now
        if extract_metadata:
            self._extract_metadata()

        # Apply the metadata, if set to do so (and not a proxy, which would not
        # allow it) ...
        if apply_metadata and not isinstance(self._data, AbstractDataProxy):
            self._apply_metadata()

    def _format_info(self) -> str:
        """A __format__ helper function: returns info about the item.

        In this case, the dtype and sizes of the stored data is returned.
        Depending on whether metadata is available, the shape information is
        shown or the dimension names and the length of the dimensions are
        used. The parent's info string is appended at the end.
        """
        return (
            f"{self.dtype}, {self._format_shape()}, "
            f"{super()._format_info()}"
        )

    def _format_shape(self) -> str:
        """A __format__ helper for parsing shape information.

        Returns:
            str: ``(<dim>: <length>, ...)`` if dimension names are known
                (either from the applied metadata or from the cached
                dimension names), otherwise ``shape <shape>``.
        """
        sizes = None

        if self._metadata_was_applied:
            # Can directly use the xarray information
            sizes = self.sizes.items()

        elif self._dim_names is not None:
            # Parse the metadata accordingly ...
            sizes = self._parse_sizes_from_metadata()

        if sizes is not None:
            labelled = ", ".join(f"{dim}: {size}" for dim, size in sizes)
            return f"({labelled})"

        return f"shape {self.shape}"

    def __len__(self) -> int:
        """Length of the underlying data, i.e. first entry in shape"""
        return self.shape[0]

    def copy(self, *, deep: bool = True):
        """Return a new object with a copy of the data. The copy is deep if
        not specified otherwise.

        The new object's name is this object's name with ``_copy`` appended.
        The container attributes are always deep-copied.

        Args:
            deep (bool, optional): Whether the copy of the data is deep

        Returns:
            XrDataContainer: A (deep) copy of this object.
        """
        log.debug("Creating copy of %s ...", self.logstr)

        return self.__class__(
            name=self.name + "_copy",
            data=(
                copy.deepcopy(self._data) if deep else copy.copy(self._data)
            ),
            attrs=copy.deepcopy(self.attrs),
            # Carry over cache attributes, needed for proxy
            dims=self._dim_names,
            coords=self._dim_to_coords_map,
            # No need to extract or apply; if it is a proxy,
            # the metadata is passed; if it is an xarray, it
            # is already labelled and passed via `data`
            extract_metadata=False,
            apply_metadata=False,
        )

    def save(self, path: str, **save_kwargs):
        """Saves the XrDataContainer to a file by invoking the .to_netcdf
        method of the underlying data.

        The recommended file extension is ``.xrdc`` or ``.nc_da``, which are
        compatible with the xarray-based data loader.

        .. warning::

            This does NOT store container attributes!

        Args:
            path (str): The path to save the file at
            **save_kwargs: Passed to the .to_netcdf method
        """
        self.to_netcdf(path, **save_kwargs)

    # Methods to extract and apply metadata ...................................

    def _extract_metadata(self):
        """Extracts metadata from the container attributes and stores them in
        the ``_dim_names`` and ``_dim_to_coords_map`` cache attributes.

        Values that were already set (i.e. are not None) are left untouched.

        Raises:
            Exception: If extracting the dimension names fails; the error is
                re-raised as the same exception type with additional
                information on the affected container.
        """
        log.trace("Extracting metadata for labelling %s ...", self.logstr)

        # First: the dimension names
        if self._dim_names is None:
            try:
                dims = extract_dim_names(
                    self.attrs,
                    ndim=self.ndim,
                    attr_name=self._XRC_DIMS_ATTR,
                    attr_prefix=self._XRC_DIM_NAME_PREFIX,
                )

            except Exception as exc:
                # Keep the exception type, but add context to the message
                raise type(exc)(
                    "Failed extracting dimension names from the "
                    f"attributes of {self.logstr}! {exc}"
                ) from exc

            else:
                self._dim_names = dims

        # With dimension names being cached, try extracting coordinates.
        if self._dim_to_coords_map is None:
            coords = extract_coords(
                self,
                mode="attrs",
                dims=self._dim_names,
                # Attribute names and prefixes
                strict=self._XRC_STRICT_ATTR_CHECKING,
                coords_attr_prefix=self._XRC_COORDS_ATTR_PREFIX,
                mode_attr_prefix=self._XRC_COORDS_MODE_ATTR_PREFIX,
                default_mode=self._XRC_COORDS_MODE_DEFAULT,
            )
            self._dim_to_coords_map = coords

    def _inherit_attrs(self):
        """Carry over container attributes to the xr.DataArray attributes

        This does not include container attributes that are used for
        extracting metadata; it makes no sense to have them in the attributes
        of the already labelled xr.DataArray
        """

        def skip(attr_name: str) -> bool:
            """Whether the attribute is one used for metadata extraction"""
            return (
                attr_name == self._XRC_DIMS_ATTR
                or attr_name.startswith(self._XRC_DIM_NAME_PREFIX)
                or attr_name.startswith(self._XRC_COORDS_ATTR_PREFIX)
                or attr_name.startswith(self._XRC_COORDS_MODE_ATTR_PREFIX)
            )

        for attr_name, attr_val in self.attrs.items():
            if not skip(attr_name):
                self.data.attrs[attr_name] = attr_val

    def _apply_metadata(self):
        """Applies the cached metadata to the underlying xr.DataArray

        In order, this: makes sure the data is an ``xr.DataArray``, carries
        over the container name if the data is unnamed, renames dimensions,
        associates coordinates, and (if enabled via the class variable
        ``_XRC_INHERIT_CONTAINER_ATTRIBUTES``) carries over the remaining
        container attributes. Finally, the ``_metadata_was_applied`` flag is
        set.

        Raises:
            ValueError: If coordinates could not be associated with a
                dimension
        """
        # Make sure that data is an xarray
        if not isinstance(self.data, xr.DataArray):
            self._data = xr.DataArray(self.data)

        # Carry over the name (if the data itself is unnamed)
        if not self.data.name:
            self.data.name = self.name

        # Set the dimension names
        if self._dim_names:
            # Create a mapping from old to new names, then apply it; a None
            # entry means that the corresponding dimension is not renamed
            new_names = {
                old: new
                for old, new in zip(self.data.dims, self._dim_names)
                if new is not None
            }
            log.trace("Renaming dimensions: %s", new_names)
            self._data = self.data.rename(new_names)

        # Set the coordinates
        if self._dim_to_coords_map:
            log.trace("Associating coordinates: %s", self._dim_to_coords_map)

            for dim_name, coords in self._dim_to_coords_map.items():
                # Need to handle links differently
                if isinstance(coords, Link):
                    # The target object is another DataContainer, which can not
                    # be used for association. Thus, just pass the raw data...
                    coords = np.array(coords.target_object)

                # Can associate now.
                try:
                    self.data.coords[dim_name] = coords

                except Exception as err:
                    raise ValueError(
                        f"Could not associate coordinates {coords} for "
                        f"dimension '{dim_name}' due to a "
                        f"{err.__class__.__name__}: {err}."
                    ) from err

        # Now write the rest of the attributes of the dataset to the xarray
        if self._XRC_INHERIT_CONTAINER_ATTRIBUTES:
            self._inherit_attrs()

        # Now set the flag that metadata was applied
        self._metadata_was_applied = True

    def _postprocess_proxy_resolution(self):
        """Only invoked from
        :py:class:`~dantro.mixins.proxy_support.ProxySupportMixin`, which has
        to be added to the class specifically. This function takes care to
        apply the potentially existing metadata *after* the proxy was
        resolved.
        """
        self._apply_metadata()

    def _parse_sizes_from_metadata(self) -> Sequence[Tuple[str, int]]:
        """Invoked from _format_shape when no metadata was applied but the
        dimension names are available. Should return data in the same form as
        xr.DataArray.sizes.items() does.

        Returns:
            Sequence[Tuple[str, int]]: A tuple of ``(name, length)`` pairs,
                one for each pair of cached dimension name and shape entry.
        """
        # Iterate over dimension names and shapes and use the name from the
        # metadata unless the name was None (or otherwise falsy), which is a
        # placeholder for "don't rename this dimension", in which case it
        # is named via the default names, ``dim_<index>``
        return tuple(
            (dim_name if dim_name else f"dim_{idx}", length)
            for idx, (dim_name, length) in enumerate(
                zip(self._dim_names, self.shape)
            )
        )