Source code for dantro.utils.coords

"""This module provides coordinate parsing capabilities."""

import logging
from itertools import product
from typing import (

import numpy as np

from import AbstractDataContainer
from import is_iterable, try_conversion
from .link import Link, StrongLink

log = logging.getLogger(__name__)

# Some type definitions .......................................................
TDims = Tuple[str]
"""A dimension sequence type"""

TCoord = TypeVar("TCoord", int, float, str, Hashable)
"""A single coordinate value type"""

TCoords = Sequence[TCoord]
"""A sequence of coordinate values"""

TCoordsDict = Dict[str, TCoords]
"""Several coordinates, bundled into a map of dimension name to coordinates"""

# Dimension name extraction ---------------------------------------------------

[docs]def extract_dim_names( attrs: dict, *, ndim: int, attr_name: str, attr_prefix: str ) -> TDims: """Extract dimension names from the given attributes. This can be done in two ways: 1. A list of dimension names was specified in an attribute with the name specified by the ``attr_name`` argument 2. One by one via attributes that start with the string prefix defined in ``attr_prefix``. This can be used if not all dimension names are available. Note that this will also *not* be used if option 1 is used! Args: attrs (dict): The dict-like object to read attributes from obj_logstr (str): A string that is given as context in log and error messages, ideally describing the object these attributes belong to ndim (int): The expected rank of the dimension names attr_name (str): The key to look for in ``attrs`` that would give a sequence of the dimension names. attr_prefix (str): The prefix to look for in the keys of the ``attrs`` that would specify the name of a single dimension. Returns: Tuple[Union[str, None]]: The dimension names or None as placeholder Raises: TypeError: Attribute found at ``attr_name`` was a string, was not iterable or was not a sequence of strings ValueError: Length mismatch of attribute found at ``attr_name`` and the data. """ # The dimension names sequence to populate. None's are placeholders which # denote that no information was found for this dimension. dim_names = [None] * ndim # Distinguish two cases: All dimension names given directly as a list # or they are given separately, one by one if attr_name in attrs: dims_attr = attrs[attr_name] # Make sure it is an iterable of strings and of the right length if isinstance(dims_attr, str): raise TypeError( f"Attribute '{attr_name}' needs to be a sequence of " f"strings, but not directly a string! Got: {repr(dims_attr)}" ) elif not is_iterable(dims_attr): raise TypeError( f"Attribute '{attr_name}' needs to be an iterable, but was " f"{type(dims_attr)} with value '{dims_attr}'!" ) elif len(dims_attr) != ndim: raise ValueError( "Number of given dimension names does not match " f"the given rank of {ndim}! Names given: {dims_attr}" ) # Data seems ok. # Create the sequence of dimension name, potentially needing to do # some further processing ... for dim_num, dim_name in enumerate(dims_attr): # Might be a numpy scalar or 1-sized array; resolve that if isinstance(dim_name, np.ndarray): dim_name = dim_name.item() if not isinstance(dim_name, str): raise TypeError( "Dimension names need to be strings, got " f"{type(dim_name)} with value '{dim_name}' for " f"dimension {dim_num}!" ) dim_names[dim_num] = dim_name # Have a populated list of dimension names now, all strings. # There might be additionally specified dimension names that overwrite # the names given here... # Go over all attributes and check for the prefix for attr_name, attr_val in attrs.items(): if attr_name.startswith(attr_prefix): # Extract the integer dimension number try: dim_num = int(attr_name[len(attr_prefix) :]) except ValueError as err: raise ValueError( "Could not extract the dimension number from " f"the container/group attribute named '{attr_name}'! " "Take care that the part after the prefix " f"('{attr_prefix}') can be converted to an integer." ) from err # Make sure its valid if dim_num < 0 or dim_num >= ndim: raise ValueError( f"The dimension number {dim_num:d} extracted from " f"attribute '{attr_name}' exceeds the given rank {ndim}!" ) # Make sure the attribute value is a string if isinstance(attr_val, np.ndarray): attr_val = attr_val.item() # ... assuming already decoded. if not isinstance(attr_val, str): raise TypeError( "Dimension names need be strings, but the " f"attribute '{attr_name}' provided {type(attr_val)} " f"with value '{attr_val}'!" ) # All good now. Write it to the dim name list dim_names[dim_num] = attr_val # Done. return tuple(dim_names)
# Coordinate extraction ------------------------------------------------------- # Low level extractor functions ............................................... # These should all take the coordinate arguments as the first argument and then # accept arbitrary keyword-only arguments, of which some may be used to # determine the coordinate values
[docs]def _coords_start_and_step( cargs, *, data_shape: tuple, dim_num: int, **__ ) -> Iterable[int]: """Interpret as integer start and step of range expression and use the length of the data dimension as number of steps """ start, step = cargs stop = start + (step * data_shape[dim_num]) return range(int(start), int(stop), int(step))
[docs]def _coords_trivial( _, *, data_shape: tuple, dim_num: int, **__ ) -> Iterable[int]: """Returns trivial coordinates for the given dimension by creating a range iterator from the selected data shape. """ return range(data_shape[dim_num])
[docs]def _coords_scalar(coord, **__) -> List[TCoord]: """Returns a single, scalar coordinate, i.e.: list of length 1""" if isinstance(coord, np.ndarray): coord = coord.item() # If it is not iterable, it is a scalar now. Expected return value is a # list, though, so convert it. if not is_iterable(coord): coord = [coord] if len(coord) != 1: raise ValueError(f"Expected scalar coordinate, but got: {coord}!") # Make sure it is a list, not a tuple return list(coord)
[docs]def _coords_linked(cargs, *, link_anchor_obj, **__) -> Link: """Creates a Link object which is to be used for coordinates""" # Need to parse potential numpy array arguments to string if isinstance(cargs, np.ndarray): cargs = cargs.item() # Problem: at this point, the container to be linked to might not # know its full path within the data tree. Thus, coordinate # resolution has to be postponed until it is clear. # For that reason, create a link object, which can forward to an # actual container once the coordinates are applied... return StrongLink(anchor=link_anchor_obj, rel_path=cargs)
# Map of extractors ........................................................... # fmt: off COORD_EXTRACTORS = { "values": lambda cargs, **__: cargs, "range": lambda cargs, **__: range(*cargs), "arange": lambda cargs, **__: np.arange(*cargs), "linspace": lambda cargs, **__: np.linspace(*cargs), "logspace": lambda cargs, **__: np.logspace(*cargs), "start_and_step": _coords_start_and_step, "trivial": _coords_trivial, "scalar": _coords_scalar, "linked": _coords_linked, } # fmt: on # Actual (high-level) extraction functions ....................................
[docs]def extract_coords_from_attrs( obj: Union[AbstractDataContainer, np.ndarray], *, dims: Tuple[Union[str, None]], strict: bool, coords_attr_prefix: str, default_mode: str, mode_attr_prefix: str = None, attrs: dict = None, ) -> TCoordsDict: """Extract coordinates from the given object's attributes. This is done by iterating over the given ``dims`` and then looking for attributes that are prefixed with ``coords_attr_prefix`` and ending in the name of the dimension, e.g. attributes like ``coords__time``. The value of that attribute is then evaluated according to a so-called attribute ``mode``. By default, the mode set by ``default_mode`` is used, but it can be set explicitly for each dimension by the ``mode_attr_prefix`` parameter. The resulting number of coordinates for a dimension always need to match the length of that dimension. However, the corresponding error can only be raised once this information is applied. Args: obj (Union[AbstractDataContainer, numpy.ndarray]): The object to retrieve the attributes from (via the ``attrs`` attribute). If the ``attrs`` *argument* is given, will use those instead. It is furthermore expected that this object specifies the shape of the numerical data the coordinates are to be generated for by providing a ``shape`` property. This is possible with :py:class:`~dantro.containers.numeric.NumpyDataContainer` and derived classes. dims (Tuple[Union[str, None]]): Sequence of dimension names; this may also contain None's, which are ignored for coordinates. strict (bool): Whether to use strict checking, where no additional coordinate-specifying attributes are allowed. coords_attr_prefix (str): The attribute name prefix for coordinate specifications default_mode (str): The default coordinate extraction mode. Available modes: * ``values``: the explicit values (iterable) to use for coordinates * ``range``: range arguments * ``arange``: np.arange arguments * ``linspace``: np.linspace arguments * ``logspace``: np.logspace arguments * ``trivial``: The trivial indices. This does not require a value for the coordinate argument. * ``scalar``: makes sure only a single coordinate is provided * ``start_and_step``: the start and step values of an integer range expression; the stop value is deduced by looking at the length of the corresponding dimension. This is then passed to the python range function as (start, stop, step) * ``linked``: Load the coordinates from a linked object within the tree; this works only if ``link_anchor_obj`` is part of a data tree at the point of coordinate resolution! mode_attr_prefix (str, optional): The attribute name prefix that can be used to specify a non-default extraction mode. If not given, the default mode will be used. attrs (dict, optional): If given, these attributes will be used instead of attempting to extract attributes from ``obj``. Returns: TCoordsDict: The ``(dim_name -> coords)`` mapping Raises: ValueError: On invalid coordinates mode or (with strict attribute checking) on superfluous coordinate-setting attributes. """ def get_coord(attrs: dict, dim_name: str, dim_num: int): """Determines coordinate values for a single dimension.""" # The argument the coordinate will be determined from; may be None. cargs = attrs.get(coords_attr_prefix + dim_name) # Determine the mode to interpret the attribute values mode = default_mode if mode_attr_prefix: mode = attrs.get(mode_attr_prefix + dim_name, default_mode) # Might have to process the mode, e.g. because it is an array-type if isinstance(mode, np.ndarray): mode = mode.item() # Check if the mode is available if mode not in COORD_EXTRACTORS: _mode_attr_name = mode_attr_prefix + dim_name _extraction_modes = ", ".join(COORD_EXTRACTORS.keys()) raise ValueError( f"Invalid mode '{mode}' to interpret coordinate " "attribute values! Check whether a mode " f"attribute '{_mode_attr_name}' is set. " f"Available modes: {_extraction_modes}" ) # Invoke the method, passing on available arguments try: return COORD_EXTRACTORS[mode]( cargs, dim_num=dim_num, data_shape=obj.shape, link_anchor_obj=obj, ) except Exception as exc: raise type(exc)( "Failed extracting coordinates for dimension " f"'{dim_name}' via extraction mode '{mode}'! {exc}" ) from exc # If necessary, get the attributes from the given object if not attrs: attrs = obj.attrs # The to-be-populated mapping from dimension names to coordinates. coords_map = dict() # Get the coordinates for all labelled (!) dimensions; if not labelled, no # coordinate is expected. for dim_num, dim_name in enumerate(dims): if dim_name is None: continue # Is labelled; try to extract a coordinate and store it if found coords = get_coord(attrs, dim_name, dim_num) if coords is not None: coords_map[dim_name] = coords # Done. # Without strict attribute checking, can already return here if not strict: return coords_map # Need to do strict attribute checking ... # Determine the prefixes that are to be checked prefixes = [coords_attr_prefix] if mode_attr_prefix: prefixes.append(mode_attr_prefix) # Check all attribute name and prefix combinations for attr_name, prefix in product(attrs.keys(), prefixes): # See whether there are matching attributes that were not already # extracted above, i.e. appear as keys in the map if ( attr_name.startswith(prefix) and attr_name[len(prefix) :] not in coords_map ): _dims_avail = ", ".join([d for d in dims if d is not None]) _prefixes = ", ".join(prefixes) raise ValueError( f"Got superfluous attribute '{attr_name}' that " "does not match any of the available names of " f"labelled dimensions: {_dims_avail}. Valid " f"attribute prefixes: {_prefixes}. Either remove " "the attribute or turn strict attribute checking " "off." ) # All good. Return the coordinate map. return coords_map
[docs]def extract_coords_from_name( obj: AbstractDataContainer, *, dims: TDims, separator: str, attempt_conversion: bool = True, ) -> TCoordsDict: """Given a container or group, extract the coordinates from its name. The name of the object may be a ``separator``-separated string, where each segment contains the coordinate value for one dimension. This function assumes that the coordinates for each dimension are scalar. Thus, the values of the returned dict are sequences of length 1. Args: obj (AbstractDataContainer): The object to get the coordinates of by inspecting its name. dims (TDims): The dimension names corresponding to the coordinates that are expected to be found in the object's name. separator (str): The separtor to apply on the name. attempt_conversion (bool, optional): Whether to attempt conversion of the string value to a numerical type. Returns: TCoordsDict: The coordinate dict, i.e. a mapping from the external dimension names to the coordinate values. In this case, there can only a single value for each dimension! Raises: ValueError: Raised upon failure to extract external coordinates: On ``ext_dims`` evaluating to False, f coordinates were missing for any of the external dimensions, if the number of coordinates extracted from the name did not match the number of external dimensions, if any of the strings extracted from the object's name were empty. """ # Split the string and make some basic checks. coords = if len(coords) != len(dims): raise ValueError( "Number of coordinates extracted from the name of " f"{obj.logstr} does not match the number of expected " f"dimensions! Parsed coordinates: {coords}. " f"Dimensions: {dims}" ) if not all(coords): raise ValueError( "One or more of the coordinates extracted from the " f"name of {obj.logstr} were empty! Got: {coords}" ) # Build the dict, attempting conversion of the objects. coords = { dim_name: [try_conversion(c_val) if attempt_conversion else c_val] for dim_name, c_val in zip(dims, coords) } return coords
[docs]def extract_coords_from_data( obj: AbstractDataContainer, *, dims: TDims ) -> TCoordsDict: """Tries to extract the coordinates from the data of the given container or group. For that purpose, the ``obj`` needs to support the ``coords`` property. Args: obj (AbstractDataContainer): The object that holds the data from which the coordinates are to be extracted. dims (TDims): The sequence of dimension names for which the coordinates are to be extracted. """ return {dim_name: list(obj.coords[dim_name].values) for dim_name in dims}
# .............................................................................
[docs]def extract_coords( obj: AbstractDataContainer, *, mode: str, dims: TDims, use_cache: bool = False, cache_prefix: str = "__coords_cache_", **kwargs, ) -> TCoordsDict: """Wrapper around the more specific coordinate extraction functions. .. note:: This function does not support the extraction of non-dimension coordinates. Args: obj (AbstractDataContainer): The object from which to extract the coordinates. mode (str): Which mode to use for extraction. Can be: * ``name``: Use the name of the object * ``attrs``: Use the attributes of the object * ``data``: Use the data of the object dims (TDims): The dimensions for which the attributes are to be extracted. All dimension names given here are expected to be found. use_cache (bool, optional): Whether to use the object's attributes to write an extracted value to the cache and read it, if available. cache_prefix (str, optional): The prefix to use for writing the cache entries to the object attributes. Will suffix this with ``dims`` and ``coords`` and store the respective data there. **kwargs: Passed on to the actual coordinates extraction method. Raises: NotImplementedError: If ``use_cache`` is set """ EXTRACTORS = dict( name=extract_coords_from_name, attrs=extract_coords_from_attrs, data=extract_coords_from_data, ) if use_cache: # TODO Read from cache raise NotImplementedError("use_cache") # Get the extraction function if mode not in EXTRACTORS: _extraction_modes = ", ".join(EXTRACTORS.keys()) raise ValueError( f"Invalid extraction mode '{mode}'! Valid modes: " f"{_extraction_modes}" ) extraction_func = EXTRACTORS[mode] try: coords = extraction_func(obj, dims=dims, **kwargs) except Exception as exc: raise type(exc)( f"Failed extracting coordinates of {obj} using " f"mode '{mode}': {exc}" ) from exc # TODO Write to cache return coords