Source code for dantro.utils.coords

"""This module provides coordinate parsing capabilities."""

import logging
from itertools import product
from typing import Dict, Hashable, Iterable, List, Sequence, Tuple, Union

import numpy as np

from ..abc import AbstractDataContainer
from ..tools import is_iterable, try_conversion
from .link import Link, StrongLink

log = logging.getLogger(__name__)

# Some type definitions .......................................................
# NOTE ``Tuple[str]`` would denote a tuple of exactly one string; the ellipsis
#      form is needed to describe a dimension sequence of arbitrary length.
TDims = Tuple[str, ...]
"""A dimension sequence type: a tuple of dimension names of any length"""

TCoord = Union[int, float, str, Hashable]
"""A single coordinate value type"""

TCoords = Sequence[TCoord]
"""A sequence of coordinate values"""

TCoordsDict = Dict[str, TCoords]
"""Several coordinates, bundled into a map of dimension name to coordinates"""


# Dimension name extraction ---------------------------------------------------



[docs]
def extract_dim_names(
    attrs: dict, *, ndim: int, attr_name: str, attr_prefix: str
) -> TDims:
    """Extract dimension names from the given attributes.

    This can be done in two ways:
        1. A list of dimension names was specified in an attribute with the
           name specified by the ``attr_name`` argument
        2. One by one via attributes that start with the string prefix defined
           in ``attr_prefix``. This can be used if not all dimension names are
           available.

    Both ways may be combined: names given one by one (option 2) are applied
    *after* the names from the list (option 1) and overwrite them.

    Args:
        attrs (dict): The dict-like object to read attributes from
        ndim (int): The expected rank of the dimension names
        attr_name (str): The key to look for in ``attrs`` that would give a
            sequence of the dimension names.
        attr_prefix (str): The prefix to look for in the keys of the ``attrs``
            that would specify the name of a single dimension. The part of
            the key after the prefix needs to be convertible to an integer,
            which is interpreted as the dimension number.

    Returns:
        Tuple[Union[str, None]]: The dimension names, of length ``ndim``.
            None is used as placeholder for dimensions without a name.

    Raises:
        TypeError: Attribute found at ``attr_name`` was a string, was not
            iterable or was not a sequence of strings; or a prefixed attribute
            did not hold a string.
        ValueError: Length mismatch of attribute found at ``attr_name``
            and the rank; or a prefixed attribute with a dimension number that
            cannot be parsed or lies outside ``[0, ndim)``.

    """
    # The dimension names sequence to populate. None's are placeholders which
    # denote that no information was found for this dimension.
    dim_names = [None] * ndim

    # Option 1: all dimension names given directly as a sequence
    if attr_name in attrs:
        dims_attr = attrs[attr_name]

        # Make sure it is an iterable of strings and of the right length.
        # A plain string is rejected explicitly, because iterating over it
        # would silently yield single characters as dimension names.
        if isinstance(dims_attr, str):
            raise TypeError(
                f"Attribute '{attr_name}' needs to be a sequence of "
                f"strings, but not directly a string! Got: {repr(dims_attr)}"
            )

        elif not is_iterable(dims_attr):
            raise TypeError(
                f"Attribute '{attr_name}' needs to be an iterable, but was "
                f"{type(dims_attr)} with value '{dims_attr}'!"
            )

        elif len(dims_attr) != ndim:
            raise ValueError(
                "Number of given dimension names does not match "
                f"the given rank of {ndim}! Names given: {dims_attr}"
            )

        # Data seems ok.
        # Create the sequence of dimension names, potentially needing to do
        # some further processing ...
        for dim_num, dim_name in enumerate(dims_attr):
            # Might be a 1-sized numpy array; resolve that to a Python object
            if isinstance(dim_name, np.ndarray):
                dim_name = dim_name.item()

            if not isinstance(dim_name, str):
                raise TypeError(
                    "Dimension names need to be strings, got "
                    f"{type(dim_name)} with value '{dim_name}' for "
                    f"dimension {dim_num}!"
                )

            dim_names[dim_num] = dim_name

    # Option 2: individually specified dimension names. These are applied in
    # any case and overwrite the names that might have been set above.
    # NOTE The loop variable is deliberately *not* called ``attr_name`` in
    #      order to not shadow the function argument of the same name.
    for key, attr_val in attrs.items():
        # Keys that are not strings cannot carry the prefix; skip them rather
        # than failing on the ``startswith`` call.
        if not isinstance(key, str) or not key.startswith(attr_prefix):
            continue

        # Extract the integer dimension number
        try:
            dim_num = int(key[len(attr_prefix) :])

        except ValueError as err:
            raise ValueError(
                "Could not extract the dimension number from "
                f"the container/group attribute named '{key}'! "
                "Take care that the part after the prefix "
                f"('{attr_prefix}') can be converted to an integer."
            ) from err

        # Make sure it is valid, i.e. within [0, ndim)
        if dim_num < 0 or dim_num >= ndim:
            raise ValueError(
                f"The dimension number {dim_num:d} extracted from "
                f"attribute '{key}' exceeds the given rank {ndim}!"
            )

        # Make sure the attribute value is a string
        if isinstance(attr_val, np.ndarray):
            attr_val = attr_val.item()  # ... assuming already decoded.

        if not isinstance(attr_val, str):
            raise TypeError(
                "Dimension names need be strings, but the "
                f"attribute '{key}' provided {type(attr_val)} "
                f"with value '{attr_val}'!"
            )

        # All good now. Write it to the dim name list
        dim_names[dim_num] = attr_val

    # Done.
    return tuple(dim_names)



# Coordinate extraction -------------------------------------------------------
# Low level extractor functions ...............................................
# These should all take the coordinate arguments as the first argument and then
# accept arbitrary keyword-only arguments, of which some may be used to
# determine the coordinate values



[docs]
def _coords_start_and_step(
    cargs, *, data_shape: tuple, dim_num: int, **__
) -> Iterable[int]:
    """Interpret as integer start and step of range expression and use the
    length of the data dimension as number of steps.

    Args:
        cargs: A pair ``(start, step)``; both values are converted to integers
        data_shape (tuple): The shape of the data the coordinates belong to
        dim_num (int): The index of the dimension within ``data_shape``
        **__: Ignored

    Returns:
        Iterable[int]: ``range(start, start + step * length, step)`` with
            ``length = data_shape[dim_num]``
    """
    start, step = cargs

    # Convert to integers *before* computing the stop value. Otherwise, a
    # non-integer step would enter the stop value un-truncated and the range
    # would no longer have as many entries as the dimension is long.
    start, step = int(start), int(step)
    stop = start + step * int(data_shape[dim_num])
    return range(start, stop, step)




[docs]
def _coords_trivial(
    _, *, data_shape: tuple, dim_num: int, **__
) -> Iterable[int]:
    """Generates the trivial (index) coordinates ``0, 1, ..., N-1`` for the
    selected dimension, where ``N`` is that dimension's length.

    The first positional argument (the coordinate argument) and any further
    keyword arguments are ignored.
    """
    dim_length = data_shape[dim_num]
    return range(dim_length)




[docs]
def _coords_scalar(coord, **__) -> List[TCoord]:
    """Returns a single, scalar coordinate, i.e.: list of length 1

    Args:
        coord: The coordinate value. May be a scalar (including strings and
            bytes), a 1-sized numpy array, or an iterable of length 1.
        **__: Ignored

    Returns:
        List[TCoord]: A list containing exactly one coordinate value

    Raises:
        ValueError: If an iterable of length other than 1 was given
    """
    if isinstance(coord, np.ndarray):
        coord = coord.item()

    # Strings and bytes are scalar coordinate values, even though they can be
    # iterated over character by character. They need to be recognised as
    # scalars here; otherwise, the length check below would reject every
    # multi-character string and ``list`` would split the value apart.
    # Everything else that is not iterable is a scalar as well.
    if isinstance(coord, (str, bytes)) or not is_iterable(coord):
        return [coord]

    if len(coord) != 1:
        raise ValueError(f"Expected scalar coordinate, but got: {coord}!")

    # Make sure it is a list, not a tuple
    return list(coord)




[docs]
def _coords_linked(cargs, *, link_anchor_obj, **__) -> Link:
    """Builds the link object that stands in for the actual coordinates.

    The object that is linked to might not know its full path within the data
    tree yet. Coordinate resolution is therefore deferred: a link relative to
    ``link_anchor_obj`` is returned, which can forward to the actual container
    once the coordinates are applied.

    Args:
        cargs: The relative path to the linked object; a numpy array is
            resolved to its single item first.
        link_anchor_obj: The object the relative path starts from
        **__: Ignored

    Returns:
        Link: A strong link anchored at ``link_anchor_obj``
    """
    # Potential numpy array arguments are unpacked to their (single) item
    rel_path = cargs.item() if isinstance(cargs, np.ndarray) else cargs

    return StrongLink(anchor=link_anchor_obj, rel_path=rel_path)



# Map of extractors ...........................................................
# fmt: off
COORD_EXTRACTORS = {
    # Use the coordinate argument as it is
    "values": lambda cargs, **__: cargs,
    # Unpack the coordinate argument into a range-like constructor
    "range": lambda cargs, **__: range(*cargs),
    "arange": lambda cargs, **__: np.arange(*cargs),
    "linspace": lambda cargs, **__: np.linspace(*cargs),
    "logspace": lambda cargs, **__: np.logspace(*cargs),
    # Extractors that are implemented as separate functions
    "start_and_step": _coords_start_and_step,
    "trivial": _coords_trivial,
    "scalar": _coords_scalar,
    "linked": _coords_linked,
}
# fmt: on

# Actual (high-level) extraction functions ....................................



[docs]
def extract_coords_from_attrs(
    obj: Union[AbstractDataContainer, np.ndarray],
    *,
    dims: Tuple[Union[str, None]],
    strict: bool,
    coords_attr_prefix: str,
    default_mode: str,
    mode_attr_prefix: str = None,
    attrs: dict = None,
) -> TCoordsDict:
    """Extract coordinates from the given object's attributes.

    This is done by iterating over the given ``dims`` and then looking
    for attributes that are prefixed with ``coords_attr_prefix`` and ending in
    the name of the dimension, e.g. attributes like ``coords__time``.

    The value of that attribute is then evaluated according to a so-called
    attribute ``mode``. By default, the mode set by ``default_mode`` is used,
    but it can be set explicitly for each dimension by the ``mode_attr_prefix``
    parameter.

    The resulting number of coordinates for a dimension always need to match
    the length of that dimension. However, the corresponding error can only be
    raised once this information is applied.

    Args:
        obj (Union[AbstractDataContainer, numpy.ndarray]): The object to
            retrieve the attributes from (via the ``attrs`` attribute). If the
            ``attrs`` *argument* is given, will use those instead.
            It is furthermore expected that this object specifies the shape of
            the numerical data the coordinates are to be generated for by
            providing a ``shape`` property. This is possible with
            :py:class:`~dantro.containers.numeric.NumpyDataContainer` and
            derived classes.
        dims (Tuple[Union[str, None]]): Sequence of dimension names; this
            may also contain None's, which are ignored for coordinates.
        strict (bool): Whether to use strict checking, where no additional
            coordinate-specifying attributes are allowed.
        coords_attr_prefix (str): The attribute name prefix for coordinate
            specifications
        default_mode (str): The default coordinate extraction mode. Available
            modes:

            * ``values``: the explicit values (iterable) to use for coordinates
            * ``range``: range arguments
            * ``arange``: np.arange arguments
            * ``linspace``: np.linspace arguments
            * ``logspace``: np.logspace arguments
            * ``trivial``: The trivial indices. This does not require a value
              for the coordinate argument.
            * ``scalar``: makes sure only a single coordinate is provided
            * ``start_and_step``: the start and step values of an integer range
              expression; the stop value is deduced by looking at the length of
              the corresponding dimension. This is then passed to the python
              range function as (start, stop, step)
            * ``linked``: Load the coordinates from a linked object within the
              tree; this works only if ``link_anchor_obj`` is part of a data
              tree at the point of coordinate resolution!

        mode_attr_prefix (str, optional): The attribute name prefix that can
            be used to specify a non-default extraction mode. If not given, the
            default mode will be used.
        attrs (dict, optional): If given (i.e.: not None), these attributes
            will be used instead of attempting to extract attributes from
            ``obj``. This includes an empty dict.

    Returns:
        TCoordsDict: The ``(dim_name -> coords)`` mapping. Dimensions for
            which the extraction yielded None are not part of the mapping.

    Raises:
        ValueError: On invalid coordinates mode or (with strict attribute
            checking) on superfluous coordinate-setting attributes.
        Exception: Errors raised during extraction are re-raised using the
            same exception type, with the message extended by the dimension
            name and extraction mode.

    """

    def get_coord(attrs: dict, dim_name: str, dim_num: int):
        """Determines coordinate values for a single dimension."""
        # The argument the coordinate will be determined from; may be None.
        cargs = attrs.get(coords_attr_prefix + dim_name)

        # Determine the mode to interpret the attribute values
        mode = default_mode
        if mode_attr_prefix:
            mode = attrs.get(mode_attr_prefix + dim_name, default_mode)

        # Might have to process the mode, e.g. because it is an array-type
        if isinstance(mode, np.ndarray):
            mode = mode.item()

        # Check if the mode is available
        if mode not in COORD_EXTRACTORS:
            _extraction_modes = ", ".join(COORD_EXTRACTORS.keys())

            # Without a mode attribute prefix, there is no mode attribute to
            # point to; the invalid mode then is the default mode. Building
            # the attribute name regardless would fail with a TypeError and
            # mask the actual (ValueError) problem.
            if mode_attr_prefix:
                _mode_attr_name = mode_attr_prefix + dim_name
                _hint = (
                    "Check whether a mode "
                    f"attribute '{_mode_attr_name}' is set. "
                )
            else:
                _hint = (
                    "No mode attribute prefix was given, thus this is the "
                    "specified default mode. "
                )

            raise ValueError(
                f"Invalid mode '{mode}' to interpret coordinate "
                f"attribute values! {_hint}"
                f"Available modes: {_extraction_modes}"
            )

        # Invoke the method, passing on available arguments
        try:
            return COORD_EXTRACTORS[mode](
                cargs,
                dim_num=dim_num,
                data_shape=obj.shape,
                link_anchor_obj=obj,
            )

        except Exception as exc:
            # Re-raise with the same type, but more context information
            raise type(exc)(
                "Failed extracting coordinates for dimension "
                f"'{dim_name}' via extraction mode '{mode}'! {exc}"
            ) from exc

    # If necessary, get the attributes from the given object.
    # NOTE Need to check against None here: an explicitly given empty dict is
    #      a valid argument and must not lead to the object's attributes being
    #      used instead.
    if attrs is None:
        attrs = obj.attrs

    # The to-be-populated mapping from dimension names to coordinates.
    coords_map = dict()

    # Get the coordinates for all labelled (!) dimensions; if not labelled, no
    # coordinate is expected.
    for dim_num, dim_name in enumerate(dims):
        if dim_name is None:
            continue

        # Is labelled; try to extract a coordinate and store it if found
        coords = get_coord(attrs, dim_name, dim_num)
        if coords is not None:
            coords_map[dim_name] = coords

    # Done.
    # Without strict attribute checking, can already return here
    if not strict:
        return coords_map

    # Need to do strict attribute checking ...
    # Determine the prefixes that are to be checked
    prefixes = [coords_attr_prefix]
    if mode_attr_prefix:
        prefixes.append(mode_attr_prefix)

    # Check all attribute name and prefix combinations
    for attr_name, prefix in product(attrs.keys(), prefixes):
        # See whether there are matching attributes that were not already
        # extracted above, i.e. appear as keys in the map
        if (
            attr_name.startswith(prefix)
            and attr_name[len(prefix) :] not in coords_map
        ):
            _dims_avail = ", ".join(d for d in dims if d is not None)
            _prefixes = ", ".join(prefixes)

            raise ValueError(
                f"Got superfluous attribute '{attr_name}' that "
                "does not match any of the available names of "
                f"labelled dimensions: {_dims_avail}. Valid "
                f"attribute prefixes: {_prefixes}. Either remove "
                "the attribute or turn strict attribute checking "
                "off."
            )

    # All good. Return the coordinate map.
    return coords_map




[docs]
def extract_coords_from_name(
    obj: AbstractDataContainer,
    *,
    dims: TDims,
    separator: str,
    attempt_conversion: bool = True,
) -> TCoordsDict:
    """Extracts scalar coordinates from the name of a container or group.

    The object's name is split at each occurrence of ``separator``; the
    resulting segments are associated, in order, with the dimension names given
    in ``dims``. Each segment is regarded as the single coordinate value of its
    dimension, which is why all values of the returned dict are lists of
    length 1.

    Args:
        obj (AbstractDataContainer): The object whose ``name`` is inspected
        dims (TDims): The dimension names, in the order in which their
            coordinates appear in the object's name
        separator (str): The separator at which to split the name
        attempt_conversion (bool, optional): If set, each segment is passed
            through ``try_conversion``; otherwise, the segments are kept as
            strings.

    Returns:
        TCoordsDict: Mapping from dimension name to a list that holds the one
            coordinate value of that dimension

    Raises:
        ValueError: If the number of segments in the name does not match the
            number of dimensions or if any of the segments is empty
    """
    segments = obj.name.split(separator)

    # There needs to be exactly one segment per dimension ...
    if len(segments) != len(dims):
        raise ValueError(
            "Number of coordinates extracted from the name of "
            f"{obj.logstr} does not match the number of expected "
            f"dimensions! Parsed coordinates: {segments}. "
            f"Dimensions: {dims}"
        )

    # ... and none of the segments may be empty
    if any(not segment for segment in segments):
        raise ValueError(
            "One or more of the coordinates extracted from the "
            f"name of {obj.logstr} were empty! Got: {segments}"
        )

    # Assemble the mapping, wrapping each (optionally converted) value in a
    # list of length one
    coords = dict()
    for dim_name, segment in zip(dims, segments):
        if attempt_conversion:
            coords[dim_name] = [try_conversion(segment)]
        else:
            coords[dim_name] = [segment]

    return coords




[docs]
def extract_coords_from_data(
    obj: AbstractDataContainer, *, dims: TDims
) -> TCoordsDict:
    """Reads the coordinates directly from the data of the given container or
    group, which requires ``obj`` to provide a ``coords`` property.

    Args:
        obj (AbstractDataContainer): The object holding the data; for each
            dimension name, ``obj.coords[dim_name].values`` is read.
        dims (TDims): The names of the dimensions to extract coordinates for

    Returns:
        TCoordsDict: Mapping from dimension name to the list of its
            coordinate values
    """
    coords = dict()
    for dim_name in dims:
        dim_coords = obj.coords[dim_name]
        coords[dim_name] = list(dim_coords.values)

    return coords



# .............................................................................



[docs]
def extract_coords(
    obj: AbstractDataContainer,
    *,
    mode: str,
    dims: TDims,
    use_cache: bool = False,
    cache_prefix: str = "__coords_cache_",
    **kwargs,
) -> TCoordsDict:
    """Dispatches to one of the specialized coordinate extraction functions.

    .. note::

        This function does not support the extraction of non-dimension
        coordinates.

    Args:
        obj (AbstractDataContainer): The object from which to extract the
            coordinates.
        mode (str): The extraction mode, one of:

            * ``name``:   Use the name of the object
            * ``attrs``:  Use the attributes of the object
            * ``data``:   Use the data of the object

        dims (TDims): The names of the dimensions to extract coordinates for;
            passed on to the extraction function.
        use_cache (bool, optional): Meant to control caching of extracted
            values in the object's attributes. Not implemented yet; setting
            this raises an error.
        cache_prefix (str, optional): Meant to be the attribute name prefix
            for cache entries. Currently unused.
        **kwargs: Passed on to the actual coordinates extraction function.

    Returns:
        TCoordsDict: The result of the selected extraction function

    Raises:
        NotImplementedError: If ``use_cache`` is set
        ValueError: If ``mode`` is not one of the available modes
        Exception: Errors raised during extraction are re-raised using the
            same exception type, with the message extended by the object and
            the extraction mode.
    """
    extractors = {
        "name": extract_coords_from_name,
        "attrs": extract_coords_from_attrs,
        "data": extract_coords_from_data,
    }

    if use_cache:
        # TODO Read from cache
        raise NotImplementedError("use_cache")

    # Look up the extraction function for the chosen mode
    extraction_func = extractors.get(mode)
    if extraction_func is None:
        _extraction_modes = ", ".join(extractors)
        raise ValueError(
            f"Invalid extraction mode '{mode}'! Valid modes: "
            f"{_extraction_modes}"
        )

    # Perform the extraction, adding context information upon failure
    try:
        return extraction_func(obj, dims=dims, **kwargs)

    except Exception as exc:
        raise type(exc)(
            f"Failed extracting coordinates of {obj} using "
            f"mode '{mode}': {exc}"
        ) from exc

    # TODO Write to cache