Source code for bambi.families.family

from typing import Dict, Union

import numpy as np
import pymc as pm
import xarray as xr

from bambi.families.link import Link
from bambi.utils import get_auxiliary_parameters, get_aliased_name


[docs]class Family:
    """A specification of model family.

    Parameters
    ----------
    name : str
        The name of the family. It can be any string.
    likelihood : Likelihood
        A ``bambi.families.Likelihood`` instance specifying the model likelihood function.
    link : Union[str, Dict[str, Union[str, Link]]]
        The link function that's used for every parameter in the likelihood function.
        Keys are the names of the parameters and values are the link functions.
        These can be a ``str`` with a name or a ``bambi.families.Link`` instance.
        The link function transforms the linear predictors.

    Examples
    --------
    >>> import bambi as bmb

    Replicate the Gaussian built-in family.

    >>> sigma_prior = bmb.Prior("HalfNormal", sigma=1)
    >>> likelihood = bmb.Likelihood("Gaussian", params=["mu", "sigma"], parent="mu")
    >>> family = bmb.Family("gaussian", likelihood, "identity")
    >>> bmb.Model("y ~ x", data, family=family, priors={"sigma": sigma_prior})

    Replicate the Bernoulli built-in family.

    >>> likelihood = bmb.Likelihood("Bernoulli", parent="p")
    >>> family = bmb.Family("bernoulli", likelihood, "logit")
    >>> bmb.Model("y ~ x", data, family=family)
    """

    SUPPORTED_LINKS = [
        "cloglog",
        "identity",
        "inverse_squared",
        "inverse",
        "log",
        "logit",
        "probit",
        "softmax",
        "tan_2",
    ]

    def __init__(self, name, likelihood, link: Union[str, Dict[str, Union[str, Link]]]):
        self.name = name
        self.likelihood = likelihood
        self.link = link
        self.default_priors = {}

    @property
    def link(self):
        return self._link

    @link.setter
    def link(self, value):
        # The name of the link function. It's applied to the parent parameter of the likelihood
        if isinstance(value, (str, Link)):
            value = {self.likelihood.parent: value}
        links = {}
        for name, link in value.items():
            if isinstance(link, str):
                link = self.check_string_link(link, name)
            elif isinstance(link, Link):
                pass
            else:
                raise ValueError("'.link' must be set to a string or a Link instance.")
            links[name] = link
        self._link = links

    def check_string_link(self, link_name, param_name):
        # When you instantiate Family directly
        if isinstance(self.SUPPORTED_LINKS, list):
            supported_links = self.SUPPORTED_LINKS
        else:
            supported_links = self.SUPPORTED_LINKS[param_name]

        if not link_name in supported_links:
            raise ValueError(
                f"Link '{link_name}' cannot be used for '{param_name}' with family "
                f"'{self.name}'"
            )
        return Link(link_name)

[docs]    def set_default_priors(self, priors):
        """Set default priors for non-parent parameters

        Parameters
        ----------
        priors : dict
            The keys are the names of non-parent parameters and the values are their default priors.
        """
        auxiliary_parameters = get_auxiliary_parameters(self)
        priors = {k: v for k, v in priors.items() if k in auxiliary_parameters}
        self.default_priors.update(priors)

[docs]    def posterior_predictive(self, model, posterior, **kwargs):
        """Get draws from the posterior predictive distribution

        This function works for almost all the families. It grabs the draws for the parameters
        needed in the response distribution, and then gets samples from the posterior predictive
        distribution using `pm.draw()`. It won't work when the response distribution requires
        parameters that are not available in `posterior`.

        Parameters
        ----------
        model : bambi.Model
            The model
        posterior : xr.Dataset
            The xarray dataset that contains the draws for all the parameters in the posterior.
            It must contain the parameters that are needed in the distribution of the response, or
            the parameters that allow to derive them.
        kwargs :
            Parameters that are used to get draws but do not appear in the posterior object or
            other configuration parameters.
            For instance, the 'n' in binomial models and multinomial models.

        Returns
        -------
        xr.DataArray
            A data array with the draws from the posterior predictive distribution
        """
        response_dist = get_response_dist(model.family)
        params = model.family.likelihood.params
        response_aliased_name = get_aliased_name(model.response_component.response_term)

        kwargs.pop("data", None)  # Remove the 'data' kwarg
        dont_reshape = kwargs.pop("dont_reshape", [])
        output_dataset_list = []

        # In the posterior xr.Dataset we need to consider aliases,
        # but we don't use aliases when passing kwargs to the PyMC distribution.
        for param in params:
            # Extract posterior draws for the parent parameter
            if param == model.family.likelihood.parent:
                component = model.components[model.response_name]
                var_name = response_aliased_name + "_mean"
                kwargs[param] = posterior[var_name].to_numpy()
                output_dataset_list.append(posterior[var_name])
            else:
                # Extract posterior draws for non-parent parameters
                component = model.components[param]
                if component.alias:
                    var_name = component.alias
                else:
                    var_name = f"{response_aliased_name}_{param}"

                if var_name in posterior:
                    kwargs[param] = posterior[var_name].to_numpy()
                    output_dataset_list.append(posterior[var_name])
                elif hasattr(component, "prior") and isinstance(component.prior, (int, float)):
                    kwargs[param] = np.asarray(component.prior)
                else:
                    raise ValueError(
                        "Non-parent parameter not found in posterior."
                        "This error shouldn't have happened!"
                    )

        # Determine the array with largest number of dimensions
        ndims_max = max(x.ndim for x in kwargs.values())

        # Append a dimension when needed. Required to make `pm.draw()` work.
        # However, some distributions like Multinomial, require some parameters to be of a smaller
        # dimension than others (n.ndim <= p.ndim - 1) so we don't reshape those.
        for key, values in kwargs.items():
            if key in dont_reshape:
                continue
            kwargs[key] = expand_array(values, ndims_max)

        if hasattr(model.family, "transform_kwargs"):
            kwargs = model.family.transform_kwargs(kwargs)

        output_array = pm.draw(response_dist.dist(**kwargs))
        output_coords_all = xr.merge(output_dataset_list).coords

        coord_names = ["chain", "draw", response_aliased_name + "_obs"]
        is_multivariate = hasattr(model.family, "KIND") and model.family.KIND == "Multivariate"

        if is_multivariate:
            coord_names.append(response_aliased_name + "_dim")
        elif hasattr(model.family, "create_extra_pps_coord"):
            new_coords = model.family.create_extra_pps_coord()
            coord_names.append(response_aliased_name + "_dim")
            output_coords_all[response_aliased_name + "_dim"] = new_coords

        output_coords = {}
        for coord_name in coord_names:
            output_coords[coord_name] = output_coords_all[coord_name]
        return xr.DataArray(output_array, coords=output_coords)

    def __str__(self):
        msg_list = [f"Family: {self.name}", f"Likelihood: {self.likelihood}", f"Link: {self.link}"]
        return "\n".join(msg_list)

    def __repr__(self):
        return self.__str__()


def get_response_dist(family):
    """Get the PyMC distribution for the response

    Parameters
    ----------
    family : bambi.Family
        The family for which the response distribution is wanted

    Returns
    -------
    pm.Distribution
        The response distribution
    """
    mapping = {"Cumulative": pm.Categorical, "StoppingRatio": pm.Categorical}

    if family.likelihood.dist:
        dist = family.likelihood.dist
    elif family.likelihood.name in mapping:
        dist = mapping[family.likelihood.name]
    else:
        dist = getattr(pm, family.likelihood.name)
    return dist


def expand_array(x, ndim):
    """Add dimensions to an array to match the number of desired dimensions

    If x.ndim < ndim, it adds ndim - x.ndim dimensions after the last axis. If not, it is left
    untouched.

    For example, if we have a normal regression model with n = 1000, chains = 2, and draws = 500
    the shape of the draws of mu will be (2, 500, 1000) but the shape of the draws of sigma will be
    (2, 500). This function makes sure the shape of the draws of sigma is (2, 500, 1) which is
    comaptible with (2, 500, 1000).

    Parameters
    ----------
    x : np.ndarray
        The array
    ndim : int
        The number of desired dimensions

    Returns
    -------
    np.ndarray
        The array with the expanded dimensions
    """
    if x.ndim == ndim:
        return x
    dims_to_expand = tuple(range(ndim - 1, x.ndim - 1, -1))
    return np.expand_dims(x, dims_to_expand)