Source code for bambi.families.family

from typing import Dict, Union

import numpy as np
import pymc as pm
import xarray as xr

from bambi.families.link import Link
from bambi.utils import get_auxiliary_parameters, get_aliased_name


[docs]class Family: """A specification of model family. Parameters ---------- name : str The name of the family. It can be any string. likelihood : Likelihood A ``bambi.families.Likelihood`` instance specifying the model likelihood function. link : Union[str, Dict[str, Union[str, Link]]] The link function that's used for every parameter in the likelihood function. Keys are the names of the parameters and values are the link functions. These can be a ``str`` with a name or a ``bambi.families.Link`` instance. The link function transforms the linear predictors. Examples -------- >>> import bambi as bmb Replicate the Gaussian built-in family. >>> sigma_prior = bmb.Prior("HalfNormal", sigma=1) >>> likelihood = bmb.Likelihood("Gaussian", params=["mu", "sigma"], parent="mu") >>> family = bmb.Family("gaussian", likelihood, "identity") >>> bmb.Model("y ~ x", data, family=family, priors={"sigma": sigma_prior}) Replicate the Bernoulli built-in family. >>> likelihood = bmb.Likelihood("Bernoulli", parent="p") >>> family = bmb.Family("bernoulli", likelihood, "logit") >>> bmb.Model("y ~ x", data, family=family) """ SUPPORTED_LINKS = [ "cloglog", "identity", "inverse_squared", "inverse", "log", "logit", "probit", "softmax", "tan_2", ] def __init__(self, name, likelihood, link: Union[str, Dict[str, Union[str, Link]]]): self.name = name self.likelihood = likelihood self.link = link self.default_priors = {} @property def link(self): return self._link @link.setter def link(self, value): # The name of the link function. It's applied to the parent parameter of the likelihood if isinstance(value, (str, Link)): value = {self.likelihood.parent: value} links = {} for name, link in value.items(): if isinstance(link, str): link = self.check_string_link(link, name) elif isinstance(link, Link): pass else: raise ValueError("'.link' must be set to a string or a Link instance.") links[name] = link self._link = links def check_string_link(self, link_name, param_name): # When you instantiate Family directly if isinstance(self.SUPPORTED_LINKS, list): supported_links = self.SUPPORTED_LINKS else: supported_links = self.SUPPORTED_LINKS[param_name] if not link_name in supported_links: raise ValueError( f"Link '{link_name}' cannot be used for '{param_name}' with family " f"'{self.name}'" ) return Link(link_name)
[docs] def set_default_priors(self, priors): """Set default priors for non-parent parameters Parameters ---------- priors : dict The keys are the names of non-parent parameters and the values are their default priors. """ auxiliary_parameters = get_auxiliary_parameters(self) priors = {k: v for k, v in priors.items() if k in auxiliary_parameters} self.default_priors.update(priors)
[docs] def posterior_predictive(self, model, posterior, **kwargs): """Get draws from the posterior predictive distribution This function works for almost all the families. It grabs the draws for the parameters needed in the response distribution, and then gets samples from the posterior predictive distribution using `pm.draw()`. It won't work when the response distribution requires parameters that are not available in `posterior`. Parameters ---------- model : bambi.Model The model posterior : xr.Dataset The xarray dataset that contains the draws for all the parameters in the posterior. It must contain the parameters that are needed in the distribution of the response, or the parameters that allow to derive them. kwargs : Parameters that are used to get draws but do not appear in the posterior object or other configuration parameters. For instance, the 'n' in binomial models and multinomial models. Returns ------- xr.DataArray A data array with the draws from the posterior predictive distribution """ response_dist = get_response_dist(model.family) params = model.family.likelihood.params response_aliased_name = get_aliased_name(model.response_component.response_term) kwargs.pop("data", None) # Remove the 'data' kwarg dont_reshape = kwargs.pop("dont_reshape", []) output_dataset_list = [] # In the posterior xr.Dataset we need to consider aliases, # but we don't use aliases when passing kwargs to the PyMC distribution. for param in params: # Extract posterior draws for the parent parameter if param == model.family.likelihood.parent: component = model.components[model.response_name] var_name = response_aliased_name + "_mean" kwargs[param] = posterior[var_name].to_numpy() output_dataset_list.append(posterior[var_name]) else: # Extract posterior draws for non-parent parameters component = model.components[param] if component.alias: var_name = component.alias else: var_name = f"{response_aliased_name}_{param}" if var_name in posterior: kwargs[param] = posterior[var_name].to_numpy() output_dataset_list.append(posterior[var_name]) elif hasattr(component, "prior") and isinstance(component.prior, (int, float)): kwargs[param] = np.asarray(component.prior) else: raise ValueError( "Non-parent parameter not found in posterior." "This error shouldn't have happened!" ) # Determine the array with largest number of dimensions ndims_max = max(x.ndim for x in kwargs.values()) # Append a dimension when needed. Required to make `pm.draw()` work. # However, some distributions like Multinomial, require some parameters to be of a smaller # dimension than others (n.ndim <= p.ndim - 1) so we don't reshape those. for key, values in kwargs.items(): if key in dont_reshape: continue kwargs[key] = expand_array(values, ndims_max) if hasattr(model.family, "transform_kwargs"): kwargs = model.family.transform_kwargs(kwargs) output_array = pm.draw(response_dist.dist(**kwargs)) output_coords_all = xr.merge(output_dataset_list).coords coord_names = ["chain", "draw", response_aliased_name + "_obs"] is_multivariate = hasattr(model.family, "KIND") and model.family.KIND == "Multivariate" if is_multivariate: coord_names.append(response_aliased_name + "_dim") elif hasattr(model.family, "create_extra_pps_coord"): new_coords = model.family.create_extra_pps_coord() coord_names.append(response_aliased_name + "_dim") output_coords_all[response_aliased_name + "_dim"] = new_coords output_coords = {} for coord_name in coord_names: output_coords[coord_name] = output_coords_all[coord_name] return xr.DataArray(output_array, coords=output_coords)
def __str__(self): msg_list = [f"Family: {self.name}", f"Likelihood: {self.likelihood}", f"Link: {self.link}"] return "\n".join(msg_list) def __repr__(self): return self.__str__()
def get_response_dist(family): """Get the PyMC distribution for the response Parameters ---------- family : bambi.Family The family for which the response distribution is wanted Returns ------- pm.Distribution The response distribution """ mapping = {"Cumulative": pm.Categorical, "StoppingRatio": pm.Categorical} if family.likelihood.dist: dist = family.likelihood.dist elif family.likelihood.name in mapping: dist = mapping[family.likelihood.name] else: dist = getattr(pm, family.likelihood.name) return dist def expand_array(x, ndim): """Add dimensions to an array to match the number of desired dimensions If x.ndim < ndim, it adds ndim - x.ndim dimensions after the last axis. If not, it is left untouched. For example, if we have a normal regression model with n = 1000, chains = 2, and draws = 500 the shape of the draws of mu will be (2, 500, 1000) but the shape of the draws of sigma will be (2, 500). This function makes sure the shape of the draws of sigma is (2, 500, 1) which is comaptible with (2, 500, 1000). Parameters ---------- x : np.ndarray The array ndim : int The number of desired dimensions Returns ------- np.ndarray The array with the expanded dimensions """ if x.ndim == ndim: return x dims_to_expand = tuple(range(ndim - 1, x.ndim - 1, -1)) return np.expand_dims(x, dims_to_expand)