Source code for formulae.terms.variable

import sys

import numpy as np
import pandas as pd

from pandas.api.types import is_categorical_dtype, is_numeric_dtype, is_string_dtype


[docs]class Variable:
    """Representation of a variable in a model Term.

    This class and ``Call`` are the atomic components of a model term.

    Parameters
    ----------
    name: string
        The identifier of the variable.
    level: string
        The level to use as reference. Allows to use the notation ``variable["level"]`` to indicate
        which event should be model as success in binary response models. Can only be used with
        response terms. Defaults to ``None``.
    is_response: bool
        Indicates whether this variable represents a response. Defaults to ``False``.
    """

    def __init__(self, name, level=None, is_response=False):
        self.data = None
        self._intermediate_data = None
        self._type = None
        self.is_response = is_response
        self.name = name
        self.level = level

    def __hash__(self):
        return hash((self._type, self.name, self.level))

    def __eq__(self, other):
        if not isinstance(other, type(self)):
            return False
        return self._type == other._type and self.name == other.name and self.level == other.level

    def __repr__(self):
        return self.__str__()

    def __str__(self):
        return f"{self.__class__.__name__}({self.name}, level='{self.level}')"

    @property
    def var_names(self):
        """Returns the name of the variable as a set.

        This is used to determine which variables of the data set being used are actually used in
        the model. This allows us to subset the original data set and only raise errors regarding
        missing values when the missingness happens in variables used in the model.
        """
        return {self.name}

[docs]    def set_type(self, data_mask):
        """Detemines the type of the variable.

        Looks for the name of the variable in ``data_mask`` and sets the ``.type_`` property to
        ``"numeric"`` or ``"categoric"`` depending on the type of the variable.
        It also stores the result of the intermediate evaluation in ``self._intermediate_data`` to
        save computing time later.

        Parameters
        ----------
        data_mask: pd.DataFrame
            The data frame where variables are taken from
        """
        x = data_mask[self.name]
        if is_numeric_dtype(x):
            self._type = "numeric"
            if self.level is not None:
                raise ValueError("Subset notation can't be used with a numeric variable.")
        elif is_string_dtype(x) or is_categorical_dtype(x):
            self._type = "categoric"
        else:
            raise ValueError(f"Variable is of an unrecognized type ({type(x)}).")
        self._intermediate_data = x

[docs]    def set_data(self, encoding=None):
        """Obtains and stores the final data object related to this variable.

        The result is stored in ``self.data``.

        Parameters
        ----------
        encoding: bool
            Indicates if it uses full or reduced encoding when the type of the variable is
            categoric. Omitted when the variable is numeric.
        """

        try:
            if self._type is None:
                raise ValueError("Variable type is not set.")
            if self._type not in ["numeric", "categoric"]:
                raise ValueError(f"Variable is of an unrecognized type ({self._type}).")
            if self._type == "numeric":
                self.data = self._eval_numeric(self._intermediate_data)
            elif self._type == "categoric":
                self.data = self._eval_categoric(self._intermediate_data, encoding)
        except:
            print("Unexpected error while trying to evaluate a Variable.", sys.exc_info()[0])
            raise

[docs]    def _eval_numeric(self, x):
        """Finishes evaluation of a numeric variable.

        Converts the intermediate values in ``x`` into a numpy array of shape ``(n, 1)``,
        where ``n`` is the number of observations. This method is used both in ``self.set_data``
        and in ``self.eval_new_data``.

        Parameters
        ----------
        x: np.ndarray or pd.Series
            The intermediate values of the variable.

        Returns
        ----------
        result: dict
            A dictionary with keys ``"value"`` and ``"type"``. The first contains the result of the
            evaluation, and the latter is equal to ``"numeric"``.
        """
        if isinstance(x, np.ndarray):
            value = np.atleast_2d(x)
            if x.shape[0] == 1 and x.shape[1] > 1:
                value = value.T
        elif isinstance(x, pd.Series):
            value = np.atleast_2d(x.to_numpy()).T
        else:
            raise ValueError(f"Variable is of an unrecognized type ({type(x)}).")
        return {"value": value, "type": "numeric"}

[docs]    def _eval_categoric(self, x, encoding):
        """Finishes evaluation of a categoric variable.

        Converts the intermediate values in ``x`` into a numpy array of shape ``(n, p)``, where
        ``n`` is the number of observations and ``p`` the number of dummy variables used in the
        numeric representation of the categorical variable.

        Parameters
        ----------
        x: np.ndarray or pd.Series
            The intermediate values of the variable.
        encoding: bool
            Indicates if it uses full or reduced encoding.

        Returns
        ----------
        result: dict
            A dictionary with keys ``"value"``, ``"type"``, ``"levels"``, ``"reference"``, and
            ``"encoding"``. They represent the result of the evaluation, the type, which is
            ``"categoric"``, the levels observed in the variable, the level used as reference when
            using reduced encoding, and whether the encoding is ``"full"`` or ``"reduced"``.
        """
        # If not ordered, we make it ordered.
        if not hasattr(x.dtype, "ordered") or not x.dtype.ordered:
            categories = sorted(x.unique().tolist())
            cat_type = pd.api.types.CategoricalDtype(categories=categories, ordered=True)
            x = x.astype(cat_type)

        reference = x.min()
        levels = x.cat.categories.tolist()

        if self.is_response:
            # Will be binary, no matter how many levels
            if self.level is not None:
                reference = self.level
                value = np.where(x == reference, 1, 0)[:, np.newaxis]
            # Is binary, model first event
            elif len(x.unique()) == 2:
                value = np.where(x == reference, 1, 0)[:, np.newaxis]
            # Isn't binary, no level has been passed, return codes.
            else:
                value = pd.Categorical(x).codes[:, np.newaxis]
        else:
            # Not always we receive a bool, so we need to check.
            if isinstance(encoding, list):
                encoding = encoding[0]
            if isinstance(encoding, dict):
                encoding = encoding[self.name]
            if encoding:
                value = pd.get_dummies(x).to_numpy()
                encoding = "full"
            else:
                value = pd.get_dummies(x, drop_first=True).to_numpy()
                encoding = "reduced"
        return {
            "value": value,
            "type": "categoric",
            "levels": levels,
            "reference": reference,
            "encoding": encoding,
        }

[docs]    def eval_new_data(self, data_mask):
        """Evaluates the variable with new data.

        This method evaluates the variable within a new data mask. If this object is categorical,
        original encoding is remembered (and checked) when carrying out the new evaluation.

        Parameters
        ----------
        data_mask: pd.DataFrame
            The data frame where variables are taken from

        Returns
        ----------
        result: np.array
            The rules for the shape of this array are the rules for ``self._eval_numeric()`` and
            ``self._eval_categoric()``. The first applies for numeric variables, the second for
            categoric ones.

        """
        if self.data is None:
            raise ValueError("self.data is None. This error shouldn't have happened!")
        x = data_mask[self.name]
        if self._type == "numeric":
            return self._eval_numeric(x)["value"]
        else:
            return self._eval_new_data_categoric(x)

[docs]    def _eval_new_data_categoric(self, x):
        """Evaluates the variable with new data when variable is categoric.

        This method also checks the levels observed in the new data frame are included within the
        set of the levels of the original data set. If not, an error is raised.

        x: np.ndarray or pd.Series
            The intermediate values of the variable.

        Returns
        ----------
        result: np.array
            Numeric numpy array ``(n, p)``, where ``n`` is the number of observations and ``p`` the
            number of dummy variables used in the numeric representation of the categorical
            variable.
        """
        new_data_levels = pd.Categorical(x).dtype.categories.tolist()
        if set(new_data_levels).issubset(set(self.data["levels"])):
            series = pd.Categorical(x, categories=self.data["levels"])
            drop_first = self.data["encoding"] == "reduced"
            return pd.get_dummies(series, drop_first=drop_first).to_numpy()
        else:
            raise ValueError(
                f"At least one of the levels for '{self.name}' in the new data was "
                "not present in the original data set."
            )