import sys
import numpy as np
import pandas as pd
from pandas.api.types import is_categorical_dtype, is_numeric_dtype, is_string_dtype
from formulae.transforms import TRANSFORMS, Proportion, Offset
from formulae.terms.call_utils import CallVarsExtractor
class Call:
    """Representation of a call in a model Term.

    This class and ``Variable`` are the atomic components of a model term.

    This object supports stateful transformations defined in ``formulae.transforms``.
    A transformation of this type defines its parameters the first time it is called,
    and then can be used to recompute the transformation with memorized parameter values.
    This behavior is useful when implementing a predict method and using transformations
    such as ``center(x)`` or ``scale(x)``. ``center(x)`` memorizes the value of the mean,
    and ``scale(x)`` memorizes both the mean and the standard deviation.

    Parameters
    ----------
    call: formulae.terms.call_resolver.LazyCall
        The call expression returned by the parser.
    is_response: bool
        Indicates whether this call represents a response. Defaults to ``False``.
    """

    def __init__(self, call, is_response=False):
        self.data = None  # Final evaluation result; filled in by self.set_data()
        self.eval_env = None  # Evaluation environment; set in self.set_type()
        self._intermediate_data = None  # Cached first evaluation, reused by set_data()
        self._type = None  # One of "numeric", "categoric", "proportion", "offset"
        self.is_response = is_response
        self.call = call
        self.name = str(self.call)

    def __hash__(self):
        # Hash delegates to the underlying call so it stays consistent with __eq__.
        return hash(self.call)

    def __eq__(self, other):
        if not isinstance(other, type(self)):
            return False
        return self.call == other.call

    def __repr__(self):
        return self.__str__()

    def __str__(self):
        return f"{self.__class__.__name__}({self.name})"

    def accept(self, visitor):
        """Accept method called by a visitor.

        Visitors are those available in call_utils.py, and are used to work with call terms.
        """
        return visitor.visitCallTerm(self)

    @property
    def var_names(self):
        """Returns the names of the variables involved in the call, not including the callee.

        This is used to determine which variables of the data set being used are actually used
        in the model. This allows us to subset the original data set and only raise errors
        regarding missing values when the missingness happens in variables used in the model.

        Uses a visitor of class ``CallVarsExtractor`` that walks through the components of the
        call and collects the names of the variables in the call.

        Returns
        -------
        result: set
            A set of strings with the names of the variables in the call, not including the
            name of the callee.
        """
        return set(CallVarsExtractor(self).get())

    def set_type(self, data_mask, eval_env):
        """Evaluates function and determines the type of the result of the call.

        Evaluates the function call and sets the ``._type`` property to ``"numeric"`` or
        ``"categoric"`` depending on the type of the result. It also stores the intermediate
        result of the evaluation in ``._intermediate_data`` to prevent us from computing the
        same thing more than once.

        Parameters
        ----------
        data_mask: pd.DataFrame
            The data frame where variables are taken from.
        eval_env: EvalEnvironment
            The environment where values and functions are taken from.

        Raises
        ------
        ValueError
            When the result of the call is of a type that cannot be handled.
        """
        # Stateful transformations are resolved through an outer namespace so user
        # functions with the same name are not shadowed silently.
        self.eval_env = eval_env.with_outer_namespace(TRANSFORMS)
        x = self.call.eval(data_mask, self.eval_env)
        if is_numeric_dtype(x):
            self._type = "numeric"
        # NOTE(review): is_categorical_dtype is deprecated in pandas >= 2.1;
        # consider isinstance(x.dtype, pd.CategoricalDtype) when bumping pandas.
        elif is_string_dtype(x) or is_categorical_dtype(x):
            self._type = "categoric"
        elif isinstance(x, Proportion):
            self._type = "proportion"
        elif isinstance(x, Offset):
            self._type = "offset"
            x.set_size(len(data_mask.index))
        else:
            raise ValueError(f"Call result is of an unrecognized type ({type(x)}).")
        self._intermediate_data = x

    def set_data(self, encoding=False):
        """Finishes the evaluation of the call according to its type.

        Evaluates the call according to its type and stores the result in ``.data``. It does
        not support multi-level categoric responses yet. If ``self.is_response`` is ``True``
        and the variable is of a categoric type, this method returns a 1d array of 0-1 instead
        of a matrix. In practice, it just completes the evaluation that started with
        ``self.set_type()``.

        Parameters
        ----------
        encoding: bool
            Indicates if it uses full or reduced encoding when the type of the call is
            categoric. Omitted when the result of the call is numeric.

        Raises
        ------
        ValueError
            When ``self._type`` is not set or has an unrecognized value.
        """
        try:
            if self._type is None:
                raise ValueError("Call result type is not set.")
            if self._type not in ["numeric", "categoric", "proportion", "offset"]:
                raise ValueError(f"Call result is of an unrecognized type ({self._type}).")
            if self._type == "numeric":
                self.data = self._eval_numeric(self._intermediate_data)
            elif self._type == "categoric":
                self.data = self._eval_categoric(self._intermediate_data, encoding)
            elif self._type == "proportion":
                self.data = self._eval_proportion(self._intermediate_data)
            elif self._type == "offset":
                self.data = self._eval_offset(self._intermediate_data)
        # FIX: was a bare `except:`, which also intercepted KeyboardInterrupt and
        # SystemExit. Narrowed to Exception; the error is still re-raised.
        except Exception:
            print("Unexpected error while trying to evaluate a Call:", sys.exc_info()[0])
            raise

    def _eval_numeric(self, x):
        """Finishes evaluation of a numeric call.

        Converts the intermediate values of the call into a numpy array of shape ``(n, 1)``,
        where ``n`` is the number of observations. This method is used both in
        ``self.set_data`` and in ``self.eval_new_data``.

        Parameters
        ----------
        x: np.ndarray or pd.Series
            The intermediate values resulting from the call.

        Returns
        -------
        result: dict
            A dictionary with keys ``"value"`` and ``"type"``. The first contains the result
            of the evaluation, and the latter is equal to ``"numeric"``.

        Raises
        ------
        ValueError
            When ``x`` is not effectively 1-dimensional, or is of an unsupported type.
        """
        if isinstance(x, np.ndarray):
            # FIX: the original called x.flatten() first, which made the subsequent
            # dimensionality check unreachable and silently collapsed an (n, m) array
            # into (n*m, 1) — a row count that no longer matches the data. Check the
            # dimensionality before reshaping instead.
            if x.ndim == 1:
                value = x[:, np.newaxis]
            elif x.ndim == 2 and x.shape[1] == 1:
                value = x
            else:
                raise ValueError(f"The result of {self.name} is not 1-dimensional.")
        elif isinstance(x, pd.Series):
            value = x.to_numpy()[:, np.newaxis]
        else:
            raise ValueError(f"Call result is of an unrecognized type ({type(x)}).")
        return {"value": value, "type": "numeric"}

    def _eval_categoric(self, x, encoding):
        """Finishes evaluation of categoric call.

        First, it checks whether the intermediate evaluation returned is ordered. If not, it
        creates a category where the levels are the observed in the variable. They are sorted
        according to ``sorted()`` rules.

        Then, it determines the reference level as well as all the other levels. If the
        variable is a response, the value returned is a dummy with 1s for the reference level
        and 0s elsewhere. If it is not a response variable, it determines the matrix of
        dummies according to the levels and the encoding passed.

        Parameters
        ----------
        x: np.ndarray or pd.Series
            The intermediate values of the variable.
        encoding: bool
            Indicates if it uses full or reduced encoding.

        Returns
        -------
        result: dict
            A dictionary with keys ``"value"``, ``"type"``, ``"levels"``, ``"reference"``, and
            ``"encoding"``. They represent the result of the evaluation, the type, which is
            ``"categoric"``, the levels observed in the variable, the level used as reference
            when using reduced encoding, and whether the encoding is ``"full"`` or
            ``"reduced"``.
        """
        # If not already an ordered categorical, build one whose level order follows
        # sorted() so the reference level is reproducible.
        if not hasattr(x.dtype, "ordered") or not x.dtype.ordered:
            categories = sorted(x.unique().tolist())
            cat_type = pd.api.types.CategoricalDtype(categories=categories, ordered=True)
            x = x.astype(cat_type)

        reference = x.min()
        levels = x.cat.categories.tolist()

        if self.is_response:
            # Multi-level categoric responses are not supported yet: return a 0-1
            # column marking the reference level.
            value = np.atleast_2d(np.where(x == reference, 1, 0)).T
            encoding = None
        else:
            # `encoding` may arrive wrapped in a list or keyed by term name.
            if isinstance(encoding, list):
                encoding = encoding[0]
            if isinstance(encoding, dict):
                encoding = encoding[self.name]
            if encoding:
                value = pd.get_dummies(x).to_numpy()
                encoding = "full"
            else:
                value = pd.get_dummies(x, drop_first=True).to_numpy()
                encoding = "reduced"
        return {
            "value": value,
            "type": "categoric",
            "levels": levels,
            "reference": reference,
            "encoding": encoding,
        }

    def _eval_proportion(self, proportion):
        """Evaluates a ``proportion(x, n)`` call. Only valid for response terms."""
        if not self.is_response:
            raise ValueError("'prop()' can only be used in the context of a response term.")
        return {"value": proportion.eval(), "type": "proportion"}

    def _eval_offset(self, offset):
        """Evaluates an ``offset(x)`` call. Not valid for response terms."""
        if self.is_response:
            # FIX: the error message had an unbalanced quote ("'offset() cannot ...").
            raise ValueError("'offset()' cannot be used in the context of a response term.")
        return {"value": offset.eval(), "type": "offset"}

    def eval_new_data(self, data_mask):
        """Evaluates the function call with new data.

        This method evaluates the function call within a new data mask. If the transformation
        applied is a stateful transformation, it uses the proper object that remembers all
        parameters or settings that may have been set in a first pass.

        Parameters
        ----------
        data_mask: pd.DataFrame
            The data frame where variables are taken from.

        Returns
        -------
        result: np.array
            The rules for the shape of this array are the rules for ``self._eval_numeric()``
            and ``self._eval_categoric()``. The first applies for numeric calls, the second
            for categoric ones.

        Raises
        ------
        ValueError
            When ``self._type`` has not been set (i.e. ``set_type()`` was never called) or
            has an unrecognized value.
        """
        if self._type in ["numeric", "categoric"]:
            x = self.call.eval(data_mask, self.eval_env)
            if self._type == "numeric":
                return self._eval_numeric(x)["value"]
            return self._eval_new_data_categoric(x)
        elif self._type == "proportion":
            if self._intermediate_data.trials_type == "constant":
                # Return value passed in the second component
                return np.ones((len(data_mask.index), 1)) * self.call.args[1].value
            else:
                # Extract name of the second component
                name = self.call.args[1].name
                values = data_mask[name]
                if isinstance(values, pd.Series):
                    values = values.values[:, np.newaxis]
                return values
        elif self._type == "offset":
            if self._intermediate_data.type == "constant":
                # Return value passed as the argument
                return np.ones((len(data_mask.index), 1)) * self.call.args[0].value
            else:
                # Extract name of the argument
                name = self.call.args[0].name
                values = data_mask[name]
                if isinstance(values, pd.Series):
                    values = values.values[:, np.newaxis]
                return values
        # FIX: the original fell through and implicitly returned None here (it carried a
        # pylint inconsistent-return-statements suppression). Fail loudly instead.
        raise ValueError(f"Call result is of an unrecognized type ({self._type}).")

    def _eval_new_data_categoric(self, x):
        """Evaluates the call with new data when the result of the call is categoric.

        This method also checks the levels observed in the new data frame are included within
        the set of the levels of the result of the original call. If not, an error is raised.

        Parameters
        ----------
        x: np.ndarray or pd.Series
            The intermediate values of the variable.

        Returns
        -------
        result: np.array
            Numeric numpy array ``(n, p)``, where ``n`` is the number of observations and
            ``p`` the number of dummy variables used in the numeric representation of the
            categorical variable.

        Raises
        ------
        ValueError
            When the new data contains levels unseen in the original data set.
        """
        # Raise error if passing a level that was not observed.
        new_data_levels = pd.Categorical(x).dtype.categories.tolist()
        if set(new_data_levels).issubset(set(self.data["levels"])):
            series = pd.Categorical(x, categories=self.data["levels"])
            drop_first = self.data["encoding"] == "reduced"
            return pd.get_dummies(series, drop_first=drop_first).to_numpy()
        else:
            raise ValueError(
                f"At least one of the levels for '{self.name}' in the new data was "
                "not present in the original data set."
            )