import sys
import numpy as np
import pandas as pd
from pandas.api.types import is_categorical_dtype, is_numeric_dtype, is_string_dtype
from formulae.transforms import TRANSFORMS, Proportion, Offset
from formulae.terms.call_utils import CallVarsExtractor
class Call:
    """Representation of a call in a model Term.

    This class and ``Variable`` are the atomic components of a model term.

    This object supports stateful transformations defined in ``formulae.transforms``.
    A transformation of this type defines its parameters the first time it is called,
    and then can be used to recompute the transformation with memorized parameter values.
    This behavior is useful when implementing a predict method and using transformations
    such as ``center(x)`` or ``scale(x)``. ``center(x)`` memorizes the value of the mean,
    and ``scale(x)`` memorizes both the mean and the standard deviation.

    Parameters
    ----------
    call: formulae.terms.call_resolver.LazyCall
        The call expression returned by the parser.
    is_response: bool
        Indicates whether this call represents a response. Defaults to ``False``.
    """

    def __init__(self, call, is_response=False):
        self.data = None  # Final evaluation result; filled in by self.set_data()
        self.eval_env = None  # Evaluation environment; set in self.set_type()
        self._intermediate_data = None  # Cached first evaluation, reused by set_data()
        self._type = None  # One of "numeric", "categoric", "proportion", "offset"
        self.is_response = is_response
        self.call = call
        self.name = str(self.call)

    def __hash__(self):
        # Hash delegates to the underlying call so it stays consistent with __eq__.
        return hash(self.call)

    def __eq__(self, other):
        if not isinstance(other, type(self)):
            return False
        return self.call == other.call

    def __repr__(self):
        return self.__str__()

    def __str__(self):
        return f"{self.__class__.__name__}({self.name})"

    def accept(self, visitor):
        """Accept method called by a visitor.

        Visitors are those available in call_utils.py, and are used to work with call terms.
        """
        return visitor.visitCallTerm(self)

    @property
    def var_names(self):
        """Returns the names of the variables involved in the call, not including the callee.

        This is used to determine which variables of the data set being used are actually used
        in the model. This allows us to subset the original data set and only raise errors
        regarding missing values when the missingness happens in variables used in the model.

        Uses a visitor of class ``CallVarsExtractor`` that walks through the components of the
        call and collects the names of the variables in the call.

        Returns
        -------
        result: set
            A set of strings with the names of the variables in the call, not including the
            name of the callee.
        """
        return set(CallVarsExtractor(self).get())

    def set_type(self, data_mask, eval_env):
        """Evaluates function and determines the type of the result of the call.

        Evaluates the function call and sets the ``._type`` property to ``"numeric"`` or
        ``"categoric"`` depending on the type of the result. It also stores the intermediate
        result of the evaluation in ``._intermediate_data`` to prevent us from computing the
        same thing more than once.

        Parameters
        ----------
        data_mask: pd.DataFrame
            The data frame where variables are taken from.
        eval_env: EvalEnvironment
            The environment where values and functions are taken from.

        Raises
        ------
        ValueError
            When the result of the call is of a type that cannot be handled.
        """
        # Stateful transformations are resolved through an outer namespace so user
        # functions with the same name are not shadowed silently.
        self.eval_env = eval_env.with_outer_namespace(TRANSFORMS)
        x = self.call.eval(data_mask, self.eval_env)
        if is_numeric_dtype(x):
            self._type = "numeric"
        # NOTE(review): is_categorical_dtype is deprecated in pandas >= 2.1;
        # consider isinstance(x.dtype, pd.CategoricalDtype) when bumping pandas.
        elif is_string_dtype(x) or is_categorical_dtype(x):
            self._type = "categoric"
        elif isinstance(x, Proportion):
            self._type = "proportion"
        elif isinstance(x, Offset):
            self._type = "offset"
            x.set_size(len(data_mask.index))
        else:
            raise ValueError(f"Call result is of an unrecognized type ({type(x)}).")
        self._intermediate_data = x

    def set_data(self, encoding=False):
        """Finishes the evaluation of the call according to its type.

        Evaluates the call according to its type and stores the result in ``.data``. It does
        not support multi-level categoric responses yet. If ``self.is_response`` is ``True``
        and the variable is of a categoric type, this method returns a 1d array of 0-1 instead
        of a matrix. In practice, it just completes the evaluation that started with
        ``self.set_type()``.

        Parameters
        ----------
        encoding: bool
            Indicates if it uses full or reduced encoding when the type of the call is
            categoric. Omitted when the result of the call is numeric.

        Raises
        ------
        ValueError
            When ``self._type`` is not set or has an unrecognized value.
        """
        try:
            if self._type is None:
                raise ValueError("Call result type is not set.")
            if self._type not in ["numeric", "categoric", "proportion", "offset"]:
                raise ValueError(f"Call result is of an unrecognized type ({self._type}).")
            if self._type == "numeric":
                self.data = self._eval_numeric(self._intermediate_data)
            elif self._type == "categoric":
                self.data = self._eval_categoric(self._intermediate_data, encoding)
            elif self._type == "proportion":
                self.data = self._eval_proportion(self._intermediate_data)
            elif self._type == "offset":
                self.data = self._eval_offset(self._intermediate_data)
        # FIX: was a bare `except:`, which also intercepted KeyboardInterrupt and
        # SystemExit. Narrowed to Exception; the error is still re-raised.
        except Exception:
            print("Unexpected error while trying to evaluate a Call:", sys.exc_info()[0])
            raise

    def _eval_numeric(self, x):
        """Finishes evaluation of a numeric call.

        Converts the intermediate values of the call into a numpy array of shape ``(n, 1)``,
        where ``n`` is the number of observations. This method is used both in
        ``self.set_data`` and in ``self.eval_new_data``.

        Parameters
        ----------
        x: np.ndarray or pd.Series
            The intermediate values resulting from the call.

        Returns
        -------
        result: dict
            A dictionary with keys ``"value"`` and ``"type"``. The first contains the result
            of the evaluation, and the latter is equal to ``"numeric"``.

        Raises
        ------
        ValueError
            When ``x`` is not effectively 1-dimensional, or is of an unsupported type.
        """
        if isinstance(x, np.ndarray):
            # FIX: the original called x.flatten() first, which made the subsequent
            # dimensionality check unreachable and silently collapsed an (n, m) array
            # into (n*m, 1) — a row count that no longer matches the data. Check the
            # dimensionality before reshaping instead.
            if x.ndim == 1:
                value = x[:, np.newaxis]
            elif x.ndim == 2 and x.shape[1] == 1:
                value = x
            else:
                raise ValueError(f"The result of {self.name} is not 1-dimensional.")
        elif isinstance(x, pd.Series):
            value = x.to_numpy()[:, np.newaxis]
        else:
            raise ValueError(f"Call result is of an unrecognized type ({type(x)}).")
        return {"value": value, "type": "numeric"}

    def _eval_categoric(self, x, encoding):
        """Finishes evaluation of categoric call.

        First, it checks whether the intermediate evaluation returned is ordered. If not, it
        creates a category where the levels are the observed in the variable. They are sorted
        according to ``sorted()`` rules.

        Then, it determines the reference level as well as all the other levels. If the
        variable is a response, the value returned is a dummy with 1s for the reference level
        and 0s elsewhere. If it is not a response variable, it determines the matrix of
        dummies according to the levels and the encoding passed.

        Parameters
        ----------
        x: np.ndarray or pd.Series
            The intermediate values of the variable.
        encoding: bool
            Indicates if it uses full or reduced encoding.

        Returns
        -------
        result: dict
            A dictionary with keys ``"value"``, ``"type"``, ``"levels"``, ``"reference"``, and
            ``"encoding"``. They represent the result of the evaluation, the type, which is
            ``"categoric"``, the levels observed in the variable, the level used as reference
            when using reduced encoding, and whether the encoding is ``"full"`` or
            ``"reduced"``.
        """
        # If not already an ordered categorical, build one whose level order follows
        # sorted() so the reference level is reproducible.
        if not hasattr(x.dtype, "ordered") or not x.dtype.ordered:
            categories = sorted(x.unique().tolist())
            cat_type = pd.api.types.CategoricalDtype(categories=categories, ordered=True)
            x = x.astype(cat_type)

        reference = x.min()
        levels = x.cat.categories.tolist()

        if self.is_response:
            # Multi-level categoric responses are not supported yet: return a 0-1
            # column marking the reference level.
            value = np.atleast_2d(np.where(x == reference, 1, 0)).T
            encoding = None
        else:
            # `encoding` may arrive wrapped in a list or keyed by term name.
            if isinstance(encoding, list):
                encoding = encoding[0]
            if isinstance(encoding, dict):
                encoding = encoding[self.name]
            if encoding:
                value = pd.get_dummies(x).to_numpy()
                encoding = "full"
            else:
                value = pd.get_dummies(x, drop_first=True).to_numpy()
                encoding = "reduced"
        return {
            "value": value,
            "type": "categoric",
            "levels": levels,
            "reference": reference,
            "encoding": encoding,
        }

    def _eval_proportion(self, proportion):
        """Evaluates a ``proportion(x, n)`` call. Only valid for response terms."""
        if not self.is_response:
            raise ValueError("'prop()' can only be used in the context of a response term.")
        return {"value": proportion.eval(), "type": "proportion"}

    def _eval_offset(self, offset):
        """Evaluates an ``offset(x)`` call. Not valid for response terms."""
        if self.is_response:
            # FIX: the error message had an unbalanced quote ("'offset() cannot ...").
            raise ValueError("'offset()' cannot be used in the context of a response term.")
        return {"value": offset.eval(), "type": "offset"}

    def eval_new_data(self, data_mask):
        """Evaluates the function call with new data.

        This method evaluates the function call within a new data mask. If the transformation
        applied is a stateful transformation, it uses the proper object that remembers all
        parameters or settings that may have been set in a first pass.

        Parameters
        ----------
        data_mask: pd.DataFrame
            The data frame where variables are taken from.

        Returns
        -------
        result: np.array
            The rules for the shape of this array are the rules for ``self._eval_numeric()``
            and ``self._eval_categoric()``. The first applies for numeric calls, the second
            for categoric ones.

        Raises
        ------
        ValueError
            When ``self._type`` has not been set (i.e. ``set_type()`` was never called) or
            has an unrecognized value.
        """
        if self._type in ["numeric", "categoric"]:
            x = self.call.eval(data_mask, self.eval_env)
            if self._type == "numeric":
                return self._eval_numeric(x)["value"]
            return self._eval_new_data_categoric(x)
        elif self._type == "proportion":
            if self._intermediate_data.trials_type == "constant":
                # Return value passed in the second component
                return np.ones((len(data_mask.index), 1)) * self.call.args[1].value
            else:
                # Extract name of the second component
                name = self.call.args[1].name
                values = data_mask[name]
                if isinstance(values, pd.Series):
                    values = values.values[:, np.newaxis]
                return values
        elif self._type == "offset":
            if self._intermediate_data.type == "constant":
                # Return value passed as the argument
                return np.ones((len(data_mask.index), 1)) * self.call.args[0].value
            else:
                # Extract name of the argument
                name = self.call.args[0].name
                values = data_mask[name]
                if isinstance(values, pd.Series):
                    values = values.values[:, np.newaxis]
                return values
        # FIX: the original fell through and implicitly returned None here (it carried a
        # pylint inconsistent-return-statements suppression). Fail loudly instead.
        raise ValueError(f"Call result is of an unrecognized type ({self._type}).")

    def _eval_new_data_categoric(self, x):
        """Evaluates the call with new data when the result of the call is categoric.

        This method also checks the levels observed in the new data frame are included within
        the set of the levels of the result of the original call. If not, an error is raised.

        Parameters
        ----------
        x: np.ndarray or pd.Series
            The intermediate values of the variable.

        Returns
        -------
        result: np.array
            Numeric numpy array ``(n, p)``, where ``n`` is the number of observations and
            ``p`` the number of dummy variables used in the numeric representation of the
            categorical variable.

        Raises
        ------
        ValueError
            When the new data contains levels unseen in the original data set.
        """
        # Raise error if passing a level that was not observed.
        new_data_levels = pd.Categorical(x).dtype.categories.tolist()
        if set(new_data_levels).issubset(set(self.data["levels"])):
            series = pd.Categorical(x, categories=self.data["levels"])
            drop_first = self.data["encoding"] == "reduced"
            return pd.get_dummies(series, drop_first=drop_first).to_numpy()
        else:
            raise ValueError(
                f"At least one of the levels for '{self.name}' in the new data was "
                "not present in the original data set."
            )