Source code for sliced.sir

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import six
import warnings

import numpy as np
import scipy.linalg as linalg
import scipy.stats as stats

from scipy import sparse
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import normalize

from .externals import check_array, check_X_y, check_is_fitted
from .base import slice_y, grouped_sum, is_multioutput


__all__ = ['SlicedInverseRegression']


def diagonal_precision(X, R=None):
    """Calculate he diagonal elements of the empirical precision matrix
    from a pre-computed QR decomposition of the centered data matrix X.

    Parameters
    ----------
    X : numpy-array, shape (n_samples, n_features)
        The centered data matrix.

    R : numpy-array, shape (n_features, n_features), optional
        The upper triangular matrix obtained by the economic QR decomposition
        of the centered data matrix.

    Returns
    -------
    precisions : numpy-array, shape (n_features,)
        The diagonal elements of the precision (inverse covariance) matrix of
        the centered data.
    """
    if R is None:
        Q, R = linalg.qr(X, mode='economic')

    R_inv = linalg.solve_triangular(R, np.eye(R.shape[1]))
    return np.sum(R_inv ** 2, axis=1) * X.shape[0]


[docs]class SlicedInverseRegression(BaseEstimator, TransformerMixin):
    """Sliced Inverse Regression (SIR) [1]

    Linear dimensionality reduction using the inverse regression curve,
    E[X|y], to identify the directions defining the central subspace of
    the data.

    The inverse comes from the fact that X and y are reversed with respect
    to the standard regression framework (estimating E[y|X]).

    The algorithm performs a weighted principal component analysis on
    slices of the whitened data, which has been sorted with respect to
    the target, y.

    For a binary target the directions found correspond to those found
    with Fisher's Linear Discriminant Analysis (LDA).

    Note that SIR may fail to estimate the directions if the conditional
    density X|y is symmetric, so that E[X|y] = 0. See
    :class:`sliced.save.SlicedAverageVarianceEstimation`,
    which is able to overcome this limitation but may fail to pick up on
    linear trends. If possible, both SIR and SAVE should be used when analyzing
    a dataset.

    Parameters
    ----------
    n_directions : int, str or None (default='auto')
        Number of directions to keep. Corresponds to the dimension of
        the central subpace. If n_directions=='auto', the number of directions
        is chosen by finding the maximum gap in the ordered eigenvalues of
        the var(X|y) matrix and choosing the directions before this gap.
        If n_directions==None, the number of directions equals the number of
        features.

    n_slices : int (default=10)
        The number of slices used when calculating the inverse regression
        curve. Truncated to at most the number of unique values of ``y``.

    alpha : float or None (default=None)
        Significance level for the two-sided t-test used to check for
        non-zero coefficients. Must be a number between 0 and 1. If not
        `None`, the non-zero components of each direction are determined
        from an asymptotic normal approximation. Useful if one desires that
        the directions are sparse in the number of features.

    copy : bool (default=True)
         If False, data passed to fit are overwritten and running
         fit(X).transform(X) will not yield the expected results,
         use fit_transform(X) instead.

    Attributes
    ----------
    directions_ : array, shape (n_directions, n_features)
        The directions in feature space, representing the
        central subspace which is sufficient to describe the conditional
        distribution y|X. The directions are sorted by ``eigenvalues_``.

    eigenvalues_ : array, shape (n_directions,)
        The eigenvalues corresponding to each of the selected directions.
        These are the eigenvalues of the covariance matrix
        of the inverse regression curve. Larger eigenvalues indicate
        more prevalent directions.

    Examples
    --------

    >>> import numpy as np
    >>> from sliced import SlicedInverseRegression
    >>> from sliced.datasets import make_cubic
    >>> X, y = make_cubic(random_state=123)
    >>> sir = SlicedInverseRegression(n_directions=2)
    >>> sir.fit(X, y)
    SlicedInverseRegression(alpha=None, copy=True, n_directions=2, n_slices=10)
    >>> X_sir = sir.transform(X)

    References
    ----------

    [1] Li, K C. (1991)
        "Sliced Inverse Regression for Dimension Reduction (with discussion)",
        Journal of the American Statistical Association, 86, 316-342.
    [2] Chen, C.H., and Li, K.C. (1998), "Can SIR Be as Popular as Multiple
        Linear Regression?" Statistica Sinica, 8, 289-316.
    """
[docs]    def __init__(self, n_directions='auto', n_slices=10, alpha=None, copy=True):
        self.n_directions = n_directions
        self.n_slices = n_slices
        self.alpha = alpha
        self.copy = copy

    def fit(self, X, y):
        """Fit the model with X and y.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            Training data, where n_samples is the number of samples
            and n_features is the number of features.

        y : array-like, shape (n_samples,)
            The target values (class labels in classification, real numbers
            in regression).

        Returns
        -------
        self : object
            Returns the instance itself.
        """
        if sparse.issparse(X):
            raise TypeError("SlicedInverseRegression does not support "
                            "sparse input.")

        X, y = check_X_y(X, y, dtype=[np.float64, np.float32],
                         accept_sparse=['csr'],
                         y_numeric=True, copy=self.copy)

        if self.n_directions is None:
            n_directions = X.shape[1]
        elif (not isinstance(self.n_directions, six.string_types) and
                self.n_directions < 1):
            raise ValueError('The number of directions `n_directions` '
                             'must be >= 1. Got `n_directions`={}'.format(
                                self.n_directions))
        else:
            n_directions = self.n_directions

        if self.alpha is not None and (self.alpha <= 0 or self.alpha >= 1):
            raise ValueError("The significance level `alpha` "
                             "must be between 0 and 1. Got `alpha`={}".format(
                                self.alpha))

        # validate y
        if is_multioutput(y):
            raise TypeError("The target `y` cannot be multi-output.")

        n_samples, n_features = X.shape

        # Center and Whiten feature matrix using a QR decomposition
        # (this is the approach used in the dr package)
        if self.copy:
            X = X - np.mean(X, axis=0)
        else:
            X -= np.mean(X, axis=0)
        Q, R = linalg.qr(X, mode='economic')
        Z = np.sqrt(n_samples) * Q
        Z = Z[np.argsort(y), :]

        # determine slice indices and counts per slice
        slices, counts = slice_y(y, self.n_slices)
        self.n_slices_ = counts.shape[0]

        # means in each slice (sqrt factor takes care of the weighting)
        Z_means = grouped_sum(Z, slices) / np.sqrt(counts.reshape(-1, 1))
        M = np.dot(Z_means.T, Z_means) / n_samples

        # eigen-decomposition of slice matrix
        evals, evecs = linalg.eigh(M)
        evecs = evecs[:, ::-1]
        evals = evals[::-1]
        try:
            # TODO: internally handle zero variance features. This would not
            # be a problem if we used svd, but does not match DR.
            directions = linalg.solve_triangular(np.sqrt(n_samples) * R, evecs)
        except (linalg.LinAlgError, TypeError):
            # NOTE: The TypeError is because of a bug in the reporting of scipy
            raise linalg.LinAlgError(
                "Unable to back-solve R for the dimension "
                "reducing directions. This is usually caused by the presents "
                "of zero variance features. Try removing these features with "
                "`sklearn.feature_selection.VarianceThreshold(threshold=0.)` "
                "and refitting.")

        # the number of directions is chosen by finding the maximum gap among
        # the ordered eigenvalues.
        if self.n_directions == 'auto':
            n_directions = np.argmax(np.abs(np.diff(evals))) + 1
        self.n_directions_ = n_directions

        directions = normalize(
            directions[:, :self.n_directions_], norm='l2', axis=0)
        self.directions_ = directions.T
        self.eigenvalues_ = evals[:self.n_directions_]

        # Drop components in each direction using the t-ratio approach
        # suggested in section 4 of Chen and Li (1998).
        if self.alpha is not None:
            # similar to multiple linear-regression the standard error
            # estimates are proportional to the diagonals of the inverse
            # covariance matrix.
            precs = diagonal_precision(X, R=R)
            weights = (1 - self.eigenvalues_) / (n_samples * self.eigenvalues_)
            std_error = np.sqrt(weights.reshape(-1, 1) * precs)

            # perform a two-sided t-test at level alpha for coefs equal to zero
            # NOTE: we are not correcting for multiple tests and this is
            #       a very rough approximation, so do not expect the test
            #       to be close to the nominal level.
            df = n_samples - n_features - 1
            crit_val = stats.distributions.t.ppf(1 - self.alpha/2, df)
            for j in range(self.n_directions_):
                test_stat = np.abs(self.directions_[j, :] / std_error[j])
                zero_mask = test_stat < crit_val
                if np.sum(zero_mask) == n_features:
                    warnings.warn("Not zeroing out coefficients. All "
                                  "coefficients are not significantly "
                                  "different from zero.", RuntimeWarning)
                else:
                    self.directions_[j, test_stat < crit_val] = 0.

        return self

    def transform(self, X):
        """Apply dimension reduction on X.

        X is projected onto the central subspace  previously extracted from a
        training set.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            New data, where n_samples in the number of samples
            and n_features is the number of features.


        Returns
        -------
        X_new : array-like, shape (n_samples, n_directions)

        """
        check_is_fitted(self, 'directions_')

        X = check_array(X)
        return np.dot(X, self.directions_.T)