Source code for sliced.sir

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import six
import warnings

import numpy as np
import scipy.linalg as linalg
import scipy.stats as stats

from scipy import sparse
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import normalize

from .externals import check_array, check_X_y, check_is_fitted
from .base import slice_y, grouped_sum, is_multioutput


__all__ = ['SlicedInverseRegression']


def diagonal_precision(X, R=None):
    """Calculate he diagonal elements of the empirical precision matrix
    from a pre-computed QR decomposition of the centered data matrix X.

    Parameters
    ----------
    X : numpy-array, shape (n_samples, n_features)
        The centered data matrix.

    R : numpy-array, shape (n_features, n_features), optional
        The upper triangular matrix obtained by the economic QR decomposition
        of the centered data matrix.

    Returns
    -------
    precisions : numpy-array, shape (n_features,)
        The diagonal elements of the precision (inverse covariance) matrix of
        the centered data.
    """
    if R is None:
        Q, R = linalg.qr(X, mode='economic')

    R_inv = linalg.solve_triangular(R, np.eye(R.shape[1]))
    return np.sum(R_inv ** 2, axis=1) * X.shape[0]


[docs]class SlicedInverseRegression(BaseEstimator, TransformerMixin): """Sliced Inverse Regression (SIR) [1] Linear dimensionality reduction using the inverse regression curve, E[X|y], to identify the directions defining the central subspace of the data. The inverse comes from the fact that X and y are reversed with respect to the standard regression framework (estimating E[y|X]). The algorithm performs a weighted principal component analysis on slices of the whitened data, which has been sorted with respect to the target, y. For a binary target the directions found correspond to those found with Fisher's Linear Discriminant Analysis (LDA). Note that SIR may fail to estimate the directions if the conditional density X|y is symmetric, so that E[X|y] = 0. See :class:`sliced.save.SlicedAverageVarianceEstimation`, which is able to overcome this limitation but may fail to pick up on linear trends. If possible, both SIR and SAVE should be used when analyzing a dataset. Parameters ---------- n_directions : int, str or None (default='auto') Number of directions to keep. Corresponds to the dimension of the central subpace. If n_directions=='auto', the number of directions is chosen by finding the maximum gap in the ordered eigenvalues of the var(X|y) matrix and choosing the directions before this gap. If n_directions==None, the number of directions equals the number of features. n_slices : int (default=10) The number of slices used when calculating the inverse regression curve. Truncated to at most the number of unique values of ``y``. alpha : float or None (default=None) Significance level for the two-sided t-test used to check for non-zero coefficients. Must be a number between 0 and 1. If not `None`, the non-zero components of each direction are determined from an asymptotic normal approximation. Useful if one desires that the directions are sparse in the number of features. copy : bool (default=True) If False, data passed to fit are overwritten and running fit(X).transform(X) will not yield the expected results, use fit_transform(X) instead. Attributes ---------- directions_ : array, shape (n_directions, n_features) The directions in feature space, representing the central subspace which is sufficient to describe the conditional distribution y|X. The directions are sorted by ``eigenvalues_``. eigenvalues_ : array, shape (n_directions,) The eigenvalues corresponding to each of the selected directions. These are the eigenvalues of the covariance matrix of the inverse regression curve. Larger eigenvalues indicate more prevalent directions. Examples -------- >>> import numpy as np >>> from sliced import SlicedInverseRegression >>> from sliced.datasets import make_cubic >>> X, y = make_cubic(random_state=123) >>> sir = SlicedInverseRegression(n_directions=2) >>> sir.fit(X, y) SlicedInverseRegression(alpha=None, copy=True, n_directions=2, n_slices=10) >>> X_sir = sir.transform(X) References ---------- [1] Li, K C. (1991) "Sliced Inverse Regression for Dimension Reduction (with discussion)", Journal of the American Statistical Association, 86, 316-342. [2] Chen, C.H., and Li, K.C. (1998), "Can SIR Be as Popular as Multiple Linear Regression?" Statistica Sinica, 8, 289-316. """
[docs] def __init__(self, n_directions='auto', n_slices=10, alpha=None, copy=True): self.n_directions = n_directions self.n_slices = n_slices self.alpha = alpha self.copy = copy
def fit(self, X, y): """Fit the model with X and y. Parameters ---------- X : array-like, shape (n_samples, n_features) Training data, where n_samples is the number of samples and n_features is the number of features. y : array-like, shape (n_samples,) The target values (class labels in classification, real numbers in regression). Returns ------- self : object Returns the instance itself. """ if sparse.issparse(X): raise TypeError("SlicedInverseRegression does not support " "sparse input.") X, y = check_X_y(X, y, dtype=[np.float64, np.float32], accept_sparse=['csr'], y_numeric=True, copy=self.copy) if self.n_directions is None: n_directions = X.shape[1] elif (not isinstance(self.n_directions, six.string_types) and self.n_directions < 1): raise ValueError('The number of directions `n_directions` ' 'must be >= 1. Got `n_directions`={}'.format( self.n_directions)) else: n_directions = self.n_directions if self.alpha is not None and (self.alpha <= 0 or self.alpha >= 1): raise ValueError("The significance level `alpha` " "must be between 0 and 1. Got `alpha`={}".format( self.alpha)) # validate y if is_multioutput(y): raise TypeError("The target `y` cannot be multi-output.") n_samples, n_features = X.shape # Center and Whiten feature matrix using a QR decomposition # (this is the approach used in the dr package) if self.copy: X = X - np.mean(X, axis=0) else: X -= np.mean(X, axis=0) Q, R = linalg.qr(X, mode='economic') Z = np.sqrt(n_samples) * Q Z = Z[np.argsort(y), :] # determine slice indices and counts per slice slices, counts = slice_y(y, self.n_slices) self.n_slices_ = counts.shape[0] # means in each slice (sqrt factor takes care of the weighting) Z_means = grouped_sum(Z, slices) / np.sqrt(counts.reshape(-1, 1)) M = np.dot(Z_means.T, Z_means) / n_samples # eigen-decomposition of slice matrix evals, evecs = linalg.eigh(M) evecs = evecs[:, ::-1] evals = evals[::-1] try: # TODO: internally handle zero variance features. This would not # be a problem if we used svd, but does not match DR. directions = linalg.solve_triangular(np.sqrt(n_samples) * R, evecs) except (linalg.LinAlgError, TypeError): # NOTE: The TypeError is because of a bug in the reporting of scipy raise linalg.LinAlgError( "Unable to back-solve R for the dimension " "reducing directions. This is usually caused by the presents " "of zero variance features. Try removing these features with " "`sklearn.feature_selection.VarianceThreshold(threshold=0.)` " "and refitting.") # the number of directions is chosen by finding the maximum gap among # the ordered eigenvalues. if self.n_directions == 'auto': n_directions = np.argmax(np.abs(np.diff(evals))) + 1 self.n_directions_ = n_directions directions = normalize( directions[:, :self.n_directions_], norm='l2', axis=0) self.directions_ = directions.T self.eigenvalues_ = evals[:self.n_directions_] # Drop components in each direction using the t-ratio approach # suggested in section 4 of Chen and Li (1998). if self.alpha is not None: # similar to multiple linear-regression the standard error # estimates are proportional to the diagonals of the inverse # covariance matrix. precs = diagonal_precision(X, R=R) weights = (1 - self.eigenvalues_) / (n_samples * self.eigenvalues_) std_error = np.sqrt(weights.reshape(-1, 1) * precs) # perform a two-sided t-test at level alpha for coefs equal to zero # NOTE: we are not correcting for multiple tests and this is # a very rough approximation, so do not expect the test # to be close to the nominal level. df = n_samples - n_features - 1 crit_val = stats.distributions.t.ppf(1 - self.alpha/2, df) for j in range(self.n_directions_): test_stat = np.abs(self.directions_[j, :] / std_error[j]) zero_mask = test_stat < crit_val if np.sum(zero_mask) == n_features: warnings.warn("Not zeroing out coefficients. All " "coefficients are not significantly " "different from zero.", RuntimeWarning) else: self.directions_[j, test_stat < crit_val] = 0. return self def transform(self, X): """Apply dimension reduction on X. X is projected onto the central subspace previously extracted from a training set. Parameters ---------- X : array-like, shape (n_samples, n_features) New data, where n_samples in the number of samples and n_features is the number of features. Returns ------- X_new : array-like, shape (n_samples, n_directions) """ check_is_fitted(self, 'directions_') X = check_array(X) return np.dot(X, self.directions_.T)