import warnings
import numpy as np
from scipy.linalg import eigh
from sklearn.metrics import accuracy_score
from sklearn.base import BaseEstimator
from ..utils import check_array
[docs]class PLDA(BaseEstimator):
"""
Penalized Linear Discriminant Analysis.
This is both a dimensionality reduction method and a linear classifier.
Parameters
----------
alpha : scalar (default=1.)
Parameter that controls the proportion of LDA vs PCA.
If alpha=0, PLDA functions like LDA. If alpha is large, PLDA functions
more like PCA.
n_components : int or None (default=None)
Number of components to keep. If n_components is not set, all
components are kept: n_components == min(n_samples, n_features).
Attributes
----------
components_ : array, shape (n_components, n_features)
Axes in the feature space. The components are sorted by the explained
variance.
explained_variance_ : array, shape (n_components,)
The amount of variance explained by each of the selected components.
explained_variance_ratio_ : array, shape(n_components,)
Proportion of variance explained by each of the selected components.
If n_components is not set then all components are stored and the sum
of explained variance ratios is equal to 1.0.
mean_ : array, shape (n_features,)
Per-feature empirical mean, estimated from the training set.
n_components_ : int
The number of components.
coef_ : array, shape (n_features,) or (n_classes, n_features)
Weight vector(s).
intercept_ : array, shape (n_features,)
Intercept term.
class_means_ : array, shape (n_classes, n_features)
Class means, estimated from the training set.
classes_ : array, shape (n_classes,)
Unique class labels.
References
----------
W. Wang et al. Penalized Fisher Discriminant Analysis and its Application
to Image-Based Morphometry. Pattern Recognit. Lett., 32(15):2128-35, 2011
"""
def __init__(self, alpha=1., n_components=None):
self.is_fitted = False
self.alpha = alpha
self.n_components_ = n_components
def _check_is_fitted(self):
if not self.is_fitted:
raise AssertionError("The fit function has not been "
"called yet. Call 'fit' before using "
"this method".format(type(self).__name__))
def _class_means(self, X, y):
"""
Compute class means.
Parameters
----------
X : array, shape (n_samples, n_features)
Input data.
y : array, shape (n_samples,)
Target values.
Returns
-------
means : array, shape (n_features,)
Class means.
"""
means = []
classes = np.unique(y)
for group in classes:
Xg = X[y==group,:]
means.append(Xg.mean(0))
return np.asarray(means)
def _solve_eigen(self, X, y):
"""
Eigenvalue solver.
Parameters
----------
X : array, shape (n_samples, n_features)
Training data.
y : array, shape (n_samples,)
Target values.
"""
# Data dimensions
n_samp, n_feat = X.shape
# Initialize between-class and within-class scatter matrices
S_B = np.zeros((n_feat,n_feat))
S_W = np.zeros((n_feat,n_feat))
# Loop over each class label
for i,yi in enumerate(self.classes_):
# Get indices of data in this class
ind = (y==yi)
ni = ind.sum()
# Class mean and centered class data
Xi_cent = X[ind] - np.tile(self.class_means_[i], (ni,1))
# Get difference of means as column vector (so transpose will work)
diff = (self.class_means_[i]-self.mean_).reshape((n_feat,1))
# Update scatter matrices
S_W += Xi_cent.T.dot(Xi_cent)
S_B += float(ni) * diff.dot(diff.T)
# PLDA solution is given by generalized eigenvalue decomposition:
# S_T w = lambda*(S_W + alpha*I) w, where S_T = S_B + S_W
alpha_I = self.alpha*np.identity(n_feat)
evals, evecs = eigh(S_B+S_W, b=S_W+alpha_I)
# Sort the eigenvalues (eigenvectors) in descending order
self.explained_variance_ratio_ = np.sort(evals / np.sum(evals)
)[::-1][:self.n_components_]
evecs = evecs[:,np.argsort(evals)[::-1]]
evecs /= np.linalg.norm(evecs, axis=0)
# Set the components for transforming data, and coefficients and
# intercept for classification
self.components_ = evecs[:,:self.n_components_].T
self.coef_ = np.dot(self.class_means_, evecs).dot(evecs.T)
self.intercept_ = -0.5 * np.diag(self.class_means_.dot(self.coef_.T))
return
[docs] def fit(self, X, y):
"""
Fit PLDA model according to the given training data and parameters.
Parameters
----------
X : array, shape (n_samples, n_features)
Training data.
y : array, shape (n_samples,)
Target values.
"""
# Check input arrays
X = check_array(X, ndim=2, dtype='numeric', force_all_finite=True)
y = check_array(y.astype(int), ndim=1, dtype='numeric',
force_all_finite=True)
# Check input arrays are same length
if X.shape[0] != y.size:
raise ValueError("Number of samples in X and y must be the same: "
"{} vs {}".format(X.shape[0], y.size))
# Check that n_components does not exceed maximum possible
max_components = min(X.shape)
if self.n_components_ is None:
self.n_components_ = max_components
elif self.n_components_ > max_components:
self.n_components_ = max_components
warnings.warn("n_components exceeds maximum possible components. "
"Setting n_components = {}".format(max_components))
# Set useful data attributes
self.classes_ = np.unique(y)
self.mean_ = X.mean(axis=0)
self.class_means_ = self._class_means(X, y)
self._solve_eigen(X, y)
# Adjust coefficients and intercept for binary classification problems
if self.classes_.size == 2:
self.coef_ = np.array(self.coef_[1,:] - self.coef_[0,:], ndmin=2)
self.intercept_ = np.array(self.intercept_[1]-self.intercept_[0],
ndmin=1)
# Transform data so we can get the explained variance
self.explained_variance_ = np.dot(X, self.components_.T).var(axis=0)
self.is_fitted = True
return
[docs] def decision_function(self, X):
"""
Predict confidence scores for samples.
The confidence score for a sample is the signed distance of that
sample to the hyperplane.
Parameters
----------
X : array, shape (n_samples, n_features)
Input data.
Returns
-------
scores : array, shape=(n_samples,) if n_classes == 2
else (n_samples, n_classes)
Confidence scores per (sample, class) combination. In the binary
case, confidence score for self.classes_[1] where >0 means this
class would be predicted.
"""
self._check_is_fitted()
# Check input array
X = check_array(X, ndim=2, dtype='numeric', force_all_finite=True)
# Check number of features is correct
n_feat = self.components_.shape[1]
if X.shape[1] != n_feat:
raise ValueError("X has {} features per sample."
"Expecting {}".format(X.shape[1], n_feat))
scores = np.dot(X, self.coef_.T) + self.intercept_
return scores.ravel() if scores.shape[1] == 1 else scores
[docs] def predict(self, X):
"""
Predict class labels for samples in X.
Parameters
----------
X : array, shape (n_samples, n_features)
Input data.
Returns
-------
C : array, shape (n_samples,)
Predicted class label per sample.
"""
scores = self.decision_function(X)
if len(scores.shape) == 1:
ind = (scores > 0).astype(np.int)
else:
ind = scores.argmax(axis=1)
return self.classes_[ind]
[docs] def predict_proba(self, X):
"""
Estimate probability.
Parameters
----------
X : array, shape (n_samples, n_features)
Input data.
Returns
-------
C : array, shape (n_samples, n_classes)
Estimated probabilities.
"""
prob = self.decision_function(X)
prob *= -1
np.exp(prob, prob)
prob += 1
np.reciprocal(prob, prob)
if len(self.classes_) == 2:
# Binary case
return np.column_stack([1-prob, prob]).T
else:
# One-vs-rest normalization
prob /= prob.sum(axis=1).reshape((prob.shape[0],-1))
return prob
[docs] def predict_log_proba(self, X):
"""
Estimate log probability.
Parameters
----------
X : array, shape (n_samples, n_features)
Input data.
Returns
-------
C : array, shape (n_samples, n_classes)
Estimated log probabilities.
"""
return np.log(self.predict_proba(X))
[docs] def score(self, X, y, sample_weight=None):
"""
Returns the mean accuracy on the given test data and labels.
Parameters
----------
X : array, shape (n_samples, n_features)
Test samples.
y : array, shape (n_samples,)
True labels for X.
sample_weight : array, shape (n_samples,), optional
Sample weights.
Returns
-------
score : float
Mean accuracy of self.predict(X) w.r.t. y.
"""
# Check input arrays
X = check_array(X, ndim=2, dtype='numeric', force_all_finite=True)
y = check_array(y.astype(int), ndim=1, dtype='numeric',
force_all_finite=True)
# Check input arrays are same length
if X.shape[0] != y.size:
raise ValueError("Number of samples in X and y must be the same: "
"{} vs {}".format(X.shape[0], y.size))
return accuracy_score(y, self.predict(X), sample_weight=sample_weight)