Source code for pytranskit.optrans.decomposition.plda

import warnings
import numpy as np
from scipy.linalg import eigh
from sklearn.metrics import accuracy_score
from sklearn.base import BaseEstimator

from ..utils import check_array


[docs]class PLDA(BaseEstimator): """ Penalized Linear Discriminant Analysis. This is both a dimensionality reduction method and a linear classifier. Parameters ---------- alpha : scalar (default=1.) Parameter that controls the proportion of LDA vs PCA. If alpha=0, PLDA functions like LDA. If alpha is large, PLDA functions more like PCA. n_components : int or None (default=None) Number of components to keep. If n_components is not set, all components are kept: n_components == min(n_samples, n_features). Attributes ---------- components_ : array, shape (n_components, n_features) Axes in the feature space. The components are sorted by the explained variance. explained_variance_ : array, shape (n_components,) The amount of variance explained by each of the selected components. explained_variance_ratio_ : array, shape(n_components,) Proportion of variance explained by each of the selected components. If n_components is not set then all components are stored and the sum of explained variance ratios is equal to 1.0. mean_ : array, shape (n_features,) Per-feature empirical mean, estimated from the training set. n_components_ : int The number of components. coef_ : array, shape (n_features,) or (n_classes, n_features) Weight vector(s). intercept_ : array, shape (n_features,) Intercept term. class_means_ : array, shape (n_classes, n_features) Class means, estimated from the training set. classes_ : array, shape (n_classes,) Unique class labels. References ---------- W. Wang et al. Penalized Fisher Discriminant Analysis and its Application to Image-Based Morphometry. Pattern Recognit. Lett., 32(15):2128-35, 2011 """ def __init__(self, alpha=1., n_components=None): self.is_fitted = False self.alpha = alpha self.n_components_ = n_components def _check_is_fitted(self): if not self.is_fitted: raise AssertionError("The fit function has not been " "called yet. Call 'fit' before using " "this method".format(type(self).__name__)) def _class_means(self, X, y): """ Compute class means. Parameters ---------- X : array, shape (n_samples, n_features) Input data. y : array, shape (n_samples,) Target values. Returns ------- means : array, shape (n_features,) Class means. """ means = [] classes = np.unique(y) for group in classes: Xg = X[y==group,:] means.append(Xg.mean(0)) return np.asarray(means) def _solve_eigen(self, X, y): """ Eigenvalue solver. Parameters ---------- X : array, shape (n_samples, n_features) Training data. y : array, shape (n_samples,) Target values. """ # Data dimensions n_samp, n_feat = X.shape # Initialize between-class and within-class scatter matrices S_B = np.zeros((n_feat,n_feat)) S_W = np.zeros((n_feat,n_feat)) # Loop over each class label for i,yi in enumerate(self.classes_): # Get indices of data in this class ind = (y==yi) ni = ind.sum() # Class mean and centered class data Xi_cent = X[ind] - np.tile(self.class_means_[i], (ni,1)) # Get difference of means as column vector (so transpose will work) diff = (self.class_means_[i]-self.mean_).reshape((n_feat,1)) # Update scatter matrices S_W += Xi_cent.T.dot(Xi_cent) S_B += float(ni) * diff.dot(diff.T) # PLDA solution is given by generalized eigenvalue decomposition: # S_T w = lambda*(S_W + alpha*I) w, where S_T = S_B + S_W alpha_I = self.alpha*np.identity(n_feat) evals, evecs = eigh(S_B+S_W, b=S_W+alpha_I) # Sort the eigenvalues (eigenvectors) in descending order self.explained_variance_ratio_ = np.sort(evals / np.sum(evals) )[::-1][:self.n_components_] evecs = evecs[:,np.argsort(evals)[::-1]] evecs /= np.linalg.norm(evecs, axis=0) # Set the components for transforming data, and coefficients and # intercept for classification self.components_ = evecs[:,:self.n_components_].T self.coef_ = np.dot(self.class_means_, evecs).dot(evecs.T) self.intercept_ = -0.5 * np.diag(self.class_means_.dot(self.coef_.T)) return
[docs] def fit(self, X, y): """ Fit PLDA model according to the given training data and parameters. Parameters ---------- X : array, shape (n_samples, n_features) Training data. y : array, shape (n_samples,) Target values. """ # Check input arrays X = check_array(X, ndim=2, dtype='numeric', force_all_finite=True) y = check_array(y.astype(int), ndim=1, dtype='numeric', force_all_finite=True) # Check input arrays are same length if X.shape[0] != y.size: raise ValueError("Number of samples in X and y must be the same: " "{} vs {}".format(X.shape[0], y.size)) # Check that n_components does not exceed maximum possible max_components = min(X.shape) if self.n_components_ is None: self.n_components_ = max_components elif self.n_components_ > max_components: self.n_components_ = max_components warnings.warn("n_components exceeds maximum possible components. " "Setting n_components = {}".format(max_components)) # Set useful data attributes self.classes_ = np.unique(y) self.mean_ = X.mean(axis=0) self.class_means_ = self._class_means(X, y) self._solve_eigen(X, y) # Adjust coefficients and intercept for binary classification problems if self.classes_.size == 2: self.coef_ = np.array(self.coef_[1,:] - self.coef_[0,:], ndmin=2) self.intercept_ = np.array(self.intercept_[1]-self.intercept_[0], ndmin=1) # Transform data so we can get the explained variance self.explained_variance_ = np.dot(X, self.components_.T).var(axis=0) self.is_fitted = True return
[docs] def transform(self, X): """ Transform data. Parameters ---------- X : array, shape (n_samples, n_features) Input data. Returns ------- X_new : array, shape (n_samples, n_components) Transformed data. """ self._check_is_fitted() # Check input arrays X = check_array(X, ndim=2, dtype='numeric', force_all_finite=True) # Check number of features is correct n_feat = self.components_.shape[1] if X.shape[1] != n_feat: raise ValueError("X has {} features per sample." "Expecting {}".format(X.shape[1], n_feat)) # Transform data X_new = np.dot(X, self.components_.T) return X_new[:,:self.n_components_]
[docs] def fit_transform(self, X, y): """ Fit the model with X and transform X. Parameters ---------- X : array, shape (n_samples, n_features) Training data. y : array, shape (n_samples,) Target values. Returns ------- X_new : array, shape (n_samples, n_components) Transformed data. """ self.fit(X, y) return self.transform(X)
[docs] def inverse_transform(self, X): """ Transform data back to its original space. Note: If n_components is less than the maximum, information will be lost, so reconstructed data will not exactly match the original data. Parameters ---------- X : array shape (n_samples, n_components) New data. Returns ------- X_original : array, shape (n_samples, n_features) Data transformed back into original space. """ self._check_is_fitted() X = check_array(X, ndim=2) # Check data dimensions if X.shape[1] != self.n_components_: raise ValueError("X has {} features per sample." "Expecting {}".format(X.shape[1], self.n_components_)) # Inverse transform X_original = self.mean_ + np.dot(X, self.components_) return X_original
[docs] def decision_function(self, X): """ Predict confidence scores for samples. The confidence score for a sample is the signed distance of that sample to the hyperplane. Parameters ---------- X : array, shape (n_samples, n_features) Input data. Returns ------- scores : array, shape=(n_samples,) if n_classes == 2 else (n_samples, n_classes) Confidence scores per (sample, class) combination. In the binary case, confidence score for self.classes_[1] where >0 means this class would be predicted. """ self._check_is_fitted() # Check input array X = check_array(X, ndim=2, dtype='numeric', force_all_finite=True) # Check number of features is correct n_feat = self.components_.shape[1] if X.shape[1] != n_feat: raise ValueError("X has {} features per sample." "Expecting {}".format(X.shape[1], n_feat)) scores = np.dot(X, self.coef_.T) + self.intercept_ return scores.ravel() if scores.shape[1] == 1 else scores
[docs] def predict(self, X): """ Predict class labels for samples in X. Parameters ---------- X : array, shape (n_samples, n_features) Input data. Returns ------- C : array, shape (n_samples,) Predicted class label per sample. """ scores = self.decision_function(X) if len(scores.shape) == 1: ind = (scores > 0).astype(np.int) else: ind = scores.argmax(axis=1) return self.classes_[ind]
[docs] def predict_proba(self, X): """ Estimate probability. Parameters ---------- X : array, shape (n_samples, n_features) Input data. Returns ------- C : array, shape (n_samples, n_classes) Estimated probabilities. """ prob = self.decision_function(X) prob *= -1 np.exp(prob, prob) prob += 1 np.reciprocal(prob, prob) if len(self.classes_) == 2: # Binary case return np.column_stack([1-prob, prob]).T else: # One-vs-rest normalization prob /= prob.sum(axis=1).reshape((prob.shape[0],-1)) return prob
[docs] def predict_log_proba(self, X): """ Estimate log probability. Parameters ---------- X : array, shape (n_samples, n_features) Input data. Returns ------- C : array, shape (n_samples, n_classes) Estimated log probabilities. """ return np.log(self.predict_proba(X))
[docs] def score(self, X, y, sample_weight=None): """ Returns the mean accuracy on the given test data and labels. Parameters ---------- X : array, shape (n_samples, n_features) Test samples. y : array, shape (n_samples,) True labels for X. sample_weight : array, shape (n_samples,), optional Sample weights. Returns ------- score : float Mean accuracy of self.predict(X) w.r.t. y. """ # Check input arrays X = check_array(X, ndim=2, dtype='numeric', force_all_finite=True) y = check_array(y.astype(int), ndim=1, dtype='numeric', force_all_finite=True) # Check input arrays are same length if X.shape[0] != y.size: raise ValueError("Number of samples in X and y must be the same: " "{} vs {}".format(X.shape[0], y.size)) return accuracy_score(y, self.predict(X), sample_weight=sample_weight)
[docs] def predict_transformed(self, X_trans): """ Predict class labels for data that have already been transformed by self.transform(X). This is useful for plotting classification boundaries. Note: Due to arithemtic discrepancies, this may return slightly different class labels to self.predict(X). Parameters ---------- X_trans : array, shape (n_samples, n_components) Test samples that have already been transformed into PLDA space. Returns ------- y : array, shape (n_samples,) Predicted class labels for X_trans. """ self._check_is_fitted() # Check input array X_trans = check_array(X_trans, ndim=2) # Make sure this is a set of transformed data if X_trans.shape[1] != self.n_components_: raise ValueError("Number of features in X_trans must match " "n_components: {}".format(self.n_components_)) # Transform class means into PLDA space mean_trans = self.transform(self.class_means_) # Initialize useful values n_samples = X_trans.shape[0] n_classes = mean_trans.shape[0] dists = np.zeros((n_samples,n_classes)) # Compute the distance between each data sample and each class mean for i,mean in enumerate(mean_trans): dists[:,i] = np.linalg.norm(X_trans-np.tile(mean,(n_samples,1)), axis=1) # Classification based on shortest distance to class mean ind = dists.argmin(axis=1) return self.classes_[ind]