The main idea of Principal Component Analysis (PCA) is to map n-dimensional features onto k dimensions. These k dimensions are brand-new orthogonal features, also called principal components, reconstructed from the original n-dimensional features. PCA works by sequentially finding a set of mutually orthogonal axes in the original space: the first new axis is the direction of greatest variance in the original data; the second is the direction of greatest variance within the subspace orthogonal to the first axis; the third is the direction of greatest variance orthogonal to the first two; and so on, until n such axes are obtained. With axes chosen this way, most of the variance is concentrated in the first k axes, while the remaining axes carry variance close to 0. We can therefore keep only the first k dimensions and achieve dimensionality reduction of the data.
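Before digging into the source, here is a minimal sketch of that idea (the toy dataset below is made up purely for illustration): project 3-D data onto its top two principal components and check how much variance survives.

```python
import numpy as np
from sklearn.decomposition import PCA

rng = np.random.RandomState(0)
# 200 samples in 3 dimensions; most variance lies along the first two axes
X = rng.randn(200, 3) * np.array([5.0, 2.0, 0.1])

pca = PCA(n_components=2)
X_reduced = pca.fit_transform(X)

print(X_reduced.shape)                # (200, 2)
print(pca.explained_variance_ratio_)  # the two kept axes carry nearly all variance
```

With the intuition in place, we can walk through scikit-learn's PCA implementation. First, the transform() method: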
```python
def transform(self, X):
    ...  # validation of the estimator and the input data, omitted here
    if self.mean_ is not None:
        X = X - self.mean_
    X_transformed = np.dot(X, self.components_.T)
    # whitening
    if self.whiten:
        X_transformed /= np.sqrt(self.explained_variance_)
    return X_transformed
```
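A quick way to see what the whitening branch does (again with synthetic data, just for illustration): dividing by the square root of the explained variance leaves every projected component with unit variance.

```python
import numpy as np
from sklearn.decomposition import PCA

rng = np.random.RandomState(0)
X = rng.randn(300, 5) * np.arange(1, 6)  # columns with very different scales

X_w = PCA(n_components=3, whiten=True).fit(X).transform(X)
print(X_w.var(axis=0, ddof=1))           # each column is ~1 after whitening
```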
The implementation of the fit_transform() method:
The relevant formulas have been thoughtfully written into the comments by the authors.
```python
def fit_transform(self, X, y=None):
    U, S, Vt = self._fit(X)
    U = U[:, : self.n_components_]

    if self.whiten:
        # X_new = X * V / S * sqrt(n_samples) = U * sqrt(n_samples)
        U *= sqrt(X.shape[0] - 1)
    else:
        # X_new = X * V = U * S * Vt * V = U * S
        U *= S[: self.n_components_]

    return U
```
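Both branches produce the projection directly from the SVD factors, saving the extra matrix product that transform() would otherwise perform. A quick consistency check (synthetic data, illustration only) confirms the two paths agree:

```python
import numpy as np
from sklearn.decomposition import PCA

rng = np.random.RandomState(0)
X = rng.randn(100, 4)

a = PCA(n_components=2).fit_transform(X)
b = PCA(n_components=2).fit(X).transform(X)
print(np.allclose(a, b))  # True; svd_flip fixes the sign ambiguity of the SVD
```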
```python
def _assess_dimension(spectrum, rank, n_samples):
    """Compute the log-likelihood of a rank ``rank`` dataset.

    The dataset is assumed to be embedded in gaussian noise of shape
    (n, dimf) having spectrum ``spectrum``.

    Parameters
    ----------
    spectrum : ndarray of shape (n_features,)
        Data spectrum.
    rank : int
        Tested rank value. It should be strictly lower than n_features,
        otherwise the method isn't specified (division by zero in equation
        (31) from the paper).
    n_samples : int
        Number of samples.

    Returns
    -------
    ll : float
        The log-likelihood.
    """
    n_features = spectrum.shape[0]
    if not 1 <= rank < n_features:
        raise ValueError("the tested rank should be in [1, n_features - 1]")

    eps = 1e-15

    if spectrum[rank - 1] < eps:
        # When the tested rank is associated with a small eigenvalue, there's
        # no point in computing the log-likelihood: it's going to be very
        # small and won't be the max anyway. Also, it can lead to numerical
        # issues below when computing pa, in particular in
        # log((spectrum[i] - spectrum[j])) because this will take the log of
        # something very small.
        return -np.inf

    pu = -rank * log(2.0)
    for i in range(1, rank + 1):
        pu += (
            gammaln((n_features - i + 1) / 2.0)
            - log(np.pi) * (n_features - i + 1) / 2.0
        )
    # ... (the remaining terms of the log-likelihood follow in the full source;
    # only the beginning is shown in this excerpt)
```
```python
def _infer_dimension(spectrum, n_samples):
    """Infers the dimension of a dataset with a given spectrum.

    The returned value will be in [1, n_features - 1].
    """
    ll = np.empty_like(spectrum)
    ll[0] = -np.inf  # we don't want to return n_components = 0
    for rank in range(1, spectrum.shape[0]):
        ll[rank] = _assess_dimension(spectrum, rank, n_samples)
    return ll.argmax()
```
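`_infer_dimension` simply evaluates `_assess_dimension` at every candidate rank and returns the argmax; this is the machinery behind `n_components='mle'`, which picks the dimensionality automatically (Minka's MLE). A small demonstration on synthetic data with a known 3-dimensional signal (expected, though not guaranteed, to recover 3):

```python
import numpy as np
from sklearn.decomposition import PCA

rng = np.random.RandomState(0)
# three strong latent directions embedded in 10-D, plus weak isotropic noise
X = rng.randn(500, 3) @ (5.0 * rng.randn(3, 10)) + 0.1 * rng.randn(500, 10)

pca = PCA(n_components="mle").fit(X)
print(pca.n_components_)  # should come out as 3 for a clear signal like this
```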
```python
def get_covariance(self):
    """Compute data covariance with the generative model.

    ``cov = components_.T * S**2 * components_ + sigma2 * eye(n_features)``
    where S**2 contains the explained variances, and sigma2 contains the
    noise variances.

    Returns
    -------
    cov : array of shape=(n_features, n_features)
        Estimated covariance of data.
    """
    components_ = self.components_
    exp_var = self.explained_variance_
    if self.whiten:
        components_ = components_ * np.sqrt(exp_var[:, np.newaxis])
    exp_var_diff = np.maximum(exp_var - self.noise_variance_, 0.0)
    cov = np.dot(components_.T * exp_var_diff, components_)
    cov.flat[:: len(cov) + 1] += self.noise_variance_  # modify diag inplace
    return cov
```
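So `get_covariance` rebuilds an estimate of the feature covariance from only the retained components plus an isotropic noise floor on the diagonal. When the discarded spectrum really is near-isotropic, this low-rank reconstruction stays close to the empirical covariance (synthetic sanity check, illustration only):

```python
import numpy as np
from sklearn.decomposition import PCA

rng = np.random.RandomState(0)
X = rng.randn(500, 6)

pca = PCA(n_components=4).fit(X)
cov = pca.get_covariance()
emp = np.cov(X, rowvar=False)
print(np.abs(cov - emp).max())  # small here, since the dropped directions are noise-like
```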
```python
def get_precision(self):
    """Compute data precision matrix with the generative model.

    Equals the inverse of the covariance but computed with the matrix
    inversion lemma for efficiency.

    Returns
    -------
    precision : array, shape=(n_features, n_features)
        Estimated precision of data.
    """
    n_features = self.components_.shape[1]

    # handle corner cases first
    if self.n_components_ == 0:
        return np.eye(n_features) / self.noise_variance_

    if np.isclose(self.noise_variance_, 0.0, atol=0.0):
        return linalg.inv(self.get_covariance())
```
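The corner cases above short-circuit; the general path (omitted from this excerpt) applies the Woodbury matrix-inversion lemma so that only an n_components x n_components system is inverted instead of the full n_features x n_features covariance. Either way, the result should match a brute-force inverse (synthetic check, illustration only):

```python
import numpy as np
from scipy import linalg
from sklearn.decomposition import PCA

rng = np.random.RandomState(0)
X = rng.randn(400, 8)

pca = PCA(n_components=3).fit(X)
precision = pca.get_precision()            # fast path via the inversion lemma
direct = linalg.inv(pca.get_covariance())  # brute-force inverse
print(np.allclose(precision, direct))      # True up to numerical error
```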