# Source code for qfeval_functions.functions.nanpca

from dataclasses import dataclass

import torch

from .eigh import eigh
from .nancovar import nancovar


@dataclass
class NanpcaResult:
    """Result container returned by :func:`nanpca`.

    Bundles the principal components with the variance explained by each
    of them.  Both tensors carry the batch dimensions of the input passed
    to :func:`nanpca`.
    """

    # Principal components, shape (*, C, C); row ``[..., i, :]`` is the
    # (i+1)-th component, ordered by decreasing explained variance.
    components: torch.Tensor
    # Eigenvalues of the covariance matrix, shape (*, C), in descending
    # order; entry ``[..., i]`` belongs to ``components[..., i, :]``.
    explained_variance: torch.Tensor


def nanpca(data: torch.Tensor) -> NanpcaResult:
    r"""Run a NaN-tolerant Principal Component Analysis (PCA).

    The covariance matrix of the features is estimated with
    :func:`nancovar`, which ignores NaN entries, and is then decomposed
    with :func:`eigh`.  The eigenvectors are the principal components and
    the eigenvalues are the variances explained by them.  Both are
    returned sorted by decreasing explained variance.

    In formulas:

    .. math::
        \mathbf{C} = \text{nancovar}(\mathbf{X})

    .. math::
        \mathbf{C} \mathbf{v}_i = \lambda_i \mathbf{v}_i

    with :math:`\mathbf{C}` the covariance matrix, :math:`\mathbf{v}_i`
    the eigenvectors (principal components) and :math:`\lambda_i` the
    eigenvalues (explained variance).

    Args:
        data (Tensor):
            Input of shape :math:`(*, N, C)`, where :math:`*` is any
            number of batch dimensions, :math:`N` the number of samples
            and :math:`C` the number of features.

    Returns:
        NanpcaResult: A dataclass with two fields:

        - ``components`` (Tensor): shape :math:`(*, C, C)`;
          ``components[..., i, :]`` is the :math:`(i+1)`-th principal
          component, ordered by decreasing explained variance.
        - ``explained_variance`` (Tensor): shape :math:`(*, C)`; the
          eigenvalues in descending order, i.e. the variance explained
          by each component.

    Example:

        >>> # Simple 2D PCA with NaN values
        >>> data = torch.tensor([[[1.0, 2.0],
        ...                       [nan, 4.0],
        ...                       [3.0, 6.0],
        ...                       [4.0, nan]]])
        >>> result = QF.nanpca(data)
        >>> result.components.shape
        torch.Size([1, 2, 2])
        >>> result.explained_variance.shape
        torch.Size([1, 2])

        >>> # Batch processing multiple datasets
        >>> data = torch.randn(2, 10, 3)  # 2 batches, 10 samples, 3 features
        >>> # Introduce some NaN values
        >>> data[0, 2, 1] = nan
        >>> data[1, 5, :] = nan
        >>> result = QF.nanpca(data)
        >>> result.components.shape
        torch.Size([2, 3, 3])
        >>> result.explained_variance.shape
        torch.Size([2, 3])

        >>> # Access first principal component
        >>> first_pc = result.components[0, 0, :]  # First batch, first component
        >>> first_variance = result.explained_variance[0, 0]  # Corresponding variance

        >>> # Simple case without convergence issues
        >>> data = torch.tensor([[[1.0, 2.0],
        ...                       [3.0, 4.0],
        ...                       [5.0, nan]]])
        >>> result = QF.nanpca(data)
        >>> result.components.shape
        torch.Size([1, 2, 2])

    .. warning::
        When too few valid (non-NaN) observations are available to
        estimate the covariance, the result may contain NaN values.
        Make sure the data coverage is adequate for reliable PCA results.

    .. seealso::
        :func:`nancovar`: NaN-aware covariance computation.
        :func:`eigh`: Eigendecomposition for symmetric matrices.
        :func:`nanmean`: NaN-aware mean used in covariance calculation.
    """
    # Remember the leading (batch) dimensions so they can be restored later.
    leading_dims = data.shape[:-2]

    # Prepend a dummy axis before merging so that there is always at least
    # one leading dimension to flatten, even for plain (N, C) input.
    samples = data[None].flatten(end_dim=-3)

    # Broadcasting (B, N, C, 1) against (B, N, 1, C) pairs every feature with
    # every other feature; reducing over the sample axis gives (B, C, C).
    feature_rows = samples[..., None]
    feature_cols = samples[..., None, :]
    covariance = nancovar(feature_rows, feature_cols, dim=-3)

    eigenvalues, eigenvectors = eigh(covariance)

    # Reverse the order delivered by eigh (presumably ascending, given the
    # documented descending output), then transpose so that each principal
    # component lies along a row instead of a column.
    ordered_vectors = eigenvectors.flip(-1)
    component_rows = ordered_vectors.transpose(-1, -2)
    variances = eigenvalues.flip(-1)

    # Swap the merged batch axis for the original leading dimensions.
    components_shape = leading_dims + component_rows.shape[-2:]
    variances_shape = leading_dims + variances.shape[-1:]
    return NanpcaResult(
        components=component_rows.reshape(components_shape),
        explained_variance=variances.reshape(variances_shape),
    )