# Source code for graspologic.embed.svd

# Copyright (c) Microsoft Corporation and contributors.
# Licensed under the MIT License.

from typing import Optional, Union

import numpy as np
import scipy
import scipy.sparse as sp
import sklearn
from scipy.sparse import csr_array
from scipy.stats import norm
from typing_extensions import Literal

from graspologic.types import List, Tuple

SvdAlgorithmType = Literal["full", "truncated", "randomized", "eigsh"]


def _compute_likelihood(arr: np.ndarray) -> np.ndarray:
    """
    Computes the log likelihoods based on normal distribution given
    a 1d-array of sorted values. If the input has no variance,
    the likelihood will be nan.
    """
    n_elements = len(arr)
    likelihoods = np.zeros(n_elements)

    for idx in range(1, n_elements + 1):
        # split into two samples
        s1 = arr[:idx]
        s2 = arr[idx:]

        # deal with when input only has 2 elements
        if (s1.size == 1) & (s2.size == 1):
            likelihoods[idx - 1] = -np.inf
            continue

        # compute means
        mu1 = np.mean(s1)
        if s2.size != 0:
            mu2 = np.mean(s2)
        else:
            # Prevent numpy warning for taking mean of empty array
            mu2 = -np.inf

        # compute pooled variance
        variance = (np.sum((s1 - mu1) ** 2) + np.sum((s2 - mu2) ** 2)) / (
            n_elements - 1 - (idx < n_elements)
        )
        std = np.sqrt(variance)

        # compute log likelihoods
        likelihoods[idx - 1] = np.sum(norm.logpdf(s1, loc=mu1, scale=std)) + np.sum(
            norm.logpdf(s2, loc=mu2, scale=std)
        )

    return likelihoods


def select_dimension(
    X: Union[np.ndarray, sp.csr_array],
    n_components: Optional[int] = None,
    n_elbows: int = 2,
    threshold: Optional[float] = None,
    return_likelihoods: bool = False,
) -> Union[
    Tuple[List[int], List[float]], Tuple[List[int], List[float], List[np.ndarray]]
]:
    """
    Generates profile likelihood from array based on Zhu and Ghodsi method.
    Elbows correspond to the optimal embedding dimension.

    Parameters
    ----------
    X : 1d or 2d array-like
        Input array generate profile likelihoods for. If 1d-array, it should be
        sorted in decreasing order. If 2d-array, shape should be
        (n_samples, n_features).
    n_components : int, optional, default: None.
        Number of components to embed. If None,
        ``n_components = ceil(log2(min(n_samples, n_features)))``.
        Ignored if ``X`` is 1d-array.
    n_elbows : int, optional, default: 2.
        Number of likelihood elbows to return. Must be ``>= 1``.
    threshold : float, int, optional, default: None
        If given, only consider the singular values that are ``> threshold``.
        Must be ``>= 0``.
    return_likelihoods : bool, optional, default: False
        If True, returns the all likelihoods associated with each elbow.

    Returns
    -------
    elbows : list
        Elbows indicate subsequent optimal embedding dimensions. Number of
        elbows may be less than ``n_elbows`` if there are not enough singular
        values.
    sing_vals : list
        The singular values associated with each elbow.
    likelihoods : list of array-like
        Array of likelihoods of the corresponding to each elbow. Only returned
        if ``return_likelihoods`` is True.

    References
    ----------
    .. [1] Zhu, M. and Ghodsi, A. (2006).
        Automatic dimensionality selection from the scree plot via the use of
        profile likelihood. Computational Statistics & Data Analysis, 51(2),
        pp.918-930.
    """
    # Handle input data
    if not isinstance(X, (np.ndarray, csr_array)):
        msg = "X must be a numpy array or scipy.sparse.csr_array, not {}.".format(
            type(X)
        )
        raise ValueError(msg)
    if X.ndim > 2:
        msg = "X must be a 1d or 2d-array, not {}d array.".format(X.ndim)
        raise ValueError(msg)
    elif np.min(X.shape) <= 1:
        msg = "X must have more than 1 samples or 1 features."
        raise ValueError(msg)

    # Handle n_elbows
    if not isinstance(n_elbows, int):
        msg = "n_elbows must be an integer, not {}.".format(type(n_elbows))
        raise ValueError(msg)
    elif n_elbows < 1:
        # message matches the check above (n_elbows == 1 is accepted)
        msg = f"number of elbows should be an integer >= 1, not {n_elbows}."
        raise ValueError(msg)

    # Handle threshold
    if threshold is not None:
        if not isinstance(threshold, (int, float)):
            msg = "threshold must be an integer or a float, not {}.".format(
                type(threshold)
            )
            raise ValueError(msg)
        elif threshold < 0:
            msg = "threshold must be >= 0, not {}.".format(threshold)
            raise ValueError(msg)

    # Handle n_components
    if n_components is None:
        # per recommendation by Zhu & Ghodsi
        k = int(np.ceil(np.log2(np.min(X.shape))))
    elif not isinstance(n_components, int):
        msg = "n_components must be an integer, not {}.".format(type(n_components))
        raise ValueError(msg)
    else:
        k = n_components

    # Check to see if svd is needed: a 1d input is treated as the singular
    # values themselves, a 2d input is decomposed first.
    if X.ndim == 1:
        D = np.sort(X)[::-1]
    elif X.ndim == 2:
        # Singular values in decreasing order
        D = scipy.sparse.linalg.svds(A=X, k=k, return_singular_vectors=False)
        D = np.sort(D)[::-1]

    if threshold is not None:
        D = D[D > threshold]

        if len(D) == 0:
            msg = "No values greater than threshold {}."
            raise IndexError(msg.format(threshold))

    # Find up to n_elbows successive elbows; each search restarts on the
    # tail of singular values past the previous elbow.
    idx = 0
    elbows = []
    values = []
    likelihoods = []
    for _ in range(n_elbows):
        arr = D[idx:]
        if arr.size <= 1:  # cannot compute likelihoods for fewer than 2 values
            break
        lq = _compute_likelihood(arr)
        idx += np.argmax(lq).item() + 1
        elbows.append(idx)
        values.append(D[idx - 1])
        likelihoods.append(lq)

    if return_likelihoods:
        return elbows, values, likelihoods
    else:
        return elbows, values
def select_svd(
    X: Union[np.ndarray, sp.csr_array],
    n_components: Optional[int] = None,
    n_elbows: Optional[int] = 2,
    algorithm: "SvdAlgorithmType" = "randomized",
    n_iter: int = 5,
    svd_seed: Optional[int] = None,
) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
    r"""
    Dimensionality reduction using SVD.

    Performs linear dimensionality reduction by using either full singular
    value decomposition (SVD) or truncated SVD. Full SVD is performed using
    SciPy's wrapper for ARPACK, while truncated SVD is performed using either
    SciPy's wrapper for LAPACK or Sklearn's implementation of randomized SVD.

    It also performs optimal dimensionality selection using Zhu & Ghodsi
    algorithm if number of target dimension is not specified.

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)
        The data to perform svd on.
    n_components : int or None, default = None
        Desired dimensionality of output data. If "full",
        ``n_components`` must be ``<= min(X.shape)``. Otherwise,
        ``n_components`` must be ``< min(X.shape)``. If None, then optimal
        dimensions will be chosen by :func:`~graspologic.embed.select_dimension`
        using ``n_elbows`` argument.
    n_elbows : int, optional, default: 2
        If ``n_components`` is None, then compute the optimal embedding
        dimension using :func:`~graspologic.embed.select_dimension`.
        Otherwise, ignored.
    algorithm : {'randomized' (default), 'full', 'truncated', 'eigsh'}, optional
        SVD solver to use:

        - 'randomized'
            Computes randomized svd using
            :func:`sklearn.utils.extmath.randomized_svd`
        - 'full'
            Computes full svd using :func:`scipy.linalg.svd`
            Does not support ``graph`` input of type scipy.sparse.csr_array
        - 'truncated'
            Computes truncated svd using :func:`scipy.sparse.linalg.svds`
        - 'eigsh'
            Computes svd of a real, symmetric square matrix using
            :func:`scipy.sparse.linalg.eigsh`. Extremely fast for these types
            of matrices.
    n_iter : int, optional (default = 5)
        Number of iterations for randomized SVD solver. Not used by 'full' or
        'truncated'. The default is larger than the default in randomized_svd
        to handle sparse matrices that may have large slowly decaying spectrum.
    svd_seed : int or None (default ``None``)
        Only applicable for ``algorithm="randomized"``; allows you to seed the
        randomized svd solver for deterministic, albeit pseudo-randomized
        behavior.

    Returns
    -------
    U : array-like, shape (n_samples, n_components)
        Left singular vectors corresponding to singular values.
    D : array-like, shape (n_components)
        Singular values in decreasing order, as a 1d array.
    V : array-like, shape (n_components, n_samples)
        Right singular vectors corresponding to singular values.

    References
    ----------
    .. [1] Zhu, M. and Ghodsi, A. (2006).
        Automatic dimensionality selection from the scree plot via the use of
        profile likelihood. Computational Statistics & Data Analysis, 51(2),
        pp.918-930.
    """
    # Added in order to pass check estimator, must include words "one sample"
    if X.shape[0] == 1:
        msg = "Input data has only one sample (node)"
        raise ValueError(msg)

    # Deal with algorithms
    if algorithm not in ["full", "truncated", "randomized", "eigsh"]:
        msg = "algorithm must be one of {full, truncated, randomized, eigsh}."
        raise ValueError(msg)

    # sp.issparse covers both the legacy spmatrix and the newer sparse-array
    # API; isspmatrix_csr returns False for csr_array, which would silently
    # skip this guard for the very type the signature advertises.
    if algorithm == "full" and sp.issparse(X):
        msg = "'full' algorithm does not support scipy.sparse.csr_array inputs."
        raise TypeError(msg)

    if n_components is None:
        if n_elbows is None:
            raise ValueError(
                "both n_components and n_elbows are None. One must be provided."
            )
        else:
            dims = select_dimension(X, n_elbows=n_elbows, threshold=None)
            elbows = dims[0]
            n_components = elbows[-1]

    # Check that the requested rank is achievable by the chosen solver
    if algorithm == "full" and n_components > min(X.shape):
        msg = "n_components must be <= min(X.shape)."
        raise ValueError(msg)

    if algorithm in ["truncated", "randomized"] and n_components >= min(X.shape):
        msg = "n_components must be strictly < min(X.shape)."
        raise ValueError(msg)

    if algorithm == "full":
        U, D, V = scipy.linalg.svd(X)
        U = U[:, :n_components]
        D = D[:n_components]
        V = V[:n_components, :]

    elif algorithm == "truncated":
        U, D, V = scipy.sparse.linalg.svds(X, k=n_components)
        idx = np.argsort(D)[::-1]  # sort in decreasing order
        D = D[idx]
        U = U[:, idx]
        V = V[idx, :]

    elif algorithm == "eigsh":
        D, U = scipy.sparse.linalg.eigsh(X, k=n_components)
        # singular values of a real symmetric matrix are the absolute values
        # of its eigenvalues, so need to take np.abs
        D = np.abs(D)
        V = U.T

        # sort in decreasing order
        idx = np.argsort(D)[::-1]
        D = D[idx]
        U = U[:, idx]
        V = V[idx, :]

    else:  # algorithm == "randomized", validated above
        # for some reason, randomized_svd defaults random_state to 0 if not
        # provided, which is weird because None is a valid starting point too
        svd_seed = svd_seed if svd_seed is not None else 0
        U, D, V = sklearn.utils.extmath.randomized_svd(
            X, n_components, n_iter=n_iter, random_state=svd_seed
        )

    return U, D, V