Source code for graspologic.embed.svd

# Copyright (c) Microsoft Corporation and contributors.
# Licensed under the MIT License.

from typing import Optional, Union

import numpy as np
import scipy
import scipy.sparse as sp
import sklearn
from scipy.sparse import csr_array
from scipy.stats import norm
from typing_extensions import Literal

from graspologic.types import List, Tuple

SvdAlgorithmType = Literal["full", "truncated", "randomized", "eigsh"]


def _compute_likelihood(arr: np.ndarray) -> np.ndarray:
    """
    Computes the log likelihoods based on normal distribution given
    a 1d-array of sorted values. If the input has no variance,
    the likelihood will be nan.
    """
    n_elements = len(arr)
    likelihoods = np.zeros(n_elements)

    for idx in range(1, n_elements + 1):
        # split into two samples
        s1 = arr[:idx]
        s2 = arr[idx:]

        # deal with when input only has 2 elements
        if (s1.size == 1) & (s2.size == 1):
            likelihoods[idx - 1] = -np.inf
            continue

        # compute means
        mu1 = np.mean(s1)
        if s2.size != 0:
            mu2 = np.mean(s2)
        else:
            # Prevent numpy warning for taking mean of empty array
            mu2 = -np.inf

        # compute pooled variance
        variance = (np.sum((s1 - mu1) ** 2) + np.sum((s2 - mu2) ** 2)) / (
            n_elements - 1 - (idx < n_elements)
        )
        std = np.sqrt(variance)

        # compute log likelihoods
        likelihoods[idx - 1] = np.sum(norm.logpdf(s1, loc=mu1, scale=std)) + np.sum(
            norm.logpdf(s2, loc=mu2, scale=std)
        )

    return likelihoods



[docs]
def select_dimension(
    X: Union[np.ndarray, sp.csr_array],
    n_components: Optional[int] = None,
    n_elbows: int = 2,
    threshold: Optional[float] = None,
    return_likelihoods: bool = False,
) -> Union[
    Tuple[List[int], List[float]], Tuple[List[int], List[float], List[np.ndarray]]
]:
    """
    Generates profile likelihood from array based on Zhu and Godsie method.
    Elbows correspond to the optimal embedding dimension.

    Parameters
    ----------
    X : 1d or 2d array-like
        Input array generate profile likelihoods for. If 1d-array, it should be
        sorted in decreasing order. If 2d-array, shape should be
        (n_samples, n_features).
    n_components : int, optional, default: None.
        Number of components to embed. If None, ``n_components =
        floor(log2(min(n_samples, n_features)))``. Ignored if ``X`` is 1d-array.
    n_elbows : int, optional, default: 2.
        Number of likelihood elbows to return. Must be ``> 1``.
    threshold : float, int, optional, default: None
        If given, only consider the singular values that are ``> threshold``. Must
        be ``>= 0``.
    return_likelihoods : bool, optional, default: False
        If True, returns the all likelihoods associated with each elbow.

    Returns
    -------
    elbows : list
        Elbows indicate subsequent optimal embedding dimensions. Number of
        elbows may be less than ``n_elbows`` if there are not enough singular
        values.
    sing_vals : list
        The singular values associated with each elbow.
    likelihoods : list of array-like
        Array of likelihoods of the corresponding to each elbow. Only returned
        if ``return_likelihoods`` is True.

    References
    ----------
    .. [1] Zhu, M. and Ghodsi, A. (2006).
        Automatic dimensionality selection from the scree plot via the use of
        profile likelihood. Computational Statistics & Data Analysis, 51(2),
        pp.918-930.
    """
    # Handle input data
    if not isinstance(X, (np.ndarray, csr_array)):
        msg = "X must be a numpy array or scipy.sparse.csr_array, not {}.".format(
            type(X)
        )
        raise ValueError(msg)
    if X.ndim > 2:
        msg = "X must be a 1d or 2d-array, not {}d array.".format(X.ndim)
        raise ValueError(msg)
    elif np.min(X.shape) <= 1:
        msg = "X must have more than 1 samples or 1 features."
        raise ValueError(msg)

    # Handle n_elbows
    if not isinstance(n_elbows, int):
        msg = "n_elbows must be an integer, not {}.".format(type(n_elbows))
        raise ValueError(msg)
    elif n_elbows < 1:
        msg = f"number of elbows should be an integer > 1, not {n_elbows}."
        raise ValueError(msg)

    # Handle threshold
    if threshold is not None:
        if not isinstance(threshold, (int, float)):
            msg = "threshold must be an integer or a float, not {}.".format(
                type(threshold)
            )
            raise ValueError(msg)
        elif threshold < 0:
            msg = "threshold must be >= 0, not {}.".format(threshold)
            raise ValueError(msg)

    # Handle n_components
    if n_components is None:
        # per recommendation by Zhu & Godsie
        k = int(np.ceil(np.log2(np.min(X.shape))))
    elif not isinstance(n_components, int):
        msg = "n_components must be an integer, not {}.".format(type(n_components))
        raise ValueError(msg)
    else:
        k = n_components

    # Check to see if svd is needed
    if X.ndim == 1:
        D = np.sort(X)[::-1]
    elif X.ndim == 2:
        # Singular values in decreasing order
        D = scipy.sparse.linalg.svds(A=X, k=k, return_singular_vectors=False)
        D = np.sort(D)[::-1]
        # U, D, V = sklearn.utils.extmath.randomized_svd()

    if threshold is not None:
        D = D[D > threshold]

        if len(D) == 0:
            msg = "No values greater than threshold {}."
            raise IndexError(msg.format(threshold))

    idx = 0
    elbows = []
    values = []
    likelihoods = []
    for _ in range(n_elbows):
        arr = D[idx:]
        if arr.size <= 1:  # Cant compute likelihoods with 1 numbers
            break
        lq = _compute_likelihood(arr)
        idx += np.argmax(lq).item() + 1
        elbows.append(idx)
        values.append(D[idx - 1])
        likelihoods.append(lq)

    if return_likelihoods:
        return elbows, values, likelihoods
    else:
        return elbows, values




[docs]
def select_svd(
    X: Union[np.ndarray, sp.csr_array],
    n_components: Optional[int] = None,
    n_elbows: Optional[int] = 2,
    algorithm: SvdAlgorithmType = "randomized",
    n_iter: int = 5,
    svd_seed: Optional[int] = None,
) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
    r"""
    Dimensionality reduction using SVD.

    Performs linear dimensionality reduction by using either full singular
    value decomposition (SVD) or truncated SVD. Full SVD is performed using
    SciPy's wrapper for ARPACK, while truncated SVD is performed using either
    SciPy's wrapper for LAPACK or Sklearn's implementation of randomized SVD.

    It also performs optimal dimensionality selection using Zhu & Godsie algorithm
    if number of target dimension is not specified.

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)
        The data to perform svd on.
    n_components : int or None, default = None
        Desired dimensionality of output data. If "full",
        ``n_components`` must be ``<= min(X.shape)``. Otherwise, ``n_components`` must be
        ``< min(X.shape)``. If None, then optimal dimensions will be chosen by
        :func:`~graspologic.embed.select_dimension` using ``n_elbows`` argument.
    n_elbows : int, optional, default: 2
        If ``n_components`` is None, then compute the optimal embedding dimension using
        :func:`~graspologic.embed.select_dimension`. Otherwise, ignored.
    algorithm : {'randomized' (default), 'full', 'truncated'}, optional
        SVD solver to use:

        - 'randomized'
            Computes randomized svd using
            :func:`sklearn.utils.extmath.randomized_svd`
        - 'full'
            Computes full svd using :func:`scipy.linalg.svd`
            Does not support ``graph`` input of type scipy.sparse.csr_array
        - 'truncated'
            Computes truncated svd using :func:`scipy.sparse.linalg.svds`
        - 'eigsh'
            Computes svd of a real, symmetric square matrix using
            :func:`scipy.sparse.linalg.eigsh`. Extremely fast for these types of
            matrices.
    n_iter : int, optional (default = 5)
        Number of iterations for randomized SVD solver. Not used by 'full' or
        'truncated'. The default is larger than the default in randomized_svd
        to handle sparse matrices that may have large slowly decaying spectrum.

    svd_seed : int or None (default ``None``)
        Only applicable for ``algorithm="randomized"``; allows you to seed the
        randomized svd solver for deterministic, albeit pseudo-randomized behavior.

    Returns
    -------
    U : array-like, shape (n_samples, n_components)
        Left singular vectors corresponding to singular values.
    D : array-like, shape (n_components)
        Singular values in decreasing order, as a 1d array.
    V : array-like, shape (n_components, n_samples)
        Right singular vectors corresponding to singular values.

    References
    ----------
    .. [1] Zhu, M. and Ghodsi, A. (2006).
        Automatic dimensionality selection from the scree plot via the use of
        profile likelihood. Computational Statistics & Data Analysis, 51(2),
        pp.918-930.
    """
    # Added in order to pass check estimator, must include words "one sample"
    if X.shape[0] == 1:
        msg = "Input data has only one sample (node)"
        raise ValueError(msg)

    # Deal with algorithms
    if algorithm not in ["full", "truncated", "randomized", "eigsh"]:
        msg = "algorithm must be one of {full, truncated, randomized, eigsh}."
        raise ValueError(msg)

    if algorithm == "full" and sp.isspmatrix_csr(X):
        msg = "'full' agorithm does not support scipy.sparse.csr_array inputs."
        raise TypeError(msg)

    if n_components is None:
        if n_elbows is None:
            raise ValueError(
                "both n_components and n_elbows are None. One must be provided."
            )
        else:
            dims = select_dimension(X, n_elbows=n_elbows, threshold=None)
            elbows = dims[0]
            n_components = elbows[-1]

    # Check
    if (algorithm == "full") & (n_components > min(X.shape)):
        msg = "n_components must be <= min(X.shape)."
        raise ValueError(msg)

    if (algorithm in ["truncated", "randomized"]) & (n_components >= min(X.shape)):
        msg = "n_components must be strictly < min(X.shape)."
        raise ValueError(msg)

    if algorithm == "full":
        U, D, V = scipy.linalg.svd(X)
        U = U[:, :n_components]
        D = D[:n_components]
        V = V[:n_components, :]

    elif algorithm == "truncated":
        U, D, V = scipy.sparse.linalg.svds(X, k=n_components)
        idx = np.argsort(D)[::-1]  # sort in decreasing order
        D = D[idx]
        U = U[:, idx]
        V = V[idx, :]

    elif algorithm == "eigsh":
        D, U = scipy.sparse.linalg.eigsh(X, k=n_components)
        # singular values of a real symmetric matrix are the absolute values of its
        # eigenvalues, so need to take np.abs
        D = np.abs(D)
        V = U.T

        # sort in decreasing order
        idx = np.argsort(D)[::-1]
        D = D[idx]
        U = U[:, idx]
        V = V[idx, :]

    elif algorithm == "randomized":
        # for some reason, randomized_svd defaults random_state to 0 if not provided
        # which is weird because None is a valid starting point too
        svd_seed = svd_seed if svd_seed is not None else 0
        U, D, V = sklearn.utils.extmath.randomized_svd(
            X, n_components, n_iter=n_iter, random_state=svd_seed
        )

    else:
        raise ValueError(
            "algorithm must be in {'full', 'truncated', 'randomized', 'eigsh'}"
        )

    return U, D, V