Source code for simdesign.utils.stats

"""This module provides statistics related utility methods.
"""
# Imports from installed packages
from scipy.stats import lognorm, norm, uniform
import numpy as np
from time import gmtime
from typing import List



[docs]
def random_truncated_lognormal(size: int, mu: np.ndarray, sigma: np.ndarray,
                               lower: np.ndarray, upper: np.ndarray
                               ) -> np.ndarray:
    """Sample from a truncated log-normal distribution.

    Parameters
    ----------
    size : int
        Sample size (number of random samples to generate).
    mu : np.ndarray
        Mean of the log-normal distribution in log-space.
    sigma : np.ndarray
        Standard deviation of the log-normal distribution in log-space.
    lower : np.ndarray
        Lower truncation limit of the distribution.
    upper : np.ndarray
        Upper truncation limit of the distribution.

    Returns
    -------
    np.ndarray
        An array of random samples from the truncated log-normal distribution.

    Raises
    ------
    Exception
        If `sigma` is less than or equal to 0.
    Exception
        If `upper` is less than or equal to `lower`.

    Notes
    -----
    The truncated log-normal distribution is sampled by generating random
    probabilities within the cumulative distribution function (CDF) range
    defined by the lower and upper truncation limits.

    Examples
    --------
    >>> random_truncated_lognormal(
            100, mu=1.0, sigma=0.5, lower=2.0, upper=10.0)
    array([2.35, 4.17, 8.22, ...])  # Example output
    """
    # Validate the standard deviation
    if np.any(sigma <= 0):
        raise Exception('random_truncated_lognormal - '
                        'Fatal error: sigma <= 0.0')
    # Validate the truncation limits
    if np.any(upper <= lower):
        raise Exception(
            'random_truncated_lognormal - Fatal error: upper <= lower')
    # Calculate the cumulative probabilities at the truncation limits
    lncdf_a = lognorm(s=sigma, scale=np.exp(mu)).cdf(lower)
    lncdf_b = lognorm(s=sigma, scale=np.exp(mu)).cdf(upper)
    # Generate uniform random values scaled to the truncated CDF range
    lncdf_x = lncdf_a + (lncdf_b - lncdf_a) * np.random.rand(size)
    # Transform the cumulative probabilities back to the log-normal domain
    x = lognorm(s=sigma, scale=np.exp(mu)).ppf(lncdf_x)
    return x




[docs]
def truncated_lognormal_cdf_inv(
    cdf: np.ndarray, mu: np.ndarray, sigma: np.ndarray,
    lower: np.ndarray, upper: float
) -> np.ndarray:
    """Inverts the truncated (lower-upper) log-normal CDF.

    Parameters
    ----------
    cdf : np.ndarray
        The value of the cumulative distribution.
    mu : np.ndarray
        Mean of the log-normal PDF.
    sigma : np.ndarray
        Standard deviation of the log-normal PDF.
    lower : np.ndarray
        Lower truncation limit.
    upper : np.ndarray
        Upper truncation limit.

    Returns
    -------
    np.ndarray
        Real X, the argument of the PDF with the given CDF.
    """

    # Compute the truncated CDF bounds
    lncdf_a = lognorm(s=sigma, scale=np.exp(mu)).cdf(lower)
    lncdf_b = lognorm(s=sigma, scale=np.exp(mu)).cdf(upper)
    # Rescale the input CDFs to the truncated range
    lncdf_x = lncdf_a + cdf * (lncdf_b - lncdf_a)
    # Invert the log-normal CDF to get samples
    x = lognorm(s=sigma, scale=np.exp(mu)).ppf(lncdf_x)
    # Apply truncation limits for edge cases
    x = np.clip(x, lower, upper)

    return x




[docs]
def random_multivariate_normal(m: np.ndarray, k0: np.ndarray, size: int,
                               epsilon: float = 0.0) -> np.ndarray:
    """Sample from a multivariate normal distribution.

    It follows section A.2 Gaussian Identities of the book Gaussian Processes
    for Machine Learning.

    Parameters
    ----------
    m : np.ndarray
        Mean vector.
    k0 : np.ndarray
        Covariance matrix.
    size : int
        Sample size.
    epsilon : float, optional
        Perturbation value, by default 0.0.

    Returns
    -------
    np.ndarray
        Sample from a multivariate normal distribution with
        mean m and covariance k_0.

    Raises
    ------
    ValueError
        non-positive definite covariance matrix.

    References
    ----------
    https://juanitorduz.github.io/multivariate_normal/
    https://gaussianprocess.org/gpml/chapters/

    Notes
    -----
    In practice, it may be necessary to add a small multiple of the identity
    matrix to the covariance matrix for numerical reasons. This is because the
    eigenvalues of the matrix k0 can decay very rapidly and without this
    stabilization the Cholesky decomposition fails. The effect on the
    generated samples is to add additional independent noise of variance.
    From the context can usually be chosen to have inconsequential effects on
    the samples, while ensuring numerical stability (A.2 Gaussian Identities).
    For this purpose, epsilon value can be changed.
    """
    # Make sure that eigenvalues covariance function are greater than 0
    if not np.all(np.linalg.eigvals(k0) > 0):
        raise ValueError('Covariance matrix is not positive definite!')
    # Define dimension
    d = len(m)
    # Make sure mean vector is 2d
    m = m.reshape(d, 1)
    # Step 1: Compute the Cholesky Decomposition
    # Add small perturbation to the covariance matrix
    k = k0 + epsilon * np.identity(d)
    # Cholesky decomposition
    lower_triangular_matrix = np.linalg.cholesky(k)
    # Hermitian positive-definite matrix, A=L*L.T
    a = np.dot(lower_triangular_matrix, np.transpose(lower_triangular_matrix))
    # Let us verify the desired property
    if not np.allclose(k, a):
        print('Cholesky decomposition is not verified! '
              'It might be better to reduce the perturbation (epsilon)!')
    # Step 2: Generate Independent Samples u∼N(0,1)
    u = norm.ppf(np.random.rand(d * size), loc=0, scale=1).reshape(d, size)
    # Step 3: Compute x = m + lu
    x = (m + np.dot(lower_triangular_matrix, u)).T

    return x




[docs]
def random_choice(q: List[str], size: int, p: List[float] = None
                  ) -> np.ndarray:
    """Randomly selects elements from the input list 'q' with replacement,
    optionally using specified probabilities.

    Parameters
    ----------
    q : List[str]
        Input list of strings from which elements will be randomly chosen.
    size : int
        Number of elements to be selected.
    p : List[float], optional
        The probabilities associated with each element in 'q'. If None,
        the probabilities are assumed to be uniform. By default None.

    Returns
    -------
    np.ndarray
        An array containing the randomly selected elements from 'q'.

    Examples
    --------
    >>> q = ['B01', 'B02', 'B03', 'B04', 'B05']
    >>> size = 3
    >>> random_choice(q, size)
    array(['B03', 'B02', 'B05'], dtype='<U4')
    """
    if p is None:
        p = [1 / len(q)] * len(q)
    cumulative_probs = np.cumsum(p)
    ids = []
    for _ in range(size):
        r = np.random.rand()
        for idx, cp in enumerate(cumulative_probs):
            if r < cp:
                ids.append(idx)
                break
    x = np.array([q[i] for i in ids])
    return x




[docs]
def random_uniform(size: int, lower: float, upper: float) -> np.ndarray:
    """
    Generates an array of random samples uniformly distributed within
    a specified range.

    Parameters
    ----------
    size : int
        The number of random samples to generate.
    lower : float
        The lower bound of the uniform distribution.
    upper : float
        The upper bound of the uniform distribution.

    Returns
    -------
    np.ndarray
        An array of random samples uniformly distributed between
        `lower` and `upper`.

    Notes
    -----
    The function uses the inverse cumulative distribution function (CDF) of
    the uniform distribution to generate samples based on random probabilities.

    Examples
    --------
    >>> random_uniform(5, 0, 1)
    array([0.123, 0.456, 0.789, 0.234, 0.567])  # Example output
    """
    loc = lower
    scale = upper - lower
    prob = np.random.rand(size)
    sample = uniform.ppf(prob, loc=loc, scale=scale)
    return sample




[docs]
def random_multivariate_truncated_lognormal(
    size: int, rho: np.ndarray, lower_bound: np.ndarray,
    upper_bound: np.ndarray, theta: np.ndarray, std_ln: np.ndarray,
) -> np.ndarray:
    """Sample from a multivariate truncated log-normal distribution.

    Parameters
    ----------
    size : int
        Number of samples to generate.
    rho : np.ndarray
        Correlation matrix (n x n) describing the relationships between
        variables.
    lower_bound : np.ndarray
        Lower bounds for the truncated log-normal distributions.
    upper_bound : np.ndarray
        Upper bounds for the truncated log-normal distributions.
    theta : np.ndarray
        Medians of the log-normal distributions.
    std_ln : np.ndarray
        Logarithmic standard deviations of the log-normal distributions.

    Returns
    -------
    np.ndarray
        Samples from the multivariate truncated log-normal distribution
        (size x n).

    Notes
    -----
    ~LN(ln(theta), sigma)
    """
    n = len(theta)  # Number of dimensions

    # Validate dimensions of inputs
    if not (rho.shape == (n, n)):
        raise ValueError("Correlation matrix must be square and match the "
                         "dimension of the input parameters.")
    if not (len(lower_bound) == len(upper_bound) == n):
        raise ValueError(
            "Bounds, theta, and std_ln must all have the same length.")

    # Mean vector and covariance matrix for the multivariate normal dist.
    mu = np.zeros(n)  # Standard normal distribution
    std = np.ones(n)  # Identity standard deviations
    cov = np.outer(std, std) * rho  # Covariance matrix
    # Realisations from multivariate normal distribution
    z = np.random.multivariate_normal(mu, cov, size=size)
    # Transform multivariate normal samples to cumulative probabilities
    u = norm.cdf(z, loc=0.0, scale=1.0)
    # Log-normal truncated distribution parameters
    mu_ln = np.log(np.array(theta))
    # Map cumulative probabilities to truncated log-normal samples
    samples = truncated_lognormal_cdf_inv(
        u, mu=mu_ln, sigma=std_ln, lower=lower_bound, upper=upper_bound
    )

    return samples




[docs]
def get_time_based_seed() -> int:
    """Return a random seed derived from the current date and time.

    Returns
    -------
    int
        Summation of numbers in time date.
    """

    return sum(gmtime())