Source code for qumphy.misc.output_conversions

"""
File: qumphy/output_conversions.py
Project: 22HLT01 QUMPHY
Contact: vivek.desai@npl.co.uk
Gitlab: https://gitlab.com/qumphy
Description: Model output conversion functions.
"""

import numpy as np
import scipy.stats as stats
from scipy.integrate import quad
import statsmodels.api as sm
from collections.abc import Callable


# -------------- #
# CLASSIFICATION #
# -------------- #


# Convert prediction intervals for classification to probabilities

[docs]
def intervals_to_probs(intervals: np.ndarray, method: str = "jaccard") -> np.ndarray:
    """
    Convert prediction intervals to probabilities by taking the mean of lower and upper bounds.

    Parameters:
    -----------
        intervals (np.ndarray): Prediction intervals with shape (n_samples, n_classes, 2), where last dimension represents [lower_bound, upper_bound].

    Returns:
    --------
        probs (np.ndarray): Probabilities derived from interval means
    """

    assert (
        method == "jaccard" or method == "mean"
    ), 'Specify method as either "jaccard" or "mean"'

    intervals = np.array(intervals)

    if method == "jaccard":
        probs = intervals[:, :, 1] / (1 - intervals[:, :, 0] + intervals[:, :, 1])

    # if(method=="harmonic"):
    #     probs = 2*intervals[:, :, 1]*intervals[:, :, 0]/(intervals[:, :, 0]+intervals[:, :, 1])

    else:  # simple mean
        # Compute mean of lower and upper bounds
        probs = (intervals[:, :, 0] + intervals[:, :, 1]) / 2
        print("-" * 85 + "\n")

    # Ensure probabilities sum to 1 for each sample
    probs /= np.sum(probs, axis=1)[:, np.newaxis]

    return probs



# Convert probabilities to prediction sets using top-k selection

[docs]
def probs_to_pred_sets(probabilities: np.ndarray, alpha: float) -> np.ndarray:
    """
    Wrapper function to convert probabilities to prediction sets given an array of model confidences per class.

    Parameters:
    -----------
        probabilities (np.ndarray): Array of model outputs as probabilities for each class.
        alpha (float): Defines the confidence/coverage level at which to set threshold for top-k selection.

    Returns:
    --------
        np.ndarry: Array of lists, where each list is a prediction set determined through top-k selection at 1-alpha confidence level.
    """

    # Generate prediction sets using top-k selection
    def generate_pred_set(prob_vec: list[float], alpha: float) -> list[int]:
        """
        Create prediction set from probability vector using top-k selection for a given confidence/coverage level.
        Note: coverage is not guaranteed in any sense with this method.

        Parameters:
        -----------
            prob_vec (list): Model output for a given input, as probabilites for each class.
            alpha (float): Defines the coverage level that the prediction sets should target.

        Returns:
        --------
            m (list): List of class labels that form the prediction set.
        """
        sorted_probs = np.sort(prob_vec)[::-1]
        # Calculate cumulative sum of sorted confidences.
        cumsum = np.cumsum(sorted_probs)
        # Find smallest k where the cumulative sum exceeds the desired coverage level.
        k = np.searchsorted(cumsum, 1 - alpha)
        # Get top-k class labels that form the prediction set.
        m = np.argsort(prob_vec)[::-1][: k + 1]
        return m

    return np.array(
        [list(generate_pred_set(prob, alpha)) for prob in probabilities], dtype=object
    )



# ---------- #
# REGRESSION #
# ---------- #


# Convert quantiles to a distribution assuming quantiles are from univariate gaussian, for a given confidence level

[docs]
def norm_convert_interval(
    lower_bound: float, upper_bound: float, confidence_level: float = 0.95
):
    """

    Assuming the quantiles are from a Gaussian distribution, convert to a full distribution object.
    Return the distribution object, as well as the mean and standard deviation.

    Parameters:
    -----------
        lower_bound (float): Lower quantile.
        upper_bound (float): Upper quantile.
        confidence_level (float, optional): Confidence level associated with the quantiles. Defaults to 0.95.

    Returns:
    --------
        mean (float): The mean of the normal distribution.
        var (float): The variance of the normal distribution.
        distribution (scipy.stats.norm object): The full scipy.stats.norm distribution object, which has useful attributes e.g. CDF.
    """
    assert lower_bound < upper_bound, "Lower bound must be smaller than upper bound."

    # Calculate mean
    mean = (lower_bound + upper_bound) / 2

    # Calculate z-score for the given confidence level
    z_score = stats.norm.ppf((1 + confidence_level) / 2)

    # Estimate standard deviation and variance
    std_dev = (upper_bound - lower_bound) / (2 * z_score)
    var = std_dev**2

    # Create normal distribution
    distribution = stats.norm(loc=mean, scale=std_dev)

    return mean, var, distribution



# Convert quantiles to a distribution using Gaussian Kernel Density Estimation

[docs]
def kde_convert_interval(
    lower_bound: float,
    upper_bound: float,
    num_samples: int = 10000,
    confidence_level: float = 0.95,
    integrate: bool = False,
) -> tuple:
    """
    Convert a prediction interval to a distribution object using Kernel Density Estimation (KDE).

    References:
    -----------

    KDE documentation for statsmodel package can be found here: <https://www.statsmodels.org/dev/generated/statsmodels.nonparametric.kde.KDEUnivariate.html>.

    Documentation for scipy quad integration method can be found here: <https://docs.scipy.org/doc/scipy/reference/generated/scipy.integrate.quad.html>.

    Parameters:
    -----------
    lower_bound (float):
        Lower bound of the prediction interval.
    upper_bound (float):
        Upper bound of the prediction interval.
    num_samples (int, optional):
        Number of samples to generate between the interval bounds. Defaults to 10000.
    confidence_level (float, optional):
        Confidence level associated with prediction interval. Defaults to 0.95.
    integrate (bool, optional):
        Option to evaluate mean and standard deviation using numerical integration instead of discrete summation. Defaults to False.


    Returns:
    --------

    mean (float):
        The mean of the output distribution.
    var (float):
        The variance of the output distribution - used as the uncertainty.
    kde (statsmodel.nonparametric.KDEUnivariate object):
        The output distribution approximated using KDE.
    """
    # Generate samples within the interval using multiple sampling methods (Gaussian samples and uniform samples)

    assert isinstance(integrate, bool), 'The "integrate" input must be Boolean.'
    assert lower_bound < upper_bound, "Lower bound must be smaller than upper bound."

    uniform_samples = np.random.uniform(lower_bound, upper_bound, num_samples)

    norm_mean = (lower_bound + upper_bound) / 2
    norm_std = (upper_bound - lower_bound) / (
        2 * stats.norm.ppf((1 + confidence_level) / 2)
    )

    normal_samples = np.random.normal(loc=norm_mean, scale=norm_std, size=num_samples)

    # Combine and clip samples
    combined_samples = np.concatenate([uniform_samples, normal_samples])
    combined_samples = np.clip(
        combined_samples, lower_bound, upper_bound
    )  # May get better results if the samples are not clipped.

    # Perform KDE with statsmodels
    kde = sm.nonparametric.KDEUnivariate(combined_samples)
    kde.fit(kernel="gau", bw="scott")

    if integrate:
        # Alternative method to determine the mean and standard deviation of the KDE distribution using numerical integration (slower)
        mean = quad(lambda x: kde.evaluate(x) * x, kde.support[0], kde.support[-1])[0]
        var = (
            quad(lambda x: kde.evaluate(x) * x**2, kde.support[0], kde.support[-1])[0]
            - mean**2
        )
        # std = np.sqrt(var)
        # print(f"The mean {u"\u00B1"} std using statsmodels KDE with scipy quad numerical integration is: {mean:.7f} {u"\u00B1"} {std:.7f}")

    else:
        # Determine the mean and standard deviation using discrete summation of the PDF
        mean = np.dot(kde.density, kde.support) / kde.density.sum()
        var = np.dot(kde.density, kde.support**2) / kde.density.sum() - mean**2
        # std = np.sqrt(var)
        # print(f"The approximate mean {u"\u00B1"} std using statsmodels KDE without numerical integration is: {mean:.7f} {u"\u00B1"} {std:.7f}")

    return mean, var, kde



# Wrapper function to convert an array of prediction intervals to distributions, with a specified conversion function

[docs]
def convert_prediction_intervals(
    intervals: np.ndarray,
    converter: Callable,
    confidence_level: float = 0.95,
) -> tuple:
    """
    Convert a list of prediction intervals to distributions, giving a list of distributions, predictions, and uncertainties as variances.

    Parameters:
    -----------
    intervals (np.ndarray):
        Prediction interval outputs from a model. Expected shape of (n_samples, 2) for [lower_bound, upper_bound] pairs.
    converter (Callable):
        Conversion function used to convert intervals, given as lower and upper bounds, to distributions. Means and variances for the distributions are also returned.
    confidence_level (float, optional):
        The confidence level at which conformal prediction was evaluated. Used in the sampling process for converting the interval to a distribution. Defaults to 0.95.

    Returns:
    --------
    distributions (List[distribution objects]):
        List of distribution objects.
    predictions (List[float]):
        List of predictions, which are the means of the distributions, and should be asymptotically equivalent to the midpoint of the prediction intervals.
    uncertainties (List[float]):
        List of uncertainties, given as variances of the distributions.
    """

    assert callable(
        converter
    ), '"converter" must be a function; either Gaussian or KDE converter.'
    assert (
        isinstance(confidence_level, float) and confidence_level > 0
    ), "Confidence level must be a positive float."

    predictions = []
    uncertainties = []
    distributions = []

    for lower_bound, upper_bound in intervals:
        # For each prediction interval, convert to a distribution and append the mean and variance to corresponding lists
        mean, var, dist = converter(
            lower_bound=lower_bound,
            upper_bound=upper_bound,
            confidence_level=confidence_level,
        )

        predictions.append(mean)
        uncertainties.append(var)
        distributions.append(dist)

        del mean, var, dist

    return predictions, uncertainties, distributions



if __name__ == "__main__":
    pass