Source code for qumphy.misc.output_conversions

"""
File: qumphy/output_conversions.py
Project: 22HLT01 QUMPHY
Contact: vivek.desai@npl.co.uk
Gitlab: https://gitlab.com/qumphy
Description: Model output conversion functions.
"""

import numpy as np
import scipy.stats as stats
from scipy.integrate import quad
import statsmodels.api as sm
from collections.abc import Callable


# -------------- #
# CLASSIFICATION #
# -------------- #


# Convert prediction intervals for classification to probabilities
[docs] def intervals_to_probs(intervals: np.ndarray, method: str = "jaccard") -> np.ndarray: """ Convert prediction intervals to probabilities by taking the mean of lower and upper bounds. Parameters: ----------- intervals (np.ndarray): Prediction intervals with shape (n_samples, n_classes, 2), where last dimension represents [lower_bound, upper_bound]. Returns: -------- probs (np.ndarray): Probabilities derived from interval means """ assert ( method == "jaccard" or method == "mean" ), 'Specify method as either "jaccard" or "mean"' intervals = np.array(intervals) if method == "jaccard": probs = intervals[:, :, 1] / (1 - intervals[:, :, 0] + intervals[:, :, 1]) # if(method=="harmonic"): # probs = 2*intervals[:, :, 1]*intervals[:, :, 0]/(intervals[:, :, 0]+intervals[:, :, 1]) else: # simple mean # Compute mean of lower and upper bounds probs = (intervals[:, :, 0] + intervals[:, :, 1]) / 2 print("-" * 85 + "\n") # Ensure probabilities sum to 1 for each sample probs /= np.sum(probs, axis=1)[:, np.newaxis] return probs
# Convert probabilities to prediction sets using top-k selection
[docs] def probs_to_pred_sets(probabilities: np.ndarray, alpha: float) -> np.ndarray: """ Wrapper function to convert probabilities to prediction sets given an array of model confidences per class. Parameters: ----------- probabilities (np.ndarray): Array of model outputs as probabilities for each class. alpha (float): Defines the confidence/coverage level at which to set threshold for top-k selection. Returns: -------- np.ndarry: Array of lists, where each list is a prediction set determined through top-k selection at 1-alpha confidence level. """ # Generate prediction sets using top-k selection def generate_pred_set(prob_vec: list[float], alpha: float) -> list[int]: """ Create prediction set from probability vector using top-k selection for a given confidence/coverage level. Note: coverage is not guaranteed in any sense with this method. Parameters: ----------- prob_vec (list): Model output for a given input, as probabilites for each class. alpha (float): Defines the coverage level that the prediction sets should target. Returns: -------- m (list): List of class labels that form the prediction set. """ sorted_probs = np.sort(prob_vec)[::-1] # Calculate cumulative sum of sorted confidences. cumsum = np.cumsum(sorted_probs) # Find smallest k where the cumulative sum exceeds the desired coverage level. k = np.searchsorted(cumsum, 1 - alpha) # Get top-k class labels that form the prediction set. m = np.argsort(prob_vec)[::-1][: k + 1] return m return np.array( [list(generate_pred_set(prob, alpha)) for prob in probabilities], dtype=object )
# ---------- # # REGRESSION # # ---------- # # Convert quantiles to a distribution assuming quantiles are from univariate gaussian, for a given confidence level
[docs] def norm_convert_interval( lower_bound: float, upper_bound: float, confidence_level: float = 0.95 ): """ Assuming the quantiles are from a Gaussian distribution, convert to a full distribution object. Return the distribution object, as well as the mean and standard deviation. Parameters: ----------- lower_bound (float): Lower quantile. upper_bound (float): Upper quantile. confidence_level (float, optional): Confidence level associated with the quantiles. Defaults to 0.95. Returns: -------- mean (float): The mean of the normal distribution. var (float): The variance of the normal distribution. distribution (scipy.stats.norm object): The full scipy.stats.norm distribution object, which has useful attributes e.g. CDF. """ assert lower_bound < upper_bound, "Lower bound must be smaller than upper bound." # Calculate mean mean = (lower_bound + upper_bound) / 2 # Calculate z-score for the given confidence level z_score = stats.norm.ppf((1 + confidence_level) / 2) # Estimate standard deviation and variance std_dev = (upper_bound - lower_bound) / (2 * z_score) var = std_dev**2 # Create normal distribution distribution = stats.norm(loc=mean, scale=std_dev) return mean, var, distribution
# Convert quantiles to a distribution using Gaussian Kernel Density Estimation
[docs] def kde_convert_interval( lower_bound: float, upper_bound: float, num_samples: int = 10000, confidence_level: float = 0.95, integrate: bool = False, ) -> tuple: """ Convert a prediction interval to a distribution object using Kernel Density Estimation (KDE). References: ----------- KDE documentation for statsmodel package can be found here: <https://www.statsmodels.org/dev/generated/statsmodels.nonparametric.kde.KDEUnivariate.html>. Documentation for scipy quad integration method can be found here: <https://docs.scipy.org/doc/scipy/reference/generated/scipy.integrate.quad.html>. Parameters: ----------- lower_bound (float): Lower bound of the prediction interval. upper_bound (float): Upper bound of the prediction interval. num_samples (int, optional): Number of samples to generate between the interval bounds. Defaults to 10000. confidence_level (float, optional): Confidence level associated with prediction interval. Defaults to 0.95. integrate (bool, optional): Option to evaluate mean and standard deviation using numerical integration instead of discrete summation. Defaults to False. Returns: -------- mean (float): The mean of the output distribution. var (float): The variance of the output distribution - used as the uncertainty. kde (statsmodel.nonparametric.KDEUnivariate object): The output distribution approximated using KDE. """ # Generate samples within the interval using multiple sampling methods (Gaussian samples and uniform samples) assert isinstance(integrate, bool), 'The "integrate" input must be Boolean.' assert lower_bound < upper_bound, "Lower bound must be smaller than upper bound." uniform_samples = np.random.uniform(lower_bound, upper_bound, num_samples) norm_mean = (lower_bound + upper_bound) / 2 norm_std = (upper_bound - lower_bound) / ( 2 * stats.norm.ppf((1 + confidence_level) / 2) ) normal_samples = np.random.normal(loc=norm_mean, scale=norm_std, size=num_samples) # Combine and clip samples combined_samples = np.concatenate([uniform_samples, normal_samples]) combined_samples = np.clip( combined_samples, lower_bound, upper_bound ) # May get better results if the samples are not clipped. # Perform KDE with statsmodels kde = sm.nonparametric.KDEUnivariate(combined_samples) kde.fit(kernel="gau", bw="scott") if integrate: # Alternative method to determine the mean and standard deviation of the KDE distribution using numerical integration (slower) mean = quad(lambda x: kde.evaluate(x) * x, kde.support[0], kde.support[-1])[0] var = ( quad(lambda x: kde.evaluate(x) * x**2, kde.support[0], kde.support[-1])[0] - mean**2 ) # std = np.sqrt(var) # print(f"The mean {u"\u00B1"} std using statsmodels KDE with scipy quad numerical integration is: {mean:.7f} {u"\u00B1"} {std:.7f}") else: # Determine the mean and standard deviation using discrete summation of the PDF mean = np.dot(kde.density, kde.support) / kde.density.sum() var = np.dot(kde.density, kde.support**2) / kde.density.sum() - mean**2 # std = np.sqrt(var) # print(f"The approximate mean {u"\u00B1"} std using statsmodels KDE without numerical integration is: {mean:.7f} {u"\u00B1"} {std:.7f}") return mean, var, kde
# Wrapper function to convert an array of prediction intervals to distributions, with a specified conversion function
[docs] def convert_prediction_intervals( intervals: np.ndarray, converter: Callable, confidence_level: float = 0.95, ) -> tuple: """ Convert a list of prediction intervals to distributions, giving a list of distributions, predictions, and uncertainties as variances. Parameters: ----------- intervals (np.ndarray): Prediction interval outputs from a model. Expected shape of (n_samples, 2) for [lower_bound, upper_bound] pairs. converter (Callable): Conversion function used to convert intervals, given as lower and upper bounds, to distributions. Means and variances for the distributions are also returned. confidence_level (float, optional): The confidence level at which conformal prediction was evaluated. Used in the sampling process for converting the interval to a distribution. Defaults to 0.95. Returns: -------- distributions (List[distribution objects]): List of distribution objects. predictions (List[float]): List of predictions, which are the means of the distributions, and should be asymptotically equivalent to the midpoint of the prediction intervals. uncertainties (List[float]): List of uncertainties, given as variances of the distributions. """ assert callable( converter ), '"converter" must be a function; either Gaussian or KDE converter.' assert ( isinstance(confidence_level, float) and confidence_level > 0 ), "Confidence level must be a positive float." predictions = [] uncertainties = [] distributions = [] for lower_bound, upper_bound in intervals: # For each prediction interval, convert to a distribution and append the mean and variance to corresponding lists mean, var, dist = converter( lower_bound=lower_bound, upper_bound=upper_bound, confidence_level=confidence_level, ) predictions.append(mean) uncertainties.append(var) distributions.append(dist) del mean, var, dist return predictions, uncertainties, distributions
if __name__ == "__main__": pass