"""
File: qumphy/output_conversions.py
Project: 22HLT01 QUMPHY
Contact: vivek.desai@npl.co.uk
Gitlab: https://gitlab.com/qumphy
Description: Model output conversion functions.
"""
import numpy as np
import scipy.stats as stats
from scipy.integrate import quad
import statsmodels.api as sm
from collections.abc import Callable
# -------------- #
# CLASSIFICATION #
# -------------- #
# Convert prediction intervals for classification to probabilities
[docs]
def intervals_to_probs(intervals: np.ndarray, method: str = "jaccard") -> np.ndarray:
"""
Convert prediction intervals to probabilities by taking the mean of lower and upper bounds.
Parameters:
-----------
intervals (np.ndarray): Prediction intervals with shape (n_samples, n_classes, 2), where last dimension represents [lower_bound, upper_bound].
Returns:
--------
probs (np.ndarray): Probabilities derived from interval means
"""
assert (
method == "jaccard" or method == "mean"
), 'Specify method as either "jaccard" or "mean"'
intervals = np.array(intervals)
if method == "jaccard":
probs = intervals[:, :, 1] / (1 - intervals[:, :, 0] + intervals[:, :, 1])
# if(method=="harmonic"):
# probs = 2*intervals[:, :, 1]*intervals[:, :, 0]/(intervals[:, :, 0]+intervals[:, :, 1])
else: # simple mean
# Compute mean of lower and upper bounds
probs = (intervals[:, :, 0] + intervals[:, :, 1]) / 2
print("-" * 85 + "\n")
# Ensure probabilities sum to 1 for each sample
probs /= np.sum(probs, axis=1)[:, np.newaxis]
return probs
# Convert probabilities to prediction sets using top-k selection
[docs]
def probs_to_pred_sets(probabilities: np.ndarray, alpha: float) -> np.ndarray:
"""
Wrapper function to convert probabilities to prediction sets given an array of model confidences per class.
Parameters:
-----------
probabilities (np.ndarray): Array of model outputs as probabilities for each class.
alpha (float): Defines the confidence/coverage level at which to set threshold for top-k selection.
Returns:
--------
np.ndarry: Array of lists, where each list is a prediction set determined through top-k selection at 1-alpha confidence level.
"""
# Generate prediction sets using top-k selection
def generate_pred_set(prob_vec: list[float], alpha: float) -> list[int]:
"""
Create prediction set from probability vector using top-k selection for a given confidence/coverage level.
Note: coverage is not guaranteed in any sense with this method.
Parameters:
-----------
prob_vec (list): Model output for a given input, as probabilites for each class.
alpha (float): Defines the coverage level that the prediction sets should target.
Returns:
--------
m (list): List of class labels that form the prediction set.
"""
sorted_probs = np.sort(prob_vec)[::-1]
# Calculate cumulative sum of sorted confidences.
cumsum = np.cumsum(sorted_probs)
# Find smallest k where the cumulative sum exceeds the desired coverage level.
k = np.searchsorted(cumsum, 1 - alpha)
# Get top-k class labels that form the prediction set.
m = np.argsort(prob_vec)[::-1][: k + 1]
return m
return np.array(
[list(generate_pred_set(prob, alpha)) for prob in probabilities], dtype=object
)
# ---------- #
# REGRESSION #
# ---------- #
# Convert quantiles to a distribution assuming quantiles are from univariate gaussian, for a given confidence level
[docs]
def norm_convert_interval(
lower_bound: float, upper_bound: float, confidence_level: float = 0.95
):
"""
Assuming the quantiles are from a Gaussian distribution, convert to a full distribution object.
Return the distribution object, as well as the mean and standard deviation.
Parameters:
-----------
lower_bound (float): Lower quantile.
upper_bound (float): Upper quantile.
confidence_level (float, optional): Confidence level associated with the quantiles. Defaults to 0.95.
Returns:
--------
mean (float): The mean of the normal distribution.
var (float): The variance of the normal distribution.
distribution (scipy.stats.norm object): The full scipy.stats.norm distribution object, which has useful attributes e.g. CDF.
"""
assert lower_bound < upper_bound, "Lower bound must be smaller than upper bound."
# Calculate mean
mean = (lower_bound + upper_bound) / 2
# Calculate z-score for the given confidence level
z_score = stats.norm.ppf((1 + confidence_level) / 2)
# Estimate standard deviation and variance
std_dev = (upper_bound - lower_bound) / (2 * z_score)
var = std_dev**2
# Create normal distribution
distribution = stats.norm(loc=mean, scale=std_dev)
return mean, var, distribution
# Convert quantiles to a distribution using Gaussian Kernel Density Estimation
[docs]
def kde_convert_interval(
lower_bound: float,
upper_bound: float,
num_samples: int = 10000,
confidence_level: float = 0.95,
integrate: bool = False,
) -> tuple:
"""
Convert a prediction interval to a distribution object using Kernel Density Estimation (KDE).
References:
-----------
KDE documentation for statsmodel package can be found here: <https://www.statsmodels.org/dev/generated/statsmodels.nonparametric.kde.KDEUnivariate.html>.
Documentation for scipy quad integration method can be found here: <https://docs.scipy.org/doc/scipy/reference/generated/scipy.integrate.quad.html>.
Parameters:
-----------
lower_bound (float):
Lower bound of the prediction interval.
upper_bound (float):
Upper bound of the prediction interval.
num_samples (int, optional):
Number of samples to generate between the interval bounds. Defaults to 10000.
confidence_level (float, optional):
Confidence level associated with prediction interval. Defaults to 0.95.
integrate (bool, optional):
Option to evaluate mean and standard deviation using numerical integration instead of discrete summation. Defaults to False.
Returns:
--------
mean (float):
The mean of the output distribution.
var (float):
The variance of the output distribution - used as the uncertainty.
kde (statsmodel.nonparametric.KDEUnivariate object):
The output distribution approximated using KDE.
"""
# Generate samples within the interval using multiple sampling methods (Gaussian samples and uniform samples)
assert isinstance(integrate, bool), 'The "integrate" input must be Boolean.'
assert lower_bound < upper_bound, "Lower bound must be smaller than upper bound."
uniform_samples = np.random.uniform(lower_bound, upper_bound, num_samples)
norm_mean = (lower_bound + upper_bound) / 2
norm_std = (upper_bound - lower_bound) / (
2 * stats.norm.ppf((1 + confidence_level) / 2)
)
normal_samples = np.random.normal(loc=norm_mean, scale=norm_std, size=num_samples)
# Combine and clip samples
combined_samples = np.concatenate([uniform_samples, normal_samples])
combined_samples = np.clip(
combined_samples, lower_bound, upper_bound
) # May get better results if the samples are not clipped.
# Perform KDE with statsmodels
kde = sm.nonparametric.KDEUnivariate(combined_samples)
kde.fit(kernel="gau", bw="scott")
if integrate:
# Alternative method to determine the mean and standard deviation of the KDE distribution using numerical integration (slower)
mean = quad(lambda x: kde.evaluate(x) * x, kde.support[0], kde.support[-1])[0]
var = (
quad(lambda x: kde.evaluate(x) * x**2, kde.support[0], kde.support[-1])[0]
- mean**2
)
# std = np.sqrt(var)
# print(f"The mean {u"\u00B1"} std using statsmodels KDE with scipy quad numerical integration is: {mean:.7f} {u"\u00B1"} {std:.7f}")
else:
# Determine the mean and standard deviation using discrete summation of the PDF
mean = np.dot(kde.density, kde.support) / kde.density.sum()
var = np.dot(kde.density, kde.support**2) / kde.density.sum() - mean**2
# std = np.sqrt(var)
# print(f"The approximate mean {u"\u00B1"} std using statsmodels KDE without numerical integration is: {mean:.7f} {u"\u00B1"} {std:.7f}")
return mean, var, kde
# Wrapper function to convert an array of prediction intervals to distributions, with a specified conversion function
[docs]
def convert_prediction_intervals(
intervals: np.ndarray,
converter: Callable,
confidence_level: float = 0.95,
) -> tuple:
"""
Convert a list of prediction intervals to distributions, giving a list of distributions, predictions, and uncertainties as variances.
Parameters:
-----------
intervals (np.ndarray):
Prediction interval outputs from a model. Expected shape of (n_samples, 2) for [lower_bound, upper_bound] pairs.
converter (Callable):
Conversion function used to convert intervals, given as lower and upper bounds, to distributions. Means and variances for the distributions are also returned.
confidence_level (float, optional):
The confidence level at which conformal prediction was evaluated. Used in the sampling process for converting the interval to a distribution. Defaults to 0.95.
Returns:
--------
distributions (List[distribution objects]):
List of distribution objects.
predictions (List[float]):
List of predictions, which are the means of the distributions, and should be asymptotically equivalent to the midpoint of the prediction intervals.
uncertainties (List[float]):
List of uncertainties, given as variances of the distributions.
"""
assert callable(
converter
), '"converter" must be a function; either Gaussian or KDE converter.'
assert (
isinstance(confidence_level, float) and confidence_level > 0
), "Confidence level must be a positive float."
predictions = []
uncertainties = []
distributions = []
for lower_bound, upper_bound in intervals:
# For each prediction interval, convert to a distribution and append the mean and variance to corresponding lists
mean, var, dist = converter(
lower_bound=lower_bound,
upper_bound=upper_bound,
confidence_level=confidence_level,
)
predictions.append(mean)
uncertainties.append(var)
distributions.append(dist)
del mean, var, dist
return predictions, uncertainties, distributions
if __name__ == "__main__":
pass