"""
File: qumphy/uq_metrics.py
Project: QUMPHY
Contact: vivek.desai@npl.co.uk
Gitlab: https://gitlab.com/qumphy
Description: Evaluation metrics for assessing model calibration for classification and regression tasks.
"""
import os
import numpy as np
import mpmath as mp
import matplotlib.pyplot as plt
from scipy import stats
import relplot as rp # Python package used to calculate smECE and plot smooth reliability diagram
from scipy.integrate import cumulative_simpson, simpson
from typing import Tuple
# Helper functions.
[docs]
def flatten_list_of_batches(output_list: list, concat_axis: int = -1):
"""
Function to flatten model output arrays before they are used to calculate the calibration error metrics.
Parameters:
-----------
output_list: list
List to be flattened into 1D np.ndarray
concat_axis: int, optional
Axis on which to perform the flattening, by default -1 (last axis)
Returns:
--------
flattened_array: np.ndarry
Flattened array
"""
for batch_index in range(len(output_list[-1])):
# -1 is used as sometimes output_list contains two entries,
# of which the latter is relevant
if batch_index == 0:
# get the values for all the examples in the first batch
# and create the array that will store all examples
flattened_array = np.array(output_list[-1][batch_index][:])
else:
# add the value for all examples in the batch to flattened_array
flattened_array = np.concatenate(
(flattened_array, output_list[-1][batch_index][:]), axis=concat_axis
)
return flattened_array
[docs]
def split_into_classes(predictions: np.ndarray, targets: np.ndarray) -> tuple:
"""
Helper function to split classification predictions into predictions conditioned on the ground truth class label.
Parameters:
-----------
predictions: np.ndarray
Model output probabilities with shape (n, k) for k-class classification.
targets: np.ndarray
Target class labels with shape (n, 1).
Returns:
tuple:
Predictions per-class for k classes, with corresponding ground truth class labels for each class. Size of tuple is dependent on number of classes.
"""
assert (
predictions.ndim == 2
), "Predictions must have shape (n, k) for n samples, with k classes."
assert len(predictions) == len(
targets
), "Predictions and targets array must have the same length!"
n_classes = predictions.shape[-1]
result = []
# Add predictions for each class
for class_idx in range(n_classes):
mask = targets == class_idx
result.append(predictions[mask])
# Add targets for each class
for class_idx in range(n_classes):
mask = targets == class_idx
result.append(targets[mask])
return tuple(result)
# Calculate the prediction entropies
[docs]
def entropy(p: np.ndarray) -> np.ndarray:
"""
Calculate the normalised entropy of the probabilities array.
Parameters:
-----------
p (np.ndarray):
Probabilities from classifier.
Returns:
--------
ents (np.ndarray):
Entropy values for each sample in the probability array.
"""
class_num = p.shape[1]
ents = -np.sum(p * np.log(p + 1e-12), axis=1)
ents = (1 / (np.log(class_num))) * ents # norm by log(class number)
return ents
##############################################################################
# -------------------------------------------------------------------------- #
# THE METRICS BELOW ARE TO CALCULATE CALIBRATION ERRORS FOR CLASSIFICATION #
# -------------------------------------------------------------------------- #
##############################################################################
[docs]
def expected_calibration_error(
predictions: np.ndarray,
targets: np.ndarray,
filepath: str = None,
save_fig: bool = False,
bin_number: int = 15,
):
"""
Calculate the Expected Calibration Error (ECE) for the predicted class model probabilities for classification tasks.
References:
-----------
The script below is heavily based on the following implementations of the ECE:
<https://medium.com/towards-data-science/expected-calibration-error-ece-a-step-by-step-visual-explanation-with-python-code-c3e9aa12937d>
and
<https://github.com/gpleiss/temperature_scaling/blob/master/temperature_scaling.py>.
There exists alternative packages that calculate an ECE value, such as the netcal package. Depending on the implementation of binning in each package, the ECE values differ by small amounts (~1e-3).
In the QUMPHY evaluation framework, the quoted ECE value is determined using this function.
Parameters:
-----------
predictions: np.ndarray
Model output probabilities with shape (n, k) for k-class classification.
targets: np.ndarray
Target class labels with shape (n, 1).
filepath: str
The filepath for the directory to save the reliability diagram to.
save_fig: bool
Boolean to determine whether to save the reliability diagram to the given filepath.
bin_number: int, optional
Number of bins to be used (note: bins are equal width in the mean confidence axis), by default 15.
Returns:
--------
ECE: float
ECE value for the input data
"""
if len(predictions) != len(targets):
raise ValueError("Number of predictions and target labels must match!")
# Check that the predictions array has shape (n, k), where n = number of samples
assert (
predictions.ndim == 2
), "The shape of the prediction array must be (n, k) for n samples, with k classes."
num_classes = predictions.shape[1]
max_prob_thresh = 1.0 / num_classes # Threshold to determine the predicted class
# Set the limits of the bins, and acquire the upper and lower bounds of the bins
bin_edges = np.linspace(max_prob_thresh, 1, bin_number + 1)
bin_lower_bounds = bin_edges[:-1]
bin_upper_bounds = bin_edges[1:]
# Only need to consider confidences that are >= max_prob_thresh
model_confidences = np.max(predictions, axis=1)
# Indices of the max model probability - equivalent to the model's predicted class label
predicted_labels = np.argmax(predictions, axis=1)
# Boolean array for instances where the model predicts the correct class
accs = predicted_labels == targets
# Initialise the ECE value, and mean_confidences_per_bin & bin_accuracies lists
ECE = 0
mean_confidences_per_bin = []
bin_accuracies = []
# Loop over the bin boundaries
for lower_bound, upper_bound in zip(bin_lower_bounds, bin_upper_bounds):
# Boolean array that identifies which confidence values fall into the current bin
in_bin = np.logical_and(
model_confidences > lower_bound.item(),
model_confidences <= upper_bound.item(),
)
# Empirical probability of finding the sample in that bin: equivalent to |Bm|/n
prob_in_bin = in_bin.mean()
# Determine bin statistics for all non-empty bins
if prob_in_bin.item() > 0:
# Determine accuracy for the predictions within the bin
acc_bin = accs[in_bin].mean()
# Determine the average confidence value for predictions within the bin
avg_conf = model_confidences[in_bin].mean()
# Calculate the ECE value for the bin
ECE += (prob_in_bin) * np.abs(acc_bin - avg_conf)
bin_accuracies.append(acc_bin)
mean_confidences_per_bin.append(avg_conf)
# Plot reliability diagram and save to filepath if chosen
xs = mean_confidences_per_bin
ys = bin_accuracies
if save_fig:
plt.clf()
plt.figure(figsize=(8, 8))
plt.plot(
[max_prob_thresh, 1], [max_prob_thresh, 1], "--", color="black", alpha=0.5
)
plt.scatter(xs, ys, alpha=1)
plt.plot(xs, ys, "b", alpha=0.4, label=f"AF classification: {ECE:.4f}")
plt.xlabel("Mean confidence in bin", fontsize=13)
plt.ylabel("Relative Frequency of Correct Predictions in bin", fontsize=13)
plt.legend(loc="best")
plt.title("Reliability Diagram for Classification", fontsize=15)
plt.savefig(os.path.join(filepath, "ECE_reliability_diagram.png"))
plt.close()
# Return the mean confidences per bin array and bin accuracies per bin for plotting purposes
return ECE, xs, ys
[docs]
def smooth_expected_calibration_error(
predictions: np.ndarray, targets: np.ndarray, filepath: str, save_fig: bool = True
) -> float:
"""
Calculate the smooth ECE value (smECE) for the predicted class probabilities for the classification task.
References:
-----------
Details on the relplot python package can be found in the following repository: <https://github.com/apple/ml-calibration/tree/main>.
The accompanying paper for the smooth ECE metric, and the smooth reliability diagrams can be found here: <https://arxiv.org/pdf/2309.12236>.
Parameters:
-----------
predictions: np.ndarray
Model output probabilities with shape (n, k) for k class classification.
targets: np.ndarray
Target class labels with shape (n, 1).
filepath: str
Path to directory to save the reliability diagram.
save_fig: bool
Boolean to determine whether to save the reliability diagram to the given filepath.
Returns:
--------
smECE: float
The calculated smECE value using the relplot python package.
"""
assert (
predictions.ndim == 2
), "The shape of the prediction array must be (n, k) for n samples, with k classes."
if len(predictions) != len(targets):
raise ValueError("Number of predictions and target labels must match!")
original_plt_style = plt.rcParams.copy()
# Helper function from the relplot package to select probabilities of the predicted class, and return an accuracies array.
confidences, accuracies = rp.multiclass_logits_to_confidences(
predictions, targets, probs=True
)
# Calculate the smooth ECE value for confidence calibration.
smECE = rp.smECE(confidences, accuracies)
# binnedECE = rp.metrics.binnedECE(confidences, accuracies, nbins = 20)
# When plotting the reliability diagram in the binary case, it is easier to visualise calibration using the predicted probabilities for AF.
AF_probs = predictions[:, 1]
# Plot the reliability diagram for regression of outcomes on predictions and save to filepath directory.
diagram = rp.prepare_rel_diagram(AF_probs, targets, plot_density=True)
fig, ax = rp.plot_rel_diagram(diagram)
ax.set_title("SmoothECE Reliability Diagram", fontsize=22)
if save_fig:
fig.savefig(os.path.join(filepath, "smECE_reliability_diagram.png"))
plt.close(fig)
plt.style.use("default")
plt.rcParams.update(original_plt_style)
# Uncertainty on smooth ECE value from bootstrapping and using the 95% confidence level band.
# smECE_uncert = float(diagram['ce_ci_width'])
return smECE
[docs]
def adaptive_calibration_error(
predictions: np.ndarray,
targets: np.ndarray,
*args,
range_number: int = 15,
threshold: float = 0.0,
):
"""
Calculate the Adaptive Calibration Error (ACE) for the predicted class probabilities.
This metric will work for both the binary and multiclass classification cases, with the option to apply thresholding as done in the original paper.
References:
-----------
The ACE is introduced in the following paper: <https://arxiv.org/pdf/1904.01685>.
This implementation is heavily inspired by the more general calibration error functions in: <https://github.com/JeremyNixon/uncertainty-metrics-1/tree/master>.
We present a simplified version here, focused on the ACE alone.
Parameters:
-----------
predictions: np.ndarray
Model output probabilities with shape (n, m) for classification. Note m >= 2 i.e. probabilities given for all classes.
targets: np.ndarray
Target class labels with shape (n, 1)
*args:
Used to catch any un-used arguments in the main UQ evaluation notebook.
range_number: int, optional
Number of ranges to be used (note: ranges are equal frequency bins in the mean confidence axis), by default 15
threshold: float, optional
Threshold value below which probabilities are ignored, by default 0.0 for the ACE i.e. no thresholding.
Returns:
--------
ACE: float
The ACE metric value.
"""
if len(predictions) != len(targets):
raise ValueError("Number of predictions and target labels must match!")
# Check that the predictions array has shape (n, m), where n = number of samples and m = number of classes.
assert (
predictions.ndim == 2 and predictions.shape[1] >= 2
), "Predictions array must only have 2 dimensions, with a minimum of 2 classes."
# Check the threshold value lies between 0 and 1.
assert 0 <= threshold < 1, "Threshold must be between 0 and 1."
num_classes = predictions.shape[1]
# One-hot encode the ground truth class labels.
labels_matrix = np.eye(num_classes)[targets]
class_calibration_error_list = []
for j in range(num_classes):
probs_slice = predictions[:, j]
labels_slice = labels_matrix[:, j]
# Get probabilities that are greater than or equal to the threshold. For ACE, threshold is 0.
mask = probs_slice >= threshold
filtered_probs = probs_slice[mask]
filtered_labels = labels_slice[mask]
# If no predictions for a class, set calibration error to 0.
if len(filtered_probs) == 0:
class_calibration_error_list.append(0)
continue
# Sort the probabilities and corresponding labels.
sorted_indices = np.argsort(filtered_probs, axis=-1)
sorted_probs = filtered_probs[sorted_indices]
sorted_labels = filtered_labels[sorted_indices]
n_samples = len(sorted_probs)
# Determine how many ranges to use dependent on number of predictions above threshold per class.
actual_n_ranges = min(range_number, n_samples)
if actual_n_ranges < range_number:
# If we have fewer samples than specified number of ranges, create one bin if the sample number is
actual_n_ranges = max(1, actual_n_ranges) # Ensure at least one bin.
# Define equal-frequency bin edges
bin_edges = np.linspace(0, n_samples, actual_n_ranges + 1).astype(int)
bin_calibration_errors = []
bin_weights = []
# Loop through the bins
for i in range(actual_n_ranges):
bin_lower, bin_upper = bin_edges[i], bin_edges[i + 1]
if bin_lower == bin_upper:
continue
# Get the probabilities and labels for the current bin
bin_probs = sorted_probs[bin_lower:bin_upper]
bin_labels = sorted_labels[bin_lower:bin_upper]
# Calculate mean predicted confidence and accuracy for the bin
bin_confidence = np.mean(bin_probs)
bin_accuracy = np.mean(bin_labels)
bin_error = np.abs(bin_accuracy - bin_confidence) # bin calibration error
# Calculate the weight for the bin
bin_weight = (bin_upper - bin_lower) / n_samples
assert (bin_upper - bin_lower) == len(bin_probs)
bin_calibration_errors.append(bin_error)
bin_weights.append(bin_weight)
# Calculate the weighted calibration error for this class
class_error = np.sum(np.array(bin_calibration_errors) * np.array(bin_weights))
class_calibration_error_list.append(class_error / num_classes)
# Calculate overall ACE as sum of individual class calibration errors
ACE = np.sum(class_calibration_error_list)
return ACE
[docs]
def uncertainty_calibration_error(
predictions: np.ndarray,
targets: np.ndarray,
filepath: str = None,
save_fig: bool = False,
prediction_entropies: np.ndarray = None,
bin_number: int = 15,
):
"""
Calculate the Uncertainty Calibration Error (UCE) for model predictions and target labels for classification.
There exists the optional argument to provide an array of entropies of shape (n, 1); if None, the entropies will be calculated from the probabilities.
References:
-----------
The UCE was introduced in the following paper: <https://arxiv.org/pdf/1909.13550>.
This implementation is based on: <https://github.com/mlaves/bayesian-temperature-scaling/blob/master/uce.py>.
Parameters:
-----------
predictions: np.ndarray
Model output probabilities with shape (n, k) for k-class classification.
targets: np.ndarray
Target class labels with shape (n, 1).
filepath: str
Path to directory to save the reliability diagram.
prediction_entropies: np.ndarray, optional
The entropies for the predictions, by default None; if None, then the entropies are calculated from the prediction probabilities.
bin_number: int, optional
Number of bins to be used (note: bins are equal width in the normalised entropy axis), by default 15.
Returns:
--------
UCE: float
The UCE for the model predictions and their corresponding entropies.
xs: np.ndarray
Array of the mean normalised predicted entropies, used for plotting the reliability diagram.
ys: np.ndarray
Array of the empirical inaccuracy per bin, used for the plotting the reliability diagram.
"""
if len(predictions) != len(targets) != len(prediction_entropies):
raise ValueError(
"Number of predictions, target labels, and entropies must match."
)
if prediction_entropies is not None:
assert isinstance(prediction_entropies, np.ndarray)
# Check that the predictions array has shape (n, k), where n = number of samples, and k = number of classes.
assert (
predictions.ndim == 2
), "The shape of the predictions must be (n, k) for n samples, and k classes."
n_classes = predictions.shape[-1]
# If no prediction entropies are provided, then they are calculated from the prediction probabilities.
if prediction_entropies is None:
prediction_entropies = []
for prob_vector in predictions:
# Only compute log where probabilities are non-zero to avoid log(0).
ent = -np.sum(prob_vector * np.log(prob_vector, where=(prob_vector != 0)))
prediction_entropies.append(ent)
# Normalise the prediction entropies by np.log(k) for k-class classification
prediction_entropies = (1 / np.log(n_classes)) * np.array(prediction_entropies)
# Define the bin intervals.
bin_edges = np.linspace(0, 1, bin_number + 1)
bin_lower_bounds = bin_edges[:-1]
bin_upper_bounds = bin_edges[1:]
# Predicted class label from the predictions array.
predicted_labels = np.argmax(predictions, axis=1)
# Boolean array for instances where the model predicts the incorrect class.
inaccuracies = predicted_labels != targets
# Initialise the UCE value, and bin_entropy_values & bin_inaccuracies lists.
UCE = 0
bin_entropy_values = []
bin_inaccuracies = []
# Loop over the bin boundaries.
for lower, upper in zip(bin_lower_bounds, bin_upper_bounds):
# # Boolean array that identifies which confidence values fall into the current bin.
in_bin = np.logical_and(
prediction_entropies > lower.item(), prediction_entropies <= upper.item()
)
# Empirical probability of finding the sample in that bin: equivalent to |Bm|/n
prob_in_bin = in_bin.mean()
# Determine bin statistics for all non-empty bins.
if prob_in_bin.item() > 0:
# Calculate the average inaccuracy for the current bin.
err_in_bin = inaccuracies[in_bin].mean()
# Calculate the average normalised prediction entropy for the current bin.
uncert_in_bin = prediction_entropies[in_bin].mean()
# Calculate the UCE for the current bin and add it to the running total UCE value.
norm_factor = (n_classes - 1) / n_classes
# For classification, a uniform predictive distribution corresponds to maximum entropy (max entropy of 1, max inaccuracy of 1 - 1/k).
# This means that to get the entropy to scale with inaccuracy, it must be normalised by (k-1)/k for k classes.
UCE += (prob_in_bin) * np.abs(err_in_bin - (uncert_in_bin * norm_factor))
bin_inaccuracies.append(err_in_bin)
bin_entropy_values.append(uncert_in_bin)
# Create the reliability diagram and save to the filepath directory
xs = bin_entropy_values
ys = bin_inaccuracies
if save_fig:
plt.clf()
plt.figure(figsize=(8, 8))
plt.plot([0.0, 1], [0.0, norm_factor], "--", color="black", alpha=0.75)
plt.scatter(xs, ys, alpha=1)
plt.plot(xs, ys, "tab:blue", alpha=0.4, label=f"Classification\nUCE: {UCE:.4f}")
plt.xlabel("Mean entropy", fontsize=13)
plt.ylabel("Relative Frequency of incorrect Predictions", fontsize=13)
plt.legend(fontsize=12, loc="upper left")
plt.title("Uncertainty Calibration Curve", fontsize=15)
plt.savefig(os.path.join(filepath, "UCE_reliability_diagram.png"))
plt.close()
# Return the bin entropy array and bin inaccuracy array for plotting purposes later in the evaluation script
return UCE, xs, ys
# New VCE function for multiclass
[docs]
def variation_calibration_error(
predictions: np.ndarray,
targets: np.ndarray,
filepath: str,
save_fig: bool = True,
bin_number: int = 15,
) -> Tuple[float, np.ndarray, np.ndarray]:
"""
Compute the Variation Calibration Error (VCE) metric. Extends the ECE to a measure of distributional calibration for multiclass problems.
Measure of variation employed for this metric is the Shannon entropy. Bins can be defined with either equal-widths or equal-frequency.
Parameters:
-----------
predictions (np.ndarray):
Prediction array with shape (n, k) for n samples and k classes.
targets (np.ndarray):
Target class labels with shape (n, 1).
save_fig (bool, default = False):
Flag to save the reliability diagram.
filepath (str, default = os.getcwd()):
Filepath to which the reliability diagram is saved.
bin_number (int, default = 10):
Number of bins used to compute metric and plot reliability diagram.
Returns:
--------
VCE (float):
VCE metric value.
xs (np.ndarray):
Mean entropy of predictions array for plotting the reliability diagram.
ys (np.ndarray):
Entropy of the rank distribution array for plotting the reliability diagram.
"""
assert (
predictions.ndim == 2
), "The shape of the predictions must be (n, k) for n samples, and k classes."
assert len(predictions) == len(
targets
), "Predictions and targets array must have same length."
num_classes = predictions.shape[-1]
n_samples = len(predictions)
# Get prediction entropies using the entropy function from utils.py
prediction_entropies = entropy(predictions)
# Assign bin edges based on the chosen binning strategy
# Equal width bins, lower bound based on threshold
bin_edges = np.linspace(0, 1, bin_number + 1)
bin_indices = [
np.where(
(prediction_entropies >= bin_edges[i])
& (prediction_entropies < bin_edges[i + 1])
)[0]
for i in range(bin_number)
]
VCE = 0
xs = []
ys = []
# Loop over bins
for indices in bin_indices:
bin_size = len(indices)
# Bin weighting
prop_in_bin = bin_size / n_samples
print(f"Proportion of samples in this bin: {prop_in_bin}")
# Determine bin statistics for all non-empty bins
if prop_in_bin > 0:
bin_preds = predictions[indices]
bin_targs = targets[indices]
# bin_ents = prediction_entropies[indices]
# Mean entropy of model predictions for this bin
# mean_bin_ent = np.mean(bin_ents)
# --- NEW METRIC DEFINITION ---
# Determine the entropy of the mean of the ordered predicted probability distributions
p = np.mean(np.flip(np.sort(bin_preds, axis=1)), axis=0, keepdims=True)
print(f"SHAPE: {p.shape}, MEAN_PROB: {p}")
ent_bin_mean = entropy(p)
# Order class labels in descending order by classifier confidence
order = np.argsort(-bin_preds, axis=1)
# Create rankings array for each class label
ranks_matrix = np.zeros_like(order)
rows = np.arange(bin_size)[:, np.newaxis]
ranks_matrix[rows, order] = np.arange(num_classes)
# Get rankings of the true class from the rankings array (1-indexed)
true_ranks = ranks_matrix[np.arange(bin_size), bin_targs] + 1
# Calculate the frequency of each rank to get distribution, and calculate entropy of distribution
ranks_freqs = np.bincount(true_ranks - 1, minlength=num_classes)
ranks_freqs = ranks_freqs / ranks_freqs.sum()
ranks_entropy = entropy(
ranks_freqs[np.newaxis, :]
).item() # Entropy of ranks distribution using entropy function from utils.py
# Calculate VCE, weighted by bin size
VCE += prop_in_bin * (np.abs(ent_bin_mean.item() - ranks_entropy))
xs.append(ent_bin_mean.item())
ys.append(ranks_entropy)
if save_fig:
plt.clf()
plt.figure(figsize=(8, 8))
plt.plot([0.0, 1], [0.0, 1], "--", color="black", alpha=0.75)
plt.scatter(xs, ys, alpha=1)
plt.plot(xs, ys, "tab:blue", alpha=0.4, label=f"Classification\nVCE: {VCE:.4f}")
plt.xlabel("Mean Normalised Entropy", fontsize=13)
plt.ylabel("Mean Empirical Entropy", fontsize=13)
plt.legend(fontsize=12, loc="upper left")
plt.title("Uncertainty Calibration Curve", fontsize=15)
plt.savefig(
os.path.join(filepath, f"VCE_reliability_diagram_{num_classes}_classes.png")
)
plt.close()
return VCE, np.asarray(xs), np.asarray(ys)
[docs]
def negative_log_likelihood(
predictions: np.ndarray, targets: np.ndarray, *args
) -> float:
"""
Compute the negative log likelihood metric for the predicted probabilities for k-class classification.
Values are clipped to avoid divide by zero errors when taking the base-2 logarithm.
Parameters:
-----------
predictions: np.ndarray
Model output probabilities with shape (n, k) for k-class classification.
targets: np.ndarray
Target class labels with shape (n, 1).
*args:
Used to capture other arguments passed to the function that are not necessary e.g. filepath for reliability diagram.
Returns:
--------
nll (float): Calculated NLL value.
"""
assert len(predictions) == len(
targets
), "Number of predictions and ground truth class labels must match!"
assert (
predictions.ndim == 2
), "Predictions array must be 2D with shape (n_samples, n_classes)."
# Clipping to avoid log(0).
epsilon = 1e-15
clipped_predictions = np.clip(predictions, epsilon, 1 - epsilon)
ll = mp.mpf(0) # Arbitrary precision float.
count = 0
for i, (pred, label) in enumerate(zip(clipped_predictions, targets)):
# Get probability for the ground truth class.
true_class_prob = float(pred[int(label)])
# Calculate log probability for ground truth class.
log_prob = mp.log(true_class_prob, b=2)
ll += log_prob
count += 1 # Counting successful calculations.
# print(f"Samples used for NLL calculation: {count} out of {n_samples}")
# Calculate average negative log-likelihood.
nll = -ll / count
return float(nll)
# Placeholder coverage function for prediction sets.
[docs]
def set_coverage(prediction_sets, targets):
"""
Calculate the prediction set coverage for model ouputs given from either basic conformal prediction, or sets created using top-k selection from model probabilities.
Note: top-k selection from model probabilities has no coverage guarantee, in contrast to conformal methods.
Parameters:
-----------
prediction_sets (np.ndarray(List)): Array of lists containing the predicted classes.
targets (np.ndarry): The ground truth class labels.
Returns:
--------
float: The empirical frequency of the ground truth class labels in the set.
"""
return np.mean([target in set for set, target in zip(prediction_sets, targets)])
# Optional binary classification metrics that evaluate calibration based on cumulative differences between predicted probabilities and ground truth labels.
[docs]
def expected_cumulative_calibration_errors(
predictions: np.ndarray, targets: np.ndarray, filepath: str, save_fig: bool = True
):
"""
Calculating the ECCE-MAD (aka Kolomogorv-Smirnov statistic) and ECCE-R (aka Kuiper statistic) metrics.
The saved plot shows the deviation from zero of the cumulative errors.
References:
-----------
The ECCE metrics are introduced in the following: <https://arxiv.org/pdf/2205.09680>.
This implementation is from the cumulative function in: <https://github.com/facebookresearch/ecevecce/blob/main/codes/calibration.py>.
The alterations to the cumulative function simply involve editing the format of the inputs to the function to be consistent with the other metrics.
In addition, the normalised ECCE_MAD and ECCE_R metrics are given, as this is easier to compare with the target limits of convergence under the null hypothesis of perfect calibration.
Parameters:
-----------
predictions: np.ndarray
Model output probabilities with shape (n, 2) for binary classification
targets: np.ndarray
Target class labels with shape (n, 1)
filepath: str
Path to directory to save the plot of the cumulative calibration errors.
save_fig: bool
Boolean to determine whether to save the reliability diagram to the given filepath.
Returns:
--------
norm_ECCE-MAD: float
The maximum absolute deviation from zero statistic (Kolmogorov-Smirnov) for the cumulative differences between the confidences and outcomes
norm_ECCE-R: float
The range of deviation from zero statistic (Kuiper) for the cumulative differences between the confidences and outcomes
statistical_significance_scale: float
Normalisation factor that is used to assess statistically significant deviations from the asymptotic expected values (under large sample size n)
"""
# Helper function to determine the location of the minor ticks
def histcounts(nbins, x):
# Counts the number of entries of x
# falling into each of nbins equispaced bins.
j = 0
nbin = np.zeros(nbins)
for k in range(len(x)):
if x[k] > (j + 1) / nbins:
j += 1
if j == nbins:
break
nbin[j] += 1
return nbin
assert (
predictions.ndim == 2 and predictions.shape[1] == 2
), "Predictions must have shape (n_samples, 2) for binary classification"
# Get accuracy outcomes as binary array (1 for successful prediction, 0 for unsuccessful prediction)
unsorted_responses = (
np.argmax(predictions, axis=-1).astype(int) == targets
).astype(int)
unsorted_scores = np.max(predictions, axis=-1) # Probability of predicted class
# Add small noise to ensure uniqueness, and get sorted indices
scores_with_noise = unsorted_scores * (
np.ones(shape=unsorted_scores.shape)
+ np.random.normal(size=unsorted_scores.shape) * 1e-6
)
sorted_indices = np.argsort(scores_with_noise, axis=-1)
scores = scores_with_noise[sorted_indices]
responses = unsorted_responses[
sorted_indices
] # Preserve pairing of outcome with predicted probability/confidence
# The predictions must all be unique and in ascending order
assert all(
scores[k] < scores[k + 1] for k in range(len(scores) - 1)
), "The scores must be in strictly ascending order"
assert len(responses) == len(
scores
), "The lengths of the outcomes and scores arrays must be the same"
plt.clf()
plt.figure()
ax = plt.axes()
# Statistically significant length scale, which indicates significant deviations from zero (under null hypothesis of perfect calibration)
lenscale = np.sqrt(np.sum(scores * (1 - scores))) / len(scores)
# Cumulative sums of the scores/predictions, and the responses/outcomes (success or failure)
cumulative_responses = np.cumsum(responses) / int(len(responses))
cumulative_scores = np.cumsum(scores) / int(len(scores))
# Plot the difference.
plt.plot(
np.insert(cumulative_responses - cumulative_scores, 0, [0])[
: (int(len(cumulative_responses)) + 1)
],
"k",
)
# Make sure the plot includes the origin.
plt.plot(0, "k")
# Add an indicator of the scale of 1/sqrt(n) to the vertical axis.
plt.plot(2 * lenscale, "k")
plt.plot(-2 * lenscale, "k")
kwargs = {
"head_length": 2 * lenscale,
"head_width": len(cumulative_scores) / 20,
"width": 0,
"linewidth": 0,
"length_includes_head": True,
"color": "k",
}
plt.arrow(0.1e-100, -2 * lenscale, 0, 4 * lenscale, shape="left", **kwargs)
plt.arrow(0.1e-100, 2 * lenscale, 0, -4 * lenscale, shape="right", **kwargs)
plt.margins(x=0, y=0)
# Hardcode the number of major and minor ticks
majorticks = 10
minorticks = 100
# Label the major ticks of the lower axis with the values of the scores.
ss = [
"{:.5f}".format(x)
for x in np.insert(cumulative_scores, 0, [0])[
:: (len(cumulative_scores) // majorticks)
].tolist()
]
plt.xticks(
np.arange(majorticks) * len(cumulative_scores) // majorticks, ss[:majorticks]
)
ax.tick_params(which="major", axis="x", labelsize=8.5)
if len(cumulative_scores) >= 300 and minorticks >= 50:
# Indicate the distribution of the scores via unlabeled minor ticks.
plt.minorticks_on()
ax.tick_params(which="minor", axis="x")
ax.tick_params(which="minor", axis="y", left=False)
ax.set_xticks(np.cumsum(histcounts(minorticks, cumulative_scores)), minor=True)
# Label the axes.
plt.xlabel("$S_k$")
plt.ylabel("$C_k$")
plt.twiny()
plt.xlabel("$k/n$")
# Calculate the cumulative differences between the predictions and outcomes
cumulative_diffs = np.insert(cumulative_responses - cumulative_scores, 0, [0])[
: int(len(cumulative_responses) + 1)
]
# Determine the metric values from the cumulative differences
ECCE_R = np.max(cumulative_diffs) - np.min(cumulative_diffs)
ECCE_MAD = np.max(np.abs(cumulative_diffs))
# Get metrics normalised by length scale to compare with MAD and Range of standard Brownian motion ()
norm_ECCE_MAD = ECCE_MAD / lenscale
norm_ECCE_R = ECCE_R / lenscale
legend_text = (
f"ECCE-R/σ (norm. Kuiper): {norm_ECCE_R:.4f}\n"
f"ECCE-MAD/σ (norm. K-S): {norm_ECCE_MAD:.4f}"
)
plt.figtext(
0.94,
0.80,
legend_text,
horizontalalignment="right",
verticalalignment="top",
bbox=dict(
facecolor="white", alpha=0.8, edgecolor="gray", boxstyle="round,pad=0.5"
),
)
# Title the plot.
plt.title("Cumulative Calibration Curve")
# Clean up the whitespace in the plot.
plt.tight_layout()
# Save the plot if desired.
if save_fig:
plt.savefig(
os.path.join(filepath, "cumulative_calibration_plot.png"),
bbox_inches="tight",
)
plt.close()
# print(f"Cumulative calibration Statistics:")
# print("-"*77)
# print(f"ECCE_MAD statistic: {ECCE_R:.5f}")
# print(f"ECCE_MAD statistic: {ECCE_MAD:.5f}")
# print(f"The variance/length scale of statistically significant deviations: {lenscale:.7f}")
# print("-"*77)
# print(f"The normalised ECCE-MAD: {norm_ECCE_MAD}\nThe normalised ECCE-R: {norm_ECCE_R}")
return ["%.5f" % norm_ECCE_MAD, "%.5f" % norm_ECCE_R]
#######################################################################################
# ----------------------------------------------------------------------------------- #
# THE METRICS BELOW ARE TO CALCULATE AVERAGE CALIBRATION ERRORS FOR REGRESSION TASKS #
# ----------------------------------------------------------------------------------- #
#######################################################################################
[docs]
def picp(
predictions: np.ndarray,
targets: np.ndarray,
confidence_level: float,
is_interval: bool = True,
uncertainties: np.ndarray = None,
is_var: bool = True,
) -> tuple:
"""
Calculate the Prediction Interval Coverage Probability (PICP) for predictions given either as intervals or point-predictions, for a specified confidence level.
If inputs are given as point-predictions, uncertainty (as variance/standard deviations) arrays must be given, from which intervals are derived, assuming a Gaussian distribution.
The Mean Prediction Interval Width (MPIW) is also returned. Note this is an unbounded non-negative metric.
Parameters:
-----------
predictions: np.ndarray
Array of model predictions with shape either (n, 2) for intervals or (n, 1) for point predictions
targets: np.ndarray
Array of target values with shape (n, 1)
confidence_level: float
The confidence level of the prediction intervals. If point-predictions are given, intervals will be created assuming Gaussian distribution at this confidence level.
is_interval: bool
Boolean flag to set whether predictions are given as intervals or point-predictions. Defaults to True.
uncertainties: np.ndarray
Array of model uncertainties with shape (n, 1). Defaults to None, must be present if is_interval is False.
is_var: bool
Bollean flag corresponding to if the uncertainties are given as variances (default) or standard deviations. Defaults to True.
Returns:
--------
picp: float
The prediction interval coverage probability of the model predictions. Aim is for the PICP to match the confidence level.
MPIW: float
The Mean Predicted Interval Wdith of the model predictions.
"""
assert (
isinstance(confidence_level, float) and 0 < confidence_level <= 1.0
), "Confidence level must be a float between 0 and 1."
confidence_level = np.abs(confidence_level) # Force confidence level to be positive
if is_interval:
intervals = predictions
assert (
intervals.shape[1] == 2
), "Intervals array must be of shape (n_samples, 2)"
assert np.all(
intervals[:, 0] <= intervals[:, 1]
), "Intervals must be given as (lower, upper)"
if len(intervals) != len(targets):
raise ValueError(
"The lengths of the intervals and targets array must be the same."
)
within_interval = np.logical_and(
targets >= intervals[:, 0], targets <= intervals[:, 1]
)
# Mean Prediction Interval Width.
MPIW = np.mean(np.abs(intervals[:, 0] - intervals[:, 1]))
# Fraction of targets that lie within the prediction interval.
picp = np.mean(within_interval)
# If prediction intervals are not given, then they are created from point predictions and uncertainties, assuming Gaussian distribution.
else:
assert (
uncertainties is not None and not is_interval
), "Please provide uncertainties as variances for point predictions."
assert isinstance(is_var, bool), '"is_var" must be a boolean.'
if len(predictions) != len(targets) != len(uncertainties):
raise ValueError(
"The lengths of the predictions, targets, and uncertainties array must be the same."
)
if is_var:
std_devs = np.sqrt(uncertainties)
else:
std_devs = uncertainties
# Determine lower and upper bounds, based on defined confidence level
lower_bounds = stats.norm.ppf(
(1 - confidence_level) / 2, loc=predictions, scale=std_devs
)
upper_bounds = stats.norm.ppf(
(1 + confidence_level) / 2, loc=predictions, scale=std_devs
)
within_interval = np.logical_and(
targets >= lower_bounds, targets <= upper_bounds
)
# Mean prediction interval width - smaller the interval, the sharper the prediction
MPIW = np.mean(np.abs(upper_bounds - lower_bounds))
# Prediction interval coverage probability - fraction of target values that lie within the predicted intervals
picp = np.mean(within_interval)
# print(f"PICP: {picp}")
# print(f"MPIW: {MPIW}")
return picp, MPIW
[docs]
def coverage_calibration_error(
predictions: np.ndarray,
targets: np.ndarray,
uncertainties: np.ndarray,
filepath: str = None,
is_var: bool = True,
save_fig: bool = False,
number_of_intervals: int = 10,
):
"""
Calculate the confidence interval-based metric for model predictions and their corresponding uncertainties.
If uncertainties are given as variances, they are converted to standard deviations.
Parameters:
-----------
predictions: np.ndarray
Array of model predictions with shape (n, 1)
targets: np.ndarray
Array of target values with shape (n, 1)
uncertainties: np.ndarray
Array of total uncertainties (aleatoric + epistemic) with shape (n, 1)
filepath: str
Path to directory to save the coverage calibration curve.
is_var: bool, optional
Boolean parameter that determines whether to convert the uncertainties from variances to standard deviations, by default True
save_fig: bool, optional
Boolean parameter that sets whether to save the plot to the filepath, by default True.
number_of_intervals: int, optional
The number of confidence intervals to evaluate the calibration error on, by default 10 (range 0 to 1 in steps of 0.1)
Returns:
--------
calibration_error: float
The confidence interval-based calibration error metric
"""
assert (
len(predictions) == len(targets) == len(uncertainties)
), "Input arrays have different lengths!"
predictions, targets, uncertainties = (
np.array(predictions),
np.array(targets),
np.array(uncertainties),
)
assert (
predictions.ndim == targets.ndim == uncertainties.ndim
), "Input arrays have different dimensions \nExpected all arrays to have shape (n, 1)"
num_samples = len(predictions)
# Define the confidence intervals
confidence_levels = np.linspace(0, 1, number_of_intervals + 1)[1:]
# If the uncertainties are given as variances, convert them to standard deviations
if is_var:
uncertainties = np.sqrt(uncertainties)
# Initialise the calibration error value, and the observed agreements list
observed_agreements = []
calibration_error = 0
# Loop over the confidence intervals
for p in confidence_levels:
# Calculate the threshold value of the inverse CDF of the predictions for the given confidence level 100*p%
threshold = stats.norm.ppf(p, predictions, uncertainties)
# Sum the number of target values that lie in the region: GT <= threshold
within_interval = (targets <= threshold).sum()
# Relative number of ground truths that lie below the threshold for a given confidence interval
empirical_freq = within_interval / num_samples
observed_agreements.append(empirical_freq)
# Calculate the calibration error as the sum of squared differences
calibration_error += np.square(p - empirical_freq)
assert len(observed_agreements) == len(
confidence_levels
), "The shape of agreements != the shape of the confidence intervals!"
# Plot the coverage calibration curve
xs = confidence_levels
ys = observed_agreements
if save_fig:
plt.figure(figsize=(8, 8))
plt.plot([0, 1], [0, 1], "--", color="black", alpha=0.5)
plt.scatter(xs, ys, alpha=1)
plt.plot(
xs, ys, "b", alpha=0.4, label=f"Regression \nCCE: {calibration_error:.4f}"
)
plt.xlabel("Expected Confidence Level")
plt.ylabel("Observed Confidence Level")
plt.legend(loc="upper left")
plt.title("Regession Calibration Curve")
plt.savefig(os.path.join(filepath, "coverage_calibration_curve.png"))
plt.close()
return calibration_error
[docs]
def continuous_ranked_probability_score(
predictions: np.ndarray,
targets: np.ndarray,
uncertainties: np.ndarray,
converter: str = "Gaussian",
distributions: list = None,
is_var: bool = True,
) -> float:
"""
Calculate the Continuous Ranked Probability Score (CRPS) from the given model predictions, uncertainties, and corresponding ground truths.
The option is given to calculate the CRPS either using the analytical expression given the model predictions parameterising a Gaussian, or using numerical integration.
The numerical integration method for a distribution estimated with KDE yields errors of <1% compared to the analytical expression for a Gaussian.
References:
-----------
The CRPS and its analytical expression for a univariate Gaussian is given in Gneiting et al., 2007: <https://sites.stat.washington.edu/raftery/Research/PDF/Gneiting2007jasa.pdf>.
Parameters:
-----------
predictions (np.ndarray):
An array of the model predictions with shape (n, 1).
targets (np.ndarray):
Array of target values with shape (n, 1).
uncertainties (np.ndarray):
Array of the total uncertainties associated with the model predictions from a chosen UQ method, with shape (n, 1).
converter (str, optional):
Specify whether the converter used was "Gaussian" or "KDE"; evaluates CRPS using numerical integration of CDF if "KDE", otherwise uses analytical expression for Gaussian. Defaults to "Gaussian".
distributions (List[distribution objects], optional):
Option to include the list of distributions as a direct input - needed if input is from KDE conversion of prediction interval to distribution. Defaults to None.
is_var (bool, optional):
Boolean parameter that sets whether to convert the uncertainties from variances to standard deviations. Defaults to True.
Returns:
--------
CRPS (float):
The average CRPS value across the dataset.
"""
# Check function inputs
assert (
converter == "Gaussian" or converter == "KDE"
), 'Please specify the conversion method used as either: "Gaussian" or "KDE".'
if is_var:
std_devs = np.sqrt(uncertainties)
else:
std_devs = uncertainties
CRPS_values = []
# Calculate the CRPS from the analytical expression for a Gaussian distribution.
if converter == "Gaussian":
# Calculate the z-scores for each prediction and ground truth.
z_scores = (targets - predictions) / std_devs
for i, (z_score, uncert) in enumerate(zip(z_scores, std_devs)):
# Determine the CDF and PDF values for the standard normal distribution at the z_score value.
CDF = stats.norm.cdf(z_score, loc=0, scale=1)
PDF = stats.norm.pdf(z_score, loc=0, scale=1)
# Calculate the CRPS for a given ground truth using the analytical expression for a Gaussian.
crps = uncert * ((2 * PDF) + z_score * (2 * CDF - 1) - (1 / np.sqrt(np.pi)))
CRPS_values.append(crps)
# Manually need to specify that the converter is KDE to use this method. Must be changed if new conversion methods are added.
elif converter == "KDE":
assert (
distributions is not None
), "Please provide a list of distribution objects if you have used the KDE conversion method."
assert len(distributions) == len(
targets
), "Distribution and target array must have the same length."
# For each distribution parameterised by a mean and standard deviation, calculate the CRPS with numerical integration and append to the list.
for distribution, target in zip(distributions, targets):
assert all(hasattr(distribution, attr) for attr in ["density", "support"])
support = distribution.support
pdf_vals = (
distribution.density
) # PDF values for the distribution evaluated at the support
dx = support[1] - support[0] # Resolution of the support
# Approximate the CDF of the distribution with a cumulative numerical integration of the PDF array.
cdf_vals = cumulative_simpson(pdf_vals, dx=dx, initial=0)
# The bounds of integration need extending in the case that the target is outside of the support values for the KDE distribution.
# This is a crude method that assumes CDF is 0/1 to the left/right of the support values - yields an error of ~1% compared to ground truth CRPS values
# but is necessary to prohibit large computation time with more accurate numerical integration methods.
if target < support[0]:
# Values from the observation upto the left tail of the support to define the integration range.
left_extension = np.arange(target, support[0], dx)
extended_support = np.concatenate(
[left_extension, support]
) # Extended support range to cover observation
# Estimate CDF values to be 0 from left tail of the support upto target value.
extended_cdf = np.concatenate([np.zeros_like(left_extension), cdf_vals])
elif target > support[-1]:
# Values from the right tail of the support upto the observation, to define the integration range.
right_extension = np.arange(support[-1] + dx, target + dx, dx)
extended_support = np.concatenate(
[support, right_extension]
) # Extended support range to cover observation
# Estimate CDF values to be 1 from right tail of the CDF upto the target value.
extended_cdf = np.concatenate([cdf_vals, np.ones_like(right_extension)])
else:
# Target lies within the support bounds of the distribution, so no extension necessary.
extended_support = support
extended_cdf = cdf_vals
# Define the empirical CDF of the target value.
heaviside = np.where(extended_support >= target, 1.0, 0.0)
# Integrand of the CRPS
squared_diff = np.square(extended_cdf - heaviside)
# Numerical integration of the integrand over the extended support region that includes the target value.
crps = simpson(squared_diff, dx=dx)
CRPS_values.append(crps)
CRPS = np.mean(CRPS_values)
return CRPS
###########################################################################################
# --------------------------------------------------------------------------------------- #
# THE METRICS BELOW ARE TO CALCULATE CONDITIONAL CALIBRATION ERRORS FOR REGRESSION TASKS #
# --------------------------------------------------------------------------------------- #
###########################################################################################
[docs]
def expected_normalised_calibration_error(
predictions: np.ndarray,
targets: np.ndarray,
uncertainties: np.ndarray,
filepath: str = None,
is_var: bool = False,
save_fig: bool = False,
bin_number: int = 15,
):
"""
Calculating the Expected Normaised Calibration Error (ENCE) for the model predictions and the corresponding total uncertainties.
Parameters:
-----------
predictions: np.ndarray
An array of the model predictions with shape (n, 1)
targets: np.ndarray
Array of target values with shape (n, 1)
uncertainties: np.ndarray
Array of the total uncertainties associated with the model predictions from a chosen UQ method, with shape (n, 1)
filepath: str
Path of directory to save the reliability diagram.
is_var: bool, optional
Boolean parameter that sets whether to convert the uncertainties to standard deviations, by default True
save_fig: bool, optional
Boolean parameter that sets whether to save the plot to the filepath, by default True.
bin_number: int, optional
Chosen number of bins for calculating ENCE (note: the bins are defined to have an equal number of samples per bin), by default 15
Returns:
--------
ENCE: float
The value of the ENCE calculated for the given inputs
"""
predictions, targets, uncertainties = (
np.array(predictions),
np.array(targets),
np.array(uncertainties),
)
assert (
len(predictions) == len(targets) == len(uncertainties)
), "Input arrays have different lengths!"
if not (predictions.ndim == targets.ndim == uncertainties.ndim):
raise ValueError(
"Input arrays have different dimensions \nExpected all arrays to have shape (n, 1)"
)
# If the optional argument is_var = True, then convert the uncertainties to standard deviations
if is_var:
uncertainties = np.sqrt(uncertainties)
# Create an array of the errors
errors = predictions - targets
# Sort the arrays on increasing standard deviation (σ) and sort the errors array based on these indices
sorted_indices = np.argsort(uncertainties)
sorted_stdevs = uncertainties[sorted_indices]
sorted_errors = errors[sorted_indices]
n_samples = len(predictions)
# Define the equal-frequency bin edges
bin_edges = np.linspace(0, n_samples, bin_number + 1).astype(int)
# Initialise the ENCE value, and the RMSE and RMV lists
ENCE = 0
RMSEs = []
RMVs = []
# Loop through the bins
for i in range(bin_number):
# Assign the bin boundaries for the current bin
bin_lower, bin_upper = bin_edges[i], bin_edges[i + 1]
# Determine which uncertainties and errors fall within the current bin
bin_stdevs = sorted_stdevs[bin_lower:bin_upper]
bin_errors = sorted_errors[bin_lower:bin_upper]
# Convert the uncertainties in the bin to variances
bin_vars = (bin_stdevs) ** 2
# Calculate the average variance for the bin (RMV)
bin_MV = bin_vars.mean()
# Calculate the average squared error for the bin (RMSE)
bin_MSE = ((bin_errors) ** 2).mean()
RMSEs.append(np.sqrt(bin_MSE))
RMVs.append(np.sqrt(bin_MV))
# Update the ENCE value with the absolute difference between the RMSE and RMV, normalised by the RMV
ENCE += np.abs(np.sqrt(bin_MV) - np.sqrt(bin_MSE)) / (np.sqrt(bin_MV))
# Average the ENCE across all the bins
ENCE = ENCE / bin_number
# Plot the reliability diagram and save to filepath
xs = RMVs
ys = RMSEs
if save_fig:
plt.figure(figsize=(8, 8))
plt.plot(
[np.min(xs), np.max(xs)],
[np.min(xs), np.max(xs)],
"--",
color="black",
alpha=0.75,
)
plt.scatter(xs, ys, alpha=1)
plt.plot(xs, ys, "b", alpha=0.4, label=f"Regression\nENCE = {ENCE:.4f}")
plt.xlabel("RMV per bin", fontsize=13)
plt.ylabel("RMSE per bin", fontsize=13)
plt.tick_params(axis="x", labelsize=11)
plt.tick_params(axis="y", labelsize=11)
plt.legend(fontsize=12)
plt.title("Regression Calibration Curve")
# plt.ylim(np.min(xs)-1, np.max(xs)+1)
# plt.xlim(np.min(xs)-1, np.max(xs)+1)
plt.savefig(os.path.join(filepath, "ENCE_reliability_diagram.png"))
plt.close()
return ENCE, xs, ys
# Not used in the QUMPHY evaluation framework.
[docs]
def z_variance_error(predictions, targets, uncertainties, is_var=True, bin_number=15):
"""
Calculate the Z-variance error (ZVE) for the model predictions and corresponding total uncertainties.
Parameters:
-----------
predictions: np.ndarray
An array of the model predictions with shape (n, 1)
targets: np.ndarray
Array of target values with shape (n, 1)
uncertainties: np.ndarray
Array of the total uncertainties associated with the model predictions from a chosen UQ method, with shape (n, 1)
is_var: bool, optional
Boolean parameter that sets whether to convert the uncertainties from variances to standard deviations, by default True
bin_number: int, optional
Chosen number of bins for calculating ZVE (note: the bins are defined to have an equal number of samples per bin), by default 15
Returns:
--------
ZVE: float
The ZVE value for the input predictions and uncertainties
"""
assert (
len(predictions) == len(targets) == len(uncertainties)
), "Input arrays have different lengths!"
assert (
predictions.ndim == targets.ndim == uncertainties.ndim
), "Input arrays have different dimensions \nExpected all arrays to have shape (n, 1)"
# Create an array of the errors
errors = predictions - targets
# Sort based on the uncertainties
sorted_indices = np.argsort(uncertainties)
sorted_uncertainties = uncertainties[sorted_indices]
sorted_errors = errors[sorted_indices]
n_samples = len(errors)
# Create the bin edges for equal frequency bins
bin_edges = np.linspace(0, n_samples, bin_number + 1).astype(int)
# Initiliase ZVE, bin sizes list, Var(Z) list, bin centres list, σ(E) list, RMS(uncertainties) list
ZVE = 0
bin_sizes = []
# These are used to plot Var(Z) vs bin_centres, to check that Var(Z) is close to 1 for conditional calibration
Var_Zs = []
bin_centres = []
# These are used to plot the reliability diagram to check conditional calibration
stdev_errors = []
RMS_uncerts = []
# Loop over each bin
for i in range(bin_number):
bin_lower, bin_upper = bin_edges[i], bin_edges[i + 1]
bin_errors = sorted_errors[
bin_lower:bin_upper
] # the errors that lie within the current bin
bin_vars = sorted_uncertainties[
bin_lower:bin_upper
] # these uncertainties are output as variances from the model
if is_var:
bin_stdevs = np.sqrt(
np.asarray(bin_vars)
) # converting the uncertainties to standard deviations to calculate the Z-score per bin
bin_length = len(bin_errors)
bin_sizes.append(
bin_length
) # Obtain a list of the bin sizes to check they have an equal number of samples
bin_z_scores = (bin_errors) / (
bin_stdevs
) # List of the Z-scores in the current bin
bin_z_score_variance = np.var(
np.asarray(bin_z_scores)
) # Variance of the Z-scores for the current bin
# Append the variance of the z score for the bin, as well as the bin centre, for the plot of Var(Z) against bin centre
Var_Zs.append(bin_z_score_variance)
bin_centres.append(bin_stdevs[bin_length // 2])
# Append the bin σ(E) and bin mean RMS(u), for the reliability diagram
stdev_errors.append(np.std(bin_errors))
RMS_uncerts.append(np.sqrt(bin_vars.mean()))
# Calculate the ZVE for the bin and add it to the running ZVE score
ZVE += np.abs(np.log(bin_z_score_variance))
# Average the log(Var(Z)) across all bins, and return the overall ZVE
ZVE = np.exp(ZVE / bin_number)
return ZVE