Source code for qumphy.uq_metrics

"""
File: qumphy/uq_metrics.py
Project: QUMPHY
Contact: vivek.desai@npl.co.uk
Gitlab: https://gitlab.com/qumphy
Description: Evaluation metrics for assessing model calibration for classification and regression tasks.
"""

import os
import numpy as np
import mpmath as mp
import matplotlib.pyplot as plt
from scipy import stats
import relplot as rp  # Python package used to calculate smECE and plot smooth reliability diagram
from scipy.integrate import cumulative_simpson, simpson
from typing import Tuple


# Helper functions.

[docs]
def flatten_list_of_batches(output_list: list, concat_axis: int = -1):
    """
    Function to flatten model output arrays before they are used to calculate the calibration error metrics.

    Parameters:
    -----------
    output_list: list
        List to be flattened into 1D np.ndarray
    concat_axis: int, optional
        Axis on which to perform the flattening, by default -1 (last axis)

    Returns:
    --------
    flattened_array: np.ndarry
        Flattened array
    """

    for batch_index in range(len(output_list[-1])):
        # -1 is used as sometimes output_list contains two entries,
        # of which the latter is relevant
        if batch_index == 0:
            # get the values for all the examples in the first batch
            # and create the array that will store all examples
            flattened_array = np.array(output_list[-1][batch_index][:])
        else:
            # add the value for all examples in the batch to flattened_array
            flattened_array = np.concatenate(
                (flattened_array, output_list[-1][batch_index][:]), axis=concat_axis
            )

    return flattened_array




[docs]
def split_into_classes(predictions: np.ndarray, targets: np.ndarray) -> tuple:
    """
    Helper function to split classification predictions into predictions conditioned on the ground truth class label.

    Parameters:
    -----------
        predictions: np.ndarray
            Model output probabilities with shape (n, k) for k-class classification.
        targets: np.ndarray
            Target class labels with shape (n, 1).

    Returns:
        tuple:
            Predictions per-class for k classes, with corresponding ground truth class labels for each class. Size of tuple is dependent on number of classes.
    """
    assert (
        predictions.ndim == 2
    ), "Predictions must have shape (n, k) for n samples, with k classes."
    assert len(predictions) == len(
        targets
    ), "Predictions and targets array must have the same length!"
    n_classes = predictions.shape[-1]

    result = []
    # Add predictions for each class
    for class_idx in range(n_classes):
        mask = targets == class_idx
        result.append(predictions[mask])

    # Add targets for each class
    for class_idx in range(n_classes):
        mask = targets == class_idx
        result.append(targets[mask])

    return tuple(result)



# Calculate the prediction entropies

[docs]
def entropy(p: np.ndarray) -> np.ndarray:
    """
    Calculate the normalised entropy of the probabilities array.

    Parameters:
    -----------
        p (np.ndarray):
            Probabilities from classifier.

    Returns:
    --------
        ents (np.ndarray):
            Entropy values for each sample in the probability array.
    """
    class_num = p.shape[1]
    ents = -np.sum(p * np.log(p + 1e-12), axis=1)
    ents = (1 / (np.log(class_num))) * ents  # norm by log(class number)
    return ents



##############################################################################
# -------------------------------------------------------------------------- #
# THE METRICS BELOW ARE TO CALCULATE CALIBRATION ERRORS FOR CLASSIFICATION   #
# -------------------------------------------------------------------------- #
##############################################################################



[docs]
def expected_calibration_error(
    predictions: np.ndarray,
    targets: np.ndarray,
    filepath: str = None,
    save_fig: bool = False,
    bin_number: int = 15,
):
    """
    Calculate the Expected Calibration Error (ECE) for the predicted class model probabilities for classification tasks.

    References:
    -----------

    The script below is heavily based on the following implementations of the ECE:
    <https://medium.com/towards-data-science/expected-calibration-error-ece-a-step-by-step-visual-explanation-with-python-code-c3e9aa12937d>

    and

    <https://github.com/gpleiss/temperature_scaling/blob/master/temperature_scaling.py>.

    There exists alternative packages that calculate an ECE value, such as the netcal package. Depending on the implementation of binning in each package, the ECE values differ by small amounts (~1e-3).
    In the QUMPHY evaluation framework, the quoted ECE value is determined using this function.

    Parameters:
    -----------

    predictions: np.ndarray
        Model output probabilities with shape (n, k) for k-class classification.
    targets: np.ndarray
        Target class labels with shape (n, 1).
    filepath: str
        The filepath for the directory to save the reliability diagram to.
    save_fig: bool
        Boolean to determine whether to save the reliability diagram to the given filepath.
    bin_number: int, optional
        Number of bins to be used (note: bins are equal width in the mean confidence axis), by default 15.

    Returns:
    --------

    ECE: float
        ECE value for the input data
    """

    if len(predictions) != len(targets):
        raise ValueError("Number of predictions and target labels must match!")

    # Check that the predictions array has shape (n, k), where n = number of samples
    assert (
        predictions.ndim == 2
    ), "The shape of the prediction array must be (n, k) for n samples, with k classes."

    num_classes = predictions.shape[1]
    max_prob_thresh = 1.0 / num_classes  # Threshold to determine the predicted class

    # Set the limits of the bins, and acquire the upper and lower bounds of the bins
    bin_edges = np.linspace(max_prob_thresh, 1, bin_number + 1)
    bin_lower_bounds = bin_edges[:-1]
    bin_upper_bounds = bin_edges[1:]

    # Only need to consider confidences that are >= max_prob_thresh
    model_confidences = np.max(predictions, axis=1)

    # Indices of the max model probability - equivalent to the model's predicted class label
    predicted_labels = np.argmax(predictions, axis=1)

    # Boolean array for instances where the model predicts the correct class
    accs = predicted_labels == targets

    # Initialise the ECE value, and mean_confidences_per_bin & bin_accuracies lists
    ECE = 0
    mean_confidences_per_bin = []
    bin_accuracies = []

    # Loop over the bin boundaries
    for lower_bound, upper_bound in zip(bin_lower_bounds, bin_upper_bounds):
        # Boolean array that identifies which confidence values fall into the current bin
        in_bin = np.logical_and(
            model_confidences > lower_bound.item(),
            model_confidences <= upper_bound.item(),
        )

        # Empirical probability of finding the sample in that bin: equivalent to |Bm|/n
        prob_in_bin = in_bin.mean()

        # Determine bin statistics for all non-empty bins
        if prob_in_bin.item() > 0:
            # Determine accuracy for the predictions within the bin
            acc_bin = accs[in_bin].mean()

            # Determine the average confidence value for predictions within the bin
            avg_conf = model_confidences[in_bin].mean()

            # Calculate the ECE value for the bin
            ECE += (prob_in_bin) * np.abs(acc_bin - avg_conf)

            bin_accuracies.append(acc_bin)
            mean_confidences_per_bin.append(avg_conf)

    # Plot reliability diagram and save to filepath if chosen
    xs = mean_confidences_per_bin
    ys = bin_accuracies

    if save_fig:
        plt.clf()
        plt.figure(figsize=(8, 8))
        plt.plot(
            [max_prob_thresh, 1], [max_prob_thresh, 1], "--", color="black", alpha=0.5
        )
        plt.scatter(xs, ys, alpha=1)
        plt.plot(xs, ys, "b", alpha=0.4, label=f"AF classification: {ECE:.4f}")
        plt.xlabel("Mean confidence in bin", fontsize=13)
        plt.ylabel("Relative Frequency of Correct Predictions in bin", fontsize=13)
        plt.legend(loc="best")
        plt.title("Reliability Diagram for Classification", fontsize=15)
        plt.savefig(os.path.join(filepath, "ECE_reliability_diagram.png"))
        plt.close()

    # Return the mean confidences per bin array and bin accuracies per bin for plotting purposes
    return ECE, xs, ys




[docs]
def smooth_expected_calibration_error(
    predictions: np.ndarray, targets: np.ndarray, filepath: str, save_fig: bool = True
) -> float:
    """
    Calculate the smooth ECE value (smECE) for the predicted class probabilities for the classification task.


    References:
    -----------

    Details on the relplot python package can be found in the following repository: <https://github.com/apple/ml-calibration/tree/main>.
    The accompanying paper for the smooth ECE metric, and the smooth reliability diagrams can be found here: <https://arxiv.org/pdf/2309.12236>.


    Parameters:
    -----------

    predictions: np.ndarray
        Model output probabilities with shape (n, k) for k class classification.
    targets: np.ndarray
        Target class labels with shape (n, 1).
    filepath: str
        Path to directory to save the reliability diagram.
    save_fig: bool
        Boolean to determine whether to save the reliability diagram to the given filepath.

    Returns:
    --------

    smECE: float
        The calculated smECE value using the relplot python package.
    """

    assert (
        predictions.ndim == 2
    ), "The shape of the prediction array must be (n, k) for n samples, with k classes."
    if len(predictions) != len(targets):
        raise ValueError("Number of predictions and target labels must match!")

    original_plt_style = plt.rcParams.copy()

    # Helper function from the relplot package to select probabilities of the predicted class, and return an accuracies array.
    confidences, accuracies = rp.multiclass_logits_to_confidences(
        predictions, targets, probs=True
    )

    # Calculate the smooth ECE value for confidence calibration.
    smECE = rp.smECE(confidences, accuracies)
    # binnedECE = rp.metrics.binnedECE(confidences, accuracies, nbins = 20)

    # When plotting the reliability diagram in the binary case, it is easier to visualise calibration using the predicted probabilities for AF.
    AF_probs = predictions[:, 1]

    # Plot the reliability diagram for regression of outcomes on predictions and save to filepath directory.
    diagram = rp.prepare_rel_diagram(AF_probs, targets, plot_density=True)

    fig, ax = rp.plot_rel_diagram(diagram)
    ax.set_title("SmoothECE Reliability Diagram", fontsize=22)
    if save_fig:
        fig.savefig(os.path.join(filepath, "smECE_reliability_diagram.png"))
    plt.close(fig)

    plt.style.use("default")
    plt.rcParams.update(original_plt_style)

    # Uncertainty on smooth ECE value from bootstrapping and using the 95% confidence level band.
    # smECE_uncert = float(diagram['ce_ci_width'])

    return smECE




[docs]
def adaptive_calibration_error(
    predictions: np.ndarray,
    targets: np.ndarray,
    *args,
    range_number: int = 15,
    threshold: float = 0.0,
):
    """
    Calculate the Adaptive Calibration Error (ACE) for the predicted class probabilities.
    This metric will work for both the binary and multiclass classification cases, with the option to apply thresholding as done in the original paper.

    References:
    -----------

    The ACE is introduced in the following paper: <https://arxiv.org/pdf/1904.01685>.
    This implementation is heavily inspired by the more general calibration error functions in: <https://github.com/JeremyNixon/uncertainty-metrics-1/tree/master>.
    We present a simplified version here, focused on the ACE alone.

    Parameters:
    -----------

    predictions: np.ndarray
        Model output probabilities with shape (n, m) for classification. Note m >= 2 i.e. probabilities given for all classes.
    targets: np.ndarray
        Target class labels with shape (n, 1)
    *args:
        Used to catch any un-used arguments in the main UQ evaluation notebook.
    range_number: int, optional
        Number of ranges to be used (note: ranges are equal frequency bins in the mean confidence axis), by default 15
    threshold: float, optional
        Threshold value below which probabilities are ignored, by default 0.0 for the ACE i.e. no thresholding.

    Returns:
    --------

    ACE: float
        The ACE metric value.
    """

    if len(predictions) != len(targets):
        raise ValueError("Number of predictions and target labels must match!")

    # Check that the predictions array has shape (n, m), where n = number of samples and m = number of classes.
    assert (
        predictions.ndim == 2 and predictions.shape[1] >= 2
    ), "Predictions array must only have 2 dimensions, with a minimum of 2 classes."

    # Check the threshold value lies between 0 and 1.
    assert 0 <= threshold < 1, "Threshold must be between 0 and 1."

    num_classes = predictions.shape[1]

    # One-hot encode the ground truth class labels.
    labels_matrix = np.eye(num_classes)[targets]

    class_calibration_error_list = []

    for j in range(num_classes):
        probs_slice = predictions[:, j]
        labels_slice = labels_matrix[:, j]

        # Get probabilities that are greater than or equal to the threshold. For ACE, threshold is 0.
        mask = probs_slice >= threshold
        filtered_probs = probs_slice[mask]
        filtered_labels = labels_slice[mask]

        # If no predictions for a class, set calibration error to 0.
        if len(filtered_probs) == 0:
            class_calibration_error_list.append(0)
            continue

        # Sort the probabilities and corresponding labels.
        sorted_indices = np.argsort(filtered_probs, axis=-1)
        sorted_probs = filtered_probs[sorted_indices]
        sorted_labels = filtered_labels[sorted_indices]

        n_samples = len(sorted_probs)

        # Determine how many ranges to use dependent on number of predictions above threshold per class.
        actual_n_ranges = min(range_number, n_samples)
        if actual_n_ranges < range_number:
            # If we have fewer samples than specified number of ranges, create one bin if the sample number is
            actual_n_ranges = max(1, actual_n_ranges)  # Ensure at least one bin.

        # Define equal-frequency bin edges
        bin_edges = np.linspace(0, n_samples, actual_n_ranges + 1).astype(int)

        bin_calibration_errors = []
        bin_weights = []

        # Loop through the bins
        for i in range(actual_n_ranges):
            bin_lower, bin_upper = bin_edges[i], bin_edges[i + 1]

            if bin_lower == bin_upper:
                continue

            # Get the probabilities and labels for the current bin
            bin_probs = sorted_probs[bin_lower:bin_upper]
            bin_labels = sorted_labels[bin_lower:bin_upper]

            # Calculate mean predicted confidence and accuracy for the bin
            bin_confidence = np.mean(bin_probs)
            bin_accuracy = np.mean(bin_labels)
            bin_error = np.abs(bin_accuracy - bin_confidence)  # bin calibration error

            # Calculate the weight for the bin
            bin_weight = (bin_upper - bin_lower) / n_samples
            assert (bin_upper - bin_lower) == len(bin_probs)

            bin_calibration_errors.append(bin_error)
            bin_weights.append(bin_weight)

        # Calculate the weighted calibration error for this class
        class_error = np.sum(np.array(bin_calibration_errors) * np.array(bin_weights))
        class_calibration_error_list.append(class_error / num_classes)

    # Calculate overall ACE as sum of individual class calibration errors
    ACE = np.sum(class_calibration_error_list)

    return ACE




[docs]
def uncertainty_calibration_error(
    predictions: np.ndarray,
    targets: np.ndarray,
    filepath: str = None,
    save_fig: bool = False,
    prediction_entropies: np.ndarray = None,
    bin_number: int = 15,
):
    """
    Calculate the Uncertainty Calibration Error (UCE) for model predictions and target labels for classification.
    There exists the optional argument to provide an array of entropies of shape (n, 1); if None, the entropies will be calculated from the probabilities.


    References:
    -----------

    The UCE was introduced in the following paper: <https://arxiv.org/pdf/1909.13550>.
    This implementation is based on: <https://github.com/mlaves/bayesian-temperature-scaling/blob/master/uce.py>.


    Parameters:
    -----------
    predictions: np.ndarray
        Model output probabilities with shape (n, k) for k-class classification.
    targets: np.ndarray
        Target class labels with shape (n, 1).
    filepath: str
        Path to directory to save the reliability diagram.
    prediction_entropies: np.ndarray, optional
        The entropies for the predictions, by default None; if None, then the entropies are calculated from the prediction probabilities.
    bin_number: int, optional
        Number of bins to be used (note: bins are equal width in the normalised entropy axis), by default 15.


    Returns:
    --------

    UCE: float
        The UCE for the model predictions and their corresponding entropies.
    xs: np.ndarray
        Array of the mean normalised predicted entropies, used for plotting the reliability diagram.
    ys: np.ndarray
        Array of the empirical inaccuracy per bin, used for the plotting the reliability diagram.
    """
    if len(predictions) != len(targets) != len(prediction_entropies):
        raise ValueError(
            "Number of predictions, target labels, and entropies must match."
        )

    if prediction_entropies is not None:
        assert isinstance(prediction_entropies, np.ndarray)

    # Check that the predictions array has shape (n, k), where n = number of samples, and k = number of classes.
    assert (
        predictions.ndim == 2
    ), "The shape of the predictions must be (n, k) for n samples, and k classes."

    n_classes = predictions.shape[-1]

    # If no prediction entropies are provided, then they are calculated from the prediction probabilities.
    if prediction_entropies is None:
        prediction_entropies = []
        for prob_vector in predictions:
            # Only compute log where probabilities are non-zero to avoid log(0).
            ent = -np.sum(prob_vector * np.log(prob_vector, where=(prob_vector != 0)))
            prediction_entropies.append(ent)

        # Normalise the prediction entropies by np.log(k) for k-class classification
        prediction_entropies = (1 / np.log(n_classes)) * np.array(prediction_entropies)

    # Define the bin intervals.
    bin_edges = np.linspace(0, 1, bin_number + 1)
    bin_lower_bounds = bin_edges[:-1]
    bin_upper_bounds = bin_edges[1:]

    # Predicted class label from the predictions array.
    predicted_labels = np.argmax(predictions, axis=1)

    # Boolean array for instances where the model predicts the incorrect class.
    inaccuracies = predicted_labels != targets

    # Initialise the UCE value, and bin_entropy_values & bin_inaccuracies lists.
    UCE = 0
    bin_entropy_values = []
    bin_inaccuracies = []

    # Loop over the bin boundaries.
    for lower, upper in zip(bin_lower_bounds, bin_upper_bounds):
        # # Boolean array that identifies which confidence values fall into the current bin.
        in_bin = np.logical_and(
            prediction_entropies > lower.item(), prediction_entropies <= upper.item()
        )

        # Empirical probability of finding the sample in that bin: equivalent to |Bm|/n
        prob_in_bin = in_bin.mean()

        # Determine bin statistics for all non-empty bins.
        if prob_in_bin.item() > 0:
            # Calculate the average inaccuracy for the current bin.
            err_in_bin = inaccuracies[in_bin].mean()
            # Calculate the average normalised prediction entropy for the current bin.
            uncert_in_bin = prediction_entropies[in_bin].mean()

            # Calculate the UCE for the current bin and add it to the running total UCE value.
            norm_factor = (n_classes - 1) / n_classes
            # For classification, a uniform predictive distribution corresponds to maximum entropy (max entropy of 1, max inaccuracy of 1 - 1/k).
            # This means that to get the entropy to scale with inaccuracy, it must be normalised by (k-1)/k for k classes.
            UCE += (prob_in_bin) * np.abs(err_in_bin - (uncert_in_bin * norm_factor))

            bin_inaccuracies.append(err_in_bin)
            bin_entropy_values.append(uncert_in_bin)

    # Create the reliability diagram and save to the filepath directory
    xs = bin_entropy_values
    ys = bin_inaccuracies

    if save_fig:
        plt.clf()
        plt.figure(figsize=(8, 8))
        plt.plot([0.0, 1], [0.0, norm_factor], "--", color="black", alpha=0.75)
        plt.scatter(xs, ys, alpha=1)
        plt.plot(xs, ys, "tab:blue", alpha=0.4, label=f"Classification\nUCE: {UCE:.4f}")
        plt.xlabel("Mean entropy", fontsize=13)
        plt.ylabel("Relative Frequency of incorrect Predictions", fontsize=13)
        plt.legend(fontsize=12, loc="upper left")
        plt.title("Uncertainty Calibration Curve", fontsize=15)
        plt.savefig(os.path.join(filepath, "UCE_reliability_diagram.png"))
        plt.close()

    # Return the bin entropy array and bin inaccuracy array for plotting purposes later in the evaluation script
    return UCE, xs, ys



# New VCE function for multiclass

[docs]
def variation_calibration_error(
    predictions: np.ndarray,
    targets: np.ndarray,
    filepath: str,
    save_fig: bool = True,
    bin_number: int = 15,
) -> Tuple[float, np.ndarray, np.ndarray]:
    """
    Compute the Variation Calibration Error (VCE) metric. Extends the ECE to a measure of distributional calibration for multiclass problems.
    Measure of variation employed for this metric is the Shannon entropy. Bins can be defined with either equal-widths or equal-frequency.

    Parameters:
    -----------
    predictions (np.ndarray):
        Prediction array with shape (n, k) for n samples and k classes.
    targets (np.ndarray):
        Target class labels with shape (n, 1).
    save_fig (bool, default = False):
        Flag to save the reliability diagram.
    filepath (str, default = os.getcwd()):
        Filepath to which the reliability diagram is saved.
    bin_number (int, default = 10):
        Number of bins used to compute metric and plot reliability diagram.

    Returns:
    --------
    VCE (float):
        VCE metric value.
    xs (np.ndarray):
        Mean entropy of predictions array for plotting the reliability diagram.
    ys (np.ndarray):
        Entropy of the rank distribution array for plotting the reliability diagram.
    """
    assert (
        predictions.ndim == 2
    ), "The shape of the predictions must be (n, k) for n samples, and k classes."
    assert len(predictions) == len(
        targets
    ), "Predictions and targets array must have same length."

    num_classes = predictions.shape[-1]
    n_samples = len(predictions)

    # Get prediction entropies using the entropy function from utils.py
    prediction_entropies = entropy(predictions)

    # Assign bin edges based on the chosen binning strategy
    # Equal width bins, lower bound based on threshold
    bin_edges = np.linspace(0, 1, bin_number + 1)

    bin_indices = [
        np.where(
            (prediction_entropies >= bin_edges[i])
            & (prediction_entropies < bin_edges[i + 1])
        )[0]
        for i in range(bin_number)
    ]

    VCE = 0
    xs = []
    ys = []

    # Loop over bins
    for indices in bin_indices:
        bin_size = len(indices)

        # Bin weighting
        prop_in_bin = bin_size / n_samples
        print(f"Proportion of samples in this bin: {prop_in_bin}")

        # Determine bin statistics for all non-empty bins
        if prop_in_bin > 0:
            bin_preds = predictions[indices]
            bin_targs = targets[indices]
            # bin_ents = prediction_entropies[indices]

            # Mean entropy of model predictions for this bin
            # mean_bin_ent = np.mean(bin_ents)

            # --- NEW METRIC DEFINITION ---
            # Determine the entropy of the mean of the ordered predicted probability distributions
            p = np.mean(np.flip(np.sort(bin_preds, axis=1)), axis=0, keepdims=True)
            print(f"SHAPE: {p.shape}, MEAN_PROB: {p}")
            ent_bin_mean = entropy(p)

            # Order class labels in descending order by classifier confidence
            order = np.argsort(-bin_preds, axis=1)

            # Create rankings array for each class label
            ranks_matrix = np.zeros_like(order)
            rows = np.arange(bin_size)[:, np.newaxis]
            ranks_matrix[rows, order] = np.arange(num_classes)

            # Get rankings of the true class from the rankings array (1-indexed)
            true_ranks = ranks_matrix[np.arange(bin_size), bin_targs] + 1

            # Calculate the frequency of each rank to get distribution, and calculate entropy of distribution
            ranks_freqs = np.bincount(true_ranks - 1, minlength=num_classes)
            ranks_freqs = ranks_freqs / ranks_freqs.sum()
            ranks_entropy = entropy(
                ranks_freqs[np.newaxis, :]
            ).item()  # Entropy of ranks distribution using entropy function from utils.py

            # Calculate VCE, weighted by bin size
            VCE += prop_in_bin * (np.abs(ent_bin_mean.item() - ranks_entropy))
            xs.append(ent_bin_mean.item())
            ys.append(ranks_entropy)

    if save_fig:
        plt.clf()
        plt.figure(figsize=(8, 8))
        plt.plot([0.0, 1], [0.0, 1], "--", color="black", alpha=0.75)
        plt.scatter(xs, ys, alpha=1)
        plt.plot(xs, ys, "tab:blue", alpha=0.4, label=f"Classification\nVCE: {VCE:.4f}")
        plt.xlabel("Mean Normalised Entropy", fontsize=13)
        plt.ylabel("Mean Empirical Entropy", fontsize=13)
        plt.legend(fontsize=12, loc="upper left")
        plt.title("Uncertainty Calibration Curve", fontsize=15)
        plt.savefig(
            os.path.join(filepath, f"VCE_reliability_diagram_{num_classes}_classes.png")
        )
        plt.close()

    return VCE, np.asarray(xs), np.asarray(ys)




[docs]
def negative_log_likelihood(
    predictions: np.ndarray, targets: np.ndarray, *args
) -> float:
    """

    Compute the negative log likelihood metric for the predicted probabilities for k-class classification.
    Values are clipped to avoid divide by zero errors when taking the base-2 logarithm.

    Parameters:
    -----------
    predictions: np.ndarray
        Model output probabilities with shape (n, k) for k-class classification.
    targets: np.ndarray
        Target class labels with shape (n, 1).
    *args:
        Used to capture other arguments passed to the function that are not necessary e.g. filepath for reliability diagram.


    Returns:
    --------
        nll (float): Calculated NLL value.
    """
    assert len(predictions) == len(
        targets
    ), "Number of predictions and ground truth class labels must match!"
    assert (
        predictions.ndim == 2
    ), "Predictions array must be 2D with shape (n_samples, n_classes)."
    # Clipping to avoid log(0).
    epsilon = 1e-15
    clipped_predictions = np.clip(predictions, epsilon, 1 - epsilon)

    ll = mp.mpf(0)  # Arbitrary precision float.
    count = 0

    for i, (pred, label) in enumerate(zip(clipped_predictions, targets)):
        # Get probability for the ground truth class.
        true_class_prob = float(pred[int(label)])

        # Calculate log probability for ground truth class.
        log_prob = mp.log(true_class_prob, b=2)
        ll += log_prob
        count += 1  # Counting successful calculations.

    # print(f"Samples used for NLL calculation: {count} out of {n_samples}")
    # Calculate average negative log-likelihood.
    nll = -ll / count

    return float(nll)



# Placeholder coverage function for prediction sets.

[docs]
def set_coverage(prediction_sets, targets):
    """

    Calculate the prediction set coverage for model ouputs given from either basic conformal prediction, or sets created using top-k selection from model probabilities.
    Note: top-k selection from model probabilities has no coverage guarantee, in contrast to conformal methods.

    Parameters:
    -----------
        prediction_sets (np.ndarray(List)): Array of lists containing the predicted classes.
        targets (np.ndarry): The ground truth class labels.

    Returns:
    --------
        float: The empirical frequency of the ground truth class labels in the set.
    """

    return np.mean([target in set for set, target in zip(prediction_sets, targets)])



# Optional binary classification metrics that evaluate calibration based on cumulative differences between predicted probabilities and ground truth labels.

[docs]
def expected_cumulative_calibration_errors(
    predictions: np.ndarray, targets: np.ndarray, filepath: str, save_fig: bool = True
):
    """

    Calculating the ECCE-MAD (aka Kolomogorv-Smirnov statistic) and ECCE-R (aka Kuiper statistic) metrics.
    The saved plot shows the deviation from zero of the cumulative errors.

    References:
    -----------

    The ECCE metrics are introduced in the following: <https://arxiv.org/pdf/2205.09680>.
    This implementation is from the cumulative function in: <https://github.com/facebookresearch/ecevecce/blob/main/codes/calibration.py>.

    The alterations to the cumulative function simply involve editing the format of the inputs to the function to be consistent with the other metrics.
    In addition, the normalised ECCE_MAD and ECCE_R metrics are given, as this is easier to compare with the target limits of convergence under the null hypothesis of perfect calibration.


    Parameters:
    -----------

    predictions: np.ndarray
        Model output probabilities with shape (n, 2) for binary classification
    targets: np.ndarray
        Target class labels with shape (n, 1)
    filepath: str
        Path to directory to save the plot of the cumulative calibration errors.
    save_fig: bool
        Boolean to determine whether to save the reliability diagram to the given filepath.

    Returns:
    --------

    norm_ECCE-MAD: float
        The maximum absolute deviation from zero statistic (Kolmogorov-Smirnov) for the cumulative differences between the confidences and outcomes
    norm_ECCE-R: float
        The range of deviation from zero statistic (Kuiper) for the cumulative differences between the confidences and outcomes
    statistical_significance_scale: float
        Normalisation factor that is used to assess statistically significant deviations from the asymptotic  expected values (under large sample size n)

    """

    # Helper function to determine the location of the minor ticks
    def histcounts(nbins, x):
        # Counts the number of entries of x
        # falling into each of nbins equispaced bins.
        j = 0
        nbin = np.zeros(nbins)
        for k in range(len(x)):
            if x[k] > (j + 1) / nbins:
                j += 1
            if j == nbins:
                break
            nbin[j] += 1
        return nbin

    assert (
        predictions.ndim == 2 and predictions.shape[1] == 2
    ), "Predictions must have shape (n_samples, 2) for binary classification"

    # Get accuracy outcomes as binary array (1 for successful prediction, 0 for unsuccessful prediction)
    unsorted_responses = (
        np.argmax(predictions, axis=-1).astype(int) == targets
    ).astype(int)
    unsorted_scores = np.max(predictions, axis=-1)  # Probability of predicted class

    # Add small noise to ensure uniqueness, and get sorted indices
    scores_with_noise = unsorted_scores * (
        np.ones(shape=unsorted_scores.shape)
        + np.random.normal(size=unsorted_scores.shape) * 1e-6
    )

    sorted_indices = np.argsort(scores_with_noise, axis=-1)
    scores = scores_with_noise[sorted_indices]
    responses = unsorted_responses[
        sorted_indices
    ]  # Preserve pairing of outcome with predicted probability/confidence

    # The predictions must all be unique and in ascending order
    assert all(
        scores[k] < scores[k + 1] for k in range(len(scores) - 1)
    ), "The scores must be in strictly ascending order"
    assert len(responses) == len(
        scores
    ), "The lengths of the outcomes and scores arrays must be the same"

    plt.clf()
    plt.figure()
    ax = plt.axes()

    # Statistically significant length scale, which indicates significant deviations from zero (under null hypothesis of perfect calibration)
    lenscale = np.sqrt(np.sum(scores * (1 - scores))) / len(scores)

    # Cumulative sums of the scores/predictions, and the responses/outcomes (success or failure)
    cumulative_responses = np.cumsum(responses) / int(len(responses))
    cumulative_scores = np.cumsum(scores) / int(len(scores))

    # Plot the difference.
    plt.plot(
        np.insert(cumulative_responses - cumulative_scores, 0, [0])[
            : (int(len(cumulative_responses)) + 1)
        ],
        "k",
    )

    # Make sure the plot includes the origin.
    plt.plot(0, "k")

    # Add an indicator of the scale of 1/sqrt(n) to the vertical axis.
    plt.plot(2 * lenscale, "k")
    plt.plot(-2 * lenscale, "k")
    kwargs = {
        "head_length": 2 * lenscale,
        "head_width": len(cumulative_scores) / 20,
        "width": 0,
        "linewidth": 0,
        "length_includes_head": True,
        "color": "k",
    }
    plt.arrow(0.1e-100, -2 * lenscale, 0, 4 * lenscale, shape="left", **kwargs)
    plt.arrow(0.1e-100, 2 * lenscale, 0, -4 * lenscale, shape="right", **kwargs)
    plt.margins(x=0, y=0)

    # Hardcode the number of major and minor ticks
    majorticks = 10
    minorticks = 100

    # Label the major ticks of the lower axis with the values of the scores.
    ss = [
        "{:.5f}".format(x)
        for x in np.insert(cumulative_scores, 0, [0])[
            :: (len(cumulative_scores) // majorticks)
        ].tolist()
    ]

    plt.xticks(
        np.arange(majorticks) * len(cumulative_scores) // majorticks, ss[:majorticks]
    )

    ax.tick_params(which="major", axis="x", labelsize=8.5)

    if len(cumulative_scores) >= 300 and minorticks >= 50:
        # Indicate the distribution of the scores via unlabeled minor ticks.
        plt.minorticks_on()
        ax.tick_params(which="minor", axis="x")
        ax.tick_params(which="minor", axis="y", left=False)
        ax.set_xticks(np.cumsum(histcounts(minorticks, cumulative_scores)), minor=True)

    # Label the axes.
    plt.xlabel("$S_k$")
    plt.ylabel("$C_k$")
    plt.twiny()
    plt.xlabel("$k/n$")

    # Calculate the cumulative differences between the predictions and outcomes
    cumulative_diffs = np.insert(cumulative_responses - cumulative_scores, 0, [0])[
        : int(len(cumulative_responses) + 1)
    ]

    # Determine the metric values from the cumulative differences
    ECCE_R = np.max(cumulative_diffs) - np.min(cumulative_diffs)
    ECCE_MAD = np.max(np.abs(cumulative_diffs))

    # Get metrics normalised by length scale to compare with MAD and Range of standard Brownian motion ()
    norm_ECCE_MAD = ECCE_MAD / lenscale
    norm_ECCE_R = ECCE_R / lenscale

    legend_text = (
        f"ECCE-R/σ (norm. Kuiper): {norm_ECCE_R:.4f}\n"
        f"ECCE-MAD/σ (norm. K-S): {norm_ECCE_MAD:.4f}"
    )

    plt.figtext(
        0.94,
        0.80,
        legend_text,
        horizontalalignment="right",
        verticalalignment="top",
        bbox=dict(
            facecolor="white", alpha=0.8, edgecolor="gray", boxstyle="round,pad=0.5"
        ),
    )

    # Title the plot.
    plt.title("Cumulative Calibration Curve")

    # Clean up the whitespace in the plot.
    plt.tight_layout()

    # Save the plot if desired.
    if save_fig:
        plt.savefig(
            os.path.join(filepath, "cumulative_calibration_plot.png"),
            bbox_inches="tight",
        )
    plt.close()

    # print(f"Cumulative calibration Statistics:")
    # print("-"*77)
    # print(f"ECCE_MAD statistic: {ECCE_R:.5f}")
    # print(f"ECCE_MAD statistic: {ECCE_MAD:.5f}")
    # print(f"The variance/length scale of statistically significant deviations: {lenscale:.7f}")
    # print("-"*77)
    # print(f"The normalised ECCE-MAD: {norm_ECCE_MAD}\nThe normalised ECCE-R: {norm_ECCE_R}")

    return ["%.5f" % norm_ECCE_MAD, "%.5f" % norm_ECCE_R]



#######################################################################################
# ----------------------------------------------------------------------------------- #
# THE METRICS BELOW ARE TO CALCULATE AVERAGE CALIBRATION ERRORS FOR REGRESSION TASKS  #
# ----------------------------------------------------------------------------------- #
#######################################################################################



[docs]
def picp(
    predictions: np.ndarray,
    targets: np.ndarray,
    confidence_level: float,
    is_interval: bool = True,
    uncertainties: np.ndarray = None,
    is_var: bool = True,
) -> tuple:
    """

    Calculate the Prediction Interval Coverage Probability (PICP) for predictions given either as intervals or point-predictions, for a specified confidence level.
    If inputs are given as point-predictions, uncertainty (as variance/standard deviations) arrays must be given, from which intervals are derived, assuming a Gaussian distribution.

    The Mean Prediction Interval Width (MPIW) is also returned. Note this is an unbounded non-negative metric.


    Parameters:
    -----------

        predictions: np.ndarray
            Array of model predictions with shape either (n, 2) for intervals or (n, 1) for point predictions
        targets: np.ndarray
            Array of target values with shape (n, 1)
        confidence_level: float
            The confidence level of the prediction intervals. If point-predictions are given, intervals will be created assuming Gaussian distribution at this confidence level.
        is_interval: bool
            Boolean flag to set whether predictions are given as intervals or point-predictions. Defaults to True.
        uncertainties: np.ndarray
            Array of model uncertainties with shape (n, 1). Defaults to None, must be present if is_interval is False.
        is_var: bool
            Bollean flag corresponding to if the uncertainties are given as variances (default) or standard deviations. Defaults to True.

    Returns:
    --------

        picp: float
            The prediction interval coverage probability of the model predictions. Aim is for the PICP to match the confidence level.
        MPIW: float
            The Mean Predicted Interval Wdith of the model predictions.
    """

    assert (
        isinstance(confidence_level, float) and 0 < confidence_level <= 1.0
    ), "Confidence level must be a float between 0 and 1."
    confidence_level = np.abs(confidence_level)  # Force confidence level to be positive

    if is_interval:
        intervals = predictions

        assert (
            intervals.shape[1] == 2
        ), "Intervals array must be of shape (n_samples, 2)"
        assert np.all(
            intervals[:, 0] <= intervals[:, 1]
        ), "Intervals must be given as (lower, upper)"
        if len(intervals) != len(targets):
            raise ValueError(
                "The lengths of the intervals and targets array must be the same."
            )

        within_interval = np.logical_and(
            targets >= intervals[:, 0], targets <= intervals[:, 1]
        )

        # Mean Prediction Interval Width.
        MPIW = np.mean(np.abs(intervals[:, 0] - intervals[:, 1]))

        # Fraction of targets that lie within the prediction interval.
        picp = np.mean(within_interval)

    # If prediction intervals are not given, then they are created from point predictions and uncertainties, assuming Gaussian distribution.
    else:
        assert (
            uncertainties is not None and not is_interval
        ), "Please provide uncertainties as variances for point predictions."
        assert isinstance(is_var, bool), '"is_var" must be a boolean.'

        if len(predictions) != len(targets) != len(uncertainties):
            raise ValueError(
                "The lengths of the predictions, targets, and uncertainties array must be the same."
            )

        if is_var:
            std_devs = np.sqrt(uncertainties)
        else:
            std_devs = uncertainties

        # Determine lower and upper bounds, based on defined confidence level
        lower_bounds = stats.norm.ppf(
            (1 - confidence_level) / 2, loc=predictions, scale=std_devs
        )
        upper_bounds = stats.norm.ppf(
            (1 + confidence_level) / 2, loc=predictions, scale=std_devs
        )

        within_interval = np.logical_and(
            targets >= lower_bounds, targets <= upper_bounds
        )

        # Mean prediction interval width - smaller the interval, the sharper the prediction
        MPIW = np.mean(np.abs(upper_bounds - lower_bounds))

        # Prediction interval coverage probability - fraction of target values that lie within the predicted intervals
        picp = np.mean(within_interval)

    # print(f"PICP: {picp}")
    # print(f"MPIW: {MPIW}")

    return picp, MPIW




[docs]
def coverage_calibration_error(
    predictions: np.ndarray,
    targets: np.ndarray,
    uncertainties: np.ndarray,
    filepath: str = None,
    is_var: bool = True,
    save_fig: bool = False,
    number_of_intervals: int = 10,
):
    """
    Calculate the confidence interval-based metric for model predictions and their corresponding uncertainties.
    If uncertainties are given as variances, they are converted to standard deviations.

    Parameters:
    -----------

    predictions: np.ndarray
        Array of model predictions with shape (n, 1)
    targets: np.ndarray
        Array of target values with shape (n, 1)
    uncertainties: np.ndarray
        Array of total uncertainties (aleatoric + epistemic) with shape (n, 1)
    filepath: str
        Path to directory to save the coverage calibration curve.
    is_var: bool, optional
        Boolean parameter that determines whether to convert the uncertainties from variances to standard deviations, by default True
    save_fig: bool, optional
        Boolean parameter that sets whether to save the plot to the filepath, by default True.
    number_of_intervals: int, optional
        The number of confidence intervals to evaluate the calibration error on, by default 10 (range 0 to 1 in steps of 0.1)


    Returns:
    --------

    calibration_error: float
        The confidence interval-based calibration error metric
    """

    assert (
        len(predictions) == len(targets) == len(uncertainties)
    ), "Input arrays have different lengths!"
    predictions, targets, uncertainties = (
        np.array(predictions),
        np.array(targets),
        np.array(uncertainties),
    )
    assert (
        predictions.ndim == targets.ndim == uncertainties.ndim
    ), "Input arrays have different dimensions \nExpected all arrays to have shape (n, 1)"

    num_samples = len(predictions)

    # Define the confidence intervals
    confidence_levels = np.linspace(0, 1, number_of_intervals + 1)[1:]

    # If the uncertainties are given as variances, convert them to standard deviations
    if is_var:
        uncertainties = np.sqrt(uncertainties)

    # Initialise the calibration error value, and the observed agreements list
    observed_agreements = []
    calibration_error = 0

    # Loop over the confidence intervals
    for p in confidence_levels:
        # Calculate the threshold value of the inverse CDF of the predictions for the given confidence level 100*p%
        threshold = stats.norm.ppf(p, predictions, uncertainties)

        # Sum the number of target values that lie in the region: GT <= threshold
        within_interval = (targets <= threshold).sum()
        # Relative number of ground truths that lie below the threshold for a given confidence interval
        empirical_freq = within_interval / num_samples
        observed_agreements.append(empirical_freq)

        # Calculate the calibration error as the sum of squared differences
        calibration_error += np.square(p - empirical_freq)

    assert len(observed_agreements) == len(
        confidence_levels
    ), "The shape of agreements != the shape of the confidence intervals!"

    # Plot the coverage calibration curve
    xs = confidence_levels
    ys = observed_agreements

    if save_fig:
        plt.figure(figsize=(8, 8))
        plt.plot([0, 1], [0, 1], "--", color="black", alpha=0.5)
        plt.scatter(xs, ys, alpha=1)
        plt.plot(
            xs, ys, "b", alpha=0.4, label=f"Regression \nCCE: {calibration_error:.4f}"
        )
        plt.xlabel("Expected Confidence Level")
        plt.ylabel("Observed Confidence Level")
        plt.legend(loc="upper left")
        plt.title("Regession Calibration Curve")
        plt.savefig(os.path.join(filepath, "coverage_calibration_curve.png"))
        plt.close()

    return calibration_error




[docs]
def continuous_ranked_probability_score(
    predictions: np.ndarray,
    targets: np.ndarray,
    uncertainties: np.ndarray,
    converter: str = "Gaussian",
    distributions: list = None,
    is_var: bool = True,
) -> float:
    """

    Calculate the Continuous Ranked Probability Score (CRPS) from the given model predictions, uncertainties, and corresponding ground truths.
    The option is given to calculate the CRPS either using the analytical expression given the model predictions parameterising a Gaussian, or using numerical integration.
    The numerical integration method for a distribution estimated with KDE yields errors of <1% compared to the analytical expression for a Gaussian.

    References:
    -----------

    The CRPS and its analytical expression for a univariate Gaussian is given in Gneiting et al., 2007: <https://sites.stat.washington.edu/raftery/Research/PDF/Gneiting2007jasa.pdf>.

    Parameters:
    -----------

    predictions (np.ndarray):
        An array of the model predictions with shape (n, 1).
    targets (np.ndarray):
        Array of target values with shape (n, 1).
    uncertainties (np.ndarray):
        Array of the total uncertainties associated with the model predictions from a chosen UQ method, with shape (n, 1).
    converter (str, optional):
        Specify whether the converter used was "Gaussian" or "KDE"; evaluates CRPS using numerical integration of CDF if "KDE", otherwise uses analytical expression for Gaussian. Defaults to "Gaussian".
    distributions (List[distribution objects], optional):
        Option to include the list of distributions as a direct input - needed if input is from KDE conversion of prediction interval to distribution. Defaults to None.
    is_var (bool, optional):
        Boolean parameter that sets whether to convert the uncertainties from variances to standard deviations. Defaults to True.

    Returns:
    --------

    CRPS (float):
        The average CRPS value across the dataset.
    """

    # Check function inputs
    assert (
        converter == "Gaussian" or converter == "KDE"
    ), 'Please specify the conversion method used as either: "Gaussian" or "KDE".'

    if is_var:
        std_devs = np.sqrt(uncertainties)
    else:
        std_devs = uncertainties

    CRPS_values = []
    # Calculate the CRPS from the analytical expression for a Gaussian distribution.
    if converter == "Gaussian":
        # Calculate the z-scores for each prediction and ground truth.
        z_scores = (targets - predictions) / std_devs

        for i, (z_score, uncert) in enumerate(zip(z_scores, std_devs)):
            # Determine the CDF and PDF values for the standard normal distribution at the z_score value.
            CDF = stats.norm.cdf(z_score, loc=0, scale=1)
            PDF = stats.norm.pdf(z_score, loc=0, scale=1)
            # Calculate the CRPS for a given ground truth using the analytical expression for a Gaussian.
            crps = uncert * ((2 * PDF) + z_score * (2 * CDF - 1) - (1 / np.sqrt(np.pi)))
            CRPS_values.append(crps)

    # Manually need to specify that the converter is KDE to use this method. Must be changed if new conversion methods are added.
    elif converter == "KDE":
        assert (
            distributions is not None
        ), "Please provide a list of distribution objects if you have used the KDE conversion method."
        assert len(distributions) == len(
            targets
        ), "Distribution and target array must have the same length."

        # For each distribution parameterised by a mean and standard deviation, calculate the CRPS with numerical integration and append to the list.
        for distribution, target in zip(distributions, targets):
            assert all(hasattr(distribution, attr) for attr in ["density", "support"])

            support = distribution.support
            pdf_vals = (
                distribution.density
            )  # PDF values for the distribution evaluated at the support
            dx = support[1] - support[0]  # Resolution of the support

            # Approximate the CDF of the distribution with a cumulative numerical integration of the PDF array.
            cdf_vals = cumulative_simpson(pdf_vals, dx=dx, initial=0)

            # The bounds of integration need extending in the case that the target is outside of the support values for the KDE distribution.

            # This is a crude method that assumes CDF is 0/1 to the left/right of the support values - yields an error of ~1% compared to ground truth CRPS values
            # but is necessary to prohibit large computation time with more accurate numerical integration methods.

            if target < support[0]:
                # Values from the observation upto the left tail of the support to define the integration range.
                left_extension = np.arange(target, support[0], dx)
                extended_support = np.concatenate(
                    [left_extension, support]
                )  # Extended support range to cover observation
                # Estimate CDF values to be 0 from left tail of the support upto target value.
                extended_cdf = np.concatenate([np.zeros_like(left_extension), cdf_vals])

            elif target > support[-1]:
                # Values from the right tail of the support upto the observation, to define the integration range.
                right_extension = np.arange(support[-1] + dx, target + dx, dx)
                extended_support = np.concatenate(
                    [support, right_extension]
                )  # Extended support range to cover observation
                # Estimate CDF values to be 1 from right tail of the CDF upto the target value.
                extended_cdf = np.concatenate([cdf_vals, np.ones_like(right_extension)])

            else:
                # Target lies within the support bounds of the distribution, so no extension necessary.
                extended_support = support
                extended_cdf = cdf_vals

            # Define the empirical CDF of the target value.
            heaviside = np.where(extended_support >= target, 1.0, 0.0)

            # Integrand of the CRPS
            squared_diff = np.square(extended_cdf - heaviside)

            # Numerical integration of the integrand over the extended support region that includes the target value.
            crps = simpson(squared_diff, dx=dx)
            CRPS_values.append(crps)

    CRPS = np.mean(CRPS_values)

    return CRPS



###########################################################################################
# --------------------------------------------------------------------------------------- #
# THE METRICS BELOW ARE TO CALCULATE CONDITIONAL CALIBRATION ERRORS FOR REGRESSION TASKS  #
# --------------------------------------------------------------------------------------- #
###########################################################################################



[docs]
def expected_normalised_calibration_error(
    predictions: np.ndarray,
    targets: np.ndarray,
    uncertainties: np.ndarray,
    filepath: str = None,
    is_var: bool = False,
    save_fig: bool = False,
    bin_number: int = 15,
):
    """
    Calculating the Expected Normaised Calibration Error (ENCE) for the model predictions and the corresponding total uncertainties.

    Parameters:
    -----------

    predictions: np.ndarray
        An array of the model predictions with shape (n, 1)
    targets: np.ndarray
        Array of target values with shape (n, 1)
    uncertainties: np.ndarray
        Array of the total uncertainties associated with the model predictions from a chosen UQ method, with shape (n, 1)
    filepath: str
        Path of directory to save the reliability diagram.
    is_var: bool, optional
        Boolean parameter that sets whether to convert the uncertainties to standard deviations, by default True
    save_fig: bool, optional
        Boolean parameter that sets whether to save the plot to the filepath, by default True.
    bin_number: int, optional
        Chosen number of bins for calculating ENCE (note: the bins are defined to have an equal number of samples per bin), by default 15

    Returns:
    --------

    ENCE: float
        The value of the ENCE calculated for the given inputs
    """

    predictions, targets, uncertainties = (
        np.array(predictions),
        np.array(targets),
        np.array(uncertainties),
    )
    assert (
        len(predictions) == len(targets) == len(uncertainties)
    ), "Input arrays have different lengths!"

    if not (predictions.ndim == targets.ndim == uncertainties.ndim):
        raise ValueError(
            "Input arrays have different dimensions \nExpected all arrays to have shape (n, 1)"
        )

    # If the optional argument is_var = True, then convert the uncertainties to standard deviations
    if is_var:
        uncertainties = np.sqrt(uncertainties)

    # Create an array of the errors
    errors = predictions - targets

    # Sort the arrays on increasing standard deviation (σ) and sort the errors array based on these indices
    sorted_indices = np.argsort(uncertainties)
    sorted_stdevs = uncertainties[sorted_indices]
    sorted_errors = errors[sorted_indices]

    n_samples = len(predictions)

    # Define the equal-frequency bin edges
    bin_edges = np.linspace(0, n_samples, bin_number + 1).astype(int)

    # Initialise the ENCE value, and the RMSE and RMV lists
    ENCE = 0
    RMSEs = []
    RMVs = []

    # Loop through the bins
    for i in range(bin_number):
        # Assign the bin boundaries for the current bin
        bin_lower, bin_upper = bin_edges[i], bin_edges[i + 1]

        # Determine which uncertainties and errors fall within the current bin
        bin_stdevs = sorted_stdevs[bin_lower:bin_upper]
        bin_errors = sorted_errors[bin_lower:bin_upper]

        # Convert the uncertainties in the bin to variances
        bin_vars = (bin_stdevs) ** 2

        # Calculate the average variance for the bin (RMV)
        bin_MV = bin_vars.mean()
        # Calculate the average squared error for the bin (RMSE)
        bin_MSE = ((bin_errors) ** 2).mean()

        RMSEs.append(np.sqrt(bin_MSE))
        RMVs.append(np.sqrt(bin_MV))

        # Update the ENCE value with the absolute difference between the RMSE and RMV, normalised by the RMV
        ENCE += np.abs(np.sqrt(bin_MV) - np.sqrt(bin_MSE)) / (np.sqrt(bin_MV))

    # Average the ENCE across all the bins
    ENCE = ENCE / bin_number

    # Plot the reliability diagram and save to filepath
    xs = RMVs
    ys = RMSEs

    if save_fig:
        plt.figure(figsize=(8, 8))
        plt.plot(
            [np.min(xs), np.max(xs)],
            [np.min(xs), np.max(xs)],
            "--",
            color="black",
            alpha=0.75,
        )
        plt.scatter(xs, ys, alpha=1)
        plt.plot(xs, ys, "b", alpha=0.4, label=f"Regression\nENCE = {ENCE:.4f}")
        plt.xlabel("RMV per bin", fontsize=13)
        plt.ylabel("RMSE per bin", fontsize=13)
        plt.tick_params(axis="x", labelsize=11)
        plt.tick_params(axis="y", labelsize=11)
        plt.legend(fontsize=12)
        plt.title("Regression Calibration Curve")
        # plt.ylim(np.min(xs)-1, np.max(xs)+1)
        # plt.xlim(np.min(xs)-1, np.max(xs)+1)
        plt.savefig(os.path.join(filepath, "ENCE_reliability_diagram.png"))
        plt.close()

    return ENCE, xs, ys



# Not used in the QUMPHY evaluation framework.

[docs]
def z_variance_error(predictions, targets, uncertainties, is_var=True, bin_number=15):
    """
    Calculate the Z-variance error (ZVE) for the model predictions and corresponding total uncertainties.


    Parameters:
    -----------

    predictions: np.ndarray
        An array of the model predictions with shape (n, 1)
    targets: np.ndarray
        Array of target values with shape (n, 1)
    uncertainties: np.ndarray
        Array of the total uncertainties associated with the model predictions from a chosen UQ method, with shape (n, 1)
    is_var: bool, optional
        Boolean parameter that sets whether to convert the uncertainties from variances to standard deviations, by default True
    bin_number: int, optional
        Chosen number of bins for calculating ZVE (note: the bins are defined to have an equal number of samples per bin), by default 15


    Returns:
    --------

    ZVE: float
        The ZVE value for the input predictions and uncertainties
    """

    assert (
        len(predictions) == len(targets) == len(uncertainties)
    ), "Input arrays have different lengths!"
    assert (
        predictions.ndim == targets.ndim == uncertainties.ndim
    ), "Input arrays have different dimensions \nExpected all arrays to have shape (n, 1)"

    # Create an array of the errors
    errors = predictions - targets

    # Sort based on the uncertainties
    sorted_indices = np.argsort(uncertainties)
    sorted_uncertainties = uncertainties[sorted_indices]
    sorted_errors = errors[sorted_indices]

    n_samples = len(errors)

    # Create the bin edges for equal frequency bins
    bin_edges = np.linspace(0, n_samples, bin_number + 1).astype(int)

    # Initiliase ZVE, bin sizes list, Var(Z) list, bin centres list, σ(E) list, RMS(uncertainties) list
    ZVE = 0
    bin_sizes = []

    # These are used to plot Var(Z) vs bin_centres, to check that Var(Z) is close to 1 for conditional calibration
    Var_Zs = []
    bin_centres = []

    # These are used to plot the reliability diagram to check conditional calibration
    stdev_errors = []
    RMS_uncerts = []

    # Loop over each bin
    for i in range(bin_number):
        bin_lower, bin_upper = bin_edges[i], bin_edges[i + 1]

        bin_errors = sorted_errors[
            bin_lower:bin_upper
        ]  # the errors that lie within the current bin
        bin_vars = sorted_uncertainties[
            bin_lower:bin_upper
        ]  # these uncertainties are output as variances from the model

        if is_var:
            bin_stdevs = np.sqrt(
                np.asarray(bin_vars)
            )  # converting the uncertainties to standard deviations to calculate the Z-score per bin

        bin_length = len(bin_errors)
        bin_sizes.append(
            bin_length
        )  # Obtain a list of the bin sizes to check they have an equal number of samples

        bin_z_scores = (bin_errors) / (
            bin_stdevs
        )  # List of the Z-scores in the current bin

        bin_z_score_variance = np.var(
            np.asarray(bin_z_scores)
        )  # Variance of the Z-scores for the current bin

        # Append the variance of the z score for the bin, as well as the bin centre, for the plot of Var(Z) against bin centre
        Var_Zs.append(bin_z_score_variance)
        bin_centres.append(bin_stdevs[bin_length // 2])

        # Append the bin σ(E) and bin mean RMS(u), for the reliability diagram
        stdev_errors.append(np.std(bin_errors))
        RMS_uncerts.append(np.sqrt(bin_vars.mean()))

        # Calculate the ZVE for the bin and add it to the running ZVE score
        ZVE += np.abs(np.log(bin_z_score_variance))

    # Average the log(Var(Z)) across all bins, and return the overall ZVE
    ZVE = np.exp(ZVE / bin_number)

    return ZVE