Source code for qumphy.metrics

"""
File: qumphy/metrics.py
Project: 22HLT01 QUMPHY
Contact: nando.hegemann@ptb.de
Gitlab: https://gitlab.com/qumphy
Description: Evaluation metrics for model performance.
"""

from __future__ import annotations

import typing
import warnings
import torch
import numpy as np
import sklearn.metrics



[docs]
def all_binary_metrics(
    target: np.ndarray,
    prediction: np.ndarray,
) -> dict[str, float]:
    """Evaluate all binary classification metrics.

    Given a target and a prediction array, this function computes
    all metrics as decided for the QUMPHY common evaluation framework.

    The metrics are returned as a dictionary with the following keys:

    - `auc`: Area under the curve calculated with raw probabilities
    - `f1`: F1-score calculated with a classification threshold of 0.5
    - `mcc_sens`: Matthews correlation coefficient calculated with a threshold achieving a sensitivity of 0.8
    - `mcc_spec`: Matthews correlation coefficient calculated with a threshold achieving a specificity of 0.8
    - `sens`: Sensitivity evaluated at the threshold achieving a specificity of 0.8
    - `spec`: Specificity evaluated at the threshold achieving a sensitivity of 0.8

    If `prediction` contains at least one NaN, no metric is evaluated and
    every entry of the returned dictionary is NaN.

    Parameters
    ----------
    target : np.ndarray
        Ground truth values.
    prediction : np.ndarray
        Model output predictions (raw probability of positive
        class).

    Returns
    -------
    Dict[str, float]
        Dictionary with all metrics.
    """
    metric_names = ("auc", "f1", "mcc_sens", "mcc_spec", "sens", "spec")

    # NaN never compares equal to anything (not even to itself), so the
    # check has to use np.isnan; it runs first so that no threshold is
    # derived from invalid predictions.
    if np.any(np.isnan(prediction)):
        return {name: np.nan for name in metric_names}

    # Fix the threshold so that recall is 0.8
    # This is hard-coded here for the QUMPHY common evaluation framework.
    recall_value = 0.8
    threshold_sens = recall_score_threshold(
        target, prediction, recall_value=recall_value, pos_label=1
    )
    threshold_spec = recall_score_threshold(
        target, prediction, recall_value=recall_value, pos_label=0
    )

    prediction_05 = prediction > 0.5
    prediction_sens = prediction > threshold_sens
    prediction_spec = prediction > threshold_spec

    metrics_dict = {}
    metrics_dict["auc"] = auc_score_binary(target, prediction)
    metrics_dict["f1"] = f1_score(target, prediction_05, average="binary")
    metrics_dict["mcc_sens"] = matthews_correlation_coefficient(target, prediction_sens)
    metrics_dict["mcc_spec"] = matthews_correlation_coefficient(target, prediction_spec)
    # Each recall is reported at the operating point that fixes the *other*
    # one: at its own threshold it would be 0.8 by construction.
    metrics_dict["sens"] = sensitivity(target, prediction_spec)
    metrics_dict["spec"] = specificity(target, prediction_sens)

    return metrics_dict




[docs]
def all_regression_metrics(
    target: np.ndarray,
    prediction: np.ndarray,
    baseline_mae: float = None,
) -> dict[str, float]:
    """Evaluate all regression metrics.

    The returned dictionary holds, in this order,

    - `mae`: mean absolute error between `target` and `prediction`,
    - `mase`: mean absolute scaled error (only if `baseline_mae` is given),
    - `ieee_grades`: the result of :func:`ieee_grades` (itself a dictionary).

    Parameters
    ----------
    target : np.ndarray
        Ground truth values.
    prediction : np.ndarray
        Model output predictions.
    baseline_mae : float, optional
        Mean absolute error of the baseline. When ``None`` (default), the
        `mase` entry is omitted.

    Returns
    -------
    dict[str, float]
        Dictionary with all metrics.
    """
    mae = mean_absolute_error(target, prediction)
    results = {"mae": mae}

    # The scaled error is only defined relative to a baseline.
    if baseline_mae is not None:
        results["mase"] = mean_absolute_scaled_error(baseline_mae, mae)

    results["ieee_grades"] = ieee_grades(target, prediction)
    return results




[docs]
def auc_score_binary(
    target: np.ndarray, prediction: np.ndarray, axis: int = 0
) -> float | np.ndarray:
    """Compute the area under curve (AUC) score for binary classification.

    For arrays with more than one dimension, the AUC is computed
    independently for every 1D slice along `axis`. A slice whose target
    contains only a single class has no defined AUC; a warning is issued
    and NaN is reported for it.

    Parameters
    ----------
    target : np.ndarray
        Binary ground truth values for different samples.
    prediction : np.ndarray
        Binary model output predictions (raw prob.) associated
        with the positive class. Must have the same shape as `target`.
    axis : int, optional
        Axis to compute AUC over, by default 0. Must be non-negative.

    Returns
    -------
    :
        A single AUC value for 1D input, otherwise an array of AUC values
        with the shape of `target` with `axis` removed.

    See Also
    --------
    auc_score_multiclass : AUC score for more than two classes.

    Examples
    --------
    >>> target = np.array([0, 1, 0, 1, 1, 0, 1, 0, 1, 0])
    >>> prediction = np.array([.99, .8, .6, .63, .77, .23, .3, .78, .2, 0.01])
    >>> auc_score_binary(target, prediction)
    0.52

    >>> target = np.random.randint(0, 2, (100, 2))
    >>> auc_score_binary(target, target, axis=0)
    array([1., 1.])

    >>> target = np.random.randint(0, 2, (50, 100, 2, 5))
    >>> auc_score_binary(target, target, axis=1).shape
    (50, 2, 5)
    """
    assert 0 <= axis < target.ndim
    # Compare the full shape tuples: zipping the shapes would silently
    # ignore surplus dimensions when the number of dimensions differs.
    assert target.shape == prediction.shape

    if target.ndim == 1:
        if np.unique(target).size == 1:
            warnings.warn("Only one class present in target.")
            return np.nan
        return sklearn.metrics.roc_auc_score(target, prediction)

    shape = target.shape
    new_shape = tuple(s for j, s in enumerate(shape) if j != axis)
    # Bring `axis` to the front and flatten the rest so that each column
    # is one independent binary problem.
    target_reshape = np.reshape(np.moveaxis(target, axis, 0), (shape[axis], -1))
    prediction_reshape = np.reshape(np.moveaxis(prediction, axis, 0), (shape[axis], -1))

    # Columns without a defined AUC keep the NaN they are initialised with.
    auc = np.full(target_reshape.shape[-1], np.nan)
    for j, (t, p) in enumerate(zip(target_reshape.T, prediction_reshape.T)):
        if np.unique(t).size == 1:
            warnings.warn("Only one class present in target.")
            continue
        auc[j] = sklearn.metrics.roc_auc_score(t, p)

    return np.reshape(auc, new_shape)




[docs]
def auc_score_multiclass(
    target: np.ndarray,
    prediction: np.ndarray,
    comparison_type: str = "ovr",
) -> float | np.ndarray:
    """Compute the area under curve (AUC) score for multi-class classification.

    Parameters
    ----------
    target : np.ndarray
        Multiclass ground truth values.
    prediction : np.ndarray
        Array of model output probabilities of different classes for different samples.
        If `target` shape is ``(n_samples, ...)`` with ``n_classes`` different class
        values, then `prediction` needs to have shape ``(n_samples, n_classes, ...)``.
        Axis 1 (``n_classes``) needs to sum to one.
    comparison_type : str
        Comparison type for multiclasses, by default \"ovr\".

        ``ovr`` : Stands for one-vs-rest. Computes the AUC for each class against
        the rest of the classes.

        ``ovo`` : Stands for one-vs-one. Computes the average AUC of all possible
        pairwise combinations of classes.

    Returns
    -------
    :
        A single AUC value for 1D `target`, otherwise an array of AUC values
        of shape ``target.shape[1:]``.

    See Also
    --------
    auc_score_binary : AUC score for exactly two classes.

    Examples
    --------
    >>> target = np.array([0, 1, 2, 1, 2, 0])
    >>> prediction = np.array([[0.8, 0.1, 0.1],
    >>>                        [0.2, 0.5, 0.3],
    >>>                        [0.8, 0.1, 0.1],
    >>>                        [0.7, 0.2, 0.1],
    >>>                        [0.4, 0.3, 0.3],
    >>>                        [0.5, 0.4, 0.1]])
    >>> auc_score_multiclass(target, prediction, comparison_type="ovo")
    0.6875

    >>> target = np.random.randint(0, 3, (100, 10, 5, 2))
    >>> prediction = np.random.uniform(0, 1, (100, 3, 10, 5, 2))
    >>> prediction /= np.expand_dims(np.sum(prediction, axis=1), 1)
    >>> auc_score_multiclass(target, prediction).shape
    (10, 5, 2)
    """
    assert prediction.ndim == target.ndim + 1
    assert target.shape[0] == prediction.shape[0]
    # Compare the trailing shapes as tuples. Passing a generator to np.all
    # only tests the truthiness of the generator object, which is always True.
    assert target.shape[1:] == prediction.shape[2:]
    assert (  # second axis is prediction probabilities for classes
        np.unique(target).size == prediction.shape[1]
    )
    assert (  # second axis needs to sum to one
        np.all(prediction >= 0.0)
        and np.all(prediction <= 1.0)
        and np.linalg.norm(np.sum(prediction, axis=1) - 1.0) <= 1e-12
    )
    assert comparison_type in ["ovr", "ovo"]

    if target.ndim == 1:
        return sklearn.metrics.roc_auc_score(
            target, prediction, multi_class=comparison_type
        )

    shape = prediction.shape
    # Flatten all trailing dimensions so that every column of `target`
    # (and matching slab of `prediction`) is one independent problem.
    target_reshape = np.reshape(target, (shape[0], -1))
    prediction_reshape = np.reshape(prediction, (shape[0], shape[1], -1))
    auc = np.zeros(target_reshape.shape[-1])
    for j, (t, p) in enumerate(
        zip(target_reshape.T, np.moveaxis(prediction_reshape, -1, 0))
    ):
        auc[j] = sklearn.metrics.roc_auc_score(t, p, multi_class=comparison_type)

    return np.reshape(auc, shape[2:])




[docs]
def balanced_accuracy_score(
    target: np.ndarray,
    prediction: np.ndarray,
) -> float:
    """Compute balanced accuracy score for binary or multi-class classification.

    In the binary case the balanced accuracy :math:`\\operatorname{Acc}_b` is the
    arithmetic mean of sensitivity (Se) and specificity (Sp),

    .. math::
        \\operatorname{Acc}_b
        = \\frac{1}{2}(\\operatorname{Se} + \\operatorname{Sp})
        = \\frac{1}{2}\\Bigl( \\frac{\\operatorname{TP}}{\\operatorname{TP}+\\operatorname{FN}} + \\frac{\\operatorname{TN}}{\\operatorname{TN}+\\operatorname{FP}} \\Bigr),

    written in terms of true positives (TP), true negatives (TN), false
    positives (FP) and false negatives (FN). The general form is

    .. math::
        \\operatorname{Acc}_b(y_{\\mathrm{true}}, y_{\\mathrm{pred}})
        = \\frac{\\sum_{i=1}^N w_i \\, \\delta(y_{\\mathrm{true},i} = y_{\\mathrm{pred}, i})}{\\sum_{i=1}^N w_i}

    with weights

    .. math::
        w_i = \\frac{1}{\\sum_{j=1}^N \\delta(y_{\\mathrm{true},i} = y_{\\mathrm{true},j})},

    where :math:`\\delta(y_i = y_j)` denotes the Kronecker delta function.

    The computation is delegated to
    ``sklearn.metrics.balanced_accuracy_score``.

    Parameters
    ----------
    target : np.ndarray
        Ground truth values.
    prediction : np.ndarray
        Predicted values.

    Returns
    -------
    float
        Balanced accuracy score for binary or multi-class classification.

    Examples
    --------
    Binary classification, i.e., a one dimensional array with only 2 classes

    >>> target = np.array([0, 1, 1, 0, 1, 0, 0, 1, 0, 1])
    >>> prediction = np.array([0, 1, 0, 1, 1, 1, 0, 1, 1, 1])
    >>> balanced_accuracy_score(target, prediction)
    0.6

    Multi-class scenario, i.e., a 1D array with more than two classes

    >>> target = np.array([1, 2, 2, 2, 1, 2, 1, 0, 1, 1])
    >>> prediction = np.array([1, 1, 2, 0, 0, 1, 1, 0, 0, 2])
    >>> balanced_accuracy_score(target, prediction)
    0.55
    """
    score = sklearn.metrics.balanced_accuracy_score(target, prediction)
    return score




[docs]
def brier_score(
    target: np.ndarray,
    prediction: np.ndarray,
) -> float:
    """Compute Brier score for binary classification.

    The computation is delegated to ``sklearn.metrics.brier_score_loss``.

    Parameters
    ----------
    target : np.ndarray
        Ground truth values.
    prediction : np.ndarray
        Predicted values.

    Returns
    -------
    float
        Brier score for binary classification.

    Examples
    --------
    >>> target = np.array([0, 1, 1, 0, 1, 0, 0, 1, 0, 1])
    >>> prediction = np.linspace(0, 1, 10)
    >>> brier_score(target, prediction)
    0.34074074074074073
    """
    score = sklearn.metrics.brier_score_loss(target, prediction)
    return score




[docs]
def f1_score(
    target: np.ndarray,
    prediction: np.ndarray,
    average: str | None = None,
) -> float | np.ndarray:
    """Compute F1-score of binary, multi-class or multi-label classification.

    The :math:`F_1` score is computed using the true positives (TP), false
    positives (FP) and false negatives (FN) via

    .. math::
        F_1 = \\frac{2\\operatorname{TP}}{2\\operatorname{TP} + \\operatorname{FP} + \\operatorname{FN}}.

    Both inputs are squeezed before the kind of task is determined:

    - more than one remaining dimension: multi-label task; `target` and
      `prediction` must each contain exactly two distinct values,
    - one dimension with exactly two distinct values in both arrays: binary
      task, for which ``"binary"`` averaging is always used,
    - anything else: multi-class task.

    Parameters
    ----------
    target : np.ndarray
        Ground truth values.
    prediction : np.ndarray
        Predicted values. Must have the same shape as `target`.
    average : str | None, optional
        Averaging of the F1-scores (default None). Only used in the
        multi-class and multi-label case, where ``average=None`` results in
        F1-scores for each individual class/label.
        For more detail about averaging see the documentation of
        `sklearn.metrics.f1_score <https://scikit-learn.org/stable/modules/generated/sklearn.metrics.f1_score.html>`_.

    Returns
    -------
    float | np.ndarray
        F1 score(s) for binary, multi-class or multi-label classification.

    Examples
    --------
    Binary classification, i.e., a one dimensional array with only 2 classes

    >>> target = np.array([0, 0, 1, 0, 1, 0, 0, 1, 0, 1])
    >>> prediction = np.array([1, 1, 0, 1, 1, 1, 0, 1, 1, 1])
    >>> f1_score(target, prediction)
    0.5

    Multi-class scenario, i.e., a 1D array with more than two classes

    >>> target = np.array([1, 2, 2, 2, 1, 2, 1, 0, 1, 1])
    >>> prediction = np.array([1, 1, 2, 0, 0, 1, 1, 0, 0, 2])
    >>> f1_score(target, prediction)
    array([0.4, 0.44444444, 0.33333333])

    Multi-label scenario, i.e., a 2D array with columns representing
    different labels and values 0 or 1 as entries

    >>> target = np.array([[0, 1, 0], [1, 0, 1], [1, 1, 0], [1, 1, 1]])
    >>> prediction = np.array([[1, 0, 1], [1, 1, 1], [0, 0, 1], [1, 1, 0]])
    >>> f1_score(target, prediction)
    array([0.66666667, 0.4, 0.4])

    Multi-class scenario with averaging of F1-scores over the different
    classes

    >>> target = np.array([1, 2, 2, 2, 1, 2, 1, 0, 1, 1])
    >>> prediction = np.array([1, 1, 2, 0, 0, 1, 1, 0, 0, 2])
    >>> f1_score(target, prediction, average='micro')
    0.4
    """
    assert target.shape == prediction.shape
    target = np.squeeze(target)
    prediction = np.squeeze(prediction)

    # True when both arrays hold exactly two distinct values.
    two_valued = np.unique(target).size == np.unique(prediction).size == 2

    if np.ndim(target) > 1:
        # multi-label case: indicator matrices with two distinct entries
        assert two_valued
        chosen_average = average
    else:
        # binary case forces "binary"; multi-class honours `average`
        chosen_average = "binary" if two_valued else average
    return sklearn.metrics.f1_score(target, prediction, average=chosen_average)




[docs]
def false_discovery_rate(
    target: np.ndarray,
    prediction: np.ndarray,
    average: str | None = None,
) -> float | np.ndarray:
    """Compute false discovery rate (FDR).

    The false discovery rate (FDR) is the complement of the precision,
    :math:`\\operatorname{FDR} = 1 - \\operatorname{PPV}`, where
    :math:`PPV` is the precision (positive predictive value) as returned by
    :func:`precision_score`.

    Parameters
    ----------
    target : np.ndarray
        Ground truth values.
    prediction : np.ndarray
        Predicted values.
    average : str | None, optional
        Averaging type (default None), forwarded to :func:`precision_score`.
        For more detail about averaging see the documentation of
        `sklearn.metrics.precision_score <https://scikit-learn.org/stable/modules/generated/sklearn.metrics.precision_score.html>`_.

    Returns
    -------
    float | np.ndarray
        False discovery rate for binary, multi-class or multi-label classification.

    See Also
    --------
    precision_score, f1_score
    """
    ppv = precision_score(target, prediction, average)
    if not isinstance(ppv, np.ndarray):
        # scalar precision
        return 1 - ppv
    # one precision value per class/label
    return np.ones(ppv.size) - ppv




[docs]
def general_threshold(
    target: np.ndarray,
    prediction: np.ndarray,
    metric: typing.Callable[[np.ndarray, np.ndarray], float],
    metric_value: float,
    greater_than: bool = True,
) -> float:
    """
    Find the threshold that sets the given metric closest to `metric_value`.

    Every distinct value of `prediction` is tried as a threshold. The
    prediction is binarised with ``prediction > threshold`` and the metric
    is evaluated on the result. Among all thresholds whose metric lies on
    the requested side of `metric_value` (strictly greater or strictly
    lower), the one whose metric is closest to `metric_value` is returned;
    ties are resolved in favour of the lowest threshold. No assumption on
    the monotonicity of the metric is made.

    Parameters
    ----------
    target : np.ndarray
        Ground truth values.
    prediction : np.ndarray
        Model output predictions.
    metric : Callable[[np.ndarray, np.ndarray], float]
        A metric function that takes target and (binary) prediction arrays
        as input.
    metric_value : float
        The value of the metric to be achieved.
    greater_than : bool
        True if the metric is supposed to be higher than `metric_value`,
        false otherwise.

    Returns
    -------
    float
        The threshold that achieves the metric, or ``-1.0`` (together with
        a warning) if no candidate threshold satisfies the requirement.
    """
    best_threshold = None
    best_distance = None

    # np.unique returns the candidates sorted in ascending order.
    for threshold in np.unique(prediction):
        res = metric(target, prediction > threshold)
        satisfied = res > metric_value if greater_than else res < metric_value
        if not satisfied:
            continue
        distance = abs(res - metric_value)
        # Strict comparison keeps the lowest threshold on ties.
        if best_distance is None or distance < best_distance:
            best_threshold, best_distance = threshold, distance

    if best_threshold is None:
        warnings.warn("Could not find threshold achieving target metric_value.")
        return -1.0
    return best_threshold




[docs]
def ieee_grades_torch(target, prediction):
    """Compute the fraction of samples per IEEE grade using torch operations.

    The absolute difference between `prediction` and `target` is sorted into
    four half-open bins:

     - Grade A for error < 5
     - Grade B for 5 <= error < 6
     - Grade C for 6 <= error < 7
     - Grade D for error >= 7

    Parameters
    ----------
    target
        One dimensional tensor of ground truth values.
    prediction
        Tensor of model output predictions.

    Returns
    -------
    dict
        Dictionary with keys ``"A"``, ``"B"``, ``"C"`` and ``"D"``; each value
        is the count of samples in that grade divided by the length of the
        first dimension of the difference.
    """
    assert target.ndim == 1
    abs_error = torch.abs(prediction - target)
    n_samples = abs_error.shape[0]

    def fraction(mask):
        # share of samples selected by the boolean mask
        return torch.sum(mask) / n_samples

    return {
        "A": fraction(abs_error < 5.0),
        "B": fraction(torch.logical_and(5.0 <= abs_error, abs_error < 6.0)),
        "C": fraction(torch.logical_and(6.0 <= abs_error, abs_error < 7.0)),
        "D": fraction(7.0 <= abs_error),
    }




[docs]
def ieee_grades(target: np.ndarray, prediction: np.ndarray) -> dict[str, float]:
    """Compute the IEEE grades of the predicted values.

    The grades are calculated by comparing the difference between the target
    and the prediction. Returned are the fractions of samples that fall within
    each grade. The grading scores follow the IEEE Std 1708a™-2019 scheme, where
    instead of a mean absolute difference of two measurements with the standard
    device, we use only one measurement.

    The grading for each sample is determined as follows
     - Grade A for error <= 5 mmHg
     - Grade B for 5 mmHg < error <= 6 mmHg
     - Grade C for 6 mmHg < error <= 7 mmHg
     - Grade D for error > 7 mmHg (also assigned to NaN errors, which fail
       every comparison)

    .. note::
        :func:`ieee_grades_torch` places the bin boundaries differently
        (an error of exactly 5 is grade B there).

    Parameters
    ----------
    target : np.ndarray
        Ground truth values.
    prediction : np.ndarray
        Model output predictions.

    Returns
    -------
    dict[str, float]
        Dictionary with keys ``"A"``, ``"B"``, ``"C"`` and ``"D"`` holding the
        fraction of all samples in the respective grade. The fractions sum to
        one for input of any dimension. For empty input a warning is issued
        and every fraction is NaN.
    """
    difference = np.abs(np.asarray(prediction) - np.asarray(target))

    # Normalise by the total number of elements: len() would only count the
    # first dimension and overstate the fractions for N-D input.
    n_samples = difference.size
    if n_samples == 0:
        warnings.warn("IEEE grades are ill-defined for empty input.")
        return {grade: np.nan for grade in ["A", "B", "C", "D"]}

    # Cumulative counts; the per-grade counts are their successive differences.
    within_5 = np.count_nonzero(difference <= 5)
    within_6 = np.count_nonzero(difference <= 6)
    within_7 = np.count_nonzero(difference <= 7)

    grade_counts = {
        "A": within_5,
        "B": within_6 - within_5,
        "C": within_7 - within_6,
        "D": n_samples - within_7,
    }
    return {grade: count / n_samples for grade, count in grade_counts.items()}




[docs]
def l1_norm(array: np.ndarray, axis: int = 0) -> float | np.ndarray:
    """Compute the :math:`L^1`-norm of an array along an axis.

    The :math:`L^1`-norm of an array :math:`x\\in\\mathbb{R}^N` is given by
    :math:`\\Vert x \\Vert_{L^1} = \\frac{1}{N}\\sum_{j=1}^N \\vert x_j \\vert`,
    i.e. it is normalised by the number of entries along `axis`.

    Parameters
    ----------
    array : np.ndarray
        Data array with at least one dimension.
    axis : int, optional
        Axis, by default 0.

    Returns
    -------
    :
        Array of :math:`L^1`-norms over the specified axis. If the result
        has a single entry, that entry is returned as a scalar.

    See Also
    --------
    mean_absolute_error : Wrapper for ``l1_norm(target - prediction)``.
    l2_norm, root_mean_square_error

    Examples
    --------
    >>> l1_norm(np.array([1, 2, 3, 4]))
    2.5

    >>> array = np.random.normal(0, 1, (10, 5, 3, 2))
    >>> l1_norm(array, axis=1).shape  # norm over second axis
    (10, 3, 2)
    """
    assert isinstance(array, np.ndarray) and array.ndim > 0
    n_entries = array.shape[axis]
    # Scale every entry before summing, then reduce along the axis.
    result = np.sum(np.abs(array) / n_entries, axis=axis)
    # Unwrap single-entry results into a scalar.
    return result.flatten()[0] if result.size == 1 else result




[docs]
def l2_norm(array: np.ndarray, axis: int = 0) -> float | np.ndarray:
    """Compute the :math:`L^2`-norm along an axis.

    The :math:`L^2`-norm of an array :math:`x\\in\\mathbb{R}^N` is given by
    :math:`\\Vert x \\Vert_{L^2} = \\sqrt{\\frac{1}{N}\\sum_{j=1}^N x_j^2 }`,
    i.e. it is normalised by the number of entries along `axis`.

    Parameters
    ----------
    array : np.ndarray
        Data array with at least one dimension.
    axis : int, optional
        Axis, by default 0.

    Returns
    -------
    :
        Array of :math:`L^2`-norms over the specified axis. If the result
        has a single entry, that entry is returned as a scalar.

    See Also
    --------
    root_mean_square_error : Wrapper for ``l2_norm(target - prediction)``.
    l1_norm, mean_absolute_error

    Examples
    --------
    >>> l2_norm(np.array([1, 2, 3, 4]))**2  # up to floating point rounding
    7.5

    >>> array = np.random.normal(0, 1, (10, 5, 3, 2))
    >>> l2_norm(array, axis=1).shape  # norm over second axis
    (10, 3, 2)
    """
    assert isinstance(array, np.ndarray) and array.ndim > 0
    # Mean of the squared entries along the axis, then its square root.
    mean_square = np.sum(array**2, axis=axis) / array.shape[axis]
    result = np.sqrt(mean_square)
    # Unwrap single-entry results into a scalar.
    return result.flatten()[0] if result.size == 1 else result




[docs]
def matthews_correlation_coefficient(
    target: np.ndarray, prediction: np.ndarray
) -> float:
    """Compute Matthews correlation coefficient (Mcc) of binary or multi-class task.

    For the binary case the Mcc is expressed through the true positives (TP),
    false positives (FP), true negatives (TN) and false negatives (FN) as

    .. math::
        \\operatorname{Mcc}
        = \\frac{\\operatorname{TP}\\cdot\\operatorname{TN} - \\operatorname{FP}\\cdot\\operatorname{FN}}{\\sqrt{(\\operatorname{TP}+\\operatorname{FP})(\\operatorname{TP}+\\operatorname{FN})(\\operatorname{TN}+\\operatorname{FP})(\\operatorname{TN}+\\operatorname{FN})}}.

    For the multi-class case, let :math:`C` be the confusion matrix for :math:`K`
    classes and define
    the number of times class :math:`k` truly occurs :math:`t_k = \\sum_{i=1}^K C_{ik}`,
    the number of times class :math:`k` was predicted :math:`p_k = \\sum_{i=1}^K C_{ki}`,
    the total number of samples correctly predicted :math:`c = \\sum_{k=1}^K C_{kk}` and
    the total number of samples :math:`s = \\sum_{i,j=1}^K C_{ij}`.
    Then the multiclass Mcc is defined as

    .. math::
        \\operatorname{Mcc}
        = \\frac{c \\cdot s -\\sum_{k=1}^K p_k \\cdot t_k}{\\sqrt{ (s^2 - \\sum_{k=1}^K p_k^2)(s^2 - \\sum_{k=1}^K t_k^2)}}.

    .. note::
        When there are more than two labels, the value of the MCC will no longer range
        between -1 and +1. Instead the minimum value will be somewhere between -1 and 0
        depending on the number and distribution of ground true labels. The maximum
        value is always +1.

    The computation is delegated to ``sklearn.metrics.matthews_corrcoef``.

    Parameters
    ----------
    target : np.ndarray
        Ground truth values.
    prediction : np.ndarray
        Predicted values.

    Returns
    -------
    float
        Mcc for binary and multi-class classification.

    Examples
    --------
    Binary classification, i.e., a one dimensional array with only 2 classes

    >>> target = np.array([0, 0, 1, 0, 1, 0, 0, 1, 0, 1])
    >>> prediction = np.array([1, 1, 0, 1, 1, 1, 0, 1, 1, 1])
    >>> matthews_correlation_coefficient(target, prediction)
    -0.10206207261596577

    Multi-class scenario, i.e., a 1D array with more than two classes

    >>> target = np.array([1, 2, 2, 2, 1, 2, 1, 0, 1, 1])
    >>> prediction = np.array([1, 1, 2, 0, 0, 1, 1, 0, 0, 2])
    >>> matthews_correlation_coefficient(target, prediction)
    0.13130643285972254
    """
    coefficient = sklearn.metrics.matthews_corrcoef(target, prediction)
    return coefficient




[docs]
def mean_absolute_error(
    target: np.ndarray, prediction: np.ndarray, axis: int = 0
) -> float | np.ndarray:
    """Compute the mean absolute error (MAE) between target and prediction.

    Parameters
    ----------
    target : np.ndarray
        Ground truth values.
    prediction : np.ndarray
        Model output predictions.
    axis : int, optional
        Axis to average over, by default 0.

    Returns
    -------
    :
        Array of MAE values (:math:`L^1`-norms of the residual) over the
        specified axis.

    See Also
    --------
    l1_norm : This is a wrapper for ``l1_norm(target - prediction, axis=axis)``.
    l2_norm, root_mean_square_error

    Examples
    --------
    >>> mean_absolute_error(np.array([1, 2, 3]), np.array([1, 2, 3]))
    0.0

    >>> target = np.random.normal(0, 1, (10, 5, 3, 2))
    >>> prediction = np.random.normal(0, 1, (10, 5, 3, 2))
    >>> mean_absolute_error(target, prediction, axis=1).shape  # norm over second axis
    (10, 3, 2)
    """
    residual = target - prediction
    return l1_norm(residual, axis=axis)




[docs]
def mean_absolute_scaled_error(baseline_mae: float, model_mae: float) -> float:
    """Compute mean absolute scaled error (MASE).

    The MASE measures the magnitude of the model error relative to a
    baseline error: it is the model's mean absolute error divided by the
    baseline's mean absolute error.

    Parameters
    ----------
    baseline_mae : float
        Mean absolute error of the baseline.
    model_mae : float
        Mean absolute error of the model.

    Returns
    -------
    float
        Mean absolute scaled error, ``model_mae / baseline_mae``.
    """
    scaled_error = model_mae / baseline_mae
    return scaled_error




[docs]
def negative_predictive_value(
    target: np.ndarray,
    prediction: np.ndarray,
) -> float | np.ndarray:
    """Compute negative predictive value (NPV) of a binary classification.

    The negative predictive value is computed using the
    true negatives (TN) and false negatives (FN) via

    .. math::
        \\operatorname{NPV}
        = \\frac{\\operatorname{TN}}{\\operatorname{TN} + \\operatorname{FN}}.

    It is evaluated as the binary precision of the class with label ``0``.

    Parameters
    ----------
    target : np.ndarray
        Ground truth values.
    prediction : np.ndarray
        Predicted values. Must have the same shape as `target`.


    Returns
    -------
    float | np.ndarray
        Negative predictive value.

    See Also
    --------
    f1_score, false_discovery_rate, precision_score

    Examples
    --------
    Binary classification, i.e., a one dimensional array with only 2 classes

    >>> target = np.array([0, 0, 1, 0, 1, 0, 0, 1, 0, 1])
    >>> prediction = np.array([1, 1, 0, 1, 1, 1, 0, 1, 1, 1])
    >>> negative_predictive_value(target, prediction)
    0.5
    """
    assert target.shape == prediction.shape
    squeezed_target = np.squeeze(target)
    squeezed_prediction = np.squeeze(prediction)

    # Precision of the negative class is the negative predictive value.
    npv = sklearn.metrics.precision_score(
        squeezed_target, squeezed_prediction, average="binary", pos_label=0
    )
    return npv




[docs]
def precision_score(
    target: np.ndarray,
    prediction: np.ndarray,
    average: str | None = None,
) -> float | np.ndarray:
    """Compute precision (PPV) of binary, multi-class or multi-label classification.

    The precision score (positive predictive value, PPV) is computed using the
    true positives (TP) and false positives (FP) via

    .. math::
        \\operatorname{PPV}
        = \\frac{\\operatorname{TP}}{\\operatorname{TP} + \\operatorname{FP}}.

    Both inputs are squeezed before the kind of task is determined:

    - more than one remaining dimension: multi-label task; `target` and
      `prediction` must each contain exactly two distinct values,
    - one dimension with exactly two distinct values in both arrays: binary
      task, for which ``"binary"`` averaging is always used,
    - anything else: multi-class task.

    Parameters
    ----------
    target : np.ndarray
        Ground truth values.
    prediction : np.ndarray
        Predicted values. Must have the same shape as `target`.
    average : str | None, optional
        Averaging type for score (default None). Only used in the
        multi-class and multi-label case.
        For more detail about averaging see the documentation of
        `sklearn.metrics.precision_score <https://scikit-learn.org/stable/modules/generated/sklearn.metrics.precision_score.html>`_.

    Returns
    -------
    float | np.ndarray
        Precision scores for binary, multi-class or multi-label classification.

    See Also
    --------
    f1_score, false_discovery_rate, negative_predictive_value

    Examples
    --------
    Binary classification, i.e., a one dimensional array with only 2 classes

    >>> target = np.array([0, 0, 1, 0, 1, 0, 0, 1, 0, 1])
    >>> prediction = np.array([1, 1, 0, 1, 1, 1, 0, 1, 1, 1])
    >>> precision_score(target, prediction)
    0.375

    Multi-class scenario, i.e., a 1D array with more than two classes

    >>> target = np.array([1, 2, 2, 2, 1, 2, 1, 0, 1, 1])
    >>> prediction = np.array([1, 1, 2, 0, 0, 1, 1, 0, 0, 2])
    >>> precision_score(target, prediction)
    array([0.25, 0.5, 0.5])

    Multi-label scenario, i.e., a 2D array with columns representing
    different labels and values 0 or 1 as entries

    >>> target = np.array([[0, 1, 0], [1, 0, 1], [1, 1, 0], [1, 1, 1]])
    >>> prediction = np.array([[1, 0, 1], [1, 1, 1], [0, 0, 1], [1, 1, 0]])
    >>> precision_score(target, prediction)
    array([0.33333333, 0.5, 0.66666667])

    Multi-class scenario with averaging of the precision scores over the
    different classes

    >>> target = np.array([1, 2, 2, 2, 1, 2, 1, 0, 1, 1])
    >>> prediction = np.array([1, 1, 2, 0, 0, 1, 1, 0, 0, 2])
    >>> precision_score(target, prediction, average='micro')
    0.4
    """
    assert target.shape == prediction.shape
    target = np.squeeze(target)
    prediction = np.squeeze(prediction)

    # True when both arrays hold exactly two distinct values.
    two_valued = np.unique(target).size == np.unique(prediction).size == 2

    if np.ndim(target) > 1:
        # multi-label case: indicator matrices with two distinct entries
        assert two_valued
        chosen_average = average
    else:
        # binary case forces "binary"; multi-class honours `average`
        chosen_average = "binary" if two_valued else average
    return sklearn.metrics.precision_score(target, prediction, average=chosen_average)




[docs]
def recall_score_threshold(
    target: np.ndarray,
    prediction: np.ndarray,
    recall_value: float,
    pos_label: 1 | 0 = 1,
    greater_than: bool = True,
    dtype: np.dtype = np.float32,
) -> float:
    """
    Compute the classification threshold so that the recall score is
    closest to the specified value, but greater (or lower).

    Default: The threshold is computed for the sensitivity score.

    The predictions of the samples labelled `pos_label` are ranked
    (descending for ``pos_label=1``, ascending for ``pos_label=0``) and
    the prediction at position ``ceil(recall_value * n) - 1`` is selected,
    with ``n`` the number of such samples. The threshold is the floating
    point number (in `dtype`) right next to that prediction, on the side
    that makes ``prediction > threshold`` classify it as required.

    Parameters
    ----------
    target : np.ndarray
        Ground truth values.
    prediction : np.ndarray
        Model output predictions, all within [0, 1].
    recall_value : float
        The desired recall score.
    pos_label : 1 | 0, optional
        1 to compute the threshold for sensitivity, 0 for
        specificity.
    greater_than : bool, optional
        True to let the recall score be higher than recall_value, false
        to let the recall score be lower than recall_value.
    dtype : np.dtype, optional
        The data type of the threshold, by default np.float32

    Returns
    -------
    float
        The classification threshold. If no sample carries `pos_label`,
        a warning is issued and 0 is returned.

    Raises
    ------
    ValueError
        If any value of `prediction` lies outside [0, 1].
    """
    out_of_range = np.any(prediction < 0.0) or np.any(prediction > 1.0)
    if out_of_range:
        raise ValueError("All values in prediction must be between 0 and 1.")

    # Rank the predictions of the class of interest: highest first for the
    # positive class, lowest first for the negative class.
    ranked = np.sort(prediction[target == pos_label])
    if pos_label:
        ranked = ranked[::-1]

    if ranked.size == 0:
        warnings.warn(
            "Recall is ill-defined and the threshold is set to 0 due to no true samples."
        )
        return 0

    # Number of samples that must be recalled, turned into a zero-based index.
    n_required = recall_value * len(ranked)
    cutoff_index = max(0, np.ceil(n_required - 1).astype(int))

    # Step one float away from the selected prediction to avoid >= or >
    # issues while computing binary predictions. The step direction (towards
    # 0 or towards 1) is the xor of pos_label and greater_than, written as
    # np.mod(x + y, 2).
    pivot = dtype(ranked[cutoff_index])
    direction = np.mod(pos_label + greater_than, 2, dtype=dtype)
    return np.nextafter(pivot, direction)




[docs]
def root_mean_square_error(
    target: np.ndarray, prediction: np.ndarray, axis: int = 0
) -> float | np.ndarray:
    """Compute the root mean square error (RMSE) between target and prediction.

    Parameters
    ----------
    target : np.ndarray
        Ground truth values.
    prediction : np.ndarray
        Model output predictions.
    axis : int, optional
        Axis to average over, by default 0.

    Returns
    -------
    :
        Array of RMSE values (:math:`L^2`-norms of the residual) over the
        specified axis.

    See Also
    --------
    l2_norm : This is a wrapper for ``l2_norm(target - prediction, axis=axis)``.
    l1_norm, mean_absolute_error

    Examples
    --------
    >>> root_mean_square_error(np.array([1, 2, 3]), np.array([1, 2, 3]))
    0.0

    >>> target = np.random.normal(0, 1, (10, 5, 3, 2))
    >>> prediction = np.random.normal(0, 1, (10, 5, 3, 2))
    >>> root_mean_square_error(target, prediction, axis=1).shape  # norm over second axis
    (10, 3, 2)
    """
    residual = target - prediction
    return l2_norm(residual, axis=axis)




[docs]
def specificity(target: np.ndarray, prediction: np.ndarray) -> float:
    """
    Compute the specificity (or true negative rate).

    Specificity is evaluated as the recall score of the negative class,
    i.e. of the class with label ``0``.

    Parameters
    ----------
    target : np.ndarray
        Ground truth values.
    prediction : np.ndarray
        Model output predictions.

    Returns
    -------
    float
        Specificity score.

    See Also
    --------
    sensitivity
    """
    negative_recall = sklearn.metrics.recall_score(target, prediction, pos_label=0)
    return negative_recall




[docs]
def sensitivity(target: np.ndarray, prediction: np.ndarray) -> float:
    """
    Compute the sensitivity (or true positive rate).

    Sensitivity is evaluated as the recall score of the positive class,
    i.e. of the class with label ``1``.

    Parameters
    ----------
    target : np.ndarray
        Ground truth values.
    prediction : np.ndarray
        Model output predictions.

    Returns
    -------
    float
        Sensitivity score.

    See Also
    --------
    specificity
    """
    positive_recall = sklearn.metrics.recall_score(target, prediction, pos_label=1)
    return positive_recall



# Script entry point: running this module directly performs no action.
if __name__ == "__main__":
    pass