# Source code for cinnabar.classification_metrics

# This code is part of cinnabar and is licensed under the MIT license.
# For details, see https://github.com/OpenFreeEnergy/cinnabar

import math
from typing import Iterable

import numpy as np
from numpy.typing import NDArray


def _create_2d_histogram(y_true: Iterable[float], y_pred: Iterable[float]) -> tuple[NDArray, NDArray, NDArray]:
    """
    Build a 2D histogram of true versus predicted values.

    Along each axis the bin edges are the minimum value, the midpoints between
    consecutive sorted values, and the maximum value, so ``n`` input values
    produce ``n`` bins per axis.

    Parameters
    ----------
    y_true : array-like
        The true values.
    y_pred : array-like
        The predicted values.

    Returns
    -------
    histogram : ndarray
        The 2D histogram of the input data (rows follow ``y_true`` bins,
        columns follow ``y_pred`` bins).
    bins_true : ndarray
        The bin edges along the y_true axis.
    bins_pred : ndarray
        The bin edges along the y_pred axis.

    Raises
    ------
    ValueError
        If ``y_true`` and ``y_pred`` have different shapes.
    """

    true_values = np.asarray(y_true)
    pred_values = np.asarray(y_pred)

    if true_values.shape != pred_values.shape:
        raise ValueError("y_true and y_pred must have the same length.")

    def _midpoint_edges(values: NDArray) -> NDArray:
        # Edges: [min, midpoints between neighbouring sorted values, max].
        ordered = np.sort(values)
        midpoints = (ordered[:-1] + ordered[1:]) / 2
        return np.concatenate(([values.min()], midpoints, [values.max()]))

    edges = [_midpoint_edges(true_values), _midpoint_edges(pred_values)]

    # Note a perfect prediction will have all counts in the diagonal bins
    counts, edges_true, edges_pred = np.histogram2d(true_values, pred_values, bins=edges)

    return counts, edges_true, edges_pred


def _compute_overlap_coefficient(histogram: NDArray, ranking: int) -> float:
    """
    Compute the overlap coefficient for the first ``ranking`` bins of a 2D histogram.

    The counts in the leading ``ranking`` x ``ranking`` corner of the histogram
    are summed and divided by ``ranking``.

    Parameters
    ----------
    histogram : ndarray
        A 2D histogram array where the counts are stored.
    ranking : int
        The number of leading bins (rankings) along each axis to include.

    Returns
    -------
    float
        The overlap coefficient.

    Raises
    ------
    ValueError
        If ``ranking`` is smaller than 1, or if ``ranking`` exceeds the number
        of rows in ``histogram``.
    """
    if ranking < 1:
        raise ValueError("Ranking must be greater than 0.")

    row_count = histogram.shape[0]
    if ranking > row_count:
        raise ValueError("Ranking must be less than the number of ligands.")

    # Only the top-left corner contributes to the coefficient.
    leading_corner = histogram[:ranking, :ranking]
    shared_count = np.sum(leading_corner)

    return shared_count / ranking



# Public API
def compute_fraction_best_ligands(y_true: Iterable[float], y_pred: Iterable[float], fraction: float = 0.5) -> float:
    """
    Compute the fraction of the best ligands metric introduced by Chris Bayly.

    An overlap coefficient is computed for every ranking from 1 up to
    ``floor(num_ligands * fraction)`` and the coefficients are then averaged.

    Parameters
    ----------
    y_true : array-like
        The true values.
    y_pred : array-like
        The predicted values.
    fraction : float, default 0.5
        The fraction of ligands to consider as the best.

    Returns
    -------
    float
        The computed fraction of the best ligands.

    Raises
    ------
    ValueError
        If ``fraction`` is not between 0 and 1, or if
        ``floor(num_ligands * fraction)`` is 0 (for example ``fraction=0`` or
        too few ligands), in which case there is nothing to average over.
    """

    if not (0 <= fraction <= 1):
        raise ValueError("Fraction must be between 0 and 1.")

    histogram = _create_2d_histogram(y_true, y_pred)[0]

    # The histogram has one bin per input value along each axis.
    num_ligands = histogram.shape[0]
    num_best_ligands = math.floor(num_ligands * fraction)

    # Without this guard the average below divides by zero.
    if num_best_ligands < 1:
        raise ValueError(
            f"Fraction {fraction} of {num_ligands} ligands selects no ligands; "
            "use a larger fraction or provide more ligands."
        )

    overlap_coefficients = [
        _compute_overlap_coefficient(histogram, ranking) for ranking in range(1, num_best_ligands + 1)
    ]

    return sum(overlap_coefficients) / num_best_ligands