Source code for cinnabar.stats

from typing import Literal, get_args

import networkx as nx
import numpy as np
import scipy
import sklearn.metrics

from cinnabar._due import Doi, due

due.cite(
    Doi("10.1021/acs.jcim.9b00528"),
    description="Compute maximum likelihood estimate of free energies and covariance in their estimates",
    path="cinnabar.stats.mle",
    cite_module=True,
)


[docs] def calculate_rmse( y_true: np.ndarray, y_pred: np.ndarray, ) -> float: r"""Compute root mean squared error between true and predicted values. Note ---- The RMSE is calculated as: .. math:: RMSE = \sqrt{\frac{1}{N} \sum_{i=1}^N (y_i - \hat{y}_i)^2} where :math:`y_i` is the predicted value and :math:`\hat{y}_i` is the true value. Parameters ---------- y_true : ndarray with shape (N,) True values y_pred : ndarray with shape (N,) Predicted values Returns ------- rmse : float RMSE between true and predicted values """ return np.sqrt(sklearn.metrics.mean_squared_error(y_true, y_pred))
[docs] def calculate_mue( y_true: np.ndarray, y_pred: np.ndarray, ) -> float: r"""Compute mean unsigned error between true and predicted values. Note ---- The MUE is calculated as: .. math:: MUE = \frac{1}{N} \sum_{i=1}^N |y_i - \hat{y}_i| where :math:`y_i` is the predicted value and :math:`\hat{y}_i` is the true value. Parameters ---------- y_true : ndarray with shape (N,) True values y_pred : ndarray with shape (N,) Predicted values Returns ------- mue : float MUE between true and predicted values """ return sklearn.metrics.mean_absolute_error(y_true, y_pred)
[docs] def calculate_rae( y_true: np.ndarray, y_pred: np.ndarray, ) -> float: r"""Compute relative absolute error between true and predicted values. Note ---- The RAE compares the mean absolute error of the predictions with a baseline model that always predicts the mean of the true values. It is calculated as: .. math:: RAE = \frac{\frac{1}{N} \sum_{i=1}^N |y_i - \hat{y}_i|}{\frac{1}{N} \sum_{i=1}^N |\bar{y} - \hat{y}_i|} where :math:`y_i` is the predicted value, :math:`\hat{y}_i` is the true value, and :math:`\bar{y}` is the mean of the true values. Parameters ---------- y_true : ndarray with shape (N,) True values y_pred : ndarray with shape (N,) Predicted values Returns ------- rae : float RAE between true and predicted values """ # mean unsigned error of the predictions mue = calculate_mue(y_true, y_pred) true_mean = np.mean(y_true) # mean absolute deviation of the true values from their mean mad = np.mean([np.abs(true_mean - i) for i in y_true]) return mue / mad
[docs] def calculate_r2( y_true: np.ndarray, y_pred: np.ndarray, ) -> float: """Compute R^2 between true and predicted values. Note ---- R^2 is calculated as the square of the Pearson correlation coefficient between true and predicted values. Parameters ---------- y_true : ndarray with shape (N,) True values y_pred : ndarray with shape (N,) Predicted values Returns ------- r2 : float R^2 between true and predicted values """ r_value = calculate_pearson_r(y_true, y_pred) return r_value**2
[docs] def calculate_pearson_r( y_true: np.ndarray, y_pred: np.ndarray, ) -> float: """Compute Pearson's r between true and predicted values. Parameters ---------- y_true : ndarray with shape (N,) True values y_pred : ndarray with shape (N,) Predicted values Returns ------- r : float Pearson's r between true and predicted values """ return scipy.stats.pearsonr(y_true, y_pred)[0]
[docs] def calculate_kendalls_tau( y_true: np.ndarray, y_pred: np.ndarray, ) -> float: """Compute Kendall's tau between true and predicted values. Parameters ---------- y_true : ndarray with shape (N,) True values y_pred : ndarray with shape (N,) Predicted values Returns ------- tau : float Kendall's tau between true and predicted values """ return scipy.stats.kendalltau(y_true, y_pred)[0]
[docs] def calculate_nrmse(y_true: np.ndarray, y_pred: np.ndarray) -> float: r""" Compute the normalized root mean squared error between true and predicted values, using the true mean to normalize the RMSE. [1]_ Note ---- The NRMSE is calculated as: .. math:: NRMSE = \frac{RMSE}{\bar{y}} where :math:`RMSE` is the root mean squared error between true and predicted values, and :math:`\bar{y}` is the mean of the true values. Parameters ---------- y_true : ndarray with shape (N,) True values y_pred : ndarray with shape (N,) Predicted values Returns ------- nrmse : float NRMSE between true and predicted values References ---------- .. [1] https://en.wikipedia.org/wiki/Root_mean_square_deviation """ rmse = calculate_rmse(y_true, y_pred) mean_true = np.mean(y_true) return rmse / np.abs(mean_true)
[docs] def calculate_predictive_index(y_true: np.ndarray, y_pred: np.ndarray) -> float: r"""Compute the predictive index as introduced by Pearlman et al. between true and predicted values. [1]_ Parameters ---------- y_true : ndarray with shape (N,) True values y_pred : ndarray with shape (N,) Predicted values Note ---- The predictive index measures the correlation between the true and predicted values with a higher weight given to ligand pairs with larger true differences. The final value is between -1 and 1, where 1 indicates perfect ranking and -1 indicates perfectly anti-correlated ranking. It is calculated as: .. math:: PI = \frac{\sum^n_{j>i}\sum^n_{i}{W_{ij}C_{ij}}}{\sum^n_{j>i}\sum^n_{i}{W_{ij}}} where :math:`W_{ij} = abs(E_{j} - E_{i})` is a weight based on the true difference between the ligand pairs and :math:`C_{ij}` indicates if the rank ordering of the true differences agree with the predicted differences: .. math:: C_{ij} = \begin{cases} 1 & \text{if } (E_{j} - E_{i})/(P_{j} - P_{i}) > 0 \\ 0 & \text{if } (P_{j} - P_{i}) = 0 \\ -1 & \text{if } (E_{j} - E_{i})/(P_{j} - P_{i}) < 0 \end{cases} Returns ------- pi : float Predictive index between true and predicted values between -1 and 1. References ---------- .. [1] Pearlman, D.A. and Charifson, P.S., 2001. Are free energy calculations useful in practice? A comparison with rapid scoring functions for the p38 MAP kinase protein system. Journal of Medicinal Chemistry, 44(21), pp.3417-3423. """ numerator, denominator = 0.0, 0.0 n = len(y_true) for i in range(n): for j in range(i + 1, n): w_ij = np.abs(y_true[j] - y_true[i]) # avoid division by zero when the predicted values are the same if y_pred[j] == y_pred[i]: c_ij = 0.0 else: c_ij = np.sign((y_true[j] - y_true[i]) / (y_pred[j] - y_pred[i])) numerator += w_ij * c_ij denominator += w_ij return numerator / denominator
# map from statistic name to function that calculates the statistic _AVAILABLE_STATS = { "RMSE": calculate_rmse, "NRMSE": calculate_nrmse, "MUE": calculate_mue, "RAE": calculate_rae, "R2": calculate_r2, "rho": calculate_pearson_r, "KTAU": calculate_kendalls_tau, "PI": calculate_predictive_index, } # make a type hint for the statistic names Statistics = Literal["RMSE", "NRMSE", "MUE", "RAE", "R2", "rho", "KTAU", "PI"] # make sure the type hint and the list stay in sync assert set(get_args(Statistics)) == set(_AVAILABLE_STATS.keys())
[docs] def bootstrap_statistic( y_true: np.ndarray, y_pred: np.ndarray, dy_true: np.ndarray | None = None, dy_pred: np.ndarray | None = None, ci: float = 0.95, statistic: Statistics = "RMSE", nbootstrap: int = 1000, include_true_uncertainty: bool = False, include_pred_uncertainty: bool = False, ) -> dict: """Compute mean and confidence intervals of specified statistic. Parameters ---------- y_true : ndarray with shape (N,) True values y_pred : ndarray with shape (N,) Predicted values dy_true : ndarray with shape (N,) | None, default None Errors of true values. If None, the values are assumed to have no errors. dy_pred : ndarray with shape (N,) | None, default None Errors of predicted values. If None, the values are assumed to have no errors ci : float, default 0.95 Interval for confidence interval (CI). statistic : {"RMSE", "NRMSE", "MUE", "RAE", "R2", "rho", "KTAU", "PI"}, default "RMSE" Statistic to be calculated. nbootstrap : int, default 1000 Number of bootstrap samples used to estimate the confidence interval. include_true_uncertainty : bool, default False Whether to account for the uncertainty in ``y_true`` when bootstrapping. include_pred_uncertainty : bool, default False Whether to account for the uncertainty in ``y_pred`` when bootstrapping. Note ----- If ``include_true_uncertainty`` or ``include_pred_uncertainty`` is True, normal noise will be added to the corresponding values during each bootstrap replicate. The standard deviation of the normal noise is taken from ``dy_true`` or ``dy_pred``. Returns ------- stats : dict of float 'mle': statistic computed on the original data 'mean' : mean value of the statistic over all bootstrap samples 'stderr' : standard error of the statistic over all bootstrap samples 'low' : low end of CI 'high' : high end of CI """ # check the statistic is valid if statistic not in _AVAILABLE_STATS: raise ValueError(f"unknown statistic {statistic}") stat_func = _AVAILABLE_STATS[statistic] if dy_true is None: dy_true = np.zeros_like(y_true) if dy_pred is None: dy_pred = np.zeros_like(y_pred) sample_size = len(y_true) # check the lengths of the inputs are the same and raise an error if not for arr in [y_pred, dy_true, dy_pred]: if len(arr) != sample_size: raise ValueError("All input arrays must have the same length") s_n = np.zeros([nbootstrap], np.float64) # s_n[n] is the statistic computed for bootstrap sample n for replicate in range(nbootstrap): # draw bootstrap indices once and select values vectorized indices = np.random.choice(sample_size, size=sample_size, replace=True) y_true_sample = y_true[indices] y_pred_sample = y_pred[indices] # only simulate normal noise when requested if include_true_uncertainty: std_true = np.fabs(dy_true[indices]) y_true_sample = np.random.normal(loc=y_true_sample, scale=std_true) if include_pred_uncertainty: std_pred = np.fabs(dy_pred[indices]) y_pred_sample = np.random.normal(loc=y_pred_sample, scale=std_pred) s_n[replicate] = stat_func(y_true_sample, y_pred_sample) # calculate the statistics and CI low_percentile = (1.0 - ci) / 2.0 * 100 high_percentile = 100 - low_percentile stats = { "mle": stat_func(y_true, y_pred), # the sample statistic "stderr": np.std(s_n), # standard error of the bootstrap samples "mean": np.mean(s_n), # mean of the bootstrap samples "low": np.percentile(s_n, low_percentile), # low end of confidence interval "high": np.percentile(s_n, high_percentile), # high end of confidence interval } return stats
[docs] def mle(graph: nx.DiGraph, factor: str = "f_ij", node_factor: str | None = None) -> tuple[np.ndarray, np.ndarray]: """ Compute maximum likelihood estimate of free energies and covariance in their estimates. The number 'factor' is the node attribute on which the MLE will be calculated, where d'factor' will be used as the standard error of the factor Reference : https://pubs.acs.org/doi/abs/10.1021/acs.jcim.9b00528 Xu, Huafeng. "Optimal measurement network of pairwise differences." Journal of Chemical Information and Modeling 59.11 (2019): 4720-4728. NOTE: Self-edges (edges that connect a node to itself) will be ignored. Parameters ---------- graph :nx.Graph The graph for which an estimate is to be computed Each edge must have attributes 'f_ij' and 'df_ij' for the free energy and uncertainty estimate Will have 'bayesian_f_ij' and 'bayesian_df_ij' added to each edge and 'bayesian_f_i' and 'bayesian_df_i' added to each node. factor : string, default 'f_ij' Node attribute of nx.Graph that will be used for MLE. node_factor : string, default None Provide if there is node data (i.e. absolute values) 'f_i' or 'exp_DG' to include will expect a corresponding uncertainty 'f_di' or 'exp_dDG' Returns ------- f_i : np.array with shape (n_ligands,) f_i[i] is the absolute free energy of ligand i in kcal/mol C : np.array with shape (n_ligands, n_ligands) C[i,j] is the covariance of the free energy estimates of i and j """ # if we have bidirectional edge results we need to raise an error as they can not be used with MLE # track the edges we have seen edges = [] for a, b in graph.edges: edge_name = tuple(sorted([a, b])) if edge_name in edges: raise ValueError( f"Multiple edges detected between nodes {a} and {b}. MLE cannot be performed on graphs with multiple " f"edges between the same nodes. The results should be combined into a single estimate and uncertainty " f"before performing MLE. See https://cinnabar.openfree.energy/en/latest/concepts/estimators.html#limitations for more details." ) edges.append(edge_name) N = graph.number_of_nodes() if node_factor is None: f_ij = form_edge_matrix(graph, factor, action="antisymmetrize") df_ij = form_edge_matrix(graph, factor.replace("_", "_d"), action="symmetrize") else: f_ij = form_edge_matrix(graph, factor, action="antisymmetrize", node_label=node_factor) df_ij = form_edge_matrix( graph, factor.replace("_", "_d"), action="symmetrize", node_label=node_factor.replace("_", "_d"), ) node_name_to_index = {} for i, name in enumerate(graph.nodes()): node_name_to_index[name] = i # Form F matrix (Eq 4) F_matrix = np.zeros([N, N]) for a, b in graph.edges: i = node_name_to_index[a] j = node_name_to_index[b] if i == j: # The MLE solver will fail if we include self-edges, so we need to omit these continue # check if the uncertainty is zero and raise an error if it is, since this will cause the MLE solver to fail if df_ij[i, j] == 0.0: raise ValueError( f"MLE solver will fail with zero reported uncertainty for calculated differences. Edge ({a}, {b}) has zero uncertainty check inputs." ) F_matrix[i, j] = -(df_ij[i, j] ** (-2)) F_matrix[j, i] = -(df_ij[i, j] ** (-2)) for n in graph.nodes: i = node_name_to_index[n] if df_ij[i, i] == 0.0: F_matrix[i, i] = -np.sum(F_matrix[i, :]) else: F_matrix[i, i] = df_ij[i, i] ** (-2) - np.sum(F_matrix[i, :]) # Form z vector (Eq 3) z = np.zeros([N]) for n in graph.nodes: i = node_name_to_index[n] if df_ij[i, i] != 0.0: z[i] = f_ij[i, i] * df_ij[i, i] ** (-2) for a, b in graph.edges: i = node_name_to_index[a] j = node_name_to_index[b] if i == j: # The MLE solver will fail if we include self-edges, so we need to omit these continue z[i] += f_ij[i, j] * df_ij[i, j] ** (-2) z[j] += f_ij[j, i] * df_ij[j, i] ** (-2) # Compute MLE estimate (Eq 2) Finv = np.linalg.pinv(F_matrix) f_i = np.matmul(Finv, z) # Compute uncertainty C = Finv return f_i, C
[docs] def form_edge_matrix( graph: nx.Graph, label: str, step=None, action: Literal["symmetrize", "antisymmetrize"] | None = None, node_label=None, ) -> np.ndarray: """ Extract the labeled property from edges into a matrix. Parameters ---------- graph : nx.Graph The graph to extract data from. label : str The label to use for extracting edge properties. action : {"symmetrize", "antisymmetrize"} | None, default None If 'symmetrize', returns a symmetric matrix A[i,j] = A[j,i] If 'antisymmetrize', returns an antisymmetric matrix A[i,j] = -A[j,i] node_label : str | None, default None Diagonal will be occupied with absolute values, where labeled. Returns ---------- matrix """ N = len(graph.nodes) matrix = np.zeros([N, N]) node_name_to_index = {} for i, name in enumerate(graph.nodes()): node_name_to_index[name] = i for a, b in graph.edges: i = node_name_to_index[a] j = node_name_to_index[b] matrix[j, i] = graph.edges[a, b][label] if action == "symmetrize": matrix[i, j] = matrix[j, i] elif action == "antisymmetrize": matrix[i, j] = -matrix[j, i] elif action is None: pass else: raise ValueError(f'action "{action}" unknown.') if node_label is not None: for n in graph.nodes(data=True): i = node_name_to_index[n[0]] if node_label in n[1]: matrix[i, i] = n[1][node_label] return matrix