from typing import Literal, get_args
import networkx as nx
import numpy as np
import scipy
import sklearn.metrics
from cinnabar._due import Doi, due
due.cite(
Doi("10.1021/acs.jcim.9b00528"),
description="Compute maximum likelihood estimate of free energies and covariance in their estimates",
path="cinnabar.stats.mle",
cite_module=True,
)
[docs]
def calculate_rmse(
y_true: np.ndarray,
y_pred: np.ndarray,
) -> float:
r"""Compute root mean squared error between true and predicted values.
Note
----
The RMSE is calculated as:
.. math:: RMSE = \sqrt{\frac{1}{N} \sum_{i=1}^N (y_i - \hat{y}_i)^2}
where :math:`y_i` is the predicted value and :math:`\hat{y}_i` is the true value.
Parameters
----------
y_true : ndarray with shape (N,)
True values
y_pred : ndarray with shape (N,)
Predicted values
Returns
-------
rmse : float
RMSE between true and predicted values
"""
return np.sqrt(sklearn.metrics.mean_squared_error(y_true, y_pred))
[docs]
def calculate_mue(
y_true: np.ndarray,
y_pred: np.ndarray,
) -> float:
r"""Compute mean unsigned error between true and predicted values.
Note
----
The MUE is calculated as:
.. math:: MUE = \frac{1}{N} \sum_{i=1}^N |y_i - \hat{y}_i|
where :math:`y_i` is the predicted value and :math:`\hat{y}_i` is the true value.
Parameters
----------
y_true : ndarray with shape (N,)
True values
y_pred : ndarray with shape (N,)
Predicted values
Returns
-------
mue : float
MUE between true and predicted values
"""
return sklearn.metrics.mean_absolute_error(y_true, y_pred)
[docs]
def calculate_rae(
y_true: np.ndarray,
y_pred: np.ndarray,
) -> float:
r"""Compute relative absolute error between true and predicted values.
Note
----
The RAE compares the mean absolute error of the predictions with a baseline model that always predicts the mean of the true values.
It is calculated as:
.. math:: RAE = \frac{\frac{1}{N} \sum_{i=1}^N |y_i - \hat{y}_i|}{\frac{1}{N} \sum_{i=1}^N |\bar{y} - \hat{y}_i|}
where :math:`y_i` is the predicted value, :math:`\hat{y}_i` is the true value, and :math:`\bar{y}` is the mean of the true values.
Parameters
----------
y_true : ndarray with shape (N,)
True values
y_pred : ndarray with shape (N,)
Predicted values
Returns
-------
rae : float
RAE between true and predicted values
"""
# mean unsigned error of the predictions
mue = calculate_mue(y_true, y_pred)
true_mean = np.mean(y_true)
# mean absolute deviation of the true values from their mean
mad = np.mean([np.abs(true_mean - i) for i in y_true])
return mue / mad
[docs]
def calculate_r2(
y_true: np.ndarray,
y_pred: np.ndarray,
) -> float:
"""Compute R^2 between true and predicted values.
Note
----
R^2 is calculated as the square of the Pearson correlation coefficient between true and predicted values.
Parameters
----------
y_true : ndarray with shape (N,)
True values
y_pred : ndarray with shape (N,)
Predicted values
Returns
-------
r2 : float
R^2 between true and predicted values
"""
r_value = calculate_pearson_r(y_true, y_pred)
return r_value**2
[docs]
def calculate_pearson_r(
y_true: np.ndarray,
y_pred: np.ndarray,
) -> float:
"""Compute Pearson's r between true and predicted values.
Parameters
----------
y_true : ndarray with shape (N,)
True values
y_pred : ndarray with shape (N,)
Predicted values
Returns
-------
r : float
Pearson's r between true and predicted values
"""
return scipy.stats.pearsonr(y_true, y_pred)[0]
[docs]
def calculate_kendalls_tau(
y_true: np.ndarray,
y_pred: np.ndarray,
) -> float:
"""Compute Kendall's tau between true and predicted values.
Parameters
----------
y_true : ndarray with shape (N,)
True values
y_pred : ndarray with shape (N,)
Predicted values
Returns
-------
tau : float
Kendall's tau between true and predicted values
"""
return scipy.stats.kendalltau(y_true, y_pred)[0]
[docs]
def calculate_nrmse(y_true: np.ndarray, y_pred: np.ndarray) -> float:
r"""
Compute the normalized root mean squared error between true and predicted values, using the true mean to normalize
the RMSE. [1]_
Note
----
The NRMSE is calculated as:
.. math:: NRMSE = \frac{RMSE}{\bar{y}}
where :math:`RMSE` is the root mean squared error between true and predicted values, and :math:`\bar{y}` is the mean of the true values.
Parameters
----------
y_true : ndarray with shape (N,)
True values
y_pred : ndarray with shape (N,)
Predicted values
Returns
-------
nrmse : float
NRMSE between true and predicted values
References
----------
.. [1] https://en.wikipedia.org/wiki/Root_mean_square_deviation
"""
rmse = calculate_rmse(y_true, y_pred)
mean_true = np.mean(y_true)
return rmse / np.abs(mean_true)
[docs]
def calculate_predictive_index(y_true: np.ndarray, y_pred: np.ndarray) -> float:
r"""Compute the predictive index as introduced by Pearlman et al. between true and predicted values. [1]_
Parameters
----------
y_true : ndarray with shape (N,)
True values
y_pred : ndarray with shape (N,)
Predicted values
Note
----
The predictive index measures the correlation between the true and predicted values with a higher weight given to
ligand pairs with larger true differences. The final value is between -1 and 1, where 1 indicates perfect ranking
and -1 indicates perfectly anti-correlated ranking. It is calculated as:
.. math:: PI = \frac{\sum^n_{j>i}\sum^n_{i}{W_{ij}C_{ij}}}{\sum^n_{j>i}\sum^n_{i}{W_{ij}}}
where :math:`W_{ij} = abs(E_{j} - E_{i})` is a weight based on the true difference between the ligand pairs and :math:`C_{ij}`
indicates if the rank ordering of the true differences agree with the predicted differences:
.. math::
C_{ij} = \begin{cases}
1 & \text{if } (E_{j} - E_{i})/(P_{j} - P_{i}) > 0 \\
0 & \text{if } (P_{j} - P_{i}) = 0 \\
-1 & \text{if } (E_{j} - E_{i})/(P_{j} - P_{i}) < 0
\end{cases}
Returns
-------
pi : float
Predictive index between true and predicted values between -1 and 1.
References
----------
.. [1] Pearlman, D.A. and Charifson, P.S., 2001. Are free energy calculations useful in practice? A comparison with rapid scoring functions for the p38 MAP kinase protein system. Journal of Medicinal Chemistry, 44(21), pp.3417-3423.
"""
numerator, denominator = 0.0, 0.0
n = len(y_true)
for i in range(n):
for j in range(i + 1, n):
w_ij = np.abs(y_true[j] - y_true[i])
# avoid division by zero when the predicted values are the same
if y_pred[j] == y_pred[i]:
c_ij = 0.0
else:
c_ij = np.sign((y_true[j] - y_true[i]) / (y_pred[j] - y_pred[i]))
numerator += w_ij * c_ij
denominator += w_ij
return numerator / denominator
# map from statistic name to function that calculates the statistic
_AVAILABLE_STATS = {
"RMSE": calculate_rmse,
"NRMSE": calculate_nrmse,
"MUE": calculate_mue,
"RAE": calculate_rae,
"R2": calculate_r2,
"rho": calculate_pearson_r,
"KTAU": calculate_kendalls_tau,
"PI": calculate_predictive_index,
}
# make a type hint for the statistic names
Statistics = Literal["RMSE", "NRMSE", "MUE", "RAE", "R2", "rho", "KTAU", "PI"]
# make sure the type hint and the list stay in sync
assert set(get_args(Statistics)) == set(_AVAILABLE_STATS.keys())
[docs]
def bootstrap_statistic(
y_true: np.ndarray,
y_pred: np.ndarray,
dy_true: np.ndarray | None = None,
dy_pred: np.ndarray | None = None,
ci: float = 0.95,
statistic: Statistics = "RMSE",
nbootstrap: int = 1000,
include_true_uncertainty: bool = False,
include_pred_uncertainty: bool = False,
) -> dict:
"""Compute mean and confidence intervals of specified statistic.
Parameters
----------
y_true : ndarray with shape (N,)
True values
y_pred : ndarray with shape (N,)
Predicted values
dy_true : ndarray with shape (N,) | None, default None
Errors of true values. If None, the values are assumed to have no errors.
dy_pred : ndarray with shape (N,) | None, default None
Errors of predicted values. If None, the values are assumed to have no errors
ci : float, default 0.95
Interval for confidence interval (CI).
statistic : {"RMSE", "NRMSE", "MUE", "RAE", "R2", "rho", "KTAU", "PI"}, default "RMSE"
Statistic to be calculated.
nbootstrap : int, default 1000
Number of bootstrap samples used to estimate the confidence interval.
include_true_uncertainty : bool, default False
Whether to account for the uncertainty in ``y_true`` when bootstrapping.
include_pred_uncertainty : bool, default False
Whether to account for the uncertainty in ``y_pred`` when bootstrapping.
Note
-----
If ``include_true_uncertainty`` or ``include_pred_uncertainty`` is True,
normal noise will be added to the corresponding values during each bootstrap replicate.
The standard deviation of the normal noise is taken from ``dy_true`` or ``dy_pred``.
Returns
-------
stats : dict of float
'mle': statistic computed on the original data
'mean' : mean value of the statistic over all bootstrap samples
'stderr' : standard error of the statistic over all bootstrap samples
'low' : low end of CI
'high' : high end of CI
"""
# check the statistic is valid
if statistic not in _AVAILABLE_STATS:
raise ValueError(f"unknown statistic {statistic}")
stat_func = _AVAILABLE_STATS[statistic]
if dy_true is None:
dy_true = np.zeros_like(y_true)
if dy_pred is None:
dy_pred = np.zeros_like(y_pred)
sample_size = len(y_true)
# check the lengths of the inputs are the same and raise an error if not
for arr in [y_pred, dy_true, dy_pred]:
if len(arr) != sample_size:
raise ValueError("All input arrays must have the same length")
s_n = np.zeros([nbootstrap], np.float64) # s_n[n] is the statistic computed for bootstrap sample n
for replicate in range(nbootstrap):
# draw bootstrap indices once and select values vectorized
indices = np.random.choice(sample_size, size=sample_size, replace=True)
y_true_sample = y_true[indices]
y_pred_sample = y_pred[indices]
# only simulate normal noise when requested
if include_true_uncertainty:
std_true = np.fabs(dy_true[indices])
y_true_sample = np.random.normal(loc=y_true_sample, scale=std_true)
if include_pred_uncertainty:
std_pred = np.fabs(dy_pred[indices])
y_pred_sample = np.random.normal(loc=y_pred_sample, scale=std_pred)
s_n[replicate] = stat_func(y_true_sample, y_pred_sample)
# calculate the statistics and CI
low_percentile = (1.0 - ci) / 2.0 * 100
high_percentile = 100 - low_percentile
stats = {
"mle": stat_func(y_true, y_pred), # the sample statistic
"stderr": np.std(s_n), # standard error of the bootstrap samples
"mean": np.mean(s_n), # mean of the bootstrap samples
"low": np.percentile(s_n, low_percentile), # low end of confidence interval
"high": np.percentile(s_n, high_percentile), # high end of confidence interval
}
return stats
[docs]
def mle(graph: nx.DiGraph, factor: str = "f_ij", node_factor: str | None = None) -> tuple[np.ndarray, np.ndarray]:
"""
Compute maximum likelihood estimate of free energies and covariance in their estimates.
The number 'factor' is the node attribute on which the MLE will be calculated,
where d'factor' will be used as the standard error of the factor
Reference : https://pubs.acs.org/doi/abs/10.1021/acs.jcim.9b00528
Xu, Huafeng. "Optimal measurement network of pairwise differences."
Journal of Chemical Information and Modeling 59.11 (2019): 4720-4728.
NOTE: Self-edges (edges that connect a node to itself) will be ignored.
Parameters
----------
graph :nx.Graph
The graph for which an estimate is to be computed
Each edge must have attributes 'f_ij' and 'df_ij' for the free energy and uncertainty
estimate
Will have 'bayesian_f_ij' and 'bayesian_df_ij' added to each edge
and 'bayesian_f_i' and 'bayesian_df_i' added to each node.
factor : string, default 'f_ij'
Node attribute of nx.Graph that will be used for MLE.
node_factor : string, default None
Provide if there is node data (i.e. absolute values) 'f_i' or 'exp_DG' to
include will expect a corresponding uncertainty 'f_di' or 'exp_dDG'
Returns
-------
f_i : np.array with shape (n_ligands,)
f_i[i] is the absolute free energy of ligand i in kcal/mol
C : np.array with shape (n_ligands, n_ligands)
C[i,j] is the covariance of the free energy estimates of i and j
"""
# if we have bidirectional edge results we need to raise an error as they can not be used with MLE
# track the edges we have seen
edges = []
for a, b in graph.edges:
edge_name = tuple(sorted([a, b]))
if edge_name in edges:
raise ValueError(
f"Multiple edges detected between nodes {a} and {b}. MLE cannot be performed on graphs with multiple "
f"edges between the same nodes. The results should be combined into a single estimate and uncertainty "
f"before performing MLE. See https://cinnabar.openfree.energy/en/latest/concepts/estimators.html#limitations for more details."
)
edges.append(edge_name)
N = graph.number_of_nodes()
if node_factor is None:
f_ij = form_edge_matrix(graph, factor, action="antisymmetrize")
df_ij = form_edge_matrix(graph, factor.replace("_", "_d"), action="symmetrize")
else:
f_ij = form_edge_matrix(graph, factor, action="antisymmetrize", node_label=node_factor)
df_ij = form_edge_matrix(
graph,
factor.replace("_", "_d"),
action="symmetrize",
node_label=node_factor.replace("_", "_d"),
)
node_name_to_index = {}
for i, name in enumerate(graph.nodes()):
node_name_to_index[name] = i
# Form F matrix (Eq 4)
F_matrix = np.zeros([N, N])
for a, b in graph.edges:
i = node_name_to_index[a]
j = node_name_to_index[b]
if i == j:
# The MLE solver will fail if we include self-edges, so we need to omit these
continue
# check if the uncertainty is zero and raise an error if it is, since this will cause the MLE solver to fail
if df_ij[i, j] == 0.0:
raise ValueError(
f"MLE solver will fail with zero reported uncertainty for calculated differences. Edge ({a}, {b}) has zero uncertainty check inputs."
)
F_matrix[i, j] = -(df_ij[i, j] ** (-2))
F_matrix[j, i] = -(df_ij[i, j] ** (-2))
for n in graph.nodes:
i = node_name_to_index[n]
if df_ij[i, i] == 0.0:
F_matrix[i, i] = -np.sum(F_matrix[i, :])
else:
F_matrix[i, i] = df_ij[i, i] ** (-2) - np.sum(F_matrix[i, :])
# Form z vector (Eq 3)
z = np.zeros([N])
for n in graph.nodes:
i = node_name_to_index[n]
if df_ij[i, i] != 0.0:
z[i] = f_ij[i, i] * df_ij[i, i] ** (-2)
for a, b in graph.edges:
i = node_name_to_index[a]
j = node_name_to_index[b]
if i == j:
# The MLE solver will fail if we include self-edges, so we need to omit these
continue
z[i] += f_ij[i, j] * df_ij[i, j] ** (-2)
z[j] += f_ij[j, i] * df_ij[j, i] ** (-2)
# Compute MLE estimate (Eq 2)
Finv = np.linalg.pinv(F_matrix)
f_i = np.matmul(Finv, z)
# Compute uncertainty
C = Finv
return f_i, C